Module ezduckdb.paths

Expand source code
from pathlib import Path


class S3AwarePath(type(Path())):
    """
    A subclass of pathlib.Path that adds additional methods for handling
    Amazon S3 paths.

    pathlib collapses the ``//`` that follows the scheme, so an S3 URI such
    as ``s3://bucket/key`` is held as the relative path ``s3:/bucket/key``
    whose first part is ``"s3:"``. The methods below rely on that first part
    to recognise S3 paths, and ``__str__`` restores the URI form.

    Methods
    -------
    is_s3()
        Checks if the path is an S3 path.

    __str__()
        Returns the string representation of the path, formatted correctly
        for S3 paths.

    get_s3_bucket()
        Extracts the bucket name from an S3 path.

    get_s3_prefix()
        Extracts the S3 prefix (the path inside the bucket) from an S3 path.

    get_table_name()
        Parses the schema and table name from the file stem assuming a
        specific naming convention of `<schema_name>_<table_name>.extension`.

    Raises
    ------
    Exception
        If the path is not an S3 path when calling `get_s3_bucket` or
        `get_s3_prefix`, or if the file stem does not follow the required
        naming convention when calling `get_table_name`.

    Examples
    --------
    >>> path = S3AwarePath("s3://mybucket/myfolder/myfile_name_here.parquet")
    >>> path.is_s3()
    True
    >>> str(path)
    's3://mybucket/myfolder/myfile_name_here.parquet'
    >>> path.get_s3_bucket()
    'mybucket'
    >>> path.get_s3_prefix()
    'myfolder/myfile_name_here.parquet'
    >>> path.get_table_name()
    ('myfile', 'name_here')
    >>> S3AwarePath("").is_s3()
    False
    """

    def is_s3(self):
        """
        Check if the path is an S3 path.

        Returns
        -------
        bool
            True if the path is an S3 path, False otherwise (including for
            an empty path, which has no parts at all).
        """
        parts = self.parts
        # An empty path ("" or ".") has no parts; guard before indexing so
        # that is_s3() and str() do not raise IndexError for it.
        return bool(parts) and parts[0] == "s3:"

    def __str__(self):
        """
        Return the path as a string, restoring the ``s3://`` scheme for S3
        paths and deferring to pathlib for everything else.
        """
        if self.is_s3():
            # Join the parts with "/" instead of slicing the native string so
            # the result does not depend on the platform's path separator.
            return "s3://" + "/".join(self.parts[1:])
        return super().__str__()

    def get_s3_bucket(self):
        """Extract the bucket name from an S3 path.

        Returns
        -------
        str
            The bucket name.

        Raises
        ------
        Exception
            If the path is not an S3 path.
        """
        if not self.is_s3():
            raise Exception("Not an S3 path")
        # parts[0] is the "s3:" scheme marker; the bucket follows it.
        return self.parts[1]

    def get_s3_prefix(self):
        """Extract the S3 prefix from an S3 path.

        Returns
        -------
        str
            The S3 prefix (the path inside the bucket).

        Raises
        ------
        Exception
            If the path is not an S3 path.
        """
        if not self.is_s3():
            raise Exception("Not an S3 path")
        # Skip the scheme marker and the bucket; the rest is the prefix.
        return "/".join(self.parts[2:])

    def get_table_name(self):
        """
        Parse the schema and table name from the file stem.

        Assumes a naming convention of `<schema_name>_<table_name>`. Only the
        first underscore separates the two, so the table name may itself
        contain underscores.

        Returns
        -------
        tuple of str
            A tuple containing the schema name and table name.

        Raises
        ------
        Exception
            If the file stem does not follow the required naming convention.

        Examples
        --------
        >>> path = S3AwarePath("myfile_name_here.parquet")
        >>> path.get_table_name()
        ('myfile', 'name_here')
        """
        schema_name, separator, table_name = self.stem.partition("_")
        if not separator:
            raise Exception(
                "Not a valid format. Needs at least 1 `_` to match format `<schema_name>_<table_name>`"
            )
        return schema_name, table_name

Classes

class S3AwarePath (*args, **kwargs)

A subclass of pathlib.Path that adds additional methods for handling Amazon S3 paths.

Methods

is_s3() Checks if the path is an S3 path.

__str__() Returns the string representation of the path, formatted correctly for S3 paths.

get_s3_bucket() Extracts the bucket name from an S3 path.

get_s3_prefix() Extracts the S3 prefix (the path inside the bucket) from an S3 path.

get_table_name() Parses the schema and table name from the file stem assuming a specific naming convention of <schema_name>_<table_name>.extension.

Raises

Exception
If the path is not an S3 path when calling get_s3_bucket or get_s3_prefix, or if the file stem does not follow the required naming convention when calling get_table_name.

Examples

>>> path = S3AwarePath("s3://mybucket/myfolder/myfile_name_here.parquet")
>>> path.is_s3()
True
>>> str(path)
's3://mybucket/myfolder/myfile_name_here.parquet'
>>> path.get_s3_bucket()
'mybucket'
>>> path.get_s3_prefix()
'myfolder/myfile_name_here.parquet'
>>> path.get_table_name()
('myfile', 'name_here')
Expand source code
class S3AwarePath(type(Path())):
    """
    A subclass of pathlib.Path that adds additional methods for handling
    Amazon S3 paths.

    pathlib collapses the ``//`` that follows the scheme, so an S3 URI such
    as ``s3://bucket/key`` is held as the relative path ``s3:/bucket/key``
    whose first part is ``"s3:"``. The methods below rely on that first part
    to recognise S3 paths, and ``__str__`` restores the URI form.

    Methods
    -------
    is_s3()
        Checks if the path is an S3 path.

    __str__()
        Returns the string representation of the path, formatted correctly
        for S3 paths.

    get_s3_bucket()
        Extracts the bucket name from an S3 path.

    get_s3_prefix()
        Extracts the S3 prefix (the path inside the bucket) from an S3 path.

    get_table_name()
        Parses the schema and table name from the file stem assuming a
        specific naming convention of `<schema_name>_<table_name>.extension`.

    Raises
    ------
    Exception
        If the path is not an S3 path when calling `get_s3_bucket` or
        `get_s3_prefix`, or if the file stem does not follow the required
        naming convention when calling `get_table_name`.

    Examples
    --------
    >>> path = S3AwarePath("s3://mybucket/myfolder/myfile_name_here.parquet")
    >>> path.is_s3()
    True
    >>> str(path)
    's3://mybucket/myfolder/myfile_name_here.parquet'
    >>> path.get_s3_bucket()
    'mybucket'
    >>> path.get_s3_prefix()
    'myfolder/myfile_name_here.parquet'
    >>> path.get_table_name()
    ('myfile', 'name_here')
    >>> S3AwarePath("").is_s3()
    False
    """

    def is_s3(self):
        """
        Check if the path is an S3 path.

        Returns
        -------
        bool
            True if the path is an S3 path, False otherwise (including for
            an empty path, which has no parts at all).
        """
        parts = self.parts
        # An empty path ("" or ".") has no parts; guard before indexing so
        # that is_s3() and str() do not raise IndexError for it.
        return bool(parts) and parts[0] == "s3:"

    def __str__(self):
        """
        Return the path as a string, restoring the ``s3://`` scheme for S3
        paths and deferring to pathlib for everything else.
        """
        if self.is_s3():
            # Join the parts with "/" instead of slicing the native string so
            # the result does not depend on the platform's path separator.
            return "s3://" + "/".join(self.parts[1:])
        return super().__str__()

    def get_s3_bucket(self):
        """Extract the bucket name from an S3 path.

        Returns
        -------
        str
            The bucket name.

        Raises
        ------
        Exception
            If the path is not an S3 path.
        """
        if not self.is_s3():
            raise Exception("Not an S3 path")
        # parts[0] is the "s3:" scheme marker; the bucket follows it.
        return self.parts[1]

    def get_s3_prefix(self):
        """Extract the S3 prefix from an S3 path.

        Returns
        -------
        str
            The S3 prefix (the path inside the bucket).

        Raises
        ------
        Exception
            If the path is not an S3 path.
        """
        if not self.is_s3():
            raise Exception("Not an S3 path")
        # Skip the scheme marker and the bucket; the rest is the prefix.
        return "/".join(self.parts[2:])

    def get_table_name(self):
        """
        Parse the schema and table name from the file stem.

        Assumes a naming convention of `<schema_name>_<table_name>`. Only the
        first underscore separates the two, so the table name may itself
        contain underscores.

        Returns
        -------
        tuple of str
            A tuple containing the schema name and table name.

        Raises
        ------
        Exception
            If the file stem does not follow the required naming convention.

        Examples
        --------
        >>> path = S3AwarePath("myfile_name_here.parquet")
        >>> path.get_table_name()
        ('myfile', 'name_here')
        """
        schema_name, separator, table_name = self.stem.partition("_")
        if not separator:
            raise Exception(
                "Not a valid format. Needs at least 1 `_` to match format `<schema_name>_<table_name>`"
            )
        return schema_name, table_name

Ancestors

  • pathlib.PosixPath
  • pathlib.Path
  • pathlib.PurePosixPath
  • pathlib.PurePath

Methods

def get_s3_bucket(self)

Extract the bucket name from an S3 path.

Returns

str
The bucket name.

Raises

Exception
If the path is not an S3 path.
Expand source code
def get_s3_bucket(self):
    """Return the bucket component of an S3 path.

    Returns
    -------
    str
        The bucket name, i.e. the path part that follows the scheme part.

    Raises
    ------
    Exception
        If the path is not an S3 path.
    """
    # Reject non-S3 paths up front; everything below assumes the S3 layout.
    if not self.is_s3():
        raise Exception("Not an S3 path")
    bucket_name = self.parts[1]
    return bucket_name
def get_s3_prefix(self)

Extract the S3 prefix from an S3 path.

Returns

str
The S3 prefix (the path inside the bucket).

Raises

Exception
If the path is not an S3 path.
Expand source code
def get_s3_prefix(self):
    """Return the key prefix of an S3 path.

    Returns
    -------
    str
        The S3 prefix (the path inside the bucket), with its components
        joined by ``/``.

    Raises
    ------
    Exception
        If the path is not an S3 path.
    """
    # Reject non-S3 paths up front; everything below assumes the S3 layout.
    if not self.is_s3():
        raise Exception("Not an S3 path")
    # The first two parts are skipped; the remainder forms the prefix.
    key_components = self.parts[2:]
    return "/".join(key_components)
def get_table_name(self)

Parse the schema and table name from the file stem.

Assumes a naming convention of <schema_name>_<table_name>.

Returns

tuple of str
A tuple containing the schema name and table name.

Raises

Exception
If the file stem does not follow the required naming convention.

Examples

>>> path = S3AwarePath("myfile_name_here.parquet")
>>> path.get_table_name()
('myfile', 'name_here')
Expand source code
def get_table_name(self):
    """
    Split the file stem into a schema name and a table name.

    The stem is expected to look like `<schema_name>_<table_name>`; the
    split happens at the first underscore, so any further underscores stay
    in the table name.

    Returns
    -------
    tuple of str
        A tuple containing the schema name and table name.

    Raises
    ------
    Exception
        If the file stem does not follow the required naming convention.

    Examples
    --------
    >>> path = S3AwarePath("myfile_name_here.parquet")
    >>> path.get_table_name()
    ('myfile', 'name_here')
    """
    schema_part, underscore, table_part = self.stem.partition("_")
    # partition() yields an empty separator when the stem has no underscore.
    if not underscore:
        raise Exception(
            "Not a valid format. Needs at least 1 `_` to match format `<schema_name>_<table_name>`"
        )
    return schema_part, table_part
def is_s3(self)

Check if the path is an S3 path.

Returns

bool
True if the path is an S3 path, False otherwise.
Expand source code
def is_s3(self):
    """
    Check if the path is an S3 path.

    A path counts as S3 when its first part is ``"s3:"``.

    Returns
    -------
    bool
        True if the path is an S3 path, False otherwise (including for an
        empty path, which has no parts at all).
    """
    parts = self.parts
    # An empty path has no parts; guard before indexing so the check
    # returns False instead of raising IndexError.
    return bool(parts) and parts[0] == "s3:"