Pandas

`zenml.integrations.pandas`

Initialization of the Pandas integration.

Attributes

`PANDAS = 'pandas'` `module-attribute`

Classes

`Integration`

Base class for integrations in ZenML.

Functions

`activate() -> None` `classmethod`

Abstract method to activate the integration.

Source code in src/zenml/integrations/integration.py

@classmethod
def activate(cls) -> None:
    """Abstract method to activate the integration.

    This base implementation has an empty body and returns `None`.
    Subclasses such as `PandasIntegration` override it to perform their
    own activation steps.
    """

`check_installation() -> bool` `classmethod`

Method to check whether the required packages are installed.

Returns:

Type	Description
`bool`	True if all required packages are installed, False otherwise.

Source code in src/zenml/integrations/integration.py

@classmethod
def check_installation(cls) -> bool:
    """Method to check whether the required packages are installed.

    Every entry of `cls.get_requirements()` is looked up with
    `pkg_resources`. The dependencies declared by the distribution that
    was found (for the requested extras, if the requirement names any)
    are then looked up as well. The first missing package, version
    conflict or unknown extra is logged at debug level and ends the check
    with `False`.

    Returns:
        True if all required packages are installed, False otherwise.
    """
    for r in cls.get_requirements():
        try:
            # First check if the base package is installed
            dist = pkg_resources.get_distribution(r)

            # Next, check if the dependencies (including extras) are
            # installed
            deps: List[Requirement] = []

            # Only the extras part of the parsed requirement is used here.
            _, extras = parse_requirement(r)
            if extras:
                # Drop the first and last character, then split on commas.
                # NOTE(review): presumably `extras` looks like "[a,b]".
                # The items are not whitespace-stripped, so "[a, b]" would
                # yield " b" - confirm against `parse_requirement`.
                extra_list = extras[1:-1].split(",")
                for extra in extra_list:
                    try:
                        requirements = dist.requires(extras=[extra])  # type: ignore[arg-type]
                    except pkg_resources.UnknownExtra as e:
                        # The installed distribution does not declare the
                        # requested extra: treat as "not installed".
                        logger.debug(f"Unknown extra: {str(e)}")
                        return False
                    deps.extend(requirements)
            else:
                # No extras requested: check the plain dependency list.
                deps = dist.requires()

            for ri in deps:
                try:
                    # Remove the "extra == ..." part from the requirement string
                    # NOTE(review): `\w+` does not match "-" or ".", so a
                    # marker for an extra with such a name would
                    # presumably be left in place - confirm.
                    cleaned_req = re.sub(
                        r"; extra == \"\w+\"", "", str(ri)
                    )
                    pkg_resources.get_distribution(cleaned_req)
                except pkg_resources.DistributionNotFound as e:
                    logger.debug(
                        f"Unable to find required dependency "
                        f"'{e.req}' for requirement '{r}' "
                        f"necessary for integration '{cls.NAME}'."
                    )
                    return False
                except pkg_resources.VersionConflict as e:
                    logger.debug(
                        f"Package version '{e.dist}' does not match "
                        f"version '{e.req}' required by '{r}' "
                        f"necessary for integration '{cls.NAME}'."
                    )
                    return False

        # These handlers cover errors not handled by the inner blocks,
        # e.g. from the base package lookup at the top of the `try`.
        except pkg_resources.DistributionNotFound as e:
            logger.debug(
                f"Unable to find required package '{e.req}' for "
                f"integration {cls.NAME}."
            )
            return False
        except pkg_resources.VersionConflict as e:
            logger.debug(
                f"Package version '{e.dist}' does not match version "
                f"'{e.req}' necessary for integration {cls.NAME}."
            )
            return False

    # Reached only when no requirement caused an early `return False`.
    logger.debug(
        f"Integration {cls.NAME} is installed correctly with "
        f"requirements {cls.get_requirements()}."
    )
    return True

`flavors() -> List[Type[Flavor]]` `classmethod`

Abstract method to declare new stack component flavors.

Returns:

Type	Description
`List[Type[Flavor]]`	A list of new stack component flavors.

Source code in src/zenml/integrations/integration.py

@classmethod
def flavors(cls) -> List[Type[Flavor]]:
    """Declare the stack component flavors contributed by the integration.

    The base implementation contributes none and hands back a fresh,
    empty list on every call.

    Returns:
        A list of new stack component flavors.
    """
    no_flavors: List[Type[Flavor]] = []
    return no_flavors

`get_requirements(target_os: Optional[str] = None, python_version: Optional[str] = None) -> List[str]` `classmethod`

Method to get the requirements for the integration.

Parameters:

Name	Type	Description	Default
`target_os`	`Optional[str]`	The target operating system to get the requirements for.	`None`
`python_version`	`Optional[str]`	The Python version to use for the requirements.	`None`

Returns:

Type	Description
`List[str]`	A list of requirements.

Source code in src/zenml/integrations/integration.py

@classmethod
def get_requirements(
    cls,
    target_os: Optional[str] = None,
    python_version: Optional[str] = None,
) -> List[str]:
    """Return the requirements declared for the integration.

    This implementation ignores both arguments and hands back the
    class-level `REQUIREMENTS` list itself (not a copy).

    Args:
        target_os: The target operating system to get the requirements for.
        python_version: The Python version to use for the requirements.

    Returns:
        A list of requirements.
    """
    declared: List[str] = cls.REQUIREMENTS
    return declared

`get_uninstall_requirements(target_os: Optional[str] = None) -> List[str]` `classmethod`

Method to get the uninstall requirements for the integration.

Parameters:

Name	Type	Description	Default
`target_os`	`Optional[str]`	The target operating system to get the requirements for.	`None`

Returns:

Type	Description
`List[str]`	A list of requirements.

Source code in src/zenml/integrations/integration.py

@classmethod
def get_uninstall_requirements(
    cls, target_os: Optional[str] = None
) -> List[str]:
    """Return the requirements that should be removed on uninstall.

    A requirement is left out when it starts with any of the prefixes
    listed in `cls.REQUIREMENTS_IGNORED_ON_UNINSTALL`; the remaining
    requirements keep their original order.

    Args:
        target_os: The target operating system to get the requirements for.

    Returns:
        A list of requirements.
    """
    return [
        requirement
        for requirement in cls.get_requirements(target_os=target_os)
        if not any(
            requirement.startswith(prefix)
            for prefix in cls.REQUIREMENTS_IGNORED_ON_UNINSTALL
        )
    ]

`plugin_flavors() -> List[Type[BasePluginFlavor]]` `classmethod`

Abstract method to declare new plugin flavors.

Returns:

Type	Description
`List[Type[BasePluginFlavor]]`	A list of new plugin flavors.

Source code in src/zenml/integrations/integration.py

@classmethod
def plugin_flavors(cls) -> List[Type["BasePluginFlavor"]]:
    """Declare the plugin flavors contributed by the integration.

    The base implementation contributes none and hands back a fresh,
    empty list on every call.

    Returns:
        A list of new plugin flavors.
    """
    no_plugin_flavors: List[Type["BasePluginFlavor"]] = []
    return no_plugin_flavors

`PandasIntegration`

Bases: Integration

Definition of Pandas integration for ZenML.

Functions

`activate() -> None` `classmethod`

Activates the integration.

Source code in src/zenml/integrations/pandas/__init__.py

@classmethod
def activate(cls) -> None:
    """Activates the integration.

    Imports the `materializers` submodule of `zenml.integrations.pandas`.
    The imported name is not used afterwards (hence the `# noqa`), so the
    import is done purely for its side effects.
    """
    # The import runs when `activate` is called, not at module import time.
    # NOTE(review): presumably importing the submodule is what makes the
    # pandas materializer known to ZenML - confirm.
    from zenml.integrations.pandas import materializers  # noqa

Modules

`materializers`

Initialization of the Pandas materializer.

Classes

Modules

`pandas_materializer`

Materializer for Pandas.

This materializer handles pandas DataFrame and Series objects.

Environment Variables

ZENML_PANDAS_SAMPLE_ROWS: Controls the number of sample rows to include in visualizations. Defaults to 10 if not set.

Classes

PandasMaterializer(uri: str, artifact_store: Optional[BaseArtifactStore] = None)

Bases: BaseMaterializer

Materializer to read data to and from pandas.

Define `self.parquet_path` and `self.csv_path`, and record in `self.pyarrow_exists` whether `pyarrow` can be imported.

Parameters:

Name	Type	Description	Default
`uri`	`str`	The URI where the artifact data is stored.	required
`artifact_store`	`Optional[BaseArtifactStore]`	The artifact store where the artifact data is stored.	`None`

Source code in src/zenml/integrations/pandas/materializers/pandas_materializer.py

def __init__(
    self, uri: str, artifact_store: Optional[BaseArtifactStore] = None
):
    """Define the storage paths and detect whether `pyarrow` is available.

    Sets `self.pyarrow_exists` depending on whether `pyarrow` can be
    imported (a warning is logged when it cannot), and always defines
    `self.parquet_path` and `self.csv_path` below `self.uri`.

    Args:
        uri: The URI where the artifact data is stored.
        artifact_store: The artifact store where the artifact data is stored.
    """
    super().__init__(uri, artifact_store)
    try:
        import pyarrow  # type: ignore # noqa
    except ImportError:
        # Without pyarrow the materializer falls back to `.csv` storage.
        self.pyarrow_exists = False
        logger.warning(
            "By default, the `PandasMaterializer` stores data as a "
            "`.csv` file. If you want to store data more efficiently, "
            "you can install `pyarrow` by running "
            "'`pip install pyarrow`'. This will allow `PandasMaterializer` "
            "to automatically store the data as a `.parquet` file instead."
        )
    else:
        self.pyarrow_exists = True
    finally:
        # Both paths are defined regardless of the import outcome.
        self.parquet_path = os.path.join(self.uri, PARQUET_FILENAME)
        self.csv_path = os.path.join(self.uri, CSV_FILENAME)

Functions

extract_metadata(df: Union[pd.DataFrame, pd.Series]) -> Dict[str, MetadataType]

Extract metadata from the given pandas dataframe or series.

Parameters:

Name	Type	Description	Default
`df`	`Union[DataFrame, Series]`	The pandas dataframe or series to extract metadata from.	required

Returns:

Type	Description
`Dict[str, MetadataType]`	The extracted metadata as a dictionary.

Source code in src/zenml/integrations/pandas/materializers/pandas_materializer.py

def extract_metadata(
    self, df: Union[pd.DataFrame, pd.Series]
) -> Dict[str, "MetadataType"]:
    """Extract metadata from the given pandas dataframe or series.

    The metadata always contains the `shape` and `dtype` of the data. The
    `mean`, `std`, `min` and `max` statistics are added for numeric data
    only: a dataframe reports them per numeric column, and a series
    reports them only if its dtype is numeric. Non-numeric series (e.g.
    strings or datetimes) therefore no longer make the extraction fail.

    Args:
        df: The pandas dataframe or series to extract metadata from.

    Returns:
        The extracted metadata as a dictionary.
    """
    pandas_metadata: Dict[str, "MetadataType"] = {"shape": df.shape}
    stat_names = ("mean", "std", "min", "max")

    if isinstance(df, pd.Series):
        pandas_metadata["dtype"] = DType(df.dtype.type)
        # Mirror the `numeric_only=True` behavior of the dataframe branch:
        # statistics of non-numeric series either raise or do not convert
        # to float, so they are skipped instead.
        if pd.api.types.is_numeric_dtype(df.dtype):
            for stat_name in stat_names:
                # `float()` accepts both numpy and plain Python scalars,
                # so no `.item()` call is needed on the result.
                pandas_metadata[stat_name] = float(getattr(df, stat_name)())

    else:
        pandas_metadata["dtype"] = {
            str(key): DType(value.type) for key, value in df.dtypes.items()
        }
        for stat_name in stat_names:
            column_stats = getattr(df, stat_name)(numeric_only=True)
            pandas_metadata[stat_name] = {
                str(key): float(value)
                for key, value in column_stats.to_dict().items()
            }

    return pandas_metadata

load(data_type: Type[Any]) -> Union[pd.DataFrame, pd.Series]

Reads pd.DataFrame or pd.Series from a .parquet or .csv file.

Parameters:

Name	Type	Description	Default
`data_type`	`Type[Any]`	The type of the data to read.	required

Raises:

Type	Description
`ImportError`	If the artifact is stored as a `.parquet` file and pyarrow is not installed.

Returns:

Type	Description
`Union[DataFrame, Series]`	The pandas dataframe or series.

Source code in src/zenml/integrations/pandas/materializers/pandas_materializer.py

def load(self, data_type: Type[Any]) -> Union[pd.DataFrame, pd.Series]:
    """Reads `pd.DataFrame` or `pd.Series` from a `.parquet` or `.csv` file.

    The `.parquet` file is preferred when it exists in the artifact store;
    otherwise the `.csv` file is read. When `data_type` is a `pd.Series`
    (sub)class, the single stored column is returned as a series.

    Args:
        data_type: The type of the data to read.

    Raises:
        ImportError: If the data is stored as a `.parquet` file and pyarrow
            is not installed.
        ValueError: If a `pd.Series` is requested but the stored data does
            not consist of exactly one column.

    Returns:
        The pandas dataframe or series.
    """
    if self.artifact_store.exists(self.parquet_path):
        if not self.pyarrow_exists:
            raise ImportError(
                "You have an old version of a `PandasMaterializer` "
                "data artifact stored in the artifact store "
                "as a `.parquet` file, which requires `pyarrow` "
                "for reading, You can install `pyarrow` by running "
                "'`pip install pyarrow fastparquet`'."
            )
        with self.artifact_store.open(self.parquet_path, mode="rb") as f:
            df = pd.read_parquet(f)
    else:
        with self.artifact_store.open(self.csv_path, mode="rb") as f:
            df = pd.read_csv(f, index_col=0, parse_dates=True)

    if not issubclass(data_type, pd.Series):
        return df

    # A series is stored as a one-column frame, so anything else means the
    # stored data does not represent a series. This is an explicit check
    # rather than an `assert`, which would be stripped under `python -O`
    # and silently return the first of several columns.
    if len(df.columns) != 1:
        raise ValueError(
            "Expected exactly one column when loading a `pd.Series`, "
            f"but the stored data has {len(df.columns)} columns."
        )
    return df[df.columns[0]]

save(df: Union[pd.DataFrame, pd.Series]) -> None

Writes a pandas dataframe or series to the artifact store, as a `.parquet` file if `pyarrow` is available and as a `.csv` file otherwise.

Parameters:

Name	Type	Description	Default
`df`	`Union[DataFrame, Series]`	The pandas dataframe or series to write.	required

Source code in src/zenml/integrations/pandas/materializers/pandas_materializer.py

def save(self, df: Union[pd.DataFrame, pd.Series]) -> None:
    """Write a pandas dataframe or series to the artifact store.

    A series is first converted to a one-column frame named "series". The
    data is written to `self.parquet_path` when `pyarrow` is available and
    to `self.csv_path` (including the index) otherwise.

    Args:
        df: The pandas dataframe or series to write.
    """
    frame = df.to_frame(name="series") if isinstance(df, pd.Series) else df

    if not self.pyarrow_exists:
        with self.artifact_store.open(self.csv_path, mode="wb") as csv_file:
            frame.to_csv(csv_file, index=True)
        return

    with self.artifact_store.open(
        self.parquet_path, mode="wb"
    ) as parquet_file:
        frame.to_parquet(parquet_file, compression=COMPRESSION_TYPE)

save_visualizations(df: Union[pd.DataFrame, pd.Series]) -> Dict[str, VisualizationType]

Save visualizations of the given pandas dataframe or series.

Creates two visualizations:

1. A statistical description of the data (using `df.describe()`).
2. A sample of the data (the first N rows, where N is controlled by `ZENML_PANDAS_SAMPLE_ROWS`).

Note

The number of sample rows shown can be controlled with the ZENML_PANDAS_SAMPLE_ROWS environment variable.

Parameters:

Name	Type	Description	Default
`df`	`Union[DataFrame, Series]`	The pandas dataframe or series to visualize.	required

Returns:

Type	Description
`Dict[str, VisualizationType]`	A dictionary of visualization URIs and their types.

Source code in src/zenml/integrations/pandas/materializers/pandas_materializer.py

def save_visualizations(
    self, df: Union[pd.DataFrame, pd.Series]
) -> Dict[str, VisualizationType]:
    """Save visualizations of the given pandas dataframe or series.

    Two CSV files are written below `self.uri`:
    1. `describe.csv`: a statistical description (`df.describe()`)
    2. `sample.csv`: the first N rows of the data

    Note:
        N is read from the ZENML_PANDAS_SAMPLE_ROWS environment variable
        and falls back to `DEFAULT_SAMPLE_ROWS` when it is not set.

    Args:
        df: The pandas dataframe or series to visualize.

    Returns:
        A dictionary of visualization URIs and their types.
    """

    def _visualization_uri(filename: str) -> str:
        # URIs always use forward slashes, also when built on Windows.
        return os.path.join(self.uri, filename).replace("\\", "/")

    visualizations = {}

    # First visualization: the statistical description.
    describe_uri = _visualization_uri("describe.csv")
    with self.artifact_store.open(describe_uri, mode="wb") as describe_file:
        df.describe().to_csv(describe_file)
    visualizations[describe_uri] = VisualizationType.CSV

    # Second visualization: a sample with a configurable number of rows.
    row_limit = int(
        os.environ.get("ZENML_PANDAS_SAMPLE_ROWS", DEFAULT_SAMPLE_ROWS)
    )
    first_rows = df.head(row_limit)
    sample_df = (
        first_rows.to_frame() if isinstance(df, pd.Series) else first_rows
    )

    sample_uri = _visualization_uri("sample.csv")
    with self.artifact_store.open(sample_uri, mode="wb") as sample_file:
        sample_df.to_csv(sample_file)
    visualizations[sample_uri] = VisualizationType.CSV

    return visualizations

Functions

Pandas

zenml.integrations.pandas

Attributes

PANDAS = 'pandas' module-attribute

Classes

Integration

Functions

activate() -> None classmethod

check_installation() -> bool classmethod

flavors() -> List[Type[Flavor]] classmethod

get_requirements(target_os: Optional[str] = None, python_version: Optional[str] = None) -> List[str] classmethod

get_uninstall_requirements(target_os: Optional[str] = None) -> List[str] classmethod

plugin_flavors() -> List[Type[BasePluginFlavor]] classmethod

PandasIntegration

Functions

activate() -> None classmethod

Modules

materializers

Classes

Modules

pandas_materializer

`zenml.integrations.pandas`

`PANDAS = 'pandas'` `module-attribute`

`Integration`

`activate() -> None` `classmethod`

`check_installation() -> bool` `classmethod`

`flavors() -> List[Type[Flavor]]` `classmethod`

`get_requirements(target_os: Optional[str] = None, python_version: Optional[str] = None) -> List[str]` `classmethod`

`get_uninstall_requirements(target_os: Optional[str] = None) -> List[str]` `classmethod`

`plugin_flavors() -> List[Type[BasePluginFlavor]]` `classmethod`

`PandasIntegration`

`activate() -> None` `classmethod`

`materializers`

`pandas_materializer`