Huggingface

zenml.integrations.huggingface special

Initialization of the Huggingface integration.

HuggingfaceIntegration (Integration)

Definition of Huggingface integration for ZenML.

Source code in zenml/integrations/huggingface/__init__.py
class HuggingfaceIntegration(Integration):
    """Definition of Huggingface integration for ZenML."""

    NAME = HUGGINGFACE
    REQUIREMENTS = ["transformers<=4.31", "datasets"]

    @classmethod
    def activate(cls) -> None:
        """Activates the integration."""
        from zenml.integrations.huggingface import materializers  # noqa

activate() classmethod

Activates the integration.

Source code in zenml/integrations/huggingface/__init__.py
@classmethod
def activate(cls) -> None:
    """Activates the integration."""
    from zenml.integrations.huggingface import materializers  # noqa
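
The REQUIREMENTS pin above means installing the integration also pulls in transformers (at most 4.31) and datasets; the usual route is the CLI command zenml integration install huggingface. A minimal Python sketch of activating it by hand, assuming ZenML's Integration base class exposes check_installation() to verify that the pinned requirements are importable:

from zenml.integrations.huggingface import HuggingfaceIntegration

# Verify the pinned requirements, then register the materializers below
# (activate() imports the materializers module, which registers them).
if HuggingfaceIntegration.check_installation():
    HuggingfaceIntegration.activate()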

materializers special

Initialization of Huggingface materializers.

huggingface_datasets_materializer

Implementation of the Huggingface datasets materializer.

HFDatasetMaterializer (BaseMaterializer)

Materializer to read and write Huggingface datasets.

Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
class HFDatasetMaterializer(BaseMaterializer):
    """Materializer to read data to and from huggingface datasets."""

    ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (Dataset, DatasetDict)
    ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = (
        ArtifactType.DATA_ANALYSIS
    )

    def load(
        self, data_type: Union[Type[Dataset], Type[DatasetDict]]
    ) -> Union[Dataset, DatasetDict]:
        """Reads Dataset.

        Args:
            data_type: The type of the dataset to read.

        Returns:
            The dataset read from the specified dir.
        """
        temp_dir = mkdtemp()
        io_utils.copy_dir(
            os.path.join(self.uri, DEFAULT_DATASET_DIR),
            temp_dir,
        )
        return load_from_disk(temp_dir)

    def save(self, ds: Union[Dataset, DatasetDict]) -> None:
        """Writes a Dataset to the specified dir.

        Args:
            ds: The Dataset to write.
        """
        temp_dir = TemporaryDirectory()
        path = os.path.join(temp_dir.name, DEFAULT_DATASET_DIR)
        try:
            ds.save_to_disk(path)
            io_utils.copy_dir(
                path,
                os.path.join(self.uri, DEFAULT_DATASET_DIR),
            )
        finally:
            fileio.rmtree(temp_dir.name)

    def extract_metadata(
        self, ds: Union[Dataset, DatasetDict]
    ) -> Dict[str, "MetadataType"]:
        """Extract metadata from the given `Dataset` object.

        Args:
            ds: The `Dataset` object to extract metadata from.

        Returns:
            The extracted metadata as a dictionary.

        Raises:
            ValueError: If the given object is not a `Dataset` or `DatasetDict`.
        """
        pandas_materializer = PandasMaterializer(self.uri)
        if isinstance(ds, Dataset):
            return pandas_materializer.extract_metadata(ds.to_pandas())
        elif isinstance(ds, DatasetDict):
            metadata: Dict[str, Dict[str, "MetadataType"]] = defaultdict(dict)
            for dataset_name, dataset in ds.items():
                dataset_metadata = pandas_materializer.extract_metadata(
                    dataset.to_pandas()
                )
                for key, value in dataset_metadata.items():
                    metadata[key][dataset_name] = value
            return dict(metadata)
        raise ValueError(f"Unsupported type {type(ds)}")
extract_metadata(self, ds)

Extract metadata from the given Dataset object.

Parameters:

    ds (Union[datasets.Dataset, datasets.dataset_dict.DatasetDict], required):
        The Dataset object to extract metadata from.

Returns:

    Dict[str, MetadataType]: The extracted metadata as a dictionary.

Exceptions:

    ValueError: If the given object is not a Dataset or DatasetDict.

Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
def extract_metadata(
    self, ds: Union[Dataset, DatasetDict]
) -> Dict[str, "MetadataType"]:
    """Extract metadata from the given `Dataset` object.

    Args:
        ds: The `Dataset` object to extract metadata from.

    Returns:
        The extracted metadata as a dictionary.

    Raises:
        ValueError: If the given object is not a `Dataset` or `DatasetDict`.
    """
    pandas_materializer = PandasMaterializer(self.uri)
    if isinstance(ds, Dataset):
        return pandas_materializer.extract_metadata(ds.to_pandas())
    elif isinstance(ds, DatasetDict):
        metadata: Dict[str, Dict[str, "MetadataType"]] = defaultdict(dict)
        for dataset_name, dataset in ds.items():
            dataset_metadata = pandas_materializer.extract_metadata(
                dataset.to_pandas()
            )
            for key, value in dataset_metadata.items():
                metadata[key][dataset_name] = value
        return dict(metadata)
    raise ValueError(f"Unsupported type {type(ds)}")
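
Note the inversion in the DatasetDict branch: outer keys are metadata field names, inner keys are split names. A hypothetical result for a DatasetDict with train and test splits (the field names are illustrative; the real ones come from PandasMaterializer.extract_metadata()):

# Hypothetical shape of the returned metadata dictionary
{
    "shape": {"train": (8000, 2), "test": (2000, 2)},
    "size": {"train": 16000, "test": 4000},
}
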
load(self, data_type)

Reads Dataset.

Parameters:

    data_type (Union[Type[datasets.Dataset], Type[datasets.dataset_dict.DatasetDict]], required):
        The type of the dataset to read.

Returns:

    Union[datasets.Dataset, datasets.dataset_dict.DatasetDict]: The dataset read from the specified dir.

Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
def load(
    self, data_type: Union[Type[Dataset], Type[DatasetDict]]
) -> Union[Dataset, DatasetDict]:
    """Reads Dataset.

    Args:
        data_type: The type of the dataset to read.

    Returns:
        The dataset read from the specified dir.
    """
    temp_dir = mkdtemp()
    io_utils.copy_dir(
        os.path.join(self.uri, DEFAULT_DATASET_DIR),
        temp_dir,
    )
    return load_from_disk(temp_dir)
save(self, ds)

Writes a Dataset to the specified dir.

Parameters:

    ds (Union[datasets.Dataset, datasets.dataset_dict.DatasetDict], required):
        The Dataset to write.

Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
def save(self, ds: Union[Dataset, DatasetDict]) -> None:
    """Writes a Dataset to the specified dir.

    Args:
        ds: The Dataset to write.
    """
    temp_dir = TemporaryDirectory()
    path = os.path.join(temp_dir.name, DEFAULT_DATASET_DIR)
    try:
        ds.save_to_disk(path)
        io_utils.copy_dir(
            path,
            os.path.join(self.uri, DEFAULT_DATASET_DIR),
        )
    finally:
        fileio.rmtree(temp_dir.name)
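
Because Dataset and DatasetDict appear in ASSOCIATED_TYPES, any step output of those types is routed through this materializer automatically. A minimal sketch, assuming a recent ZenML release where step and pipeline are importable from the top-level package:

from datasets import Dataset
from zenml import pipeline, step


@step
def create_dataset() -> Dataset:
    # Saved to the artifact store via HFDatasetMaterializer.save()
    return Dataset.from_dict({"text": ["hello", "world"], "label": [0, 1]})


@step
def consume_dataset(ds: Dataset) -> None:
    # Loaded back via HFDatasetMaterializer.load()
    print(ds.num_rows)


@pipeline
def dataset_pipeline():
    consume_dataset(create_dataset())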

huggingface_pt_model_materializer

Implementation of the Huggingface PyTorch model materializer.

HFPTModelMaterializer (BaseMaterializer)

Materializer to read and write Huggingface PyTorch pretrained models.

Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
class HFPTModelMaterializer(BaseMaterializer):
    """Materializer to read torch model to and from huggingface pretrained model."""

    ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (PreTrainedModel,)
    ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = ArtifactType.MODEL

    def load(self, data_type: Type[PreTrainedModel]) -> PreTrainedModel:
        """Reads HFModel.

        Args:
            data_type: The type of the model to read.

        Returns:
            The model read from the specified dir.
        """
        temp_dir = TemporaryDirectory()
        io_utils.copy_dir(
            os.path.join(self.uri, DEFAULT_PT_MODEL_DIR), temp_dir.name
        )

        config = AutoConfig.from_pretrained(temp_dir.name)
        architecture = config.architectures[0]
        model_cls = getattr(
            importlib.import_module("transformers"), architecture
        )
        return model_cls.from_pretrained(temp_dir.name)

    def save(self, model: PreTrainedModel) -> None:
        """Writes a Model to the specified dir.

        Args:
            model: The Torch Model to write.
        """
        temp_dir = TemporaryDirectory()
        model.save_pretrained(temp_dir.name)
        io_utils.copy_dir(
            temp_dir.name,
            os.path.join(self.uri, DEFAULT_PT_MODEL_DIR),
        )

    def extract_metadata(
        self, model: PreTrainedModel
    ) -> Dict[str, "MetadataType"]:
        """Extract metadata from the given `PreTrainedModel` object.

        Args:
            model: The `PreTrainedModel` object to extract metadata from.

        Returns:
            The extracted metadata as a dictionary.
        """
        from zenml.integrations.pytorch.utils import count_module_params

        module_param_metadata = count_module_params(model)
        return {
            **module_param_metadata,
            "dtype": DType(str(model.dtype)),
            "device": str(model.device),
        }
extract_metadata(self, model)

Extract metadata from the given PreTrainedModel object.

Parameters:

    model (transformers.PreTrainedModel, required):
        The PreTrainedModel object to extract metadata from.

Returns:

    Dict[str, MetadataType]: The extracted metadata as a dictionary.

Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
def extract_metadata(
    self, model: PreTrainedModel
) -> Dict[str, "MetadataType"]:
    """Extract metadata from the given `PreTrainedModel` object.

    Args:
        model: The `PreTrainedModel` object to extract metadata from.

    Returns:
        The extracted metadata as a dictionary.
    """
    from zenml.integrations.pytorch.utils import count_module_params

    module_param_metadata = count_module_params(model)
    return {
        **module_param_metadata,
        "dtype": DType(str(model.dtype)),
        "device": str(model.device),
    }
load(self, data_type)

Reads HFModel.

Parameters:

    data_type (Type[transformers.PreTrainedModel], required):
        The type of the model to read.

Returns:

    transformers.PreTrainedModel: The model read from the specified dir.

Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
def load(self, data_type: Type[PreTrainedModel]) -> PreTrainedModel:
    """Reads HFModel.

    Args:
        data_type: The type of the model to read.

    Returns:
        The model read from the specified dir.
    """
    temp_dir = TemporaryDirectory()
    io_utils.copy_dir(
        os.path.join(self.uri, DEFAULT_PT_MODEL_DIR), temp_dir.name
    )

    config = AutoConfig.from_pretrained(temp_dir.name)
    architecture = config.architectures[0]
    model_cls = getattr(
        importlib.import_module("transformers"), architecture
    )
    return model_cls.from_pretrained(temp_dir.name)
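
load() deliberately avoids hard-coding a model class: it reads the architecture name recorded in the saved config and resolves the matching class from transformers by name. The same pattern in isolation (checkpoint name illustrative):

import importlib

from transformers import AutoConfig

config = AutoConfig.from_pretrained("distilbert-base-uncased")
architecture = config.architectures[0]  # e.g. "DistilBertForMaskedLM"
model_cls = getattr(importlib.import_module("transformers"), architecture)
model = model_cls.from_pretrained("distilbert-base-uncased")
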
save(self, model)

Writes a Model to the specified dir.

Parameters:

    model (transformers.PreTrainedModel, required):
        The Torch Model to write.

Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
def save(self, model: PreTrainedModel) -> None:
    """Writes a Model to the specified dir.

    Args:
        model: The Torch Model to write.
    """
    temp_dir = TemporaryDirectory()
    model.save_pretrained(temp_dir.name)
    io_utils.copy_dir(
        temp_dir.name,
        os.path.join(self.uri, DEFAULT_PT_MODEL_DIR),
    )
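
In a pipeline, returning any PreTrainedModel subclass from a step is enough for ZenML to select this materializer. A minimal sketch (model name and label count illustrative):

from transformers import AutoModelForSequenceClassification, PreTrainedModel
from zenml import step


@step
def load_pretrained_model() -> PreTrainedModel:
    # Saved via HFPTModelMaterializer.save(); dtype, device and parameter
    # counts are attached as artifact metadata by extract_metadata()
    return AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2
    )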

huggingface_tf_model_materializer

Implementation of the Huggingface TF model materializer.

HFTFModelMaterializer (BaseMaterializer)

Materializer to read and write Huggingface TensorFlow pretrained models.

Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
class HFTFModelMaterializer(BaseMaterializer):
    """Materializer to read Tensorflow model to and from huggingface pretrained model."""

    ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (TFPreTrainedModel,)
    ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = ArtifactType.MODEL

    def load(self, data_type: Type[TFPreTrainedModel]) -> TFPreTrainedModel:
        """Reads HFModel.

        Args:
            data_type: The type of the model to read.

        Returns:
            The model read from the specified dir.
        """
        temp_dir = TemporaryDirectory()
        io_utils.copy_dir(
            os.path.join(self.uri, DEFAULT_TF_MODEL_DIR), temp_dir.name
        )

        config = AutoConfig.from_pretrained(temp_dir.name)
        architecture = "TF" + config.architectures[0]
        model_cls = getattr(
            importlib.import_module("transformers"), architecture
        )
        return model_cls.from_pretrained(temp_dir.name)

    def save(self, model: TFPreTrainedModel) -> None:
        """Writes a Model to the specified dir.

        Args:
            model: The TF Model to write.
        """
        temp_dir = TemporaryDirectory()
        model.save_pretrained(temp_dir.name)
        io_utils.copy_dir(
            temp_dir.name,
            os.path.join(self.uri, DEFAULT_TF_MODEL_DIR),
        )

    def extract_metadata(
        self, model: TFPreTrainedModel
    ) -> Dict[str, "MetadataType"]:
        """Extract metadata from the given `PreTrainedModel` object.

        Args:
            model: The `PreTrainedModel` object to extract metadata from.

        Returns:
            The extracted metadata as a dictionary.
        """
        return {
            "num_layers": len(model.layers),
            "num_params": model.num_parameters(only_trainable=False),
            "num_trainable_params": model.num_parameters(only_trainable=True),
        }
extract_metadata(self, model)

Extract metadata from the given TFPreTrainedModel object.

Parameters:

    model (transformers.TFPreTrainedModel, required):
        The TFPreTrainedModel object to extract metadata from.

Returns:

    Dict[str, MetadataType]: The extracted metadata as a dictionary.

Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
def extract_metadata(
    self, model: TFPreTrainedModel
) -> Dict[str, "MetadataType"]:
    """Extract metadata from the given `PreTrainedModel` object.

    Args:
        model: The `PreTrainedModel` object to extract metadata from.

    Returns:
        The extracted metadata as a dictionary.
    """
    return {
        "num_layers": len(model.layers),
        "num_params": model.num_parameters(only_trainable=False),
        "num_trainable_params": model.num_parameters(only_trainable=True),
    }
load(self, data_type)

Reads HFModel.

Parameters:

    data_type (Type[transformers.TFPreTrainedModel], required):
        The type of the model to read.

Returns:

    transformers.TFPreTrainedModel: The model read from the specified dir.

Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
def load(self, data_type: Type[TFPreTrainedModel]) -> TFPreTrainedModel:
    """Reads HFModel.

    Args:
        data_type: The type of the model to read.

    Returns:
        The model read from the specified dir.
    """
    temp_dir = TemporaryDirectory()
    io_utils.copy_dir(
        os.path.join(self.uri, DEFAULT_TF_MODEL_DIR), temp_dir.name
    )

    config = AutoConfig.from_pretrained(temp_dir.name)
    architecture = "TF" + config.architectures[0]
    model_cls = getattr(
        importlib.import_module("transformers"), architecture
    )
    return model_cls.from_pretrained(temp_dir.name)
save(self, model)

Writes a Model to the specified dir.

Parameters:

    model (transformers.TFPreTrainedModel, required):
        The TF Model to write.

Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
def save(self, model: TFPreTrainedModel) -> None:
    """Writes a Model to the specified dir.

    Args:
        model: The TF Model to write.
    """
    temp_dir = TemporaryDirectory()
    model.save_pretrained(temp_dir.name)
    io_utils.copy_dir(
        temp_dir.name,
        os.path.join(self.uri, DEFAULT_TF_MODEL_DIR),
    )
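
Usage mirrors the PyTorch materializer, with one wrinkle: load() prepends "TF" to the architecture stored in the config, so a checkpoint saved as BertForSequenceClassification is reloaded as TFBertForSequenceClassification. A minimal step sketch (model name illustrative):

from transformers import (
    TFAutoModelForSequenceClassification,
    TFPreTrainedModel,
)
from zenml import step


@step
def load_tf_model() -> TFPreTrainedModel:
    # Saved via HFTFModelMaterializer.save()
    return TFAutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2
    )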

huggingface_tokenizer_materializer

Implementation of the Huggingface tokenizer materializer.

HFTokenizerMaterializer (BaseMaterializer)

Materializer to read and write Huggingface tokenizers.

Source code in zenml/integrations/huggingface/materializers/huggingface_tokenizer_materializer.py
class HFTokenizerMaterializer(BaseMaterializer):
    """Materializer to read tokenizer to and from huggingface tokenizer."""

    ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (
        PreTrainedTokenizerBase,
    )
    ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = ArtifactType.MODEL

    def load(self, data_type: Type[Any]) -> PreTrainedTokenizerBase:
        """Reads Tokenizer.

        Args:
            data_type: The type of the tokenizer to read.

        Returns:
            The tokenizer read from the specified dir.
        """
        temp_dir = TemporaryDirectory()
        io_utils.copy_dir(
            os.path.join(self.uri, DEFAULT_TOKENIZER_DIR), temp_dir.name
        )

        return AutoTokenizer.from_pretrained(temp_dir.name)

    def save(self, tokenizer: Type[Any]) -> None:
        """Writes a Tokenizer to the specified dir.

        Args:
            tokenizer: The HFTokenizer to write.
        """
        temp_dir = TemporaryDirectory()
        tokenizer.save_pretrained(temp_dir.name)
        io_utils.copy_dir(
            temp_dir.name,
            os.path.join(self.uri, DEFAULT_TOKENIZER_DIR),
        )
load(self, data_type)

Reads Tokenizer.

Parameters:

    data_type (Type[Any], required):
        The type of the tokenizer to read.

Returns:

    transformers.tokenization_utils_base.PreTrainedTokenizerBase: The tokenizer read from the specified dir.

Source code in zenml/integrations/huggingface/materializers/huggingface_tokenizer_materializer.py
def load(self, data_type: Type[Any]) -> PreTrainedTokenizerBase:
    """Reads Tokenizer.

    Args:
        data_type: The type of the tokenizer to read.

    Returns:
        The tokenizer read from the specified dir.
    """
    temp_dir = TemporaryDirectory()
    io_utils.copy_dir(
        os.path.join(self.uri, DEFAULT_TOKENIZER_DIR), temp_dir.name
    )

    return AutoTokenizer.from_pretrained(temp_dir.name)
save(self, tokenizer)

Writes a Tokenizer to the specified dir.

Parameters:

    tokenizer (Type[Any], required):
        The HFTokenizer to write.

Source code in zenml/integrations/huggingface/materializers/huggingface_tokenizer_materializer.py
def save(self, tokenizer: Type[Any]) -> None:
    """Writes a Tokenizer to the specified dir.

    Args:
        tokenizer: The HFTokenizer to write.
    """
    temp_dir = TemporaryDirectory()
    tokenizer.save_pretrained(temp_dir.name)
    io_utils.copy_dir(
        temp_dir.name,
        os.path.join(self.uri, DEFAULT_TOKENIZER_DIR),
    )
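
Since ASSOCIATED_TYPES is PreTrainedTokenizerBase, both fast and slow tokenizers match, and load() always reconstructs them through AutoTokenizer. A minimal sketch (model name illustrative):

from transformers import AutoTokenizer, PreTrainedTokenizerBase
from zenml import step


@step
def load_tokenizer() -> PreTrainedTokenizerBase:
    # Saved via HFTokenizerMaterializer.save(), reloaded with AutoTokenizer
    return AutoTokenizer.from_pretrained("distilbert-base-uncased")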