Skip to content

Huggingface

zenml.integrations.huggingface special

Initialization of the Huggingface integration.

HuggingfaceIntegration (Integration)

Definition of Huggingface integration for ZenML.

Source code in zenml/integrations/huggingface/__init__.py
class HuggingfaceIntegration(Integration):
    """Definition of Huggingface integration for ZenML."""

    NAME = HUGGINGFACE
    REQUIREMENTS = ["transformers", "datasets"]

    @classmethod
    def activate(cls) -> None:
        """Activates the integration."""
        from zenml.integrations.huggingface import materializers  # noqa

activate() classmethod

Activates the integration.

Source code in zenml/integrations/huggingface/__init__.py
@classmethod
def activate(cls) -> None:
    """Activates the integration."""
    from zenml.integrations.huggingface import materializers  # noqa

materializers special

Initialization of Huggingface materializers.

huggingface_datasets_materializer

Implementation of the Huggingface datasets materializer.

HFDatasetMaterializer (BaseMaterializer)

Materializer to read data to and from huggingface datasets.

Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
class HFDatasetMaterializer(BaseMaterializer):
    """Materializer to read data to and from huggingface datasets."""

    ASSOCIATED_TYPES = (Dataset, DatasetDict)
    ASSOCIATED_ARTIFACT_TYPES = (DataArtifact,)

    def handle_input(self, data_type: Type[Any]) -> Dataset:
        """Reads Dataset.

        Args:
            data_type: The type of the dataset to read.

        Returns:
            The dataset read from the specified dir.
        """
        super().handle_input(data_type)
        temp_dir = mkdtemp()
        io_utils.copy_dir(
            os.path.join(self.artifact.uri, DEFAULT_DATASET_DIR),
            temp_dir,
        )
        return load_from_disk(temp_dir)

    def handle_return(self, ds: Type[Any]) -> None:
        """Writes a Dataset to the specified dir.

        Args:
            ds: The Dataset to write.
        """
        super().handle_return(ds)
        temp_dir = TemporaryDirectory()
        path = os.path.join(temp_dir.name, DEFAULT_DATASET_DIR)
        try:
            ds.save_to_disk(path)
            io_utils.copy_dir(
                path,
                os.path.join(self.artifact.uri, DEFAULT_DATASET_DIR),
            )
        finally:
            fileio.rmtree(temp_dir.name)
handle_input(self, data_type)

Reads Dataset.

Parameters:

Name Type Description Default
data_type Type[Any]

The type of the dataset to read.

required

Returns:

Type Description
Dataset

The dataset read from the specified dir.

Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
def handle_input(self, data_type: Type[Any]) -> Dataset:
    """Reads Dataset.

    Args:
        data_type: The type of the dataset to read.

    Returns:
        The dataset read from the specified dir.
    """
    super().handle_input(data_type)
    temp_dir = mkdtemp()
    io_utils.copy_dir(
        os.path.join(self.artifact.uri, DEFAULT_DATASET_DIR),
        temp_dir,
    )
    return load_from_disk(temp_dir)
handle_return(self, ds)

Writes a Dataset to the specified dir.

Parameters:

Name Type Description Default
ds Type[Any]

The Dataset to write.

required
Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
def handle_return(self, ds: Type[Any]) -> None:
    """Writes a Dataset to the specified dir.

    Args:
        ds: The Dataset to write.
    """
    super().handle_return(ds)
    temp_dir = TemporaryDirectory()
    path = os.path.join(temp_dir.name, DEFAULT_DATASET_DIR)
    try:
        ds.save_to_disk(path)
        io_utils.copy_dir(
            path,
            os.path.join(self.artifact.uri, DEFAULT_DATASET_DIR),
        )
    finally:
        fileio.rmtree(temp_dir.name)

huggingface_pt_model_materializer

Implementation of the Huggingface PyTorch model materializer.

HFPTModelMaterializer (BaseMaterializer)

Materializer to read torch model to and from huggingface pretrained model.

Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
class HFPTModelMaterializer(BaseMaterializer):
    """Materializer to read torch model to and from huggingface pretrained model."""

    ASSOCIATED_TYPES = (PreTrainedModel,)
    ASSOCIATED_ARTIFACT_TYPES = (ModelArtifact,)

    def handle_input(self, data_type: Type[Any]) -> PreTrainedModel:
        """Reads HFModel.

        Args:
            data_type: The type of the model to read.

        Returns:
            The model read from the specified dir.
        """
        super().handle_input(data_type)

        config = AutoConfig.from_pretrained(
            os.path.join(self.artifact.uri, DEFAULT_PT_MODEL_DIR)
        )
        architecture = config.architectures[0]
        model_cls = getattr(
            importlib.import_module("transformers"), architecture
        )
        return model_cls.from_pretrained(
            os.path.join(self.artifact.uri, DEFAULT_PT_MODEL_DIR)
        )

    def handle_return(self, model: Type[Any]) -> None:
        """Writes a Model to the specified dir.

        Args:
            model: The Torch Model to write.
        """
        super().handle_return(model)
        temp_dir = TemporaryDirectory()
        model.save_pretrained(temp_dir.name)
        io_utils.copy_dir(
            temp_dir.name,
            os.path.join(self.artifact.uri, DEFAULT_PT_MODEL_DIR),
        )
handle_input(self, data_type)

Reads HFModel.

Parameters:

Name Type Description Default
data_type Type[Any]

The type of the model to read.

required

Returns:

Type Description
PreTrainedModel

The model read from the specified dir.

Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
def handle_input(self, data_type: Type[Any]) -> PreTrainedModel:
    """Reads HFModel.

    Args:
        data_type: The type of the model to read.

    Returns:
        The model read from the specified dir.
    """
    super().handle_input(data_type)

    config = AutoConfig.from_pretrained(
        os.path.join(self.artifact.uri, DEFAULT_PT_MODEL_DIR)
    )
    architecture = config.architectures[0]
    model_cls = getattr(
        importlib.import_module("transformers"), architecture
    )
    return model_cls.from_pretrained(
        os.path.join(self.artifact.uri, DEFAULT_PT_MODEL_DIR)
    )
handle_return(self, model)

Writes a Model to the specified dir.

Parameters:

Name Type Description Default
model Type[Any]

The Torch Model to write.

required
Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
def handle_return(self, model: Type[Any]) -> None:
    """Writes a Model to the specified dir.

    Args:
        model: The Torch Model to write.
    """
    super().handle_return(model)
    temp_dir = TemporaryDirectory()
    model.save_pretrained(temp_dir.name)
    io_utils.copy_dir(
        temp_dir.name,
        os.path.join(self.artifact.uri, DEFAULT_PT_MODEL_DIR),
    )

huggingface_tf_model_materializer

Implementation of the Huggingface TF model materializer.

HFTFModelMaterializer (BaseMaterializer)

Materializer to read Tensorflow model to and from huggingface pretrained model.

Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
class HFTFModelMaterializer(BaseMaterializer):
    """Materializer to read Tensorflow model to and from huggingface pretrained model."""

    ASSOCIATED_TYPES = (TFPreTrainedModel,)
    ASSOCIATED_ARTIFACT_TYPES = (ModelArtifact,)

    def handle_input(self, data_type: Type[Any]) -> TFPreTrainedModel:
        """Reads HFModel.

        Args:
            data_type: The type of the model to read.

        Returns:
            The model read from the specified dir.
        """
        super().handle_input(data_type)

        config = AutoConfig.from_pretrained(
            os.path.join(self.artifact.uri, DEFAULT_TF_MODEL_DIR)
        )
        architecture = "TF" + config.architectures[0]
        model_cls = getattr(
            importlib.import_module("transformers"), architecture
        )
        return model_cls.from_pretrained(
            os.path.join(self.artifact.uri, DEFAULT_TF_MODEL_DIR)
        )

    def handle_return(self, model: Type[Any]) -> None:
        """Writes a Model to the specified dir.

        Args:
            model: The TF Model to write.
        """
        super().handle_return(model)
        temp_dir = TemporaryDirectory()
        model.save_pretrained(temp_dir.name)
        io_utils.copy_dir(
            temp_dir.name,
            os.path.join(self.artifact.uri, DEFAULT_TF_MODEL_DIR),
        )
handle_input(self, data_type)

Reads HFModel.

Parameters:

Name Type Description Default
data_type Type[Any]

The type of the model to read.

required

Returns:

Type Description
TFPreTrainedModel

The model read from the specified dir.

Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
def handle_input(self, data_type: Type[Any]) -> TFPreTrainedModel:
    """Reads HFModel.

    Args:
        data_type: The type of the model to read.

    Returns:
        The model read from the specified dir.
    """
    super().handle_input(data_type)

    config = AutoConfig.from_pretrained(
        os.path.join(self.artifact.uri, DEFAULT_TF_MODEL_DIR)
    )
    architecture = "TF" + config.architectures[0]
    model_cls = getattr(
        importlib.import_module("transformers"), architecture
    )
    return model_cls.from_pretrained(
        os.path.join(self.artifact.uri, DEFAULT_TF_MODEL_DIR)
    )
handle_return(self, model)

Writes a Model to the specified dir.

Parameters:

Name Type Description Default
model Type[Any]

The TF Model to write.

required
Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
def handle_return(self, model: Type[Any]) -> None:
    """Writes a Model to the specified dir.

    Args:
        model: The TF Model to write.
    """
    super().handle_return(model)
    temp_dir = TemporaryDirectory()
    model.save_pretrained(temp_dir.name)
    io_utils.copy_dir(
        temp_dir.name,
        os.path.join(self.artifact.uri, DEFAULT_TF_MODEL_DIR),
    )

huggingface_tokenizer_materializer

Implementation of the Huggingface tokenizer materializer.

HFTokenizerMaterializer (BaseMaterializer)

Materializer to read tokenizer to and from huggingface tokenizer.

Source code in zenml/integrations/huggingface/materializers/huggingface_tokenizer_materializer.py
class HFTokenizerMaterializer(BaseMaterializer):
    """Materializer to read tokenizer to and from huggingface tokenizer."""

    ASSOCIATED_TYPES = (PreTrainedTokenizerBase,)
    ASSOCIATED_ARTIFACT_TYPES = (ModelArtifact,)

    def handle_input(self, data_type: Type[Any]) -> PreTrainedTokenizerBase:
        """Reads Tokenizer.

        Args:
            data_type: The type of the tokenizer to read.

        Returns:
            The tokenizer read from the specified dir.
        """
        super().handle_input(data_type)

        return AutoTokenizer.from_pretrained(
            os.path.join(self.artifact.uri, DEFAULT_TOKENIZER_DIR)
        )

    def handle_return(self, tokenizer: Type[Any]) -> None:
        """Writes a Tokenizer to the specified dir.

        Args:
            tokenizer: The HFTokenizer to write.
        """
        super().handle_return(tokenizer)
        temp_dir = TemporaryDirectory()
        tokenizer.save_pretrained(temp_dir.name)
        io_utils.copy_dir(
            temp_dir.name,
            os.path.join(self.artifact.uri, DEFAULT_TOKENIZER_DIR),
        )
handle_input(self, data_type)

Reads Tokenizer.

Parameters:

Name Type Description Default
data_type Type[Any]

The type of the tokenizer to read.

required

Returns:

Type Description
PreTrainedTokenizerBase

The tokenizer read from the specified dir.

Source code in zenml/integrations/huggingface/materializers/huggingface_tokenizer_materializer.py
def handle_input(self, data_type: Type[Any]) -> PreTrainedTokenizerBase:
    """Reads Tokenizer.

    Args:
        data_type: The type of the tokenizer to read.

    Returns:
        The tokenizer read from the specified dir.
    """
    super().handle_input(data_type)

    return AutoTokenizer.from_pretrained(
        os.path.join(self.artifact.uri, DEFAULT_TOKENIZER_DIR)
    )
handle_return(self, tokenizer)

Writes a Tokenizer to the specified dir.

Parameters:

Name Type Description Default
tokenizer Type[Any]

The HFTokenizer to write.

required
Source code in zenml/integrations/huggingface/materializers/huggingface_tokenizer_materializer.py
def handle_return(self, tokenizer: Type[Any]) -> None:
    """Writes a Tokenizer to the specified dir.

    Args:
        tokenizer: The HFTokenizer to write.
    """
    super().handle_return(tokenizer)
    temp_dir = TemporaryDirectory()
    tokenizer.save_pretrained(temp_dir.name)
    io_utils.copy_dir(
        temp_dir.name,
        os.path.join(self.artifact.uri, DEFAULT_TOKENIZER_DIR),
    )