Huggingface
zenml.integrations.huggingface
special
Initialization of the Huggingface integration.
HuggingfaceIntegration (Integration)
Definition of Huggingface integration for ZenML.
Source code in zenml/integrations/huggingface/__init__.py
class HuggingfaceIntegration(Integration):
"""Definition of Huggingface integration for ZenML."""
NAME = HUGGINGFACE
REQUIREMENTS = ["transformers<4.30", "datasets"]
@classmethod
def activate(cls) -> None:
"""Activates the integration."""
from zenml.integrations.huggingface import materializers # noqa
activate()
classmethod
Activates the integration.
Source code in zenml/integrations/huggingface/__init__.py
@classmethod
def activate(cls) -> None:
"""Activates the integration."""
from zenml.integrations.huggingface import materializers # noqa
materializers
special
Initialization of Huggingface materializers.
huggingface_datasets_materializer
Implementation of the Huggingface datasets materializer.
HFDatasetMaterializer (BaseMaterializer)
Materializer to read data to and from huggingface datasets.
Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
class HFDatasetMaterializer(BaseMaterializer):
"""Materializer to read data to and from huggingface datasets."""
ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (Dataset, DatasetDict)
ASSOCIATED_ARTIFACT_TYPE: ClassVar[
ArtifactType
] = ArtifactType.DATA_ANALYSIS
def load(
self, data_type: Union[Type[Dataset], Type[DatasetDict]]
) -> Union[Dataset, DatasetDict]:
"""Reads Dataset.
Args:
data_type: The type of the dataset to read.
Returns:
The dataset read from the specified dir.
"""
temp_dir = mkdtemp()
io_utils.copy_dir(
os.path.join(self.uri, DEFAULT_DATASET_DIR),
temp_dir,
)
return load_from_disk(temp_dir)
def save(self, ds: Union[Dataset, DatasetDict]) -> None:
"""Writes a Dataset to the specified dir.
Args:
ds: The Dataset to write.
"""
temp_dir = TemporaryDirectory()
path = os.path.join(temp_dir.name, DEFAULT_DATASET_DIR)
try:
ds.save_to_disk(path)
io_utils.copy_dir(
path,
os.path.join(self.uri, DEFAULT_DATASET_DIR),
)
finally:
fileio.rmtree(temp_dir.name)
def extract_metadata(
self, ds: Union[Dataset, DatasetDict]
) -> Dict[str, "MetadataType"]:
"""Extract metadata from the given `Dataset` object.
Args:
ds: The `Dataset` object to extract metadata from.
Returns:
The extracted metadata as a dictionary.
Raises:
ValueError: If the given object is not a `Dataset` or `DatasetDict`.
"""
pandas_materializer = PandasMaterializer(self.uri)
if isinstance(ds, Dataset):
return pandas_materializer.extract_metadata(ds.to_pandas())
elif isinstance(ds, DatasetDict):
metadata: Dict[str, Dict[str, "MetadataType"]] = defaultdict(dict)
for dataset_name, dataset in ds.items():
dataset_metadata = pandas_materializer.extract_metadata(
dataset.to_pandas()
)
for key, value in dataset_metadata.items():
metadata[key][dataset_name] = value
return dict(metadata)
raise ValueError(f"Unsupported type {type(ds)}")
extract_metadata(self, ds)
Extract metadata from the given Dataset
object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
ds |
Union[datasets.arrow_dataset.Dataset, datasets.dataset_dict.DatasetDict] |
The |
required |
Returns:
Type | Description |
---|---|
Dict[str, MetadataType] |
The extracted metadata as a dictionary. |
Exceptions:
Type | Description |
---|---|
ValueError |
If the given object is not a |
Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
def extract_metadata(
self, ds: Union[Dataset, DatasetDict]
) -> Dict[str, "MetadataType"]:
"""Extract metadata from the given `Dataset` object.
Args:
ds: The `Dataset` object to extract metadata from.
Returns:
The extracted metadata as a dictionary.
Raises:
ValueError: If the given object is not a `Dataset` or `DatasetDict`.
"""
pandas_materializer = PandasMaterializer(self.uri)
if isinstance(ds, Dataset):
return pandas_materializer.extract_metadata(ds.to_pandas())
elif isinstance(ds, DatasetDict):
metadata: Dict[str, Dict[str, "MetadataType"]] = defaultdict(dict)
for dataset_name, dataset in ds.items():
dataset_metadata = pandas_materializer.extract_metadata(
dataset.to_pandas()
)
for key, value in dataset_metadata.items():
metadata[key][dataset_name] = value
return dict(metadata)
raise ValueError(f"Unsupported type {type(ds)}")
load(self, data_type)
Reads Dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data_type |
Union[Type[datasets.arrow_dataset.Dataset], Type[datasets.dataset_dict.DatasetDict]] |
The type of the dataset to read. |
required |
Returns:
Type | Description |
---|---|
Union[datasets.arrow_dataset.Dataset, datasets.dataset_dict.DatasetDict] |
The dataset read from the specified dir. |
Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
def load(
self, data_type: Union[Type[Dataset], Type[DatasetDict]]
) -> Union[Dataset, DatasetDict]:
"""Reads Dataset.
Args:
data_type: The type of the dataset to read.
Returns:
The dataset read from the specified dir.
"""
temp_dir = mkdtemp()
io_utils.copy_dir(
os.path.join(self.uri, DEFAULT_DATASET_DIR),
temp_dir,
)
return load_from_disk(temp_dir)
save(self, ds)
Writes a Dataset to the specified dir.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
ds |
Union[datasets.arrow_dataset.Dataset, datasets.dataset_dict.DatasetDict] |
The Dataset to write. |
required |
Source code in zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py
def save(self, ds: Union[Dataset, DatasetDict]) -> None:
"""Writes a Dataset to the specified dir.
Args:
ds: The Dataset to write.
"""
temp_dir = TemporaryDirectory()
path = os.path.join(temp_dir.name, DEFAULT_DATASET_DIR)
try:
ds.save_to_disk(path)
io_utils.copy_dir(
path,
os.path.join(self.uri, DEFAULT_DATASET_DIR),
)
finally:
fileio.rmtree(temp_dir.name)
huggingface_pt_model_materializer
Implementation of the Huggingface PyTorch model materializer.
HFPTModelMaterializer (BaseMaterializer)
Materializer to read torch model to and from huggingface pretrained model.
Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
class HFPTModelMaterializer(BaseMaterializer):
"""Materializer to read torch model to and from huggingface pretrained model."""
ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (PreTrainedModel,)
ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = ArtifactType.MODEL
def load(self, data_type: Type[PreTrainedModel]) -> PreTrainedModel:
"""Reads HFModel.
Args:
data_type: The type of the model to read.
Returns:
The model read from the specified dir.
"""
temp_dir = TemporaryDirectory()
io_utils.copy_dir(
os.path.join(self.uri, DEFAULT_PT_MODEL_DIR), temp_dir.name
)
config = AutoConfig.from_pretrained(temp_dir.name)
architecture = config.architectures[0]
model_cls = getattr(
importlib.import_module("transformers"), architecture
)
return model_cls.from_pretrained(temp_dir.name)
def save(self, model: PreTrainedModel) -> None:
"""Writes a Model to the specified dir.
Args:
model: The Torch Model to write.
"""
temp_dir = TemporaryDirectory()
model.save_pretrained(temp_dir.name)
io_utils.copy_dir(
temp_dir.name,
os.path.join(self.uri, DEFAULT_PT_MODEL_DIR),
)
def extract_metadata(
self, model: PreTrainedModel
) -> Dict[str, "MetadataType"]:
"""Extract metadata from the given `PreTrainedModel` object.
Args:
model: The `PreTrainedModel` object to extract metadata from.
Returns:
The extracted metadata as a dictionary.
"""
from zenml.integrations.pytorch.utils import count_module_params
module_param_metadata = count_module_params(model)
return {
**module_param_metadata,
"dtype": DType(str(model.dtype)),
"device": str(model.device),
}
extract_metadata(self, model)
Extract metadata from the given PreTrainedModel
object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
model |
PreTrainedModel |
The |
required |
Returns:
Type | Description |
---|---|
Dict[str, MetadataType] |
The extracted metadata as a dictionary. |
Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
def extract_metadata(
self, model: PreTrainedModel
) -> Dict[str, "MetadataType"]:
"""Extract metadata from the given `PreTrainedModel` object.
Args:
model: The `PreTrainedModel` object to extract metadata from.
Returns:
The extracted metadata as a dictionary.
"""
from zenml.integrations.pytorch.utils import count_module_params
module_param_metadata = count_module_params(model)
return {
**module_param_metadata,
"dtype": DType(str(model.dtype)),
"device": str(model.device),
}
load(self, data_type)
Reads HFModel.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data_type |
Type[transformers.modeling_utils.PreTrainedModel] |
The type of the model to read. |
required |
Returns:
Type | Description |
---|---|
PreTrainedModel |
The model read from the specified dir. |
Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
def load(self, data_type: Type[PreTrainedModel]) -> PreTrainedModel:
"""Reads HFModel.
Args:
data_type: The type of the model to read.
Returns:
The model read from the specified dir.
"""
temp_dir = TemporaryDirectory()
io_utils.copy_dir(
os.path.join(self.uri, DEFAULT_PT_MODEL_DIR), temp_dir.name
)
config = AutoConfig.from_pretrained(temp_dir.name)
architecture = config.architectures[0]
model_cls = getattr(
importlib.import_module("transformers"), architecture
)
return model_cls.from_pretrained(temp_dir.name)
save(self, model)
Writes a Model to the specified dir.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
model |
PreTrainedModel |
The Torch Model to write. |
required |
Source code in zenml/integrations/huggingface/materializers/huggingface_pt_model_materializer.py
def save(self, model: PreTrainedModel) -> None:
"""Writes a Model to the specified dir.
Args:
model: The Torch Model to write.
"""
temp_dir = TemporaryDirectory()
model.save_pretrained(temp_dir.name)
io_utils.copy_dir(
temp_dir.name,
os.path.join(self.uri, DEFAULT_PT_MODEL_DIR),
)
huggingface_tf_model_materializer
Implementation of the Huggingface TF model materializer.
HFTFModelMaterializer (BaseMaterializer)
Materializer to read Tensorflow model to and from huggingface pretrained model.
Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
class HFTFModelMaterializer(BaseMaterializer):
"""Materializer to read Tensorflow model to and from huggingface pretrained model."""
ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (TFPreTrainedModel,)
ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = ArtifactType.MODEL
def load(self, data_type: Type[TFPreTrainedModel]) -> TFPreTrainedModel:
"""Reads HFModel.
Args:
data_type: The type of the model to read.
Returns:
The model read from the specified dir.
"""
temp_dir = TemporaryDirectory()
io_utils.copy_dir(
os.path.join(self.uri, DEFAULT_TF_MODEL_DIR), temp_dir.name
)
config = AutoConfig.from_pretrained(temp_dir.name)
architecture = "TF" + config.architectures[0]
model_cls = getattr(
importlib.import_module("transformers"), architecture
)
return model_cls.from_pretrained(temp_dir.name)
def save(self, model: TFPreTrainedModel) -> None:
"""Writes a Model to the specified dir.
Args:
model: The TF Model to write.
"""
temp_dir = TemporaryDirectory()
model.save_pretrained(temp_dir.name)
io_utils.copy_dir(
temp_dir.name,
os.path.join(self.uri, DEFAULT_TF_MODEL_DIR),
)
def extract_metadata(
self, model: TFPreTrainedModel
) -> Dict[str, "MetadataType"]:
"""Extract metadata from the given `PreTrainedModel` object.
Args:
model: The `PreTrainedModel` object to extract metadata from.
Returns:
The extracted metadata as a dictionary.
"""
return {
"num_layers": len(model.layers),
"num_params": model.num_parameters(only_trainable=False),
"num_trainable_params": model.num_parameters(only_trainable=True),
}
extract_metadata(self, model)
Extract metadata from the given PreTrainedModel
object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
model |
TFPreTrainedModel |
The |
required |
Returns:
Type | Description |
---|---|
Dict[str, MetadataType] |
The extracted metadata as a dictionary. |
Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
def extract_metadata(
self, model: TFPreTrainedModel
) -> Dict[str, "MetadataType"]:
"""Extract metadata from the given `PreTrainedModel` object.
Args:
model: The `PreTrainedModel` object to extract metadata from.
Returns:
The extracted metadata as a dictionary.
"""
return {
"num_layers": len(model.layers),
"num_params": model.num_parameters(only_trainable=False),
"num_trainable_params": model.num_parameters(only_trainable=True),
}
load(self, data_type)
Reads HFModel.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data_type |
Type[transformers.modeling_tf_utils.TFPreTrainedModel] |
The type of the model to read. |
required |
Returns:
Type | Description |
---|---|
TFPreTrainedModel |
The model read from the specified dir. |
Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
def load(self, data_type: Type[TFPreTrainedModel]) -> TFPreTrainedModel:
"""Reads HFModel.
Args:
data_type: The type of the model to read.
Returns:
The model read from the specified dir.
"""
temp_dir = TemporaryDirectory()
io_utils.copy_dir(
os.path.join(self.uri, DEFAULT_TF_MODEL_DIR), temp_dir.name
)
config = AutoConfig.from_pretrained(temp_dir.name)
architecture = "TF" + config.architectures[0]
model_cls = getattr(
importlib.import_module("transformers"), architecture
)
return model_cls.from_pretrained(temp_dir.name)
save(self, model)
Writes a Model to the specified dir.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
model |
TFPreTrainedModel |
The TF Model to write. |
required |
Source code in zenml/integrations/huggingface/materializers/huggingface_tf_model_materializer.py
def save(self, model: TFPreTrainedModel) -> None:
"""Writes a Model to the specified dir.
Args:
model: The TF Model to write.
"""
temp_dir = TemporaryDirectory()
model.save_pretrained(temp_dir.name)
io_utils.copy_dir(
temp_dir.name,
os.path.join(self.uri, DEFAULT_TF_MODEL_DIR),
)
huggingface_tokenizer_materializer
Implementation of the Huggingface tokenizer materializer.
HFTokenizerMaterializer (BaseMaterializer)
Materializer to read tokenizer to and from huggingface tokenizer.
Source code in zenml/integrations/huggingface/materializers/huggingface_tokenizer_materializer.py
class HFTokenizerMaterializer(BaseMaterializer):
"""Materializer to read tokenizer to and from huggingface tokenizer."""
ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (
PreTrainedTokenizerBase,
)
ASSOCIATED_ARTIFACT_TYPE: ClassVar[ArtifactType] = ArtifactType.MODEL
def load(self, data_type: Type[Any]) -> PreTrainedTokenizerBase:
"""Reads Tokenizer.
Args:
data_type: The type of the tokenizer to read.
Returns:
The tokenizer read from the specified dir.
"""
temp_dir = TemporaryDirectory()
io_utils.copy_dir(
os.path.join(self.uri, DEFAULT_TOKENIZER_DIR), temp_dir.name
)
return AutoTokenizer.from_pretrained(temp_dir.name)
def save(self, tokenizer: Type[Any]) -> None:
"""Writes a Tokenizer to the specified dir.
Args:
tokenizer: The HFTokenizer to write.
"""
temp_dir = TemporaryDirectory()
tokenizer.save_pretrained(temp_dir.name)
io_utils.copy_dir(
temp_dir.name,
os.path.join(self.uri, DEFAULT_TOKENIZER_DIR),
)
load(self, data_type)
Reads Tokenizer.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data_type |
Type[Any] |
The type of the tokenizer to read. |
required |
Returns:
Type | Description |
---|---|
PreTrainedTokenizerBase |
The tokenizer read from the specified dir. |
Source code in zenml/integrations/huggingface/materializers/huggingface_tokenizer_materializer.py
def load(self, data_type: Type[Any]) -> PreTrainedTokenizerBase:
"""Reads Tokenizer.
Args:
data_type: The type of the tokenizer to read.
Returns:
The tokenizer read from the specified dir.
"""
temp_dir = TemporaryDirectory()
io_utils.copy_dir(
os.path.join(self.uri, DEFAULT_TOKENIZER_DIR), temp_dir.name
)
return AutoTokenizer.from_pretrained(temp_dir.name)
save(self, tokenizer)
Writes a Tokenizer to the specified dir.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
tokenizer |
Type[Any] |
The HFTokenizer to write. |
required |
Source code in zenml/integrations/huggingface/materializers/huggingface_tokenizer_materializer.py
def save(self, tokenizer: Type[Any]) -> None:
"""Writes a Tokenizer to the specified dir.
Args:
tokenizer: The HFTokenizer to write.
"""
temp_dir = TemporaryDirectory()
tokenizer.save_pretrained(temp_dir.name)
io_utils.copy_dir(
temp_dir.name,
os.path.join(self.uri, DEFAULT_TOKENIZER_DIR),
)