Prodigy
zenml.integrations.prodigy
special
Initialization of the Prodigy integration.
ProdigyIntegration (Integration)
Definition of Prodigy integration for ZenML.
Source code in zenml/integrations/prodigy/__init__.py
class ProdigyIntegration(Integration):
    """Prodigy integration definition for ZenML."""

    NAME = PRODIGY
    REQUIREMENTS = ["prodigy", "urllib3<2"]
    # NOTE(review): presumably urllib3 is left installed when the integration
    # is uninstalled -- confirm against the `Integration` base class.
    REQUIREMENTS_IGNORED_ON_UNINSTALL = ["urllib3"]

    @classmethod
    def flavors(cls) -> List[Type[Flavor]]:
        """List the stack component flavors provided by this integration.

        Returns:
            A one-element list holding the Prodigy annotator flavor class.
        """
        # The flavor module is imported at call time, not at module import.
        from zenml.integrations.prodigy.flavors import ProdigyAnnotatorFlavor

        provided_flavors = [ProdigyAnnotatorFlavor]
        return provided_flavors
flavors()
classmethod
Declare the stack component flavors for the Prodigy integration.
Returns:
Type | Description |
---|---|
List[Type[zenml.stack.flavor.Flavor]] |
List of stack component flavors for this integration. |
Source code in zenml/integrations/prodigy/__init__.py
@classmethod
def flavors(cls) -> List[Type[Flavor]]:
    """List the stack component flavors provided by the Prodigy integration.

    Returns:
        A one-element list holding the Prodigy annotator flavor class.
    """
    # The flavor module is imported at call time, not at module import.
    from zenml.integrations.prodigy.flavors import ProdigyAnnotatorFlavor

    provided_flavors = [ProdigyAnnotatorFlavor]
    return provided_flavors
annotators
special
Initialization of the Prodigy annotators submodule.
prodigy_annotator
Implementation of the Prodigy annotation integration.
ProdigyAnnotator (BaseAnnotator, AuthenticationMixin)
Class to interact with the Prodigy annotation interface.
Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
class ProdigyAnnotator(BaseAnnotator, AuthenticationMixin):
    """Class to interact with the Prodigy annotation interface."""

    @property
    def config(self) -> ProdigyAnnotatorConfig:
        """Returns the `ProdigyAnnotatorConfig` config.

        Returns:
            The configuration.
        """
        return cast(ProdigyAnnotatorConfig, self._config)

    def get_url(self) -> str:
        """Gets the top-level URL of the annotation interface.

        Starts from the default local host and port. When
        `custom_config_path` is set on the config, the `instance_url` and
        `port` entries of that JSON file (if present) take precedence.

        Returns:
            The URL of the annotation interface.
        """
        instance_url = DEFAULT_LOCAL_INSTANCE_HOST
        port = DEFAULT_LOCAL_PRODIGY_PORT
        if self.config.custom_config_path:
            # Read the JSON config as UTF-8 instead of relying on the
            # platform's locale-dependent default encoding.
            with open(
                self.config.custom_config_path, "r", encoding="utf-8"
            ) as f:
                config = json.load(f)
            instance_url = config.get("instance_url", instance_url)
            port = config.get("port", port)
        return f"http://{instance_url}:{port}"

    def get_url_for_dataset(self, dataset_name: str) -> str:
        """Gets the URL of the annotation interface for the given dataset.

        Prodigy does not support dataset-specific URLs, so this method returns
        the top-level URL since that's what will be served for the user.

        Args:
            dataset_name: The name of the dataset. (Unused)

        Returns:
            The URL of the annotation interface.
        """
        return self.get_url()

    def get_datasets(self) -> List[Any]:
        """Gets the datasets currently available for annotation.

        Returns:
            A list of datasets (str).
        """
        datasets = self._get_db().datasets
        return cast(List[Any], datasets)

    def get_dataset_names(self) -> List[str]:
        """Gets the names of the datasets.

        Returns:
            A list of dataset names.
        """
        return self.get_datasets()

    def get_dataset_stats(self, dataset_name: str) -> Tuple[int, int]:
        """Gets the statistics of the given dataset.

        Args:
            dataset_name: The name of the dataset.

        Returns:
            A tuple containing (labeled_task_count, unlabeled_task_count) for
            the dataset. The unlabeled count is always reported as 0.

        Raises:
            IndexError: If the dataset does not exist.
        """
        db = self._get_db()
        try:
            labeled_data_count = db.count_dataset(name=dataset_name)
        except ValueError as e:
            raise IndexError(
                f"Dataset {dataset_name} does not exist. Please use `zenml "
                f"annotator dataset list` to list the available datasets."
            ) from e
        return (labeled_data_count, 0)

    def launch(self, **kwargs: Any) -> None:
        """Launches the annotation interface.

        This method extracts the 'command' and additional config
        parameters from kwargs.

        Args:
            **kwargs: Should include:
                - command: The full recipe command without "prodigy".
                - Any additional config parameters to overwrite the
                  project-specific, global, and recipe config.

        Raises:
            ValueError: If the 'command' keyword argument is not provided.
        """
        command = kwargs.get("command")
        if not command:
            raise ValueError(
                "The 'command' keyword argument is required for launching Prodigy."
            )

        # Remove 'command' from kwargs to pass the rest as config parameters
        config = {
            key: value for key, value in kwargs.items() if key != "command"
        }
        prodigy.serve(command=command, **config)

    def _get_db(
        self,
        custom_database: PeeweeDatabase = None,
        display_id: Optional[str] = None,
        display_name: Optional[str] = None,
    ) -> ProdigyDatabase:
        """Gets Prodigy database / client.

        Args:
            custom_database: Custom database to use.
            display_id: The display id of the database.
            display_name: The display name of the database.

        Returns:
            Prodigy database client.
        """
        db_kwargs = {}
        if custom_database:
            db_kwargs["db"] = custom_database
        if display_id:
            db_kwargs["display_id"] = display_id
        if display_name:
            db_kwargs["display_name"] = display_name

        # database is passed in without the keyword argument
        # NOTE(review): `custom_database` is also stored under "db" above, so
        # it reaches `connect` both positionally and as a keyword -- confirm
        # this against the signature of `connect`.
        if custom_database:
            return connect(custom_database, **db_kwargs)
        return connect(**db_kwargs)

    def add_dataset(self, **kwargs: Any) -> Any:
        """Registers a dataset for annotation.

        Args:
            **kwargs: Must include `dataset_name`; may include `dataset_meta`.
                Only these two keys are forwarded to the Prodigy database.

        Returns:
            The value returned by the Prodigy database's `add_dataset` call.

        Raises:
            ValueError: If `dataset_name` isn't provided.
        """
        db = self._get_db()
        dataset_kwargs = {"dataset_name": kwargs.get("dataset_name")}
        if not dataset_kwargs["dataset_name"]:
            raise ValueError("`dataset_name` keyword argument is required.")

        if kwargs.get("dataset_meta"):
            dataset_kwargs["dataset_meta"] = kwargs.get("dataset_meta")

        return db.add_dataset(**dataset_kwargs)

    def delete_dataset(self, **kwargs: Any) -> None:
        """Deletes a dataset from the annotation interface.

        Args:
            **kwargs: Must include `dataset_name`, the dataset to drop.

        Raises:
            ValueError: If the dataset name is not provided or if the dataset
                does not exist.
            ProdigyError: If Prodigy fails to drop the dataset for any reason
                other than the dataset not existing.
        """
        db = self._get_db()
        if not (dataset_name := kwargs.get("dataset_name")):
            raise ValueError("`dataset_name` keyword argument is required.")
        try:
            db.drop_dataset(name=dataset_name)
        except ProdigyError as e:
            # The error is matched by class name rather than by import, see
            # https://support.prodi.gy/t/how-to-import-datasetdoesnotexist-error/7205
            if type(e).__name__ == "DatasetNotFound":
                raise ValueError(
                    f"Dataset name '{dataset_name}' does not exist."
                ) from e
            # Any other Prodigy failure must surface instead of being
            # silently swallowed.
            raise

    def get_dataset(self, **kwargs: Any) -> Any:
        """Gets the dataset metadata for the given name.

        If you would like the labelled data, use `get_labeled_data` instead.

        Args:
            **kwargs: Must include `dataset_name`, the dataset to look up.

        Returns:
            The metadata associated with a Prodigy dataset

        Raises:
            ValueError: If the dataset name is not provided or if the dataset
                does not exist.
        """
        db = self._get_db()
        # Fail loudly on a missing name, as documented, instead of falling
        # through and returning None.
        if not (dataset_name := kwargs.get("dataset_name")):
            raise ValueError("`dataset_name` keyword argument is required.")
        try:
            return db.get_meta(name=dataset_name)
        except Exception as e:
            # Any lookup failure is reported as a missing dataset.
            raise ValueError(
                f"Dataset name '{dataset_name}' does not exist."
            ) from e

    def get_labeled_data(self, **kwargs: Any) -> Any:
        """Gets the labeled data for the given dataset.

        Args:
            **kwargs: Must include `dataset_name`, the dataset to read.

        Returns:
            A list of all examples in the dataset serialized to the
            Prodigy Task format.

        Raises:
            ValueError: If the dataset name is not provided.
        """
        if dataset_name := kwargs.get("dataset_name"):
            return self._get_db().get_dataset_examples(dataset_name)
        else:
            raise ValueError("`dataset_name` keyword argument is required.")

    def get_unlabeled_data(self, **kwargs: str) -> Any:
        """Gets the unlabeled data for the given dataset.

        Args:
            **kwargs: Ignored; the method always raises.

        Raises:
            NotImplementedError: Prodigy doesn't allow fetching unlabeled data.
        """
        raise NotImplementedError(
            "Prodigy doesn't allow fetching unlabeled data."
        )
config: ProdigyAnnotatorConfig
property
readonly
Returns the ProdigyAnnotatorConfig
config.
Returns:
Type | Description |
---|---|
ProdigyAnnotatorConfig |
The configuration. |
add_dataset(self, **kwargs)
Registers a dataset for annotation.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
**kwargs |
Any |
Additional keyword arguments to pass to the Prodigy client. |
{} |
Returns:
Type | Description |
---|---|
Any |
A Prodigy list representing the dataset. |
Exceptions:
Type | Description |
---|---|
ValueError |
If 'dataset_name' isn't provided. |
Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def add_dataset(self, **kwargs: Any) -> Any:
    """Register a new dataset with the Prodigy database.

    Args:
        **kwargs: Must include `dataset_name`; may include `dataset_meta`.
            Only these two keys are forwarded to the Prodigy database.

    Returns:
        The value returned by the Prodigy database's `add_dataset` call.

    Raises:
        ValueError: If `dataset_name` isn't provided.
    """
    # The database handle is obtained before the arguments are validated.
    database = self._get_db()

    name = kwargs.get("dataset_name")
    if not name:
        raise ValueError("`dataset_name` keyword argument is required.")

    call_args = {"dataset_name": name}
    meta = kwargs.get("dataset_meta")
    if meta:
        call_args["dataset_meta"] = meta

    return database.add_dataset(**call_args)
delete_dataset(self, **kwargs)
Deletes a dataset from the annotation interface.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
**kwargs |
Any |
Additional keyword arguments to pass to the Prodigy client. |
{} |
Exceptions:
Type | Description |
---|---|
ValueError |
If the dataset name is not provided or if the dataset does not exist. |
Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def delete_dataset(self, **kwargs: Any) -> None:
    """Deletes a dataset from the annotation interface.

    Args:
        **kwargs: Must include `dataset_name`, the dataset to drop.

    Raises:
        ValueError: If the dataset name is not provided or if the dataset
            does not exist.
        ProdigyError: If Prodigy fails to drop the dataset for any reason
            other than the dataset not existing.
    """
    db = self._get_db()
    if not (dataset_name := kwargs.get("dataset_name")):
        raise ValueError("`dataset_name` keyword argument is required.")
    try:
        db.drop_dataset(name=dataset_name)
    except ProdigyError as e:
        # The error is matched by class name rather than by import, see
        # https://support.prodi.gy/t/how-to-import-datasetdoesnotexist-error/7205
        if type(e).__name__ == "DatasetNotFound":
            raise ValueError(
                f"Dataset name '{dataset_name}' does not exist."
            ) from e
        # Any other Prodigy failure must surface instead of being silently
        # swallowed.
        raise
get_dataset(self, **kwargs)
Gets the dataset metadata for the given name.
If you would like the labelled data, use get_labeled_data
instead.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
**kwargs |
Any |
Additional keyword arguments to pass to the Prodigy client. |
{} |
Returns:
Type | Description |
---|---|
Any |
The metadata associated with a Prodigy dataset |
Exceptions:
Type | Description |
---|---|
ValueError |
If the dataset name is not provided or if the dataset does not exist. |
Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_dataset(self, **kwargs: Any) -> Any:
    """Gets the dataset metadata for the given name.

    If you would like the labelled data, use `get_labeled_data` instead.

    Args:
        **kwargs: Must include `dataset_name`, the dataset to look up.

    Returns:
        The metadata associated with a Prodigy dataset

    Raises:
        ValueError: If the dataset name is not provided or if the dataset
            does not exist.
    """
    db = self._get_db()
    # Fail loudly on a missing name, as documented, instead of falling
    # through and returning None.
    if not (dataset_name := kwargs.get("dataset_name")):
        raise ValueError("`dataset_name` keyword argument is required.")
    try:
        return db.get_meta(name=dataset_name)
    except Exception as e:
        # Any lookup failure is reported as a missing dataset.
        raise ValueError(
            f"Dataset name '{dataset_name}' does not exist."
        ) from e
get_dataset_names(self)
Gets the names of the datasets.
Returns:
Type | Description |
---|---|
List[str] |
A list of dataset names. |
Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_dataset_names(self) -> List[str]:
    """Return the names of all datasets.

    Delegates to `get_datasets` and returns its result unchanged.

    Returns:
        A list of dataset names.
    """
    names: List[str] = self.get_datasets()
    return names
get_dataset_stats(self, dataset_name)
Gets the statistics of the given dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset_name |
str |
The name of the dataset. |
required |
Returns:
Type | Description |
---|---|
Tuple[int, int] |
A tuple containing (labeled_task_count, unlabeled_task_count) for the dataset. |
Exceptions:
Type | Description |
---|---|
IndexError |
If the dataset does not exist. |
Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_dataset_stats(self, dataset_name: str) -> Tuple[int, int]:
    """Report labeled/unlabeled counts for a dataset.

    Args:
        dataset_name: The name of the dataset.

    Returns:
        A `(labeled_task_count, unlabeled_task_count)` tuple. The unlabeled
        count is always 0.

    Raises:
        IndexError: If the dataset does not exist.
    """
    client = self._get_db()
    try:
        labeled = client.count_dataset(name=dataset_name)
    except ValueError as err:
        # The database's ValueError is translated into an IndexError.
        message = (
            f"Dataset {dataset_name} does not exist. Please use `zenml "
            f"annotator dataset list` to list the available datasets."
        )
        raise IndexError(message) from err

    unlabeled = 0
    return (labeled, unlabeled)
get_datasets(self)
Gets the datasets currently available for annotation.
Returns:
Type | Description |
---|---|
List[Any] |
A list of datasets (str). |
Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_datasets(self) -> List[Any]:
    """Return the datasets currently available for annotation.

    Reads the `datasets` attribute of the Prodigy database client.

    Returns:
        A list of datasets (str).
    """
    return cast(List[Any], self._get_db().datasets)
get_labeled_data(self, **kwargs)
Gets the labeled data for the given dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
**kwargs |
Any |
Additional keyword arguments to pass to the Prodigy client. |
{} |
Returns:
Type | Description |
---|---|
Any |
A list of all examples in the dataset serialized to the Prodigy Task format. |
Exceptions:
Type | Description |
---|---|
ValueError |
If the dataset name is not provided or if the dataset does not exist. |
Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_labeled_data(self, **kwargs: Any) -> Any:
    """Fetch the labeled examples of a dataset.

    Args:
        **kwargs: Must include `dataset_name`, the dataset to read.

    Returns:
        A list of all examples in the dataset serialized to the
        Prodigy Task format.

    Raises:
        ValueError: If the dataset name is not provided.
    """
    name = kwargs.get("dataset_name")
    # Guard clause: the database is only touched once a name is present.
    if not name:
        raise ValueError("`dataset_name` keyword argument is required.")

    database = self._get_db()
    return database.get_dataset_examples(name)
get_unlabeled_data(self, **kwargs)
Gets the unlabeled data for the given dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
**kwargs |
str |
Additional keyword arguments to pass to the Prodigy client. |
{} |
Exceptions:
Type | Description |
---|---|
NotImplementedError |
Prodigy doesn't allow fetching unlabeled data. |
Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_unlabeled_data(self, **kwargs: str) -> Any:
    """Unsupported: fetching unlabeled data is not available with Prodigy.

    Args:
        **kwargs: Ignored; the method always raises.

    Raises:
        NotImplementedError: Prodigy doesn't allow fetching unlabeled data.
    """
    reason = "Prodigy doesn't allow fetching unlabeled data."
    raise NotImplementedError(reason)
get_url(self)
Gets the top-level URL of the annotation interface.
Returns:
Type | Description |
---|---|
str |
The URL of the annotation interface. |
Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_url(self) -> str:
    """Gets the top-level URL of the annotation interface.

    Starts from the default local host and port. When `custom_config_path`
    is set on the config, the `instance_url` and `port` entries of that JSON
    file (if present) take precedence.

    Returns:
        The URL of the annotation interface.
    """
    instance_url = DEFAULT_LOCAL_INSTANCE_HOST
    port = DEFAULT_LOCAL_PRODIGY_PORT
    if self.config.custom_config_path:
        # Read the JSON config as UTF-8 instead of relying on the platform's
        # locale-dependent default encoding.
        with open(
            self.config.custom_config_path, "r", encoding="utf-8"
        ) as f:
            config = json.load(f)
        instance_url = config.get("instance_url", instance_url)
        port = config.get("port", port)
    return f"http://{instance_url}:{port}"
get_url_for_dataset(self, dataset_name)
Gets the URL of the annotation interface for the given dataset.
Prodigy does not support dataset-specific URLs, so this method returns the top-level URL since that's what will be served for the user.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset_name |
str |
The name of the dataset. (Unused) |
required |
Returns:
Type | Description |
---|---|
str |
The URL of the annotation interface. |
Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_url_for_dataset(self, dataset_name: str) -> str:
    """Return the annotation interface URL to use for a dataset.

    Prodigy has no dataset-specific URLs, so the top-level URL from
    `get_url` is returned regardless of the dataset.

    Args:
        dataset_name: The name of the dataset. (Unused)

    Returns:
        The URL of the annotation interface.
    """
    top_level_url = self.get_url()
    return top_level_url
launch(self, **kwargs)
Launches the annotation interface.
This method extracts the 'command' and additional config parameters from kwargs.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
**kwargs |
Any |
Should include: - command: The full recipe command without "prodigy". - Any additional config parameters to overwrite the project-specific, global, and recipe config. |
{} |
Exceptions:
Type | Description |
---|---|
ValueError |
If the 'command' keyword argument is not provided. |
Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def launch(self, **kwargs: Any) -> None:
    """Start the Prodigy annotation server.

    The `command` entry is split off from kwargs; every remaining entry is
    forwarded to `prodigy.serve` as a config parameter.

    Args:
        **kwargs: Should include:
            - command: The full recipe command without "prodigy".
            - Any additional config parameters to overwrite the
              project-specific, global, and recipe config.

    Raises:
        ValueError: If the 'command' keyword argument is not provided.
    """
    # `kwargs` is a fresh dict for each call, so popping from it is safe and
    # leaves exactly the config overrides behind.
    command = kwargs.pop("command", None)
    if not command:
        raise ValueError(
            "The 'command' keyword argument is required for launching Prodigy."
        )

    prodigy.serve(command=command, **kwargs)
flavors
special
Prodigy integration flavors.
prodigy_annotator_flavor
Prodigy annotator flavor.
ProdigyAnnotatorConfig (BaseAnnotatorConfig, AuthenticationConfigMixin)
Config for the Prodigy annotator.
See https://prodi.gy/docs/install#config for more on custom config files, but this allows you to override the default Prodigy config.
Attributes:
Name | Type | Description |
---|---|---|
custom_config_path |
Optional[str] |
The path to a custom config file for Prodigy. |
Source code in zenml/integrations/prodigy/flavors/prodigy_annotator_flavor.py
class ProdigyAnnotatorConfig(BaseAnnotatorConfig, AuthenticationConfigMixin):
    """Configuration for the Prodigy annotator.

    A custom config file lets you override the default Prodigy config; see
    https://prodi.gy/docs/install#config for more on custom config files.

    Attributes:
        custom_config_path: Path to a custom config file for Prodigy, or
            `None` (the default) when no custom file is used.
    """

    custom_config_path: Optional[str] = None
ProdigyAnnotatorFlavor (BaseAnnotatorFlavor)
Prodigy annotator flavor.
Source code in zenml/integrations/prodigy/flavors/prodigy_annotator_flavor.py
class ProdigyAnnotatorFlavor(BaseAnnotatorFlavor):
    """Flavor describing the Prodigy annotator."""

    @property
    def name(self) -> str:
        """The flavor's name.

        Returns:
            The name of the flavor.
        """
        flavor_name = PRODIGY_ANNOTATOR_FLAVOR
        return flavor_name

    @property
    def docs_url(self) -> Optional[str]:
        """Link to the documentation for this flavor.

        Returns:
            A flavor docs url.
        """
        url = self.generate_default_docs_url()
        return url

    @property
    def sdk_docs_url(self) -> Optional[str]:
        """Link to the SDK documentation for this flavor.

        Returns:
            A flavor SDK docs url.
        """
        url = self.generate_default_sdk_docs_url()
        return url

    @property
    def logo_url(self) -> str:
        """Link to the logo that represents the flavor in the dashboard.

        Returns:
            The flavor logo.
        """
        logo = "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/annotator/prodigy.png"
        return logo

    @property
    def config_class(self) -> Type[ProdigyAnnotatorConfig]:
        """The config class used by this flavor.

        Returns:
            The config class.
        """
        return ProdigyAnnotatorConfig

    @property
    def implementation_class(self) -> Type["ProdigyAnnotator"]:
        """The class that implements this flavor.

        Returns:
            The implementation class.
        """
        # The annotator module is imported at access time, not at module
        # import.
        from zenml.integrations.prodigy.annotators import ProdigyAnnotator

        return ProdigyAnnotator
config_class: Type[zenml.integrations.prodigy.flavors.prodigy_annotator_flavor.ProdigyAnnotatorConfig]
property
readonly
Returns ProdigyAnnotatorConfig
config class.
Returns:
Type | Description |
---|---|
Type[zenml.integrations.prodigy.flavors.prodigy_annotator_flavor.ProdigyAnnotatorConfig] |
The config class. |
docs_url: Optional[str]
property
readonly
A url to point at docs explaining this flavor.
Returns:
Type | Description |
---|---|
Optional[str] |
A flavor docs url. |
implementation_class: Type[ProdigyAnnotator]
property
readonly
Implementation class for this flavor.
Returns:
Type | Description |
---|---|
Type[ProdigyAnnotator] |
The implementation class. |
logo_url: str
property
readonly
A url to represent the flavor in the dashboard.
Returns:
Type | Description |
---|---|
str |
The flavor logo. |
name: str
property
readonly
Name of the flavor.
Returns:
Type | Description |
---|---|
str |
The name of the flavor. |
sdk_docs_url: Optional[str]
property
readonly
A url to point at SDK docs explaining this flavor.
Returns:
Type | Description |
---|---|
Optional[str] |
A flavor SDK docs url. |