Skip to content

Prodigy

zenml.integrations.prodigy special

Initialization of the Prodigy integration.

ProdigyIntegration (Integration)

Definition of Prodigy integration for ZenML.

Source code in zenml/integrations/prodigy/__init__.py
class ProdigyIntegration(Integration):
    """ZenML integration that exposes the Prodigy annotation tool."""

    NAME = PRODIGY
    REQUIREMENTS = ["prodigy", "urllib3<2"]

    @classmethod
    def flavors(cls) -> List[Type[Flavor]]:
        """List the stack component flavors provided by this integration.

        Returns:
            A list holding the Prodigy annotator flavor class.
        """
        # Imported inside the method rather than at module level, as in
        # the original listing.
        from zenml.integrations.prodigy.flavors import ProdigyAnnotatorFlavor

        provided_flavors = [ProdigyAnnotatorFlavor]
        return provided_flavors

flavors() classmethod

Declare the stack component flavors for the Prodigy integration.

Returns:

Type Description
List[Type[zenml.stack.flavor.Flavor]]

List of stack component flavors for this integration.

Source code in zenml/integrations/prodigy/__init__.py
@classmethod
def flavors(cls) -> List[Type[Flavor]]:
    """List the stack component flavors provided by this integration.

    Returns:
        A list holding the Prodigy annotator flavor class.
    """
    # Imported inside the method rather than at module level, as in the
    # original listing.
    from zenml.integrations.prodigy.flavors import ProdigyAnnotatorFlavor

    provided_flavors = [ProdigyAnnotatorFlavor]
    return provided_flavors

annotators special

Initialization of the Prodigy annotators submodule.

prodigy_annotator

Implementation of the Prodigy annotation integration.

ProdigyAnnotator (BaseAnnotator, AuthenticationMixin)

Class to interact with the Prodigy annotation interface.

Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
class ProdigyAnnotator(BaseAnnotator, AuthenticationMixin):
    """Class to interact with the Prodigy annotation interface."""

    @property
    def config(self) -> ProdigyAnnotatorConfig:
        """Returns the `ProdigyAnnotatorConfig` config.

        Returns:
            The configuration.
        """
        return cast(ProdigyAnnotatorConfig, self._config)

    def get_url(self) -> str:
        """Gets the top-level URL of the annotation interface.

        Starts from the default local host and port and, when a custom
        config path is configured, overrides them with the `instance_url`
        and `port` entries of that JSON file (if present).

        Returns:
            The URL of the annotation interface.
        """
        instance_url = DEFAULT_LOCAL_INSTANCE_HOST
        port = DEFAULT_LOCAL_PRODIGY_PORT
        if self.config.custom_config_path:
            with open(self.config.custom_config_path, "r") as f:
                config = json.load(f)
            instance_url = config.get("instance_url", instance_url)
            port = config.get("port", port)
        return f"http://{instance_url}:{port}"

    def get_url_for_dataset(self, dataset_name: str) -> str:
        """Gets the URL of the annotation interface for the given dataset.

        Prodigy does not support dataset-specific URLs, so this method returns
        the top-level URL since that's what will be served for the user.

        Args:
            dataset_name: The name of the dataset. (Unused)

        Returns:
            The URL of the annotation interface.
        """
        return self.get_url()

    def get_datasets(self) -> List[Any]:
        """Gets the datasets currently available for annotation.

        Returns:
            A list of datasets (str).
        """
        datasets = self._get_db().datasets
        return cast(List[Any], datasets)

    def get_dataset_names(self) -> List[str]:
        """Gets the names of the datasets.

        Returns:
            A list of dataset names.
        """
        return self.get_datasets()

    def get_dataset_stats(self, dataset_name: str) -> Tuple[int, int]:
        """Gets the statistics of the given dataset.

        Args:
            dataset_name: The name of the dataset.

        Returns:
            A tuple containing (labeled_task_count, unlabeled_task_count) for
                the dataset. The unlabeled count is always reported as 0.

        Raises:
            IndexError: If the dataset does not exist.
        """
        db = self._get_db()
        try:
            labeled_data_count = db.count_dataset(name=dataset_name)
        except ValueError as e:
            raise IndexError(
                f"Dataset {dataset_name} does not exist. Please use `zenml "
                f"annotator dataset list` to list the available datasets."
            ) from e
        return (labeled_data_count, 0)

    def launch(self, **kwargs: Any) -> None:
        """Launches the annotation interface.

        This method extracts the 'command' and additional config
            parameters from kwargs.

        Args:
            **kwargs: Should include:
                - command: The full recipe command without "prodigy".
                - Any additional config parameters to overwrite the
                    project-specific, global, and recipe config.

        Raises:
            ValueError: If the 'command' keyword argument is not provided.
        """
        command = kwargs.get("command")
        if not command:
            raise ValueError(
                "The 'command' keyword argument is required for launching Prodigy."
            )

        # Remove 'command' from kwargs to pass the rest as config parameters
        config = {
            key: value for key, value in kwargs.items() if key != "command"
        }
        prodigy.serve(command=command, **config)

    def _get_db(
        self,
        custom_database: Optional[PeeweeDatabase] = None,
        display_id: Optional[str] = None,
        display_name: Optional[str] = None,
    ) -> ProdigyDatabase:
        """Gets Prodigy database / client.

        Args:
            custom_database: Custom database to use.
            display_id: The display id of the database.
            display_name: The display name of the database.

        Returns:
            Prodigy database client.
        """
        db_kwargs = {}
        if display_id:
            db_kwargs["display_id"] = display_id
        if display_name:
            db_kwargs["display_name"] = display_name

        # The database is passed positionally only. It used to be added to
        # `db_kwargs` under "db" as well, so the call supplied it twice and
        # could never succeed.
        # NOTE(review): confirm that `connect` accepts `display_id` /
        # `display_name`; nothing in this class calls `_get_db` with them.
        if custom_database is not None:
            return connect(custom_database, **db_kwargs)
        return connect(**db_kwargs)

    def add_dataset(self, **kwargs: Any) -> Any:
        """Registers a dataset for annotation.

        Args:
            **kwargs: Should include:
                - dataset_name: The name of the dataset to create (required).
                - dataset_meta: Optional metadata to store with the dataset.

        Returns:
            A Prodigy list representing the dataset.

        Raises:
            ValueError: If the 'dataset_name' keyword argument is not
                provided.
        """
        dataset_name = kwargs.get("dataset_name")
        if not dataset_name:
            raise ValueError("`dataset_name` keyword argument is required.")

        # The Prodigy database takes `name` / `meta`, matching the `name=`
        # keyword used by every other database call in this class.
        dataset_kwargs = {"name": dataset_name}
        if dataset_meta := kwargs.get("dataset_meta"):
            dataset_kwargs["meta"] = dataset_meta
        return self._get_db().add_dataset(**dataset_kwargs)

    def delete_dataset(self, **kwargs: Any) -> None:
        """Deletes a dataset from the annotation interface.

        Args:
            **kwargs: Should include:
                - dataset_name: The name of the dataset to delete (required).

        Raises:
            ValueError: If the dataset name is not provided or if the dataset
                does not exist.
            ProdigyError: If Prodigy fails for any reason other than the
                dataset being missing.
        """
        if not (dataset_name := kwargs.get("dataset_name")):
            raise ValueError("`dataset_name` keyword argument is required.")
        db = self._get_db()
        try:
            db.drop_dataset(name=dataset_name)
        except ProdigyError as e:
            # The exception is matched by class name, see
            # https://support.prodi.gy/t/how-to-import-datasetdoesnotexist-error/7205
            if type(e).__name__ == "DatasetNotFound":
                raise ValueError(
                    f"Dataset name '{dataset_name}' does not exist."
                ) from e
            # Any other Prodigy failure must not be reported as a successful
            # delete, so let it propagate.
            raise

    def get_dataset(self, **kwargs: Any) -> Any:
        """Gets the dataset metadata for the given name.

        If you would like the labelled data, use `get_labeled_data` instead.

        Args:
            **kwargs: Should include:
                - dataset_name: The name of the dataset to look up (required).

        Returns:
            The metadata associated with a Prodigy dataset

        Raises:
            ValueError: If the dataset name is not provided or if the dataset
                does not exist.
        """
        if not (dataset_name := kwargs.get("dataset_name")):
            # Raise as documented; this case used to fall through and
            # return None.
            raise ValueError("`dataset_name` keyword argument is required.")
        db = self._get_db()
        try:
            return db.get_meta(name=dataset_name)
        except Exception as e:
            raise ValueError(
                f"Dataset name '{dataset_name}' does not exist."
            ) from e

    def get_labeled_data(self, **kwargs: Any) -> Any:
        """Gets the labeled data for the given dataset.

        Args:
            **kwargs: Should include:
                - dataset_name: The name of the dataset to read (required).

        Returns:
            A list of all examples in the dataset serialized to the
                Prodigy Task format.

        Raises:
            ValueError: If the dataset name is not provided.
        """
        if dataset_name := kwargs.get("dataset_name"):
            return self._get_db().get_dataset_examples(dataset_name)
        else:
            raise ValueError("`dataset_name` keyword argument is required.")

    def get_unlabeled_data(self, **kwargs: str) -> Any:
        """Gets the unlabeled data for the given dataset.

        Args:
            **kwargs: Ignored; accepted for interface compatibility.

        Raises:
            NotImplementedError: Prodigy doesn't allow fetching unlabeled data.
        """
        raise NotImplementedError(
            "Prodigy doesn't allow fetching unlabeled data."
        )
config: ProdigyAnnotatorConfig property readonly

Returns the ProdigyAnnotatorConfig config.

Returns:

Type Description
ProdigyAnnotatorConfig

The configuration.

add_dataset(self, **kwargs)

Registers a dataset for annotation.

Parameters:

Name Type Description Default
**kwargs Any

Additional keyword arguments to pass to the Prodigy client.

{}

Returns:

Type Description
Any

A Prodigy list representing the dataset.

Exceptions:

Type Description
ValueError

If the 'dataset_name' keyword argument is not provided.

Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def add_dataset(self, **kwargs: Any) -> Any:
    """Registers a dataset for annotation.

    Args:
        **kwargs: Should include:
            - dataset_name: The name of the dataset to create (required).
            - dataset_meta: Optional metadata to store with the dataset.

    Returns:
        A Prodigy list representing the dataset.

    Raises:
        ValueError: If the 'dataset_name' keyword argument is not provided.
    """
    dataset_name = kwargs.get("dataset_name")
    if not dataset_name:
        raise ValueError("`dataset_name` keyword argument is required.")

    # The Prodigy database takes `name` / `meta`, matching the `name=`
    # keyword used by the other database calls (count_dataset,
    # drop_dataset, get_meta).
    dataset_kwargs = {"name": dataset_name}
    if dataset_meta := kwargs.get("dataset_meta"):
        dataset_kwargs["meta"] = dataset_meta
    return self._get_db().add_dataset(**dataset_kwargs)
delete_dataset(self, **kwargs)

Deletes a dataset from the annotation interface.

Parameters:

Name Type Description Default
**kwargs Any

Additional keyword arguments to pass to the Prodigy client.

{}

Exceptions:

Type Description
ValueError

If the dataset name is not provided or if the dataset does not exist.

Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def delete_dataset(self, **kwargs: Any) -> None:
    """Deletes a dataset from the annotation interface.

    Args:
        **kwargs: Should include:
            - dataset_name: The name of the dataset to delete (required).

    Raises:
        ValueError: If the dataset name is not provided or if the dataset
            does not exist.
        ProdigyError: If Prodigy fails for any reason other than the
            dataset being missing.
    """
    if not (dataset_name := kwargs.get("dataset_name")):
        raise ValueError("`dataset_name` keyword argument is required.")
    db = self._get_db()
    try:
        db.drop_dataset(name=dataset_name)
    except ProdigyError as e:
        # The exception is matched by class name, see
        # https://support.prodi.gy/t/how-to-import-datasetdoesnotexist-error/7205
        if type(e).__name__ == "DatasetNotFound":
            raise ValueError(
                f"Dataset name '{dataset_name}' does not exist."
            ) from e
        # Any other Prodigy failure must not be reported as a successful
        # delete, so let it propagate.
        raise
get_dataset(self, **kwargs)

Gets the dataset metadata for the given name.

If you would like the labelled data, use get_labeled_data instead.

Parameters:

Name Type Description Default
**kwargs Any

Additional keyword arguments to pass to the Prodigy client.

{}

Returns:

Type Description
Any

The metadata associated with a Prodigy dataset

Exceptions:

Type Description
ValueError

If the dataset name is not provided or if the dataset does not exist.

Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_dataset(self, **kwargs: Any) -> Any:
    """Gets the dataset metadata for the given name.

    If you would like the labelled data, use `get_labeled_data` instead.

    Args:
        **kwargs: Should include:
            - dataset_name: The name of the dataset to look up (required).

    Returns:
        The metadata associated with a Prodigy dataset

    Raises:
        ValueError: If the dataset name is not provided or if the dataset
            does not exist.
    """
    if not (dataset_name := kwargs.get("dataset_name")):
        # Raise as documented; this case used to fall through and return
        # None.
        raise ValueError("`dataset_name` keyword argument is required.")
    db = self._get_db()
    try:
        return db.get_meta(name=dataset_name)
    except Exception as e:
        raise ValueError(
            f"Dataset name '{dataset_name}' does not exist."
        ) from e
get_dataset_names(self)

Gets the names of the datasets.

Returns:

Type Description
List[str]

A list of dataset names.

Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_dataset_names(self) -> List[str]:
    """Return the names of all datasets known to the annotator.

    Returns:
        Whatever `get_datasets` returns, unchanged.
    """
    names = self.get_datasets()
    return names
get_dataset_stats(self, dataset_name)

Gets the statistics of the given dataset.

Parameters:

Name Type Description Default
dataset_name str

The name of the dataset.

required

Returns:

Type Description
Tuple[int, int]

A tuple containing (labeled_task_count, unlabeled_task_count) for the dataset.

Exceptions:

Type Description
IndexError

If the dataset does not exist.

Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_dataset_stats(self, dataset_name: str) -> Tuple[int, int]:
    """Report how many tasks of a dataset are labeled and unlabeled.

    Args:
        dataset_name: The name of the dataset to inspect.

    Returns:
        A `(labeled_task_count, unlabeled_task_count)` tuple; the second
            element is always 0.

    Raises:
        IndexError: If counting the dataset raises a `ValueError`, which
            is taken to mean the dataset does not exist.
    """
    client = self._get_db()
    try:
        labeled_total = client.count_dataset(name=dataset_name)
    except ValueError as err:
        message = (
            f"Dataset {dataset_name} does not exist. Please use `zenml "
            f"annotator dataset list` to list the available datasets."
        )
        raise IndexError(message) from err
    else:
        return (labeled_total, 0)
get_datasets(self)

Gets the datasets currently available for annotation.

Returns:

Type Description
List[Any]

A list of datasets (str).

Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_datasets(self) -> List[Any]:
    """Return the datasets currently available for annotation.

    Returns:
        The `datasets` attribute of the Prodigy database client, typed as
            a list.
    """
    return cast(List[Any], self._get_db().datasets)
get_labeled_data(self, **kwargs)

Gets the labeled data for the given dataset.

Parameters:

Name Type Description Default
**kwargs Any

Additional keyword arguments to pass to the Prodigy client.

{}

Returns:

Type Description
Any

A list of all examples in the dataset serialized to the Prodigy Task format.

Exceptions:

Type Description
ValueError

If the dataset name is not provided or if the dataset does not exist.

Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_labeled_data(self, **kwargs: Any) -> Any:
    """Fetch the labeled examples stored in a dataset.

    Args:
        **kwargs: Must contain `dataset_name`, the dataset to read.

    Returns:
        The value returned by the database client's
            `get_dataset_examples` call for that dataset.

    Raises:
        ValueError: If `dataset_name` is missing or empty.
    """
    dataset_name = kwargs.get("dataset_name")
    if not dataset_name:
        raise ValueError("`dataset_name` keyword argument is required.")
    return self._get_db().get_dataset_examples(dataset_name)
get_unlabeled_data(self, **kwargs)

Gets the unlabeled data for the given dataset.

Parameters:

Name Type Description Default
**kwargs str

Additional keyword arguments to pass to the Prodigy client.

{}

Exceptions:

Type Description
NotImplementedError

Prodigy doesn't allow fetching unlabeled data.

Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_unlabeled_data(self, **kwargs: str) -> Any:
    """Always fails: unlabeled data cannot be fetched from Prodigy.

    Args:
        **kwargs: Accepted but never used.

    Raises:
        NotImplementedError: On every call.
    """
    reason = "Prodigy doesn't allow fetching unlabeled data."
    raise NotImplementedError(reason)
get_url(self)

Gets the top-level URL of the annotation interface.

Returns:

Type Description
str

The URL of the annotation interface.

Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_url(self) -> str:
    """Build the top-level URL of the annotation interface.

    The default local host and port are used unless a custom config path
    is set, in which case the `instance_url` and `port` entries of that
    JSON file take precedence when present.

    Returns:
        The URL of the annotation interface.
    """
    host, port = DEFAULT_LOCAL_INSTANCE_HOST, DEFAULT_LOCAL_PRODIGY_PORT
    config_path = self.config.custom_config_path
    if config_path:
        with open(config_path, "r") as handle:
            overrides = json.load(handle)
        # Keys absent from the file fall back to the defaults.
        host = overrides.get("instance_url", host)
        port = overrides.get("port", port)
    return f"http://{host}:{port}"
get_url_for_dataset(self, dataset_name)

Gets the URL of the annotation interface for the given dataset.

Prodigy does not support dataset-specific URLs, so this method returns the top-level URL since that's what will be served for the user.

Parameters:

Name Type Description Default
dataset_name str

The name of the dataset. (Unused)

required

Returns:

Type Description
str

The URL of the annotation interface.

Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def get_url_for_dataset(self, dataset_name: str) -> str:
    """Return the annotation interface URL for a dataset.

    Prodigy has no dataset-specific URLs, so the top-level URL is returned
    regardless of which dataset is requested.

    Args:
        dataset_name: The name of the dataset. (Unused)

    Returns:
        The URL of the annotation interface.
    """
    top_level_url = self.get_url()
    return top_level_url
launch(self, **kwargs)

Launches the annotation interface.

This method extracts the 'command' and additional config parameters from kwargs.

Parameters:

Name Type Description Default
**kwargs Any

Should include `command` (the full recipe command without "prodigy"), plus any additional config parameters to overwrite the project-specific, global, and recipe config.

{}

Exceptions:

Type Description
ValueError

If the 'command' keyword argument is not provided.

Source code in zenml/integrations/prodigy/annotators/prodigy_annotator.py
def launch(self, **kwargs: Any) -> None:
    """Start the Prodigy annotation interface.

    The `command` entry is split off from the keyword arguments and all
    remaining entries are forwarded to Prodigy as config parameters.

    Args:
        **kwargs: Should include:
            - command: The full recipe command without "prodigy".
            - Any additional config parameters to overwrite the
                project-specific, global, and recipe config.

    Raises:
        ValueError: If the 'command' keyword argument is not provided.
    """
    # Work on a copy so that popping `command` leaves only the config
    # parameters behind, in their original order.
    extra_config = dict(kwargs)
    recipe_command = extra_config.pop("command", None)
    if not recipe_command:
        raise ValueError(
            "The 'command' keyword argument is required for launching Prodigy."
        )

    prodigy.serve(command=recipe_command, **extra_config)

flavors special

Prodigy integration flavors.

prodigy_annotator_flavor

Prodigy annotator flavor.

ProdigyAnnotatorConfig (BaseAnnotatorConfig, AuthenticationConfigMixin)

Config for the Prodigy annotator.

See https://prodi.gy/docs/install#config for more on custom config files, but this allows you to override the default Prodigy config.

Attributes:

Name Type Description
custom_config_path Optional[str]

The path to a custom config file for Prodigy.

Source code in zenml/integrations/prodigy/flavors/prodigy_annotator_flavor.py
class ProdigyAnnotatorConfig(BaseAnnotatorConfig, AuthenticationConfigMixin):
    """Configuration options for the Prodigy annotator.

    A custom config file lets you override the default Prodigy config;
    see https://prodi.gy/docs/install#config for details on such files.

    Attributes:
        custom_config_path: Optional path to a custom Prodigy config file.
            Defaults to `None`, meaning no custom file is used.
    """

    custom_config_path: Optional[str] = None
ProdigyAnnotatorFlavor (BaseAnnotatorFlavor)

Prodigy annotator flavor.

Source code in zenml/integrations/prodigy/flavors/prodigy_annotator_flavor.py
class ProdigyAnnotatorFlavor(BaseAnnotatorFlavor):
    """Flavor definition for the Prodigy annotator stack component."""

    @property
    def name(self) -> str:
        """The flavor's name.

        Returns:
            The Prodigy annotator flavor constant.
        """
        flavor_name = PRODIGY_ANNOTATOR_FLAVOR
        return flavor_name

    @property
    def docs_url(self) -> Optional[str]:
        """Link to the documentation for this flavor.

        Returns:
            The default docs URL generated for the flavor.
        """
        url = self.generate_default_docs_url()
        return url

    @property
    def sdk_docs_url(self) -> Optional[str]:
        """Link to the SDK documentation for this flavor.

        Returns:
            The default SDK docs URL generated for the flavor.
        """
        url = self.generate_default_sdk_docs_url()
        return url

    @property
    def logo_url(self) -> str:
        """Link to the logo shown for this flavor in the dashboard.

        Returns:
            The URL of the flavor logo.
        """
        logo = "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/annotator/prodigy.png"
        return logo

    @property
    def config_class(self) -> Type[ProdigyAnnotatorConfig]:
        """The config class associated with this flavor.

        Returns:
            The `ProdigyAnnotatorConfig` class.
        """
        return ProdigyAnnotatorConfig

    @property
    def implementation_class(self) -> Type["ProdigyAnnotator"]:
        """The class that implements this flavor.

        Returns:
            The `ProdigyAnnotator` class.
        """
        # Imported inside the property rather than at module level, as in
        # the original listing.
        from zenml.integrations.prodigy.annotators import ProdigyAnnotator

        return ProdigyAnnotator
config_class: Type[zenml.integrations.prodigy.flavors.prodigy_annotator_flavor.ProdigyAnnotatorConfig] property readonly

Returns ProdigyAnnotatorConfig config class.

Returns:

Type Description
Type[zenml.integrations.prodigy.flavors.prodigy_annotator_flavor.ProdigyAnnotatorConfig]

The config class.

docs_url: Optional[str] property readonly

A url to point at docs explaining this flavor.

Returns:

Type Description
Optional[str]

A flavor docs url.

implementation_class: Type[ProdigyAnnotator] property readonly

Implementation class for this flavor.

Returns:

Type Description
Type[ProdigyAnnotator]

The implementation class.

logo_url: str property readonly

A url to represent the flavor in the dashboard.

Returns:

Type Description
str

The flavor logo.

name: str property readonly

Name of the flavor.

Returns:

Type Description
str

The name of the flavor.

sdk_docs_url: Optional[str] property readonly

A url to point at SDK docs explaining this flavor.

Returns:

Type Description
Optional[str]

A flavor SDK docs url.