Great Expectations
zenml.integrations.great_expectations
special
Great Expectations integration for ZenML.
The Great Expectations integration enables you to use Great Expectations as a way of profiling and validating your data.
GreatExpectationsIntegration (Integration)
Definition of Great Expectations integration for ZenML.
Source code in zenml/integrations/great_expectations/__init__.py
class GreatExpectationsIntegration(Integration):
    """Integration descriptor that wires Great Expectations into ZenML.

    Declares the integration name, the pip requirements it depends on and
    the stack component flavors it contributes.
    """

    NAME = GREAT_EXPECTATIONS
    REQUIREMENTS = [
        "great-expectations>=0.15.0,<=0.15.47",
        # GE does not work with typing_extensions 4.6.0 and above, hence
        # the upper bound.
        "typing_extensions<4.6.0",
    ]

    @staticmethod
    def activate() -> None:
        """Activate the Great Expectations integration.

        The imported module is not used directly; presumably the import
        itself makes the materializers available (to be confirmed against
        the materializers package).
        """
        import zenml.integrations.great_expectations.materializers  # noqa

    @classmethod
    def flavors(cls) -> List[Type[Flavor]]:
        """Declare the stack component flavors contributed by this integration.

        The flavor class is imported inside the method rather than at module
        import time.

        Returns:
            A single-element list holding the Great Expectations data
            validator flavor class.
        """
        from zenml.integrations.great_expectations.flavors import (
            GreatExpectationsDataValidatorFlavor as validator_flavor,
        )

        declared_flavors = [validator_flavor]
        return declared_flavors
activate()
staticmethod
Activate the Great Expectations integration.
Source code in zenml/integrations/great_expectations/__init__.py
@staticmethod
def activate() -> None:
    """Activate the Great Expectations integration.

    The imported module is not used directly; presumably the import itself
    makes the materializers available (to be confirmed against the
    materializers package).
    """
    import zenml.integrations.great_expectations.materializers  # noqa
flavors()
classmethod
Declare the stack component flavors for the Great Expectations integration.
Returns:
Type | Description |
---|---|
List[Type[zenml.stack.flavor.Flavor]] |
List of stack component flavors for this integration. |
Source code in zenml/integrations/great_expectations/__init__.py
@classmethod
def flavors(cls) -> List[Type[Flavor]]:
    """Declare the stack component flavors contributed by this integration.

    The flavor class is imported inside the method rather than at module
    import time.

    Returns:
        A single-element list holding the Great Expectations data validator
        flavor class.
    """
    from zenml.integrations.great_expectations.flavors import (
        GreatExpectationsDataValidatorFlavor as validator_flavor,
    )

    declared_flavors = [validator_flavor]
    return declared_flavors
data_validators
special
Initialization of the Great Expectations data validator for ZenML.
ge_data_validator
Implementation of the Great Expectations data validator.
GreatExpectationsDataValidator (BaseDataValidator)
Great Expectations data validator stack component.
Source code in zenml/integrations/great_expectations/data_validators/ge_data_validator.py
class GreatExpectationsDataValidator(BaseDataValidator):
    """Great Expectations data validator stack component.

    Builds (and caches) a Great Expectations data context from the component
    configuration and uses it to profile datasets into Expectation Suites and
    to validate datasets against existing suites.
    """

    NAME: ClassVar[str] = "Great Expectations"
    FLAVOR: ClassVar[
        Type[BaseDataValidatorFlavor]
    ] = GreatExpectationsDataValidatorFlavor

    # Data context, created on first access of the `data_context` property.
    _context: Optional[BaseDataContext] = None
    # Parsed `context_config`, cached by the `context_config` property.
    _context_config: Optional[Dict[str, Any]] = None

    @property
    def config(self) -> GreatExpectationsDataValidatorConfig:
        """Returns the `GreatExpectationsDataValidatorConfig` config.

        Returns:
            The configuration.
        """
        return cast(GreatExpectationsDataValidatorConfig, self._config)

    @classmethod
    def get_data_context(cls) -> BaseDataContext:
        """Get the Great Expectations data context managed by ZenML.

        Call this method to retrieve the data context managed by ZenML
        through the active Great Expectations data validator stack component.

        Returns:
            A Great Expectations data context managed by ZenML as configured
            through the active data validator stack component.
        """
        data_validator = cast(
            "GreatExpectationsDataValidator", cls.get_active_data_validator()
        )
        return data_validator.data_context

    @property
    def context_config(self) -> Optional[Dict[str, Any]]:
        """Get the Great Expectations data context configuration.

        The first time the context config is loaded from the stack component
        config, it is converted from JSON/YAML string format to a dict. The
        result is cached on the instance, so parsing and validation only run
        once.

        Raises:
            ValueError: If the context_config value is not a valid JSON/YAML or
                if the GE configuration extracted from it fails GE validation.

        Returns:
            A dictionary with the GE data context configuration, or `None` if
            no context configuration was supplied.
        """
        # If the context config is already loaded, return it
        if self._context_config is not None:
            return self._context_config

        # Otherwise, load it from the stack component config
        context_config = self.config.context_config
        if context_config is None:
            return None
        if isinstance(context_config, dict):
            self._context_config = context_config
            return self._context_config

        # If the context config is a string, try to parse it as JSON/YAML.
        # `yaml.YAMLError` is the common base of every PyYAML load error
        # (parser *and* scanner errors), so any malformed input ends up as
        # the documented `ValueError`.
        try:
            context_config_dict = yaml.safe_load(context_config)
        except yaml.YAMLError as e:
            raise ValueError(
                f"Malformed `context_config` value. Only JSON and YAML "
                f"formats are supported: {str(e)}"
            ) from e

        # Validate that the context config is a valid GE config by building
        # a throw-away data context from it.
        try:
            context_config = DataContextConfig(**context_config_dict)
            BaseDataContext(project_config=context_config)
        except Exception as e:
            raise ValueError(
                f"Invalid `context_config` value: {str(e)}"
            ) from e

        self._context_config = cast(Dict[str, Any], context_config_dict)
        return self._context_config

    @property
    def local_path(self) -> Optional[str]:
        """Return a local path where this component stores information.

        If an existing local GE data context is used, it is
        interpreted as a local path that needs to be accessible in
        all runtime environments.

        Returns:
            The local path where this component stores information.
        """
        return self.config.context_root_dir

    def get_store_config(self, class_name: str, prefix: str) -> Dict[str, Any]:
        """Generate a Great Expectations store configuration.

        Args:
            class_name: The store class name
            prefix: The path prefix for the ZenML store configuration

        Returns:
            A dictionary with the GE store configuration.
        """
        return {
            "class_name": class_name,
            "store_backend": {
                "module_name": ZenMLArtifactStoreBackend.__module__,
                "class_name": ZenMLArtifactStoreBackend.__name__,
                # Namespace the store under this component's ID.
                "prefix": f"{str(self.id)}/{prefix}",
            },
        }

    def get_data_docs_config(
        self, prefix: str, local: bool = False
    ) -> Dict[str, Any]:
        """Generate Great Expectations data docs configuration.

        Args:
            prefix: The path prefix for the ZenML data docs configuration
            local: Whether the data docs site is local or remote.

        Returns:
            A dictionary with the GE data docs site configuration.
        """
        if local:
            store_backend = {
                "class_name": "TupleFilesystemStoreBackend",
                "base_directory": f"{self.root_directory}/{prefix}",
            }
        else:
            store_backend = {
                "module_name": ZenMLArtifactStoreBackend.__module__,
                "class_name": ZenMLArtifactStoreBackend.__name__,
                "prefix": f"{str(self.id)}/{prefix}",
            }

        return {
            "class_name": "SiteBuilder",
            "store_backend": store_backend,
            "site_index_builder": {
                "class_name": "DefaultSiteIndexBuilder",
            },
        }

    @property
    def data_context(self) -> BaseDataContext:
        """Returns the Great Expectations data context configured for this component.

        The context is built on first access and cached in `_context`.

        Returns:
            The Great Expectations data context configured for this component.
        """
        if not self._context:
            expectations_store_name = "zenml_expectations_store"
            validations_store_name = "zenml_validations_store"
            checkpoint_store_name = "zenml_checkpoint_store"
            profiler_store_name = "zenml_profiler_store"
            evaluation_parameter_store_name = "evaluation_parameter_store"

            # Default configuration in which every store and the data docs
            # site use the ZenML artifact store backend.
            zenml_context_config = dict(
                stores={
                    expectations_store_name: self.get_store_config(
                        "ExpectationsStore", "expectations"
                    ),
                    validations_store_name: self.get_store_config(
                        "ValidationsStore", "validations"
                    ),
                    checkpoint_store_name: self.get_store_config(
                        "CheckpointStore", "checkpoints"
                    ),
                    profiler_store_name: self.get_store_config(
                        "ProfilerStore", "profilers"
                    ),
                    evaluation_parameter_store_name: {
                        "class_name": "EvaluationParameterStore"
                    },
                },
                expectations_store_name=expectations_store_name,
                validations_store_name=validations_store_name,
                checkpoint_store_name=checkpoint_store_name,
                profiler_store_name=profiler_store_name,
                evaluation_parameter_store_name=evaluation_parameter_store_name,
                data_docs_sites={
                    "zenml_artifact_store": self.get_data_docs_config(
                        "data_docs"
                    )
                },
            )

            configure_zenml_stores = self.config.configure_zenml_stores
            if self.config.context_root_dir:
                # initialize the local data context, if a local path was
                # configured
                self._context = DataContext(self.config.context_root_dir)
            else:
                # create an in-memory data context configuration that is not
                # backed by a local YAML file (see https://docs.greatexpectations.io/docs/guides/setup/configuring_data_contexts/how_to_instantiate_a_data_context_without_a_yml_file/).
                if self.context_config:
                    context_config = DataContextConfig(**self.context_config)
                else:
                    context_config = DataContextConfig(**zenml_context_config)
                    # skip adding the stores after initialization, as they are
                    # already baked in the initial configuration
                    configure_zenml_stores = False
                self._context = BaseDataContext(project_config=context_config)

            if configure_zenml_stores:
                # Point an externally supplied context at the ZenML stores.
                self._context.config.expectations_store_name = (
                    expectations_store_name
                )
                self._context.config.validations_store_name = (
                    validations_store_name
                )
                self._context.config.checkpoint_store_name = (
                    checkpoint_store_name
                )
                self._context.config.profiler_store_name = profiler_store_name
                self._context.config.evaluation_parameter_store_name = (
                    evaluation_parameter_store_name
                )
                for store_name, store_config in zenml_context_config[  # type: ignore[attr-defined]
                    "stores"
                ].items():
                    self._context.add_store(
                        store_name=store_name,
                        store_config=store_config,
                    )
                for site_name, site_config in zenml_context_config[  # type: ignore[attr-defined]
                    "data_docs_sites"
                ].items():
                    self._context.config.data_docs_sites[
                        site_name
                    ] = site_config

            if self.config.configure_local_docs:
                client = Client()
                artifact_store = client.active_stack.artifact_store
                # A local docs site is only added when the active artifact
                # store is not itself of the "local" flavor.
                if artifact_store.flavor != "local":
                    self._context.config.data_docs_sites[
                        "zenml_local"
                    ] = self.get_data_docs_config("data_docs", local=True)

        return self._context

    @property
    def root_directory(self) -> str:
        """Returns path to the root directory for all local files concerning this data validator.

        The directory is created if it does not exist yet.

        Returns:
            Path to the root directory.
        """
        path = os.path.join(
            io_utils.get_global_config_directory(),
            self.flavor,
            str(self.id),
        )

        if not os.path.exists(path):
            fileio.makedirs(path)

        return path

    def data_profiling(
        self,
        dataset: pd.DataFrame,
        comparison_dataset: Optional[Any] = None,
        profile_list: Optional[Sequence[str]] = None,
        expectation_suite_name: Optional[str] = None,
        data_asset_name: Optional[str] = None,
        profiler_kwargs: Optional[Dict[str, Any]] = None,
        overwrite_existing_suite: bool = True,
        **kwargs: Any,
    ) -> ExpectationSuite:
        """Infer a Great Expectation Expectation Suite from a given dataset.

        This Great Expectations specific data profiling method implementation
        builds an Expectation Suite automatically by running a
        UserConfigurableProfiler on an input dataset [as covered in the official
        GE documentation](https://docs.greatexpectations.io/docs/guides/expectations/how_to_create_and_edit_expectations_with_a_profiler).

        Args:
            dataset: The dataset from which the expectation suite will be
                inferred.
            comparison_dataset: Optional dataset used to generate data
                comparison (i.e. data drift) profiles. Not supported by the
                Great Expectation data validator.
            profile_list: Optional list identifying the categories of data
                profiles to be generated. Not supported by the Great Expectation
                data validator.
            expectation_suite_name: The name of the expectation suite to create
                or update. If not supplied, a unique name will be generated from
                the current pipeline and step name, if running in the context of
                a pipeline step.
            data_asset_name: The name of the data asset to use to identify the
                dataset in the Great Expectations docs.
            profiler_kwargs: A dictionary of custom keyword arguments to pass to
                the profiler. `None` (the default) means no extra arguments.
            overwrite_existing_suite: Whether to overwrite an existing
                expectation suite, if one exists with that name.
            kwargs: Additional keyword arguments (unused).

        Returns:
            The inferred Expectation Suite.

        Raises:
            ValueError: if an `expectation_suite_name` value is not supplied and
                a name for the expectation suite cannot be generated from the
                current step name and pipeline name.
        """
        context = self.data_context

        if comparison_dataset is not None:
            logger.warning(
                "A comparison dataset is not required by Great Expectations "
                "to do data profiling. Silently ignoring the supplied dataset "
            )

        if not expectation_suite_name:
            try:
                step_context = get_step_context()
                pipeline_name = step_context.pipeline.name
                step_name = step_context.step_run.name
                expectation_suite_name = f"{pipeline_name}_{step_name}"
            except RuntimeError as e:
                raise ValueError(
                    "A expectation suite name is required when not running in "
                    "the context of a pipeline step."
                ) from e

        suite_exists = False
        if context.expectations_store.has_key(  # noqa
            ExpectationSuiteIdentifier(expectation_suite_name)
        ):
            suite_exists = True
            suite = context.get_expectation_suite(expectation_suite_name)
            if not overwrite_existing_suite:
                logger.info(
                    f"Expectation Suite `{expectation_suite_name}` "
                    f"already exists and `overwrite_existing_suite` is not set "
                    f"in the step configuration. Skipping re-running the "
                    f"profiler."
                )
                return suite

        batch_request = create_batch_request(context, dataset, data_asset_name)

        try:
            if suite_exists:
                validator = context.get_validator(
                    batch_request=batch_request,
                    expectation_suite_name=expectation_suite_name,
                )
            else:
                validator = context.get_validator(
                    batch_request=batch_request,
                    create_expectation_suite_with_name=expectation_suite_name,
                )

            # `profiler_kwargs` defaults to None, and `**None` raises a
            # TypeError, so fall back to an empty mapping.
            profiler = UserConfigurableProfiler(
                profile_dataset=validator, **(profiler_kwargs or {})
            )

            suite = profiler.build_suite()
            context.save_expectation_suite(
                expectation_suite=suite,
                expectation_suite_name=expectation_suite_name,
            )

            context.build_data_docs()
        finally:
            # Always drop the datasource referenced by the batch request.
            context.delete_datasource(batch_request.datasource_name)

        return suite

    def data_validation(
        self,
        dataset: pd.DataFrame,
        comparison_dataset: Optional[Any] = None,
        check_list: Optional[Sequence[str]] = None,
        expectation_suite_name: Optional[str] = None,
        data_asset_name: Optional[str] = None,
        action_list: Optional[List[Dict[str, Any]]] = None,
        **kwargs: Any,
    ) -> CheckpointResult:
        """Great Expectations data validation.

        This Great Expectations specific data validation method
        implementation validates an input dataset against an Expectation Suite
        (the GE definition of a profile) [as covered in the official GE
        documentation](https://docs.greatexpectations.io/docs/guides/validation/how_to_validate_data_by_running_a_checkpoint).

        Args:
            dataset: The dataset to validate.
            comparison_dataset: Optional dataset used to run data
                comparison (i.e. data drift) checks. Not supported by the
                Great Expectation data validator.
            check_list: Optional list identifying the data validation checks to
                be performed. Not supported by the Great Expectations data
                validator.
            expectation_suite_name: The name of the expectation suite to use to
                validate the dataset. A value must be provided.
            data_asset_name: The name of the data asset to use to identify the
                dataset in the Great Expectations docs.
            action_list: A list of additional Great Expectations actions to run after
                the validation check. When omitted or empty, a default list
                (store validation result, store evaluation parameters,
                update data docs) is used.
            kwargs: Additional keyword arguments (unused).

        Returns:
            The Great Expectations validation (checkpoint) result.

        Raises:
            ValueError: if the `expectation_suite_name` argument is omitted.
        """
        if not expectation_suite_name:
            raise ValueError("Missing expectation_suite_name argument value.")

        if comparison_dataset is not None:
            logger.warning(
                "A comparison dataset is not required by Great Expectations "
                "to do data validation. Silently ignoring the supplied dataset "
            )

        try:
            step_context = get_step_context()
            run_name = step_context.pipeline_run.name
            step_name = step_context.step_run.name
        except RuntimeError:
            # if not running inside a pipeline step, use random values
            run_name = f"pipeline_{random_str(5)}"
            step_name = f"step_{random_str(5)}"

        context = self.data_context

        checkpoint_name = f"{run_name}_{step_name}"

        batch_request = create_batch_request(context, dataset, data_asset_name)

        action_list = action_list or [
            {
                "name": "store_validation_result",
                "action": {"class_name": "StoreValidationResultAction"},
            },
            {
                "name": "store_evaluation_params",
                "action": {"class_name": "StoreEvaluationParametersAction"},
            },
            {
                "name": "update_data_docs",
                "action": {"class_name": "UpdateDataDocsAction"},
            },
        ]

        checkpoint_config = {
            "name": checkpoint_name,
            "run_name_template": run_name,
            "config_version": 1,
            "class_name": "Checkpoint",
            "expectation_suite_name": expectation_suite_name,
            "action_list": action_list,
        }

        try:
            # Registering the checkpoint inside the guarded region ensures
            # the datasource is removed even when registration fails.
            context.add_checkpoint(**checkpoint_config)
            try:
                results = context.run_checkpoint(
                    checkpoint_name=checkpoint_name,
                    validations=[{"batch_request": batch_request}],
                )
            finally:
                # Only reached once the checkpoint was added successfully.
                context.delete_checkpoint(checkpoint_name)
        finally:
            # Runs independently of the checkpoint cleanup, so a failure in
            # one cleanup step no longer skips the other.
            context.delete_datasource(batch_request.datasource_name)

        return results
config: GreatExpectationsDataValidatorConfig
property
readonly
Returns the `GreatExpectationsDataValidatorConfig` config.
Returns:
Type | Description |
---|---|
GreatExpectationsDataValidatorConfig |
The configuration. |
context_config: Optional[Dict[str, Any]]
property
readonly
Get the Great Expectations data context configuration.
The first time the context config is loaded from the stack component config, it is converted from JSON/YAML string format to a dict.
Exceptions:
Type | Description |
---|---|
ValueError |
If the context_config value is not a valid JSON/YAML or if the GE configuration extracted from it fails GE validation. |
Returns:
Type | Description |
---|---|
Optional[Dict[str, Any]] |
A dictionary with the GE data context configuration. |
data_context: BaseDataContext
property
readonly
Returns the Great Expectations data context configured for this component.
Returns:
Type | Description |
---|---|
BaseDataContext |
The Great Expectations data context configured for this component. |
local_path: Optional[str]
property
readonly
Return a local path where this component stores information.
If an existing local GE data context is used, it is interpreted as a local path that needs to be accessible in all runtime environments.
Returns:
Type | Description |
---|---|
Optional[str] |
The local path where this component stores information. |
root_directory: str
property
readonly
Returns path to the root directory for all local files concerning this data validator.
Returns:
Type | Description |
---|---|
str |
Path to the root directory. |
FLAVOR (BaseDataValidatorFlavor)
Great Expectations data validator flavor.
Source code in zenml/integrations/great_expectations/data_validators/ge_data_validator.py
class GreatExpectationsDataValidatorFlavor(BaseDataValidatorFlavor):
    """Flavor descriptor for the Great Expectations data validator.

    Exposes the flavor name, documentation and logo URLs, and the config and
    implementation classes that belong to this flavor.
    """

    @property
    def name(self) -> str:
        """Name of the flavor.

        Returns:
            The flavor name constant for the Great Expectations data
            validator.
        """
        flavor_name: str = GREAT_EXPECTATIONS_DATA_VALIDATOR_FLAVOR
        return flavor_name

    @property
    def docs_url(self) -> Optional[str]:
        """URL of the documentation page explaining this flavor.

        Returns:
            The value produced by `generate_default_docs_url()`.
        """
        url = self.generate_default_docs_url()
        return url

    @property
    def sdk_docs_url(self) -> Optional[str]:
        """URL of the SDK documentation page explaining this flavor.

        Returns:
            The value produced by `generate_default_sdk_docs_url()`.
        """
        url = self.generate_default_sdk_docs_url()
        return url

    @property
    def logo_url(self) -> str:
        """URL of the image that represents the flavor in the dashboard.

        Returns:
            The flavor logo URL.
        """
        logo = "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/data_validator/greatexpectations.jpeg"
        return logo

    @property
    def config_class(self) -> Type[GreatExpectationsDataValidatorConfig]:
        """Configuration class used by this flavor.

        Returns:
            The `GreatExpectationsDataValidatorConfig` class.
        """
        return GreatExpectationsDataValidatorConfig

    @property
    def implementation_class(self) -> Type["GreatExpectationsDataValidator"]:
        """Implementation class for this flavor.

        The class is imported inside the property rather than at module
        import time.

        Returns:
            The `GreatExpectationsDataValidator` class.
        """
        from zenml.integrations.great_expectations.data_validators import (
            GreatExpectationsDataValidator as implementation,
        )

        return implementation
config_class: Type[zenml.integrations.great_expectations.flavors.great_expectations_data_validator_flavor.GreatExpectationsDataValidatorConfig]
property
readonly
Returns `GreatExpectationsDataValidatorConfig` config class.
Returns:
Type | Description |
---|---|
Type[zenml.integrations.great_expectations.flavors.great_expectations_data_validator_flavor.GreatExpectationsDataValidatorConfig] |
The config class. |
docs_url: Optional[str]
property
readonly
A url to point at docs explaining this flavor.
Returns:
Type | Description |
---|---|
Optional[str] |
A flavor docs url. |
implementation_class: Type[GreatExpectationsDataValidator]
property
readonly
Implementation class for this flavor.
Returns:
Type | Description |
---|---|
Type[GreatExpectationsDataValidator] |
The implementation class. |
logo_url: str
property
readonly
A url to represent the flavor in the dashboard.
Returns:
Type | Description |
---|---|
str |
The flavor logo. |
name: str
property
readonly
Name of the flavor.
Returns:
Type | Description |
---|---|
str |
The name of the flavor. |
sdk_docs_url: Optional[str]
property
readonly
A url to point at SDK docs explaining this flavor.
Returns:
Type | Description |
---|---|
Optional[str] |
A flavor SDK docs url. |
data_profiling(self, dataset, comparison_dataset=None, profile_list=None, expectation_suite_name=None, data_asset_name=None, profiler_kwargs=None, overwrite_existing_suite=True, **kwargs)
Infer a Great Expectation Expectation Suite from a given dataset.
This Great Expectations specific data profiling method implementation builds an Expectation Suite automatically by running a UserConfigurableProfiler on an input dataset as covered in the official GE documentation.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
DataFrame |
The dataset from which the expectation suite will be inferred. |
required |
comparison_dataset |
Optional[Any] |
Optional dataset used to generate data comparison (i.e. data drift) profiles. Not supported by the Great Expectation data validator. |
None |
profile_list |
Optional[Sequence[str]] |
Optional list identifying the categories of data profiles to be generated. Not supported by the Great Expectation data validator. |
None |
expectation_suite_name |
Optional[str] |
The name of the expectation suite to create or update. If not supplied, a unique name will be generated from the current pipeline and step name, if running in the context of a pipeline step. |
None |
data_asset_name |
Optional[str] |
The name of the data asset to use to identify the dataset in the Great Expectations docs. |
None |
profiler_kwargs |
Optional[Dict[str, Any]] |
A dictionary of custom keyword arguments to pass to the profiler. |
None |
overwrite_existing_suite |
bool |
Whether to overwrite an existing expectation suite, if one exists with that name. |
True |
kwargs |
Any |
Additional keyword arguments (unused). |
{} |
Returns:
Type | Description |
---|---|
ExpectationSuite |
The inferred Expectation Suite. |
Exceptions:
Type | Description |
---|---|
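ValueError |
if an `expectation_suite_name` value is not supplied and a name for the expectation suite cannot be generated from the current step name and pipeline name. |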
Source code in zenml/integrations/great_expectations/data_validators/ge_data_validator.py
def data_profiling(
    self,
    dataset: pd.DataFrame,
    comparison_dataset: Optional[Any] = None,
    profile_list: Optional[Sequence[str]] = None,
    expectation_suite_name: Optional[str] = None,
    data_asset_name: Optional[str] = None,
    profiler_kwargs: Optional[Dict[str, Any]] = None,
    overwrite_existing_suite: bool = True,
    **kwargs: Any,
) -> ExpectationSuite:
    """Infer a Great Expectation Expectation Suite from a given dataset.

    This Great Expectations specific data profiling method implementation
    builds an Expectation Suite automatically by running a
    UserConfigurableProfiler on an input dataset [as covered in the official
    GE documentation](https://docs.greatexpectations.io/docs/guides/expectations/how_to_create_and_edit_expectations_with_a_profiler).

    Args:
        dataset: The dataset from which the expectation suite will be
            inferred.
        comparison_dataset: Optional dataset used to generate data
            comparison (i.e. data drift) profiles. Not supported by the
            Great Expectation data validator.
        profile_list: Optional list identifying the categories of data
            profiles to be generated. Not supported by the Great Expectation
            data validator.
        expectation_suite_name: The name of the expectation suite to create
            or update. If not supplied, a unique name will be generated from
            the current pipeline and step name, if running in the context of
            a pipeline step.
        data_asset_name: The name of the data asset to use to identify the
            dataset in the Great Expectations docs.
        profiler_kwargs: A dictionary of custom keyword arguments to pass to
            the profiler. `None` (the default) means no extra arguments.
        overwrite_existing_suite: Whether to overwrite an existing
            expectation suite, if one exists with that name.
        kwargs: Additional keyword arguments (unused).

    Returns:
        The inferred Expectation Suite.

    Raises:
        ValueError: if an `expectation_suite_name` value is not supplied and
            a name for the expectation suite cannot be generated from the
            current step name and pipeline name.
    """
    context = self.data_context

    if comparison_dataset is not None:
        logger.warning(
            "A comparison dataset is not required by Great Expectations "
            "to do data profiling. Silently ignoring the supplied dataset "
        )

    if not expectation_suite_name:
        try:
            step_context = get_step_context()
            pipeline_name = step_context.pipeline.name
            step_name = step_context.step_run.name
            expectation_suite_name = f"{pipeline_name}_{step_name}"
        except RuntimeError as e:
            raise ValueError(
                "A expectation suite name is required when not running in "
                "the context of a pipeline step."
            ) from e

    suite_exists = False
    if context.expectations_store.has_key(  # noqa
        ExpectationSuiteIdentifier(expectation_suite_name)
    ):
        suite_exists = True
        suite = context.get_expectation_suite(expectation_suite_name)
        if not overwrite_existing_suite:
            logger.info(
                f"Expectation Suite `{expectation_suite_name}` "
                f"already exists and `overwrite_existing_suite` is not set "
                f"in the step configuration. Skipping re-running the "
                f"profiler."
            )
            return suite

    batch_request = create_batch_request(context, dataset, data_asset_name)

    try:
        if suite_exists:
            validator = context.get_validator(
                batch_request=batch_request,
                expectation_suite_name=expectation_suite_name,
            )
        else:
            validator = context.get_validator(
                batch_request=batch_request,
                create_expectation_suite_with_name=expectation_suite_name,
            )

        # `profiler_kwargs` defaults to None, and `**None` raises a
        # TypeError, so fall back to an empty mapping.
        profiler = UserConfigurableProfiler(
            profile_dataset=validator, **(profiler_kwargs or {})
        )

        suite = profiler.build_suite()
        context.save_expectation_suite(
            expectation_suite=suite,
            expectation_suite_name=expectation_suite_name,
        )

        context.build_data_docs()
    finally:
        # Always drop the datasource referenced by the batch request.
        context.delete_datasource(batch_request.datasource_name)

    return suite
data_validation(self, dataset, comparison_dataset=None, check_list=None, expectation_suite_name=None, data_asset_name=None, action_list=None, **kwargs)
Great Expectations data validation.
This Great Expectations specific data validation method implementation validates an input dataset against an Expectation Suite (the GE definition of a profile) as covered in the official GE documentation.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
DataFrame |
The dataset to validate. |
required |
comparison_dataset |
Optional[Any] |
Optional dataset used to run data comparison (i.e. data drift) checks. Not supported by the Great Expectation data validator. |
None |
check_list |
Optional[Sequence[str]] |
Optional list identifying the data validation checks to be performed. Not supported by the Great Expectations data validator. |
None |
expectation_suite_name |
Optional[str] |
The name of the expectation suite to use to validate the dataset. A value must be provided. |
None |
data_asset_name |
Optional[str] |
The name of the data asset to use to identify the dataset in the Great Expectations docs. |
None |
action_list |
Optional[List[Dict[str, Any]]] |
A list of additional Great Expectations actions to run after the validation check. |
None |
kwargs |
Any |
Additional keyword arguments (unused). |
{} |
Returns:
Type | Description |
---|---|
CheckpointResult |
The Great Expectations validation (checkpoint) result. |
Exceptions:
Type | Description |
---|---|
ValueError |
if the `expectation_suite_name` argument is omitted. |
Source code in zenml/integrations/great_expectations/data_validators/ge_data_validator.py
def data_validation(
    self,
    dataset: pd.DataFrame,
    comparison_dataset: Optional[Any] = None,
    check_list: Optional[Sequence[str]] = None,
    expectation_suite_name: Optional[str] = None,
    data_asset_name: Optional[str] = None,
    action_list: Optional[List[Dict[str, Any]]] = None,
    **kwargs: Any,
) -> CheckpointResult:
    """Great Expectations data validation.

    This Great Expectations specific data validation method
    implementation validates an input dataset against an Expectation Suite
    (the GE definition of a profile) [as covered in the official GE
    documentation](https://docs.greatexpectations.io/docs/guides/validation/how_to_validate_data_by_running_a_checkpoint).

    Args:
        dataset: The dataset to validate.
        comparison_dataset: Optional dataset used to run data
            comparison (i.e. data drift) checks. Not supported by the
            Great Expectation data validator.
        check_list: Optional list identifying the data validation checks to
            be performed. Not supported by the Great Expectations data
            validator.
        expectation_suite_name: The name of the expectation suite to use to
            validate the dataset. A value must be provided.
        data_asset_name: The name of the data asset to use to identify the
            dataset in the Great Expectations docs.
        action_list: A list of additional Great Expectations actions to run after
            the validation check. When omitted or empty, a default list
            (store validation result, store evaluation parameters, update
            data docs) is used.
        kwargs: Additional keyword arguments (unused).

    Returns:
        The Great Expectations validation (checkpoint) result.

    Raises:
        ValueError: if the `expectation_suite_name` argument is omitted.
    """
    if not expectation_suite_name:
        raise ValueError("Missing expectation_suite_name argument value.")

    if comparison_dataset is not None:
        logger.warning(
            "A comparison dataset is not required by Great Expectations "
            "to do data validation. Silently ignoring the supplied dataset "
        )

    try:
        step_context = get_step_context()
        run_name = step_context.pipeline_run.name
        step_name = step_context.step_run.name
    except RuntimeError:
        # if not running inside a pipeline step, use random values
        run_name = f"pipeline_{random_str(5)}"
        step_name = f"step_{random_str(5)}"

    context = self.data_context

    checkpoint_name = f"{run_name}_{step_name}"

    batch_request = create_batch_request(context, dataset, data_asset_name)

    action_list = action_list or [
        {
            "name": "store_validation_result",
            "action": {"class_name": "StoreValidationResultAction"},
        },
        {
            "name": "store_evaluation_params",
            "action": {"class_name": "StoreEvaluationParametersAction"},
        },
        {
            "name": "update_data_docs",
            "action": {"class_name": "UpdateDataDocsAction"},
        },
    ]

    checkpoint_config = {
        "name": checkpoint_name,
        "run_name_template": run_name,
        "config_version": 1,
        "class_name": "Checkpoint",
        "expectation_suite_name": expectation_suite_name,
        "action_list": action_list,
    }

    try:
        # Registering the checkpoint inside the guarded region ensures the
        # datasource is removed even when registration fails.
        context.add_checkpoint(**checkpoint_config)
        try:
            results = context.run_checkpoint(
                checkpoint_name=checkpoint_name,
                validations=[{"batch_request": batch_request}],
            )
        finally:
            # Only reached once the checkpoint was added successfully.
            context.delete_checkpoint(checkpoint_name)
    finally:
        # Runs independently of the checkpoint cleanup, so a failure in one
        # cleanup step no longer skips the other.
        context.delete_datasource(batch_request.datasource_name)

    return results
get_data_context()
classmethod
Get the Great Expectations data context managed by ZenML.
Call this method to retrieve the data context managed by ZenML through the active Great Expectations data validator stack component.
Returns:
Type | Description |
---|---|
BaseDataContext |
A Great Expectations data context managed by ZenML as configured through the active data validator stack component. |
Source code in zenml/integrations/great_expectations/data_validators/ge_data_validator.py
@classmethod
def get_data_context(cls) -> BaseDataContext:
    """Return the Great Expectations data context managed by ZenML.

    Looks up the active data validator through
    `cls.get_active_data_validator()`, treats it as a
    `GreatExpectationsDataValidator` and hands back its `data_context`.

    Returns:
        The Great Expectations data context of the active data validator
        stack component.
    """
    active_validator = cls.get_active_data_validator()
    # `cast` only informs the type checker; no runtime check is performed.
    ge_validator = cast("GreatExpectationsDataValidator", active_validator)
    return ge_validator.data_context
get_data_docs_config(self, prefix, local=False)
Generate Great Expectations data docs configuration.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
prefix |
str |
The path prefix for the ZenML data docs configuration |
required |
local |
bool |
Whether the data docs site is local or remote. |
False |
Returns:
Type | Description |
---|---|
Dict[str, Any] |
A dictionary with the GE data docs site configuration. |
Source code in zenml/integrations/great_expectations/data_validators/ge_data_validator.py
def get_data_docs_config(
    self, prefix: str, local: bool = False
) -> Dict[str, Any]:
    """Build the configuration of a Great Expectations data docs site.

    Args:
        prefix: The path prefix for the ZenML data docs configuration.
        local: Whether the data docs site is local or remote.

    Returns:
        A dictionary with the GE data docs site configuration.
    """
    if not local:
        # Remote site: documents are written through the ZenML store
        # backend, under a prefix that starts with this component's ID.
        backend_config = {
            "module_name": ZenMLArtifactStoreBackend.__module__,
            "class_name": ZenMLArtifactStoreBackend.__name__,
            "prefix": f"{str(self.id)}/{prefix}",
        }
    else:
        # Local site: documents are written below the root directory.
        backend_config = {
            "class_name": "TupleFilesystemStoreBackend",
            "base_directory": f"{self.root_directory}/{prefix}",
        }

    site_config: Dict[str, Any] = {
        "class_name": "SiteBuilder",
        "store_backend": backend_config,
        "site_index_builder": {
            "class_name": "DefaultSiteIndexBuilder",
        },
    }
    return site_config
get_store_config(self, class_name, prefix)
Generate a Great Expectations store configuration.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
class_name |
str |
The store class name |
required |
prefix |
str |
The path prefix for the ZenML store configuration |
required |
Returns:
Type | Description |
---|---|
Dict[str, Any] |
A dictionary with the GE store configuration. |
Source code in zenml/integrations/great_expectations/data_validators/ge_data_validator.py
def get_store_config(self, class_name: str, prefix: str) -> Dict[str, Any]:
    """Build the configuration of a Great Expectations store.

    The store always uses `ZenMLArtifactStoreBackend` as its backend, with
    a prefix that starts with the ID of this component.

    Args:
        class_name: The store class name.
        prefix: The path prefix for the ZenML store configuration.

    Returns:
        A dictionary with the GE store configuration.
    """
    backend_cls = ZenMLArtifactStoreBackend
    store_backend = {
        "module_name": backend_cls.__module__,
        "class_name": backend_cls.__name__,
        "prefix": f"{str(self.id)}/{prefix}",
    }
    return {"class_name": class_name, "store_backend": store_backend}
flavors
special
Great Expectations integration flavors.
great_expectations_data_validator_flavor
Great Expectations data validator flavor.
GreatExpectationsDataValidatorConfig (BaseDataValidatorConfig)
pydantic-model
Config for the Great Expectations data validator.
Attributes:
Name | Type | Description |
---|---|---|
context_root_dir |
Optional[str] |
location of an already initialized Great Expectations data context. If configured, the data validator will only be usable with local orchestrators. |
context_config |
Optional[Dict[str, Any]] |
in-line Great Expectations data context configuration. |
configure_zenml_stores |
bool |
if set, ZenML will automatically configure stores that use the Artifact Store as a backend. If neither `context_root_dir` nor `context_config` are set, this is the default behavior. |
configure_local_docs |
bool |
configure a local data docs site where Great Expectations docs are generated and can be visualized locally. |
Source code in zenml/integrations/great_expectations/flavors/great_expectations_data_validator_flavor.py
class GreatExpectationsDataValidatorConfig(BaseDataValidatorConfig):
    """Config for the Great Expectations data validator.

    Attributes:
        context_root_dir: location of an already initialized Great Expectations
            data context. If configured, the data validator will only be usable
            with local orchestrators.
        context_config: in-line Great Expectations data context configuration.
        configure_zenml_stores: if set, ZenML will automatically configure
            stores that use the Artifact Store as a backend. If neither
            `context_root_dir` nor `context_config` are set, this is the default
            behavior.
        configure_local_docs: configure a local data docs site where Great
            Expectations docs are generated and can be visualized locally.
    """

    context_root_dir: Optional[str] = None
    context_config: Optional[Dict[str, Any]] = None
    configure_zenml_stores: bool = False
    configure_local_docs: bool = True

    @validator("context_root_dir")
    def _ensure_valid_context_root_dir(
        cls, context_root_dir: Optional[str] = None
    ) -> Optional[str]:
        """Make the root directory absolute and check that it exists.

        Args:
            context_root_dir: The context_root_dir value to validate.

        Returns:
            The absolute context_root_dir if it points to an existing path;
            an unset (falsy) value is returned unchanged.

        Raises:
            ValueError: If the context_root_dir does not exist.
        """
        # Nothing to validate when no directory was configured.
        if not context_root_dir:
            return context_root_dir

        context_root_dir = os.path.abspath(context_root_dir)
        if fileio.exists(context_root_dir):
            return context_root_dir

        raise ValueError(
            f"The Great Expectations context_root_dir value doesn't "
            f"point to an existing data context path: {context_root_dir}"
        )

    @property
    def is_local(self) -> bool:
        """Checks if this stack component is running locally.

        Returns:
            True if this config is for a local component, False otherwise.
        """
        # If an existing local GE data context is used, it is
        # interpreted as a local path that needs to be accessible in
        # all runtime environments.
        uses_existing_context = self.context_root_dir is not None
        return uses_existing_context
is_local: bool
property
readonly
Checks if this stack component is running locally.
Returns:
Type | Description |
---|---|
bool |
True if this config is for a local component, False otherwise. |
GreatExpectationsDataValidatorFlavor (BaseDataValidatorFlavor)
Great Expectations data validator flavor.
Source code in zenml/integrations/great_expectations/flavors/great_expectations_data_validator_flavor.py
class GreatExpectationsDataValidatorFlavor(BaseDataValidatorFlavor):
    """Great Expectations data validator flavor."""

    @property
    def name(self) -> str:
        """Name of the flavor.

        Returns:
            The name of the flavor.
        """
        flavor_name = GREAT_EXPECTATIONS_DATA_VALIDATOR_FLAVOR
        return flavor_name

    @property
    def docs_url(self) -> Optional[str]:
        """A url to point at docs explaining this flavor.

        Returns:
            The url produced by `generate_default_docs_url()`.
        """
        default_docs_url = self.generate_default_docs_url()
        return default_docs_url

    @property
    def sdk_docs_url(self) -> Optional[str]:
        """A url to point at SDK docs explaining this flavor.

        Returns:
            The url produced by `generate_default_sdk_docs_url()`.
        """
        default_sdk_docs_url = self.generate_default_sdk_docs_url()
        return default_sdk_docs_url

    @property
    def logo_url(self) -> str:
        """A url to represent the flavor in the dashboard.

        Returns:
            The flavor logo.
        """
        logo = "https://public-flavor-logos.s3.eu-central-1.amazonaws.com/data_validator/greatexpectations.jpeg"
        return logo

    @property
    def config_class(self) -> Type[GreatExpectationsDataValidatorConfig]:
        """Returns `GreatExpectationsDataValidatorConfig` config class.

        Returns:
            The config class.
        """
        return GreatExpectationsDataValidatorConfig

    @property
    def implementation_class(self) -> Type["GreatExpectationsDataValidator"]:
        """Implementation class for this flavor.

        Returns:
            The implementation class.
        """
        # Imported inside the property rather than at module level.
        from zenml.integrations.great_expectations.data_validators import (
            GreatExpectationsDataValidator as implementation,
        )

        return implementation
config_class: Type[zenml.integrations.great_expectations.flavors.great_expectations_data_validator_flavor.GreatExpectationsDataValidatorConfig]
property
readonly
Returns the `GreatExpectationsDataValidatorConfig` config class.
Returns:
Type | Description |
---|---|
Type[zenml.integrations.great_expectations.flavors.great_expectations_data_validator_flavor.GreatExpectationsDataValidatorConfig] |
The config class. |
docs_url: Optional[str]
property
readonly
A url to point at docs explaining this flavor.
Returns:
Type | Description |
---|---|
Optional[str] |
A flavor docs url. |
implementation_class: Type[GreatExpectationsDataValidator]
property
readonly
Implementation class for this flavor.
Returns:
Type | Description |
---|---|
Type[GreatExpectationsDataValidator] |
The implementation class. |
logo_url: str
property
readonly
A url to represent the flavor in the dashboard.
Returns:
Type | Description |
---|---|
str |
The flavor logo. |
name: str
property
readonly
Name of the flavor.
Returns:
Type | Description |
---|---|
str |
The name of the flavor. |
sdk_docs_url: Optional[str]
property
readonly
A url to point at SDK docs explaining this flavor.
Returns:
Type | Description |
---|---|
Optional[str] |
A flavor SDK docs url. |
ge_store_backend
Great Expectations store plugin for ZenML.
ZenMLArtifactStoreBackend (TupleStoreBackend)
Great Expectations store backend that uses the active ZenML Artifact Store as a store.
Source code in zenml/integrations/great_expectations/ge_store_backend.py
class ZenMLArtifactStoreBackend(TupleStoreBackend):  # type: ignore[misc]
    """Great Expectations store backend that uses the active ZenML Artifact Store as a store."""

    def __init__(
        self,
        prefix: str = "",
        **kwargs: Any,
    ) -> None:
        """Create a Great Expectations ZenML store backend instance.

        Args:
            prefix: Subpath prefix to use for this store backend.
            kwargs: Additional keyword arguments passed by the Great Expectations
                core. These are transparently passed to the `TupleStoreBackend`
                constructor.
        """
        super().__init__(**kwargs)

        client = Client()
        artifact_store = client.active_stack.artifact_store
        self.root_path = os.path.join(
            artifact_store.path, "great_expectations"
        )

        # extract the protocol used in the artifact store root path
        # (first supported scheme the root path starts with, else "")
        self.proto = next(
            (
                scheme
                for scheme in artifact_store.config.SUPPORTED_SCHEMES
                if self.root_path.startswith(scheme)
            ),
            "",
        )

        if prefix:
            if self.platform_specific_separator:
                prefix = prefix.strip(os.sep)
            prefix = prefix.strip("/")
        self.prefix = prefix

        # Initialize with store_backend_id if not part of an HTMLSiteStore
        if not self._suppress_store_backend_id:
            _ = self.store_backend_id

        self._config = {
            "prefix": prefix,
            "module_name": self.__class__.__module__,
            "class_name": self.__class__.__name__,
        }
        self._config.update(kwargs)
        filter_properties_dict(
            properties=self._config, clean_falsy=True, inplace=True
        )

    def _build_object_path(
        self, key: Tuple[str, ...], is_prefix: bool = False
    ) -> str:
        """Build a filepath corresponding to an object key.

        Args:
            key: Great Expectation object key.
            is_prefix: If True, the key will be interpreted as a prefix instead
                of a full key identifier.

        Returns:
            The file path pointing to where the object is stored.
        """
        # Keys that are not plain tuples are converted with `to_tuple()`.
        if not isinstance(key, tuple):
            key = key.to_tuple()

        if not is_prefix:
            object_relative_path = self._convert_key_to_filepath(key)
        elif key:
            object_relative_path = os.path.join(*key)
        else:
            object_relative_path = ""

        if self.prefix:
            object_key = os.path.join(self.prefix, object_relative_path)
        else:
            object_key = object_relative_path
        return os.path.join(self.root_path, object_key)

    def _get(self, key: Tuple[str, ...]) -> str:
        """Get the value of an object from the store.

        Args:
            key: object key identifier.

        Raises:
            InvalidKeyError: if the key doesn't point to an existing object.

        Returns:
            str: the object's contents, with trailing newlines stripped.
        """
        filepath: str = self._build_object_path(key)
        if not fileio.exists(filepath):
            raise InvalidKeyError(
                f"Unable to retrieve object from {self.__class__.__name__} with "
                f"the following Key: {str(filepath)}"
            )
        contents = io_utils.read_file_contents_as_string(filepath).rstrip(
            "\n"
        )
        return contents

    def _set(self, key: Tuple[str, ...], value: str, **kwargs: Any) -> str:
        """Set the value of an object in the store.

        Args:
            key: object key identifier.
            value: object value to set.
            kwargs: additional keyword arguments (ignored).

        Returns:
            The file path where the object was stored.
        """
        filepath: str = self._build_object_path(key)
        # Parent directories are only created explicitly for local paths.
        if not io_utils.is_remote(filepath):
            parent_dir = str(Path(filepath).parent)
            os.makedirs(parent_dir, exist_ok=True)

        with fileio.open(filepath, "wb") as outfile:
            # The file is opened in binary mode, so text is UTF-8 encoded;
            # any other value is written as-is.
            if isinstance(value, str):
                outfile.write(value.encode("utf-8"))
            else:
                outfile.write(value)
        return filepath

    def _move(
        self,
        source_key: Tuple[str, ...],
        dest_key: Tuple[str, ...],
        **kwargs: Any,
    ) -> None:
        """Associate an object with a different key in the store.

        Nothing happens if the source object does not exist.

        Args:
            source_key: current object key identifier.
            dest_key: new object key identifier.
            kwargs: additional keyword arguments (ignored).
        """
        source_path = self._build_object_path(source_key)
        dest_path = self._build_object_path(dest_key)
        if fileio.exists(source_path):
            if not io_utils.is_remote(dest_path):
                parent_dir = str(Path(dest_path).parent)
                os.makedirs(parent_dir, exist_ok=True)
            fileio.rename(source_path, dest_path, overwrite=True)

    def list_keys(self, prefix: Tuple[str, ...] = ()) -> List[Tuple[str, ...]]:
        """List the keys of all objects identified by a partial key.

        Args:
            prefix: partial object key identifier.

        Returns:
            List of keys identifying all objects present in the store that
            match the input partial key.
        """
        key_list = []
        list_path = self._build_object_path(prefix, is_prefix=True)
        root_path = self._build_object_path(tuple(), is_prefix=True)
        for root, _dirs, files in fileio.walk(list_path):
            for file_ in files:
                # Keys are derived from the path relative to the store root,
                # not relative to the listed sub-path.
                filepath = os.path.relpath(
                    os.path.join(str(root), str(file_)), root_path
                )
                if self.filepath_prefix and not filepath.startswith(
                    self.filepath_prefix
                ):
                    continue
                if self.filepath_suffix and not filepath.endswith(
                    self.filepath_suffix
                ):
                    continue
                key = self._convert_filepath_to_key(filepath)
                if key and not self.is_ignored_key(key):
                    key_list.append(key)
        return key_list

    def remove_key(self, key: Tuple[str, ...]) -> bool:
        """Delete an object from the store.

        Args:
            key: object key identifier.

        Returns:
            True if the object existed in the store and was removed, otherwise
            False.
        """
        filepath: str = self._build_object_path(key)
        if fileio.exists(filepath):
            fileio.remove(filepath)
            # For local paths, also prune directories left empty by the
            # removal, up to (but not including) the store root.
            if not io_utils.is_remote(filepath):
                parent_dir = str(Path(filepath).parent)
                self.rrmdir(self.root_path, str(parent_dir))
            return True
        return False

    def _has_key(self, key: Tuple[str, ...]) -> bool:
        """Check if an object is present in the store.

        Args:
            key: object key identifier.

        Returns:
            True if the object is present in the store, otherwise False.
        """
        filepath: str = self._build_object_path(key)
        result = fileio.exists(filepath)
        return result

    def get_url_for_key(
        self, key: Tuple[str, ...], protocol: Optional[str] = None
    ) -> str:
        """Get the URL of an object in the store.

        Args:
            key: object key identifier.
            protocol: optional protocol to use instead of the store protocol.
                Only `//` is appended to it, so it is expected to carry its
                own trailing colon (the local default is `file:`).

        Returns:
            The URL of the object in the store.
        """
        filepath = self._build_object_path(key)
        if not protocol and not io_utils.is_remote(filepath):
            protocol = "file:"
        if protocol:
            # Only the first occurrence of the store protocol is replaced.
            filepath = filepath.replace(self.proto, f"{protocol}//", 1)
        return filepath

    def get_public_url_for_key(
        self, key: str, protocol: Optional[str] = None
    ) -> str:
        """Get the public URL of an object in the store.

        Args:
            key: object key identifier.
            protocol: accepted for interface compatibility; this
                implementation does not use it.

        Returns:
            The public URL where the object can be accessed.

        Raises:
            StoreBackendError: if a `base_public_path` attribute was not
                configured for the store.
        """
        if not self.base_public_path:
            raise StoreBackendError(
                f"Error: No base_public_path was configured! A public URL was "
                f"requested but `base_public_path` was not configured for the "
                f"{self.__class__.__name__}"
            )
        filepath = self._convert_key_to_filepath(key)
        public_url = self.base_public_path + filepath.replace(self.proto, "")
        return cast(str, public_url)

    @staticmethod
    def rrmdir(start_path: str, end_path: str) -> None:
        """Remove empty directories from end_path up towards start_path.

        Starting at `end_path`, empty directories are removed one level at a
        time until a non-empty directory or `start_path` is reached.
        `start_path` itself is never removed, and nothing is removed when
        `end_path` is not located below `start_path`.

        Args:
            start_path: Directory to use as a starting point (upper bound).
            end_path: Directory to use as a destination point (deepest
                directory, removed first).
        """
        # Compare normalized paths so that differences in spelling (trailing
        # separators, duplicate separators, "." segments) cannot make the
        # walk miss the upper bound and delete directories above it.
        boundary = os.path.normpath(start_path)
        current = os.path.normpath(end_path)
        inside_prefix = boundary.rstrip(os.sep) + os.sep
        while (
            current != boundary
            and current.startswith(inside_prefix)
            and not os.listdir(current)
        ):
            os.rmdir(current)
            current = os.path.dirname(current)

    @property
    def config(self) -> Dict[str, Any]:
        """Get the store configuration.

        Returns:
            The store configuration.
        """
        return self._config
config: Dict[str, Any]
property
readonly
Get the store configuration.
Returns:
Type | Description |
---|---|
Dict[str, Any] |
The store configuration. |
__init__(self, prefix='', **kwargs)
special
Create a Great Expectations ZenML store backend instance.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
prefix |
str |
Subpath prefix to use for this store backend. |
'' |
kwargs |
Any |
Additional keyword arguments passed by the Great Expectations core. These are transparently passed to the `TupleStoreBackend` constructor. |
{} |
Source code in zenml/integrations/great_expectations/ge_store_backend.py
def __init__(
    self,
    prefix: str = "",
    **kwargs: Any,
) -> None:
    """Create a Great Expectations ZenML store backend instance.

    Args:
        prefix: Subpath prefix to use for this store backend.
        kwargs: Additional keyword arguments passed by the Great Expectations
            core. These are transparently passed to the `TupleStoreBackend`
            constructor.
    """
    super().__init__(**kwargs)

    artifact_store = Client().active_stack.artifact_store
    self.root_path = os.path.join(artifact_store.path, "great_expectations")

    # extract the protocol used in the artifact store root path
    # (first supported scheme the root path starts with, else "")
    self.proto = next(
        (
            scheme
            for scheme in artifact_store.config.SUPPORTED_SCHEMES
            if self.root_path.startswith(scheme)
        ),
        "",
    )

    if prefix:
        if self.platform_specific_separator:
            prefix = prefix.strip(os.sep)
        prefix = prefix.strip("/")
    self.prefix = prefix

    # Initialize with store_backend_id if not part of an HTMLSiteStore
    if not self._suppress_store_backend_id:
        _ = self.store_backend_id

    backend_cls = self.__class__
    self._config = {
        "prefix": prefix,
        "module_name": backend_cls.__module__,
        "class_name": backend_cls.__name__,
    }
    self._config.update(kwargs)
    filter_properties_dict(
        properties=self._config, clean_falsy=True, inplace=True
    )
get_public_url_for_key(self, key, protocol=None)
Get the public URL of an object in the store.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
key |
str |
object key identifier. |
required |
protocol |
Optional[str] |
optional protocol to use instead of the store protocol. |
None |
Returns:
Type | Description |
---|---|
str |
The public URL where the object can be accessed. |
Exceptions:
Type | Description |
---|---|
StoreBackendError |
if a `base_public_path` attribute was not configured for the store. |
Source code in zenml/integrations/great_expectations/ge_store_backend.py
def get_public_url_for_key(
    self, key: str, protocol: Optional[str] = None
) -> str:
    """Get the public URL of an object in the store.

    Args:
        key: object key identifier.
        protocol: accepted for interface compatibility; this
            implementation does not use it.

    Returns:
        The public URL where the object can be accessed.

    Raises:
        StoreBackendError: if a `base_public_path` attribute was not
            configured for the store.
    """
    base_url = self.base_public_path
    if not base_url:
        raise StoreBackendError(
            f"Error: No base_public_path was configured! A public URL was "
            f"requested but `base_public_path` was not configured for the "
            f"{self.__class__.__name__}"
        )
    # Drop any occurrence of the store protocol from the relative path
    # before appending it to the public base.
    relative_path = self._convert_key_to_filepath(key).replace(self.proto, "")
    return cast(str, base_url + relative_path)
get_url_for_key(self, key, protocol=None)
Get the URL of an object in the store.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
key |
Tuple[str, ...] |
object key identifier. |
required |
protocol |
Optional[str] |
optional protocol to use instead of the store protocol. |
None |
Returns:
Type | Description |
---|---|
str |
The URL of the object in the store. |
Source code in zenml/integrations/great_expectations/ge_store_backend.py
def get_url_for_key(
    self, key: Tuple[str, ...], protocol: Optional[str] = None
) -> str:
    """Get the URL of an object in the store.

    Args:
        key: object key identifier.
        protocol: optional protocol to use instead of the store protocol.
            Only `//` is appended to it, so it is expected to carry its own
            trailing colon (the local default is `file:`).

    Returns:
        The URL of the object in the store.
    """
    filepath = self._build_object_path(key)

    # Local objects default to the file protocol when none was requested.
    if not protocol and not io_utils.is_remote(filepath):
        protocol = "file:"

    if not protocol:
        return filepath
    # Only the first occurrence of the store protocol is replaced.
    return filepath.replace(self.proto, f"{protocol}//", 1)
list_keys(self, prefix=())
List the keys of all objects identified by a partial key.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
prefix |
Tuple[str, ...] |
partial object key identifier. |
() |
Returns:
Type | Description |
---|---|
List[Tuple[str, ...]] |
List of keys identifying all objects present in the store that match the input partial key. |
Source code in zenml/integrations/great_expectations/ge_store_backend.py
def list_keys(self, prefix: Tuple[str, ...] = ()) -> List[Tuple[str, ...]]:
    """List the keys of all objects identified by a partial key.

    Args:
        prefix: partial object key identifier.

    Returns:
        List of keys identifying all objects present in the store that
        match the input partial key.
    """
    found_keys = []
    walk_root = self._build_object_path(prefix, is_prefix=True)
    store_root = self._build_object_path(tuple(), is_prefix=True)

    for current_dir, _subdirs, file_names in fileio.walk(walk_root):
        for file_name in file_names:
            # Keys are derived from the path relative to the store root,
            # not relative to the listed sub-path.
            absolute_path = os.path.join(str(current_dir), str(file_name))
            filepath = os.path.relpath(absolute_path, store_root)

            # Skip files that do not carry the configured prefix/suffix.
            if self.filepath_prefix and not filepath.startswith(
                self.filepath_prefix
            ):
                continue
            if self.filepath_suffix and not filepath.endswith(
                self.filepath_suffix
            ):
                continue

            key = self._convert_filepath_to_key(filepath)
            if key and not self.is_ignored_key(key):
                found_keys.append(key)
    return found_keys
remove_key(self, key)
Delete an object from the store.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
key |
Tuple[str, ...] |
object key identifier. |
required |
Returns:
Type | Description |
---|---|
bool |
True if the object existed in the store and was removed, otherwise False. |
Source code in zenml/integrations/great_expectations/ge_store_backend.py
def remove_key(self, key: Tuple[str, ...]) -> bool:
    """Delete an object from the store.

    Args:
        key: object key identifier.

    Returns:
        True if the object existed in the store and was removed, otherwise
        False.
    """
    filepath: str = self._build_object_path(key)
    if not fileio.exists(filepath):
        return False

    fileio.remove(filepath)
    # For local paths, also clean up directories left empty by the removal,
    # walking up from the object's parent towards the store root.
    if not io_utils.is_remote(filepath):
        parent_dir = str(Path(filepath).parent)
        self.rrmdir(self.root_path, str(parent_dir))
    return True
rrmdir(start_path, end_path)
staticmethod
Recursively removes empty dirs between start_path and end_path inclusive.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
start_path |
str |
Directory to use as a starting point. |
required |
end_path |
str |
Directory to use as a destination point. |
required |
Source code in zenml/integrations/great_expectations/ge_store_backend.py
@staticmethod
def rrmdir(start_path: str, end_path: str) -> None:
    """Remove empty directories from end_path up towards start_path.

    Starting at `end_path`, empty directories are removed one level at a
    time until a non-empty directory or `start_path` is reached.
    `start_path` itself is never removed, and nothing is removed when
    `end_path` is not located below `start_path`.

    Args:
        start_path: Directory to use as a starting point (upper bound).
        end_path: Directory to use as a destination point (deepest
            directory, removed first).
    """
    # Compare normalized paths so that differences in spelling (trailing
    # separators, duplicate separators, "." segments) cannot make the walk
    # miss the upper bound and delete directories above it.
    boundary = os.path.normpath(start_path)
    current = os.path.normpath(end_path)
    inside_prefix = boundary.rstrip(os.sep) + os.sep
    while (
        current != boundary
        and current.startswith(inside_prefix)
        and not os.listdir(current)
    ):
        os.rmdir(current)
        current = os.path.dirname(current)
materializers
special
Materializers for Great Expectation serializable objects.
ge_materializer
Implementation of the Great Expectations materializers.
GreatExpectationsMaterializer (BaseMaterializer)
Materializer to read/write Great Expectation objects.
Source code in zenml/integrations/great_expectations/materializers/ge_materializer.py
class GreatExpectationsMaterializer(BaseMaterializer):
    """Materializer to read/write Great Expectation objects."""

    ASSOCIATED_TYPES: ClassVar[Tuple[Type[Any], ...]] = (
        ExpectationSuite,
        CheckpointResult,
    )
    ASSOCIATED_ARTIFACT_TYPE: ClassVar[
        ArtifactType
    ] = ArtifactType.DATA_ANALYSIS

    @staticmethod
    def preprocess_checkpoint_result_dict(
        artifact_dict: Dict[str, Any],
    ) -> None:
        """Pre-processes a GE checkpoint dict before it is used to de-serialize a GE CheckpointResult object.

        The GE CheckpointResult object is not fully de-serializable
        due to some missing code in the GE codebase. We need to compensate
        for this by manually converting some of the attributes to
        their correct data types. The dict is modified in place.

        Args:
            artifact_dict: A dict containing the GE checkpoint result.
        """

        def preprocess_run_result(key: str, value: Any) -> Any:
            # Only the validation result needs to be rebuilt as an object.
            if key == "validation_result":
                return ExpectationSuiteValidationResult(**value)
            return value

        artifact_dict["checkpoint_config"] = CheckpointConfig(
            **artifact_dict["checkpoint_config"]
        )
        validation_dict = {}
        for result_ident, results in artifact_dict["run_results"].items():
            # The serialized identifier has the form `<type>::<a>/<b>/...`;
            # the part after `::` is turned back into an identifier object.
            validation_ident = (
                ValidationResultIdentifier.from_fixed_length_tuple(
                    result_ident.split("::")[1].split("/")
                )
            )
            validation_results = {
                result_name: preprocess_run_result(result_name, result)
                for result_name, result in results.items()
            }
            validation_dict[validation_ident] = validation_results
        artifact_dict["run_results"] = validation_dict

    def load(self, data_type: Type[Any]) -> SerializableDictDot:
        """Reads and returns a Great Expectations object.

        Args:
            data_type: The type of the data to read. The type recorded in
                the artifact file takes precedence over this argument.

        Returns:
            A loaded Great Expectations object.
        """
        filepath = os.path.join(self.uri, ARTIFACT_FILENAME)
        artifact_dict = yaml_utils.read_json(filepath)
        data_type = source_utils.load(artifact_dict.pop("data_type"))

        if data_type is CheckpointResult:
            self.preprocess_checkpoint_result_dict(artifact_dict)

        return data_type(**artifact_dict)

    def save(self, obj: SerializableDictDot) -> None:
        """Writes a Great Expectations object.

        The object's JSON dict is stored together with a `data_type` entry
        holding the dotted path of the object's class.

        Args:
            obj: A Great Expectations object.
        """
        filepath = os.path.join(self.uri, ARTIFACT_FILENAME)
        artifact_dict = obj.to_json_dict()
        artifact_type = type(obj)
        artifact_dict[
            "data_type"
        ] = f"{artifact_type.__module__}.{artifact_type.__name__}"
        yaml_utils.write_json(filepath, artifact_dict)

    def save_visualizations(
        self, data: Union[ExpectationSuite, CheckpointResult]
    ) -> Dict[str, VisualizationType]:
        """Saves visualizations for the given Great Expectations object.

        Args:
            data: The Great Expectations object to save visualizations for.

        Returns:
            A dictionary of visualization URIs and their types.
        """
        visualizations = {}

        if isinstance(data, CheckpointResult):
            result = cast(CheckpointResult, data)
            # Only the first run result is used to look up the docs sites.
            identifier = next(iter(result.run_results.keys()))
        else:
            suite = cast(ExpectationSuite, data)
            identifier = ExpectationSuiteIdentifier(
                suite.expectation_suite_name
            )

        context = GreatExpectationsDataValidator.get_data_context()
        sites = context.get_docs_sites_urls(identifier)
        for site in sites:
            url = site["site_url"]
            visualizations[url] = VisualizationType.HTML

        return visualizations

    def extract_metadata(
        self, data: Union[ExpectationSuite, CheckpointResult]
    ) -> Dict[str, "MetadataType"]:
        """Extract metadata from the given Great Expectations object.

        Args:
            data: The Great Expectations object to extract metadata from.

        Returns:
            The extracted metadata as a dictionary.
        """
        if isinstance(data, CheckpointResult):
            return {
                "checkpoint_result_name": data.name,
                "checkpoint_result_passed": data.success,
            }
        elif isinstance(data, ExpectationSuite):
            # Read the same attribute that `save_visualizations` uses for
            # the suite name, so both methods agree.
            return {
                "expectation_suite_name": data.expectation_suite_name,
            }
        return {}
extract_metadata(self, data)
Extract metadata from the given Great Expectations object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
Union[great_expectations.core.expectation_suite.ExpectationSuite, great_expectations.checkpoint.types.checkpoint_result.CheckpointResult] |
The Great Expectations object to extract metadata from. |
required |
Returns:
Type | Description |
---|---|
Dict[str, MetadataType] |
The extracted metadata as a dictionary. |
Source code in zenml/integrations/great_expectations/materializers/ge_materializer.py
def extract_metadata(
    self, data: Union[ExpectationSuite, CheckpointResult]
) -> Dict[str, "MetadataType"]:
    """Extract metadata from the given Great Expectations object.

    Args:
        data: The Great Expectations object to extract metadata from.

    Returns:
        The extracted metadata as a dictionary. Checkpoint results yield
        their name and success flag, expectation suites yield their name,
        and any other object yields an empty dictionary.
    """
    if isinstance(data, CheckpointResult):
        return {
            "checkpoint_result_name": data.name,
            "checkpoint_result_passed": data.success,
        }
    if isinstance(data, ExpectationSuite):
        # Use `expectation_suite_name`, the attribute the materializer's
        # `save_visualizations` method also reads for the suite name.
        return {
            "expectation_suite_name": data.expectation_suite_name,
        }
    return {}
load(self, data_type)
Reads and returns a Great Expectations object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data_type |
Type[Any] |
The type of the data to read. |
required |
Returns:
Type | Description |
---|---|
SerializableDictDot |
A loaded Great Expectations object. |
Source code in zenml/integrations/great_expectations/materializers/ge_materializer.py
def load(self, data_type: Type[Any]) -> SerializableDictDot:
    """Reads and returns a Great Expectations object.

    Args:
        data_type: The type of the data to read. The type recorded in the
            artifact file takes precedence over this argument.

    Returns:
        A loaded Great Expectations object.
    """
    artifact_path = os.path.join(self.uri, ARTIFACT_FILENAME)
    artifact_dict = yaml_utils.read_json(artifact_path)

    # The class to instantiate is resolved from the stored source path.
    stored_source = artifact_dict.pop("data_type")
    data_type = source_utils.load(stored_source)

    if data_type is CheckpointResult:
        self.preprocess_checkpoint_result_dict(artifact_dict)
    return data_type(**artifact_dict)
preprocess_checkpoint_result_dict(artifact_dict)
staticmethod
Pre-processes a GE checkpoint dict before it is used to de-serialize a GE CheckpointResult object.
The GE CheckpointResult object is not fully de-serializable due to some missing code in the GE codebase. We need to compensate for this by manually converting some of the attributes to their correct data types.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
artifact_dict |
Dict[str, Any] |
A dict containing the GE checkpoint result. |
required |
Source code in zenml/integrations/great_expectations/materializers/ge_materializer.py
@staticmethod
def preprocess_checkpoint_result_dict(
    artifact_dict: Dict[str, Any],
) -> None:
    """Pre-processes a GE checkpoint dict before it is used to de-serialize a GE CheckpointResult object.

    The GE CheckpointResult object is not fully de-serializable
    due to some missing code in the GE codebase. We need to compensate
    for this by manually converting some of the attributes to
    their correct data types. The dict is modified in place.

    Args:
        artifact_dict: A dict containing the GE checkpoint result.
    """
    artifact_dict["checkpoint_config"] = CheckpointConfig(
        **artifact_dict["checkpoint_config"]
    )

    converted_run_results = {}
    for serialized_ident, run_result in artifact_dict["run_results"].items():
        # The serialized identifier has the form `<type>::<a>/<b>/...`;
        # the part after `::` is turned back into an identifier object.
        ident_parts = serialized_ident.split("::")[1].split("/")
        validation_ident = ValidationResultIdentifier.from_fixed_length_tuple(
            ident_parts
        )
        # Only the validation result needs to be rebuilt as an object;
        # every other entry is kept as loaded.
        converted_run_results[validation_ident] = {
            entry_name: (
                ExpectationSuiteValidationResult(**entry)
                if entry_name == "validation_result"
                else entry
            )
            for entry_name, entry in run_result.items()
        }
    artifact_dict["run_results"] = converted_run_results
save(self, obj)
Writes a Great Expectations object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
obj |
SerializableDictDot |
A Great Expectations object. |
required |
Source code in zenml/integrations/great_expectations/materializers/ge_materializer.py
def save(self, obj: SerializableDictDot) -> None:
    """Writes a Great Expectations object.

    The object's JSON dict is stored together with a `data_type` entry
    holding the dotted path of the object's class.

    Args:
        obj: A Great Expectations object.
    """
    filepath = os.path.join(self.uri, ARTIFACT_FILENAME)
    artifact_dict = obj.to_json_dict()

    obj_cls = type(obj)
    type_path = f"{obj_cls.__module__}.{obj_cls.__name__}"
    artifact_dict["data_type"] = type_path

    yaml_utils.write_json(filepath, artifact_dict)
save_visualizations(self, data)
Saves visualizations for the given Great Expectations object.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
Union[great_expectations.core.expectation_suite.ExpectationSuite, great_expectations.checkpoint.types.checkpoint_result.CheckpointResult] |
The Great Expectations object to save visualizations for. |
required |
Returns:
Type | Description |
---|---|
Dict[str, zenml.enums.VisualizationType] |
A dictionary of visualization URIs and their types. |
Source code in zenml/integrations/great_expectations/materializers/ge_materializer.py
def save_visualizations(
    self, data: Union[ExpectationSuite, CheckpointResult]
) -> Dict[str, VisualizationType]:
    """Collect the data docs site URLs for a Great Expectations object.

    Args:
        data: The Great Expectations object to save visualizations for.

    Returns:
        A dictionary of visualization URIs and their types.
    """
    if not isinstance(data, CheckpointResult):
        # Expectation suites are identified by their suite name.
        expectation_suite = cast(ExpectationSuite, data)
        identifier = ExpectationSuiteIdentifier(
            expectation_suite.expectation_suite_name
        )
    else:
        # Only the first key of the run results is used as the identifier.
        checkpoint_result = cast(CheckpointResult, data)
        identifier = next(iter(checkpoint_result.run_results.keys()))

    data_context = GreatExpectationsDataValidator.get_data_context()
    # Every docs site URL reported for the identifier is an HTML page.
    return {
        site["site_url"]: VisualizationType.HTML
        for site in data_context.get_docs_sites_urls(identifier)
    }
steps
special
Great Expectations data profiling and validation standard steps.
ge_profiler
Great Expectations data profiling standard step.
ge_validator
Great Expectations data validation standard step.
utils
Great Expectations integration utility functions.
create_batch_request(context, dataset, data_asset_name)
Create a temporary runtime GE batch request from a dataset step artifact.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
context |
BaseDataContext |
Great Expectations data context. |
required |
dataset |
DataFrame |
Input dataset. |
required |
data_asset_name |
Optional[str] |
Optional custom name for the data asset. |
required |
Returns:
Type | Description |
---|---|
RuntimeBatchRequest |
A Great Expectations runtime batch request. |
Source code in zenml/integrations/great_expectations/utils.py
def create_batch_request(
    context: BaseDataContext,
    dataset: pd.DataFrame,
    data_asset_name: Optional[str],
) -> RuntimeBatchRequest:
    """Build a runtime GE batch request for an in-memory dataset.

    A pandas-backed datasource with a single runtime data connector is
    added to the given data context, and the returned batch request refers
    to that datasource and carries the dataset as its batch data.

    Args:
        context: Great Expectations data context.
        dataset: Input dataset.
        data_asset_name: Optional custom name for the data asset.

    Returns:
        A Great Expectations runtime batch request.
    """
    try:
        # Derive the names from the pipeline, run and step being executed.
        active_step = get_step_context()
        pipeline_name = active_step.pipeline.name
        run_name = active_step.pipeline_run.name
        step_name = active_step.step_run.name
    except RuntimeError:
        # Not running inside a pipeline step: fall back to random names.
        # NOTE(review): the run name reuses the "pipeline_" prefix, which
        # looks like a copy-paste - confirm whether that is intended.
        pipeline_name = f"pipeline_{random_str(5)}"
        run_name = f"pipeline_{random_str(5)}"
        step_name = f"step_{random_str(5)}"

    batch_identifier = "default"
    datasource_name = f"{run_name}_{step_name}"
    # The only data connector shares its name with the datasource.
    data_connector_name = datasource_name
    if not data_asset_name:
        data_asset_name = f"{pipeline_name}_{step_name}"

    context.add_datasource(
        name=datasource_name,
        class_name="Datasource",
        module_name="great_expectations.datasource",
        execution_engine={
            "module_name": "great_expectations.execution_engine",
            "class_name": "PandasExecutionEngine",
        },
        data_connectors={
            data_connector_name: {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": [batch_identifier],
            },
        },
    )

    return RuntimeBatchRequest(
        datasource_name=datasource_name,
        data_connector_name=data_connector_name,
        data_asset_name=data_asset_name,
        runtime_parameters={"batch_data": dataset},
        batch_identifiers={batch_identifier: batch_identifier},
    )