Lineage Graph
zenml.lineage_graph
special
Initialization of lineage generation module.
edge
Class for Edges in a lineage graph.
Edge (BaseModel)
pydantic-model
A class that represents an edge in a lineage graph.
Source code in zenml/lineage_graph/edge.py
class Edge(BaseModel):
    """A class that represents an edge in a lineage graph.

    An edge connects two nodes of the graph, referenced by their node ids.
    """

    id: str  # identifier of the edge itself
    source: str  # id of the node the edge starts at
    target: str  # id of the node the edge points to
lineage_graph
Class for lineage graph generation.
LineageGraph (BaseModel)
pydantic-model
A lineage graph representation of a PipelineRunResponseModel.
Source code in zenml/lineage_graph/lineage_graph.py
class LineageGraph(BaseModel):
    """A lineage graph representation of a PipelineRunResponseModel.

    The graph is made of step nodes and artifact nodes that are connected by
    edges. Node ids are the string form of the underlying step / artifact id
    prefixed with `STEP_PREFIX` / `ARTIFACT_PREFIX`.
    """

    nodes: List[Union[StepNode, ArtifactNode]] = []
    edges: List[Edge] = []
    # Node id of the first step that was added to the graph.
    root_step_id: Optional[str] = None
    run_metadata: List[Tuple[str, str, str]] = []  # (key, value, type)

    def generate_run_nodes_and_edges(
        self, run: "PipelineRunResponseModel"
    ) -> None:
        """Initializes a lineage graph from a pipeline run.

        Args:
            run: The PipelineRunResponseModel to generate the lineage graph
                for.
        """
        self.run_metadata = [
            (m.key, str(m.value), str(m.type)) for m in run.metadata.values()
        ]

        for step in run.steps.values():
            self.generate_step_nodes_and_edges(step)

        # These two passes inspect the nodes and edges created above, so they
        # have to run after all steps were processed.
        self.add_external_artifacts(run)
        self.add_direct_edges(run)

    def generate_step_nodes_and_edges(
        self, step: "StepRunResponseModel"
    ) -> None:
        """Generates the nodes and edges for a step and its artifacts.

        Args:
            step: The step to generate the nodes and edges for.
        """
        step_id = STEP_PREFIX + str(step.id)

        # Set a root step if it doesn't exist yet
        if self.root_step_id is None:
            self.root_step_id = step_id

        # Add the step node
        self.add_step_node(step, step_id)

        # The status of every output artifact only depends on the status of
        # the step, so it is derived once instead of once per artifact.
        if step.status == ExecutionStatus.CACHED:
            artifact_status = ArtifactNodeStatus.CACHED
        elif step.status == ExecutionStatus.COMPLETED:
            artifact_status = ArtifactNodeStatus.CREATED
        else:
            artifact_status = ArtifactNodeStatus.UNKNOWN

        # Add nodes and edges for all output artifacts
        for artifact_name, artifact in step.outputs.items():
            artifact_id = ARTIFACT_PREFIX + str(artifact.id)
            self.add_artifact_node(
                artifact=artifact,
                id=artifact_id,
                name=artifact_name,
                step_id=step_id,
                status=artifact_status,
            )
            self.add_edge(step_id, artifact_id)

        # Add edges for all input artifacts. Their nodes are created either
        # as the output of another step or by `add_external_artifacts`.
        for artifact in step.inputs.values():
            artifact_id = ARTIFACT_PREFIX + str(artifact.id)
            self.add_edge(artifact_id, step_id)

    def add_external_artifacts(self, run: "PipelineRunResponseModel") -> None:
        """Adds all external artifacts to the lineage graph.

        An input artifact is treated as external if no node with its id
        exists in the graph yet.

        Args:
            run: The pipeline run to add external artifacts for.
        """
        known_node_ids = {node.id for node in self.nodes}
        for step in run.steps.values():
            for artifact_name, artifact in step.inputs.items():
                artifact_id = ARTIFACT_PREFIX + str(artifact.id)
                if artifact_id in known_node_ids:
                    continue
                self.add_artifact_node(
                    artifact=artifact,
                    id=artifact_id,
                    name=artifact_name,
                    step_id=str(artifact.producer_step_run_id),
                    status=ArtifactNodeStatus.EXTERNAL,
                )
                # Remember the new node so that an external artifact which is
                # consumed by several steps is only added to the graph once.
                known_node_ids.add(artifact_id)

    def add_direct_edges(self, run: "PipelineRunResponseModel") -> None:
        """Add all direct edges between nodes generated by `after=...`.

        Args:
            run: The pipeline run to add direct edges for.
        """
        for step in run.steps.values():
            step_id = STEP_PREFIX + str(step.id)
            for parent_step_id_uuid in step.parent_step_ids:
                parent_step_id = STEP_PREFIX + str(parent_step_id_uuid)
                # Parents that are already connected through an artifact do
                # not get an additional direct edge.
                if not self.has_artifact_link(step_id, parent_step_id):
                    self.add_edge(parent_step_id, step_id)

    def has_artifact_link(self, step_id: str, parent_step_id: str) -> bool:
        """Checks if a step has an artifact link to a parent step.

        This is the case for all parent steps that were not specified via
        `after=...`.

        Args:
            step_id: The node ID of the step to check.
            parent_step_id: The node ID of the parent step to check.

        Returns:
            True if the steps are linked via an artifact, False otherwise.
        """
        parent_outputs, child_inputs = set(), set()
        for edge in self.edges:
            if edge.source == parent_step_id:
                parent_outputs.add(edge.target)
            if edge.target == step_id:
                child_inputs.add(edge.source)
        # Linked if some node is both a target of the parent and a source of
        # the child.
        return bool(parent_outputs.intersection(child_inputs))

    def add_step_node(
        self,
        step: "StepRunResponseModel",
        id: str,
    ) -> None:
        """Adds a step node to the lineage graph.

        Args:
            step: The step to add a node for.
            id: The id of the step node.
        """
        step_config = step.config.dict()
        if step_config:
            # Inputs, outputs and parameters are reported in dedicated fields
            # of the node details; empty config values are dropped.
            step_config = {
                key: value
                for key, value in step_config.items()
                if key not in ("inputs", "outputs", "parameters") and value
            }
        self.nodes.append(
            StepNode(
                id=id,
                data=StepNodeDetails(
                    execution_id=str(step.id),
                    name=step.name,  # redundant for consistency
                    status=step.status,
                    entrypoint_name=step.config.name,  # redundant for consistency
                    parameters=step.config.parameters,
                    configuration=step_config,
                    inputs={k: v.uri for k, v in step.inputs.items()},
                    outputs={k: v.uri for k, v in step.outputs.items()},
                    metadata=[
                        (m.key, str(m.value), str(m.type))
                        for m in step.metadata.values()
                    ],
                ),
            )
        )

    def add_artifact_node(
        self,
        artifact: "ArtifactResponseModel",
        id: str,
        name: str,
        step_id: str,
        status: ArtifactNodeStatus,
    ) -> None:
        """Adds an artifact node to the lineage graph.

        Args:
            artifact: The artifact to add a node for.
            id: The id of the artifact node.
            name: The input or output name of the artifact.
            step_id: The id stored as `parent_step_id` of the artifact node.
            status: The status to record on the artifact node. The node is
                flagged as cached if this is `ArtifactNodeStatus.CACHED`.
        """
        node = ArtifactNode(
            id=id,
            data=ArtifactNodeDetails(
                execution_id=str(artifact.id),
                name=name,
                status=status,
                is_cached=status == ArtifactNodeStatus.CACHED,
                artifact_type=artifact.type,
                artifact_data_type=artifact.data_type.import_path,
                parent_step_id=step_id,
                producer_step_id=str(artifact.producer_step_run_id),
                uri=artifact.uri,
                metadata=[
                    (m.key, str(m.value), str(m.type))
                    for m in artifact.metadata.values()
                ],
            ),
        )
        self.nodes.append(node)

    def add_edge(self, source: str, target: str) -> None:
        """Adds an edge to the lineage graph.

        The edge id is built as `<source>_<target>`.

        Args:
            source: The source node id.
            target: The target node id.
        """
        self.edges.append(
            Edge(id=source + "_" + target, source=source, target=target)
        )
add_artifact_node(self, artifact, id, name, step_id, status)
Adds an artifact node to the lineage graph.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
artifact |
ArtifactResponseModel |
The artifact to add a node for. |
required |
id |
str |
The id of the artifact node. |
required |
name |
str |
The input or output name of the artifact. |
required |
step_id |
str |
The id of the step that produced the artifact. |
required |
status |
ArtifactNodeStatus |
The status of the step that produced the artifact. |
required |
Source code in zenml/lineage_graph/lineage_graph.py
def add_artifact_node(
    self,
    artifact: "ArtifactResponseModel",
    id: str,
    name: str,
    step_id: str,
    status: ArtifactNodeStatus,
) -> None:
    """Appends a new artifact node to the nodes of the lineage graph.

    Args:
        artifact: The artifact to add a node for.
        id: The id of the artifact node.
        name: The input or output name of the artifact.
        step_id: The id stored as `parent_step_id` of the artifact node.
        status: The status to record on the artifact node. The node is
            flagged as cached if this is `ArtifactNodeStatus.CACHED`.
    """
    # Metadata entries are flattened into (key, value, type) string tuples.
    metadata_rows = [
        (entry.key, str(entry.value), str(entry.type))
        for entry in artifact.metadata.values()
    ]
    details = ArtifactNodeDetails(
        execution_id=str(artifact.id),
        name=name,
        status=status,
        is_cached=status == ArtifactNodeStatus.CACHED,
        artifact_type=artifact.type,
        artifact_data_type=artifact.data_type.import_path,
        parent_step_id=step_id,
        producer_step_id=str(artifact.producer_step_run_id),
        uri=artifact.uri,
        metadata=metadata_rows,
    )
    self.nodes.append(ArtifactNode(id=id, data=details))
add_direct_edges(self, run)
Add all direct edges between nodes generated by `after=...`.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
run |
PipelineRunResponseModel |
The pipeline run to add direct edges for. |
required |
Source code in zenml/lineage_graph/lineage_graph.py
def add_direct_edges(self, run: "PipelineRunResponseModel") -> None:
    """Connects steps to parents they share no artifact with (`after=...`).

    Args:
        run: The pipeline run to add direct edges for.
    """
    for child_step in run.steps.values():
        child_node_id = STEP_PREFIX + str(child_step.id)
        for parent_uuid in child_step.parent_step_ids:
            parent_node_id = STEP_PREFIX + str(parent_uuid)
            # Parents that are already linked via an artifact need no
            # additional direct edge.
            if self.has_artifact_link(child_node_id, parent_node_id):
                continue
            self.add_edge(parent_node_id, child_node_id)
add_edge(self, source, target)
Adds an edge to the lineage graph.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
source |
str |
The source node id. |
required |
target |
str |
The target node id. |
required |
Source code in zenml/lineage_graph/lineage_graph.py
def add_edge(self, source: str, target: str) -> None:
    """Appends a new edge from `source` to `target` to the lineage graph.

    Args:
        source: The source node id.
        target: The target node id.
    """
    # The edge id has the form "<source>_<target>".
    edge_id = "_".join((source, target))
    new_edge = Edge(id=edge_id, source=source, target=target)
    self.edges.append(new_edge)
add_external_artifacts(self, run)
Adds all external artifacts to the lineage graph.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
run |
PipelineRunResponseModel |
The pipeline run to add external artifacts for. |
required |
Source code in zenml/lineage_graph/lineage_graph.py
def add_external_artifacts(self, run: "PipelineRunResponseModel") -> None:
    """Adds all external artifacts to the lineage graph.

    An input artifact is treated as external if no node with its id exists
    in the graph yet.

    Args:
        run: The pipeline run to add external artifacts for.
    """
    known_node_ids = {node.id for node in self.nodes}
    for step in run.steps.values():
        for artifact_name, artifact in step.inputs.items():
            artifact_id = ARTIFACT_PREFIX + str(artifact.id)
            if artifact_id in known_node_ids:
                continue
            self.add_artifact_node(
                artifact=artifact,
                id=artifact_id,
                name=artifact_name,
                step_id=str(artifact.producer_step_run_id),
                status=ArtifactNodeStatus.EXTERNAL,
            )
            # Remember the new node so that an external artifact which is
            # consumed by several steps is only added to the graph once.
            known_node_ids.add(artifact_id)
add_step_node(self, step, id)
Adds a step node to the lineage graph.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
step |
StepRunResponseModel |
The step to add a node for. |
required |
id |
str |
The id of the step node. |
required |
Source code in zenml/lineage_graph/lineage_graph.py
def add_step_node(
    self,
    step: "StepRunResponseModel",
    id: str,
) -> None:
    """Appends a new step node to the nodes of the lineage graph.

    Args:
        step: The step to add a node for.
        id: The id of the step node.
    """
    # Inputs, outputs and parameters are reported in dedicated fields of the
    # node details, so they are left out of the generic configuration.
    excluded_keys = ("inputs", "outputs", "parameters")

    raw_config = step.config.dict()
    configuration = raw_config
    if raw_config:
        configuration = {}
        for key, value in raw_config.items():
            # Skip dedicated keys as well as empty values.
            if key in excluded_keys or not value:
                continue
            configuration[key] = value

    input_uris = {
        input_name: artifact.uri
        for input_name, artifact in step.inputs.items()
    }
    output_uris = {
        output_name: artifact.uri
        for output_name, artifact in step.outputs.items()
    }
    metadata_rows = [
        (entry.key, str(entry.value), str(entry.type))
        for entry in step.metadata.values()
    ]

    details = StepNodeDetails(
        execution_id=str(step.id),
        name=step.name,  # redundant for consistency
        status=step.status,
        entrypoint_name=step.config.name,  # redundant for consistency
        parameters=step.config.parameters,
        configuration=configuration,
        inputs=input_uris,
        outputs=output_uris,
        metadata=metadata_rows,
    )
    self.nodes.append(StepNode(id=id, data=details))
generate_run_nodes_and_edges(self, run)
Initializes a lineage graph from a pipeline run.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
run |
PipelineRunResponseModel |
The PipelineRunResponseModel to generate the lineage graph for. |
required |
Source code in zenml/lineage_graph/lineage_graph.py
def generate_run_nodes_and_edges(
    self, run: "PipelineRunResponseModel"
) -> None:
    """Populates the lineage graph from a pipeline run.

    Args:
        run: The PipelineRunResponseModel to generate the lineage graph for.
    """
    # Run metadata is stored as (key, value, type) string tuples.
    metadata_rows = []
    for entry in run.metadata.values():
        metadata_rows.append((entry.key, str(entry.value), str(entry.type)))
    self.run_metadata = metadata_rows

    for run_step in run.steps.values():
        self.generate_step_nodes_and_edges(run_step)

    # Both passes below look at the nodes and edges created by the loop above.
    self.add_external_artifacts(run)
    self.add_direct_edges(run)
generate_step_nodes_and_edges(self, step)
Generates the nodes and edges for a step and its artifacts.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
step |
StepRunResponseModel |
The step to generate the nodes and edges for. |
required |
Source code in zenml/lineage_graph/lineage_graph.py
def generate_step_nodes_and_edges(
    self, step: "StepRunResponseModel"
) -> None:
    """Creates the node of a step plus the nodes/edges of its artifacts.

    Args:
        step: The step to generate the nodes and edges for.
    """
    step_node_id = STEP_PREFIX + str(step.id)

    # The first step that gets processed becomes the root step.
    if self.root_step_id is None:
        self.root_step_id = step_node_id

    self.add_step_node(step, step_node_id)

    # All outputs of a step share the status derived from the step status.
    if step.status == ExecutionStatus.CACHED:
        output_status = ArtifactNodeStatus.CACHED
    elif step.status == ExecutionStatus.COMPLETED:
        output_status = ArtifactNodeStatus.CREATED
    else:
        output_status = ArtifactNodeStatus.UNKNOWN

    # Outputs: one artifact node each, linked step -> artifact.
    for output_name, output_artifact in step.outputs.items():
        output_node_id = ARTIFACT_PREFIX + str(output_artifact.id)
        self.add_artifact_node(
            artifact=output_artifact,
            id=output_node_id,
            name=output_name,
            step_id=step_node_id,
            status=output_status,
        )
        self.add_edge(step_node_id, output_node_id)

    # Inputs: only an edge artifact -> step is added here, no node.
    for input_artifact in step.inputs.values():
        self.add_edge(ARTIFACT_PREFIX + str(input_artifact.id), step_node_id)
has_artifact_link(self, step_id, parent_step_id)
Checks if a step has an artifact link to a parent step.
This is the case for all parent steps that were not specified via `after=...`.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
step_id |
str |
The node ID of the step to check. |
required |
parent_step_id |
str |
The node ID of the parent step to check. |
required |
Returns:
Type | Description |
---|---|
bool |
True if the steps are linked via an artifact, False otherwise. |
Source code in zenml/lineage_graph/lineage_graph.py
def has_artifact_link(self, step_id: str, parent_step_id: str) -> bool:
    """Tells whether a step is connected to a parent step via an artifact.

    This is the case for all parent steps that were not specified via
    `after=...`.

    Args:
        step_id: The node ID of the step to check.
        parent_step_id: The node ID of the parent step to check.

    Returns:
        True if the steps are linked via an artifact, False otherwise.
    """
    # Nodes the parent step points to.
    produced_by_parent = {
        edge.target for edge in self.edges if edge.source == parent_step_id
    }
    # Nodes that point to the child step.
    consumed_by_child = {
        edge.source for edge in self.edges if edge.target == step_id
    }
    # Linked if at least one node appears on both sides.
    return not produced_by_parent.isdisjoint(consumed_by_child)
node
special
Initialization of lineage nodes.
artifact_node
Class for all lineage artifact nodes.
ArtifactNode (BaseNode)
pydantic-model
A class that represents an artifact node in a lineage graph.
Source code in zenml/lineage_graph/node/artifact_node.py
class ArtifactNode(BaseNode):
    """A class that represents an artifact node in a lineage graph."""

    type: str = "artifact"  # fixed node type marker for artifact nodes
    data: ArtifactNodeDetails  # artifact-specific details of the node
ArtifactNodeDetails (BaseNodeDetails)
pydantic-model
Captures all artifact details for the node.
Source code in zenml/lineage_graph/node/artifact_node.py
class ArtifactNodeDetails(BaseNodeDetails):
    """Captures all artifact details for the node.

    The notes on the fields describe how `LineageGraph.add_artifact_node`
    fills them.
    """

    status: ArtifactNodeStatus
    is_cached: bool  # True if `status` is `ArtifactNodeStatus.CACHED`
    artifact_type: str  # taken from the `type` of the artifact
    artifact_data_type: str  # import path of the artifact's data type
    parent_step_id: str  # the `step_id` passed to `add_artifact_node`
    producer_step_id: Optional[str]  # str() of `producer_step_run_id`
    uri: str  # taken from the `uri` of the artifact
    metadata: List[Tuple[str, str, str]]  # (key, value, type)
ArtifactNodeStatus (StrEnum)
Enum that represents the status of an artifact.
Source code in zenml/lineage_graph/node/artifact_node.py
class ArtifactNodeStatus(StrEnum):
    """Enum that represents the status of an artifact.

    The notes on the members describe when `LineageGraph` assigns them.
    """

    # Output artifact of a step whose status is `ExecutionStatus.CACHED`.
    CACHED = "cached"
    # Output artifact of a step whose status is `ExecutionStatus.COMPLETED`.
    CREATED = "created"
    # Input artifact that had no node in the graph after all steps of the
    # run were processed.
    EXTERNAL = "external"
    # Output artifact of a step with any other status.
    UNKNOWN = "unknown"
base_node
Base class for all lineage nodes.
BaseNode (BaseModel)
pydantic-model
A class that represents a node in a lineage graph.
Source code in zenml/lineage_graph/node/base_node.py
class BaseNode(BaseModel):
    """A class that represents a node in a lineage graph."""

    id: str  # id of the node; edges refer to it as source / target
    type: str  # kind of the node; subclasses set "step" or "artifact"
    data: BaseNodeDetails  # details of the node; narrowed by subclasses
BaseNodeDetails (BaseModel)
pydantic-model
Captures all details for the node.
Source code in zenml/lineage_graph/node/base_node.py
class BaseNodeDetails(BaseModel):
    """Captures all details for the node."""

    # String form of the id of the step / artifact the node was created for.
    execution_id: str
    # Name of the step, or the input / output name of the artifact.
    name: str
step_node
Class for all lineage step nodes.
StepNode (BaseNode)
pydantic-model
A class that represents a step node in a lineage graph.
Source code in zenml/lineage_graph/node/step_node.py
class StepNode(BaseNode):
    """A class that represents a step node in a lineage graph."""

    type: str = "step"  # fixed node type marker for step nodes
    data: StepNodeDetails  # step-specific details of the node
StepNodeDetails (BaseNodeDetails)
pydantic-model
Captures all step details for the node.
Source code in zenml/lineage_graph/node/step_node.py
class StepNodeDetails(BaseNodeDetails):
    """Captures all step details for the node.

    The notes on the fields describe how `LineageGraph.add_step_node` fills
    them.
    """

    status: ExecutionStatus  # status of the step
    entrypoint_name: str  # taken from the `name` of the step config
    parameters: Dict[str, Any]  # taken from the step config
    # Step config without its "inputs", "outputs" and "parameters" keys and
    # without empty values.
    configuration: Dict[str, Any]
    inputs: Dict[str, Any]  # maps input names to artifact URIs
    outputs: Dict[str, Any]  # maps output names to artifact URIs
    metadata: List[Tuple[str, str, str]]  # (key, value, type)