Files
dlt/tests/workspace/helpers/dashboard/test_utils.py
rudolfix 06bc05848b (chore) adds hub extra (#3428)
* adds hub extra

* makes hub module more user friendly when hub not installed

* test and lint fixes

* adds plugin version check util function

* adds dlt-runtime to hub extra, minimal import tests

* bumps to dlthub 0.20.0 alpha

* lists pipelines with cli using the same functions as dashboard, dlt pipeline will list pipelines by default

* adds configured propfiles method on context so only profiles with configs or pipelines are listed

* adds list of locations that contained actual configs to provider interface

* improves workspace and profile commands

* test fixes

* fixes tests
2025-12-05 16:15:19 +01:00

942 lines
32 KiB
Python

from typing import cast, Set, List, Dict, Any
import os
import tempfile
from datetime import datetime
from pathlib import Path
import re
import marimo as mo
import pyarrow
import pytest
import dlt
from dlt.common import pendulum
from dlt._workspace.cli import utils as cli_utils
from dlt._workspace.helpers.dashboard.config import DashboardConfiguration
from dlt._workspace.helpers.dashboard.utils import (
get_dashboard_config_sections,
get_query_result_cached,
resolve_dashboard_config,
get_pipeline,
pipeline_details,
create_table_list,
get_row_counts_list,
create_column_list,
get_query_result,
get_row_counts,
get_loads,
trace_overview,
trace_execution_context,
trace_steps_overview,
style_cell,
_without_none_or_empty_string,
_align_dict_keys,
_humanize_datetime_values,
_dict_to_table_items,
build_exception_section,
get_local_data_path,
remote_state_details,
sanitize_trace_for_display,
trace_resolved_config_values,
trace_step_details,
get_source_and_resource_state_for_table,
get_default_query_for_table,
get_example_query_for_dataset,
_get_steps_data_and_status,
_get_migrations_count,
build_pipeline_execution_visualization,
_collect_load_packages_from_trace,
load_package_status_labels,
TPipelineRunStatus,
TVisualPipelineStep,
)
from dlt.pipeline.trace import TRACE_FILE_NAME
from tests.workspace.helpers.dashboard.example_pipelines import (
SUCCESS_PIPELINE_DUCKDB,
SUCCESS_PIPELINE_FILESYSTEM,
EXTRACT_EXCEPTION_PIPELINE,
NORMALIZE_EXCEPTION_PIPELINE,
NEVER_RAN_PIPELINE,
LOAD_EXCEPTION_PIPELINE,
NO_DESTINATION_PIPELINE,
create_success_pipeline_duckdb,
)
from tests.workspace.helpers.dashboard.example_pipelines import (
ALL_PIPELINES,
PIPELINES_WITH_EXCEPTIONS,
PIPELINES_WITH_LOAD,
)
from tests.workspace.utils import isolated_workspace
@pytest.fixture
def temp_pipelines_dir():
"""Create a temporary directory structure for testing pipelines"""
with tempfile.TemporaryDirectory() as temp_dir:
pipelines_dir = Path(temp_dir) / "pipelines"
pipelines_dir.mkdir()
# Create some test pipeline directories
(pipelines_dir / "success_pipeline_1").mkdir()
(pipelines_dir / "success_pipeline_2").mkdir()
(pipelines_dir / "_dlt_internal").mkdir()
# Create trace files with different timestamps
trace_file_1 = pipelines_dir / "success_pipeline_1" / TRACE_FILE_NAME
trace_file_1.touch()
# Set modification time to 2 days ago
os.utime(trace_file_1, (1000000, 1000000))
trace_file_2 = pipelines_dir / "success_pipeline_2" / TRACE_FILE_NAME
trace_file_2.touch()
# Set modification time to 1 day ago (more recent)
os.utime(trace_file_2, (2000000, 2000000))
yield str(pipelines_dir)
#
# cli utils tests
# TODO: move to test_cli_utils.py - pipeline fixtures should be unified for cli, dashboard and mcp tests
#
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_pipelines(pipeline: dlt.Pipeline):
"""Test getting local pipelines"""
pipelines_dir, pipelines = cli_utils.list_local_pipelines(pipeline.pipelines_dir)
assert pipelines_dir == pipeline.pipelines_dir
assert len(pipelines) == 1
assert pipelines[0]["name"] == pipeline.pipeline_name
def test_get_local_pipelines_with_temp_dir(temp_pipelines_dir):
"""Test getting local pipelines with temporary directory"""
pipelines_dir, pipelines = cli_utils.list_local_pipelines(temp_pipelines_dir)
assert pipelines_dir == temp_pipelines_dir
assert len(pipelines) == 3 # success_pipeline_1, success_pipeline_2, _dlt_internal
# Should be sorted by timestamp (descending)
pipeline_names = [p["name"] for p in pipelines]
assert "success_pipeline_2" in pipeline_names
assert "success_pipeline_1" in pipeline_names
assert "_dlt_internal" in pipeline_names
# Check timestamps are present
for pipeline in pipelines:
assert "timestamp" in pipeline
assert isinstance(pipeline["timestamp"], (int, float))
def test_get_local_pipelines_empty_dir():
"""Test getting local pipelines from empty directory"""
with tempfile.TemporaryDirectory() as temp_dir:
pipelines_dir, pipelines = cli_utils.list_local_pipelines(temp_dir)
assert pipelines_dir == temp_dir
assert pipelines == []
def test_get_local_pipelines_nonexistent_dir():
"""Test getting local pipelines from nonexistent directory"""
nonexistent_dir = "/nonexistent/directory"
pipelines_dir, pipelines = cli_utils.list_local_pipelines(nonexistent_dir)
assert pipelines_dir == nonexistent_dir
assert pipelines == []
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_pipeline_last_run(pipeline: dlt.Pipeline):
"""Test getting the last run of a pipeline"""
if pipeline.pipeline_name in [NEVER_RAN_PIPELINE, NO_DESTINATION_PIPELINE]:
assert (
cli_utils.get_pipeline_trace_mtime(pipeline.pipelines_dir, pipeline.pipeline_name) == 0
)
else:
assert (
cli_utils.get_pipeline_trace_mtime(pipeline.pipelines_dir, pipeline.pipeline_name)
> 1000000
)
def test_integration_get_local_pipelines_with_sorting(temp_pipelines_dir):
"""Test integration scenario with multiple pipelines sorted by timestamp"""
pipelines_dir, pipelines = cli_utils.list_local_pipelines(
temp_pipelines_dir, sort_by_trace=True
)
assert pipelines_dir == temp_pipelines_dir
assert len(pipelines) == 3
# Should be sorted by timestamp (descending - most recent first)
timestamps = [p["timestamp"] for p in pipelines]
assert timestamps == sorted(timestamps, reverse=True)
# Verify the most recent pipeline is first
most_recent = pipelines[0]
assert most_recent["name"] == "success_pipeline_2"
assert most_recent["timestamp"] == 2000000
#
# dashboard utils tests
#
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_build_exception_section(pipeline: dlt.Pipeline):
if pipeline.pipeline_name in PIPELINES_WITH_EXCEPTIONS:
assert "Show full stacktrace" in build_exception_section(pipeline)[0].text
else:
assert not build_exception_section(pipeline)
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_local_data_path(pipeline: dlt.Pipeline):
if pipeline.pipeline_name in [LOAD_EXCEPTION_PIPELINE, NO_DESTINATION_PIPELINE]:
# custom destination does not support local data path
assert get_local_data_path(pipeline) is None
else:
assert get_local_data_path(pipeline)
def test_get_dashboard_config_sections(success_pipeline_duckdb) -> None:
# NOTE: "dashboard" obligatory section comes from configuration __section__
assert get_dashboard_config_sections(success_pipeline_duckdb) == (
"pipelines",
"success_pipeline_duckdb",
)
assert get_dashboard_config_sections(None) == ()
# create workspace context
with isolated_workspace("configured_workspace"):
assert get_dashboard_config_sections(None) == ("workspace",)
def test_resolve_dashboard_config(success_pipeline_duckdb) -> None:
"""Test resolving dashboard config with a real pipeline"""
os.environ["PIPELINES__SUCCESS_PIPELINE_DUCKDB__DASHBOARD__DATETIME_FORMAT"] = "some format"
os.environ["DASHBOARD__DATETIME_FORMAT"] = "other format"
config = resolve_dashboard_config(success_pipeline_duckdb)
assert isinstance(config, DashboardConfiguration)
assert isinstance(config.datetime_format, str)
assert config.datetime_format == "some format"
other_pipeline = dlt.pipeline(pipeline_name="other_pipeline", destination="duckdb")
config = resolve_dashboard_config(other_pipeline)
assert config.datetime_format == "other format"
# create workspace context
with isolated_workspace("configured_workspace"):
os.environ["WORKSPACE__DASHBOARD__DATETIME_FORMAT"] = "workspace format"
config = resolve_dashboard_config(None)
assert config.datetime_format == "workspace format"
@pytest.mark.parametrize("pipeline", PIPELINES_WITH_LOAD, indirect=True)
def test_get_source_and_resource_state_for_table(pipeline: dlt.Pipeline):
"""Test getting source and resource state for a table"""
table = pipeline.default_schema.tables["purchases"]
resource_name, source_state, resource_state = get_source_and_resource_state_for_table(
table, pipeline, pipeline.default_schema_name
)
assert resource_name
assert source_state == {}
assert resource_state.get("incremental").get("id") is not None
# check it can be rendered with marimo
assert mo.json(resource_state).text
assert mo.json(source_state).text
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_pipeline(pipeline: dlt.Pipeline):
"""Test getting a real pipeline by name"""
pipeline = get_pipeline(pipeline.pipeline_name, pipeline.pipelines_dir)
assert pipeline.pipeline_name == pipeline.pipeline_name
assert pipeline.dataset_name == pipeline.dataset_name
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_pipeline_details(pipeline, temp_pipelines_dir):
"""Test getting pipeline details from a real pipeline"""
config = DashboardConfiguration()
result = pipeline_details(config, pipeline, temp_pipelines_dir)
assert isinstance(result, list)
if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
assert len(result) == 9
elif pipeline.pipeline_name in [LOAD_EXCEPTION_PIPELINE, NORMALIZE_EXCEPTION_PIPELINE]:
# custom destination does not support remote data info
assert len(result) == 8
else:
# no remote data info
assert len(result) == 7
# Convert to dict for easier testing
details_dict = {item["name"]: item["value"] for item in result}
assert details_dict["pipeline_name"] == pipeline.pipeline_name
if pipeline.pipeline_name == NO_DESTINATION_PIPELINE:
assert details_dict["destination"] == "No destination set"
elif pipeline.pipeline_name == SUCCESS_PIPELINE_FILESYSTEM:
assert details_dict["destination"] == "filesystem (dlt.destinations.filesystem)"
elif pipeline.pipeline_name == LOAD_EXCEPTION_PIPELINE:
assert details_dict["destination"] == "dummy (dlt.destinations.dummy)"
else:
assert details_dict["destination"] == "duckdb (dlt.destinations.duckdb)"
assert details_dict["dataset_name"] == pipeline.dataset_name
if pipeline.pipeline_name in PIPELINES_WITH_LOAD or pipeline.pipeline_name in [
LOAD_EXCEPTION_PIPELINE,
NORMALIZE_EXCEPTION_PIPELINE,
]:
assert details_dict["schemas"].startswith("fruitshop")
else:
assert "schemas" not in details_dict
assert details_dict["working_dir"].endswith(pipeline.pipeline_name)
@pytest.mark.parametrize("pipeline", PIPELINES_WITH_LOAD, indirect=True)
@pytest.mark.parametrize("show_internals", [True, False])
@pytest.mark.parametrize("show_child_tables", [True, False])
def test_create_table_list(pipeline, show_internals, show_child_tables):
"""Test creating a basic table list with real schema"""
config = DashboardConfiguration()
result = create_table_list(
config,
pipeline,
selected_schema_name=pipeline.default_schema_name,
show_internals=show_internals,
show_child_tables=show_child_tables,
)
# check it can be rendered as table with marimo
assert mo.ui.table(result).text is not None
base_table_names = {"inventory", "purchases", "customers", "inventory_categories"}
dlt_table_names = {"_dlt_loads", "_dlt_version", "_dlt_pipeline_state"}
child_table_names = {"purchases__child"}
expected_table_names = {*base_table_names}
if show_internals:
expected_table_names.update(dlt_table_names)
if show_child_tables:
expected_table_names.update(child_table_names)
table_names = {table["name"] for table in result}
assert set(table_names) == expected_table_names
@pytest.mark.parametrize("pipeline", PIPELINES_WITH_LOAD, indirect=True)
@pytest.mark.parametrize("show_internals", [True, False])
@pytest.mark.parametrize("show_type_hints", [True, False])
@pytest.mark.parametrize("show_other_hints", [True, False])
@pytest.mark.parametrize("show_custom_hints", [True, False])
def test_create_column_list_basic(
pipeline, show_internals, show_type_hints, show_other_hints, show_custom_hints
):
"""Test creating a basic column list with real schema"""
config = DashboardConfiguration()
# Should exclude _dlt columns by default, will also not show incomplete columns
result = create_column_list(
config,
pipeline,
selected_schema_name=pipeline.default_schema_name,
table_name="purchases",
show_internals=show_internals,
show_type_hints=show_type_hints,
show_other_hints=show_other_hints,
show_custom_hints=show_custom_hints,
)
# check it can be rendered as table with marimo
assert mo.ui.table(result).text is not None
# check visible columns
base_column_names = {"customer_id", "quantity", "id", "inventory_id", "date"}
dlt_column_names = {"_dlt_load_id", "_dlt_id"}
expected_column_names = {*base_column_names}
if show_internals:
expected_column_names.update(dlt_column_names)
column_names = {col["name"] for col in result}
assert column_names == expected_column_names
# Find the id column
id_column = next(col for col in result if col["name"] == "id")
# check type hints
if show_type_hints:
assert id_column["data_type"] == "bigint"
assert id_column["nullable"] is False
else:
assert "data_type" not in id_column
assert "nullable" not in id_column
if show_other_hints:
assert id_column["primary_key"] is True
else:
assert "primary_key" not in id_column
if show_custom_hints:
assert id_column["x-custom"] == "foo"
else:
assert "x-custom" not in id_column
@pytest.mark.parametrize("pipeline", PIPELINES_WITH_LOAD, indirect=True)
def test_get_query_result(pipeline: dlt.Pipeline):
"""Test getting query result from real pipeline"""
# Clear cache first
get_query_result_cached.cache_clear()
result, error_message, traceback_string = get_query_result(
pipeline, "SELECT COUNT(*) as count FROM purchases"
)
if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
assert isinstance(result, pyarrow.Table)
assert len(result) == 1
assert (
result[0][0].as_py() == 100
if pipeline.pipeline_name == SUCCESS_PIPELINE_DUCKDB
else 103
) # merge does not work on filesystem
else:
assert len(result) == 0
assert error_message
assert traceback_string
@pytest.mark.parametrize("pipeline", PIPELINES_WITH_LOAD, indirect=True)
def test_get_default_query_for_table(pipeline: dlt.Pipeline):
query, error_message, traceback_string = get_default_query_for_table(
pipeline, pipeline.default_schema_name, "purchases", True
)
assert query == 'SELECT\n *\nFROM "purchases"\nLIMIT 1000'
assert not error_message
assert not traceback_string
assert query
@pytest.mark.parametrize("pipeline", PIPELINES_WITH_LOAD, indirect=True)
def test_get_example_query_for_dataset(pipeline: dlt.Pipeline):
query, error_message, traceback_string = get_example_query_for_dataset(
pipeline, pipeline.default_schema_name
)
assert query == 'SELECT\n *\nFROM "customers"\nLIMIT 1000'
assert not error_message
assert not traceback_string
assert query
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_row_counts_list(pipeline: dlt.Pipeline):
"""Test getting row counts from real pipeline"""
result = get_row_counts_list(pipeline)
# check it can be rendered as table with marimo
assert mo.ui.table(result).text is not None
reverted_result = {i["name"]: i["row_count"] for i in result}
if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
assert reverted_result == {
"customers": 13,
"inventory": 6,
"purchases": (
100 if pipeline.pipeline_name == SUCCESS_PIPELINE_DUCKDB else 103
), # merge does not work on filesystem
"purchases__child": 3,
"inventory_categories": 3,
"_dlt_version": 3,
"_dlt_loads": 4,
"_dlt_pipeline_state": 3,
}
else:
reverted_result = {}
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_loads(pipeline: dlt.Pipeline):
"""Test getting loads from real pipeline"""
config = DashboardConfiguration()
# Clear cache first
result, error_message, traceback_string = get_loads(config, pipeline, limit=100)
# check it can be rendered as table with marimo
assert mo.ui.table(result).text is not None
if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
assert isinstance(result, list)
assert len(result) >= 1 # Should have at least one load
assert not error_message
assert not traceback_string
if result:
load = result[0]
assert "load_id" in load
else:
assert result == []
assert error_message
assert traceback_string
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_trace(pipeline: dlt.Pipeline):
"""Test trace overview with real trace data"""
config = DashboardConfiguration()
trace = pipeline.last_trace
if pipeline.pipeline_name in [NEVER_RAN_PIPELINE, NO_DESTINATION_PIPELINE]:
assert trace is None
return
# overview
result = trace_overview(config, trace)
# check it can be rendered as table with marimo
assert mo.ui.table(result).text is not None
assert {item["name"] for item in result} == {
"pipeline_name",
"started_at",
"finished_at",
"transaction_id",
"duration",
}
values_dict = {item["name"]: item["value"] for item in result}
assert values_dict["pipeline_name"] == pipeline.pipeline_name
# execution context
result = trace_execution_context(config, trace)
# check it can be rendered as table with marimo
assert mo.ui.table(result).text is not None
assert len(result) == 7
assert {item["name"] for item in result} == {
"cpu",
"os",
"library",
"run_context",
"python",
"ci_run",
"exec_info",
}
# TODO: inspect values
# steps overview
result = trace_steps_overview(config, trace)
# check it can be rendered as table with marimo
assert mo.ui.table(result).text is not None
if pipeline.pipeline_name == EXTRACT_EXCEPTION_PIPELINE:
assert len(result) == 1
assert result[0]["step"] == "extract"
elif pipeline.pipeline_name == NORMALIZE_EXCEPTION_PIPELINE:
assert len(result) == 2
assert result[0]["step"] == "extract"
assert result[1]["step"] == "normalize"
else:
assert len(result) == 3
assert result[0]["step"] == "extract"
assert result[1]["step"] == "normalize"
assert result[2]["step"] == "load"
# TODO: inspect values of trace steps overview
for item in result:
result = trace_step_details(config, trace, item["step"])
# TODO: inspect trace step details
# resolved config values (TODO: add at least one config value)
result = trace_resolved_config_values(config, trace)
# check it can be rendered as table with marimo
assert mo.ui.table(result).text is not None
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_remote_state_details(pipeline: dlt.Pipeline):
remote_state = remote_state_details(pipeline)
# check it can be rendered as table with marimo
assert mo.ui.table(remote_state).text is not None
if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
assert remote_state[0] == {"name": "state_version", "value": 3}
assert remote_state[1]["name"] == "schemas"
assert remote_state[1]["value"].startswith("fruitshop")
assert remote_state[2]["name"] == ""
assert remote_state[2]["value"].startswith("fruitshop_customers")
else:
assert remote_state[0] == {
"name": "Info",
"value": "Could not restore state from destination",
}
def test_style_cell():
"""Test style cell function"""
# Even row
result = style_cell("0", "test_col", "test_value")
assert result["background-color"] == "white"
# Odd row
result = style_cell("1", "test_col", "test_value")
assert result["background-color"] == "#f4f4f9"
# Name column (case insensitive)
result = style_cell("0", "name", "test_value")
assert result["font-weight"] == "bold"
result = style_cell("0", "NAME", "test_value")
assert result["font-weight"] == "bold"
def test_without_none_or_empty_string():
"""Test removing None and empty string values"""
input_dict = {
"key1": "value1",
"key2": None,
"key3": "",
"key4": "value4",
"key5": 0, # Should be kept
"key6": False, # Should be kept
"key7": [], # Should be kept
}
result = _without_none_or_empty_string(input_dict)
expected = {"key1": "value1", "key4": "value4", "key5": 0, "key6": False, "key7": []}
assert result == expected
def test_align_dict_keys():
"""Test aligning dictionary keys"""
items = [
{"key1": "value1", "key2": "value2"},
{"key1": "value3", "key3": "value4"},
{"key2": "value5", "key4": "value6"},
]
result = _align_dict_keys(items)
# All items should have all keys
expected_keys = {"key1", "key2", "key3", "key4"}
for item in result:
assert set(item.keys()) == expected_keys
# Missing keys should be filled with "-"
assert result[0]["key3"] == "-"
assert result[0]["key4"] == "-"
assert result[1]["key2"] == "-"
assert result[1]["key4"] == "-"
assert result[2]["key1"] == "-"
assert result[2]["key3"] == "-"
def test_align_dict_keys_with_none_values():
"""Test aligning dictionary keys with None values filtered out"""
items = [
{"key1": "value1", "key2": None, "key3": ""},
{"key1": None, "key2": "value2", "key4": "value4"},
]
result = _align_dict_keys(items)
# None and empty string values should be removed before alignment
assert "key3" not in result[0] # Was empty string
# Missing keys should be filled with "-"
assert result[0]["key4"] == "-"
assert result[0]["key2"] == "-"
assert result[1]["key1"] == "-"
def test_humanize_datetime_values():
"""Test humanizing datetime values"""
config = DashboardConfiguration()
config.datetime_format = "YYYY-MM-DD HH:mm:ss Z"
input_dict = {
"started_at": pendulum.parse("2023-01-01T12:00:00"),
"finished_at": pendulum.parse("2023-01-01T12:30:00"),
"created": 1672574400, # Unix timestamp
"last_modified": "1672574400.123", # String timestamp
"inserted_at": datetime(2023, 1, 1, 12, 0, 0),
"load_id": 1672574400, # Unix timestamp for 2023-01-01T12:00:00
"other_field": "unchanged",
"numeric_field": 42,
}
result = _humanize_datetime_values(config, input_dict)
# Should have duration calculated
assert "duration" in result
# Original non-datetime fields should be preserved
assert result["other_field"] == "unchanged"
assert result["numeric_field"] == 42
assert result["created"] == "2023-01-01 12:00:00 +00:00"
assert result["last_modified"] == "2023-01-01 12:00:00 +00:00"
assert result["inserted_at"] == "2023-01-01 12:00:00 +00:00"
assert result["load_package_created_at"] == "2023-01-01 12:00:00 +00:00"
def test_dict_to_table_items():
"""Test converting dict to table items"""
input_dict = {
"pipeline_name": "success_pipeline",
"destination": "duckdb",
"status": "completed",
}
result = _dict_to_table_items(input_dict)
expected = [
{"name": "pipeline_name", "value": "success_pipeline"},
{"name": "destination", "value": "duckdb"},
{"name": "status", "value": "completed"},
]
# Sort both by name for comparison since dict order may vary
result_sorted = sorted(result, key=lambda x: x["name"])
expected_sorted = sorted(expected, key=lambda x: x["name"])
assert result_sorted == expected_sorted
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_integration_pipeline_workflow(pipeline, temp_pipelines_dir):
"""Test integration scenario with complete pipeline workflow"""
# Test pipeline details
config = DashboardConfiguration()
details = pipeline_details(config, pipeline, temp_pipelines_dir)
# check it can be rendered as table with marimo
assert mo.ui.table(details).text is not None
details_dict = {item["name"]: item["value"] for item in details}
assert details_dict["pipeline_name"] == pipeline.pipeline_name
# Test row counts
row_counts = get_row_counts(pipeline)
if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
assert row_counts["customers"] == 13
else:
assert row_counts == {}
# Test query execution
query_result, error_message, traceback_string = get_query_result(
pipeline, "SELECT name FROM customers ORDER BY id"
)
if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
assert len(query_result) == 13
assert query_result[0][0].as_py() == "simon"
assert not error_message
assert not traceback_string
else:
assert len(query_result) == 0
assert error_message
assert traceback_string
# Test loads
config = DashboardConfiguration()
loads, error_message, traceback_string = get_loads(config, pipeline)
if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
assert len(loads) >= 1
assert not error_message
assert not traceback_string
else:
assert error_message
assert traceback_string
assert loads == []
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_sanitize_trace_for_display(pipeline: dlt.Pipeline):
"""Test sanitizing trace for display"""
trace = pipeline.last_trace
sanitized = sanitize_trace_for_display(trace)
assert sanitized is not None
assert isinstance(sanitized, dict)
# check it can be rendered with marimo
assert mo.json(sanitized).text is not None
@pytest.mark.parametrize(
"pipeline, expected_steps, expected_status",
[
(SUCCESS_PIPELINE_DUCKDB, {"extract", "normalize", "load"}, "succeeded"),
(SUCCESS_PIPELINE_FILESYSTEM, {"extract", "normalize", "load"}, "succeeded"),
(EXTRACT_EXCEPTION_PIPELINE, {"extract"}, "failed"),
(LOAD_EXCEPTION_PIPELINE, {"extract", "normalize", "load"}, "failed"),
],
indirect=["pipeline"],
)
def test_get_steps_data_and_status(
pipeline: dlt.Pipeline,
expected_steps: Set[TVisualPipelineStep],
expected_status: TPipelineRunStatus,
) -> None:
"""Test getting steps data and the pipeline execution status from trace"""
trace = pipeline.last_trace
steps_data, status = _get_steps_data_and_status(trace.steps)
assert len(steps_data) == len(expected_steps)
assert status == expected_status
assert all(step.duration_ms > 0 for step in steps_data)
if expected_status == "succeeded":
assert all(step.failed is False for step in steps_data)
else:
assert any(step.failed is True for step in steps_data)
assert set([step.step for step in steps_data]) == expected_steps
def test_get_migrations_count(temp_pipelines_dir) -> None:
"""Test getting migrations count from the pipeline's last load info"""
import duckdb
db_conn = duckdb.connect()
try:
pipeline = create_success_pipeline_duckdb(temp_pipelines_dir, db_conn=db_conn)
migrations_count = _get_migrations_count(pipeline.last_trace.last_load_info)
assert migrations_count == 1
# Trigger multiple migrations
pipeline.extract([{"id": 1, "name": "test"}], table_name="my_table")
pipeline.extract([{"id": 2, "name": "test2", "new_column": "value"}], table_name="my_table")
pipeline.extract(
[{"id": 3, "name": "test3", "new_column": "value", "another_column": 100}],
table_name="my_table",
)
pipeline.normalize()
pipeline.load()
migrations_count = _get_migrations_count(pipeline.last_trace.last_load_info)
assert migrations_count == 3
finally:
db_conn.close()
@pytest.mark.parametrize(
"pipeline, expected_steps, expected_status",
[
(SUCCESS_PIPELINE_DUCKDB, {"extract", "normalize", "load"}, "succeeded"),
(SUCCESS_PIPELINE_FILESYSTEM, {"extract", "normalize", "load"}, "succeeded"),
(EXTRACT_EXCEPTION_PIPELINE, {"extract"}, "failed"),
(LOAD_EXCEPTION_PIPELINE, {"extract", "normalize", "load"}, "failed"),
],
indirect=["pipeline"],
)
def test_build_pipeline_execution_visualization(
pipeline: dlt.Pipeline,
expected_steps: Set[TVisualPipelineStep],
expected_status: TPipelineRunStatus,
) -> None:
"""Test overall pipeline execution visualization logic"""
trace = pipeline.last_trace
html = build_pipeline_execution_visualization(trace)
html_str = str(html.text)
# Check for CSS class structure
assert 'class="pipeline-execution-container"' in html_str
assert 'class="pipeline-execution-layout"' in html_str
assert 'class="pipeline-execution-timeline"' in html_str
assert 'class="pipeline-execution-badges"' in html_str
assert f"Last execution ID: <strong>{trace.transaction_id[:8]}</strong>" in html_str
total_time_match = re.search(
r"<div>Total time: <strong>([\d.]+)(ms|s)?</strong></div>", html_str
)
assert total_time_match is not None
# Check for status badge using CSS classes (not inline styles)
status_badge_class = (
"status-badge-green" if expected_status == "succeeded" else "status-badge-red"
)
assert (
f'<div class="status-badge {status_badge_class}"><strong>{expected_status}</strong></div>'
in html_str
)
# Check for migration badge using CSS classes (not inline styles)
migrations_count = _get_migrations_count(trace.last_load_info) if trace.last_load_info else 0
migration_badge = (
f'<div class="status-badge status-badge-yellow"><strong>{migrations_count} dataset'
" migration(s)</strong></div>"
)
if migrations_count != 0:
assert migration_badge in html_str
else:
assert migration_badge not in html_str
steps_data, _ = _get_steps_data_and_status(trace.steps)
for step_data in steps_data:
duration_pattern = re.search(rf"{step_data.step.capitalize()}\s+([\d.]+)(ms|s)?", html_str)
assert duration_pattern is not None
if "extract" in expected_steps:
assert "var(--dlt-color-lime)" in html_str
if "normalize" in expected_steps:
assert "var(--dlt-color-aqua)" in html_str
if "load" in expected_steps:
assert "var(--dlt-color-pink)" in html_str
@pytest.mark.parametrize(
"pipeline",
[
SUCCESS_PIPELINE_DUCKDB,
SUCCESS_PIPELINE_FILESYSTEM,
EXTRACT_EXCEPTION_PIPELINE,
NORMALIZE_EXCEPTION_PIPELINE,
LOAD_EXCEPTION_PIPELINE,
],
indirect=["pipeline"],
)
def test_collect_load_packages_from_trace(
pipeline: dlt.Pipeline,
) -> None:
"""Test getting load package status labels from trace"""
trace = pipeline.last_trace
table = load_package_status_labels(trace)
list_of_load_package_info = cast(List[Dict[str, Any]], table.data)
if pipeline.pipeline_name in ["success_pipeline_duckdb", "success_pipeline_filesystem"]:
assert len(list_of_load_package_info) == 2
assert all(
"loaded" in str(load_package_info["status"].text)
for load_package_info in list_of_load_package_info
)
elif pipeline.pipeline_name == "extract_exception_pipeline":
assert len(list_of_load_package_info) == 1
assert "discarded" in str(list_of_load_package_info[0]["status"].text)
elif pipeline.pipeline_name == "load_exception_pipeline":
assert len(list_of_load_package_info) == 1
assert "aborted" in str(list_of_load_package_info[0]["status"].text)
elif pipeline.pipeline_name == "normalize_exception_pipeline":
assert len(list_of_load_package_info) == 1
assert "pending" in str(list_of_load_package_info[0]["status"].text)