from typing import cast, Set, List, Dict, Any
import os
import tempfile
from datetime import datetime
from pathlib import Path
import re

import marimo as mo
import pyarrow
import pytest

import dlt
from dlt.common import pendulum

from dlt._workspace.cli import utils as cli_utils
from dlt._workspace.helpers.dashboard.config import DashboardConfiguration
from dlt._workspace.helpers.dashboard.utils import (
    get_dashboard_config_sections,
    get_query_result_cached,
    resolve_dashboard_config,
    get_pipeline,
    pipeline_details,
    create_table_list,
    get_row_counts_list,
    create_column_list,
    get_query_result,
    get_row_counts,
    get_loads,
    trace_overview,
    trace_execution_context,
    trace_steps_overview,
    style_cell,
    _without_none_or_empty_string,
    _align_dict_keys,
    _humanize_datetime_values,
    _dict_to_table_items,
    build_exception_section,
    get_local_data_path,
    remote_state_details,
    sanitize_trace_for_display,
    trace_resolved_config_values,
    trace_step_details,
    get_source_and_resource_state_for_table,
    get_default_query_for_table,
    get_example_query_for_dataset,
    _get_steps_data_and_status,
    _get_migrations_count,
    build_pipeline_execution_visualization,
    _collect_load_packages_from_trace,
    load_package_status_labels,
    TPipelineRunStatus,
    TVisualPipelineStep,
)

from dlt.pipeline.trace import TRACE_FILE_NAME
from tests.workspace.helpers.dashboard.example_pipelines import (
    SUCCESS_PIPELINE_DUCKDB,
    SUCCESS_PIPELINE_FILESYSTEM,
    EXTRACT_EXCEPTION_PIPELINE,
    NORMALIZE_EXCEPTION_PIPELINE,
    NEVER_RAN_PIPELINE,
    LOAD_EXCEPTION_PIPELINE,
    NO_DESTINATION_PIPELINE,
    create_success_pipeline_duckdb,
)
from tests.workspace.helpers.dashboard.example_pipelines import (
    ALL_PIPELINES,
    PIPELINES_WITH_EXCEPTIONS,
    PIPELINES_WITH_LOAD,
)
from tests.workspace.utils import isolated_workspace

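# NOTE: the `pipeline` argument of the tests below is parametrized indirectly:
# with `indirect=True`, pytest routes each pipeline name through a shared
# `pipeline` fixture (presumably defined in a conftest next to example_pipelines)
# as `request.param`, and the fixture yields the ready dlt.Pipeline instance.
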
@pytest.fixture
def temp_pipelines_dir():
    """Create a temporary directory structure for testing pipelines"""
    with tempfile.TemporaryDirectory() as temp_dir:
        pipelines_dir = Path(temp_dir) / "pipelines"
        pipelines_dir.mkdir()

        # Create some test pipeline directories
        (pipelines_dir / "success_pipeline_1").mkdir()
        (pipelines_dir / "success_pipeline_2").mkdir()
        (pipelines_dir / "_dlt_internal").mkdir()

        # Create trace files with different modification times; os.utime takes (atime, mtime)
        trace_file_1 = pipelines_dir / "success_pipeline_1" / TRACE_FILE_NAME
        trace_file_1.touch()
        # Set an older modification time
        os.utime(trace_file_1, (1000000, 1000000))

        trace_file_2 = pipelines_dir / "success_pipeline_2" / TRACE_FILE_NAME
        trace_file_2.touch()
        # Set a more recent modification time
        os.utime(trace_file_2, (2000000, 2000000))

        yield str(pipelines_dir)

#
# cli utils tests
# TODO: move to test_cli_utils.py - pipeline fixtures should be unified for cli, dashboard and mcp tests
#

@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_pipelines(pipeline: dlt.Pipeline):
    """Test getting local pipelines"""
    pipelines_dir, pipelines = cli_utils.list_local_pipelines(pipeline.pipelines_dir)
    assert pipelines_dir == pipeline.pipelines_dir
    assert len(pipelines) == 1
    assert pipelines[0]["name"] == pipeline.pipeline_name

def test_get_local_pipelines_with_temp_dir(temp_pipelines_dir):
    """Test getting local pipelines with temporary directory"""
    pipelines_dir, pipelines = cli_utils.list_local_pipelines(temp_pipelines_dir)

    assert pipelines_dir == temp_pipelines_dir
    assert len(pipelines) == 3  # success_pipeline_1, success_pipeline_2, _dlt_internal

    # All created pipeline directories should be listed
    pipeline_names = [p["name"] for p in pipelines]
    assert "success_pipeline_2" in pipeline_names
    assert "success_pipeline_1" in pipeline_names
    assert "_dlt_internal" in pipeline_names

    # Check timestamps are present
    for pipeline in pipelines:
        assert "timestamp" in pipeline
        assert isinstance(pipeline["timestamp"], (int, float))

def test_get_local_pipelines_empty_dir():
    """Test getting local pipelines from empty directory"""
    with tempfile.TemporaryDirectory() as temp_dir:
        pipelines_dir, pipelines = cli_utils.list_local_pipelines(temp_dir)

        assert pipelines_dir == temp_dir
        assert pipelines == []

def test_get_local_pipelines_nonexistent_dir():
    """Test getting local pipelines from nonexistent directory"""
    nonexistent_dir = "/nonexistent/directory"
    pipelines_dir, pipelines = cli_utils.list_local_pipelines(nonexistent_dir)

    assert pipelines_dir == nonexistent_dir
    assert pipelines == []

@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_pipeline_last_run(pipeline: dlt.Pipeline):
    """Test getting the last run of a pipeline"""
    if pipeline.pipeline_name in [NEVER_RAN_PIPELINE, NO_DESTINATION_PIPELINE]:
        assert (
            cli_utils.get_pipeline_trace_mtime(pipeline.pipelines_dir, pipeline.pipeline_name) == 0
        )
    else:
        assert (
            cli_utils.get_pipeline_trace_mtime(pipeline.pipelines_dir, pipeline.pipeline_name)
            > 1000000
        )

def test_integration_get_local_pipelines_with_sorting(temp_pipelines_dir):
    """Test integration scenario with multiple pipelines sorted by timestamp"""
    pipelines_dir, pipelines = cli_utils.list_local_pipelines(
        temp_pipelines_dir, sort_by_trace=True
    )

    assert pipelines_dir == temp_pipelines_dir
    assert len(pipelines) == 3

    # Should be sorted by timestamp (descending - most recent first)
    timestamps = [p["timestamp"] for p in pipelines]
    assert timestamps == sorted(timestamps, reverse=True)

    # Verify the most recent pipeline is first
    most_recent = pipelines[0]
    assert most_recent["name"] == "success_pipeline_2"
    assert most_recent["timestamp"] == 2000000

#
# dashboard utils tests
#

@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_build_exception_section(pipeline: dlt.Pipeline):
    """Test that only pipelines that failed with an exception render an exception section"""
    if pipeline.pipeline_name in PIPELINES_WITH_EXCEPTIONS:
        assert "Show full stacktrace" in build_exception_section(pipeline)[0].text
    else:
        assert not build_exception_section(pipeline)

@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_local_data_path(pipeline: dlt.Pipeline):
    """Test resolving the local data path of a pipeline's destination"""
    if pipeline.pipeline_name in [LOAD_EXCEPTION_PIPELINE, NO_DESTINATION_PIPELINE]:
        # custom destination does not support local data path
        assert get_local_data_path(pipeline) is None
    else:
        assert get_local_data_path(pipeline)

def test_get_dashboard_config_sections(success_pipeline_duckdb) -> None:
    # NOTE: "dashboard" obligatory section comes from configuration __section__
    assert get_dashboard_config_sections(success_pipeline_duckdb) == (
        "pipelines",
        "success_pipeline_duckdb",
    )
    assert get_dashboard_config_sections(None) == ()

    # create workspace context
    with isolated_workspace("configured_workspace"):
        assert get_dashboard_config_sections(None) == ("workspace",)

def test_resolve_dashboard_config(success_pipeline_duckdb) -> None:
    """Test resolving dashboard config with a real pipeline"""

    os.environ["PIPELINES__SUCCESS_PIPELINE_DUCKDB__DASHBOARD__DATETIME_FORMAT"] = "some format"
    os.environ["DASHBOARD__DATETIME_FORMAT"] = "other format"

    config = resolve_dashboard_config(success_pipeline_duckdb)

    assert isinstance(config, DashboardConfiguration)
    assert isinstance(config.datetime_format, str)
    assert config.datetime_format == "some format"

    other_pipeline = dlt.pipeline(pipeline_name="other_pipeline", destination="duckdb")
    config = resolve_dashboard_config(other_pipeline)
    assert config.datetime_format == "other format"

    # create workspace context
    with isolated_workspace("configured_workspace"):
        os.environ["WORKSPACE__DASHBOARD__DATETIME_FORMAT"] = "workspace format"
        config = resolve_dashboard_config(None)
        assert config.datetime_format == "workspace format"

@pytest.mark.parametrize("pipeline", PIPELINES_WITH_LOAD, indirect=True)
def test_get_source_and_resource_state_for_table(pipeline: dlt.Pipeline):
    """Test getting source and resource state for a table"""
    table = pipeline.default_schema.tables["purchases"]
    resource_name, source_state, resource_state = get_source_and_resource_state_for_table(
        table, pipeline, pipeline.default_schema_name
    )
    assert resource_name
    assert source_state == {}
    assert resource_state.get("incremental").get("id") is not None

    # check it can be rendered with marimo
    assert mo.json(resource_state).text
    assert mo.json(source_state).text

@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_pipeline(pipeline: dlt.Pipeline):
    """Test getting a real pipeline by name"""
    loaded_pipeline = get_pipeline(pipeline.pipeline_name, pipeline.pipelines_dir)

    assert loaded_pipeline.pipeline_name == pipeline.pipeline_name
    assert loaded_pipeline.dataset_name == pipeline.dataset_name

@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_pipeline_details(pipeline, temp_pipelines_dir):
    """Test getting pipeline details from a real pipeline"""
    config = DashboardConfiguration()
    result = pipeline_details(config, pipeline, temp_pipelines_dir)

    assert isinstance(result, list)
    if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
        assert len(result) == 9
    elif pipeline.pipeline_name in [LOAD_EXCEPTION_PIPELINE, NORMALIZE_EXCEPTION_PIPELINE]:
        # custom destination does not support remote data info
        assert len(result) == 8
    else:
        # no remote data info
        assert len(result) == 7

    # Convert to dict for easier testing
    details_dict = {item["name"]: item["value"] for item in result}

    assert details_dict["pipeline_name"] == pipeline.pipeline_name
    if pipeline.pipeline_name == NO_DESTINATION_PIPELINE:
        assert details_dict["destination"] == "No destination set"
    elif pipeline.pipeline_name == SUCCESS_PIPELINE_FILESYSTEM:
        assert details_dict["destination"] == "filesystem (dlt.destinations.filesystem)"
    elif pipeline.pipeline_name == LOAD_EXCEPTION_PIPELINE:
        assert details_dict["destination"] == "dummy (dlt.destinations.dummy)"
    else:
        assert details_dict["destination"] == "duckdb (dlt.destinations.duckdb)"
    assert details_dict["dataset_name"] == pipeline.dataset_name
    if pipeline.pipeline_name in PIPELINES_WITH_LOAD or pipeline.pipeline_name in [
        LOAD_EXCEPTION_PIPELINE,
        NORMALIZE_EXCEPTION_PIPELINE,
    ]:
        assert details_dict["schemas"].startswith("fruitshop")
    else:
        assert "schemas" not in details_dict

    assert details_dict["working_dir"].endswith(pipeline.pipeline_name)

@pytest.mark.parametrize("pipeline", PIPELINES_WITH_LOAD, indirect=True)
@pytest.mark.parametrize("show_internals", [True, False])
@pytest.mark.parametrize("show_child_tables", [True, False])
def test_create_table_list(pipeline, show_internals, show_child_tables):
    """Test creating a basic table list with real schema"""
    config = DashboardConfiguration()

    result = create_table_list(
        config,
        pipeline,
        selected_schema_name=pipeline.default_schema_name,
        show_internals=show_internals,
        show_child_tables=show_child_tables,
    )
    # check it can be rendered as table with marimo
    assert mo.ui.table(result).text is not None

    base_table_names = {"inventory", "purchases", "customers", "inventory_categories"}
    dlt_table_names = {"_dlt_loads", "_dlt_version", "_dlt_pipeline_state"}
    child_table_names = {"purchases__child"}

    expected_table_names = {*base_table_names}
    if show_internals:
        expected_table_names.update(dlt_table_names)
    if show_child_tables:
        expected_table_names.update(child_table_names)

    table_names = {table["name"] for table in result}
    assert table_names == expected_table_names

@pytest.mark.parametrize("pipeline", PIPELINES_WITH_LOAD, indirect=True)
@pytest.mark.parametrize("show_internals", [True, False])
@pytest.mark.parametrize("show_type_hints", [True, False])
@pytest.mark.parametrize("show_other_hints", [True, False])
@pytest.mark.parametrize("show_custom_hints", [True, False])
def test_create_column_list_basic(
    pipeline, show_internals, show_type_hints, show_other_hints, show_custom_hints
):
    """Test creating a basic column list with real schema"""
    config = DashboardConfiguration()

    # Should exclude _dlt columns by default, will also not show incomplete columns
    result = create_column_list(
        config,
        pipeline,
        selected_schema_name=pipeline.default_schema_name,
        table_name="purchases",
        show_internals=show_internals,
        show_type_hints=show_type_hints,
        show_other_hints=show_other_hints,
        show_custom_hints=show_custom_hints,
    )

    # check it can be rendered as table with marimo
    assert mo.ui.table(result).text is not None

    # check visible columns
    base_column_names = {"customer_id", "quantity", "id", "inventory_id", "date"}
    dlt_column_names = {"_dlt_load_id", "_dlt_id"}

    expected_column_names = {*base_column_names}
    if show_internals:
        expected_column_names.update(dlt_column_names)

    column_names = {col["name"] for col in result}
    assert column_names == expected_column_names

    # Find the id column
    id_column = next(col for col in result if col["name"] == "id")

    # check type hints
    if show_type_hints:
        assert id_column["data_type"] == "bigint"
        assert id_column["nullable"] is False
    else:
        assert "data_type" not in id_column
        assert "nullable" not in id_column

    if show_other_hints:
        assert id_column["primary_key"] is True
    else:
        assert "primary_key" not in id_column

    if show_custom_hints:
        assert id_column["x-custom"] == "foo"
    else:
        assert "x-custom" not in id_column

@pytest.mark.parametrize("pipeline", PIPELINES_WITH_LOAD, indirect=True)
def test_get_query_result(pipeline: dlt.Pipeline):
    """Test getting query result from real pipeline"""
    # Clear cache first
    get_query_result_cached.cache_clear()

    result, error_message, traceback_string = get_query_result(
        pipeline, "SELECT COUNT(*) as count FROM purchases"
    )

    if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
        assert isinstance(result, pyarrow.Table)
        assert len(result) == 1
        # merge does not work on filesystem, hence the higher count there
        expected_count = 100 if pipeline.pipeline_name == SUCCESS_PIPELINE_DUCKDB else 103
        assert result[0][0].as_py() == expected_count
    else:
        assert len(result) == 0
        assert error_message
        assert traceback_string

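# A note on the pyarrow access pattern above: for a pyarrow.Table, `result[0]`
# selects the first column (a ChunkedArray) and `[0].as_py()` converts its first
# value into a plain Python object.
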
@pytest.mark.parametrize("pipeline", PIPELINES_WITH_LOAD, indirect=True)
def test_get_default_query_for_table(pipeline: dlt.Pipeline):
    """Test building the default query for a table"""
    query, error_message, traceback_string = get_default_query_for_table(
        pipeline, pipeline.default_schema_name, "purchases", True
    )
    assert query == 'SELECT\n *\nFROM "purchases"\nLIMIT 1000'
    assert not error_message
    assert not traceback_string

@pytest.mark.parametrize("pipeline", PIPELINES_WITH_LOAD, indirect=True)
def test_get_example_query_for_dataset(pipeline: dlt.Pipeline):
    """Test building an example query for the dataset"""
    query, error_message, traceback_string = get_example_query_for_dataset(
        pipeline, pipeline.default_schema_name
    )
    assert query == 'SELECT\n *\nFROM "customers"\nLIMIT 1000'
    assert not error_message
    assert not traceback_string

@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_row_counts_list(pipeline: dlt.Pipeline):
    """Test getting row counts from real pipeline"""
    result = get_row_counts_list(pipeline)

    # check it can be rendered as table with marimo
    assert mo.ui.table(result).text is not None

    reverted_result = {i["name"]: i["row_count"] for i in result}

    if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
        assert reverted_result == {
            "customers": 13,
            "inventory": 6,
            "purchases": (
                100 if pipeline.pipeline_name == SUCCESS_PIPELINE_DUCKDB else 103
            ),  # merge does not work on filesystem
            "purchases__child": 3,
            "inventory_categories": 3,
            "_dlt_version": 3,
            "_dlt_loads": 4,
            "_dlt_pipeline_state": 3,
        }
    else:
        assert reverted_result == {}

@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_loads(pipeline: dlt.Pipeline):
    """Test getting loads from real pipeline"""
    config = DashboardConfiguration()

    result, error_message, traceback_string = get_loads(config, pipeline, limit=100)

    # check it can be rendered as table with marimo
    assert mo.ui.table(result).text is not None

    if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
        assert isinstance(result, list)
        assert len(result) >= 1  # Should have at least one load
        assert not error_message
        assert not traceback_string
        if result:
            load = result[0]
            assert "load_id" in load
    else:
        assert result == []
        assert error_message
        assert traceback_string

@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_trace(pipeline: dlt.Pipeline):
    """Test trace overview with real trace data"""
    config = DashboardConfiguration()
    trace = pipeline.last_trace

    if pipeline.pipeline_name in [NEVER_RAN_PIPELINE, NO_DESTINATION_PIPELINE]:
        assert trace is None
        return

    # overview
    result = trace_overview(config, trace)
    # check it can be rendered as table with marimo
    assert mo.ui.table(result).text is not None

    assert {item["name"] for item in result} == {
        "pipeline_name",
        "started_at",
        "finished_at",
        "transaction_id",
        "duration",
    }
    values_dict = {item["name"]: item["value"] for item in result}
    assert values_dict["pipeline_name"] == pipeline.pipeline_name

    # execution context
    result = trace_execution_context(config, trace)
    # check it can be rendered as table with marimo
    assert mo.ui.table(result).text is not None

    assert len(result) == 7
    assert {item["name"] for item in result} == {
        "cpu",
        "os",
        "library",
        "run_context",
        "python",
        "ci_run",
        "exec_info",
    }
    # TODO: inspect values

    # steps overview
    result = trace_steps_overview(config, trace)
    # check it can be rendered as table with marimo
    assert mo.ui.table(result).text is not None

    if pipeline.pipeline_name == EXTRACT_EXCEPTION_PIPELINE:
        assert len(result) == 1
        assert result[0]["step"] == "extract"
    elif pipeline.pipeline_name == NORMALIZE_EXCEPTION_PIPELINE:
        assert len(result) == 2
        assert result[0]["step"] == "extract"
        assert result[1]["step"] == "normalize"
    else:
        assert len(result) == 3
        assert result[0]["step"] == "extract"
        assert result[1]["step"] == "normalize"
        assert result[2]["step"] == "load"

    # TODO: inspect values of trace steps overview

    for item in result:
        trace_step_details(config, trace, item["step"])
        # TODO: inspect trace step details

    # resolved config values (TODO: add at least one config value)
    result = trace_resolved_config_values(config, trace)
    # check it can be rendered as table with marimo
    assert mo.ui.table(result).text is not None

@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_get_remote_state_details(pipeline: dlt.Pipeline):
    """Test reading remote state details from the destination"""
    remote_state = remote_state_details(pipeline)
    # check it can be rendered as table with marimo
    assert mo.ui.table(remote_state).text is not None

    if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
        assert remote_state[0] == {"name": "state_version", "value": 3}
        assert remote_state[1]["name"] == "schemas"
        assert remote_state[1]["value"].startswith("fruitshop")
        assert remote_state[2]["name"] == ""
        assert remote_state[2]["value"].startswith("fruitshop_customers")
    else:
        assert remote_state[0] == {
            "name": "Info",
            "value": "Could not restore state from destination",
        }

def test_style_cell():
    """Test style cell function"""
    # Even row
    result = style_cell("0", "test_col", "test_value")
    assert result["background-color"] == "white"

    # Odd row
    result = style_cell("1", "test_col", "test_value")
    assert result["background-color"] == "#f4f4f9"

    # Name column (case insensitive)
    result = style_cell("0", "name", "test_value")
    assert result["font-weight"] == "bold"

    result = style_cell("0", "NAME", "test_value")
    assert result["font-weight"] == "bold"

def test_without_none_or_empty_string():
    """Test removing None and empty string values"""
    input_dict = {
        "key1": "value1",
        "key2": None,
        "key3": "",
        "key4": "value4",
        "key5": 0,  # Should be kept
        "key6": False,  # Should be kept
        "key7": [],  # Should be kept
    }

    result = _without_none_or_empty_string(input_dict)

    expected = {"key1": "value1", "key4": "value4", "key5": 0, "key6": False, "key7": []}
    assert result == expected

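# The contract exercised above, as a minimal sketch (the actual implementation
# lives in dlt._workspace.helpers.dashboard.utils and may differ):
#
#   def _without_none_or_empty_string(d):
#       return {k: v for k, v in d.items() if v is not None and v != ""}
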
def test_align_dict_keys():
    """Test aligning dictionary keys"""
    items = [
        {"key1": "value1", "key2": "value2"},
        {"key1": "value3", "key3": "value4"},
        {"key2": "value5", "key4": "value6"},
    ]

    result = _align_dict_keys(items)

    # All items should have all keys
    expected_keys = {"key1", "key2", "key3", "key4"}
    for item in result:
        assert set(item.keys()) == expected_keys

    # Missing keys should be filled with "-"
    assert result[0]["key3"] == "-"
    assert result[0]["key4"] == "-"
    assert result[1]["key2"] == "-"
    assert result[1]["key4"] == "-"
    assert result[2]["key1"] == "-"
    assert result[2]["key3"] == "-"

def test_align_dict_keys_with_none_values():
    """Test aligning dictionary keys with None values filtered out"""
    items = [
        {"key1": "value1", "key2": None, "key3": ""},
        {"key1": None, "key2": "value2", "key4": "value4"},
    ]

    result = _align_dict_keys(items)

    # None and empty string values should be removed before alignment
    assert "key3" not in result[0]  # Was empty string

    # Missing keys should be filled with "-"
    assert result[0]["key4"] == "-"
    assert result[0]["key2"] == "-"
    assert result[1]["key1"] == "-"

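# Taken together, the two tests above pin down _align_dict_keys: each item is
# first cleaned of None and empty-string values, then padded with "-" for every
# key present in any other item (a reading of the asserts, not the actual code).
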
def test_humanize_datetime_values():
    """Test humanizing datetime values"""
    config = DashboardConfiguration()
    config.datetime_format = "YYYY-MM-DD HH:mm:ss Z"

    input_dict = {
        "started_at": pendulum.parse("2023-01-01T12:00:00"),
        "finished_at": pendulum.parse("2023-01-01T12:30:00"),
        "created": 1672574400,  # Unix timestamp
        "last_modified": "1672574400.123",  # String timestamp
        "inserted_at": datetime(2023, 1, 1, 12, 0, 0),
        "load_id": 1672574400,  # Unix timestamp for 2023-01-01T12:00:00
        "other_field": "unchanged",
        "numeric_field": 42,
    }

    result = _humanize_datetime_values(config, input_dict)

    # Should have duration calculated
    assert "duration" in result

    # Original non-datetime fields should be preserved
    assert result["other_field"] == "unchanged"
    assert result["numeric_field"] == 42

    assert result["created"] == "2023-01-01 12:00:00 +00:00"
    assert result["last_modified"] == "2023-01-01 12:00:00 +00:00"
    assert result["inserted_at"] == "2023-01-01 12:00:00 +00:00"
    assert result["load_package_created_at"] == "2023-01-01 12:00:00 +00:00"

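# For reference, `datetime_format` above uses pendulum's formatting tokens;
# assuming naive inputs are treated as UTC:
#   pendulum.parse("2023-01-01T12:00:00").format("YYYY-MM-DD HH:mm:ss Z")
#   # -> '2023-01-01 12:00:00 +00:00'
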
def test_dict_to_table_items():
    """Test converting dict to table items"""
    input_dict = {
        "pipeline_name": "success_pipeline",
        "destination": "duckdb",
        "status": "completed",
    }

    result = _dict_to_table_items(input_dict)

    expected = [
        {"name": "pipeline_name", "value": "success_pipeline"},
        {"name": "destination", "value": "duckdb"},
        {"name": "status", "value": "completed"},
    ]

    # Sort both by name for comparison since dict order may vary
    result_sorted = sorted(result, key=lambda x: x["name"])
    expected_sorted = sorted(expected, key=lambda x: x["name"])

    assert result_sorted == expected_sorted

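# _dict_to_table_items evidently maps {"k": "v", ...} to
# [{"name": "k", "value": "v"}, ...]; a sketch of the expected behavior:
#
#   def _dict_to_table_items(d):
#       return [{"name": k, "value": v} for k, v in d.items()]
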
@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_integration_pipeline_workflow(pipeline, temp_pipelines_dir):
    """Test integration scenario with complete pipeline workflow"""
    # Test pipeline details
    config = DashboardConfiguration()

    details = pipeline_details(config, pipeline, temp_pipelines_dir)

    # check it can be rendered as table with marimo
    assert mo.ui.table(details).text is not None

    details_dict = {item["name"]: item["value"] for item in details}
    assert details_dict["pipeline_name"] == pipeline.pipeline_name

    # Test row counts
    row_counts = get_row_counts(pipeline)

    if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
        assert row_counts["customers"] == 13
    else:
        assert row_counts == {}

    # Test query execution
    query_result, error_message, traceback_string = get_query_result(
        pipeline, "SELECT name FROM customers ORDER BY id"
    )
    if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
        assert len(query_result) == 13
        assert query_result[0][0].as_py() == "simon"
        assert not error_message
        assert not traceback_string
    else:
        assert len(query_result) == 0
        assert error_message
        assert traceback_string

    # Test loads
    loads, error_message, traceback_string = get_loads(config, pipeline)
    if pipeline.pipeline_name in PIPELINES_WITH_LOAD:
        assert len(loads) >= 1
        assert not error_message
        assert not traceback_string
    else:
        assert error_message
        assert traceback_string
        assert loads == []

@pytest.mark.parametrize("pipeline", ALL_PIPELINES, indirect=True)
def test_sanitize_trace_for_display(pipeline: dlt.Pipeline):
    """Test sanitizing trace for display"""
    trace = pipeline.last_trace
    sanitized = sanitize_trace_for_display(trace)
    assert sanitized is not None
    assert isinstance(sanitized, dict)
    # check it can be rendered with marimo
    assert mo.json(sanitized).text is not None

@pytest.mark.parametrize(
    "pipeline, expected_steps, expected_status",
    [
        (SUCCESS_PIPELINE_DUCKDB, {"extract", "normalize", "load"}, "succeeded"),
        (SUCCESS_PIPELINE_FILESYSTEM, {"extract", "normalize", "load"}, "succeeded"),
        (EXTRACT_EXCEPTION_PIPELINE, {"extract"}, "failed"),
        (LOAD_EXCEPTION_PIPELINE, {"extract", "normalize", "load"}, "failed"),
    ],
    indirect=["pipeline"],
)
def test_get_steps_data_and_status(
    pipeline: dlt.Pipeline,
    expected_steps: Set[TVisualPipelineStep],
    expected_status: TPipelineRunStatus,
) -> None:
    """Test getting steps data and the pipeline execution status from trace"""
    trace = pipeline.last_trace

    steps_data, status = _get_steps_data_and_status(trace.steps)
    assert len(steps_data) == len(expected_steps)
    assert status == expected_status

    assert all(step.duration_ms > 0 for step in steps_data)
    if expected_status == "succeeded":
        assert all(step.failed is False for step in steps_data)
    else:
        assert any(step.failed is True for step in steps_data)

    assert {step.step for step in steps_data} == expected_steps

def test_get_migrations_count(temp_pipelines_dir) -> None:
    """Test getting migrations count from the pipeline's last load info"""
    import duckdb

    db_conn = duckdb.connect()
    try:
        pipeline = create_success_pipeline_duckdb(temp_pipelines_dir, db_conn=db_conn)

        migrations_count = _get_migrations_count(pipeline.last_trace.last_load_info)
        assert migrations_count == 1

        # Trigger multiple migrations
        pipeline.extract([{"id": 1, "name": "test"}], table_name="my_table")
        pipeline.extract([{"id": 2, "name": "test2", "new_column": "value"}], table_name="my_table")
        pipeline.extract(
            [{"id": 3, "name": "test3", "new_column": "value", "another_column": 100}],
            table_name="my_table",
        )
        pipeline.normalize()
        pipeline.load()
        migrations_count = _get_migrations_count(pipeline.last_trace.last_load_info)
        assert migrations_count == 3
    finally:
        db_conn.close()

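# A note on the counts asserted above: the first run apparently registers one
# migration (the initial schema creation), while the three extracts each evolve
# the schema of "my_table", so the single load() applies three migrations - one
# per load package, as we read the assertions.
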
@pytest.mark.parametrize(
    "pipeline, expected_steps, expected_status",
    [
        (SUCCESS_PIPELINE_DUCKDB, {"extract", "normalize", "load"}, "succeeded"),
        (SUCCESS_PIPELINE_FILESYSTEM, {"extract", "normalize", "load"}, "succeeded"),
        (EXTRACT_EXCEPTION_PIPELINE, {"extract"}, "failed"),
        (LOAD_EXCEPTION_PIPELINE, {"extract", "normalize", "load"}, "failed"),
    ],
    indirect=["pipeline"],
)
def test_build_pipeline_execution_visualization(
    pipeline: dlt.Pipeline,
    expected_steps: Set[TVisualPipelineStep],
    expected_status: TPipelineRunStatus,
) -> None:
    """Test overall pipeline execution visualization logic"""

    trace = pipeline.last_trace

    html = build_pipeline_execution_visualization(trace)
    html_str = str(html.text)

    # Check for CSS class structure
    assert 'class="pipeline-execution-container"' in html_str
    assert 'class="pipeline-execution-layout"' in html_str
    assert 'class="pipeline-execution-timeline"' in html_str
    assert 'class="pipeline-execution-badges"' in html_str

    assert f"Last execution ID: <strong>{trace.transaction_id[:8]}</strong>" in html_str
    total_time_match = re.search(
        r"<div>Total time: <strong>([\d.]+)(ms|s)?</strong></div>", html_str
    )
    assert total_time_match is not None

    # Check for status badge using CSS classes (not inline styles)
    status_badge_class = (
        "status-badge-green" if expected_status == "succeeded" else "status-badge-red"
    )
    assert (
        f'<div class="status-badge {status_badge_class}"><strong>{expected_status}</strong></div>'
        in html_str
    )

    # Check for migration badge using CSS classes (not inline styles)
    migrations_count = _get_migrations_count(trace.last_load_info) if trace.last_load_info else 0
    migration_badge = (
        f'<div class="status-badge status-badge-yellow"><strong>{migrations_count} dataset'
        " migration(s)</strong></div>"
    )
    if migrations_count != 0:
        assert migration_badge in html_str
    else:
        assert migration_badge not in html_str

    steps_data, _ = _get_steps_data_and_status(trace.steps)
    for step_data in steps_data:
        duration_pattern = re.search(rf"{step_data.step.capitalize()}\s+([\d.]+)(ms|s)?", html_str)
        assert duration_pattern is not None

    if "extract" in expected_steps:
        assert "var(--dlt-color-lime)" in html_str
    if "normalize" in expected_steps:
        assert "var(--dlt-color-aqua)" in html_str
    if "load" in expected_steps:
        assert "var(--dlt-color-pink)" in html_str

@pytest.mark.parametrize(
    "pipeline",
    [
        SUCCESS_PIPELINE_DUCKDB,
        SUCCESS_PIPELINE_FILESYSTEM,
        EXTRACT_EXCEPTION_PIPELINE,
        NORMALIZE_EXCEPTION_PIPELINE,
        LOAD_EXCEPTION_PIPELINE,
    ],
    indirect=["pipeline"],
)
def test_collect_load_packages_from_trace(
    pipeline: dlt.Pipeline,
) -> None:
    """Test getting load package status labels from trace"""

    trace = pipeline.last_trace
    table = load_package_status_labels(trace)

    list_of_load_package_info = cast(List[Dict[str, Any]], table.data)

    if pipeline.pipeline_name in ["success_pipeline_duckdb", "success_pipeline_filesystem"]:
        assert len(list_of_load_package_info) == 2
        assert all(
            "loaded" in str(load_package_info["status"].text)
            for load_package_info in list_of_load_package_info
        )

    elif pipeline.pipeline_name == "extract_exception_pipeline":
        assert len(list_of_load_package_info) == 1
        assert "discarded" in str(list_of_load_package_info[0]["status"].text)

    elif pipeline.pipeline_name == "load_exception_pipeline":
        assert len(list_of_load_package_info) == 1
        assert "aborted" in str(list_of_load_package_info[0]["status"].text)

    elif pipeline.pipeline_name == "normalize_exception_pipeline":
        assert len(list_of_load_package_info) == 1
        assert "pending" in str(list_of_load_package_info[0]["status"].text)