#
# A list of pipelines that are executed and left in various states for testing the dashboard
# TODO: generalize these and make them available for other tests
# TODO: consolidate these test pipelines with the ones in tests/e2e/helpers/dashboard
#

from typing import Any, Optional
from unittest.mock import patch

import duckdb
import dlt
import pytest

from dlt._workspace._templates._single_file_templates.fruitshop_pipeline import (
    fruitshop as fruitshop_source,
)
from dlt._workspace._templates._single_file_templates.arrow_pipeline import (
    resource as humans,
)
from dlt.common.destination.exceptions import (
    DestinationTerminalException,
)

SUCCESS_PIPELINE_DUCKDB = "success_pipeline_duckdb"
SUCCESS_PIPELINE_FILESYSTEM = "success_pipeline_filesystem"
EXTRACT_EXCEPTION_PIPELINE = "extract_exception_pipeline"
NORMALIZE_EXCEPTION_PIPELINE = "normalize_exception_pipeline"
NEVER_RAN_PIPELINE = "never_ran_pipeline"
LOAD_EXCEPTION_PIPELINE = "load_exception_pipeline"
NO_DESTINATION_PIPELINE = "no_destination_pipeline"
SYNC_EXCEPTION_PIPELINE = "sync_exception_pipeline"

ALL_PIPELINES = [
    SUCCESS_PIPELINE_DUCKDB,
    EXTRACT_EXCEPTION_PIPELINE,
    NORMALIZE_EXCEPTION_PIPELINE,
    NEVER_RAN_PIPELINE,
    LOAD_EXCEPTION_PIPELINE,
    NO_DESTINATION_PIPELINE,
    SUCCESS_PIPELINE_FILESYSTEM,
    SYNC_EXCEPTION_PIPELINE,
]

PIPELINES_WITH_EXCEPTIONS = [
    EXTRACT_EXCEPTION_PIPELINE,
    NORMALIZE_EXCEPTION_PIPELINE,
    LOAD_EXCEPTION_PIPELINE,
    SYNC_EXCEPTION_PIPELINE,
]
PIPELINES_WITH_LOAD = [SUCCESS_PIPELINE_DUCKDB, SUCCESS_PIPELINE_FILESYSTEM]

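# Illustrative sketch (hedged, not used by this module): the lists above lend
# themselves to test parametrization. `_FACTORIES` is an assumed mapping, not
# part of this file:
#
#   _FACTORIES = {SUCCESS_PIPELINE_DUCKDB: create_success_pipeline_duckdb, ...}
#
#   @pytest.mark.parametrize("name", ALL_PIPELINES)
#   def test_dashboard_shows_pipeline(name, tmp_path):
#       pipeline = _FACTORIES[name](pipelines_dir=str(tmp_path))
#       ...

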
def run_success_pipeline(pipeline: dlt.Pipeline):
    """
    Run a success pipeline that produces:

    * the fruitshop dataset
    * a child table
    * a second schema
    * an incremental column
    """
    pipeline.run(fruitshop_source(), schema=dlt.Schema("fruitshop"))

    # we overwrite the purchases table to have a child table and an incomplete column
    @dlt.resource(  # type: ignore
        primary_key="id",
        write_disposition="merge",
        columns={"incomplete": {}, "id": {"x-custom": "foo"}},
    )
    def purchases(inc_id=dlt.sources.incremental("id")):
        """Load purchases data from a simple python list."""
        yield [
            {
                "id": 1,
                "customer_id": 1,
                "inventory_id": 1,
                "quantity": 1,
                "child": [
                    {"id": 1, "name": "child 1"},
                    {"id": 2, "name": "child 2"},
                    {"id": 3, "name": "child 3"},
                ],
            },
            {"id": 2, "customer_id": 1, "inventory_id": 2, "quantity": 2},
            {"id": 3, "customer_id": 2, "inventory_id": 3, "quantity": 3},
        ]

    pipeline.run(purchases(), schema=dlt.Schema("fruitshop"))

    # write something to another schema so we have multiple schemas
    customers = fruitshop_source().customers

    customers.apply_hints(
        nested_hints={
            ("child",): dlt.mark.make_nested_hints(columns=[{"name": "name", "data_type": "text"}]),
            ("child", "child_inner"): dlt.mark.make_nested_hints(
                columns=[{"name": "name", "data_type": "text"}]
            ),
        },
    )
    pipeline.run(
        customers,
        schema=dlt.Schema("fruitshop_customers"),
        table_name="other_customers",
    )

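# Usage sketch (hedged; `dataset()` access as in recent dlt versions): after
# `run_success_pipeline`, a pipeline holds two schemas, "fruitshop" and
# "fruitshop_customers", and `purchases` carries an incremental cursor on "id",
# so re-running the resource yields no new rows:
#
#   pipeline = create_success_pipeline_duckdb()
#   print(pipeline.schema_names)                # ["fruitshop", "fruitshop_customers"]
#   print(pipeline.dataset().purchases.df())    # 3 rows, child rows in "purchases__child"

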
def create_success_pipeline_duckdb(pipelines_dir: Optional[str] = None, db_conn: Any = None):
    """Create a test pipeline with a duckdb destination; for its properties see `run_success_pipeline`"""
    pipeline = dlt.pipeline(
        pipeline_name=SUCCESS_PIPELINE_DUCKDB,
        pipelines_dir=pipelines_dir,
        destination=dlt.destinations.duckdb(credentials=db_conn),
    )

    run_success_pipeline(pipeline)

    return pipeline

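# Note (an assumption about dlt's defaults, not stated in this file): with
# `db_conn=None` the duckdb destination falls back to its default credentials,
# i.e. a local `<pipeline_name>.duckdb` file rather than an in-memory database.

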
def create_success_pipeline_filesystem(
    pipelines_dir: Optional[str] = None, bucket_url: str = "_storage/data"
):
    """Create a test pipeline with a filesystem destination; for its properties see `run_success_pipeline`"""
    pipeline = dlt.pipeline(
        pipeline_name=SUCCESS_PIPELINE_FILESYSTEM,
        pipelines_dir=pipelines_dir,
        destination=dlt.destinations.filesystem(bucket_url=bucket_url),
    )

    run_success_pipeline(pipeline)

    return pipeline


def create_fruitshop_duckdb_with_shared_dataset(pipelines_dir: Optional[str] = None):
    """Create a test pipeline on a shared in-memory duckdb dataset; for its properties see `run_success_pipeline`"""
    pipeline = dlt.pipeline(
        pipeline_name="fruits_test_pipeline",
        pipelines_dir=pipelines_dir,
        destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:/data.db")),
        dataset_name="test_shared_dataset",
    )

    run_success_pipeline(pipeline)

    return pipeline


def create_humans_arrow_duckdb_with_shared_dataset(
    pipelines_dir: Optional[str] = None,
):
    """Create a test pipeline that loads the arrow `humans` resource into the shared in-memory duckdb dataset"""
    pipeline = dlt.pipeline(
        pipeline_name="humans_test_pipeline",
        pipelines_dir=pipelines_dir,
        destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:/data.db")),
        dataset_name="test_shared_dataset",
    )

    pipeline.run(humans())

    return pipeline

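# Sketch of the shared-dataset setup (hedged; this relies on duckdb resolving
# ":memory:/data.db" to the same named in-memory database within one process):
# both factories above write to dataset "test_shared_dataset", so their tables
# land side by side:
#
#   fruits = create_fruitshop_duckdb_with_shared_dataset()
#   people = create_humans_arrow_duckdb_with_shared_dataset()
#   # fruits.dataset() and people.dataset() now read from the same duckdb dataset

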
def create_extract_exception_pipeline(pipelines_dir: Optional[str] = None):
    """Create a test pipeline with a duckdb destination that raises an exception in the extract step"""
    pipeline = dlt.pipeline(
        pipeline_name=EXTRACT_EXCEPTION_PIPELINE,
        pipelines_dir=pipelines_dir,
        destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
    )

    @dlt.resource
    def broken_resource():
        raise AssertionError("I am broken")

    with pytest.raises(Exception):
        pipeline.run(broken_resource())

    return pipeline

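# Inspection sketch (hedged; `last_trace` is part of dlt's public pipeline API,
# but the exact trace layout may differ between versions):
#
#   pipeline = create_extract_exception_pipeline()
#   trace = pipeline.last_trace
#   assert trace.steps[-1].step == "extract"
#   assert trace.steps[-1].step_exception is not None

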
def create_normalize_exception_pipeline(pipelines_dir: Optional[str] = None):
    """Create a test pipeline with a duckdb destination that raises an exception in the normalize step"""
    pipeline = dlt.pipeline(
        pipeline_name=NORMALIZE_EXCEPTION_PIPELINE,
        pipelines_dir=pipelines_dir,
        destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
    )

    @dlt.resource
    def data_with_type_conflict():
        # first yield a double, then a string for the same column - this fails
        # during normalize under a frozen data_type schema contract
        yield [{"id": 1, "value": 123.4}]
        yield [{"id": 2, "value": "string"}]

    with pytest.raises(Exception):
        pipeline.run(
            data_with_type_conflict(),
            schema=dlt.Schema("fruitshop"),
            table_name="items",
            schema_contract={"data_type": "freeze"},  # strict mode - fail on type conflicts
        )

    return pipeline

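# Background sketch (hedged): with `schema_contract={"data_type": "freeze"}` dlt
# refuses type evolution on existing columns, so the second yield above should
# surface as a failure in the normalize step:
#
#   pipeline = create_normalize_exception_pipeline()
#   assert pipeline.last_trace.steps[-1].step == "normalize"

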
def create_never_ran_pipeline(pipelines_dir: Optional[str] = None):
    """Create a test pipeline with a duckdb destination that was never run"""
    pipeline = dlt.pipeline(
        pipeline_name=NEVER_RAN_PIPELINE,
        pipelines_dir=pipelines_dir,
        destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
    )

    return pipeline


def create_load_exception_pipeline(pipelines_dir: Optional[str] = None):
    """Create a test pipeline that raises an exception in the load step"""

    # NOTE: this custom destination is currently unused - the pipeline below relies
    # on the dummy destination timing out instead
    @dlt.destination
    def failing_destination(one, two):
        raise DestinationTerminalException()

    pipeline = dlt.pipeline(
        pipeline_name=LOAD_EXCEPTION_PIPELINE,
        pipelines_dir=pipelines_dir,
        destination=dlt.destinations.dummy(timeout=0.1),
    )

    with pytest.raises(Exception):
        pipeline.run([1, 2, 3], table_name="items", schema=dlt.Schema("fruitshop"))

    return pipeline

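# Alternative sketch (hedged; `fail_prob` is an option of dlt's dummy destination):
# load failures can also be forced deterministically via
#
#   dlt.destinations.dummy(fail_prob=1.0)
#
# instead of relying on the 0.1s timeout used above.

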
def create_no_destination_pipeline(pipelines_dir: Optional[str] = None):
    """Create a test pipeline with no destination"""
    pipeline = dlt.pipeline(
        pipeline_name=NO_DESTINATION_PIPELINE,
        pipelines_dir=pipelines_dir,
    )
    return pipeline


def create_sync_exception_pipeline(pipelines_dir: Optional[str] = None):
    """Create a test pipeline that raises an exception in the sync step"""
    pipeline = dlt.pipeline(
        pipeline_name=SYNC_EXCEPTION_PIPELINE,
        pipelines_dir=pipelines_dir,
        destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")),
    )

    @dlt.resource
    def dummy_data():
        yield [{"id": 1, "value": "test"}]

    # patching the private state restore makes the run fail before extract,
    # i.e. in the sync step
    with patch.object(pipeline, "_restore_state_from_destination") as mock_restore:
        mock_restore.side_effect = ConnectionError("Cannot connect to destination for sync")

        with pytest.raises(Exception) as excinfo:
            pipeline.run(dummy_data())

        assert "failed at `step=sync`" in str(excinfo.value)

    return pipeline


# NOTE: this script can be run to create the test pipelines globally for manual
# testing of the dashboard app and CLI
if __name__ == "__main__":
    create_success_pipeline_duckdb()
    create_success_pipeline_filesystem()
    create_extract_exception_pipeline()
    create_normalize_exception_pipeline()
    create_never_ran_pipeline()
    create_load_exception_pipeline()
    create_no_destination_pipeline()
    create_sync_exception_pipeline()