import asyncio
from multiprocessing.dummy import DummyProcess
import pathlib
import pickle
from concurrent.futures import ThreadPoolExecutor
import itertools
import logging
import os, sys
import random
import shutil
import threading
from time import sleep
from typing import Any, List, Tuple, cast
from tenacity import retry_if_exception, Retrying, stop_after_attempt
from unittest.mock import patch
import pytest

import dlt
from dlt.common import json, pendulum, Decimal
from dlt.common.configuration import resolve
from dlt.common.configuration.specs.pluggable_run_context import PluggableRunContext
from dlt.common.known_env import DLT_LOCAL_DIR
from dlt.common.storages import FileStorage
from dlt.common.storages.load_storage import ParsedLoadJobFileName
from dlt.common.configuration.container import Container
from dlt.common.configuration.exceptions import ConfigFieldMissingException, InvalidNativeValue
from dlt.common.data_writers.exceptions import FileImportNotFound, SpecLookupFailed
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.destination.client import WithStateSync
from dlt.common.destination.exceptions import (
    DestinationHasFailedJobs,
    DestinationIncompatibleLoaderFileFormatException,
    DestinationLoadingViaStagingNotSupported,
    DestinationNoStagingMode,
    DestinationTerminalException,
    UnknownDestinationModule,
)
from dlt.common.exceptions import PipelineStateNotAvailable, SignalReceivedException
from dlt.common.pipeline import ExtractInfo, LoadInfo, PipelineContext, SupportsPipeline
from dlt.common.runtime import signals
from dlt.common.runtime.collector import DictCollector, LogCollector
from dlt.common.schema.exceptions import TableIdentifiersFrozen
from dlt.common.schema.typing import TColumnSchema
from dlt.common.schema.utils import get_first_column_name_with_prop, new_column, new_table
from dlt.common.typing import DictStrAny, TDataItems
from dlt.common.utils import uniq_id
from dlt.common.schema import Schema

from dlt.destinations import filesystem, redshift, dummy, duckdb
from dlt.destinations.impl.filesystem.filesystem import INIT_FILE_NAME
from dlt.extract.exceptions import (
    InvalidResourceDataTypeBasic,
    ResourceExtractionError,
    SourceExhausted,
)
from dlt.extract.extract import ExtractStorage
from dlt.extract import DltResource, DltSource
from dlt.extract.extractors import MaterializedEmptyList
from dlt.load.exceptions import LoadClientJobFailed
from dlt.normalize.exceptions import NormalizeJobFailed
from dlt.pipeline.configuration import PipelineConfiguration
from dlt.pipeline.exceptions import (
    InvalidPipelineName,
    PipelineNeverRan,
    PipelineNotActive,
    PipelineStepFailed,
)
from dlt.pipeline.helpers import retry_load

from dlt.pipeline.pipeline import Pipeline
from dlt.pipeline.trace import PipelineTrace, PipelineStepTrace
from dlt.pipeline.typing import TPipelineStep

from tests.common.utils import TEST_SENTRY_DSN
from tests.utils import TEST_STORAGE_ROOT, skipifwindows
from tests.extract.utils import expect_extracted_file
from tests.pipeline.utils import (
    assert_table_counts,
    assert_load_info,
    airtable_emojis,
    assert_only_table_columns,
    load_table_counts,
    load_tables_to_dicts,
    many_delayed,
)

from dlt.destinations.dataset import get_destination_client_initial_config

DUMMY_COMPLETE = dummy(completed_prob=1)  # factory set up to complete jobs


def test_default_pipeline() -> None:
    p = dlt.pipeline()
    # this is the name of the executing test harness or a blank pipeline on Windows
    possible_names = ["dlt_pytest", "dlt_pipeline"]
    assert p.pipeline_name in possible_names
    assert p.pipelines_dir == os.path.abspath(os.path.join(TEST_STORAGE_ROOT, ".dlt", "pipelines"))
    # the default dataset name is not created until a destination that requires it is set
    assert p.dataset_name is None
    assert p.destination is None
    assert p.default_schema_name is None
    # init_cwd is local_dir
    assert p.get_local_state_val("initial_cwd") == os.path.abspath(
        dlt.current.run_context().local_dir
    )

    # this is the same pipeline
    p2 = dlt.pipeline()
    assert p is p2

    # this will create the default schema
    p.extract(["a", "b", "c"], table_name="data")
    # `_pipeline` is removed from the default schema name
    assert p.default_schema_name in ["dlt_pytest", "dlt"]


def test_pipeline_runtime_configuration() -> None:
    c = resolve.resolve_configuration(PipelineConfiguration())
    assert c.pipeline_name.startswith("dlt_")
    assert c.runtime.slack_incoming_hook is None

    # check pipeline runtime config
    os.environ["RUNTIME__SLACK_INCOMING_HOOK"] = "https://hooks.slack.com/services/..."
    c = resolve.resolve_configuration(PipelineConfiguration())
    assert c.runtime.slack_incoming_hook == os.environ["RUNTIME__SLACK_INCOMING_HOOK"]

    os.environ["TEST_P__RUNTIME__SLACK_INCOMING_HOOK"] = "#test-p-slack"
    p = dlt.pipeline("test_p")
    assert p.config.runtime.slack_incoming_hook == "#test-p-slack"
    assert p.config.runtime is p.runtime_config


def test_default_pipeline_dataset_layout(environment) -> None:
    # Set dataset_name_layout to "bobby_%s"
    dataset_name_layout = "bobby_%s"
    environment["DATASET_NAME_LAYOUT"] = dataset_name_layout

    # use a destination that needs a dataset
    p = dlt.pipeline(destination="filesystem")
    # this is the name of the executing test harness or a blank pipeline on Windows
    possible_names = ["dlt_pytest", "dlt_pipeline"]
    possible_dataset_names = [
        dataset_name_layout % "dlt_pytest_dataset",
        dataset_name_layout % "dlt_pipeline_dataset",
    ]
    assert p.pipeline_name in possible_names
    assert p.pipelines_dir == os.path.abspath(os.path.join(TEST_STORAGE_ROOT, ".dlt", "pipelines"))
    # the dataset used to load data derives from the pipeline name
    assert p.dataset_name in possible_dataset_names
    assert p.default_schema_name is None

    # this is the same pipeline
    p2 = dlt.pipeline()
    assert p is p2

    # this will create the default schema
    p.extract(["a", "b", "c"], table_name="data")
    # `_pipeline` is removed from the default schema name
    assert p.default_schema_name in ["dlt_pytest", "dlt"]


def test_default_pipeline_dataset() -> None:
    # no dataset and no destination
    p = dlt.pipeline(pipeline_name="test_default_pipeline")
    assert p.dataset_name is None
    p._wipe_working_folder()

    # dummy does not need a dataset (it is schemaless)
    p = dlt.pipeline(pipeline_name="test_default_pipeline", destination="dummy")
    assert p.dataset_name is None  # so it is None
    p._wipe_working_folder()

    # clickhouse has an optional dataset
    p = dlt.pipeline(pipeline_name="test_default_pipeline", destination="clickhouse")
    assert p.dataset_name is None
    p._wipe_working_folder()

    # filesystem needs one
    p = dlt.pipeline(pipeline_name="test_default_pipeline", destination="filesystem")
    assert p.dataset_name == "test_default_pipeline_dataset"
    p._wipe_working_folder()


def test_default_pipeline_dataset_late_destination() -> None:
    # no dataset and no destination
    p = dlt.pipeline(pipeline_name="test_default_pipeline")
    assert p.dataset_name is None

    # the default dataset name will be created
    p.sync_destination(destination=dlt.destinations.filesystem(TEST_STORAGE_ROOT))
    assert p.dataset_name == "test_default_pipeline_dataset"
    p._wipe_working_folder()

    p = dlt.pipeline(pipeline_name="test_default_pipeline")
    # dummy won't set a dataset
    p.sync_destination(destination="dummy")
    print(p.dataset_name)
    assert p.dataset_name is None


def test_default_pipeline_dataset_name(environment) -> None:
    environment["DATASET_NAME"] = "dataset"
    environment["DATASET_NAME_LAYOUT"] = "prefix_%s"

    p = dlt.pipeline(destination="filesystem")
    assert p.dataset_name == "prefix_dataset"


def test_default_pipeline_dataset_layout_exception(environment) -> None:
    # Set dataset_name_layout without the %s placeholder
    environment["DATASET_NAME_LAYOUT"] = "bobby_"

    with pytest.raises(ValueError):
        dlt.pipeline(destination="filesystem")


def test_default_pipeline_dataset_layout_placeholder(environment) -> None:
    # Set dataset_name_layout to only the placeholder
    environment["DATASET_NAME_LAYOUT"] = "%s"

    possible_dataset_names = ["dlt_pytest_dataset", "dlt_pipeline_dataset"]
    p = dlt.pipeline(destination="filesystem")
    assert p.dataset_name in possible_dataset_names


def test_default_pipeline_dataset_layout_empty(environment) -> None:
    # Set dataset_name_layout to an empty string
    environment["DATASET_NAME_LAYOUT"] = ""

    possible_dataset_names = ["dlt_pytest_dataset", "dlt_pipeline_dataset"]
    p = dlt.pipeline(destination="filesystem")
    assert p.dataset_name in possible_dataset_names


def test_pipeline_initial_cwd_follows_local_dir(environment) -> None:
    local_dir = os.path.join(TEST_STORAGE_ROOT, uniq_id())
    os.makedirs(local_dir)
    # mock the tmp dir
    os.environ[DLT_LOCAL_DIR] = local_dir
    p = dlt.pipeline(destination="filesystem")
    assert p.get_local_state_val("initial_cwd") == os.path.abspath(local_dir)


def test_pipeline_configuration_top_level_section(environment) -> None:
    environment["PIPELINES__DATASET_NAME"] = "pipeline_dataset"
    environment["PIPELINES__DESTINATION_TYPE"] = "dummy"
    environment["PIPELINES__IMPORT_SCHEMA_PATH"] = os.path.join(TEST_STORAGE_ROOT, "import")
    environment["PIPELINES__EXPORT_SCHEMA_PATH"] = os.path.join(TEST_STORAGE_ROOT, "import")

    pipeline = dlt.pipeline()
    assert pipeline.dataset_name == "pipeline_dataset"
    assert pipeline.destination.destination_type == "dlt.destinations.dummy"
    assert (
        pipeline._schema_storage_config.export_schema_path
        == environment["PIPELINES__EXPORT_SCHEMA_PATH"]
    )
    assert (
        pipeline._schema_storage_config.import_schema_path
        == environment["PIPELINES__IMPORT_SCHEMA_PATH"]
    )


def test_pipeline_configuration_named_section(environment) -> None:
    environment["PIPELINES__NAMED__DATASET_NAME"] = "pipeline_dataset"
    environment["PIPELINES__NAMED__DESTINATION_TYPE"] = "dummy"
    environment["PIPELINES__NAMED__IMPORT_SCHEMA_PATH"] = os.path.join(TEST_STORAGE_ROOT, "import")
    environment["PIPELINES__NAMED__EXPORT_SCHEMA_PATH"] = os.path.join(TEST_STORAGE_ROOT, "import")

    pipeline = dlt.pipeline(pipeline_name="named")
    assert pipeline.dataset_name == "pipeline_dataset"
    assert pipeline.destination.destination_type == "dlt.destinations.dummy"
    assert (
        pipeline._schema_storage_config.export_schema_path
        == environment["PIPELINES__NAMED__EXPORT_SCHEMA_PATH"]
    )
    assert (
        pipeline._schema_storage_config.import_schema_path
        == environment["PIPELINES__NAMED__IMPORT_SCHEMA_PATH"]
    )


def test_run_dev_mode_default_dataset() -> None:
    p = dlt.pipeline(dev_mode=True, destination="filesystem")
    assert p.dataset_name.endswith(p._pipeline_instance_id)
    # restore this pipeline
    r_p = dlt.attach(dev_mode=False)
    assert r_p.dataset_name.endswith(p._pipeline_instance_id)

    # dummy does not need a dataset
    p = dlt.pipeline(dev_mode=True, destination="dummy")
    assert p.dataset_name is None
    # simulate setting a new dataset
    p._set_destinations("filesystem")
    assert p.dataset_name is None
    p._set_dataset_name(None)
    # full refresh is still observed
    assert p.dataset_name and p.dataset_name.endswith(p._pipeline_instance_id)


def test_run_dev_mode_default_dataset_layout(environment) -> None:
    # Set dataset_name_layout to "bobby_%s"
    dataset_name_layout = "bobby_%s"
    environment["DATASET_NAME_LAYOUT"] = dataset_name_layout

    p = dlt.pipeline(dev_mode=True, destination="filesystem")
    assert p.dataset_name in [
        dataset_name_layout % f"dlt_pytest_dataset{p._pipeline_instance_id}",
        dataset_name_layout % f"dlt_pipeline_dataset{p._pipeline_instance_id}",
    ]
    # restore this pipeline
    r_p = dlt.attach(dev_mode=False)
    assert r_p.dataset_name in [
        dataset_name_layout % f"dlt_pytest_dataset{p._pipeline_instance_id}",
        dataset_name_layout % f"dlt_pipeline_dataset{p._pipeline_instance_id}",
    ]

    # dummy does not need a dataset
    p = dlt.pipeline(dev_mode=True, destination="dummy")
    assert p.dataset_name is None

    # simulate setting a new dataset
    p._set_destinations("filesystem")
    assert p.dataset_name is None
    p._set_dataset_name(None)

    # full refresh is still observed
    assert p.dataset_name in [
        dataset_name_layout % f"dlt_pytest_dataset{p._pipeline_instance_id}",
        dataset_name_layout % f"dlt_pipeline_dataset{p._pipeline_instance_id}",
    ]


def test_run_dev_mode_underscored_dataset() -> None:
    p = dlt.pipeline(dev_mode=True, dataset_name="_main_")
    assert p.dataset_name.endswith(p._pipeline_instance_id)
    # restore this pipeline
    r_p = dlt.attach(dev_mode=False)
    assert r_p.dataset_name.endswith(p._pipeline_instance_id)


def test_dataset_pipeline_never_ran() -> None:
    p = dlt.pipeline(destination="duckdb", dev_mode=True, dataset_name="_main_")
    # we get a dataset with an empty schema named after the dataset
    dataset = p.dataset()
    assert dataset.schema.name == p.dataset_name
    assert set(dataset.schema.tables.keys()) == {"_dlt_version", "_dlt_loads"}


def test_dataset_unknown_schema() -> None:
    p = dlt.pipeline(destination="duckdb", dev_mode=True, dataset_name="mmmmm")
    p.run([1, 2, 3], table_name="digits")

    dataset = p.dataset(schema="unknown")
    assert dataset.schema.name == "unknown"
    assert set(dataset.schema.tables.keys()) == {"_dlt_version", "_dlt_loads"}


def test_pipeline_with_non_alpha_name() -> None:
    name = "another pipeline %__8329イロハニホヘト"
    # contains %
    with pytest.raises(InvalidPipelineName):
        p = dlt.pipeline(pipeline_name=name)

    name = "another pipeline __8329イロハニホヘト"
    p = dlt.pipeline(pipeline_name=name, destination="filesystem")
    assert p.pipeline_name == name
    # the default dataset is set (we used the filesystem destination, which requires a dataset)
    assert p.dataset_name == f"{name}_dataset"

    # this will create the default schema
    p.extract(["a", "b", "c"], table_name="data")
    assert p.default_schema_name == "another_pipeline_8329x"


def test_file_format_resolution() -> None:
    # raise on destinations that do not support staging
    with pytest.raises(DestinationLoadingViaStagingNotSupported):
        dlt.pipeline(
            pipeline_name="managed_state_pipeline", destination="postgres", staging="filesystem"
        )

    # raise on a staging destination that does not support the staging interface
    with pytest.raises(DestinationNoStagingMode):
        dlt.pipeline(pipeline_name="managed_state_pipeline", staging="postgres")

    # check invalid input
    with pytest.raises(DestinationIncompatibleLoaderFileFormatException):
        pipeline = dlt.pipeline(pipeline_name="managed_state_pipeline", destination="postgres")
        pipeline.config.restore_from_destination = False
        pipeline.run([1, 2, 3], table_name="numbers", loader_file_format="jsonl")

    # check invalid input
    with pytest.raises(DestinationIncompatibleLoaderFileFormatException):
        pipeline = dlt.pipeline(
            pipeline_name="managed_state_pipeline", destination="athena", staging="filesystem"
        )
        pipeline.config.restore_from_destination = False
        pipeline.run([1, 2, 3], table_name="numbers", loader_file_format="insert_values")


def test_invalid_dataset_name() -> None:
    # this is an invalid dataset name, but it will be normalized within the destination
    p = dlt.pipeline(dataset_name="!")
    assert p.dataset_name == "!"


def test_invalid_dataset_layout(environment) -> None:
    # Set dataset_name_layout to "bobby_%s"
    dataset_name_layout = "bobby_%s"
    environment["DATASET_NAME_LAYOUT"] = dataset_name_layout

    # this is an invalid dataset name, but it will be normalized within the destination
    p = dlt.pipeline(dataset_name="!")
    assert p.dataset_name == dataset_name_layout % "!"


def test_pipeline_context_deferred_activation() -> None:
    ctx = Container()[PipelineContext]
    assert ctx.is_active() is False
    # this creates the default pipeline
    p = ctx.pipeline()
    # and we can get it here
    assert p is dlt.pipeline()


def test_pipeline_context() -> None:
    ctx = Container()[PipelineContext]
    ctx.enable_activation_history = True
    try:
        assert ctx.is_active() is False
        assert len(ctx.activation_history()) == 0
        # create a pipeline
        p = dlt.pipeline()
        assert ctx.is_active() is True
        assert ctx.pipeline() is p
        assert p.is_active is True
        # has no destination context
        assert DestinationCapabilitiesContext not in Container()
        assert ctx.activation_history()[0] is p
        ctx.clear_activation_history()

        # create another pipeline
        p2 = dlt.pipeline(pipeline_name="another pipeline", destination="duckdb")
        assert ctx.pipeline() is p2
        assert p.is_active is False
        assert p2.is_active is True
        assert ctx.activation_history() == [p2]

        p3 = dlt.pipeline(pipeline_name="more pipelines", destination="dummy")
        assert ctx.pipeline() is p3
        assert p3.is_active is True
        assert p2.is_active is False
        # no default naming convention
        assert Container()[DestinationCapabilitiesContext].naming_convention is None
        assert ctx.activation_history() == [p2, p3]

        # restore the previous pipeline
        p2_a = dlt.attach("another pipeline")
        assert ctx.pipeline() is p2_a
        assert p3.is_active is False
        assert p2.is_active is False
        assert p2_a.is_active is True
        assert ctx.activation_history() == [p2, p3, p2_a]

        # activate p2
        p2.activate()
        # it was already on the list
        assert ctx.activation_history() == [p2, p3, p2_a]
    finally:
        ctx.clear_activation_history()
        ctx.enable_activation_history = False


def test_import_unknown_destination() -> None:
    with pytest.raises(UnknownDestinationModule):
        dlt.pipeline(destination="dlt.destinations.unknown")


def test_configured_destination_type(environment) -> None:
    environment["DESTINATION_TYPE"] = "dlt.destinations.postgres"
    environment["PIPELINE_NAME"] = "postgres_pipe"

    p = dlt.pipeline()
    assert p.destination is not None
    assert p.destination.destination_type == "dlt.destinations.postgres"
    assert p.destination.destination_name == "postgres"
    assert p.pipeline_name == "postgres_pipe"


def test_configured_destination_unknown_type(environment) -> None:
    environment["DESTINATION_TYPE"] = "dlt.destinations.unknown"

    with pytest.raises(UnknownDestinationModule):
        dlt.pipeline()


def test_configured_destination_unknown_name(environment) -> None:
    environment["DESTINATION_NAME"] = "filesystem-prod"
    environment["DESTINATION_TYPE"] = "filesystem"

    p = dlt.pipeline()
    assert p.destination is not None
    assert p.destination.destination_type == "dlt.destinations.filesystem"
    assert p.destination.destination_name == "filesystem-prod"

    # we do not have config for filesystem-prod, so getting a destination client must fail
    with pytest.raises(ConfigFieldMissingException):
        p.destination_client()


def test_deterministic_salt(environment) -> None:
    environment["PIPELINE_NAME"] = "postgres_pipe"
    p = dlt.pipeline()
    p2 = dlt.attach()
    assert p.pipeline_name == p2.pipeline_name == "postgres_pipe"
    assert p.pipeline_salt == p2.pipeline_salt

    p3 = dlt.pipeline(pipeline_name="postgres_redshift")
    assert p.pipeline_salt != p3.pipeline_salt


def test_destination_explicit_credentials(environment: Any) -> None:
    from dlt.destinations import motherduck

    # test motherduck
    p = dlt.pipeline(
        pipeline_name="postgres_pipeline",
        destination=motherduck(credentials="md://user:password@/dlt_data"),
    )
    config = p.destination_client().config
    assert config.credentials.is_resolved()
    assert config.credentials.to_native_representation() == "md://user:password@/dlt_data"


def test_destination_staging_config(environment: Any) -> None:
    fs_dest = filesystem("file:///testing-bucket")
    p = dlt.pipeline(
        pipeline_name="staging_pipeline",
        destination=dummy(),
        staging=fs_dest,
    )
    schema = Schema("foo")
    p._inject_schema(schema)

    _, staging_client = p._get_destination_clients()

    assert staging_client.config.as_staging_destination is True  # type: ignore


def test_destination_factory_defaults_resolve_from_config(environment: Any) -> None:
    """Params passed explicitly to the destination factory supersede config values.
    Env config values supersede default values.
    """
    environment["FAIL_PROB"] = "0.3"
    environment["RETRY_PROB"] = "0.8"
    p = dlt.pipeline(pipeline_name="dummy_pipeline", destination=dummy(retry_prob=0.5))

    client = p.destination_client()
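    # fail_prob comes from the env config while retry_prob keeps the explicit factory value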
    assert client.config.fail_prob == 0.3  # type: ignore[attr-defined]
    assert client.config.retry_prob == 0.5  # type: ignore[attr-defined]


def test_destination_credentials_in_factory(environment: Any) -> None:
    os.environ["DESTINATION__REDSHIFT__CREDENTIALS"] = "redshift://abc:123@localhost:5432/some_db"

    redshift_dest = redshift("redshift://abc:123@localhost:5432/other_db")

    p = dlt.pipeline(pipeline_name="dummy_pipeline", destination=redshift_dest)

    initial_config = get_destination_client_initial_config(
        p.destination, "some_schema_name", p.dataset_name
    )
    dest_config = redshift_dest.configuration(initial_config)  # type: ignore[arg-type]
    # the explicit factory arg supersedes config
    assert dest_config.credentials.database == "other_db"

    redshift_dest = redshift()
    p = dlt.pipeline(pipeline_name="dummy_pipeline", destination=redshift_dest)

    initial_config = get_destination_client_initial_config(
        p.destination, "some_schema_name", p.dataset_name
    )
    dest_config = redshift_dest.configuration(initial_config)  # type: ignore[arg-type]
    assert dest_config.credentials.database == "some_db"


def test_destination_explicit_invalid_credentials_filesystem(environment: Any) -> None:
    # the credentials string cannot be parsed
    p = dlt.pipeline(
        pipeline_name="postgres_pipeline",
        destination=filesystem(bucket_url="s3://test", destination_name="uniq_s3_bucket"),
    )
    with pytest.raises(PipelineStepFailed) as pip_ex:
        p.run([1, 2, 3], table_name="data", credentials="PR8BLEM")
    assert pip_ex.value.step == "sync"
    assert pip_ex.value.load_id is None
    assert pip_ex.value.is_package_partially_loaded is False
    assert pip_ex.value.has_pending_data is False
    assert isinstance(pip_ex.value.__cause__, InvalidNativeValue)


def test_extract_source_twice() -> None:
    def some_data():
        yield [1, 2, 3]
        yield [1, 2, 3]

    s = DltSource(dlt.Schema("source"), "module", [dlt.resource(some_data())])
    dlt.pipeline().extract(s)
    with pytest.raises(PipelineStepFailed) as py_ex:
        dlt.pipeline().extract(s)
    assert type(py_ex.value.exception) is SourceExhausted
    assert py_ex.value.exception.source_name == "source"


def test_disable_enable_state_sync(environment: Any) -> None:
    environment["RESTORE_FROM_DESTINATION"] = "False"
    p = dlt.pipeline(destination="redshift")

    def some_data():
        yield [1, 2, 3]

    s = DltSource(dlt.Schema("default"), "module", [dlt.resource(some_data())])
    dlt.pipeline().extract(s)
    storage = ExtractStorage(p._normalize_storage_config())
    assert len(storage.list_files_to_normalize_sorted()) == 1
    expect_extracted_file(storage, "default", "some_data", json.dumps([1, 2, 3]))
    with pytest.raises(FileNotFoundError):
        expect_extracted_file(storage, "default", s.schema.state_table_name, "")

    p.config.restore_from_destination = True
    # extract to a different schema; state must go to the default schema
    s = DltSource(dlt.Schema("default_2"), "module", [dlt.resource(some_data())])
    dlt.pipeline().extract(s)
    expect_extracted_file(storage, "default", s.schema.state_table_name, "***")


def test_extract_multiple_sources() -> None:
    s1 = DltSource(
        dlt.Schema("default"),
        "module",
        [dlt.resource([1, 2, 3], name="resource_1"), dlt.resource([3, 4, 5], name="resource_2")],
    )
    s2 = DltSource(
        dlt.Schema("default_2"),
        "module",
        [dlt.resource([6, 7, 8], name="resource_3"), dlt.resource([9, 10, 0], name="resource_4")],
    )

    p = dlt.pipeline(destination="dummy")
    p.config.restore_from_destination = False
    p.extract([s1, s2])
    storage = ExtractStorage(p._normalize_storage_config())
    expect_extracted_file(storage, "default", "resource_1", json.dumps([1, 2, 3]))
    expect_extracted_file(storage, "default", "resource_2", json.dumps([3, 4, 5]))
    expect_extracted_file(storage, "default_2", "resource_3", json.dumps([6, 7, 8]))
    expect_extracted_file(storage, "default_2", "resource_4", json.dumps([9, 10, 0]))
    assert len(storage.list_files_to_normalize_sorted()) == 4
    p.normalize()

    # make the last resource fail

    @dlt.resource
    def i_fail():
        raise NotImplementedError()

    s3 = DltSource(
        dlt.Schema("default_3"),
        "module",
        [dlt.resource([1, 2, 3], name="resource_1"), dlt.resource([3, 4, 5], name="resource_2")],
    )
    s4 = DltSource(
        dlt.Schema("default_4"), "module", [dlt.resource([6, 7, 8], name="resource_3"), i_fail]
    )

    with pytest.raises(PipelineStepFailed):
        # NOTE: if you swap s3 and s4, the test on list_schemas will fail: s3 will extract
        # normally and update live schemas, while s4 will break execution later
        p.extract([s4, s3])

    # nothing to normalize
    assert len(storage.list_files_to_normalize_sorted()) == 0
    # pipeline state is successfully rolled back after the last extract, so the
    # default_3 and default_4 schemas are not present
    assert set(p.schema_names) == {"default", "default_2"}
    assert set(p._schema_storage.list_schemas()) == {"default", "default_2"}


@pytest.mark.parametrize(
    "resource_defs",
    (
        # a single resource will set the section
        [("", "fr_n", "TEST_PIPELINE__CONFIG_SECTIONS", "TEST_PIPELINE__FR_N", "single_res")],
        [("fr_s", "fr_n", "FR_S__CONFIG_SECTIONS", "FR_S__FR_N", "single_res_section")],
        # multiple resources with the same section will set the section name
        # writer settings use the source/schema name, which is the pipeline name
        [
            ("fr_s", "fr_n", "FR_S__CONFIG_SECTIONS", "FR_S__FR_N", "mul_res_same_section_N"),
            ("fr_s", "fr_x", "FR_S__CONFIG_SECTIONS", "FR_S__FR_X", "mul_res_same_section_X"),
        ],
        [
            (
                None,
                "fr_n",
                "TEST_PIPELINE__CONFIG_SECTIONS",
                "TEST_PIPELINE__FR_N",
                "mul_res_same_auto_N",
            ),
            (
                None,
                "fr_x",
                "TEST_PIPELINE__CONFIG_SECTIONS",
                "TEST_PIPELINE__FR_X",
                "mul_res_same_auto_X",
            ),
        ],
        # multiple resources from different sections; section and name come from the pipeline
        [
            (
                "fr_s_1",
                "fr_n",
                "CONFIG_SECTIONS__CONFIG_SECTIONS",
                "CONFIG_SECTIONS__FR_N",
                "mul_res_diff_section_N",
            ),
            (
                "fr_s_2",
                "fr_x",
                "CONFIG_SECTIONS__CONFIG_SECTIONS",
                "CONFIG_SECTIONS__FR_X",
                "mul_res_diff_section_X",
            ),
        ],
    ),
    ids=(
        "single_res",
        "single_res_section",
        "mul_res_same_section",
        "mul_res_same_auto",
        "mul_res_diff_section",
    ),
)
def test_extract_config_sections(resource_defs: List[Tuple[str, str, str, str, str]]) -> None:
    """Tests if we can configure resources and extract pipes via a fine-grained SOURCES__... layout"""
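    # e.g. for the ("fr_s", "fr_n", ...) case the writer settings resolve under
    # SOURCES__FR_S__CONFIG_SECTIONS__DATA_WRITER__... and the `check` argument under
    # SOURCES__FR_S__FR_N__CHECK (see the parametrize cases above)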
    # do not extract state
    os.environ["RESTORE_FROM_DESTINATION"] = "False"

    pipeline_name = "config_sections"
    pipeline = dlt.pipeline(pipeline_name=pipeline_name)

    rows = 10
    batches = 12

    resources = []

    for section, name, writer_section, config_section, expected_check in resource_defs:

        @dlt.resource(name=name, section=section, standalone=True)
        def data_frames(expected_check_: str, check: str = dlt.config.value):
            # make sure we can pass configuration
            assert check == expected_check_
            for i in range(batches):
                yield [{"i": i, "val": "A" * i}] * rows

        resources.append(data_frames(expected_check))

        # get the buffer written and the file rotated with each yielded frame
        os.environ[f"SOURCES__{writer_section}__DATA_WRITER__BUFFER_MAX_ITEMS"] = str(rows)
        os.environ[f"SOURCES__{writer_section}__DATA_WRITER__FILE_MAX_ITEMS"] = str(rows)

        # pass the check value
        os.environ[f"SOURCES__{config_section}__CHECK"] = expected_check

    pipeline.extract(resources)
    # the default schema name follows the single resource name, or the pipeline name if there are many
    # expected_schema_name = name if len(resources) == 1 else pipeline.pipeline_name
    assert pipeline.default_schema_name == pipeline.pipeline_name

    assert len(pipeline.list_extracted_resources()) == batches * len(resources)


def test_pipeline_resources_injected_sections() -> None:
    from tests.extract.cases.section_source.external_resources import (
        with_external,
        with_bound_external,
        init_resource_f_2,
        resource_f_2,
    )

    # standalone resources must accept the injected sections for lookups
    os.environ["SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL"] = (
        "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL"
    )
    os.environ["SOURCES__EXTERNAL_RESOURCES__VAL"] = "SOURCES__EXTERNAL_RESOURCES__VAL"
    os.environ["SOURCES__SECTION_SOURCE__VAL"] = "SOURCES__SECTION_SOURCE__VAL"
    os.environ["SOURCES__NAME_OVERRIDDEN__VAL"] = "SOURCES__NAME_OVERRIDDEN__VAL"

    pipeline_name = "config_sections"
    pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb")

    # the external resources use their standalone sections
    pipeline.run(init_resource_f_2())
    assert pipeline.dataset().init_resource_f_2.fetchall()[0][0] == "SOURCES__SECTION_SOURCE__VAL"
    # the schema name is the pipeline name
    assert "config_sections" in pipeline.schemas
    pipeline.run(resource_f_2())
    # it will join the schema already existing in the pipeline
    assert "resource_f_2" not in pipeline.schemas
    assert "resource_f_2" in pipeline.default_schema.tables
    assert pipeline.dataset().resource_f_2.fetchall()[0][0] == "SOURCES__NAME_OVERRIDDEN__VAL"

    # the source returns its own argument, the same value via an inner resource, and two external
    # resources that are not bound; the iterator in the source will force its sections, so the
    # external resource sections are not used
    pipeline = dlt.pipeline(
        pipeline_name=pipeline_name, destination="duckdb", dataset_name="with_external"
    )
    s_ = with_external()
    pipeline.run(s_)
    ds_ = pipeline.dataset(schema="with_external")
    assert ds_.source_val[["value"]].fetchall()[0] == ("SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL",)
    assert ds_.inner_resource[["value"]].fetchall()[0] == (
        "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL",
    )
    assert ds_.init_resource_f_2[["value"]].fetchall()[0] == ("SOURCES__EXTERNAL_RESOURCES__VAL",)
    assert ds_.resource_f_2[["value"]].fetchall()[0] == ("SOURCES__EXTERNAL_RESOURCES__VAL",)
    # assert pipeline.dataset(schema="with_external").init_resource_f_2.fetchall()[0][0] == "SOURCES__SECTION_SOURCE__VAL"
    assert "with_external" in pipeline.schemas

    pipeline = dlt.pipeline(
        pipeline_name=pipeline_name, destination="duckdb", dataset_name="with_bound_external"
    )
    s_ = with_bound_external()
    pipeline.run(s_)
    ds_ = pipeline.dataset(schema="with_bound_external")
    assert ds_.source_val[["value"]].fetchall()[0] == ("SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL",)
    assert ds_.inner_resource[["value"]].fetchall()[0] == (
        "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL",
    )
    assert ds_.init_resource_f_2[["value"]].fetchall()[0] == ("SOURCES__EXTERNAL_RESOURCES__VAL",)
    assert ds_.resource_f_2[["value"]].fetchall()[0] == ("SOURCES__EXTERNAL_RESOURCES__VAL",)
    # assert pipeline.dataset(schema="with_external").init_resource_f_2.fetchall()[0][0] == "SOURCES__SECTION_SOURCE__VAL"
    assert "with_bound_external" in pipeline.schemas


def test_mark_hints() -> None:
    # this resource emits a table schema with the first item
    @dlt.resource
    def with_mark():
        yield dlt.mark.with_hints(
            {"id": 1},
            dlt.mark.make_hints(
                table_name="spec_table", write_disposition="merge", primary_key="id"
            ),
        )
        yield {"id": 2}

    p = dlt.pipeline(destination="dummy", pipeline_name="mark_pipeline")
    p.extract(with_mark())
    storage = ExtractStorage(p._normalize_storage_config())
    expect_extracted_file(storage, "mark", "spec_table", json.dumps([{"id": 1}, {"id": 2}]))
    p.normalize()
    # no "with_mark" table in the schema: we update resource hints before any table schema is computed
    assert "with_mark" not in p.default_schema.tables
    assert "spec_table" in p.default_schema.tables
    # the resource name is kept
    assert p.default_schema.tables["spec_table"]["resource"] == "with_mark"


def test_mark_hints_with_variant() -> None:
    @dlt.resource(primary_key="pk")
    def with_table_hints():
        # dispatch to table a
        yield dlt.mark.with_hints(
            {"id": 1, "pk": "A"},
            dlt.mark.make_hints(
                table_name="table_a", columns=[{"name": "id", "data_type": "bigint"}]
            ),
            create_table_variant=True,
        )

        # dispatch to table b
        yield dlt.mark.with_hints(
            {"id": 2, "pk": "B"},
            dlt.mark.make_hints(table_name="table_b", write_disposition="replace"),
            create_table_variant=True,
        )

        # item to the resource table
        yield {"id": 3, "pk": "C"}
        # table a via with_table_name
        yield dlt.mark.with_table_name({"id": 4, "pk": "D"}, "table_a")
        # table b via with_table_name
        yield dlt.mark.with_table_name({"id": 5, "pk": "E"}, "table_b")

    pipeline_name = "pipe_" + uniq_id()
    pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb")
    info = pipeline.run(with_table_hints)
    assert_load_info(info)
    assert pipeline.last_trace.last_normalize_info.row_counts == {
        "_dlt_pipeline_state": 1,
        "table_a": 2,
        "table_b": 2,
        "with_table_hints": 1,
    }
    # check table counts
    assert_table_counts(pipeline, {"table_a": 2, "table_b": 2, "with_table_hints": 1})


def test_mark_hints_variant_dynamic_name() -> None:
    @dlt.resource(table_name=lambda item: "table_" + item["tag"])
    def with_table_hints():
        # dispatch to table a
        yield dlt.mark.with_hints(
            {"id": 1, "pk": "A", "tag": "a"},
            dlt.mark.make_hints(
                table_name="table_a",
                primary_key="pk",
                columns=[{"name": "id", "data_type": "bigint"}],
            ),
            create_table_variant=True,
        )

        # dispatch to table b
        yield dlt.mark.with_hints(
            {"id": 2, "pk": "B", "tag": "b"},
            dlt.mark.make_hints(table_name="table_b", write_disposition="replace"),
            create_table_variant=True,
        )

        # dispatch by tag
        yield {"id": 3, "pk": "C", "tag": "c"}
        yield {"id": 4, "pk": "D", "tag": "a"}
        yield {"id": 5, "pk": "E", "tag": "b"}

    pipeline_name = "pipe_" + uniq_id()
    pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb")
    info = pipeline.run(with_table_hints)
    assert_load_info(info)
    assert pipeline.last_trace.last_normalize_info.row_counts == {
        "_dlt_pipeline_state": 1,
        "table_a": 2,
        "table_b": 2,
        "table_c": 1,
    }
    # check table counts
    assert_table_counts(pipeline, {"table_a": 2, "table_b": 2, "table_c": 1})


def test_restore_state_on_dummy() -> None:
    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination=DUMMY_COMPLETE)
    p.config.restore_from_destination = True
    info = p.run([1, 2, 3], table_name="dummy_table")
    print(info)
    assert p.first_run is False
    # no effect
    p.sync_destination()
    assert p.state["_state_version"] == 1

    # wipe out the storage
    p._wipe_working_folder()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination=DUMMY_COMPLETE)
    assert p.first_run is True
    p.sync_destination()
    assert p.first_run is True
    assert p.state["_state_version"] == 0


def test_first_run_flag() -> None:
    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination=DUMMY_COMPLETE)
    assert p.first_run is True
    # attach
    p = dlt.attach(pipeline_name=pipeline_name)
    assert p.first_run is True
    p.extract([1, 2, 3], table_name="dummy_table")
    assert p.first_run is True
    # attach again
    p = dlt.attach(pipeline_name=pipeline_name, destination=DUMMY_COMPLETE)
    assert p.first_run is True
    assert len(p.list_extracted_load_packages()) > 0
    p.normalize()
    assert len(p.list_normalized_load_packages()) > 0
    assert p.first_run is True
    # load will change the flag
    p.load()
    assert p.first_run is False
    # attach again
    p = dlt.attach(pipeline_name=pipeline_name)
    assert p.first_run is False
    # wipe the pipeline
    p._create_pipeline()
    assert p.first_run is True
    p._save_state(p._get_state())
    p = dlt.attach(pipeline_name=pipeline_name)
    assert p.first_run is True


def test_has_pending_data_flag() -> None:
    p = dlt.pipeline(pipeline_name="pipe_" + uniq_id(), destination=DUMMY_COMPLETE)
    assert p.has_pending_data is False
    p.extract([1, 2, 3], table_name="dummy_table")
    assert p.has_pending_data is True
    p.normalize()
    assert p.has_pending_data is True
    p.load()
    assert p.has_pending_data is False


def test_sentry_tracing() -> None:
    import sentry_sdk

    os.environ["RUNTIME__SENTRY_DSN"] = TEST_SENTRY_DSN
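    # reload the run context so the runtime picks up the new Sentry DSN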
    Container()[PluggableRunContext].reload()

    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination=DUMMY_COMPLETE)

    # def inspect_transaction(ctx):
    #     print(ctx)
    #     return 1.0

    # sentry_sdk.Hub.current.client.options["traces_sampler"] = inspect_transaction

    # def inspect_events(event, hint):
    #     print(event)
    #     print(hint)
    #     return event

    # sentry_sdk.Hub.current.client.options["before_send"] = inspect_events

    @dlt.resource
    def r_check_sentry():
        assert sentry_sdk.Scope.get_current_scope().span.op == "extract"
        assert sentry_sdk.Scope.get_current_scope().transaction.name == "run"
        yield [1, 2, 3]

    p.run(r_check_sentry)
    assert sentry_sdk.Hub.current.scope.span is None
    sentry_sdk.flush()

    @dlt.resource
    def r_fail():
        raise NotImplementedError()

    # run the pipeline with an error in extract
    with pytest.raises(PipelineStepFailed) as py_ex:
        p.run(r_fail)
    assert py_ex.value.step == "extract"
    # sentry cleaned up
    assert sentry_sdk.Hub.current.scope.span is None

    # run the pipeline with an error in load
    os.environ["FAIL_SCHEMA_UPDATE"] = "true"
    with pytest.raises(PipelineStepFailed) as py_ex:
        p.run(r_check_sentry)
    assert py_ex.value.step == "load"
    assert sentry_sdk.Hub.current.scope.span is None


def test_pipeline_state_on_extract_exception() -> None:
    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy")

    @dlt.resource
    def data_piece_1():
        yield [1, 2, 3]
        yield [3, 4, 5]

    @dlt.resource
    def data_piece_2():
        yield [6, 7, 8]
        raise NotImplementedError()

    with pytest.raises(PipelineStepFailed) as pip_ex:
        p.run([data_piece_1, data_piece_2], write_disposition="replace")
    # make sure that the exception has the right step info
    assert pip_ex.value.load_id in pip_ex.value.step_info.loads_ids
    assert pip_ex.value.is_package_partially_loaded is False
    assert pip_ex.value.has_pending_data is False
    # print(pip_ex.value.load_id)
    # print(pip_ex.value.step_info.asdict())
    # print(p._last_trace.last_pipeline_step_trace("extract").exception_traces)

    # the first run didn't really happen
    assert p.first_run is True
    assert p.has_data is False
    assert p.default_schema_name is None
    # live schemas created during extract are popped from memory
    assert len(p._schema_storage.list_schemas()) == 0

    # restore the pipeline
    p = dlt.attach(pipeline_name)
    assert p.first_run is True
    assert p.has_data is False
    # no schema was saved to storage; the one above existed only in memory
    assert p._schema_storage.list_schemas() == []
    assert p.default_schema_name is None

    # same, but with multiple sources generating many schemas

    @dlt.source
    def data_schema_1():
        return data_piece_1

    @dlt.source
    def data_schema_2():
        return data_piece_1

    @dlt.source
    def data_schema_3():
        return data_piece_2

    # new pipeline
    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination=DUMMY_COMPLETE)

    with pytest.raises(PipelineStepFailed):
        p.run([data_schema_1(), data_schema_2(), data_schema_3()], write_disposition="replace")

    # the first run didn't really happen
    assert p.first_run is True
    assert p.has_data is False
    # live schemas created during extract are popped from memory
    assert len(p._schema_storage.list_schemas()) == 0
    assert p.default_schema_name is None

    p.run([data_schema_1(), data_schema_2()], write_disposition="replace")
    assert set(p.schema_names) == set(p._schema_storage.list_schemas())


def test_run_with_table_name_exceeding_path_length() -> None:
    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name)

    # we must fix that
    with pytest.raises(PipelineStepFailed) as sf_ex:
        p.extract([1, 2, 3], table_name="TABLE_" + "a" * 267)
    assert isinstance(sf_ex.value.__context__, OSError)


def test_raise_on_failed_job() -> None:
    os.environ["FAIL_PROB"] = "1.0"
    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy")
    with pytest.raises(PipelineStepFailed) as py_ex:
        p.run([1, 2, 3], table_name="numbers")
    assert py_ex.value.step == "load"
    assert py_ex.value.load_id is not None
    assert py_ex.value.load_id in py_ex.value.step_info.loads_ids
    # loaded
    assert py_ex.value.has_pending_data is False
    # all packages failed
    assert py_ex.value.is_package_partially_loaded is False
    # get the package info
    package_info = p.get_load_package_info(py_ex.value.step_info.loads_ids[0])
    assert package_info.state == "aborted"
    assert isinstance(py_ex.value.__context__, LoadClientJobFailed)
    assert isinstance(py_ex.value.__context__, DestinationTerminalException)
    # the next call to run does nothing
    load_info = p.run()
    assert load_info is None


def test_load_info_raise_on_failed_jobs() -> None:
    # By default, a failed job raises a terminal error and aborts the load. This pipeline does not fail
    os.environ["COMPLETED_PROB"] = "1.0"
    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy")
    load_info = p.run([1, 2, 3], table_name="numbers")
    assert load_info.has_failed_jobs is False

    # Test explicit raising on a failed job after the load is completed. Let the pipeline fail
    os.environ["COMPLETED_PROB"] = "0.0"
    os.environ["FAIL_PROB"] = "1.0"
    os.environ["RAISE_ON_FAILED_JOBS"] = "false"
    load_info = p.run([1, 2, 3], table_name="numbers")
    assert load_info.has_failed_jobs is True
    with pytest.raises(DestinationHasFailedJobs) as py_ex:
        load_info.raise_on_failed_jobs()
    assert py_ex.value.destination_name == "dummy"
    assert py_ex.value.load_id == load_info.loads_ids[0]

    # Test automatic raising on a failed job, which aborts the load. Let the pipeline fail
    os.environ["RAISE_ON_FAILED_JOBS"] = "true"
    with pytest.raises(PipelineStepFailed) as py_ex_2:
        p.run([1, 2, 3], table_name="numbers")
    load_info = py_ex_2.value.step_info  # type: ignore[assignment]
    assert load_info.has_failed_jobs is True
    with pytest.raises(DestinationHasFailedJobs) as py_ex:
        load_info.raise_on_failed_jobs()
    assert py_ex.value.destination_name == "dummy"
    assert py_ex.value.load_id == load_info.loads_ids[0]


def test_run_load_pending() -> None:
    # prepare some data and complete the load with run
    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination=DUMMY_COMPLETE)

    def some_data():
        yield from [1, 2, 3]

    @dlt.source
    def source():
        return dlt.resource(some_data(), name="numbers")

    s = source()
    p.extract(s)
    assert s.exhausted
    # run will normalize and load the data; the source will not be evaluated, so there's no exception
    load_info = p.run(s)
    assert len(load_info.loads_ids) == 1
    # now it is raised
    with pytest.raises(PipelineStepFailed) as py_ex:
        p.run(s)
    assert isinstance(py_ex.value.__context__, SourceExhausted)

    # now only load
    s = source()
    p.extract(s)
    p.normalize()
    load_info = p.run(s)
    assert len(load_info.loads_ids) == 1


def test_retry_load() -> None:
    os.environ["COMPLETED_PROB"] = "1.0"
    retry_count = 2

    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy")

    @dlt.resource
    def fail_extract():
        nonlocal retry_count
        retry_count -= 1
        if retry_count == 0:
            yield [1, 2, 3]
        else:
            raise Exception("Transient")

    attempt = None
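    # retry the whole pipeline run up to 3 times while the failed step is "load" or "extract"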
    for attempt in Retrying(
        stop=stop_after_attempt(3),
        retry=retry_if_exception(retry_load(("load", "extract"))),
        reraise=True,
    ):
        with attempt:
            p.run(fail_extract())
    # it retried
    assert retry_count == 0

    # now it fails (extract is a terminal exception)
    retry_count = 2
    with pytest.raises(PipelineStepFailed) as py_ex:
        for attempt in Retrying(
            stop=stop_after_attempt(3), retry=retry_if_exception(retry_load(())), reraise=True
        ):
            with attempt:
                p.run(fail_extract())
    assert isinstance(py_ex.value, PipelineStepFailed)
    assert py_ex.value.step == "extract"

    os.environ["COMPLETED_PROB"] = "0.0"
    os.environ["FAIL_PROB"] = "1.0"
    with pytest.raises(PipelineStepFailed) as py_ex:
        for attempt in Retrying(
            stop=stop_after_attempt(3),
            retry=retry_if_exception(retry_load(("load", "extract"))),
            reraise=True,
        ):
            with attempt:
                p.run(fail_extract())
    assert isinstance(py_ex.value, PipelineStepFailed)
    assert py_ex.value.step == "load"


@pytest.mark.skip("Not implemented")
def test_extract_exception() -> None:
    # make sure that PipelineStepFailed contains the right step information
    # TODO: same tests for normalize and load
    pass


@pytest.mark.skip("Not implemented")
def test_extract_all_data_types() -> None:
    # list, iterators, generators, resource, source, list of resources, list of sources
    pass


def test_set_get_local_value() -> None:
    p = dlt.pipeline(destination="dummy", dev_mode=True)
    value = uniq_id()
    # value is set
    p.set_local_state_val(value, value)
    assert p.get_local_state_val(value) == value
    # check if this is actual local state
    assert p.state["_local"][value] == value  # type: ignore[literal-required]

    new_val = uniq_id()

    # check in context manager
    @dlt.resource
    def _w_local_state():
        # join existing managed state
        p.set_local_state_val(new_val, new_val)
        yield 1

    p.extract(_w_local_state)
    assert p.state["_local"][new_val] == new_val  # type: ignore[literal-required]


def test_update_last_run_context() -> None:
    p = dlt.pipeline(destination="dummy", dev_mode=True)
    p._update_last_run_context()
    assert p.last_run_context["local_dir"] == os.path.join(os.getcwd(), "_storage")
    assert p.last_run_context["settings_dir"] == os.path.join(os.getcwd(), ".dlt")


def test_changed_write_disposition() -> None:
    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination=DUMMY_COMPLETE)

    @dlt.resource
    def resource_1():
        yield [1, 2, 3]

    p.run(resource_1, write_disposition="append")
    assert p.default_schema.get_table("resource_1")["write_disposition"] == "append"

    p.run(resource_1, write_disposition="append")
    assert p.default_schema.get_table("resource_1")["write_disposition"] == "append"

    p.run(resource_1, write_disposition="replace")
    print(list(p._schema_storage.live_schemas.values())[0].to_pretty_yaml())
    assert (
        p.schemas[p.default_schema_name].get_table("resource_1")["write_disposition"] == "replace"
    )
    assert p.default_schema.get_table("resource_1")["write_disposition"] == "replace"
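# transformer that dispatches each GitHub event to a table named after the event type, merging on "id"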
@dlt.transformer(
    name="github_repo_events",
    primary_key="id",
    write_disposition="merge",
    table_name=lambda i: i["type"],
)
def github_repo_events(page):
    yield page


@dlt.transformer(name="github_repo_events", primary_key="id", write_disposition="merge")
def github_repo_events_table_meta(page):
    yield from [dlt.mark.with_table_name(p, p["type"]) for p in page]
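# yields a page of canned GitHub events from the normalize test fixtures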
@dlt.resource
|
|
def _get_shuffled_events(repeat: int = 1):
|
|
for _ in range(repeat):
|
|
with open(
|
|
"tests/normalize/cases/github.events.load_page_1_duck.json", "r", encoding="utf-8"
|
|
) as f:
|
|
issues = json.load(f)
|
|
yield issues
|
|
|
|
|
|
@pytest.mark.parametrize("github_resource", (github_repo_events_table_meta, github_repo_events))
|
|
def test_dispatch_rows_to_tables(github_resource: DltResource):
|
|
pipeline_name = "pipe_" + uniq_id()
|
|
p = dlt.pipeline(pipeline_name=pipeline_name, destination=DUMMY_COMPLETE)
|
|
|
|
info = p.run(_get_shuffled_events | github_resource)
|
|
assert_load_info(info)
|
|
|
|
# get all expected tables
|
|
events = list(_get_shuffled_events)
|
|
expected_tables = set(
|
|
map(lambda e: p.default_schema.naming.normalize_identifier(e["type"]), events)
|
|
)
|
|
|
|
# all the tables present
|
|
assert (
|
|
expected_tables.intersection([t["name"] for t in p.default_schema.data_tables()])
|
|
== expected_tables
|
|
)
|
|
|
|
# all the columns have primary keys and merge disposition derived from resource
|
|
for table in p.default_schema.data_tables():
|
|
if table.get("parent") is None:
|
|
assert table["write_disposition"] == "merge"
|
|
assert table["columns"]["id"]["primary_key"] is True
|
|
|
|
|
|
def test_resource_name_in_schema() -> None:
|
|
@dlt.resource(table_name="some_table")
|
|
def static_data():
|
|
yield {"a": 1, "b": 2}
|
|
|
|
@dlt.resource(table_name=lambda x: "dynamic_func_table")
|
|
def dynamic_func_data():
|
|
yield {"a": 1, "b": 2}
|
|
|
|
@dlt.resource
|
|
def dynamic_mark_data():
|
|
yield dlt.mark.with_table_name({"a": 1, "b": 2}, "dynamic_mark_table")
|
|
|
|
@dlt.resource(table_name="parent_table")
|
|
def nested_data():
|
|
yield {"a": 1, "items": [{"c": 2}, {"c": 3}, {"c": 4}]}
|
|
|
|
@dlt.source
|
|
def some_source():
|
|
return [static_data(), dynamic_func_data(), dynamic_mark_data(), nested_data()]
|
|
|
|
source = some_source()
|
|
p = dlt.pipeline(pipeline_name="p" + uniq_id(), destination=DUMMY_COMPLETE)
|
|
p.run(source)
|
|
|
|
schema = p.default_schema
|
|
|
|
assert schema.tables["some_table"]["resource"] == "static_data"
|
|
assert schema.tables["dynamic_func_table"]["resource"] == "dynamic_func_data"
|
|
assert schema.tables["dynamic_mark_table"]["resource"] == "dynamic_mark_data"
|
|
assert schema.tables["parent_table"]["resource"] == "nested_data"
|
|
assert "resource" not in schema.tables["parent_table__items"]
|
|
|
|
|
|
def test_preserve_fields_order() -> None:
|
|
pipeline_name = "pipe_" + uniq_id()
|
|
p = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy")
|
|
|
|
item = {"col_1": 1, "col_2": 2, "col_3": "list"}
|
|
p.extract([item], table_name="order_1")
|
|
p.normalize()
|
|
|
|
@dlt.resource(name="order_2")
|
|
def ordered_dict():
|
|
yield {"col_1": 1, "col_2": 2, "col_3": "list"}
|
|
|
|
def reverse_order(item):
|
|
rev_dict = {}
|
|
for k in reversed(item.keys()):
|
|
rev_dict[k] = item[k]
|
|
return rev_dict
|
|
|
|
p.extract(ordered_dict().add_map(reverse_order))
|
|
p.normalize()
|
|
|
|
assert list(p.default_schema.tables["order_1"]["columns"].keys()) == [
|
|
"col_1",
|
|
"col_2",
|
|
"col_3",
|
|
"_dlt_load_id",
|
|
"_dlt_id",
|
|
]
|
|
assert list(p.default_schema.tables["order_2"]["columns"].keys()) == [
|
|
"col_3",
|
|
"col_2",
|
|
"col_1",
|
|
"_dlt_load_id",
|
|
"_dlt_id",
|
|
]
|
|
|
|
|
|
def test_preserve_new_fields_order_on_append() -> None:
|
|
pipeline_name = "pipe_" + uniq_id()
|
|
p = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy")
|
|
|
|
item = {"c1": 1, "c2": 2, "c3": "list"}
|
|
p.extract([item], table_name="order_1")
|
|
p.normalize()
|
|
assert list(p.default_schema.get_table_columns("order_1").keys()) == [
|
|
"c1",
|
|
"c2",
|
|
"c3",
|
|
"_dlt_load_id",
|
|
"_dlt_id",
|
|
]
|
|
|
|
# add columns
|
|
item = {"c1": 1, "c4": 2.0, "c3": "list", "c5": {"x": 1}}
|
|
p.extract([item], table_name="order_1")
|
|
p.normalize()
|
|
assert list(p.default_schema.get_table_columns("order_1").keys()) == [
|
|
"c1",
|
|
"c2",
|
|
"c3",
|
|
"_dlt_load_id",
|
|
"_dlt_id",
|
|
"c4",
|
|
"c5__x",
|
|
]
|
|
|
|
|
|
def test_preserve_fields_order_incomplete_columns() -> None:
|
|
p = dlt.pipeline(pipeline_name="column_order", destination="dummy")
|
|
# incomplete columns (without data type) will be added in order of fields in data
|
|
|
|
@dlt.resource(columns={"c3": {"precision": 32}}, primary_key="c2")
|
|
def items():
|
|
yield {"c1": 1, "c2": 1, "c3": 1}
|
|
|
|
p.extract(items)
|
|
p.normalize()
|
|
assert list(p.default_schema.get_table_columns("items").keys()) == [
|
|
"c1",
|
|
"c2",
|
|
"c3",
|
|
"_dlt_load_id",
|
|
"_dlt_id",
|
|
]
|
|
|
|
# complete columns preserve order in "columns"
|
|
p = p.drop()
|
|
|
|
@dlt.resource(columns={"c3": {"precision": 32, "data_type": "decimal"}}, primary_key="c1")
|
|
def items2():
|
|
yield {"c1": 1, "c2": 1, "c3": 1}
|
|
|
|
p.extract(items2)
|
|
p.normalize()
|
|
# c3 was first so goes first
|
|
assert list(p.default_schema.get_table_columns("items2").keys()) == [
|
|
"c3",
|
|
"c1",
|
|
"c2",
|
|
"_dlt_load_id",
|
|
"_dlt_id",
|
|
]
|
|
|
|
|
|
def test_pipeline_log_progress() -> None:
|
|
os.environ["TIMEOUT"] = "3.0"
|
|
|
|
# will attach dlt logger
|
|
p = dlt.pipeline(
|
|
destination="dummy", progress=dlt.progress.log(0.5, logger=None, log_level=logging.WARNING)
|
|
)
|
|
# collector was created before pipeline so logger is not attached
|
|
assert cast(LogCollector, p.collector).logger is None
|
|
p.extract(many_delayed(2, 10))
|
|
# dlt logger attached
|
|
assert cast(LogCollector, p.collector).logger is not None
|
|
|
|
# pass explicit root logger
|
|
p = dlt.attach(progress=dlt.progress.log(0.5, logger=logging.getLogger()))
|
|
assert cast(LogCollector, p.collector).logger is not None
|
|
p.extract(many_delayed(2, 10))
|
|
|
|
|
|
def test_progress_collector_callbacks() -> None:
    collector = dlt.progress.log()

    with (
        patch.object(collector, "on_start_trace") as mock_on_start_trace,
        patch.object(collector, "on_start_trace_step") as mock_on_start_trace_step,
        patch.object(collector, "on_end_trace_step") as mock_on_end_trace_step,
        patch.object(collector, "on_end_trace") as mock_on_end_trace,
    ):
        pipeline = dlt.pipeline(
            pipeline_name="test_pipeline", destination="dummy", progress=collector
        )
        pipeline.extract(many_delayed(2, 5))

        mock_on_start_trace.assert_called()
        mock_on_start_trace_step.assert_called()
        mock_on_end_trace_step.assert_called()
        mock_on_end_trace.assert_called()

        # Verify on_start_trace args
        call_args = mock_on_start_trace.call_args[0]
        assert isinstance(call_args[0], PipelineTrace)
        assert isinstance(call_args[1], str)
        assert call_args[2].pipeline_name == "test_pipeline"

        # Verify on_start_trace_step args
        call_args = mock_on_start_trace_step.call_args[0]
        assert isinstance(call_args[0], PipelineTrace)
        assert isinstance(call_args[1], str)
        assert call_args[2].pipeline_name == "test_pipeline"

        # Verify on_end_trace_step args
        call_args = mock_on_end_trace_step.call_args[0]
        assert isinstance(call_args[0], PipelineTrace)
        assert isinstance(call_args[1], PipelineStepTrace)
        assert call_args[2].pipeline_name == "test_pipeline"
        assert isinstance(call_args[4], bool)  # send_state

        # Verify on_end_trace args
        call_args = mock_on_end_trace.call_args[0]
        assert isinstance(call_args[0], PipelineTrace)
        assert call_args[1].pipeline_name == "test_pipeline"
        assert isinstance(call_args[2], bool)  # send_state


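# NOTE: a sketch of the extension point exercised below: subclassing `LogCollector`
# and overriding the trace callbacks is assumed to be the supported way to observe a
# pipeline run; `self.counters` should be readable from `on_log` while a step runs.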
def test_progress_subclass_receives_callbacks() -> None:
    counters_accessible = False
    callbacks_received = {
        "on_start_trace": False,
        "on_start_trace_step": False,
        "on_end_trace_step": False,
        "on_end_trace": False,
    }

    class MyCollector(LogCollector):
        def on_start_trace(
            self, trace: PipelineTrace, step: TPipelineStep, pipeline: SupportsPipeline
        ) -> None:
            nonlocal callbacks_received
            callbacks_received["on_start_trace"] = True

        def on_start_trace_step(
            self, trace: PipelineTrace, step: TPipelineStep, pipeline: SupportsPipeline
        ) -> None:
            nonlocal callbacks_received
            callbacks_received["on_start_trace_step"] = True

        def on_end_trace_step(
            self,
            trace: PipelineTrace,
            step: PipelineStepTrace,
            pipeline: SupportsPipeline,
            step_info: Any,
            send_state: bool,
        ) -> None:
            nonlocal callbacks_received
            callbacks_received["on_end_trace_step"] = True

        def on_end_trace(
            self, trace: PipelineTrace, pipeline: SupportsPipeline, send_state: bool
        ) -> None:
            nonlocal callbacks_received
            callbacks_received["on_end_trace"] = True

        def on_log(self) -> None:
            nonlocal counters_accessible
            if self.counters.keys():
                counters_accessible = True

    collector = MyCollector()
    pipeline = dlt.pipeline(pipeline_name="test_pipeline", destination="dummy", progress=collector)
    pipeline.extract([1, 2, 3], table_name="test_table")

    assert counters_accessible
    assert callbacks_received["on_start_trace"]
    assert callbacks_received["on_start_trace_step"]
    assert callbacks_received["on_end_trace_step"]
    assert callbacks_received["on_end_trace"]


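# NOTE: assumed state resolution order for `dlt.current.source_state()`: it always
# reads from the currently *active* pipeline, falling back to an empty mapping for a
# pipeline that never ran; once all pipelines are deactivated the access raises
# `PipelineStateNotAvailable`.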
def test_pipeline_source_state_activation() -> None:
    appendix_yielded = None

    @dlt.source
    def reads_state(source_st, resource_st):
        if dlt.current.source_state().get("appendix"):

            @dlt.resource
            def appendix():
                nonlocal appendix_yielded
                appendix_yielded = dlt.current.source_state().get("appendix") or ["NO"]
                yield appendix_yielded

            yield appendix

        @dlt.resource
        def writes_state():
            dlt.current.source_state()["appendix"] = source_st
            dlt.current.resource_state()["RX"] = resource_st
            yield from [1, 2, 3]

        yield writes_state

    # activate first pipeline
    p_appendix = dlt.pipeline(pipeline_name="appendix_p")
    assert p_appendix.is_active
    s_appendix = reads_state("appendix", "r_appendix")
    assert s_appendix.state == {}
    # create state by running extract
    p_appendix.extract(s_appendix)
    assert s_appendix.state == {
        "appendix": "appendix",
        "resources": {"writes_state": {"RX": "r_appendix"}},
    }
    assert s_appendix.writes_state.state == {"RX": "r_appendix"}

    # change the active pipeline
    p_postfix = dlt.pipeline(pipeline_name="postfix_p")
    # now state comes from active pipeline which has an empty state
    assert s_appendix.state == {}
    # and back
    p_appendix.activate()
    assert s_appendix.writes_state.state == {"RX": "r_appendix"}

    # create another source
    s_w_appendix = reads_state("appendix", "r_appendix")
    # has appendix because state is present
    assert hasattr(s_w_appendix, "appendix")

    # now extract the source in p_postfix that does not have a state
    p_postfix.activate()
    p_postfix.extract(s_w_appendix)
    # so appendix yielded NO
    # NOTE: this behavior is not intuitive, s_w_appendix was created with p_appendix active so appendix resource was created because state was set
    # but we evaluated it in p_postfix pipeline without the state
    assert appendix_yielded == ["NO"]

    with pytest.raises(PipelineNotActive):
        p_appendix.deactivate()

    p_postfix.deactivate()
    with pytest.raises(PipelineStateNotAvailable):
        assert s_appendix.state == {}


def test_extract_add_tables() -> None:
    # we extract and make sure that tables are added to schema
    s = airtable_emojis()
    assert list(s.resources.keys()) == ["💰Budget", "📆 Schedule", "🦚Peacock", "🦚WidePeacock"]
    assert s.resources["🦚Peacock"].compute_table_schema()["resource"] == "🦚Peacock"
    # only name will be normalized
    assert s.resources["🦚Peacock"].compute_table_schema()["name"] == "🦚Peacock"
    assert (
        s.resources["💰Budget"].compute_table_schema()["columns"]["🔑book_id"]["name"]
        == "🔑book_id"
    )
    pipeline = dlt.pipeline(pipeline_name="emojis", destination="dummy")
    info = pipeline.extract(s)
    assert info.extract_data_info[0]["name"] == "airtable_emojis"
    schema = pipeline.default_schema
    assert schema.tables["_schedule"]["resource"] == "📆 Schedule"
    assert len(schema.tables["_schedule"]["columns"]) == 0
    assert "_budget" not in schema.tables

    # extract peacock
    s = airtable_emojis()
    s.resources["🦚Peacock"].selected = True
    pipeline.extract(s)
    # live schema
    assert schema.tables["_peacock"]["resource"] == "🦚Peacock"
    assert len(schema.tables["_peacock"]["columns"]) == 1
    assert "_wide_peacock" not in schema.tables

    # extract wide peacock
    s = airtable_emojis()
    s.resources["🦚WidePeacock"].selected = True
    pipeline.extract(s)
    assert len(schema.tables["_wide_peacock"]["columns"]) == 0
    assert schema.tables["_wide_peacock"]["resource"] == "🦚WidePeacock"

    # now normalize, we should have columns
    pipeline.normalize()
    # dlt tables added
    assert len(schema.tables["_wide_peacock"]["columns"]) == 2
    # resource still not normalized
    assert schema.tables["_wide_peacock"]["resource"] == "🦚WidePeacock"

    # reload schema
    schema = pipeline._schema_storage.load_schema("airtable_emojis")
    assert len(schema.tables["_wide_peacock"]["columns"]) == 2
    # resource still not normalized
    assert schema.tables["_wide_peacock"]["resource"] == "🦚WidePeacock"


def test_emojis_resource_names() -> None:
    pipeline = dlt.pipeline(pipeline_name="emojis", destination="duckdb")
    info = pipeline.run(airtable_emojis())
    assert_load_info(info)
    # make sure that resource in schema update has a right name
    table = info.load_packages[0].schema_update["_schedule"]
    assert table["resource"] == "📆 Schedule"
    # only schedule is added
    assert set(info.load_packages[0].schema_update.keys()) == {
        "_dlt_version",
        "_dlt_loads",
        "_schedule",
        "_dlt_pipeline_state",
    }
    info = pipeline.run(airtable_emojis())
    assert_load_info(info)
    # here we add _peacock which has a primary_key (so at least a single column)
    s = airtable_emojis()
    s.resources["🦚Peacock"].selected = True
    info = pipeline.run(s)
    assert_load_info(info)
    table = info.load_packages[0].schema_update["_peacock"]
    assert table["resource"] == "🦚Peacock"
    # here we add _wide_peacock which has no columns
    s = airtable_emojis()
    s.resources["🦚WidePeacock"].selected = True
    info = pipeline.run(s)
    assert_load_info(info)
    table = info.load_packages[0].schema_update["_wide_peacock"]
    assert table["resource"] == "🦚WidePeacock"


def test_apply_hints_infer_hints() -> None:
    @dlt.source
    def infer():
        yield dlt.resource(
            [{"id": 1, "timestamp": "NOW"}],
            name="table1",
            columns=[new_column("timestamp", nullable=True)],
        )

    new_new_hints = {"not_null": ["timestamp"], "primary_key": ["id"]}
    s = infer()
    s.schema.merge_hints(new_new_hints)  # type: ignore[arg-type]
    pipeline = dlt.pipeline(pipeline_name="inf", destination=DUMMY_COMPLETE)
    pipeline.run(s)
    # check schema
    table = pipeline.default_schema.get_table("table1")
    # nullable True coming from hint overrides inferred hint
    assert table["columns"]["timestamp"] == {
        "name": "timestamp",
        "data_type": "text",
        "nullable": True,
    }
    # fully from data
    assert table["columns"]["id"] == {
        "name": "id",
        "data_type": "bigint",
        "nullable": True,
        "primary_key": True,
    }

    # remove primary key and change nullable
    s = infer()
    s.table1.apply_hints(
        columns=[
            {"name": "timestamp", "nullable": False},
            {"name": "id", "nullable": False, "primary_key": False},
        ]
    )
    pipeline.run(s)
    table = pipeline.default_schema.get_table("table1")
    # hints overwrite pipeline schema
    assert table["columns"]["timestamp"] == {
        "name": "timestamp",
        "data_type": "text",
        "nullable": False,
    }
    assert table["columns"]["id"] == {
        "name": "id",
        "data_type": "bigint",
        "nullable": False,
        "primary_key": False,
    }
    # print(pipeline.default_schema.to_pretty_yaml())


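# NOTE: a sketch of what `run` is assumed to accept as data, matching the cases below:
# plain callables and uncalled @dlt.source functions are evaluated for you, while a
# bare dict is rejected with `InvalidResourceDataTypeBasic` (wrap it in a list).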
def test_invalid_data_edge_cases() -> None:
    # pass lambda directly to run, allowed now because functions can be extracted too
    pipeline = dlt.pipeline(pipeline_name="invalid", destination=DUMMY_COMPLETE)
    pipeline.run(lambda: [1, 2, 3], table_name="late_digits")
    assert pipeline.last_trace.last_normalize_info.row_counts["late_digits"] == 3

    # pass not called source function
    @dlt.source
    def my_source():
        return dlt.resource(itertools.count(start=1), name="infinity").add_limit(5)

    pipeline.run(my_source)

    assert pipeline.last_trace.last_normalize_info.row_counts["infinity"] == 5

    def res_return():
        return dlt.resource(itertools.count(start=1), name="infinity").add_limit(5)

    pipeline.run(res_return)
    assert pipeline.last_trace.last_normalize_info.row_counts["res_return"] == 5

    # with pytest.raises(PipelineStepFailed) as pip_ex:
    #     pipeline.run(res_return)
    # assert isinstance(pip_ex.value.__context__, PipeGenInvalid)
    # assert "dlt.resource" in str(pip_ex.value)

    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run({"a": "b"}, table_name="data")
    assert isinstance(pip_ex.value.__context__, InvalidResourceDataTypeBasic)

    # check same cases but that yield
    @dlt.source
    def my_source_yield():
        yield dlt.resource(itertools.count(start=1), name="infinity").add_limit(5)

    pipeline.run(my_source_yield)
    assert pipeline.last_trace.last_normalize_info.row_counts["infinity"] == 5

    # pipeline = dlt.pipeline(pipeline_name="invalid", destination=DUMMY_COMPLETE)
    # with pytest.raises(PipelineStepFailed) as pip_ex:
    #     pipeline.run(my_source_yield)
    # assert isinstance(pip_ex.value.__context__, PipeGenInvalid)
    # assert "dlt.source" in str(pip_ex.value)

    def res_return_yield():
        yield dlt.resource(itertools.count(start=1), name="infinity").add_limit(5)

    # here extract pipe tries to call yielded resources which is not callable
    # TODO: better exception, but this is a total messup

    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run(res_return_yield)
    assert isinstance(pip_ex.value.__context__, ResourceExtractionError)


def test_resource_rename_same_table():
    @dlt.resource(write_disposition="replace")
    def generic(start):
        dlt.current.resource_state()["start"] = start
        yield [{"id": idx, "text": "A" * idx} for idx in range(start, start + 10)]

    pipeline = dlt.pipeline(destination="duckdb")
    load_info = pipeline.run(
        [generic(10).with_name("state1"), generic(20).with_name("state2")],
        table_name="single_table",
    )
    assert_load_info(load_info)
    # both resources loaded
    assert pipeline.last_trace.last_normalize_info.row_counts["single_table"] == 20
    # only this table and state
    assert len(pipeline.last_trace.last_normalize_info.row_counts) == 2

    # check state
    # state1 should have 10
    assert generic(0).with_name("state1").state["start"] == 10
    # state2 is 20
    assert generic(0).with_name("state2").state["start"] == 20

    # NOTE: only one resource will be set in table
    assert pipeline.default_schema.get_table("single_table")["resource"] == "state1"

    # now load only state1
    load_info = pipeline.run(
        [
            generic(5).with_name("state1"),
        ],
        table_name="single_table",
    )
    assert_load_info(load_info)
    # only state1 loaded, table was replaced
    assert pipeline.last_trace.last_normalize_info.row_counts["single_table"] == 10
    assert generic(0).with_name("state1").state["start"] == 5
    # resource got swapped to the most recent one
    assert pipeline.default_schema.get_table("single_table")["resource"] == "state1"


def test_drop_with_new_name() -> None:
    old_test_name = "old_pipeline_name"
    new_test_name = "new_pipeline_name"

    pipeline = dlt.pipeline(pipeline_name=old_test_name, destination="duckdb")
    new_pipeline = pipeline.drop(pipeline_name=new_test_name)

    assert new_pipeline.pipeline_name == new_test_name

    # load to old pipeline
    pipeline.run([1, 2, 3], table_name="p1")
    new_pipeline.run([1, 2, 3], table_name="p2")

    assert_table_counts(pipeline, {"p1": 3})
    assert_table_counts(new_pipeline, {"p2": 3})


def test_drop() -> None:
    pipeline = dlt.pipeline(pipeline_name="test_drop", destination="duckdb")
    clean_pipeline = pipeline.drop()
    assert clean_pipeline is pipeline
    assert clean_pipeline.pipeline_name == "test_drop"
    pipeline.run([1, 2, 3], table_name="numbers")


def test_source_schema_in_resource() -> None:
    run_count = 0

    @dlt.resource
    def schema_inspector():
        schema = dlt.current.source_schema()
        if run_count == 0:
            assert "schema_inspector" not in schema.tables
        if run_count == 1:
            assert "schema_inspector" in schema.tables
            assert schema.tables["schema_inspector"]["columns"]["value"]["x-custom"] == "X"  # type: ignore[typeddict-item]

        yield [1, 2, 3]

    pipeline = dlt.pipeline(pipeline_name="test_inspector", destination="duckdb")
    pipeline.run(schema_inspector())

    # add custom annotation
    pipeline.default_schema.tables["schema_inspector"]["columns"]["value"]["x-custom"] = "X"  # type: ignore[typeddict-unknown-key]

    run_count += 1
    pipeline.run(schema_inspector())


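# NOTE: assumed versioning rule checked by the next test: the stored schema version is
# bumped once per save that changed content (extract saves after adding tables,
# normalize saves again after inferring columns) and every bump pushes the previous
# hash onto `previous_hashes`; identical re-runs do not bump the version.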
def test_schema_version_increase_and_source_update() -> None:
    now = pendulum.now()

    @dlt.source
    def autodetect():
        # add unix ts autodetection to current source schema
        dlt.current.source_schema().add_type_detection("timestamp")
        return dlt.resource(
            [int(now.timestamp()), int(now.timestamp() + 1), int(now.timestamp() + 2)],
            name="numbers",
        )

    pipeline = dlt.pipeline(destination="duckdb")
    # control version of the schema
    auto_source = autodetect()
    assert auto_source.schema.stored_version is None
    pipeline.extract(auto_source)
    # extract did a first save
    assert pipeline.default_schema.stored_version == 1
    # only one prev hash
    assert len(pipeline.default_schema.previous_hashes) == 1
    # source schema was updated in the pipeline
    assert auto_source.schema.stored_version == 1
    # source has pipeline schema
    assert pipeline.default_schema is auto_source.schema

    pipeline.normalize()
    # columns added and schema was saved in between
    assert pipeline.default_schema.stored_version == 2
    assert len(pipeline.default_schema.previous_hashes) == 2
    # source schema still updated
    assert auto_source.schema.stored_version == 2
    assert pipeline.default_schema is auto_source.schema
    pipeline.load()
    # nothing changed in load
    assert pipeline.default_schema.stored_version == 2
    assert pipeline.default_schema is auto_source.schema

    # run same source again
    pipeline.extract(auto_source)
    assert pipeline.default_schema.stored_version == 2
    assert pipeline.default_schema is auto_source.schema
    pipeline.normalize()
    assert pipeline.default_schema.stored_version == 2
    pipeline.load()
    assert pipeline.default_schema.stored_version == 2

    # run another instance of the same source
    pipeline.run(autodetect())
    assert pipeline.default_schema.stored_version == 2
    assert pipeline.default_schema is auto_source.schema
    assert "timestamp" in pipeline.default_schema.settings["detections"]

    # data has compatible schema with "numbers" but schema is taken from pipeline
    pipeline.run([1, 2, 3], table_name="numbers")
    assert "timestamp" in pipeline.default_schema.settings["detections"]
    assert pipeline.default_schema.stored_version == 2
    assert pipeline.default_schema is auto_source.schema

    # new table will evolve schema
    pipeline.run([1, 2, 3], table_name="seq")
    assert "timestamp" in pipeline.default_schema.settings["detections"]
    assert pipeline.default_schema.stored_version == 4
    assert pipeline.default_schema is auto_source.schema


def test_remove_autodetect() -> None:
    now = pendulum.now()

    @dlt.source
    def autodetect():
        # add unix ts autodetection to current source schema
        dlt.current.source_schema().add_type_detection("timestamp")
        return dlt.resource(
            [int(now.timestamp()), int(now.timestamp() + 1), int(now.timestamp() + 2)],
            name="numbers",
        )

    pipeline = dlt.pipeline(destination="duckdb")
    auto_source = autodetect()
    pipeline.extract(auto_source)
    pipeline.normalize()

    # unix ts recognized
    assert (
        pipeline.default_schema.get_table("numbers")["columns"]["value"]["data_type"] == "timestamp"
    )
    pipeline.load()

    source = autodetect()
    assert "timestamp" in source.schema.settings["detections"]
    source.schema.remove_type_detection("timestamp")
    assert "timestamp" not in source.schema.settings["detections"]

    pipeline.run(source, refresh="drop_sources")
    assert "timestamp" not in pipeline.default_schema.settings["detections"]

    assert pipeline.default_schema.get_table("numbers")["columns"]["value"]["data_type"] == "bigint"


def test_flattened_column_hint() -> None:
    now = pendulum.now()

    # @dlt.resource(columns=[{"name": "value__timestamp", "data_type": "timestamp"}])
    @dlt.resource()
    def flattened_dict():
        # dlt.current.source_schema().add_type_detection("timestamp")

        for delta in range(4):
            yield {
                "delta": delta,
                "values": [{"Value": {"timestampValue": now.timestamp() + delta}}],
            }

    @dlt.source
    def nested_resource():
        # we need to create a whole structure
        dict_resource = flattened_dict()
        # add table from resource
        dlt.current.source_schema().update_table(dict_resource.compute_table_schema())
        values_table = new_table(
            dict_resource.name + "__values",
            parent_table_name=dict_resource.name,
            columns=[{"name": "value__timestamp_value", "data_type": "timestamp"}],
        )
        # and child table
        dlt.current.source_schema().update_table(values_table)
        return dict_resource

    pipeline = dlt.pipeline(destination="duckdb")
    pipeline.run(nested_resource())
    # print(pipeline.default_schema.to_pretty_yaml())
    assert (
        pipeline.default_schema.get_table("flattened_dict__values")["columns"][
            "value__timestamp_value"
        ]["data_type"]
        == "timestamp"
    )
    # make sure data is there
    assert pipeline.last_trace.last_normalize_info.row_counts["flattened_dict__values"] == 4


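# NOTE: an illustrative sketch of "break path" normalization, inferred from the four
# tests below: with SCHEMA__USE_BREAK_PATH_ON_NORMALIZE enabled (the default) each
# `__`-separated segment is normalized on its own, so names stay idempotent:
#   "example_custom_field__c" -> "example_custom_field__c"
# while legacy mode treats the whole name as one identifier and contracts it:
#   "example_custom_field__c" -> "example_custom_field_c"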
def test_column_name_with_break_path() -> None:
    """Tests how normalization behaves for names with break path, i.e. `__`;
    all the names must be idempotent
    """
    pipeline = dlt.pipeline(destination="duckdb", pipeline_name="breaking")
    info = pipeline.run(
        [{"example_custom_field__c": "custom", "reg_c": "c"}], table_name="custom__path"
    )
    assert_load_info(info)
    # table name was preserved
    table = pipeline.default_schema.get_table("custom__path")
    assert pipeline.default_schema.data_table_names() == ["custom__path"]
    # column name was preserved
    assert table["columns"]["example_custom_field__c"]["data_type"] == "text"
    assert set(table["columns"]) == {"example_custom_field__c", "reg_c", "_dlt_id", "_dlt_load_id"}

    # get data
    assert_table_counts(pipeline, {"custom__path": 1})
    # get data via dataset with dbapi
    data_ = pipeline.dataset().custom__path[["example_custom_field__c", "reg_c"]].fetchall()
    assert data_ == [("custom", "c")]


def test_column_name_with_break_path_legacy() -> None:
    """Tests how normalization behaves for names with break path, i.e. `__`;
    in legacy mode table and column names were normalized as a single identifier
    """
    os.environ["SCHEMA__USE_BREAK_PATH_ON_NORMALIZE"] = "False"
    pipeline = dlt.pipeline(destination="duckdb", pipeline_name="breaking")
    info = pipeline.run(
        [{"example_custom_field__c": "custom", "reg_c": "c"}], table_name="custom__path"
    )
    assert_load_info(info)
    # table name was contracted
    table = pipeline.default_schema.get_table("custom_path")
    assert pipeline.default_schema.data_table_names() == ["custom_path"]
    # column name was contracted
    assert table["columns"]["example_custom_field_c"]["data_type"] == "text"
    assert set(table["columns"]) == {"example_custom_field_c", "reg_c", "_dlt_id", "_dlt_load_id"}

    # get data
    assert_table_counts(pipeline, {"custom_path": 1})
    # get data via dataset with dbapi
    data_ = pipeline.dataset().custom_path[["example_custom_field_c", "reg_c"]].fetchall()
    assert data_ == [("custom", "c")]


def test_column_hint_with_break_path() -> None:
    """From v 1.4.1 on, the name normalizer is idempotent on break paths"""
    now = cast(pendulum.DateTime, pendulum.parse("2024-11-29T10:10"))

    @dlt.resource(
        name="flattened__dict", columns=[{"name": "value__timestamp", "data_type": "timestamp"}]
    )
    def flattened_dict():
        for delta in range(4):
            yield {
                "delta": delta,
                "value": {"timestamp": now.timestamp() + delta},
            }

    pipeline = dlt.pipeline(destination="duckdb")
    info = pipeline.run(flattened_dict())
    assert_load_info(info)

    assert pipeline.default_schema.data_table_names() == ["flattened__dict"]
    table = pipeline.default_schema.get_table("flattened__dict")
    assert set(table["columns"]) == {"delta", "value__timestamp", "_dlt_id", "_dlt_load_id"}
    assert table["columns"]["value__timestamp"]["data_type"] == "timestamp"

    # make sure data is there
    data_ = pipeline.dataset().flattened__dict[["delta", "value__timestamp"]].limit(1).fetchall()
    assert data_ == [(0, now)]


def test_column_hint_with_break_path_legacy() -> None:
    """From v 1.4.1 on, the name normalizer is idempotent on break paths"""

    os.environ["SCHEMA__USE_BREAK_PATH_ON_NORMALIZE"] = "False"
    now = cast(pendulum.DateTime, pendulum.parse("2024-11-29T10:10"))

    @dlt.resource(
        name="flattened__dict", columns=[{"name": "value__timestamp", "data_type": "timestamp"}]
    )
    def flattened_dict():
        for delta in range(4):
            yield {
                "delta": delta,
                "value": {"timestamp": now.timestamp() + delta},
            }

    pipeline = dlt.pipeline(destination="duckdb")
    info = pipeline.run(flattened_dict())
    assert_load_info(info)
    # table name contracted
    assert pipeline.default_schema.data_table_names() == ["flattened_dict"]
    table = pipeline.default_schema.get_table("flattened_dict")
    # hint applied
    assert set(table["columns"]) == {"delta", "value__timestamp", "_dlt_id", "_dlt_load_id"}
    assert table["columns"]["value__timestamp"]["data_type"] == "timestamp"
    # make sure data is there
    data_ = pipeline.dataset().flattened_dict[["delta", "value__timestamp"]].limit(1).fetchall()
    assert data_ == [(0, now)]


def test_empty_rows_are_included() -> None:
    """Empty rows where all values are `None` or empty dicts
    create rows in the dataset with `NULL` in all columns
    """
    pipeline = dlt.pipeline(destination="duckdb")

    pipeline.run(iter([{}, {}, {}]), table_name="empty_rows")
    pipeline.run(iter([{"a": 1}, {}, {}]), table_name="empty_rows")
    pipeline.run(iter([{"a": None}, {}]), table_name="empty_rows")

    with pipeline.sql_client() as client:
        rows = client.execute_sql("SELECT a FROM empty_rows ORDER BY a")

    values = [r[0] for r in rows]
    assert values == [1, None, None, None, None, None, None, None]


def test_resource_state_name_not_normalized() -> None:
    pipeline = dlt.pipeline(pipeline_name="emojis", destination="duckdb")
    peacock_s = airtable_emojis().with_resources("🦚Peacock")
    pipeline.extract(peacock_s)
    assert peacock_s.resources["🦚Peacock"].state == {"🦚🦚🦚": "🦚"}
    pipeline.normalize()
    pipeline.load()

    # get state from destination
    from dlt.pipeline.state_sync import load_pipeline_state_from_destination

    client: WithStateSync
    with pipeline.destination_client() as client:  # type: ignore[assignment]
        state = load_pipeline_state_from_destination(pipeline.pipeline_name, client)
        assert "airtable_emojis" in state["sources"]
        assert state["sources"]["airtable_emojis"]["resources"] == {"🦚Peacock": {"🦚🦚🦚": "🦚"}}


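# NOTE: a sketch of the load package lifecycle assumed by the next two tests: extract
# creates packages in "extracted" state, normalize moves the same load ids to
# "normalized", and load completes them to "loaded"; failed jobs stay recorded inside
# the completed package instead of disappearing.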
def test_pipeline_list_packages() -> None:
    pipeline = dlt.pipeline(pipeline_name="emojis", destination="dummy")
    pipeline.extract(airtable_emojis())
    load_ids = pipeline.list_extracted_load_packages()
    assert len(load_ids) == 1
    # two new packages: for emojis schema and emojis_2
    pipeline.extract(
        [airtable_emojis(), airtable_emojis(), airtable_emojis().clone(with_name="emojis_2")]
    )
    load_ids = pipeline.list_extracted_load_packages()
    assert len(load_ids) == 3
    extracted_package = pipeline.get_load_package_info(load_ids[1])
    assert extracted_package.schema_name == "airtable_emojis"
    extracted_package = pipeline.get_load_package_info(load_ids[2])
    assert extracted_package.schema_name == "emojis_2"
    extracted_package = pipeline.get_load_package_info(load_ids[0])
    assert extracted_package.state == "extracted"
    assert extracted_package.schema_name == "airtable_emojis"
    # same load id continues till the end
    pipeline.normalize()
    load_ids_n = pipeline.list_normalized_load_packages()
    assert load_ids == load_ids_n
    normalized_package = pipeline.get_load_package_info(load_ids[0])
    # same number of new jobs
    assert normalized_package.state == "normalized"
    assert len(normalized_package.jobs["new_jobs"]) == len(extracted_package.jobs["new_jobs"])
    # load all 3 packages and fail all jobs in them
    os.environ["RAISE_ON_FAILED_JOBS"] = "false"  # do not raise, complete package till the end
    os.environ["FAIL_PROB"] = "1.0"
    pipeline.load()
    load_ids_l = pipeline.list_completed_load_packages()
    assert load_ids == load_ids_l
    loaded_package = pipeline.get_load_package_info(load_ids[0])
    assert len(loaded_package.jobs["failed_jobs"]) == len(extracted_package.jobs["new_jobs"])
    assert loaded_package.state == "loaded"
    failed_jobs = pipeline.list_failed_jobs_in_package(load_ids[0])
    assert len(loaded_package.jobs["failed_jobs"]) == len(failed_jobs)


def test_remove_pending_packages() -> None:
    pipeline = dlt.pipeline(pipeline_name="emojis", destination=DUMMY_COMPLETE)
    pipeline.extract(airtable_emojis())
    assert pipeline.has_pending_data
    pipeline.drop_pending_packages()
    assert pipeline.has_pending_data is False
    pipeline.extract(airtable_emojis())
    pipeline.normalize()
    pipeline.extract(airtable_emojis())
    assert pipeline.has_pending_data
    pipeline.drop_pending_packages()
    assert pipeline.has_pending_data is False
    # partial load
    os.environ["EXCEPTION_PROB"] = "1.0"
    os.environ["TIMEOUT"] = "1.0"
    # will make job go into retry state
    with pytest.raises(PipelineStepFailed):
        pipeline.run(airtable_emojis())
    # move job into completed folder manually to simulate partial package
    load_storage = pipeline._get_load_storage()
    load_id = load_storage.normalized_packages.list_packages()[0]
    job = load_storage.normalized_packages.list_new_jobs(load_id)[0]
    started_path = load_storage.normalized_packages.start_job(
        load_id, FileStorage.get_file_name_from_file_path(job)
    )
    completed_path = load_storage.normalized_packages.complete_job(
        load_id, FileStorage.get_file_name_from_file_path(job)
    )
    # to test partial loads we need two jobs, one completed and one in another state
    # to simulate this, we just duplicate the completed job into the started path
    shutil.copyfile(completed_path, started_path)
    # now "with partial loads" can be tested
    assert pipeline.has_pending_data
    pipeline.drop_pending_packages(with_partial_loads=False)
    assert pipeline.has_pending_data
    pipeline.drop_pending_packages()
    assert pipeline.has_pending_data is False


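# NOTE: assumed threading model for the test below: `PipelineContext` is resolved per
# thread, so each worker thread keeps its own active pipeline; the lock, events and
# semaphore only choreograph the threads so both pipelines enter extract, normalize
# and load at the same moment, maximizing the chance of surfacing race conditions.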
@pytest.mark.parametrize("workers", (1, 4), ids=("1 norm worker", "4 norm workers"))
def test_parallel_pipelines_threads(workers: int) -> None:
    # critical section to control pipeline steps
    init_lock = threading.Lock()
    extract_ev = threading.Event()
    normalize_ev = threading.Event()
    load_ev = threading.Event()
    # control main thread
    sem = threading.Semaphore(0)

    # rotate the files frequently so we have parallel normalize and load
    os.environ["DATA_WRITER__BUFFER_MAX_ITEMS"] = "10"
    os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "10"

    # force spawn process pool
    os.environ["NORMALIZE__START_METHOD"] = "spawn"

    page_repeats = 1

    # set the extra per pipeline
    os.environ["PIPELINE_1__EXTRA"] = "CFG_P_1"
    os.environ["PIPELINE_2__EXTRA"] = "CFG_P_2"

    def _run_pipeline(pipeline_name: str) -> Tuple[LoadInfo, PipelineContext, DictStrAny]:
        try:

            @dlt.transformer(
                name="github_repo_events",
                write_disposition="append",
                table_name=lambda i: i["type"],
            )
            def github_repo_events(page, extra):
                # test setting the resource state
                dlt.current.resource_state()["extra"] = extra
                yield page

            @dlt.transformer
            async def slow(items):
                await asyncio.sleep(0.1)
                return items

            @dlt.transformer
            @dlt.defer
            def slow_func(items, extra):
                # add configurable extra to each element
                sleep(0.1)
                return map(lambda item: {**item, **{"extra": extra}}, items)

            @dlt.source
            def github(extra: str = dlt.config.value):
                # generate github events, push them through futures and thread pools and then dispatch to separate tables
                return (
                    _get_shuffled_events(repeat=page_repeats)
                    | slow
                    | slow_func(extra)
                    | github_repo_events(extra)
                )

            # make sure that only one pipeline is created
            with init_lock:
                pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb")
                context = Container()[PipelineContext]
        finally:
            sem.release()
        # start every step at the same moment to increase chances of any race conditions to happen
        extract_ev.wait()
        context_2 = Container()[PipelineContext]
        try:
            pipeline.extract(github())
        finally:
            sem.release()
        normalize_ev.wait()
        try:
            pipeline.normalize(workers=workers)
        finally:
            sem.release()
        load_ev.wait()
        info = pipeline.load()

        # get counts in the thread
        counts = load_table_counts(pipeline)

        assert context is context_2
        return info, context, counts

    with ThreadPoolExecutor(max_workers=4) as pool:
        f_1 = pool.submit(_run_pipeline, "pipeline_1")
        f_2 = pool.submit(_run_pipeline, "pipeline_2")

        sem.acquire()
        sem.acquire()
        if f_1.done():
            raise f_1.exception()
        if f_2.done():
            raise f_2.exception()
        extract_ev.set()
        sem.acquire()
        sem.acquire()
        if f_1.done():
            raise f_1.exception()
        if f_2.done():
            raise f_2.exception()
        normalize_ev.set()
        sem.acquire()
        sem.acquire()
        if f_1.done():
            raise f_1.exception()
        if f_2.done():
            raise f_2.exception()
        load_ev.set()

        info_1, context_1, counts_1 = f_1.result()
        info_2, context_2, counts_2 = f_2.result()

    assert_load_info(info_1)
    assert_load_info(info_2)

    pipeline_1: dlt.Pipeline = context_1.pipeline()  # type: ignore
    pipeline_2: dlt.Pipeline = context_2.pipeline()  # type: ignore

    n_counts_1 = pipeline_1.last_trace.last_normalize_info
    assert n_counts_1.row_counts["push_event"] == 8 * page_repeats == counts_1["push_event"]
    n_counts_2 = pipeline_2.last_trace.last_normalize_info
    assert n_counts_2.row_counts["push_event"] == 8 * page_repeats == counts_2["push_event"]

    assert pipeline_1.pipeline_name == "pipeline_1"
    assert pipeline_2.pipeline_name == "pipeline_2"

    # check if resource state has extra
    assert pipeline_1.state["sources"]["github"]["resources"]["github_repo_events"] == {
        "extra": "CFG_P_1"
    }
    assert pipeline_2.state["sources"]["github"]["resources"]["github_repo_events"] == {
        "extra": "CFG_P_2"
    }

    # make sure we can still access data
    pipeline_1.activate()  # activate pipeline to access inner duckdb
    assert load_table_counts(pipeline_1) == counts_1
    pipeline_2.activate()
    assert load_table_counts(pipeline_2) == counts_2


@pytest.mark.parametrize("workers", (1, 4), ids=("1 norm worker", "4 norm workers"))
def test_parallel_pipelines_async(workers: int) -> None:
    os.environ["NORMALIZE__WORKERS"] = str(workers)

    # create both futures and thread parallel resources

    def async_table():
        async def _gen(idx):
            await asyncio.sleep(0.1)
            return {"async_gen": idx}

        # just yield futures in a loop
        for idx_ in range(10):
            yield _gen(idx_)

    def defer_table():
        @dlt.defer
        def _gen(idx):
            sleep(0.1)
            return {"thread_gen": idx}

        # just yield futures in a loop
        for idx_ in range(5):
            yield _gen(idx_)

    def _run_pipeline(pipeline, gen_) -> LoadInfo:
        # run the pipeline in a thread, also instantiate generators here!
        # Python does not let you use generators across instances
        return pipeline.run(gen_())

    # declare pipelines in main thread then run them "async"
    pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", dev_mode=True)
    pipeline_2 = dlt.pipeline("pipeline_2", destination="duckdb", dev_mode=True)

    async def _run_async():
        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor() as executor:
            results = await asyncio.gather(
                loop.run_in_executor(executor, _run_pipeline, pipeline_1, async_table),
                loop.run_in_executor(executor, _run_pipeline, pipeline_2, defer_table),
            )
            assert_load_info(results[0])
            assert_load_info(results[1])

    asyncio.run(_run_async())
    pipeline_1.activate()  # activate pipeline 1 to access inner duckdb
    assert load_table_counts(pipeline_1) == {"async_table": 10}
    pipeline_2.activate()  # activate pipeline 2 to access inner duckdb
    assert load_table_counts(pipeline_2) == {"defer_table": 5}


def test_resource_while_stop() -> None:
    def product():
        stop = False

        @dlt.defer
        def get_page(page_num):
            nonlocal stop

            # print(f"Fetching page {page_num}")
            items = api_fetch(page_num)
            # print(f"Retrieved {len(items)} from page {page_num}")

            if len(items) == 0:
                stop = True
            return items

        idx = 0
        while not stop:
            yield get_page(idx)
            idx += 1

    def api_fetch(page_num):
        import time

        time.sleep(1)
        if page_num < 12:
            return [{"id": page_num}]
        else:
            return []

    pipeline = dlt.pipeline("pipeline_1", destination="duckdb", dev_mode=True)
    load_info = pipeline.run(product())
    assert_load_info(load_info)
    assert pipeline.last_trace.last_normalize_info.row_counts["product"] == 12


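# NOTE: dlt's json uses Unicode private use area (PUA) characters as internal type
# markers; the test below assumes that PUA characters occurring in real payloads are
# not misread as type markers and round-trip to the destination unchanged.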
def test_run_with_pua_payload() -> None:
    # prepare some data and complete load with run
    os.environ["COMPLETED_PROB"] = "1.0"
    pipeline_name = "pipe_" + uniq_id()
    p = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb")
    print(pipeline_name)
    from dlt.common.json import PUA_START, PUA_CHARACTER_MAX

    def some_data():
        yield from [
            # text is only PUA
            {"id": 1, "text": chr(PUA_START)},
            {"id": 2, "text": chr(PUA_START - 1)},
            {"id": 3, "text": chr(PUA_START + 1)},
            {"id": 4, "text": chr(PUA_START + PUA_CHARACTER_MAX + 1)},
            # PUA inside text
            {"id": 5, "text": f"a{chr(PUA_START)}b"},
            {"id": 6, "text": f"a{chr(PUA_START - 1)}b"},
            {"id": 7, "text": f"a{chr(PUA_START + 1)}b"},
            # text starts with PUA
            {"id": 8, "text": f"{chr(PUA_START)}a"},
            {"id": 9, "text": f"{chr(PUA_START - 1)}a"},
            {"id": 10, "text": f"{chr(PUA_START + 1)}a"},
        ]

    @dlt.source
    def source():
        return dlt.resource(some_data(), name="pua_data")

    load_info = p.run(source())
    assert p.last_trace.last_normalize_info.row_counts["pua_data"] == 10

    with p.sql_client() as client:
        rows = client.execute_sql("SELECT text FROM pua_data ORDER BY id")

    values = [r[0] for r in rows]
    assert values == [
        "\uf026",
        "\uf025",
        "\uf027",
        "\uf02f",
        "a\uf026b",
        "a\uf025b",
        "a\uf027b",
        "\uf026a",
        "\uf025a",
        "\uf027a",
    ]
    assert len(load_info.loads_ids) == 1


def test_pipeline_load_info_metrics_schema_is_not_changing() -> None:
    """Test if load info schema is idempotent throughout multiple load cycles

    ## Setup

    We will run the same pipeline with

    1. A single source returning one resource and collect `schema.version_hash`,
    2. Another source returning 2 resources with more complex data and collect `schema.version_hash`,
    3. Finally we run both sources,
    4. For each of 1. 2. 3. we load `last_extract_info`, `last_normalize_info` and `last_load_info` and collect `schema.version_hash`

    ## Expected

    `version_hash` collected in each stage should remain the same at all times.
    """
    data = [
        {"id": 1, "name": "Alice"},
        {"id": 2, "name": "Bob"},
    ]

    # this source must have all the hints so other sources do not change trace schema (extract/hints)

    @dlt.source
    def users_source():
        return dlt.resource([data], name="users_resource")

    @dlt.source
    def taxi_demand_source():
        @dlt.resource(
            primary_key="city", columns=[{"name": "id", "data_type": "bigint", "precision": 4}]
        )
        def locations(idx=dlt.sources.incremental("id")):
            for idx in range(10):
                yield {
                    "id": idx,
                    "address": f"address-{idx}",
                    "city": f"city-{idx}",
                }

        @dlt.resource(primary_key="id")
        def demand_map():
            for idx in range(10):
                yield {
                    "id": idx,
                    "city": f"city-{idx}",
                    "demand": random.randint(0, 10000),
                }

        return [locations, demand_map]

    schema = dlt.Schema(name="nice_load_info_schema")
    pipeline = dlt.pipeline(
        pipeline_name="quick_start",
        destination="duckdb",
        dataset_name="mydata",
    )

    taxi_load_info = pipeline.run(
        taxi_demand_source(),
    )

    schema_hashset = set()
    pipeline.run(
        [taxi_load_info],
        table_name="_load_info",
        schema=schema,
    )

    pipeline.run(
        [pipeline.last_trace.last_normalize_info],
        table_name="_normalize_info",
        schema=schema,
    )

    pipeline.run(
        [pipeline.last_trace.last_extract_info],
        table_name="_extract_info",
        schema=schema,
    )
    schema_hashset.add(pipeline.schemas["nice_load_info_schema"].version_hash)
    trace_schema = pipeline.schemas["nice_load_info_schema"].to_pretty_yaml()

    users_load_info = pipeline.run(
        users_source(),
    )

    pipeline.run(
        [users_load_info],
        table_name="_load_info",
        schema=schema,
    )
    assert trace_schema == pipeline.schemas["nice_load_info_schema"].to_pretty_yaml()
    schema_hashset.add(pipeline.schemas["nice_load_info_schema"].version_hash)
    assert len(schema_hashset) == 1

    pipeline.run(
        [pipeline.last_trace.last_normalize_info],
        table_name="_normalize_info",
        schema=schema,
    )
    schema_hashset.add(pipeline.schemas["nice_load_info_schema"].version_hash)
    assert len(schema_hashset) == 1

    pipeline.run(
        [pipeline.last_trace.last_extract_info],
        table_name="_extract_info",
        schema=schema,
    )
    schema_hashset.add(pipeline.schemas["nice_load_info_schema"].version_hash)
    assert len(schema_hashset) == 1

    load_info = pipeline.run(
        [users_source(), taxi_demand_source()],
    )

    pipeline.run(
        [load_info],
        table_name="_load_info",
        schema=schema,
    )

    schema_hashset.add(pipeline.schemas["nice_load_info_schema"].version_hash)

    pipeline.run(
        [pipeline.last_trace.last_normalize_info],
        table_name="_normalize_info",
        schema=schema,
    )

    schema_hashset.add(pipeline.schemas["nice_load_info_schema"].version_hash)

    pipeline.run(
        [pipeline.last_trace.last_extract_info],
        table_name="_extract_info",
        schema=schema,
    )

    schema_hashset.add(pipeline.schemas["nice_load_info_schema"].version_hash)

    assert len(schema_hashset) == 1


def test_yielding_empty_list_creates_table() -> None:
    pipeline = dlt.pipeline(
        pipeline_name="empty_start",
        destination="duckdb",
        dataset_name="mydata",
    )

    # empty list should create empty table in the destination but with the required schema
    extract_info = pipeline.extract(
        [MaterializedEmptyList()],
        table_name="empty",
        columns=[{"name": "id", "data_type": "bigint", "nullable": True}],
    )
    print(extract_info)
    normalize_info = pipeline.normalize()
    print(normalize_info)
    assert normalize_info.row_counts["empty"] == 0
    load_info = pipeline.load()
    # print(load_info.asstr(verbosity=3))
    assert_load_info(load_info)
    assert_table_counts(pipeline, {"empty": 0})
    # make sure we have expected columns
    assert set(pipeline.default_schema.tables["empty"]["columns"].keys()) == {
        "id",
        "_dlt_load_id",
        "_dlt_id",
    }

    # load some data
    pipeline.run([{"id": 1}], table_name="empty")
    assert_table_counts(pipeline, {"empty": 1})

    # update schema on existing table
    pipeline.run(
        [MaterializedEmptyList()],
        table_name="empty",
        columns=[{"name": "user_name", "data_type": "text", "nullable": True}],
    )
    assert_table_counts(pipeline, {"empty": 1})
    assert set(pipeline.default_schema.tables["empty"]["columns"].keys()) == {
        "id",
        "_dlt_load_id",
        "_dlt_id",
        "user_name",
    }
    with pipeline.sql_client() as client:
        with client.execute_query("SELECT id, user_name FROM empty") as cur:
            rows = list(cur.fetchall())
            assert len(rows) == 1
            assert rows[0] == (1, None)


local_paths = [os.path.abspath(TEST_STORAGE_ROOT), "."]
if os.name == "nt":
    local_paths += [
        # UNC extended path
        "\\\\?\\UNC\\localhost\\" + os.path.abspath(TEST_STORAGE_ROOT).replace(":", "$"),
        # UNC path
        "\\\\localhost\\" + os.path.abspath(TEST_STORAGE_ROOT).replace(":", "$"),
    ]


@pytest.mark.parametrize("local_path", local_paths)
def test_local_filesystem_destination(local_path: str) -> None:
    dataset_name = "mydata_" + uniq_id()

    @dlt.resource
    def stateful_resource():
        dlt.current.source_state()["mark"] = 1
        yield [1, 2, 3]

    pipeline = dlt.pipeline(
        pipeline_name="local_files",
        destination=dlt.destinations.filesystem(local_path),
        dataset_name=dataset_name,
    )
    info = pipeline.run(stateful_resource(), table_name="numbers", write_disposition="replace")
    assert_load_info(info)

    info = pipeline.run(stateful_resource(), table_name="numbers", write_disposition="replace")
    assert_load_info(info)

    pipeline = pipeline.drop()

    # must be able to restore the schema and state
    pipeline.sync_destination()
    assert pipeline.state["sources"]["local_files"]["mark"] == 1
    assert pipeline.default_schema.get_table("numbers") is not None

    # check all the files, paths may get messed up in many different ways
    # and data may land anywhere especially on Windows
    expected_dataset = pathlib.Path("_storage").joinpath(dataset_name).resolve()
    assert expected_dataset.exists()
    assert expected_dataset.is_dir()

    # count files in tables
    assert expected_dataset.joinpath(INIT_FILE_NAME).is_file()
    # one numbers table (replaced)
    assert len(list(expected_dataset.joinpath("numbers").glob("*"))) == 1
    # two loads
    assert len(list(expected_dataset.joinpath("_dlt_loads").glob("*"))) == 2
    # one schema (dedup on hash)
    assert len(list(expected_dataset.joinpath("_dlt_version").glob("*"))) == 1
    # one state (not sent twice)
    assert len(list(expected_dataset.joinpath("_dlt_pipeline_state").glob("*"))) == 1

    fs_client = pipeline._fs_client()
    # all path formats we use must lead to "_storage" relative to tests
    expect_path_fragment = str(pathlib.Path(TEST_STORAGE_ROOT).joinpath(dataset_name).resolve())
    expect_path_fragment = expect_path_fragment[expect_path_fragment.index(TEST_STORAGE_ROOT) :]
    # TODO: restore on windows
    assert str(pathlib.Path(TEST_STORAGE_ROOT).joinpath(dataset_name).resolve()).endswith(
        expect_path_fragment
    )
    # same for client
    assert len(fs_client.list_table_files("numbers")) == 1
    assert len(fs_client.list_table_files("_dlt_loads")) == 2
    assert len(fs_client.list_table_files("_dlt_version")) == 1
    assert len(fs_client.list_table_files("_dlt_pipeline_state")) == 1


def test_filesystem_in_pipeline_dir() -> None:
    pipeline = dlt.pipeline("test_filesystem_in_pipeline_dir", destination=filesystem(":pipeline:"))
    pipeline.run([1, 2, 3], table_name="digits")
    assert os.path.isdir(os.path.join(pipeline.working_dir, "test_filesystem_in_pipeline_dir"))


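# NOTE: a sketch of the merge path assumed below: merge loads first write to a
# `<dataset_name>_staging` dataset and are then merged into the main dataset;
# `truncate_staging_dataset` only decides whether the staging tables are emptied
# afterwards, the main table keeps its 3 rows either way.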
@pytest.mark.parametrize("truncate", (True, False))
def test_staging_dataset_truncate(truncate) -> None:
    dlt.config["truncate_staging_dataset"] = truncate

    @dlt.resource(write_disposition="merge", merge_key="id")
    def test_data():
        yield [{"field": 1, "id": 1}, {"field": 2, "id": 2}, {"field": 3, "id": 3}]

    pipeline = dlt.pipeline(
        pipeline_name="test_staging_cleared",
        destination="duckdb",
        dev_mode=True,
    )

    info = pipeline.run(test_data, table_name="staging_cleared")
    assert_load_info(info)

    with pipeline.sql_client() as client:
        with client.execute_query(
            f"SELECT * FROM {pipeline.dataset_name}_staging.staging_cleared"
        ) as cur:
            if truncate:
                assert len(cur.fetchall()) == 0
            else:
                assert len(cur.fetchall()) == 3

        with client.execute_query(f"SELECT * FROM {pipeline.dataset_name}.staging_cleared") as cur:
            assert len(cur.fetchall()) == 3


def test_change_naming_convention_name_collision() -> None:
    duck_ = dlt.destinations.duckdb(naming_convention="duck_case", recommended_file_size=120000)
    caps = duck_.capabilities()
    assert caps.naming_convention == "duck_case"
    assert caps.recommended_file_size == 120000

    # use duck case to load data into duckdb so casing and emoji are preserved
    pipeline = dlt.pipeline("test_change_naming_convention_name_collision", destination=duck_)
    info = pipeline.run(
        airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock")
    )
    assert_load_info(info)
    # make sure that emojis got in
    assert "🦚Peacock" in pipeline.default_schema.tables
    assert "🔑id" in pipeline.default_schema.tables["🦚Peacock"]["columns"]
    assert load_table_counts(pipeline) == {
        "📆 Schedule": 3,
        "🦚Peacock": 1,
        "🦚WidePeacock": 1,
        "🦚Peacock__peacock": 3,
        "🦚WidePeacock__Peacock": 3,
    }
    with pipeline.sql_client() as client:
        rows = client.execute_sql("SELECT 🔑id FROM 🦚Peacock")
        # 🔑id value is 1
        assert rows[0][0] == 1

    # change naming convention and run pipeline again so we generate name clashes
    os.environ["SOURCES__AIRTABLE_EMOJIS__SCHEMA__NAMING"] = "sql_ci_v1"
    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run(airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock"))
    # see conflicts early
    assert pip_ex.value.step == "extract"
    assert isinstance(pip_ex.value.__cause__, TableIdentifiersFrozen)

    # all good if we drop tables
    info = pipeline.run(
        airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock"),
        refresh="drop_resources",
    )
    assert_load_info(info)
    # case insensitive normalization
    assert load_table_counts(pipeline) == {
        "_schedule": 3,
        "_peacock": 1,
        "_widepeacock": 1,
        "_peacock__peacock": 3,
        "_widepeacock__peacock": 3,
    }


def test_change_to_more_lax_naming_convention_name_collision() -> None:
    # use snake_case which is strict and then change to duck_case which accepts snake_case names without any changes
    # still we want to detect collisions
    pipeline = dlt.pipeline(
        "test_change_to_more_lax_naming_convention_name_collision", destination="duckdb"
    )
    info = pipeline.run(
        airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock")
    )
    assert_load_info(info)
    assert "_peacock" in pipeline.default_schema.tables

    # use duck case to load data into duckdb so casing and emoji are preserved
    duck_ = dlt.destinations.duckdb(naming_convention="duck_case")

    # changing destination to one with a separate naming convention raises immediately
    with pytest.raises(TableIdentifiersFrozen):
        pipeline.run(
            airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock"),
            destination=duck_,
        )

    # refresh on the source level will work
    info = pipeline.run(
        airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock"),
        destination=duck_,
        refresh="drop_sources",
    )
    assert_load_info(info)
    # make sure that emojis got in
    assert "🦚Peacock" in pipeline.default_schema.tables


def test_change_naming_convention_column_collision() -> None:
    duck_ = dlt.destinations.duckdb(naming_convention="duck_case")

    data = {"Col": "A"}
    pipeline = dlt.pipeline("test_change_naming_convention_column_collision", destination=duck_)
    info = pipeline.run([data], table_name="data")
    assert_load_info(info)

    os.environ["SCHEMA__NAMING"] = "sql_ci_v1"
    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run([data], table_name="data")
    assert isinstance(pip_ex.value.__cause__, TableIdentifiersFrozen)


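# NOTE: assumed contract of `dlt.mark.with_file_import`, matching the five tests
# below: the file is loaded as-is, so the caller must provide the schema (via
# `columns=` or `hints=`), the path must exist, and the declared format must be one
# dlt can import; otherwise the job fails or the run raises.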
def test_import_jsonl_file() -> None:
    pipeline = dlt.pipeline(
        pipeline_name="test_jsonl_import",
        destination="duckdb",
        dev_mode=True,
    )
    columns: List[TColumnSchema] = [
        {"name": "id", "data_type": "bigint", "nullable": False},
        {"name": "name", "data_type": "text"},
        {"name": "description", "data_type": "text"},
        {"name": "ordered_at", "data_type": "date"},
        {"name": "price", "data_type": "decimal"},
    ]
    import_file = "tests/load/cases/loading/header.jsonl"
    info = pipeline.run(
        [dlt.mark.with_file_import(import_file, "jsonl", 2)],
        table_name="no_header",
        loader_file_format="jsonl",
        columns=columns,
    )
    print(info)
    assert_imported_file(pipeline, "no_header", columns, 2)

    # use hints to infer
    hints = dlt.mark.make_hints(columns=columns)
    pipeline.run(
        [dlt.mark.with_file_import(import_file, "jsonl", 2, hints=hints)],
        table_name="no_header_2",
    )
    assert_imported_file(pipeline, "no_header_2", columns, 2, expects_state=False)


def test_import_file_without_sniff_schema() -> None:
    os.environ["RAISE_ON_FAILED_JOBS"] = "false"

    pipeline = dlt.pipeline(
        pipeline_name="test_jsonl_import",
        destination="duckdb",
        dev_mode=True,
    )

    # table will not be found which is terminal exception
    import_file = "tests/load/cases/loading/header.jsonl"
    info = pipeline.run(
        [dlt.mark.with_file_import(import_file, "jsonl", 2)],
        table_name="no_header",
    )
    assert info.has_failed_jobs
    print(info)


def test_import_non_existing_file() -> None:
    pipeline = dlt.pipeline(
        pipeline_name="test_jsonl_import",
        destination="duckdb",
        dev_mode=True,
    )
    # this file does not exist
    import_file = "tests/load/cases/loading/X_header.jsonl"
    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run(
            [dlt.mark.with_file_import(import_file, "jsonl", 2)],
            table_name="no_header",
        )
    inner_ex = pip_ex.value.__cause__
    assert isinstance(inner_ex, FileImportNotFound)
    assert inner_ex.import_file_path == import_file


def test_import_unsupported_file_format() -> None:
    pipeline = dlt.pipeline(
        pipeline_name="test_jsonl_import",
        destination="duckdb",
        dev_mode=True,
    )
    # this file exists but csv cannot be imported this way
    import_file = "tests/load/cases/loading/csv_no_header.csv"
    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run(
            [dlt.mark.with_file_import(import_file, "csv", 2)],
            table_name="no_header",
        )
    inner_ex = pip_ex.value.__cause__
    assert isinstance(inner_ex, NormalizeJobFailed)
    assert isinstance(inner_ex.__cause__, SpecLookupFailed)


def test_import_unknown_file_format() -> None:
    pipeline = dlt.pipeline(
        pipeline_name="test_jsonl_import",
        destination="duckdb",
        dev_mode=True,
    )
    # the declared file format is unknown
    import_file = "tests/load/cases/loading/csv_no_header.csv"
    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run(
            [dlt.mark.with_file_import(import_file, "unknown", 2)],  # type: ignore[arg-type]
            table_name="no_header",
        )
    inner_ex = pip_ex.value.__cause__
    assert isinstance(inner_ex, NormalizeJobFailed)
    # can't figure format from extension
    assert isinstance(inner_ex.__cause__, ValueError)


def test_run_file_format_sets_table_schema() -> None:
    pipeline = dlt.pipeline(
        pipeline_name="test_run_file_format_sets_table_schema",
        destination="duckdb",
        dev_mode=True,
    )

    @dlt.resource(file_format="jsonl")
    def _data():
        yield [1, 2, 3]

    pipeline.run(_data(), loader_file_format="insert_values")
    assert pipeline.default_schema.get_table("_data")["file_format"] == "insert_values"

    pipeline.extract(_data().with_name("data_"), loader_file_format="insert_values")
    assert pipeline.default_schema.get_table("datax")["file_format"] == "insert_values"

    pipeline.extract(_data().with_name("_data_"))
    assert pipeline.default_schema.get_table("_datax")["file_format"] == "jsonl"


def test_resource_transformer_standalone() -> None:
    # requires that standalone resources are executed in a single source
    page = 1

    @dlt.resource(name="pages")
    def gen_pages():
        nonlocal page
        while True:
            yield {"page": page}
            if page == 10:
                return
            page += 1

    @dlt.transformer(name="subpages")
    def get_subpages(page_item):
        yield from [
            {
                "page": page_item["page"],
                "subpage": subpage,
            }
            for subpage in range(1, 11)
        ]

    pipeline = dlt.pipeline("test_resource_transformer_standalone", destination="duckdb")
    # here we must combine resources and transformers using the same instance
    info = pipeline.run([gen_pages, gen_pages | get_subpages])
    assert_load_info(info)
    # this works because we extract transformer and resource above in a single source so dlt optimizes
    # dag and extracts gen_pages only once.
    assert load_table_counts(pipeline) == {"subpages": 100, "pages": 10}

    # for two separate sources we have the following
    page = 1
    schema = Schema("test")
    info = pipeline.run(
        [DltSource(schema, "", [gen_pages]), DltSource(schema, "", [gen_pages | get_subpages])],
        dataset_name="new_dataset",
    )
    assert_load_info(info, 2)
    # ten subpages because only 1 page is extracted in the second source (see gen_pages exit condition)
    assert load_table_counts(pipeline) == {"subpages": 10, "pages": 10}


def test_resources_same_name_in_single_source() -> None:
    source_ids: List[int] = []

    @dlt.resource(name="pages")
    def gen_pages():
        page = 0
        # also store the id of the current source instance
        source_ids.append(id(dlt.current.source()))
        while True:
            yield {"page": page}
            if page == 10:
                return
            page += 1

    pipeline = dlt.pipeline("test_resources_same_name_in_single_source", destination="duckdb")
    info = pipeline.run([gen_pages(), gen_pages()])
    assert_load_info(info)
    # two separate sources
    assert len(set(source_ids)) == 2

    # check against different names
    source_ids.clear()
    info = pipeline.run([gen_pages().with_name("page_1"), gen_pages().with_name("page_2")])
    assert_load_info(info)
    # one source
    assert len(set(source_ids)) == 1

def test_static_staging_dataset() -> None:
    # share the database and the staging dataset
    duckdb_ = dlt.destinations.duckdb(
        "test_static_staging_dataset.db", staging_dataset_name_layout="_dlt_staging"
    )

    pipeline_1 = dlt.pipeline("test_static_staging_dataset_1", destination=duckdb_, dev_mode=True)
    pipeline_2 = dlt.pipeline("test_static_staging_dataset_2", destination=duckdb_, dev_mode=True)
    # staging append (without primary key)
    info = pipeline_1.run([1, 2, 3], table_name="digits", write_disposition="merge")
    assert_load_info(info)
    info = pipeline_2.run(["a", "b", "c", "d"], table_name="letters", write_disposition="merge")
    assert_load_info(info)
    with pipeline_1.sql_client() as client:
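        # temporarily point the sql client at the shared staging dataset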
        with client.with_alternative_dataset_name("_dlt_staging"):
            assert client.has_dataset()
            schemas = client.execute_sql("SELECT schema_name FROM _dlt_staging._dlt_version")
            assert {s[0] for s in schemas} == {
                "test_static_staging_dataset_1",
                "test_static_staging_dataset_2",
            }

    assert_table_counts(pipeline_1, {"digits": 3})
    assert_table_counts(pipeline_2, {"letters": 4})

def test_underscore_tables_and_columns() -> None:
    pipeline = dlt.pipeline("test_underscore_tables_and_columns", destination="duckdb")

    @dlt.resource
    def ids(_id=dlt.sources.incremental("_id", initial_value=2)):
        yield from [{"_id": i, "value": l} for i, l in zip([1, 2, 3], ["A", "B", "C"])]

    info = pipeline.run(ids, table_name="_ids")
    assert_load_info(info)
    print(pipeline.default_schema.to_pretty_yaml())
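    # incremental with initial_value=2 admits only rows with _id >= 2, hence two rows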
    assert pipeline.last_trace.last_normalize_info.row_counts["_ids"] == 2

def test_dlt_columns_nested_table_collisions() -> None:
    # we generate all identifiers in upper case to test for a bug where dlt columns for nested
    # tables were hardcoded to small caps. they got normalized to upper case after the first run
    # and then added again in small caps, generating duplicate columns and raising a collision
    # exception (duckdb is a case insensitive destination)
    duck = duckdb(naming_convention="tests.common.cases.normalizers.sql_upper")
    pipeline = dlt.pipeline("test_dlt_columns_child_table_collisions", destination=duck)
    customers = [
        {"id": 1, "name": "dave", "orders": [1, 2, 3]},
    ]
    assert_load_info(pipeline.run(customers, table_name="CUSTOMERS"))
    # this one would fail without the bugfix
    assert_load_info(pipeline.run(customers, table_name="CUSTOMERS"))

def test_access_pipeline_in_resource() -> None:
    pipeline = dlt.pipeline("test_access_pipeline_in_resource", destination="duckdb")

    @dlt.resource(name="user_comments")
    def comments(user_id: str):
        current_pipeline = dlt.current.pipeline()
        # find the last comment id for the given user_id by looking in the destination
        max_id: int = 0
        # on the first pipeline run, the user_comments table does not yet exist so do not check at all
        # alternatively catch DatabaseUndefinedRelation which is raised when an unknown table is selected
        if not current_pipeline.first_run:
            with current_pipeline.sql_client() as client:
                # we may get the last user comment or None which we replace with 0
                max_id = (
                    client.execute_sql(
                        "SELECT MAX(_id) FROM user_comments WHERE user_id=?", user_id
                    )[0][0]
                    or 0
                )
        # use max_id to filter our results
        yield from [
            {"_id": i, "value": l, "user_id": user_id}
            for i, l in zip([1, 2, 3], ["A", "B", "C"])
            if i > max_id
        ]

    info = pipeline.run(comments("USER_A"))
    assert_load_info(info)
    assert pipeline.last_trace.last_normalize_info.row_counts["user_comments"] == 3
    info = pipeline.run(comments("USER_A"))
    # no more data for USER_A
    assert_load_info(info, 0)
    info = pipeline.run(comments("USER_B"))
    assert_load_info(info)
    assert pipeline.last_trace.last_normalize_info.row_counts["user_comments"] == 3

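# For comparison, a minimal sketch (illustrative only, hence not named test_*) of the same
# "only new rows" behavior using the built-in incremental cursor instead of querying the
# destination by hand. The resource and column names are assumptions made for the sketch.
def _example_incremental_comments() -> None:
    @dlt.resource(name="user_comments")
    def comments(max_id=dlt.sources.incremental("_id", initial_value=0)):
        yield from [
            {"_id": i, "value": v, "user_id": "USER_A"} for i, v in zip([1, 2, 3], ["A", "B", "C"])
        ]

    pipeline = dlt.pipeline("example_incremental_comments", destination="duckdb")
    pipeline.run(comments())
    # the cursor is stored in the pipeline state, so the second run filters out all seen rows
    pipeline.run(comments())
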
def test_exceed_job_file_name_length() -> None:
    # use a very long table name both for the parent and for a child
    data = {
        "id": 1,
        "child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child": [
            1,
            2,
            3,
        ],
        "col use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child use very long table name both for parent and for a child": (
            "data"
        ),
    }

    table_name = (
        "parent use very long table name both for parent and for a child use very long table name"
        " both for parent and for a child use very long table name both for parent and for a child"
        " use very long table name both for parent and for a child use very long table name both"
        " for parent and for a child use very long table name both for parent and for a child "
    )

    pipeline = dlt.pipeline(
        pipeline_name="test_exceed_job_file_name_length",
        destination="duckdb",
    )
    # path too long
    with pytest.raises(PipelineStepFailed) as os_err:
        pipeline.run([data], table_name=table_name)
    assert isinstance(os_err.value.__cause__, OSError)

    # the identifier plus the job file suffix must fit into 255 chars; first exceed the limit by 1
    suffix_len = len(".b61d3af76c.0.insert-values.gz")
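    # suffix = "." + file id + "." + retry count + "." + file format + ".gz" appended to the table name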
    pipeline = dlt.pipeline(
        pipeline_name="test_exceed_job_file_name_length",
        destination=duckdb(
            max_identifier_length=255 - suffix_len + 1,
        ),
    )
    # path too long
    with pytest.raises(PipelineStepFailed):
        pipeline.run([data], table_name=table_name)

    pipeline = dlt.pipeline(
        pipeline_name="test_exceed_job_file_name_length",
        destination=duckdb(
            max_identifier_length=255 - suffix_len,
        ),
    )
    pipeline.run([data], table_name=table_name)

def assert_imported_file(
    pipeline: Pipeline,
    table_name: str,
    columns: List[TColumnSchema],
    expected_rows: int,
    expects_state: bool = True,
) -> None:
    assert_only_table_columns(pipeline, table_name, [col["name"] for col in columns])
    rows = load_tables_to_dicts(pipeline, table_name)
    assert len(rows[table_name]) == expected_rows
    # we should have two files loaded
    jobs = pipeline.last_trace.last_load_info.load_packages[0].jobs["completed_jobs"]
    job_extensions = []
    for job in jobs:
        parsed_file = ParsedLoadJobFileName.parse(job.job_file_info.file_name())
        ext = (
            f".{parsed_file.file_format}.gz"
            if parsed_file.is_compressed
            else f".{parsed_file.file_format}"
        )
        job_extensions.append(ext)
    assert ".jsonl" in job_extensions
    if expects_state:
        assert ".insert_values.gz" in job_extensions
    # check in the extract trace that jsonl is really there
    extract_info = pipeline.last_trace.last_extract_info
    jobs = extract_info.load_packages[0].jobs["new_jobs"]
    # find the jsonl job
    jsonl_job = next(job for job in jobs if job.job_file_info.table_name == table_name)
    assert jsonl_job.job_file_info.file_format == "jsonl"
    # find metrics for the table
    assert (
        extract_info.metrics[extract_info.loads_ids[0]][0]["table_metrics"][table_name].items_count
        == expected_rows
    )

def test_duckdb_column_invalid_timestamp() -> None:
    # DuckDB has no timestamp type with both timezone and precision; the timezone flag wins
    @dlt.resource(
        columns={"event_tstamp": {"data_type": "timestamp", "timezone": True, "precision": 3}},
        primary_key="event_id",
    )
    def events():
        yield [{"event_id": 1, "event_tstamp": "2024-07-30T10:00:00.123+00:00"}]

    pipeline = dlt.pipeline(destination="duckdb")
    pipeline.run(events())

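# A quick illustrative check (not a test) of what dlt inferred for the column above: assuming
# the default schema detections recognize ISO timestamp strings, the value round-trips to a
# "timestamp" column even without explicit hints.
def _example_inspect_timestamp_column() -> None:
    pipeline = dlt.pipeline(pipeline_name="example_inspect_ts", destination="duckdb")
    pipeline.run(
        [{"event_id": 1, "event_tstamp": "2024-07-30T10:00:00.123+00:00"}],
        table_name="events",
    )
    # dlt stores the inferred data type in the default schema
    col = pipeline.default_schema.tables["events"]["columns"]["event_tstamp"]
    assert col["data_type"] == "timestamp"
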
def test_push_table_with_upfront_schema() -> None:
    # infer the schema
    pipeline = dlt.pipeline(pipeline_name="push_table_infer_pipeline", destination="duckdb")
    info = pipeline.run(_get_shuffled_events())
    assert_load_info(info)

    # get the resource as a table
    data = list(_get_shuffled_events())

    # save into another pipeline
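    # the version hash fingerprints the schema content and changes only when tables or columns change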
    infer_hash = pipeline.default_schema.version_hash
    copy_pipeline = dlt.pipeline(pipeline_name="push_table_copy_pipeline", destination="duckdb")
    info = copy_pipeline.run(
        data, table_name="_get_shuffled_events", schema=pipeline.default_schema
    )
    copy_schema = copy_pipeline.default_schema
    # make sure that the schema hash didn't change - we didn't change anything in the data
    assert copy_pipeline.default_schema.version_hash == infer_hash
    copy_pipeline = dlt.pipeline(pipeline_name="push_table_copy_pipeline", destination="duckdb")
    info = copy_pipeline.run(data, table_name="_get_shuffled_events", schema=copy_schema)
    assert copy_pipeline.default_schema.version_hash == infer_hash
    copy_schema = copy_pipeline.default_schema

    # another table
    copy_pipeline = dlt.pipeline(pipeline_name="push_table_copy_pipeline", destination="duckdb")
    info = copy_pipeline.run(data, table_name="events", schema=copy_schema)
    assert copy_pipeline.default_schema.version_hash != infer_hash

def test_pipeline_with_sources_sharing_schema() -> None:
    schema = Schema("shared")

    @dlt.source(schema=schema, max_table_nesting=1)
    def source_1():
        @dlt.resource(primary_key="user_id")
        def gen1():
            dlt.current.source_state()["source_1"] = True
            dlt.current.resource_state()["source_1"] = True
            yield {"id": "Y", "user_id": "user_y"}

        @dlt.resource(columns={"value": {"data_type": "bool"}})
        def conflict():
            yield True

        return gen1, conflict

    @dlt.source(schema=schema, max_table_nesting=2)
    def source_2():
        @dlt.resource(primary_key="id")
        def gen1():
            dlt.current.source_state()["source_2"] = True
            dlt.current.resource_state()["source_2"] = True
            yield {"id": "X", "user_id": "user_X"}

        def gen2():
            yield from "CDE"

        @dlt.resource(columns={"value": {"data_type": "text"}}, selected=False)
        def conflict():
            yield "indeed"

        return gen2, gen1, conflict

    # all selected tables with hints should be there
    discover_1 = source_1().discover_schema()
    assert "gen1" in discover_1.tables
    assert discover_1.tables["gen1"]["columns"]["user_id"]["primary_key"] is True
    assert "data_type" not in discover_1.tables["gen1"]["columns"]["user_id"]
    assert "conflict" in discover_1.tables
    assert discover_1.tables["conflict"]["columns"]["value"]["data_type"] == "bool"

    discover_2 = source_2().discover_schema()
    assert "gen1" in discover_2.tables
    assert "gen2" in discover_2.tables
    # conflict is deselected
    assert "conflict" not in discover_2.tables

    p = dlt.pipeline(pipeline_name="multi", destination="duckdb", dev_mode=True)
    p.extract([source_1(), source_2()])
    default_schema = p.default_schema
    gen1_table = default_schema.tables["gen1"]
    assert "user_id" in gen1_table["columns"]
    assert "id" in gen1_table["columns"]
    assert "conflict" in default_schema.tables
    assert "gen2" in default_schema.tables
    p.normalize()
    assert "gen2" in default_schema.tables
    assert default_schema.tables["conflict"]["columns"]["value"]["data_type"] == "bool"
    p.load()
    table_names = [t["name"] for t in default_schema.data_tables()]
    counts = load_table_counts(p, *table_names)
    assert counts == {"gen1": 2, "gen2": 3, "conflict": 1}
    # both sources share the same state
    assert p.state["sources"] == {
        "shared": {
            "source_1": True,
            "resources": {"gen1": {"source_1": True, "source_2": True}},
            "source_2": True,
        }
    }

    # same pipeline but enable conflict
    p.extract([source_2().with_resources("conflict")])
    p.normalize()
    assert default_schema.tables["conflict"]["columns"]["value"]["data_type"] == "text"
    with pytest.raises(PipelineStepFailed):
        # will generate a failed job on the type that does not match
        p.load()
    counts = load_table_counts(p, "conflict")
    assert counts == {"conflict": 1}

    # alter the table in duckdb
    with p.sql_client() as client:
        client.execute_sql("ALTER TABLE conflict ALTER value TYPE VARCHAR;")
    p.run([source_2().with_resources("conflict")])
    counts = load_table_counts(p, "conflict")
    assert counts == {"conflict": 2}

def test_many_pipelines_single_dataset() -> None:
    schema = Schema("shared")

    @dlt.source(schema=schema, max_table_nesting=1)
    def source_1():
        @dlt.resource(primary_key="user_id")
        def gen1():
            dlt.current.source_state()["source_1"] = True
            dlt.current.resource_state()["source_1"] = True
            yield {"id": "Y", "user_id": "user_y"}

        return gen1

    @dlt.source(schema=schema, max_table_nesting=2)
    def source_2():
        @dlt.resource(primary_key="id")
        def gen1():
            dlt.current.source_state()["source_2"] = True
            dlt.current.resource_state()["source_2"] = True
            yield {"id": "X", "user_id": "user_X"}

        def gen2():
            yield from "CDE"

        return gen2, gen1

    # load source_1 to a common dataset
    p = dlt.pipeline(
        pipeline_name="source_1_pipeline", destination="duckdb", dataset_name="shared_dataset"
    )
    p.run(source_1(), credentials="duckdb:///test_quack.duckdb")
    counts = load_table_counts(p, *p.default_schema.tables.keys())
    assert counts.items() >= {"gen1": 1, "_dlt_pipeline_state": 1, "_dlt_loads": 1}.items()
    p._wipe_working_folder()
    p.deactivate()

    p = dlt.pipeline(
        pipeline_name="source_2_pipeline", destination="duckdb", dataset_name="shared_dataset"
    )
    p.run(source_2(), credentials="duckdb:///test_quack.duckdb")
    # table_names = [t["name"] for t in p.default_schema.data_tables()]
    counts = load_table_counts(p, *p.default_schema.tables.keys())
    # gen1: one record comes from source_1, 1 record from source_2
    assert counts.items() >= {"gen1": 2, "_dlt_pipeline_state": 2, "_dlt_loads": 2}.items()
    # assert counts == {'gen1': 2, 'gen2': 3}
    p._wipe_working_folder()
    p.deactivate()

    # restore from the destination, check the state
    p = dlt.pipeline(
        pipeline_name="source_1_pipeline",
        destination=dlt.destinations.duckdb(credentials="duckdb:///test_quack.duckdb"),
        dataset_name="shared_dataset",
    )
    p.sync_destination()
    # we have our separate state
    assert p.state["sources"]["shared"] == {
        "source_1": True,
        "resources": {"gen1": {"source_1": True}},
    }
    # but the schema was common so we have the earliest one
    assert "gen2" in p.default_schema.tables
    p._wipe_working_folder()
    p.deactivate()

    p = dlt.pipeline(
        pipeline_name="source_2_pipeline",
        destination=dlt.destinations.duckdb(credentials="duckdb:///test_quack.duckdb"),
        dataset_name="shared_dataset",
    )
    p.sync_destination()
    # we have our separate state
    assert p.state["sources"]["shared"] == {
        "source_2": True,
        "resources": {"gen1": {"source_2": True}},
    }

def test_nested_hints_file_format() -> None:
    @dlt.resource(nested_hints={"list": {"file_format": "csv"}})
    def nested_data():
        yield [{"id": 1, "list": [1, 2, 3]}]

    p = dlt.pipeline(
        pipeline_name="test_nested_hints_file_format",
        destination=dlt.destinations.filesystem(TEST_STORAGE_ROOT),
        dataset_name="local",
    )
    p.extract(nested_data())
    # file_format is set
    assert p.default_schema.tables["nested_data__list"]["file_format"] == "csv"
    # TODO: has no effect (we have a single item storage per root table)
    normalize_info = p.normalize()
    load_id = normalize_info.loads_ids[0]
    norm_metrics = normalize_info.metrics[load_id][0]
    for file_name, _ in norm_metrics["job_metrics"].items():
        # always jsonl
        assert file_name.endswith("jsonl.gz")

def test_nested_hints_write_disposition_append_replace() -> None:
    # we can mix replace and append write dispositions in nested tables,
    # even though that does not make much sense
    @dlt.resource(
        nested_hints={"list": dlt.mark.make_nested_hints(write_disposition="replace")},
        write_disposition="append",
    )
    def nested_data():
        yield [{"id": 1, "list": [1, 2, 3]}]

    p = dlt.pipeline(
        pipeline_name="test_nested_hints_write_disposition_append_replace",
        destination="duckdb",
        dataset_name="local",
    )
    load_info = p.run(nested_data())
    assert_load_info(load_info)
    load_info = p.run(nested_data())
    assert_load_info(load_info)
    # nested_data 2 records (append), nested_data__list 3 records (replace - overwritten)
    assert p.dataset().row_counts().fetchall() == [("nested_data", 2), ("nested_data__list", 3)]

def test_nested_hints_write_disposition_replace_replace() -> None:
    # if the nested hints have the same write disposition as the root - all is good
    @dlt.resource(
        nested_hints={"list": dlt.mark.make_nested_hints(write_disposition="replace")},
        write_disposition="replace",
    )
    def nested_data():
        yield [{"id": 1, "list": [1, 2, 3]}]

    p = dlt.pipeline(
        pipeline_name="test_nested_hints_write_disposition_replace_replace",
        destination="duckdb",
        dataset_name="local",
    )
    load_info = p.run(nested_data())
    assert_load_info(load_info)
    load_info = p.run(nested_data())
    assert_load_info(load_info)
    # nested_data 1 record (replace), nested_data__list 3 records (replace - overwritten)
    assert p.dataset().row_counts().fetchall() == [("nested_data", 1), ("nested_data__list", 3)]

def test_nested_hints_write_disposition_nested_merge() -> None:
    # checks what happens when a nested table has write_disposition "merge" and the root table is
    # replace / append. this combination will keep the nested table in the staging dataset and
    # never copy it to the final dataset.
    @dlt.resource(
        nested_hints={"list": dlt.mark.make_nested_hints(write_disposition="merge")},
        write_disposition="replace",
        primary_key="id",
    )
    def nested_data():
        yield [{"id": 1, "list": [1, 2, 3]}]

    p = dlt.pipeline(
        pipeline_name="test_nested_hints_write_disposition_nested_merge",
        destination="duckdb",
        dataset_name="local",
    )
    load_info = p.run(nested_data())
    assert_load_info(load_info)
    load_info = p.run(nested_data())
    assert_load_info(load_info)
    # nested_data__list is not copied to the main dataset
    assert p.dataset().row_counts().fetchall() == [("nested_data", 1), ("nested_data__list", 0)]
    # it keeps loading into staging and is always overwritten but never merged
    staging_dataset = dlt.dataset(p.destination, "local_staging", p.default_schema)
    assert staging_dataset.row_counts(table_names=["nested_data__list"]).fetchall() == [
        ("nested_data__list", 3)
    ]

def test_nested_hints_write_disposition_merge() -> None:
    os.environ["RAISE_ON_FAILED_JOBS"] = "false"

    @dlt.resource(
        nested_hints={"list": dlt.mark.make_nested_hints(write_disposition="replace")},
        write_disposition="merge",
        primary_key="id",
    )
    def nested_data():
        yield [{"id": 1, "list": [1, 2, 3]}]

    p = dlt.pipeline(
        pipeline_name="test_nested_hints_write_disposition_merge",
        destination="duckdb",
        dataset_name="local",
    )
    # generates a package error (nested_data__list is not added to the staging dataset and will not be merged, should we fix it?)
    load_info = p.run(nested_data())
    assert load_info.has_failed_jobs is True

def test_nested_hints_in_variants() -> None:
    # we create table schemas at runtime via table variants, none of the names are normalized
    data = [
        {"Event": "issue", "DataBlob": [{"ID": 1, "Name": "first", "Date": "2024-01-01"}]},
        {"Event": "purchase", "DataBlob": [{"PID": "20-1", "Name": "first", "Value": "28.34"}]},
    ]

    @dlt.resource
    def nested_events():
        # create two table variants
        yield dlt.mark.with_hints(
            data[0],
            dlt.mark.make_hints(
                "IssuesTable",
                nested_hints={
                    "DataBlob": dlt.mark.make_nested_hints(
                        columns=[{"name": "Date", "data_type": "date"}]
                    )
                },
            ),
            create_table_variant=True,
        )
        yield dlt.mark.with_hints(
            data[1],
            dlt.mark.make_hints(
                "PurchasesTable",
                nested_hints={
                    "DataBlob": dlt.mark.make_nested_hints(
                        columns=[{"name": "Value", "data_type": "decimal"}]
                    )
                },
            ),
            create_table_variant=True,
        )

    p = dlt.pipeline(
        pipeline_name="test_nested_hints_in_variants", destination="duckdb", dataset_name="local"
    )
    r_ = nested_events()
    load_info = p.run(r_)
    assert_load_info(load_info)
    # we have the variants stored in the resource (before name normalization)
    assert r_._hints_variants == {
        "IssuesTable": {
            "columns": {},
            "write_disposition": "append",
            "table_name": "IssuesTable",
            "nested_hints": {
                "DataBlob": {
                    "columns": {"Date": {"name": "Date", "data_type": "date"}},
                    "original_columns": [{"name": "Date", "data_type": "date"}],
                }
            },
        },
        "PurchasesTable": {
            "columns": {},
            "write_disposition": "append",
            "table_name": "PurchasesTable",
            "nested_hints": {
                "DataBlob": {
                    "columns": {"Value": {"name": "Value", "data_type": "decimal"}},
                    "original_columns": [{"name": "Value", "data_type": "decimal"}],
                }
            },
        },
    }
    # check the table schemas
    assert "issues_table" in p.default_schema.tables
    assert "purchases_table" in p.default_schema.tables
    issues_table__data_blob = p.default_schema.tables["issues_table__data_blob"]
    assert issues_table__data_blob["columns"]["date"]["data_type"] == "date"
    assert "write_disposition" not in issues_table__data_blob

    purchases_table__data_blob = p.default_schema.tables["purchases_table__data_blob"]
    assert purchases_table__data_blob["columns"]["value"]["data_type"] == "decimal"

    # make sure that _dlt_load_id was generated only for the root tables
    assert "_dlt_load_id" not in issues_table__data_blob["columns"]
    assert "_dlt_load_id" not in purchases_table__data_blob["columns"]
    assert "_dlt_load_id" in p.default_schema.tables["issues_table"]["columns"]
    assert "_dlt_load_id" in p.default_schema.tables["purchases_table"]["columns"]

def test_nested_hints_primary_key() -> None:
    @dlt.resource(
        primary_key="id",
        write_disposition="merge",
        nested_hints={
            "purchases": dlt.mark.make_nested_hints(
                # the column hint is optional - it makes sure that customer_id is the first column in the table
                columns=[{"name": "customer_id", "data_type": "bigint"}],
                primary_key=["customer_id", "id"],
                write_disposition="merge",
                references=[
                    {
                        "referenced_table": "customers",
                        "columns": ["customer_id"],
                        "referenced_columns": ["id"],
                    }
                ],
            )
        },
    )
    def customers():
        """Load customer data from a simple python list."""
        yield [
            {
                "id": 1,
                "name": "simon",
                "city": "berlin",
                "purchases": [{"id": 1, "name": "apple", "price": Decimal("1.50")}],
            },
            {
                "id": 2,
                "name": "violet",
                "city": "london",
                "purchases": [{"id": 1, "name": "banana", "price": Decimal("1.70")}],
            },
            {
                "id": 3,
                "name": "tammo",
                "city": "new york",
                "purchases": [{"id": 1, "name": "pear", "price": Decimal("2.50")}],
            },
        ]

    def _pushdown_customer_id(row):
        id_ = row["id"]
        for purchase in row["purchases"]:
            purchase["customer_id"] = id_
        return row

    p = dlt.pipeline(
        pipeline_name="test_nested_hints_primary_key", destination="duckdb", dataset_name="local"
    )
    load_info = p.run(customers().add_map(_pushdown_customer_id))
    assert_load_info(load_info)
    customers__purchases = p.default_schema.tables["customers__purchases"]
    # primary_key controls the order of the columns
    assert list(customers__purchases["columns"].keys())[:2] == ["customer_id", "id"]
    # check if the primary key is set
    assert customers__purchases["columns"]["customer_id"]["primary_key"] is True
    assert customers__purchases["columns"]["id"]["primary_key"] is True
    # make sure that _dlt_load_id was generated
    assert "_dlt_load_id" in customers__purchases["columns"]
    assert "_dlt_load_id" in p.default_schema.tables["customers"]["columns"]
    # check the counts
    row_count = p.dataset().row_counts().fetchall()
    assert row_count == [("customers", 3), ("customers__purchases", 3)]
    # load again, merge should overwrite the rows
    load_info = p.run(customers().add_map(_pushdown_customer_id))
    assert p.dataset().row_counts().fetchall() == row_count

def test_merge_without_root_key() -> None:
    @dlt.source(root_key=False)
    def double_nested():
        @dlt.resource(
            primary_key="id",
            write_disposition="merge",
        )
        def customers():
            """Load customer data from a simple python list."""
            yield [
                {
                    "id": 1,
                    "name": "simon",
                    "city": "berlin",
                    "purchases": [{"id": 1, "name": "apple", "price": Decimal("1.50")}],
                },
                {
                    "id": 2,
                    "name": "violet",
                    "city": "london",
                    "purchases": [{"id": 1, "name": "banana", "price": Decimal("1.70")}],
                },
                {
                    "id": 3,
                    "name": "tammo",
                    "city": "new york",
                    "purchases": [{"id": 1, "name": "pear", "price": Decimal("2.50")}],
                },
            ]

        return customers

    p = dlt.pipeline(
        pipeline_name="test_nested_hints_primary_key", destination="duckdb", dataset_name="local"
    )
    s_ = double_nested()
    # load twice. merge should work as usual
    p.run(s_)
    p.run(s_)
    # no root key
    assert (
        get_first_column_name_with_prop(p.default_schema.tables["customers__purchases"], "root_key")
        is None
    )
    # no data duplication (merge worked)
    assert_table_counts(p, {"customers__purchases": 3, "customers": 3})

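# Conversely, a minimal illustrative sketch (not a test), assuming root_key=True propagates a
# root key column to nested tables and marks it with the "root_key" hint so merge can relink
# nested rows to their root rows.
def _example_merge_with_root_key() -> None:
    @dlt.source(root_key=True)
    def nested_source():
        @dlt.resource(primary_key="id", write_disposition="merge")
        def customers():
            yield [{"id": 1, "purchases": [{"id": 1, "name": "apple"}]}]

        return customers

    p = dlt.pipeline("example_merge_with_root_key", destination="duckdb")
    p.run(nested_source())
    # under the stated assumption, a root key column exists on the nested table
    assert (
        get_first_column_name_with_prop(p.default_schema.tables["customers__purchases"], "root_key")
        is not None
    )
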
def test_pipeline_repr() -> None:
    sentinel = object()
    p = dlt.pipeline(pipeline_name="repr_pipeline", destination="duckdb")

    repr_ = p.__repr__()
    assert isinstance(repr_, str)
    assert "dlt.pipeline(" in repr_

    # check that the properties used by `__repr__` exist
    assert getattr(p, "pipeline_name", sentinel) is not sentinel
    assert getattr(p, "_destination", sentinel) is not sentinel
    # we know `._destination` is set on this Pipeline
    assert getattr(p._destination, "destination_name", sentinel) is not sentinel
    assert getattr(p, "_staging", sentinel) is not sentinel
    # NOTE we also expect `_staging.destination_name` to exist
    assert getattr(p, "dataset_name", sentinel) is not sentinel
    assert getattr(p, "default_schema_name", sentinel) is not sentinel
    assert getattr(p, "schema_names", sentinel) is not sentinel
    assert getattr(p, "first_run", sentinel) is not sentinel
    assert getattr(p, "dev_mode", sentinel) is not sentinel
    assert getattr(p, "is_active", sentinel) is not sentinel
    assert getattr(p, "pipelines_dir", sentinel) is not sentinel
    assert getattr(p, "working_dir", sentinel) is not sentinel

def test_repr_after_loading_trace(tmp_path):
    pipeline_name = "foo"
    trace_path = tmp_path / pipeline_name / "trace.pickle"
    pipeline = dlt.pipeline(pipeline_name, destination="duckdb", pipelines_dir=tmp_path)
    pipeline.run([{"foo": "bar"}], table_name="temp")
    trace = pickle.load(trace_path.open("rb"))

    # ensure that calling `__repr__()` does not raise any error
    trace.__repr__()

def test_pipeline_with_null_executors(monkeypatch) -> None:
    # NOTE: emscripten forces the null executor, this is tested in test_runners.py
    monkeypatch.setattr(sys, "platform", "emscripten")
    p = dlt.pipeline(pipeline_name="null_executor", destination="duckdb")
    p.run([{"id": 1}], table_name="test_table")
    assert p.dataset().row_counts().fetchall() == [("test_table", 1)]

@skipifwindows
@pytest.mark.parametrize("sig", (signals.signal.SIGINT, signals.signal.SIGTERM))
@pytest.mark.forked
def test_signal_graceful_load_step_shutdown(sig: int) -> None:
    # NOTE: forked tests do not show any console/logs

    # do not create an additional job for the state sync. that creates a race condition during pool drain
    os.environ["RESTORE_FROM_DESTINATION"] = "False"

    @dlt.destination
    def wait_until_signal(item, schema):
        # exit if signalled
        while not signals.was_signal_received():
            signals.sleep(1)
        # some more sleep to let the pipeline load pool drain
        signals.sleep(2)

    pipeline = dlt.pipeline(
        "signal_waiter",
        destination=wait_until_signal(),
        dataset_name="_data",
        progress=DictCollector(),
    )

    def _thread() -> None:
        # wait until the pipeline gets into the load step
        while not pipeline.collector.step or not pipeline.collector.step.startswith("Load"):
            signals.sleep(0.1)

        signals.sleep(1)
        # send the signal to drain the pool and stop the load
        os.kill(os.getpid(), sig)

    p = DummyProcess(target=_thread)
    p.start()

    # should end gracefully
    load_info = pipeline.run([1, 2, 3], table_name="digits")
    assert_load_info(load_info)

@skipifwindows
@pytest.mark.parametrize("start_new_jobs_on_signal", (True, False))
@pytest.mark.forked
def test_signal_graceful_complete_load_step(start_new_jobs_on_signal: bool) -> None:
    # the test setup makes sure that only one job at a time is started, so we can send the
    # signal after the first job is in the pool and expect the second one to get completed
    os.environ["LOAD__START_NEW_JOBS_ON_SIGNAL"] = str(start_new_jobs_on_signal)
    os.environ["RESTORE_FROM_DESTINATION"] = "False"

    @dlt.destination(loader_parallelism_strategy="sequential")
    def wait_until_signal(item, schema):
        # exit if signalled
        while not signals.was_signal_received():
            signals.sleep(1)
        # some more sleep to let the pipeline load pool drain
        signals.sleep(2)

    pipeline = dlt.pipeline(
        "signal_waiter",
        destination=wait_until_signal(),
        dataset_name="_data",
        progress=DictCollector(),
    )

    def _thread() -> None:
        # wait until the pipeline gets into the load step
        while not pipeline.collector.step or not pipeline.collector.step.startswith("Load"):
            signals.sleep(0.1)

        signals.sleep(0.5)
        # send the signal to drain the pool and stop the load
        os.kill(os.getpid(), signals.signal.SIGTERM)

    p = DummyProcess(target=_thread)
    p.start()

    if start_new_jobs_on_signal:
        # should end gracefully
        load_info = pipeline.run(
            [dlt.resource([1, 2, 3], name="digits"), dlt.resource(["a", "b", "c"], name="letters")]
        )
        assert_load_info(load_info)
        # sanity check on the load info
        load_package = load_info.load_packages[0]
        assert load_info.finished_at is not None
        assert load_info.started_at is not None
        assert load_package.completed_at is not None
        assert load_package.state == "loaded"

        metrics = load_info.metrics[load_package.load_id][0]["job_metrics"]
        assert len(metrics) == 2
        for metric in metrics.values():
            assert metric.started_at is not None
            assert metric.finished_at is not None
            assert metric.state == "completed"
            assert metric.retry_count == 0

        # two tables completed
        completed_job_count = 2
        new_job_count = 0
    else:
        with pytest.raises(PipelineStepFailed) as pip_ex:
            pipeline.run(
                [
                    dlt.resource([1, 2, 3], name="digits"),
                    dlt.resource(["a", "b", "c"], name="letters"),
                ]
            )
        assert isinstance(pip_ex.value.__cause__, SignalReceivedException)
        assert pip_ex.value.__cause__.signal_code == 15  # SIGTERM
        # the load info is in the exception
        load_info = pip_ex.value.step_info  # type: ignore

        # sanity check on the load info
        load_package = load_info.load_packages[0]
        assert load_info.finished_at is None
        assert load_info.started_at is not None
        assert load_package.completed_at is None
        assert load_package.state == "normalized"

        # just one job should go into the job pool (read from the package into the ready state)
        metrics = load_info.metrics[load_package.load_id][0]["job_metrics"]
        # no more jobs because we drained the pool after the first
        assert len(metrics) == 1
        for metric in metrics.values():
            assert metric.started_at is not None
            assert metric.finished_at is not None
            # completed because we drained
            assert metric.state == "completed"
            assert metric.retry_count == 0

        # one table completed
        completed_job_count = 1
        new_job_count = 1

    assert len(load_info.load_packages[0].jobs["completed_jobs"]) == completed_job_count
    assert len(load_info.load_packages[0].jobs["new_jobs"]) == new_job_count

@skipifwindows
@pytest.mark.forked
def test_ignore_signals_in_load() -> None:
    os.environ["PIPELINES__SIGNAL_WAITER__RUNTIME__INTERCEPT_SIGNALS"] = "False"
    os.environ["RESTORE_FROM_DESTINATION"] = "False"

    job_started = False

    @dlt.destination(loader_parallelism_strategy="sequential")
    def wait_until_signal(item, schema):
        nonlocal job_started

        signals.sleep(0.2)
        job_started = True

        # exit if signalled
        while not signals.was_signal_received():
            signals.sleep(1)
        raise KeyboardInterrupt()

    pipeline = dlt.pipeline(
        "signal_waiter",
        destination=wait_until_signal(),
        dataset_name="_data",
        progress=DictCollector(),
    )

    def _thread() -> None:
        # wait until the pipeline gets into the load step
        while not pipeline.collector.step or not pipeline.collector.step.startswith("Load"):
            signals.sleep(0.1)

        # wait for the job to run
        while not job_started:
            signals.sleep(0.1)

        # sleep before the signal to let the pool collect jobs and set metrics
        signals.sleep(2)

        # send the signal to drain the pool and stop the load
        os.kill(os.getpid(), signals.signal.SIGINT)

    p = DummyProcess(target=_thread)
    p.start()

    # should raise on KeyboardInterrupt - delayed signals are disabled
    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run(dlt.resource([1, 2, 3], name="digits"))
    assert isinstance(pip_ex.value.__cause__, KeyboardInterrupt)
    # must have metrics and a package
    assert pip_ex.value.step == "load"
    load_info: LoadInfo = pip_ex.value.step_info  # type: ignore
    assert len(load_info.load_packages) == 1
    load_id = load_info.loads_ids[0]
    assert load_info.load_packages[0].load_id == load_id
    assert load_info.started_at is not None
    assert load_info.finished_at is None
    metrics = load_info.metrics[load_id][0]["job_metrics"]
    assert len(metrics) == 1
    for metric in metrics.values():
        assert metric.started_at is not None
        assert metric.finished_at is None
        # killed in the middle
        assert metric.state == "running"
        assert metric.retry_count == 0
    # stop the destination
    signals.set_received_signal(signals.signal.SIGINT)

@skipifwindows
@pytest.mark.parametrize("sig", (signals.signal.SIGINT, signals.signal.SIGTERM))
@pytest.mark.forked
def test_signal_graceful_load_step_shutdown_pipeline_in_thread(sig: int) -> None:
    # NOTE: forked tests do not show any console/logs

    @dlt.destination
    def wait_until_signal(item, schema):
        # exit if signalled
        while not signals.was_signal_received():
            signals.sleep(1)
        # some more sleep to let the pipeline load pool drain
        signals.sleep(2)

    pipeline = dlt.pipeline(
        "signal_waiter",
        destination=wait_until_signal(),
        dataset_name="_data",
        progress=DictCollector(),
    )

    def _thread() -> None:
        # should end gracefully
        load_info = pipeline.run([1, 2, 3], table_name="digits")
        assert_load_info(load_info)

    # we can nest those freely
    with signals.intercepted_signals():
        with signals.intercepted_signals():
            p = threading.Thread(target=_thread)
            p.start()

            # wait until the pipeline gets into the load step
            while not pipeline.collector.step or not pipeline.collector.step.startswith("Load"):
                signals.sleep(0.1)

            signals.sleep(1)
            # send the signal to drain the pool and stop the load
            os.kill(os.getpid(), sig)

            # must join inside the context manager
            p.join()

@skipifwindows
@pytest.mark.parametrize("sig", (signals.signal.SIGINT,))  # signals.signal.SIGTERM
@pytest.mark.forked
def test_signal_force_load_step_shutdown(sig: int) -> None:
    os.environ["RESTORE_FROM_DESTINATION"] = "False"
    _done = False

    @dlt.destination
    def wait_forever(item, schema):
        # never exit
        from time import sleep

        # this one does not wake up on the signal
        while not _done:
            sleep(1)

        # make the job fail if it gets here
        raise KeyboardInterrupt()

    pipeline = dlt.pipeline(
        "signal_waiter", destination=wait_forever(), dataset_name="_data", progress=DictCollector()
    )

    def _thread() -> None:
        # wait until the pipeline gets into the load step
        while not pipeline.collector.step or not pipeline.collector.step.startswith("Load"):
            signals.sleep(0.1)

        # wait for the load pool to complete at least once
        signals.sleep(1.5)
        # send the signal to drain the pool and stop the load
        os.kill(os.getpid(), sig)
        signals.sleep(0.5)
        # send again to kill
        os.kill(os.getpid(), sig)

    p = DummyProcess(target=_thread)
    p.start()

    # should raise a regular pipeline exception
    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run([1, 2, 3], table_name="digits")
    assert isinstance(pip_ex.value.__cause__, KeyboardInterrupt)
    # check the load info
    load_info: LoadInfo = pip_ex.value.step_info  # type: ignore
    assert len(load_info.load_packages) == 1
    load_id = load_info.loads_ids[0]
    assert load_info.load_packages[0].load_id == load_id
    assert load_info.started_at is not None
    assert load_info.finished_at is None
    metrics = load_info.metrics[load_id][0]["job_metrics"]
    assert len(metrics) == 1
    for metric in metrics.values():
        assert metric.started_at is not None
        assert metric.finished_at is None
        # killed in the middle
        assert metric.state == "running"
        assert metric.retry_count == 0

    # now we have a hanging `wait_forever` in the job pool. the load step exited after a short wait & warning
    _done = True

@skipifwindows
@pytest.mark.parametrize("sig", (signals.signal.SIGINT,))  # signals.signal.SIGTERM
@pytest.mark.forked
def test_signal_extract_step_shutdown(sig: int) -> None:
    os.environ["RESTORE_FROM_DESTINATION"] = "False"

    _done = False

    @dlt.resource
    def wait_forever():
        # never exit
        from time import sleep

        # this one does not wake up on the signal
        while not _done:
            sleep(1)

        # make the job fail if it gets here
        raise KeyboardInterrupt()

    pipeline = dlt.pipeline(
        "signal_waiter", destination="dummy", dataset_name="_data", progress=DictCollector()
    )

    def _thread() -> None:
        # wait until the pipeline gets into the extract step
        while not pipeline.collector.step or not pipeline.collector.step.startswith("Extract"):
            signals.sleep(0.1)

        # the extract step does not set signals
        os.kill(os.getpid(), sig)
        signals.sleep(0.5)

    p = DummyProcess(target=_thread)
    p.start()

    # should raise a regular pipeline exception
    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run(wait_forever())
    assert pip_ex.value.step == "extract"
    assert isinstance(pip_ex.value.__cause__, KeyboardInterrupt)
    # check the extract info
    extract_info: ExtractInfo = pip_ex.value.step_info  # type: ignore
    assert len(extract_info.load_packages) == 1
    load_id = extract_info.loads_ids[0]
    assert extract_info.load_packages[0].load_id == load_id
    # the package is always completed
    assert extract_info.started_at is not None
    assert extract_info.finished_at is not None
    # no jobs got collected. running jobs are not included
    metrics = extract_info.metrics[load_id][0]["job_metrics"]
    assert len(metrics) == 0

    # now we have a hanging `wait_forever` in the worker pool. the extract step exited after a short wait & warning
    _done = True

def test_uninitialized_source_factory() -> None:
    """Test that passing an uncalled source factory preserves resource-level hints such as write_disposition"""
    data = [
        {"id": 1, "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}},
        {"id": 4, "name": "charmander", "size": {"weight": 8.5, "height": 0.6}},
        {"id": 25, "name": "pikachu", "size": {"weight": 6, "height": 0.4}},
    ]

    @dlt.resource(
        name="pokemon_resource",
        write_disposition="merge",
        primary_key="id",
    )
    def pokemon() -> TDataItems:
        yield data

    @dlt.source(name="pokemon_source")
    def pokemon_source():
        yield pokemon

    pipeline = dlt.pipeline(
        pipeline_name=uniq_id(),
        destination="duckdb",
        dataset_name="pokemon_data",
    )

    pipeline.run(pokemon_source)
    pipeline.run(pokemon_source)

    with pipeline.sql_client() as client:
        rows = client.execute_sql("SELECT * FROM pokemon_resource")

    assert len(rows) == 3

def test_pending_package_exception_warning() -> None:
    partially = False
    terminally = False

    @dlt.destination
    def fail_load(item, schema):
        signals.sleep(0.1)

        # fail one of the resources
        if partially and terminally and schema["name"] == "letters":
            raise DestinationTerminalException("load job failed")
        if partially and schema["name"] == "emojis":
            return

        raise RuntimeError("load job retry")

    @dlt.resource
    def fail_extract():
        # make the job fail if it gets here
        raise KeyboardInterrupt()

    pipeline = dlt.pipeline(
        "fail_at_every_step",
        destination=fail_load(),
        dataset_name="_data",
    )
    pipeline.config.restore_from_destination = False

    # a failure in extract should not generate any warnings: nothing is pending in the pipeline
    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run(fail_extract())

    assert pip_ex.value.step == "extract"
    assert "Pending packages" not in str(pip_ex.value)
    assert "partially loaded" not in str(pip_ex.value)
    assert pip_ex.value.load_id is not None
    assert pip_ex.value.is_package_partially_loaded is False
    assert pip_ex.value.has_pending_data is False

    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run(
            [
                dlt.resource([1, 2, 3], name="digits"),
                dlt.resource(["a", "b", "c"], name="letters"),
                dlt.resource(["🤷", "⭐", "🐬"], name="emojis"),
            ]
        )

    # none of the jobs passed so we have a pending package but not a partial one
    assert pip_ex.value.step == "load"
    assert "Pending packages" in str(pip_ex.value)
    assert "partially loaded" not in str(pip_ex.value)
    assert pip_ex.value.load_id is not None
    assert pip_ex.value.is_package_partially_loaded is False
    assert pip_ex.value.has_pending_data is True

    partially = True
    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run()

    # some jobs passed, some did not; still not aborted
    assert pip_ex.value.step == "load"
    assert "Pending packages" in str(pip_ex.value)
    assert "partially loaded" in str(pip_ex.value)
    assert pip_ex.value.load_id is not None
    assert pip_ex.value.is_package_partially_loaded is True
    assert pip_ex.value.has_pending_data is True

    terminally = True
    with pytest.raises(PipelineStepFailed) as pip_ex:
        pipeline.run()

    # one of the jobs failed and the package is aborted. sometimes the other
    # job completed, sometimes it is still pending so we disable the pending test
    assert pip_ex.value.step == "load"
    # assert "Pending packages" not in str(pip_ex.value)
    assert "partially loaded" in str(pip_ex.value)
    assert pip_ex.value.load_id is not None
    assert pip_ex.value.is_package_partially_loaded is True
    # assert pip_ex.value.has_pending_data is False

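# A minimal sketch of retrying transient load failures with tenacity's `Retrying` and the
# `retry_load` predicate imported at the top of this module (illustrative only, not a test):
# exceptions raised in the listed pipeline steps re-run the pipeline, terminal ones re-raise.
def _example_retry_load() -> None:
    pipeline = dlt.pipeline("example_retry_load", destination="duckdb")
    for attempt in Retrying(
        stop=stop_after_attempt(3),
        retry=retry_if_exception(retry_load(("load",))),
        reraise=True,
    ):
        with attempt:
            pipeline.run([1, 2, 3], table_name="digits")
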
def test_cleanup() -> None:
    # this must happen after all forked tests (problems with test teardowns in other tests)
    pass