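"""Tests for dlt pipeline run traces: trace creation and per-step metrics, trace
serialization, persistence in the pipeline working directory, anonymous telemetry
events and Slack notifications."""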
from copy import deepcopy
import io
import os
import asyncio
import datetime  # noqa: 251
from typing import Any, List
from unittest.mock import patch
import pytest
import requests_mock
import yaml

import dlt

from dlt.common import json
from dlt.common.configuration.specs import CredentialsConfiguration, RuntimeConfiguration
from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContainer
from dlt.common.pipeline import ExtractInfo, NormalizeInfo, LoadInfo
from dlt.common.schema import Schema
from dlt.common.runtime.telemetry import stop_telemetry
from dlt.common.typing import DictStrAny, DictStrStr, TSecretValue
from dlt.common.utils import digest128

from dlt.destinations import dummy, filesystem

from dlt.pipeline.exceptions import PipelineStepFailed
from dlt.pipeline.pipeline import Pipeline
from dlt.pipeline.trace import (
    PipelineTrace,
    SerializableResolvedValueTrace,
    load_trace,
)
from dlt.pipeline.track import slack_notify_load_success
from dlt.extract import DltResource, DltSource
from dlt.extract.extract import describe_extract_data
from dlt.extract.pipe import Pipe

from tests.pipeline.utils import PIPELINE_TEST_CASES_PATH
from tests.utils import TEST_STORAGE_ROOT, start_test_telemetry, temporary_telemetry


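# test_create_trace drives a single pipeline through extract (including a failing async
# source), normalize, load and run, and checks the recorded trace steps, per-load-id
# metrics, load package states and masked resolved config/secret values.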
def test_create_trace(toml_providers: ConfigProvidersContainer, environment: Any) -> None:
    dlt.secrets["load.delete_completed_jobs"] = True

    @dlt.source
    def inject_tomls(
        api_type=dlt.config.value,
        credentials: CredentialsConfiguration = dlt.secrets.value,
        secret_value: TSecretValue = TSecretValue("123"),  # noqa: B008
    ):
        @dlt.resource(write_disposition="replace", primary_key="id")
        def data():
            yield [{"id": 1}, {"id": 2}, {"id": 3}]

        return data()

    p = dlt.pipeline(destination="dummy")

    # read from secrets and configs directly
    databricks_creds = "databricks+connector://token:<databricks_token>@<databricks_host>:443/<database_or_schema_name>?conn_timeout=15&search_path=a,b,c"
    s = dlt.secrets["databricks.credentials"]
    assert s == databricks_creds

    extract_info = p.extract(inject_tomls())
    trace = p.last_trace
    assert trace is not None
    # assert p._trace is None
    assert len(trace.steps) == 1
    step = trace.steps[0]
    assert step.step == "extract"
    assert isinstance(step.started_at, datetime.datetime)
    assert isinstance(step.finished_at, datetime.datetime)
    assert isinstance(step.step_info, ExtractInfo)
    assert step.step_info.extract_data_info == [{"name": "inject_tomls", "data_type": "source"}]
    # check infos
    extract_info = p.last_trace.last_extract_info
    assert isinstance(extract_info, ExtractInfo)
    # should have single job and single load id
    assert len(extract_info.loads_ids) == 1
    load_id = extract_info.loads_ids[0]
    assert len(extract_info.metrics) == 1

    # extract of data in the first one
    metrics = extract_info.metrics[load_id][0]
    # inject tomls and dlt state
    assert len(metrics["job_metrics"]) == 1
    assert "data" in metrics["table_metrics"]
    assert set(metrics["resource_metrics"].keys()) == {"data"}
    assert metrics["schema_name"] == "inject_tomls"
    # check dag and hints
    assert metrics["dag"] == [("data", "data")]
    assert metrics["hints"]["data"] == {"write_disposition": "replace", "primary_key": "id"}

    metrics = extract_info.metrics[load_id][1]
    # inject tomls and dlt state
    assert len(metrics["job_metrics"]) == 1
    assert "_dlt_pipeline_state" in metrics["table_metrics"]
    assert set(metrics["resource_metrics"].keys()) == {"_dlt_pipeline_state"}
    assert metrics["schema_name"] == "inject_tomls"
    # check dag and hints
    assert metrics["dag"] == [("_dlt_pipeline_state", "_dlt_pipeline_state")]
    # state has explicit columns set
    assert metrics["hints"]["_dlt_pipeline_state"]["original_columns"] == "dict"

    # check packages
    assert len(extract_info.load_packages) == 1
    # two jobs
    print(extract_info.load_packages[0])
    assert len(extract_info.load_packages[0].jobs["new_jobs"]) == 2
    assert extract_info.load_packages[0].state == "extracted"

    # check config trace
    resolved = _find_resolved_value(trace.resolved_config_values, "api_type", [])
    assert resolved.config_type_name == "TestCreateTraceInjectTomlsConfiguration"
    assert resolved.value == "REST"
    assert resolved.is_secret_hint is False
    assert resolved.default_value is None
    assert resolved.provider_name == "config.toml"
    # dictionaries are not returned anymore, secrets are masked
    resolved = _find_resolved_value(trace.resolved_config_values, "credentials", [])
    assert resolved is None or isinstance(resolved.value, str)
    resolved = _find_resolved_value(trace.resolved_config_values, "secret_value", [])
    assert resolved.is_secret_hint is True
    assert resolved.value is None, "Credential is not masked"
    assert resolved.default_value is None, "Credential is not masked"
    resolved = _find_resolved_value(trace.resolved_config_values, "credentials", ["databricks"])
    assert resolved.is_secret_hint is True
    assert resolved.value is None, "Credential is not masked"
    assert_trace_serializable(trace)

    # activate pipeline because other was running in assert trace
    p.activate()

    # extract with exception
    @dlt.source
    def async_exception(max_range=1):
        async def get_val(v):
            await asyncio.sleep(0.1)
            if v % 3 == 0:
                raise ValueError(v)
            return v

        @dlt.resource
        def data():
            yield from [get_val(v) for v in range(1, max_range)]

        return data()

    with pytest.raises(PipelineStepFailed):
        p.extract(async_exception())

    trace = p.last_trace
    assert p._trace is None
    assert len(trace.steps) == 2
    step = trace.steps[1]
    assert step.step == "extract"
    assert isinstance(step.step_exception, str)
    assert isinstance(step.step_info, ExtractInfo)
    assert len(step.exception_traces) > 0
    assert step.step_info.extract_data_info == [{"name": "async_exception", "data_type": "source"}]
    assert_trace_serializable(trace)

    extract_info = step.step_info
    # only new (unprocessed) package is present, all other metrics are empty, state won't be extracted
    assert len(extract_info.loads_ids) == 1
    load_id = extract_info.loads_ids[0]
    package = extract_info.load_packages[0]
    assert package.state == "new"
    # no jobs - exceptions happened before save
    assert len(package.jobs["new_jobs"]) == 0
    # metrics should be collected
    assert len(extract_info.metrics[load_id]) == 1

    # normalize
    norm_info = p.normalize()
    trace = p.last_trace
    assert p._trace is None
    assert len(trace.steps) == 3
    step = trace.steps[2]
    assert step.step == "normalize"
    assert step.step_info is norm_info
    assert_trace_serializable(trace)
    assert isinstance(p.last_trace.last_normalize_info, NormalizeInfo)
    assert p.last_trace.last_normalize_info.row_counts == {"_dlt_pipeline_state": 1, "data": 3}

    assert len(norm_info.loads_ids) == 1
    load_id = norm_info.loads_ids[0]
    assert len(norm_info.metrics) == 1

    # just one load package with single metrics
    assert len(norm_info.metrics[load_id]) == 1
    norm_metrics = norm_info.metrics[load_id][0]
    # inject tomls and dlt state
    assert len(norm_metrics["job_metrics"]) == 2
    assert "data" in norm_metrics["table_metrics"]

    # check packages
    assert len(norm_info.load_packages) == 1
    # two jobs
    assert len(norm_info.load_packages[0].jobs["new_jobs"]) == 2
    assert norm_info.load_packages[0].state == "normalized"

    # load
    os.environ["COMPLETED_PROB"] = "1.0"  # make it complete immediately
    load_info = p.load()
    trace = p.last_trace
    assert p._trace is None
    assert len(trace.steps) == 4
    step = trace.steps[3]
    assert step.step == "load"
    assert step.step_info is load_info
    load_info = step.step_info  # type: ignore[assignment]

    # check packages
    assert len(load_info.load_packages) == 1
    # two jobs
    assert load_info.load_packages[0].state == "loaded"
    assert len(load_info.load_packages[0].jobs["completed_jobs"]) == 2

    resolved = _find_resolved_value(trace.resolved_config_values, "completed_prob", [])
    assert resolved.is_secret_hint is False
    assert resolved.value == "1.0"
    assert resolved.config_type_name == "DummyClientConfiguration"
    assert_trace_serializable(trace)
    assert isinstance(p.last_trace.last_load_info, LoadInfo)
    p.activate()

    # run resets the trace
    load_info = inject_tomls().run()
    trace = p.last_trace
    assert p._trace is None
    assert len(trace.steps) == 4  # extract, normalize, load, run
    step = trace.steps[-1]  # the last one should be run
    assert step.step == "run"
    assert step.step_info is load_info
    assert trace.steps[0].step_info is not extract_info

    step = trace.steps[-2]  # the previous one should be load
    assert step.step == "load"
    assert step.step_info is load_info  # same load info
    assert trace.steps[0].step_info is not extract_info
    assert_trace_serializable(trace)
    assert isinstance(p.last_trace.last_load_info, LoadInfo)
    assert isinstance(p.last_trace.last_normalize_info, NormalizeInfo)
    assert isinstance(p.last_trace.last_extract_info, ExtractInfo)


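# test_trace_schema serializes traces (including an exception trace) produced with a dummy
# destination and filesystem staging, infers a "trace" schema from them and loads them again
# against a stored schema contract with schema_contract="freeze".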
def test_trace_schema() -> None:
    os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True"
    os.environ["RESTORE_FROM_DESTINATION"] = "False"

    # mock runtime env
    os.environ["CIRCLECI"] = "1"
    os.environ["AWS_LAMBDA_FUNCTION_NAME"] = "lambda"

    @dlt.source(section="many_hints")
    def many_hints(
        api_type=dlt.config.value,
        credentials: str = dlt.secrets.value,
        secret_value: TSecretValue = TSecretValue("123"),  # noqa: B008
    ):
        # TODO: create table / column schema from typed dicts, not explicitly
        @dlt.resource(
            write_disposition="replace",
            primary_key="id",
            table_format="delta",
            file_format="jsonl",
            schema_contract="evolve",
            columns=[
                {
                    "name": "multi",
                    "data_type": "decimal",
                    "nullable": True,
                    "cluster": True,
                    "description": "unknown",
                    "merge_key": True,
                    "precision": 9,
                    "scale": 3,
                    "sort": True,
                    "variant": True,
                    "partition": True,
                }
            ],
        )
        def data():
            yield [{"id": 1, "multi": "1.2"}, {"id": 2}, {"id": 3}]

        return data()

    @dlt.source
    def github():
        @dlt.resource
        def get_shuffled_events():
            for _ in range(1):
                with open(
                    "tests/normalize/cases/github.events.load_page_1_duck.json",
                    "r",
                    encoding="utf-8",
                ) as f:
                    issues = json.load(f)
                    yield issues

        return get_shuffled_events()

    @dlt.source
    def async_exception(max_range=1):
        async def get_val(v):
            await asyncio.sleep(0.1)
            if v % 3 == 0:
                raise ValueError(v)
            return v

        @dlt.resource
        def data():
            yield from [get_val(v) for v in range(1, max_range)]

        return data()

    # create pipeline with staging to get remote_url in load step job_metrics
    dummy_dest = dummy(completed_prob=1.0)
    pipeline = dlt.pipeline(
        pipeline_name="test_trace_schema",
        destination=dummy_dest,
        staging=filesystem(os.path.abspath(os.path.join(TEST_STORAGE_ROOT, "_remote_filesystem"))),
        dataset_name="various",
    )

    # mock config
    os.environ["API_TYPE"] = "REST"
    os.environ["SOURCES__MANY_HINTS__CREDENTIALS"] = "CREDS"

    pipeline.run([many_hints(), github()])

    trace = pipeline.last_trace
    pipeline._schema_storage.storage.save("trace.json", json.dumps(trace, pretty=True))

    schema = dlt.Schema("trace")
    trace_pipeline = dlt.pipeline(
        pipeline_name="test_trace_schema_traces", destination=dummy(completed_prob=1.0)
    )
    trace_pipeline.run([trace], table_name="trace", schema=schema)

    # add exception trace
    with pytest.raises(PipelineStepFailed):
        pipeline.extract(async_exception(max_range=4))

    trace_exception = pipeline.last_trace
    pipeline._schema_storage.storage.save(
        "trace_exception.json", json.dumps(trace_exception, pretty=True)
    )

    trace_pipeline.run([trace_exception], table_name="trace")
    inferred_trace_contract = trace_pipeline.schemas["trace"]
    inferred_contract_str = inferred_trace_contract.to_pretty_yaml(remove_processing_hints=True)

    # NOTE: this saves actual inferred contract (schema) to schema storage, move it to test cases if you update
    # trace shapes
    # TODO: create a proper schema for dlt trace and tables/columns
    pipeline._schema_storage.storage.save("trace.schema.yaml", inferred_contract_str)
    # print(pipeline._schema_storage.storage.storage_path)

    # load the schema and use it as contract
    with open(f"{PIPELINE_TEST_CASES_PATH}/contracts/trace.schema.yaml", encoding="utf-8") as f:
        imported_schema = yaml.safe_load(f)
    trace_contract = Schema.from_dict(imported_schema, remove_processing_hints=True)
    # compare pretty forms of the schemas, they must be identical
    # NOTE: if this fails you can comment this out and use contract run below to find first offending difference
    # assert trace_contract.to_pretty_yaml() == inferred_contract_str

    # use trace contract to load data again
    contract_trace_pipeline = dlt.pipeline(
        pipeline_name="test_trace_schema_traces_contract", destination=dummy(completed_prob=1.0)
    )
    contract_trace_pipeline.run(
        [trace_exception, trace],
        table_name="trace",
        schema=trace_contract,
        schema_contract="freeze",
    )

    # assert inferred_trace_contract.version_hash == trace_contract.version_hash

    # print(trace_pipeline.schemas["trace"].to_pretty_yaml())
    # print(pipeline._schema_storage.storage.storage_path)


# def test_trace_schema_contract() -> None:


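# test_save_load_trace checks that the trace written to the pipeline working directory
# can be reloaded with load_trace() and that a trace is also saved when a step raises.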
def test_save_load_trace() -> None:
    os.environ["COMPLETED_PROB"] = "1.0"
    info = dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy")
    pipeline = dlt.pipeline()
    # will get trace from working dir
    trace = pipeline.last_trace
    assert trace is not None
    assert pipeline._trace is None
    assert len(trace.steps) == 4 == len(info.pipeline.last_trace.steps)  # type: ignore[attr-defined]
    step = trace.steps[-2]  # the previous to last one should be load
    assert step.step == "load"
    resolved = _find_resolved_value(trace.resolved_config_values, "completed_prob", [])
    assert resolved.is_secret_hint is False
    assert resolved.value == "1.0"
    assert resolved.config_type_name == "DummyClientConfiguration"
    assert_trace_serializable(trace)
    # check row counts
    assert pipeline.last_trace.last_normalize_info.row_counts == {
        "_dlt_pipeline_state": 1,
        "data": 3,
    }
    # reactivate the pipeline
    pipeline.activate()

    # load trace and check if all elements are present
    loaded_trace = load_trace(pipeline.working_dir)
    print(loaded_trace.asstr(2))
    assert len(trace.steps) == 4
    loaded_trace_dict = deepcopy(loaded_trace.asdict())
    trace_dict = deepcopy(trace.asdict())
    assert loaded_trace_dict == trace_dict
    # do it again to check if we are not popping
    assert loaded_trace_dict == loaded_trace.asdict()
    assert trace_dict == trace.asdict()

    # exception also saves trace
    @dlt.resource
    def data():
        raise NotImplementedError()
        yield

    with pytest.raises(PipelineStepFailed) as py_ex:
        dlt.run(data(), destination="dummy")
    # there's the same pipeline in exception as in previous run
    assert py_ex.value.pipeline is info.pipeline
    trace = load_trace(py_ex.value.pipeline.working_dir)
    assert trace is not None
    assert pipeline._trace is None
    assert len(trace.steps) == 2  # extract with exception, also has run with exception
    step = trace.steps[-2]
    assert step.step == "extract"
    assert step.step_exception is not None
    run_step = trace.steps[-1]
    assert run_step.step == "run"
    assert run_step.step_exception is not None
    assert step.step_exception == run_step.step_exception
    assert_trace_serializable(trace)
    assert pipeline.last_trace.last_normalize_info is None


def test_save_load_empty_trace() -> None:
    os.environ["COMPLETED_PROB"] = "1.0"
    os.environ["RESTORE_FROM_DESTINATION"] = "false"
    pipeline = dlt.pipeline()
    pipeline.run([], table_name="data", destination="dummy")
    trace = pipeline.last_trace
    assert_trace_serializable(trace)
    assert len(trace.steps) == 4

    pipeline.activate()

    # load trace and check if all elements are present
    loaded_trace = load_trace(pipeline.working_dir)
    print(loaded_trace.asstr(2))
    assert len(trace.steps) == 4
    loaded_trace_dict = deepcopy(loaded_trace.asdict())
    trace_dict = deepcopy(trace.asdict())
    assert loaded_trace_dict == trace_dict
    # do it again to check if we are not popping
    assert loaded_trace_dict == loaded_trace.asdict()
    assert trace_dict == trace.asdict()


def test_disable_trace(environment: DictStrStr) -> None:
    environment["ENABLE_RUNTIME_TRACE"] = "false"
    environment["COMPLETED_PROB"] = "1.0"
    dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy")
    assert dlt.pipeline().last_trace is None


def test_trace_on_restore_state(environment: DictStrStr) -> None:
    environment["COMPLETED_PROB"] = "1.0"

    def _sync_destination_patch(
        self: Pipeline, destination: str = None, staging: str = None, dataset_name: str = None
    ):
        # just wipe the pipeline simulating deleted dataset
        self._wipe_working_folder()
        self._configure(
            self._schema_storage_config.export_schema_path,
            self._schema_storage_config.import_schema_path,
            False,
        )

    with patch.object(Pipeline, "sync_destination", _sync_destination_patch):
        dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy")
        assert len(dlt.pipeline().last_trace.steps) == 4
        assert dlt.pipeline().last_trace.last_normalize_info.row_counts == {
            "_dlt_pipeline_state": 1,
            "data": 3,
        }


def test_load_none_trace() -> None:
    p = dlt.pipeline()
    assert load_trace(p.working_dir) is None


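# test_trace_telemetry patches the sentry and anonymous tracker before_send hooks (see the
# _mock_* helpers at the bottom of the module) and inspects the events emitted for
# successful, failing and destination-less pipeline runs.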
def test_trace_telemetry(temporary_telemetry: RuntimeConfiguration) -> None:
    with (
        patch("dlt.common.runtime.sentry.before_send", _mock_sentry_before_send),
        patch("dlt.common.runtime.anon_tracker.before_send", _mock_anon_tracker_before_send),
    ):
        ANON_TRACKER_SENT_ITEMS.clear()
        SENTRY_SENT_ITEMS.clear()
        # make dummy fail all files
        os.environ["FAIL_PROB"] = "1.0"
        # but do not raise exceptions
        os.environ["RAISE_ON_FAILED_JOBS"] = "false"
        load_info = dlt.pipeline().run(
            [1, 2, 3], table_name="data", destination="dummy", dataset_name="data_data"
        )
        # we should have 4 tracker items
        assert len(ANON_TRACKER_SENT_ITEMS) == 4
        expected_steps = ["extract", "normalize", "load", "run"]
        for event, step in zip(ANON_TRACKER_SENT_ITEMS, expected_steps):
            assert event["event"] == f"pipeline_{step}"
            assert event["properties"]["success"] is True
            assert event["properties"]["destination_name"] == "dummy"
            assert event["properties"]["destination_type"] == "dlt.destinations.dummy"
            assert event["properties"]["pipeline_name_hash"] == digest128(
                load_info.pipeline.pipeline_name
            )
            assert event["properties"]["dataset_name_hash"] == digest128(
                load_info.pipeline.dataset_name
            )
            assert event["properties"]["default_schema_name_hash"] == digest128(
                load_info.pipeline.default_schema_name
            )
            assert isinstance(event["properties"]["elapsed"], float)
            assert isinstance(event["properties"]["transaction_id"], str)
            # check extract info
            if step == "extract":
                assert event["properties"]["extract_data"] == [{"name": "", "data_type": "int"}]
            if step == "load":
                # dummy has empty fingerprint
                assert event["properties"]["destination_fingerprint"] == ""
        #
        # we have two failed files (state and data) that should be logged by sentry
        # print(SENTRY_SENT_ITEMS)
        # for item in SENTRY_SENT_ITEMS:
        #     # print(item)
        #     print(item["logentry"]["message"])
        # assert len(SENTRY_SENT_ITEMS) == 4

        # trace with exception
        @dlt.resource
        def data():
            raise NotImplementedError()
            yield

        ANON_TRACKER_SENT_ITEMS.clear()
        SENTRY_SENT_ITEMS.clear()
        with pytest.raises(PipelineStepFailed):
            dlt.pipeline().run(data, destination="dummy")
        assert len(ANON_TRACKER_SENT_ITEMS) == 2
        event = ANON_TRACKER_SENT_ITEMS[0]
        assert event["event"] == "pipeline_extract"
        assert event["properties"]["success"] is False
        assert event["properties"]["destination_name"] == "dummy"
        assert event["properties"]["destination_type"] == "dlt.destinations.dummy"
        assert isinstance(event["properties"]["elapsed"], float)
        # check extract info
        if step == "extract":
            assert event["properties"]["extract_data"] == [
                {"name": "data", "data_type": "resource"}
            ]
        # we didn't log any errors
        assert len(SENTRY_SENT_ITEMS) == 0

        # trace without destination and dataset
        p = dlt.pipeline(pipeline_name="fresh").drop()
        ANON_TRACKER_SENT_ITEMS.clear()
        SENTRY_SENT_ITEMS.clear()
        p.extract([1, 2, 3], table_name="data")
        event = ANON_TRACKER_SENT_ITEMS[0]
        assert event["event"] == "pipeline_extract"
        assert event["properties"]["success"] is True
        assert event["properties"]["destination_name"] is None
        assert event["properties"]["destination_type"] is None
        assert event["properties"]["pipeline_name_hash"] == digest128("fresh")
        assert event["properties"]["dataset_name_hash"] is None
        assert event["properties"]["default_schema_name_hash"] == digest128(p.default_schema_name)

        # trace with dataset name
        p = dlt.pipeline(pipeline_name="fresh", dataset_name="fresh_dataset").drop()
        ANON_TRACKER_SENT_ITEMS.clear()
        SENTRY_SENT_ITEMS.clear()
        p.extract([1, 2, 3], table_name="data")
        event = ANON_TRACKER_SENT_ITEMS[0]
        assert event["properties"]["dataset_name_hash"] == digest128(p.dataset_name)


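# describe_extract_data summarizes extracted items as (name, data_type) pairs; the cases
# below cover sources, resources, lists of both, plain dicts and DataFrames.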
def test_extract_data_describe() -> None:
    schema = Schema("test")
    assert describe_extract_data(DltSource(schema, "sect")) == [
        {"name": "test", "data_type": "source"}
    ]
    assert describe_extract_data(DltResource(Pipe("rrr_extract"), None, False)) == [
        {"name": "rrr_extract", "data_type": "resource"}
    ]
    assert describe_extract_data([DltSource(schema, "sect")]) == [
        {"name": "test", "data_type": "source"}
    ]
    assert describe_extract_data([DltResource(Pipe("rrr_extract"), None, False)]) == [
        {"name": "rrr_extract", "data_type": "resource"}
    ]
    assert describe_extract_data(
        [DltResource(Pipe("rrr_extract"), None, False), DltSource(schema, "sect")]
    ) == [{"name": "rrr_extract", "data_type": "resource"}, {"name": "test", "data_type": "source"}]
    assert describe_extract_data([{"a": "b"}]) == [{"name": "", "data_type": "dict"}]
    from pandas import DataFrame

    # we assume that List content has same type
    assert describe_extract_data([DataFrame(), {"a": "b"}]) == [
        {"name": "", "data_type": "DataFrame"}
    ]
    # first unnamed element in the list breaks checking info
    assert describe_extract_data(
        [DltResource(Pipe("rrr_extract"), None, False), DataFrame(), DltSource(schema, "sect")]
    ) == [{"name": "rrr_extract", "data_type": "resource"}, {"name": "", "data_type": "DataFrame"}]


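# test_slack_hook intercepts the Slack incoming webhook with requests_mock and checks the
# notification sent by slack_notify_load_success after a successful load.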
def test_slack_hook(environment: DictStrStr) -> None:
    stop_telemetry()
    hook_url = "https://hooks.slack.com/services/T04DHMAF13Q/B04E7B1MQ1H/TDHEI123WUEE"
    environment["COMPLETED_PROB"] = "1.0"
    environment["GITHUB_USER"] = "rudolfix"
    environment["RUNTIME__DLTHUB_TELEMETRY"] = "False"
    environment["RUNTIME__SLACK_INCOMING_HOOK"] = hook_url
    with requests_mock.mock() as m:
        m.post(hook_url, json={})
        load_info = dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy")
        assert slack_notify_load_success(load_info.pipeline.runtime_config.slack_incoming_hook, load_info, load_info.pipeline.last_trace) == 200  # type: ignore[attr-defined]
    assert m.called
    message = m.last_request.json()
    assert "rudolfix" in message["text"]
    assert "dummy" in message["text"]


def test_broken_slack_hook(environment: DictStrStr) -> None:
    environment["COMPLETED_PROB"] = "1.0"
    environment["RUNTIME__SLACK_INCOMING_HOOK"] = "http://localhost:22"
    load_info = dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy")
    # connection error
    assert slack_notify_load_success(load_info.pipeline.runtime_config.slack_incoming_hook, load_info, load_info.pipeline.last_trace) == -1  # type: ignore[attr-defined]
    # pipeline = dlt.pipeline()
    # assert pipeline.last_trace is not None
    # assert pipeline._trace is None
    # trace = load_trace(info.pipeline.working_dir)
    # assert len(trace.steps) == 4
    # run_step = trace.steps[-1]
    # assert run_step.step == "run"
    # assert run_step.step_exception is None


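# module-level helpers and collectors used by the tests above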
def _find_resolved_value(
    resolved: List[SerializableResolvedValueTrace], key: str, sections: List[str]
) -> SerializableResolvedValueTrace:
    return next((v for v in resolved if v.key == key and v.sections == sections), None)


ANON_TRACKER_SENT_ITEMS = []


def _mock_anon_tracker_before_send(event: DictStrAny) -> DictStrAny:
    ANON_TRACKER_SENT_ITEMS.append(event)
    return event


SENTRY_SENT_ITEMS = []


def _mock_sentry_before_send(event: DictStrAny, _unused_hint: Any = None) -> DictStrAny:
    SENTRY_SENT_ITEMS.append(event)
    return event


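# assert_trace_serializable round-trips a trace through str/asstr/asdict/json and loads it
# into an in-memory duckdb pipeline to make sure it serializes without errors.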
def assert_trace_serializable(trace: PipelineTrace) -> None:
    str(trace)
    trace.asstr(0)
    trace.asstr(1)
    trace_dict = deepcopy(trace.asdict())
    # check if we do not pop
    assert trace_dict == trace.asdict()
    with io.BytesIO() as b:
        json.typed_dump(trace, b, pretty=True)
        b.getvalue()
    json.dumps(trace)

    # load trace to duckdb
    from dlt.destinations import duckdb

    trace_pipeline = dlt.pipeline("trace", destination=duckdb(":pipeline:")).drop()
    trace_pipeline.run([trace], table_name="trace_data")

    # print(trace_pipeline.default_schema.to_pretty_yaml())