# dlt/tests/load/test_dummy_client.py
import os
from concurrent.futures import ThreadPoolExecutor
from time import sleep, time
from unittest import mock
import pytest
from unittest.mock import patch
from typing import List, Tuple

from dlt.common.exceptions import TerminalException, TerminalValueError
from dlt.common.storages import FileStorage, PackageStorage, ParsedLoadJobFileName
from dlt.common.storages.configuration import FilesystemConfiguration
from dlt.common.storages.load_package import TPackageJobState
from dlt.common.storages.load_storage import JobFileFormatUnsupported
from dlt.common.destination import AnyDestination
from dlt.common.destination.client import RunnableLoadJob
from dlt.common.schema.utils import (
    fill_hints_from_parent_and_clone_table,
    get_nested_tables,
    get_root_table,
)
from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration
from dlt.destinations import dummy, filesystem
from dlt.destinations.impl.dummy import dummy as dummy_impl
from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration
from dlt.load import Load
from dlt.load.configuration import LoaderConfiguration
from dlt.load.exceptions import (
    LoadClientJobFailed,
    LoadClientJobRetry,
    TableChainFollowupJobCreationFailedException,
    FollowupJobCreationFailedException,
)
from dlt.load.utils import get_completed_table_chain, init_client, _extend_tables_with_table_chain

from tests.utils import (
    MockPipeline,
    clean_test_storage,
    init_test_logging,
    TEST_DICT_CONFIG_PROVIDER,
)
from tests.load.utils import prepare_load_package
from tests.utils import skip_if_not_active, TEST_STORAGE_ROOT

skip_if_not_active("dummy")

NORMALIZED_FILES = [
    "event_user.839c6e6b514e427687586ccc65bf133f.0.jsonl",
    "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl",
]
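
# Job file names appear to follow `<table_name>.<file_id>.<retry_count>.<file_format>`;
# ParsedLoadJobFileName parses them below, and the retry tests rely on the third
# component being bumped whenever a job is sent back to the new jobs folder.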
SMALL_FILES = ["event_user.1234.0.jsonl", "event_loop_interrupted.1234.0.jsonl"]

REMOTE_FILESYSTEM = os.path.abspath(os.path.join(TEST_STORAGE_ROOT, "_remote_filesystem"))


@pytest.fixture(autouse=True)
def storage() -> FileStorage:
    return clean_test_storage(init_normalize=True, init_loader=True)


@pytest.fixture(scope="module", autouse=True)
def logger_autouse() -> None:
    init_test_logging()
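

# The tests below exercise the `dummy` destination, which simulates job outcomes via
# configured probabilities: `completed_prob=1.0` completes every job immediately,
# `fail_prob=1.0` fails it terminally and `retry_prob=1.0` raises a transient error
# that sends the job back to the new jobs folder with an increased retry count.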


def test_spool_job_started() -> None:
    # the default config completes jobs immediately
    load = setup_loader()
    load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    files = load.load_storage.normalized_packages.list_new_jobs(load_id)
    assert len(files) == 2
    jobs: List[RunnableLoadJob] = []
    for f in files:
        job = load.submit_job(f, load_id, schema)
        assert job.state() == "completed"
        assert type(job) is dummy_impl.LoadDummyJob
        # the job ran, but is not moved yet (the loader will do this)
        assert load.load_storage.normalized_packages.storage.has_file(
            load.load_storage.normalized_packages.get_job_file_path(
                load_id, PackageStorage.STARTED_JOBS_FOLDER, job.file_name()
            )
        )
        assert_job_metrics(job, "completed")
        jobs.append(job)
    remaining_jobs, finalized_jobs, _ = load.complete_jobs(load_id, jobs, schema)
    assert len(remaining_jobs) == 0
    assert len(finalized_jobs) == 2
    assert len(load._job_metrics) == 2
    for job in jobs:
        assert load._job_metrics[job.job_id()] == job.metrics()


def test_unsupported_writer_type() -> None:
    load = setup_loader()
    load_id, _ = prepare_load_package(
        load.load_storage, ["event_bot.181291798a78198.0.unsupported_format"]
    )
    with pytest.raises(TerminalValueError):
        load.load_storage.list_new_jobs(load_id)


def test_unsupported_write_disposition() -> None:
    # tests terminal error when retrieving the job
    load = setup_loader()
    load_id, schema = prepare_load_package(load.load_storage, [NORMALIZED_FILES[0]])
    # mock an unsupported disposition
    schema.get_table("event_user")["write_disposition"] = "skip"
    # write back the schema
    load.load_storage.normalized_packages.save_schema(load_id, schema)
    with pytest.raises(LoadClientJobFailed) as e:
        with ThreadPoolExecutor() as pool:
            load.run(pool)
    assert "LoadClientUnsupportedWriteDisposition" in e.value.failed_message


def test_big_loadpackages() -> None:
    """
    This test guards against changes in the load step that make processing
    large load packages exponentially slower.
    """
    load = setup_loader()
    # make the loop faster by basically not sleeping
    load._run_loop_sleep_duration = 0.001
    load_id, schema = prepare_load_package(load.load_storage, SMALL_FILES, jobs_per_case=500)
    start_time = time()
    with ThreadPoolExecutor(max_workers=20) as pool:
        load.run(pool)
    duration = float(time() - start_time)
    # sanity check
    assert duration > 2
    # we want 1000 empty processed jobs to need less than 15 seconds total (locally it runs in 5)
    assert duration < 15
    # we should have 1000 jobs processed
    assert len(dummy_impl.JOBS) == 1000
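
# Note: SMALL_FILES covers two tables and `jobs_per_case=500` creates 500 jobs for
# each of them, which is where the 1000 processed jobs asserted above come from.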


def test_get_new_jobs_info() -> None:
    load = setup_loader()
    load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    # no write disposition specified - get all new jobs
    assert len(load.get_new_jobs_info(load_id)) == 2


def test_get_completed_table_chain_single_job_per_table() -> None:
    load = setup_loader()
    load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    # update tables so we have all possible hints
    for table_name, table in schema.tables.items():
        schema.tables[table_name] = fill_hints_from_parent_and_clone_table(schema.tables, table)

    top_job_table = get_root_table(schema.tables, "event_user")
    all_jobs = load.load_storage.normalized_packages.list_all_jobs_with_states(load_id)
    assert get_completed_table_chain(schema, all_jobs, top_job_table) is None
    # fake being completed
    assert (
        len(
            get_completed_table_chain(
                schema,
                all_jobs,
                top_job_table,
                "event_user.839c6e6b514e427687586ccc65bf133f.jsonl",
            )
        )
        == 1
    )
    # actually complete
    loop_top_job_table = get_root_table(schema.tables, "event_loop_interrupted")
    load.load_storage.normalized_packages.start_job(
        load_id, "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl"
    )
    all_jobs = load.load_storage.normalized_packages.list_all_jobs_with_states(load_id)
    assert get_completed_table_chain(schema, all_jobs, loop_top_job_table) is None
    load.load_storage.normalized_packages.complete_job(
        load_id, "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl"
    )
    all_jobs = load.load_storage.normalized_packages.list_all_jobs_with_states(load_id)
    assert get_completed_table_chain(schema, all_jobs, loop_top_job_table) == [
        schema.get_table("event_loop_interrupted")
    ]
    assert get_completed_table_chain(
        schema, all_jobs, loop_top_job_table, "event_user.839c6e6b514e427687586ccc65bf133f.0.jsonl"
    ) == [schema.get_table("event_loop_interrupted")]


def test_spool_job_failed() -> None:
    # this config fails the job on start
    load = setup_loader(client_config=DummyClientConfiguration(fail_prob=1.0))
    load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    files = load.load_storage.normalized_packages.list_new_jobs(load_id)
    jobs: List[RunnableLoadJob] = []
    for f in files:
        job = load.submit_job(f, load_id, schema)
        assert type(job) is dummy_impl.LoadDummyJob
        assert job.state() == "failed"
        assert load.load_storage.normalized_packages.storage.has_file(
            load.load_storage.normalized_packages.get_job_file_path(
                load_id, PackageStorage.STARTED_JOBS_FOLDER, job.file_name()
            )
        )
        assert_job_metrics(job, "failed")
        jobs.append(job)
    assert len(jobs) == 2
    # complete files
    remaining_jobs, finalized_jobs, _ = load.complete_jobs(load_id, jobs, schema)
    assert len(remaining_jobs) == 0
    assert len(finalized_jobs) == 2
    for job in jobs:
        assert load.load_storage.normalized_packages.storage.has_file(
            load.load_storage.normalized_packages.get_job_file_path(
                load_id, PackageStorage.FAILED_JOBS_FOLDER, job.file_name()
            )
        )
        assert load.load_storage.normalized_packages.storage.has_file(
            load.load_storage.normalized_packages.get_job_file_path(
                load_id, PackageStorage.FAILED_JOBS_FOLDER, job.file_name() + ".exception"
            )
        )
        # load should have collected metrics for both jobs
        assert load._job_metrics[job.job_id()] == job.metrics()
    started_files = load.load_storage.normalized_packages.list_started_jobs(load_id)
    assert len(started_files) == 0

    # test the whole flow with raise_on_failed_jobs disabled
    loader_config = LoaderConfiguration(
        raise_on_failed_jobs=False,
        workers=1,
        pool_type="none",
    )
    load = setup_loader(
        client_config=DummyClientConfiguration(fail_prob=1.0),
        loader_config=loader_config,
    )
    load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    run_all(load)
    package_info = load.load_storage.get_load_package_info(load_id)
    assert package_info.state == "loaded"
    # all jobs failed
    assert len(package_info.jobs["failed_jobs"]) == 2
    # check metrics
    load_info = load.get_step_info(MockPipeline("pipe", True))  # type: ignore[abstract]
    metrics = load_info.metrics[load_id][0]["job_metrics"]
    assert len(metrics) == 2
    for job in jobs:
        assert job.job_id() in metrics
        assert metrics[job.job_id()].state == "failed"


def test_spool_job_failed_terminally_exception_init() -> None:
    load = setup_loader(client_config=DummyClientConfiguration(fail_terminally_in_init=True))
    load_id, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    with patch.object(dummy_impl.DummyClient, "complete_load") as complete_load:
        with pytest.raises(LoadClientJobFailed) as py_ex:
            run_all(load)
        assert py_ex.value.load_id == load_id
        package_info = load.load_storage.get_load_package_info(load_id)
        assert package_info.state == "aborted"
        # both failed - we wait till the current loop is completed and then raise
        assert len(package_info.jobs["failed_jobs"]) == 2
        assert len(package_info.jobs["started_jobs"]) == 0
        # load id was never committed
        complete_load.assert_not_called()
        # metrics can be gathered
        assert len(load._job_metrics) == 2
        load_info = load.get_step_info(MockPipeline("pipe", True))  # type: ignore[abstract]
        metrics = load_info.metrics[load_id][0]["job_metrics"]
        assert len(metrics) == 2


def test_spool_job_failed_transiently_exception_init() -> None:
    load = setup_loader(client_config=DummyClientConfiguration(fail_transiently_in_init=True))
    load_id, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    with patch.object(dummy_impl.DummyClient, "complete_load") as complete_load:
        with pytest.raises(LoadClientJobRetry) as py_ex:
            run_all(load)
        assert py_ex.value.load_id == load_id
        package_info = load.load_storage.get_load_package_info(load_id)
        assert package_info.state == "normalized"
        # no jobs failed - transiently failing jobs are put back into new jobs before we raise
        assert len(package_info.jobs["failed_jobs"]) == 0
        assert len(package_info.jobs["started_jobs"]) == 0
        assert len(package_info.jobs["new_jobs"]) == 2
        # load id was never committed
        complete_load.assert_not_called()
        # no metrics were gathered
        assert len(load._job_metrics) == 0
        load_info = load.get_step_info(MockPipeline("pipe", True))  # type: ignore[abstract]
        assert len(load_info.metrics) == 0


def test_spool_job_failed_exception_complete() -> None:
    load = setup_loader(client_config=DummyClientConfiguration(fail_prob=1.0))
    load_id, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    with pytest.raises(LoadClientJobFailed) as py_ex:
        run_all(load)
    assert py_ex.value.load_id == load_id
    package_info = load.load_storage.get_load_package_info(load_id)
    assert package_info.state == "aborted"
    # both failed - we wait till the current loop is completed and then raise
    assert len(package_info.jobs["failed_jobs"]) == 2
    assert len(package_info.jobs["started_jobs"]) == 0
    # metrics can be gathered
    assert len(load._job_metrics) == 2
    load_info = load.get_step_info(MockPipeline("pipe", True))  # type: ignore[abstract]
    metrics = load_info.metrics[load_id][0]["job_metrics"]
    assert len(metrics) == 2


def test_spool_job_retry_new() -> None:
    # this config retries the job on start (transient fail)
    load = setup_loader(client_config=DummyClientConfiguration(retry_prob=1.0))
    load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    files = load.load_storage.normalized_packages.list_new_jobs(load_id)
    for f in files:
        job = load.submit_job(f, load_id, schema)
        assert job.state() == "retry"


def test_spool_job_retry_spool_new() -> None:
    # this config retries the job on start (transient fail)
    load = setup_loader(client_config=DummyClientConfiguration(retry_prob=1.0))
    load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    # call the higher level function that returns jobs and counts
    with ThreadPoolExecutor() as pool:
        load.pool = pool
        jobs = load.start_new_jobs(load_id, schema, [])
        assert len(jobs) == 2


def test_spool_job_retry_started() -> None:
    # the default config completes jobs immediately
    load = setup_loader()
    # dummy_impl.CLIENT_CONFIG = DummyClientConfiguration
    load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    files = load.load_storage.normalized_packages.list_new_jobs(load_id)
    jobs: List[RunnableLoadJob] = []
    for f in files:
        job = load.submit_job(f, load_id, schema)
        assert type(job) is dummy_impl.LoadDummyJob
        assert job.state() == "completed"
        # mock job state to make it retry
        job.config.retry_prob = 1.0
        job._state = "retry"
        assert load.load_storage.normalized_packages.storage.has_file(
            load.load_storage.normalized_packages.get_job_file_path(
                load_id, PackageStorage.STARTED_JOBS_FOLDER, job.file_name()
            )
        )
        jobs.append(job)
    files = load.load_storage.normalized_packages.list_new_jobs(load_id)
    assert len(files) == 0
    # should retry, which moves jobs into the new folder; jobs are not counted as finalized
    remaining_jobs, finalized_jobs, _ = load.complete_jobs(load_id, jobs, schema)
    assert len(remaining_jobs) == 0
    assert len(finalized_jobs) == 0
    assert len(load._job_metrics) == 0
    # clear retry flag
    dummy_impl.JOBS = {}
    files = load.load_storage.normalized_packages.list_new_jobs(load_id)
    assert len(files) == 2
    # parse the new job names
    for fn in load.load_storage.normalized_packages.list_new_jobs(load_id):
        # we failed when already running the job so the retry count will increase
        assert ParsedLoadJobFileName.parse(fn).retry_count == 1
    # this time it will pass
    for f in files:
        job = load.submit_job(f, load_id, schema)
        assert job.state() == "completed"
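
# Note: the dummy destination keeps every job it creates in the module-level
# dummy_impl.JOBS dict; clearing it above appears to be what resets the forced
# retry state so that the resubmitted jobs can complete on the second pass.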


def test_try_retrieve_job() -> None:
    load = setup_loader()
    load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    # manually move jobs to started
    files = load.load_storage.normalized_packages.list_new_jobs(load_id)
    for f in files:
        load.load_storage.normalized_packages.start_job(
            load_id, FileStorage.get_file_name_from_file_path(f)
        )
    # the dummy client may only retrieve jobs that it created itself; jobs in the
    # started folder are unknown to it and are returned as terminal
    jobs = load.resume_started_jobs(load_id, schema)
    assert len(jobs) == 2
    for j in jobs:
        assert j.state() == "failed"
    # new load package
    load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    load.pool = ThreadPoolExecutor()
    jobs = load.start_new_jobs(load_id, schema, [])  # type: ignore
    assert len(jobs) == 2
    # now the jobs are known
    jobs = load.resume_started_jobs(load_id, schema)
    assert len(jobs) == 2
    for j in jobs:
        assert j.state() == "completed"
    assert len(dummy_impl.RETRIED_JOBS) == 2
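
# Resume semantics shown above: started jobs the client cannot match are returned
# as terminally failed, while jobs it created itself are picked up again and
# tracked in dummy_impl.RETRIED_JOBS.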


def test_completed_loop() -> None:
    load = setup_loader(client_config=DummyClientConfiguration(completed_prob=1.0))
    assert_complete_job(load)
    assert len(dummy_impl.JOBS) == 2
    assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0


def test_completed_loop_followup_jobs() -> None:
    # TODO: until we fix how we create capabilities we must set env
    load = setup_loader(
        client_config=DummyClientConfiguration(completed_prob=1.0, create_followup_jobs=True)
    )
    assert_complete_job(load)
    # for each job there's a reference followup job
    assert len(dummy_impl.JOBS) == 2 * 2
    assert len(dummy_impl.JOBS) == len(dummy_impl.CREATED_FOLLOWUP_JOBS) * 2


def test_failing_followup_jobs() -> None:
    load = setup_loader(
        client_config=DummyClientConfiguration(
            completed_prob=1.0, create_followup_jobs=True, fail_followup_job_creation=True
        )
    )
    with pytest.raises(FollowupJobCreationFailedException) as exc:
        assert_complete_job(load)
    # followup job creation errors on the main thread
    assert "Failed to create followup job" in str(exc)
    # the followup job fails, so we have both jobs in the started folder
    load_id = list(dummy_impl.JOBS.values())[1]._load_id
    started_files = load.load_storage.normalized_packages.list_started_jobs(load_id)
    assert len(started_files) == 2
    assert len(dummy_impl.JOBS) == 2
    assert len(dummy_impl.RETRIED_JOBS) == 0
    assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0
    # no metrics were collected
    assert len(load._job_metrics) == 0

    # now we can retry the same load; it will restart the two jobs and successfully create the followup jobs
    load.initial_client_config.fail_followup_job_creation = False  # type: ignore
    assert_complete_job(load, load_id=load_id)
    assert len(dummy_impl.JOBS) == 2 * 2
    assert len(dummy_impl.JOBS) == len(dummy_impl.CREATED_FOLLOWUP_JOBS) * 2
    assert len(dummy_impl.RETRIED_JOBS) == 2


def test_failing_table_chain_followup_jobs() -> None:
    load = setup_loader(
        client_config=DummyClientConfiguration(
            completed_prob=1.0,
            create_followup_table_chain_reference_jobs=True,
            fail_table_chain_followup_job_creation=True,
        )
    )
    with pytest.raises(TableChainFollowupJobCreationFailedException) as exc:
        assert_complete_job(load)
    # followup job creation errors on the main thread
    assert "Failed creating table chain followup jobs for table chain with root table" in str(exc)
    # the table chain followup job fails, so we have both jobs in the started folder
    load_id = list(dummy_impl.JOBS.values())[1]._load_id
    started_files = load.load_storage.normalized_packages.list_started_jobs(load_id)
    assert len(started_files) == 2
    assert len(dummy_impl.JOBS) == 2
    assert len(dummy_impl.RETRIED_JOBS) == 0
    assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0
    # no metrics were collected
    assert len(load._job_metrics) == 0

    # now we can retry the same load; it will restart the two jobs and successfully create the table chain followup jobs
    load.initial_client_config.fail_table_chain_followup_job_creation = False  # type: ignore
    assert_complete_job(load, load_id=load_id)
    assert len(dummy_impl.JOBS) == 2 * 2
    assert len(dummy_impl.JOBS) == len(dummy_impl.CREATED_TABLE_CHAIN_FOLLOWUP_JOBS) * 2
    assert len(dummy_impl.RETRIED_JOBS) == 2


def test_failing_sql_table_chain_job() -> None:
    """
    Make sure we get a useful exception from a failing SQL job.
    """
    load = setup_loader(
        client_config=DummyClientConfiguration(
            completed_prob=1.0, create_followup_table_chain_sql_jobs=True
        ),
    )
    with pytest.raises(Exception) as exc:
        assert_complete_job(load)
    # sql jobs always fail because the dummy is not an SQL client; we just make sure the exception is there
    assert "Failed creating table chain followup jobs for table chain with root table" in str(exc)


def test_successful_table_chain_jobs() -> None:
    load = setup_loader(
        client_config=DummyClientConfiguration(
            completed_prob=1.0, create_followup_table_chain_reference_jobs=True
        ),
    )
    # we create 10 jobs per case (for two cases)
    # and expect two table chain jobs at the end
    assert_complete_job(load, jobs_per_case=10)
    assert len(dummy_impl.CREATED_TABLE_CHAIN_FOLLOWUP_JOBS) == 2
    assert len(dummy_impl.JOBS) == 22
    # check that we have 10 references per followup job
    for _, job in dummy_impl.CREATED_TABLE_CHAIN_FOLLOWUP_JOBS.items():
        assert len(job._remote_paths) == 10  # type: ignore
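
# Arithmetic: 10 jobs per case for two tables gives 20 regular jobs, plus one table
# chain followup job per chain, hence the 22 entries in dummy_impl.JOBS.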


def test_failed_loop() -> None:
    # ask to delete completed jobs
    load = setup_loader(
        delete_completed_jobs=True, client_config=DummyClientConfiguration(fail_prob=1.0)
    )
    # actually not deleted because one of the jobs failed
    with pytest.raises(LoadClientJobFailed) as e:
        assert_complete_job(load, should_delete_completed=False)
    assert "a random fail occurred" in e.value.failed_message
    # two failed jobs
    assert len(dummy_impl.JOBS) == 2
    assert list(dummy_impl.JOBS.values())[0].state() == "failed"
    assert list(dummy_impl.JOBS.values())[1].state() == "failed"
    assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0


def test_failed_loop_followup_jobs() -> None:
    # ask to delete completed jobs
    load = setup_loader(
        delete_completed_jobs=True,
        client_config=DummyClientConfiguration(fail_prob=1.0, create_followup_jobs=True),
    )
    # actually not deleted because one of the jobs failed
    with pytest.raises(LoadClientJobFailed) as e:
        assert_complete_job(load, should_delete_completed=False)
    assert "a random fail occurred" in e.value.failed_message
    # followup jobs were not started
    assert len(dummy_impl.JOBS) == 2
    assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0


def test_completed_loop_with_delete_completed() -> None:
    load = setup_loader(client_config=DummyClientConfiguration(completed_prob=1.0))
    load.load_storage = load.create_storage(is_storage_owner=False)
    load.load_storage.config.delete_completed_jobs = True
    assert_complete_job(load, should_delete_completed=True)


@pytest.mark.parametrize("to_truncate", [True, False])
def test_truncate_table_before_load_on_staging(to_truncate) -> None:
    load = setup_loader(
        client_config=DummyClientConfiguration(
            truncate_tables_on_staging_destination_before_load=to_truncate
        )
    )
    load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    destination_client = load.get_destination_client(schema)
    assert (
        destination_client.should_truncate_table_before_load_on_staging_destination(  # type: ignore
            schema.tables["_dlt_version"]["name"]
        )
        == to_truncate
    )


def test_retry_on_new_loop() -> None:
    # test a job that retries while sitting in new jobs
    load = setup_loader(client_config=DummyClientConfiguration(retry_prob=1.0))
    load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    with ThreadPoolExecutor() as pool:
        # 1st retry
        with pytest.raises(LoadClientJobRetry):
            load.run(pool)
        files = load.load_storage.normalized_packages.list_new_jobs(load_id)
        assert len(files) == 2
        # 2nd retry
        with pytest.raises(LoadClientJobRetry):
            load.run(pool)
        files = load.load_storage.normalized_packages.list_new_jobs(load_id)
        assert len(files) == 2
        # the package will be completed
        load = setup_loader(client_config=DummyClientConfiguration(completed_prob=1.0))
        load.run(pool)
        assert not load.load_storage.normalized_packages.storage.has_folder(
            load.load_storage.get_normalized_package_path(load_id)
        )
        sleep(1)
        # parse the completed job names
        completed_path = load.load_storage.loaded_packages.get_package_path(load_id)
        for fn in load.load_storage.loaded_packages.storage.list_folder_files(
            os.path.join(completed_path, PackageStorage.COMPLETED_JOBS_FOLDER)
        ):
            # the retry count was bumped on each attempt: 5 retries in each of the two failed runs
            assert ParsedLoadJobFileName.parse(fn).retry_count == 10


def test_retry_exceptions() -> None:
    load = setup_loader(client_config=DummyClientConfiguration(retry_prob=1.0))
    prepare_load_package(load.load_storage, NORMALIZED_FILES)
    with ThreadPoolExecutor() as pool:
        # 1st retry
        with pytest.raises(LoadClientJobRetry) as py_ex:
            while True:
                load.run(pool)
        # configured to retry 5 times before raising
        assert py_ex.value.max_retry_count == py_ex.value.retry_count == 5
        # we can do it again
        with pytest.raises(LoadClientJobRetry) as py_ex:
            while True:
                load.run(pool)
        # this continues the retry count
        assert py_ex.value.max_retry_count * 2 == py_ex.value.retry_count == 10
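
# The retry counter is persisted in the job file name, so a second round of runs
# continues from 5 and raises LoadClientJobRetry again at 10 (2 x max_retry_count).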


def test_load_single_thread() -> None:
    os.environ["LOAD__WORKERS"] = "1"
    load = setup_loader(client_config=DummyClientConfiguration(completed_prob=1.0))
    assert load.config.pool_type == "none"
    load_id, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    # we do not need a pool to complete
    metrics = load.run(None)
    while metrics.pending_items > 0:
        metrics = load.run(None)
    assert not load.load_storage.storage.has_folder(
        load.load_storage.get_normalized_package_path(load_id)
    )


def test_wrong_writer_type() -> None:
    load = setup_loader()
    load_id, _ = prepare_load_package(
        load.load_storage,
        [
            "event_bot.b1d32c6660b242aaabbf3fc27245b7e6.0.insert_values",
            "event_user.b1d32c6660b242aaabbf3fc27245b7e6.0.insert_values",
        ],
    )
    with ThreadPoolExecutor() as pool:
        with pytest.raises(JobFileFormatUnsupported) as exv:
            load.run(pool)
    assert exv.value.load_id == load_id


def test_extend_table_chain() -> None:
    load = setup_loader()
    _, schema = prepare_load_package(
        load.load_storage, ["event_user.b1d32c6660b242aaabbf3fc27245b7e6.0.insert_values"]
    )
    # only the event user table (no other jobs)
    tables = _extend_tables_with_table_chain(schema, ["event_user"], ["event_user"])
    assert tables == {"event_user"}
    # add child jobs
    tables = _extend_tables_with_table_chain(
        schema, ["event_user"], ["event_user", "event_user__parse_data__entities"]
    )
    assert tables == {"event_user", "event_user__parse_data__entities"}
    user_chain = {name for name in schema.data_table_names() if name.startswith("event_user__")} | {
        "event_user"
    }
    # change event user to merge/replace to get the full table chain
    for w_d in ["merge", "replace"]:
        schema.tables["event_user"]["write_disposition"] = w_d  # type:ignore[typeddict-item]
        tables = _extend_tables_with_table_chain(schema, ["event_user"], ["event_user"])
        assert tables == user_chain
    # no jobs for bot
    assert _extend_tables_with_table_chain(schema, ["event_bot"], ["event_user"]) == set()
    # skip unseen tables
    del schema.tables["event_user__parse_data__entities"]["x-normalizer"]
    entities_chain = {
        name
        for name in schema.data_table_names()
        if name.startswith("event_user__parse_data__entities")
    }
    tables = _extend_tables_with_table_chain(schema, ["event_user"], ["event_user"])
    assert tables == user_chain - {"event_user__parse_data__entities"}
    # exclude the whole chain
    tables = _extend_tables_with_table_chain(
        schema, ["event_user"], ["event_user"], lambda table_name: table_name not in entities_chain
    )
    assert tables == user_chain - entities_chain
    # ask for tables that are not top level
    tables = _extend_tables_with_table_chain(schema, ["event_user__parse_data__entities"], [])
    # user chain but without entities (no data seen)
    assert tables == user_chain - {"event_user__parse_data__entities"}
    # go back to append and ask only for the entities chain
    schema.tables["event_user"]["write_disposition"] = "append"
    tables = _extend_tables_with_table_chain(
        schema, ["event_user__parse_data__entities"], entities_chain
    )
    # without entities (no data seen)
    assert tables == entities_chain - {"event_user__parse_data__entities"}
    # add multiple chains
    bot_jobs = {"event_bot", "event_bot__data__buttons"}
    tables = _extend_tables_with_table_chain(
        schema, ["event_user__parse_data__entities", "event_bot"], entities_chain | bot_jobs
    )
    assert tables == (entities_chain | bot_jobs) - {"event_user__parse_data__entities"}


def test_get_completed_table_chain_cases() -> None:
    load = setup_loader()
    _, schema = prepare_load_package(
        load.load_storage, ["event_user.b1d32c6660b242aaabbf3fc27245b7e6.0.insert_values"]
    )
    # update tables so we have all possible hints
    for table_name, table in schema.tables.items():
        schema.tables[table_name] = fill_hints_from_parent_and_clone_table(schema.tables, table)

    # child completed, parent not
    event_user = schema.get_table("event_user")
    event_user_entities = schema.get_table("event_user__parse_data__entities")
    event_user_job: Tuple[TPackageJobState, ParsedLoadJobFileName] = (
        "started_jobs",
        ParsedLoadJobFileName("event_user", "event_user_id", 0, "jsonl"),
    )
    event_user_entities_job: Tuple[TPackageJobState, ParsedLoadJobFileName] = (
        "completed_jobs",
        ParsedLoadJobFileName(
            "event_user__parse_data__entities", "event_user__parse_data__entities_id", 0, "jsonl"
        ),
    )
    chain = get_completed_table_chain(schema, [event_user_job, event_user_entities_job], event_user)
    assert chain is None

    # parent just got completed
    chain = get_completed_table_chain(
        schema,
        [event_user_job, event_user_entities_job],
        event_user,
        event_user_job[1].job_id(),
    )
    # full chain
    assert chain == [event_user, event_user_entities]
    # parent failed, child completed
    chain = get_completed_table_chain(
        schema, [("failed_jobs", event_user_job[1]), event_user_entities_job], event_user
    )
    assert chain == [event_user, event_user_entities]
    # both failed
    chain = get_completed_table_chain(
        schema,
        [("failed_jobs", event_user_job[1]), ("failed_jobs", event_user_entities_job[1])],
        event_user,
    )
    assert chain == [event_user, event_user_entities]
    # merge and replace do not require the whole chain to be in jobs
    user_chain = get_nested_tables(schema.tables, "event_user")
    for w_d in ["merge", "replace"]:
        event_user["write_disposition"] = w_d  # type:ignore[typeddict-item]
        chain = get_completed_table_chain(
            schema, [event_user_job], event_user, event_user_job[1].job_id()
        )
        assert chain == user_chain
    # but if a child is present and incomplete...
    chain = get_completed_table_chain(
        schema,
        [event_user_job, ("new_jobs", event_user_entities_job[1])],
        event_user,
        event_user_job[1].job_id(),
    )
    # ...nothing is returned
    assert chain is None
    # skip unseen tables
    deep_child = schema.tables[
        "event_user__parse_data__response_selector__default__response__response_templates"
    ]
    del deep_child["x-normalizer"]
    chain = get_completed_table_chain(
        schema, [event_user_job], event_user, event_user_job[1].job_id()
    )
    user_chain.remove(deep_child)
    assert chain == user_chain


def test_init_client_truncate_tables() -> None:
    load = setup_loader()
    _, schema = prepare_load_package(
        load.load_storage, ["event_user.b1d32c6660b242aaabbf3fc27245b7e6.0.insert_values"]
    )

    nothing_ = lambda _: False
    all_ = lambda _: True

    event_user = ParsedLoadJobFileName("event_user", "event_user_id", 0, "jsonl")
    event_bot = ParsedLoadJobFileName("event_bot", "event_bot_id", 0, "jsonl")

    with patch.object(dummy_impl.DummyClient, "initialize_storage") as initialize_storage:
        with patch.object(dummy_impl.DummyClient, "update_stored_schema") as update_stored_schema:
            with load.get_destination_client(schema) as client:
                init_client(client, schema, [], {}, nothing_, nothing_)
            # we do not allow for any staging dataset tables
            assert update_stored_schema.call_count == 1
            assert update_stored_schema.call_args[1]["only_tables"] == {
                "_dlt_loads",
                "_dlt_version",
            }
            assert initialize_storage.call_count == 2
            # initialize storage is called twice, we deselected all tables to truncate
            assert initialize_storage.call_args_list[0].args == ()
            assert initialize_storage.call_args_list[1].kwargs["truncate_tables"] == set()

            initialize_storage.reset_mock()
            update_stored_schema.reset_mock()

            # now we want all tables to be truncated but not on staging
            with load.get_destination_client(schema) as client:
                init_client(client, schema, [event_user], {}, all_, nothing_)
            assert update_stored_schema.call_count == 1
            assert "event_user" in update_stored_schema.call_args[1]["only_tables"]
            assert initialize_storage.call_count == 2
            assert initialize_storage.call_args_list[0].args == ()
            assert initialize_storage.call_args_list[1].kwargs["truncate_tables"] == {"event_user"}

            # now we push all to staging
            initialize_storage.reset_mock()
            update_stored_schema.reset_mock()
            with load.get_destination_client(schema) as client:
                init_client(client, schema, [event_user, event_bot], {}, nothing_, all_)
            assert update_stored_schema.call_count == 2
            # the first call is for the main dataset
            assert {"event_user", "event_bot"} <= set(
                update_stored_schema.call_args_list[0].kwargs["only_tables"]
            )
            # the second one is for the staging dataset
            assert {"event_user", "event_bot"} <= set(
                update_stored_schema.call_args_list[1].kwargs["only_tables"]
            )
            assert initialize_storage.call_count == 4
            assert initialize_storage.call_args_list[0].args == ()
            assert initialize_storage.call_args_list[1].kwargs["truncate_tables"] == set()
            assert initialize_storage.call_args_list[2].args == ()
            # all tables that will be used on staging must be truncated
            assert initialize_storage.call_args_list[3].kwargs["truncate_tables"] == {
                "event_user",
                "event_bot",
            }

            replace_ = (
                lambda table_name: client.prepare_load_table(table_name)["write_disposition"]
                == "replace"
            )
            merge_ = (
                lambda table_name: client.prepare_load_table(table_name)["write_disposition"]
                == "merge"
            )
            # set the event_bot chain to merge
            bot_chain = get_nested_tables(schema.tables, "event_bot")
            for w_d in ["merge", "replace"]:
                initialize_storage.reset_mock()
                update_stored_schema.reset_mock()
                for bot in bot_chain:
                    bot["write_disposition"] = w_d  # type:ignore[typeddict-item]
                # merge goes to staging, replace goes to truncate
                with load.get_destination_client(schema) as client:
                    init_client(client, schema, [event_user, event_bot], {}, replace_, merge_)

                if w_d == "merge":
                    # we use the staging dataset
                    assert update_stored_schema.call_count == 2
                    # 4 tables to update in the main dataset
                    assert len(update_stored_schema.call_args_list[0].kwargs["only_tables"]) == 4
                    assert (
                        "event_user" in update_stored_schema.call_args_list[0].kwargs["only_tables"]
                    )
                    # full bot table chain + dlt version but no user
                    assert len(
                        update_stored_schema.call_args_list[1].kwargs["only_tables"]
                    ) == 1 + len(bot_chain)
                    assert (
                        "event_user"
                        not in update_stored_schema.call_args_list[1].kwargs["only_tables"]
                    )
                    assert initialize_storage.call_count == 4
                    assert initialize_storage.call_args_list[1].kwargs["truncate_tables"] == set()
                    assert initialize_storage.call_args_list[3].kwargs[
                        "truncate_tables"
                    ] == update_stored_schema.call_args_list[1].kwargs["only_tables"] - {
                        "_dlt_version"
                    }

                if w_d == "replace":
                    assert update_stored_schema.call_count == 1
                    assert initialize_storage.call_count == 2
                    # we truncate the whole bot chain but not user (which is append)
                    assert len(
                        initialize_storage.call_args_list[1].kwargs["truncate_tables"]
                    ) == len(bot_chain)
                    # migrate only tables for which we have jobs
                    assert len(update_stored_schema.call_args_list[0].kwargs["only_tables"]) == 4

            # print(initialize_storage.call_args_list)
            # print(update_stored_schema.call_args_list)


def test_dummy_staging_filesystem() -> None:
    load = setup_loader(
        client_config=DummyClientConfiguration(completed_prob=1.0), filesystem_staging=True
    )
    assert_complete_job(load)
    # two reference jobs
    assert len(dummy_impl.JOBS) == 2
    assert len(dummy_impl.CREATED_FOLLOWUP_JOBS) == 0


def test_load_multiple_packages() -> None:
    load = setup_loader(client_config=DummyClientConfiguration(completed_prob=1.0))
    load.config.pool_type = "none"
    load_id_1, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    sleep(0.1)
    load_id_2, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    run_metrics = load.run(None)
    assert run_metrics.pending_items == 1
    # assert load._current_load_id is None
    metrics_id_1 = load._job_metrics
    assert len(metrics_id_1) == 2
    assert load._step_info_metrics(load_id_1)[0]["job_metrics"] == metrics_id_1
    run_metrics = load.run(None)
    assert run_metrics.pending_items == 0
    metrics_id_2 = load._job_metrics
    assert len(metrics_id_2) == 2
    assert load._step_info_metrics(load_id_2)[0]["job_metrics"] == metrics_id_2
    load_info = load.get_step_info(MockPipeline("pipe", True))  # type: ignore[abstract]
    assert load_id_1 in load_info.metrics
    assert load_id_2 in load_info.metrics
    assert load_info.metrics[load_id_1][0]["job_metrics"] == metrics_id_1
    assert load_info.metrics[load_id_2][0]["job_metrics"] == metrics_id_2
    # execute an empty run
    load.run(None)
    assert len(load_info.metrics) == 2


def test_terminal_exceptions() -> None:
    try:
        raise TerminalValueError("a")
    except TerminalException:
        assert True
    else:
        raise AssertionError()


def assert_job_metrics(job: RunnableLoadJob, expected_state: str) -> None:
    metrics = job.metrics()
    assert metrics.state == expected_state
    assert metrics.started_at <= metrics.finished_at
    assert metrics.job_id == job.job_id()
    assert metrics.table_name == job._parsed_file_name.table_name
    assert metrics.file_path == job._file_path


def assert_complete_job(
    load: Load, should_delete_completed: bool = False, load_id: str = None, jobs_per_case: int = 1
) -> None:
    if not load_id:
        load_id, _ = prepare_load_package(
            load.load_storage, NORMALIZED_FILES, jobs_per_case=jobs_per_case
        )
    # will complete all jobs
    timestamp = "2024-04-05T09:16:59.942779Z"
    mocked_timestamp = {"state": {"created_at": timestamp}}
    with (
        mock.patch(
            "dlt.current.load_package",
            return_value=mocked_timestamp,
        ),
        patch.object(
            dummy_impl.DummyClient,
            "complete_load",
        ) as complete_load,
    ):
        with ThreadPoolExecutor() as pool:
            load.run(pool)
        # moved to loaded
        assert not load.load_storage.storage.has_folder(
            load.load_storage.get_normalized_package_path(load_id)
        )
        completed_path = load.load_storage.loaded_packages.get_job_state_folder_path(
            load_id, "completed_jobs"
        )
        # should have migrated the schema
        assert load.load_storage.storage.has_file(
            os.path.join(
                load.load_storage.get_loaded_package_path(load_id),
                PackageStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME,
            )
        )
        if should_delete_completed:
            # package was deleted
            assert not load.load_storage.loaded_packages.storage.has_folder(completed_path)
        else:
            # package was not deleted
            assert load.load_storage.loaded_packages.storage.has_folder(completed_path)
        # complete load on the client was called
        complete_load.assert_called_once_with(load_id)

        # assert that all jobs in a final state have metrics
        metrics = load.get_step_info(MockPipeline("pipe", True)).metrics[load_id][0]  # type: ignore[abstract]
        package_info = load.load_storage.loaded_packages.get_load_package_jobs(load_id)
        for state, jobs in package_info.items():
            for job in jobs:
                job_metrics = metrics["job_metrics"].get(job.job_id())
                if state in ("failed_jobs", "completed_jobs"):
                    assert job_metrics is not None
                    assert job_metrics.state == (
                        "failed" if state == "failed_jobs" else "completed"
                    )
                    remote_url = job_metrics.remote_url
                    if load.initial_client_config.create_followup_jobs:  # type: ignore
                        assert remote_url.endswith(job.file_name())
                    elif load.is_staging_destination_job(job.file_name()):
                        # a staging destination job should reference the remote filesystem
                        assert (
                            FilesystemConfiguration.make_file_url(REMOTE_FILESYSTEM)
                            in remote_url
                        )
                    else:
                        assert remote_url is None
                else:
                    assert job_metrics is None


def run_all(load: Load) -> None:
    pool = ThreadPoolExecutor()
    while True:
        metrics = load.run(pool)
        if metrics.pending_items == 0:
            return
        sleep(0.1)


def setup_loader(
    delete_completed_jobs: bool = False,
    client_config: DummyClientConfiguration = None,
    loader_config: LoaderConfiguration = None,
    filesystem_staging: bool = False,
) -> Load:
    # reset jobs for a test
    dummy_impl.JOBS = {}
    dummy_impl.CREATED_FOLLOWUP_JOBS = {}
    dummy_impl.RETRIED_JOBS = {}
    dummy_impl.CREATED_TABLE_CHAIN_FOLLOWUP_JOBS = {}
    client_config = client_config or DummyClientConfiguration(
        loader_file_format="jsonl", completed_prob=1
    )
    destination: AnyDestination = dummy(**client_config)  # type: ignore[assignment]
    # setup
    staging_system_config = None
    staging = None
    if filesystem_staging:
        # do not accept jsonl to not conflict with the filesystem destination
        # client_config = client_config or DummyClientConfiguration(
        #     loader_file_format="reference", completed_prob=1
        # )
        staging_system_config = FilesystemDestinationClientConfiguration()._bind_dataset_name(
            dataset_name="dummy"
        )
        staging_system_config.as_staging_destination = True
        os.makedirs(REMOTE_FILESYSTEM)
        staging = filesystem(bucket_url=REMOTE_FILESYSTEM)
    # patch the destination to provide the client_config
    # destination.client = lambda schema: dummy_impl.DummyClient(schema, client_config)
    # setup loader
    with TEST_DICT_CONFIG_PROVIDER().values({"delete_completed_jobs": delete_completed_jobs}):
        return Load(
            destination,
            initial_client_config=client_config,
            config=loader_config,
            staging_destination=staging,  # type: ignore[arg-type]
            initial_staging_client_config=staging_system_config,
        )
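

if __name__ == "__main__":
    # A minimal usage sketch (normally pytest drives this module): load the standard
    # package against the dummy destination with jobs configured to always complete.
    # It reuses the same storage and logging helpers the fixtures above rely on.
    clean_test_storage(init_normalize=True, init_loader=True)
    init_test_logging()
    load = setup_loader(client_config=DummyClientConfiguration(completed_prob=1.0))
    load_id, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES)
    with ThreadPoolExecutor() as pool:
        load.run(pool)
    print(f"load package {load_id} processed, {len(dummy_impl.JOBS)} dummy jobs created")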