mirror of
https://github.com/dlt-hub/dlt.git
synced 2025-12-17 19:31:30 +00:00
* move pyproject.toml and makefile from old branch and add inbetween changes
* update workflow files to use uv
* run new version of formatter
* fix building of images with uv
* possibly fix docs linting
* downgrade lancedb dependency to fix tests
* fix gcs compat mode for s3 for newest boto
* fix docstrings in examples
* add some uv constraints
* update readme.md and contributing.md and some other places
* allow duckdb 0.8 in range
* add link-mode copy to uv venv on windows
* remove poetry lockfile and unneeded lockfile checker
* fix chess api related failures
* sleep after dremio start..
* set correct package in pyproject
* Revert "add some uv constraints"
This reverts commit d611e9ecce.
# Conflicts:
# pyproject.toml
# uv.lock
* add missing databricks sql connector version bounds
572 lines
17 KiB
Python
572 lines
17 KiB
Python
import contextlib
|
|
import multiprocessing
|
|
import os
|
|
import platform
|
|
import sys
|
|
from os import environ
|
|
from typing import Any, Iterable, Iterator, Literal, Optional, Union, get_args, List
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
import requests
|
|
from requests import Response
|
|
|
|
import dlt
|
|
from dlt.common import known_env
|
|
from dlt.common.runtime import telemetry
|
|
from dlt.common.configuration.container import Container
|
|
from dlt.common.configuration.providers import (
|
|
DictionaryProvider,
|
|
EnvironProvider,
|
|
SecretsTomlProvider,
|
|
ConfigTomlProvider,
|
|
)
|
|
from dlt.common.configuration.providers.provider import ConfigProvider
|
|
from dlt.common.configuration.resolve import resolve_configuration
|
|
from dlt.common.configuration.specs import RuntimeConfiguration, PluggableRunContext, configspec
|
|
from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContainer
|
|
from dlt.common.configuration.specs.pluggable_run_context import (
|
|
SupportsRunContext,
|
|
)
|
|
from dlt.common.pipeline import LoadInfo, PipelineContext, SupportsPipeline
|
|
from dlt.common.runtime.run_context import DOT_DLT, RunContext
|
|
from dlt.common.runtime.telemetry import start_telemetry, stop_telemetry
|
|
from dlt.common.schema import Schema
|
|
from dlt.common.schema.typing import TTableFormat
|
|
from dlt.common.storages import FileStorage
|
|
from dlt.common.storages.versioned_storage import VersionedStorage
|
|
from dlt.common.typing import DictStrAny, StrAny, TDataItem
|
|
from dlt.common.utils import custom_environ, set_working_dir, uniq_id
|
|
|
|
# root folder for all test storages; wiped between tests
TEST_STORAGE_ROOT = "_storage"


# destinations used by workflow matrices; read from config, defaults to duckdb only
ALL_DESTINATIONS = dlt.config.get("ALL_DESTINATIONS", list) or [
    "duckdb",
]

# credentials for the local docker-compose postgres used in tests
LOCAL_POSTGRES_CREDENTIALS = "postgresql://loader:loader@localhost:5432/dlt_data"


# destination constants
# full set of destinations the test suite knows about
IMPLEMENTED_DESTINATIONS = {
    "athena",
    "duckdb",
    "bigquery",
    "redshift",
    "postgres",
    "snowflake",
    "filesystem",
    "weaviate",
    "dummy",
    "motherduck",
    "mssql",
    "qdrant",
    "lancedb",
    "destination",
    "synapse",
    "databricks",
    "clickhouse",
    "dremio",
    "sqlalchemy",
}
# destinations without a SQL interface (vector stores, filesystem, sinks)
NON_SQL_DESTINATIONS = {
    "filesystem",
    "weaviate",
    "dummy",
    "qdrant",
    "lancedb",
    "destination",
}

try:
    # register additional destinations from _addons.py which must be placed in the same folder
    # as tests
    from _addons import register_destinations  # type: ignore[import-not-found]

    register_destinations(IMPLEMENTED_DESTINATIONS, NON_SQL_DESTINATIONS)
except ImportError:
    # no _addons.py next to the tests — run with the built-in destination sets only
    pass

# SQL destinations are everything that is not explicitly non-SQL
SQL_DESTINATIONS = IMPLEMENTED_DESTINATIONS - NON_SQL_DESTINATIONS

# exclude destination configs (for now used for athena and athena iceberg separation)
EXCLUDED_DESTINATION_CONFIGURATIONS = set(
    dlt.config.get("EXCLUDED_DESTINATION_CONFIGURATIONS", list) or set()
)


# filter out active destinations for current tests
ACTIVE_DESTINATIONS = set(dlt.config.get("ACTIVE_DESTINATIONS", list) or IMPLEMENTED_DESTINATIONS)

ACTIVE_SQL_DESTINATIONS = SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS)
ACTIVE_NON_SQL_DESTINATIONS = NON_SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS)

# filter out active table formats for current tests
IMPLEMENTED_TABLE_FORMATS = set(get_args(TTableFormat))
ACTIVE_TABLE_FORMATS = set(
    dlt.config.get("ACTIVE_TABLE_FORMATS", list) or IMPLEMENTED_TABLE_FORMATS
)
|
|
|
|
# sanity checks
# BUG FIX: was `>= 0`, which is vacuously true for any set and could never
# trigger the "No active destinations selected" message; `> 0` enforces it.
assert len(ACTIVE_DESTINATIONS) > 0, "No active destinations selected"

# every declared non-SQL destination must be a known destination
for destination in NON_SQL_DESTINATIONS:
    assert destination in IMPLEMENTED_DESTINATIONS, f"Unknown non sql destination {destination}"

# every derived SQL destination must be a known destination
for destination in SQL_DESTINATIONS:
    assert destination in IMPLEMENTED_DESTINATIONS, f"Unknown sql destination {destination}"

# every destination activated via config must be a known destination
for destination in ACTIVE_DESTINATIONS:
    assert destination in IMPLEMENTED_DESTINATIONS, f"Unknown active destination {destination}"
|
|
|
|
# in-memory table representations supported by the arrow helpers below
TPythonTableFormat = Literal["pandas", "arrow-table", "arrow-batch"]
"""Possible arrow item formats"""

# formats that `data_to_item_format` can produce ("object" = plain python items)
TestDataItemFormat = Literal["object", "pandas", "arrow-table", "arrow-batch"]
ALL_TEST_DATA_ITEM_FORMATS = get_args(TestDataItemFormat)
"""List with TDataItem formats: object, arrow table/batch / pandas"""
|
|
|
|
|
|
def TEST_DICT_CONFIG_PROVIDER():
    """Return the DictionaryProvider from the active run context, registering one on first use."""
    registry = Container()[PluggableRunContext].providers
    try:
        existing = registry[DictionaryProvider.NAME]
    except KeyError:
        # not registered yet — create and add it so subsequent calls reuse it
        dict_provider = DictionaryProvider()
        registry.add_provider(dict_provider)
        return dict_provider
    return existing
|
|
|
|
|
|
class MockHttpResponse(Response):
    """Minimal requests.Response stand-in that carries only a status code."""

    def __init__(self, status_code: int) -> None:
        # deliberately skips super().__init__(): only status_code matters here
        self.status_code = status_code

    def raise_for_status(self) -> None:
        """Raise HTTPError for any status >= 300, mirroring requests' contract."""
        if self.status_code < 300:
            return
        raise requests.HTTPError(response=self)
|
|
|
|
|
|
class MockPipeline(SupportsPipeline):
    """Minimal SupportsPipeline stub carrying just a name and a first-run flag."""

    def __init__(self, pipeline_name: str, first_run: bool) -> None:
        self.pipeline_name = pipeline_name
        self.first_run = first_run
|
|
|
|
|
|
def write_version(storage: FileStorage, version: str) -> None:
    """Write `version` into the storage's version file."""
    content = str(version)
    storage.save(VersionedStorage.VERSION_FILE, content)
|
|
|
|
|
|
def delete_test_storage() -> None:
    """Remove the whole test storage folder, if it exists."""
    root = FileStorage(TEST_STORAGE_ROOT)
    if not root.has_folder(""):
        return
    root.delete_folder("", recursively=True, delete_ro=True)
|
|
|
|
|
|
@pytest.fixture()
def test_storage() -> FileStorage:
    """Provide a freshly wiped test FileStorage (opt-in, not autouse)."""
    return clean_test_storage()
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def autouse_test_storage(request) -> Optional[FileStorage]:
    """Wipe test storage before each test unless the module-level fixture opted out."""
    module_cleanup_active = request.keywords.get("skip_autouse_test_storage", False)
    if module_cleanup_active:
        return None
    return clean_test_storage()
|
|
|
|
|
|
@pytest.fixture(scope="function", autouse=True)
def preserve_environ() -> Iterator[None]:
    """Restore os.environ to its pre-test state after each test."""
    yield from _preserve_environ()
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def patch_home_dir() -> Iterator[None]:
    """Point dlt local/global/data dirs into test storage for each test."""
    yield from _patch_home_dir()
|
|
|
|
|
|
@pytest.fixture(autouse=True, scope="module")
def autouse_module_test_storage(request) -> FileStorage:
    """Wipe test storage once per module and disable the per-test wipe."""
    # flag read by autouse_test_storage so it skips per-function cleanup
    request.keywords["skip_autouse_test_storage"] = True
    return clean_test_storage()
|
|
|
|
|
|
@pytest.fixture(autouse=True, scope="module")
def preserve_module_environ() -> Iterator[None]:
    """Module-scoped environ snapshot/restore, same mechanism as preserve_environ."""
    yield from _preserve_environ()
|
|
|
|
|
|
@pytest.fixture(autouse=True, scope="module")
def patch_module_home_dir(autouse_module_test_storage) -> Iterator[None]:
    """Module-scoped home dir patch; ordered after the module storage wipe."""
    yield from _patch_home_dir()
|
|
|
|
|
|
def _patch_home_dir() -> Iterator[None]:
    """Redirect the dlt run context dirs into test storage and mirror them to env vars."""
    run_ctx = PluggableRunContext()
    mocked = MockableRunContext.from_context(run_ctx.context)
    mocked._local_dir = os.path.abspath(TEST_STORAGE_ROOT)
    dot_dlt_dir = os.path.join(mocked._local_dir, DOT_DLT)
    mocked._global_dir = dot_dlt_dir
    mocked._data_dir = dot_dlt_dir
    run_ctx.context = mocked

    # also emit corresponding env variables so pipelines in env work like that
    env_overrides = {
        known_env.DLT_LOCAL_DIR: mocked.local_dir,
        known_env.DLT_DATA_DIR: mocked.data_dir,
    }
    with custom_environ(env_overrides):
        with Container().injectable_context(run_ctx):
            yield
|
|
|
|
|
|
def _preserve_environ() -> Iterator[None]:
    """Snapshot os.environ, run the test, then restore the snapshot.

    AWS keys get extra handling: delta-rs may set them at the process level
    without updating `environ`, so they are explicitly re-applied or cleared.
    """
    saved_environ = environ.copy()
    # delta-rs sets those keys without updating environ and there's no
    # method to refresh environ
    known_environ = {
        key_: saved_environ.get(key_)
        for key_ in [
            "AWS_ACCESS_KEY_ID",
            "AWS_SECRET_ACCESS_KEY",
            "AWS_REGION",
            "AWS_SESSION_TOKEN",
        ]
    }
    try:
        yield
    finally:
        # full restore of the pre-test environment
        environ.clear()
        environ.update(saved_environ)
        for key_, value_ in known_environ.items():
            if value_ is not None or key_ not in environ:
                # re-apply the saved value; when the key was absent before the
                # test this writes "" — presumably to override values delta-rs
                # set at the process level (NOTE(review): confirm intent)
                environ[key_] = value_ or ""
            else:
                del environ[key_]
|
|
|
|
|
|
class MockableRunContext(RunContext):
    """RunContext whose directories can be overridden per test via `_`-attributes.

    `run_dir`, `local_dir` and `data_dir` still honor the corresponding DLT_*
    environment variables, so env-based overrides keep working under the mock.
    """

    @property
    def name(self) -> str:
        return self._name

    @property
    def global_dir(self) -> str:
        return self._global_dir

    @property
    def run_dir(self) -> str:
        # env var wins so tests can point at a different project dir
        return os.environ.get(known_env.DLT_PROJECT_DIR, self._run_dir)

    @property
    def local_dir(self) -> str:
        return os.environ.get(known_env.DLT_LOCAL_DIR, self._local_dir)

    # @property
    # def settings_dir(self) -> str:
    #     return self._settings_dir

    @property
    def data_dir(self) -> str:
        return os.environ.get(known_env.DLT_DATA_DIR, self._data_dir)

    # backing values; populated by from_context() and freely mutable in tests
    _name: str
    _global_dir: str
    _run_dir: str
    _settings_dir: str
    _data_dir: str
    _local_dir: str

    @classmethod
    def from_context(cls, ctx: SupportsRunContext) -> "MockableRunContext":
        """Create a mockable clone copying name and all dirs from `ctx`."""
        cls_ = cls(ctx.run_dir)
        cls_._name = ctx.name
        cls_._global_dir = ctx.global_dir
        cls_._run_dir = ctx.run_dir
        cls_._settings_dir = ctx.settings_dir
        cls_._data_dir = ctx.data_dir
        cls_._local_dir = ctx.local_dir
        return cls_
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def patch_random_home_dir() -> Iterator[None]:
    """Give each test a unique global/data dir under the test storage root."""
    run_ctx = PluggableRunContext()
    mocked = MockableRunContext.from_context(run_ctx.context)
    unique_global = os.path.abspath(
        os.path.join(TEST_STORAGE_ROOT, "global_" + uniq_id(), DOT_DLT)
    )
    mocked._global_dir = unique_global
    mocked._data_dir = unique_global
    run_ctx.context = mocked

    # make sure the unique dir exists before the context becomes active
    os.makedirs(run_ctx.context.global_dir, exist_ok=True)
    with Container().injectable_context(run_ctx):
        yield
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def unload_modules() -> Iterator[None]:
    """Drop every module imported during the test from sys.modules afterwards."""
    modules_before = set(sys.modules)
    yield
    for newly_loaded in set(sys.modules) - modules_before:
        del sys.modules[newly_loaded]
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def wipe_pipeline(preserve_environ) -> Iterator[None]:
    """Wipes pipeline local state and deactivates it"""
    container = Container()

    def _deactivate_if_active() -> None:
        # deactivating removes the current pipeline from the context
        pipeline_ctx = container[PipelineContext]
        if pipeline_ctx.is_active():
            pipeline_ctx.deactivate()

    _deactivate_if_active()
    yield
    # NOTE: working folder wipe is no longer needed here — test storage is
    # wiped fully when the test starts; only the context must be deactivated
    _deactivate_if_active()
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def setup_secret_providers_to_current_module(request):
    """Creates set of config providers where secrets are loaded from cwd()/.dlt and
    configs are loaded from the .dlt/ in the same folder as module being tested
    """
    # secrets always come from the original working directory
    secret_dir = os.path.abspath("./.dlt")
    # configs come from the folder that contains the test module
    dname = os.path.dirname(request.module.__file__)
    config_dir = dname + "/.dlt"

    # inject provider context so the original providers are restored at the end
    def _initial_providers(self):
        return [
            EnvironProvider(),
            SecretsTomlProvider(settings_dir=secret_dir),
            ConfigTomlProvider(settings_dir=config_dir),
        ]

    with (
        set_working_dir(dname),
        patch(
            "dlt.common.runtime.run_context.RunContext.initial_providers",
            _initial_providers,
        ),
    ):
        # rebuild providers from the patched initial_providers
        Container()[PluggableRunContext].reload_providers()

        try:
            # make the test module's folder importable while the test runs
            sys.path.insert(0, dname)
            yield
        finally:
            sys.path.pop(0)
|
|
|
|
|
|
def data_to_item_format(
    item_format: TestDataItemFormat, data: Union[Iterator[TDataItem], Iterable[TDataItem]]
) -> Any:
    """Return the given data in the form of pandas, arrow table/batch or json items"""
    if item_format == "object":
        return data

    import pandas as pd
    from dlt.common.libs.pyarrow import pyarrow as pa

    # materialize the data into a single dataframe first
    frame = pd.DataFrame(list(data))
    if item_format == "pandas":
        return [frame]
    if item_format == "arrow-table":
        return [pa.Table.from_pandas(frame)]
    if item_format == "arrow-batch":
        return [pa.RecordBatch.from_pandas(frame)]
    raise ValueError(f"Unknown item format: {item_format}")
|
|
|
|
|
|
def data_item_length(data: TDataItem) -> int:
    """Return the number of rows/items in a list, DataFrame or arrow Table/RecordBatch."""
    import pandas as pd
    from dlt.common.libs.pyarrow import pyarrow as pa

    if isinstance(data, pd.DataFrame):
        return len(data.index)
    if isinstance(data, (pa.Table, pa.RecordBatch)):
        return data.num_rows
    if isinstance(data, list):
        supported = (list, pd.DataFrame, pa.Table, pa.RecordBatch)
        if all(isinstance(element, supported) for element in data):
            # a list of supported containers: sum their lengths recursively
            return sum(data_item_length(element) for element in data)
        # a plain list of scalar items counts as one container
        return len(data)
    raise TypeError("Unsupported data type.")
|
|
|
|
|
|
def arrow_item_from_pandas(
    df: Any,
    object_format: TPythonTableFormat,
) -> Any:
    """Convert a pandas DataFrame into the requested pandas/arrow representation."""
    from dlt.common.libs.pyarrow import pyarrow as pa

    if object_format == "pandas":
        return df
    if object_format == "arrow-table":
        return pa.Table.from_pandas(df)
    if object_format == "arrow-batch":
        return pa.RecordBatch.from_pandas(df)
    raise ValueError("Unknown item type: " + object_format)
|
|
|
|
|
|
def arrow_item_from_table(
    table: Any,
    object_format: TPythonTableFormat,
) -> Any:
    """Convert an arrow Table into the requested pandas/arrow representation."""
    if object_format == "arrow-table":
        return table
    if object_format == "pandas":
        return table.to_pandas()
    if object_format == "arrow-batch":
        # only the first batch is returned — callers expect a single batch
        return table.to_batches()[0]
    raise ValueError("Unknown item type: " + object_format)
|
|
|
|
|
|
def init_test_logging(c: Optional[RuntimeConfiguration] = None) -> None:
    """Initialize dlt runtime/logging for tests, resolving a default configuration when none given."""
    if not c:
        c = resolve_configuration(RuntimeConfiguration())
    Container()[PluggableRunContext].initialize_runtime(c)
|
|
|
|
|
|
@configspec
class SentryLoggerConfiguration(RuntimeConfiguration):
    """Runtime configuration with a fixed Sentry DSN, used by telemetry tests."""

    pipeline_name: str = "logger"
    # DSN used only for test telemetry events
    sentry_dsn: str = (
        "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752"
    )
|
|
|
|
|
|
def start_test_telemetry(c: Optional[RuntimeConfiguration] = None):
    """Restart telemetry with the given (or default-resolved) runtime configuration."""
    # stop first so a previous test's telemetry state does not leak in
    stop_telemetry()
    if not c:
        c = resolve_configuration(RuntimeConfiguration())
    start_telemetry(c)
|
|
|
|
|
|
@pytest.fixture
def temporary_telemetry() -> Iterator[RuntimeConfiguration]:
    """Run the test with Sentry telemetry enabled, always stopping it afterwards."""
    config = SentryLoggerConfiguration()
    start_test_telemetry(config)
    try:
        yield config
    finally:
        stop_telemetry()
|
|
|
|
|
|
@pytest.fixture
def disable_temporary_telemetry() -> Iterator[None]:
    """Guarantee telemetry is stopped after the test, whatever state the test left it in."""
    try:
        yield
    finally:
        # force stop telemetry
        # marking as started makes stop_telemetry() perform the shutdown —
        # presumably it no-ops when it believes telemetry never started (confirm)
        telemetry._TELEMETRY_STARTED = True
        stop_telemetry()
|
|
|
|
|
|
def clean_test_storage(
    init_normalize: bool = False, init_loader: bool = False, mode: str = "t"
) -> FileStorage:
    """Wipe and recreate the test storage root.

    Args:
        init_normalize: also initialize a NormalizeStorage inside it
        init_loader: also initialize a LoadStorage with all supported file formats
        mode: passed to FileStorage — presumably text/binary file mode (confirm)

    Returns:
        FileStorage rooted at TEST_STORAGE_ROOT
    """
    storage = FileStorage(TEST_STORAGE_ROOT, mode, makedirs=True)
    storage.delete_folder("", recursively=True, delete_ro=True)
    storage.create_folder(".")
    if init_normalize:
        from dlt.common.storages import NormalizeStorage

        NormalizeStorage(True)
    if init_loader:
        from dlt.common.storages import LoadStorage

        LoadStorage(True, LoadStorage.ALL_SUPPORTED_FILE_FORMATS)
    return storage
|
|
|
|
|
|
def create_schema_with_name(schema_name) -> Schema:
    """Create a new empty Schema with the given name."""
    return Schema(schema_name)
|
|
|
|
|
|
def assert_no_dict_key_starts_with(d: StrAny, key_prefix: str) -> None:
    """Assert that no key of `d` starts with `key_prefix`."""
    offending = [key for key in d if key.startswith(key_prefix)]
    assert not offending
|
|
|
|
|
|
def skip_if_not_active(destination: str) -> None:
    """Skip the whole test module when `destination` is not among ACTIVE_DESTINATIONS."""
    assert destination in IMPLEMENTED_DESTINATIONS, f"Unknown skipped destination {destination}"
    if destination in ACTIVE_DESTINATIONS:
        return
    pytest.skip(f"{destination} not in ACTIVE_DESTINATIONS", allow_module_level=True)
|
|
|
|
|
|
def is_running_in_github_fork() -> bool:
    """Check if executed by GitHub Actions, in a repo fork."""
    # IS_FORK is a custom variable set by us in the workflow's YAML
    running_in_fork = os.environ.get("IS_FORK") == "true"
    return is_running_in_github_ci() and running_in_fork
|
|
|
|
|
|
def is_running_in_github_ci() -> bool:
    """Check if executed by GitHub Actions"""
    # GitHub sets GITHUB_ACTIONS=true for every workflow run
    return environ.get("GITHUB_ACTIONS") == "true"
|
|
|
|
|
|
# reusable skip marks for platform / CI constraints
skipifspawn = pytest.mark.skipif(
    multiprocessing.get_start_method() != "fork", reason="process fork not supported"
)

skipifpypy = pytest.mark.skipif(
    platform.python_implementation() == "PyPy", reason="won't run in PyPy interpreter"
)

skipifnotwindows = pytest.mark.skipif(platform.system() != "Windows", reason="runs only on windows")

skipifwindows = pytest.mark.skipif(
    platform.system() == "Windows", reason="does not runs on windows"
)

# evaluated once at import time from env vars set by the CI workflow
skipifgithubfork = pytest.mark.skipif(
    is_running_in_github_fork(), reason="Skipping test because it runs on a PR coming from fork"
)

skipifgithubci = pytest.mark.skipif(
    is_running_in_github_ci(), reason="This test does not work on github CI"
)
|
|
|
|
|
|
@contextlib.contextmanager
def reset_providers(settings_dir: str) -> Iterator[ConfigProvidersContainer]:
    """Context manager injecting standard set of providers where toml providers are initialized from `settings_dir`"""
    # returns the generator directly; contextlib.contextmanager drives it
    return _reset_providers(settings_dir)
|
|
|
|
|
|
def _reset_providers(settings_dir: str) -> Iterator[ConfigProvidersContainer]:
    """Inject env + toml providers, with both toml files loaded from `settings_dir`."""
    standard_providers: List[ConfigProvider] = [
        EnvironProvider(),
        SecretsTomlProvider(settings_dir=settings_dir),
        ConfigTomlProvider(settings_dir=settings_dir),
    ]
    yield from _inject_providers(standard_providers)
|
|
|
|
|
|
@contextlib.contextmanager
def inject_providers(providers: List[ConfigProvider]) -> Iterator[ConfigProvidersContainer]:
    """Context manager that temporarily replaces the run context providers with `providers`."""
    # returns the generator directly; contextlib.contextmanager drives it
    return _inject_providers(providers)
|
|
|
|
|
|
def _inject_providers(providers: List[ConfigProvider]) -> Iterator[ConfigProvidersContainer]:
    """Swap the run context providers for `providers`, yielding the new container.

    The previous providers are always restored, even when the caller raises.
    """
    container = Container()
    ctx = ConfigProvidersContainer(initial_providers=providers)
    # FIX: capture the previous providers BEFORE entering try — if this lookup
    # were inside the try and failed, the finally block would raise
    # UnboundLocalError on `old_providers`, masking the original error
    old_providers = container[PluggableRunContext].providers
    try:
        container[PluggableRunContext].providers = ctx
        yield ctx
    finally:
        container[PluggableRunContext].providers = old_providers
|