mirror of https://github.com/dlt-hub/dlt.git
synced 2025-12-17 19:31:30 +00:00

Latest commit:
* adds hub extra
* makes hub module more user friendly when hub is not installed
* test and lint fixes
* adds plugin version check util function
* adds dlt-runtime to hub extra, minimal import tests
* bumps to dlthub 0.20.0 alpha
* lists pipelines with cli using the same functions as the dashboard; dlt pipeline will list pipelines by default
* adds configured profiles method on context so only profiles with configs or pipelines are listed
* adds list of locations that contained actual configs to the provider interface
* improves workspace and profile commands
* test fixes
* fixes tests

323 lines
12 KiB
Python
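"""Tests for the `dlt pipeline` CLI: the list, info, trace, load-package,
failed-jobs, sync, drop and drop-pending-packages commands."""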
import io
import os
import contextlib
import pytest
import logging
from subprocess import CalledProcessError

import dlt
from dlt.common.runners.venv import Venv
from dlt.common.storages.file_storage import FileStorage

from dlt._workspace.cli import echo, _init_command, _pipeline_command

from tests.workspace.cli.utils import (
    auto_echo_default_choice,
    repo_dir,
    cloned_init_repo,
)
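# the names above are pytest fixtures; importing them makes them resolvable
# by name in this module even though they look unused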

def test_pipeline_command_operations(repo_dir: str) -> None:
    _init_command.init_command("chess", "duckdb", repo_dir)

    # drop the pipeline if a previous run left one behind
    try:
        pipeline = dlt.attach(pipeline_name="chess_pipeline")
        print(pipeline.working_dir)
        pipeline.drop()
    except Exception as e:
        print(e)

    # now run the pipeline
    os.environ.pop(
        "DESTINATION__DUCKDB__CREDENTIALS", None
    )  # settings from local project (secrets.toml etc.)
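    # Venv.restore_current() wraps the currently active environment, so the
    # script runs with the same installed packages as the test process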
    venv = Venv.restore_current()
    try:
        print(venv.run_script("chess_pipeline.py"))
    except CalledProcessError as cpe:
        print(cpe.stdout)
        print(cpe.stderr)
        raise

    # we are in the project working dir (thanks to project_files fixture)
    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command("list", "-", None, 0)
        _out = buf.getvalue()
    # is the chess pipeline in the list?
    assert _out.splitlines()[1].startswith("chess_pipeline")
    print(_out)

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command("info", "chess_pipeline", None, 0)
        _out = buf.getvalue()
    # do we have the duckdb destination?
    assert "destination_name: None" in _out
    assert "destination_type: dlt.destinations.duckdb" in _out
    print(_out)

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command("info", "chess_pipeline", None, 1)
        _out = buf.getvalue()
    # was the sources' state displayed?
    assert '"chess": {' in _out
    print(_out)

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command("trace", "chess_pipeline", None, 0)
        _out = buf.getvalue()
    # basic trace
    assert "Pipeline chess_pipeline load step completed in" in _out
    print(_out)

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command("trace", "chess_pipeline", None, 1)
        _out = buf.getvalue()
    # extended trace
    assert "span id:" in _out
    print(_out)

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command("trace", "chess_pipeline", None, 2)
        _out = buf.getvalue()
    # trace with job info
    assert "Jobs details:" in _out
    print(_out)
    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command("load-package", "chess_pipeline", None, 2)
        _out = buf.getvalue()
    # has package info
    assert "The package with load" in _out
    print(_out)

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command("failed-jobs", "chess_pipeline", None, 2)
        _out = buf.getvalue()
    # no failed jobs
    assert "No failed jobs found" in _out
    print(_out)
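    # echo.always_choose(False, True) auto-answers the confirmation prompts
    # that interactive commands like sync and drop would otherwise show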
    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        # execute sync
        with echo.always_choose(False, True):
            _pipeline_command.pipeline_command("sync", "chess_pipeline", None, 0)
        _out = buf.getvalue()
    print(_out)

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        # after sync there's no trace
        _pipeline_command.pipeline_command("info", "chess_pipeline", None, 0)
        _out = buf.getvalue()
    # sync was executed
    assert "Pipeline does not have last run trace." in _out
    print(_out)

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        with echo.always_choose(False, True):
            _pipeline_command.pipeline_command(
                "drop", "chess_pipeline", None, 0, resources=["players_games"]
            )
        _out = buf.getvalue()

    assert "Selected resource(s): ['players_games']" in _out
    assert (
        "WARNING: Unless hardcoded, credentials are loaded from environment variables and/or"
        " configuration files."
        not in _out
    )

    # Command was executed
    pipeline = dlt.attach(pipeline_name="chess_pipeline")
    assert "players_games" not in pipeline.default_schema.tables

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        # Test sync destination and drop when local state is missing
        pipeline._pipeline_storage.delete_folder("", recursively=True)
        with echo.always_choose(False, True):
            _pipeline_command.pipeline_command(
                "drop",
                "chess_pipeline",
                None,
                0,
                destination=pipeline.destination,
                dataset_name=pipeline.dataset_name,
                resources=["players_profiles"],
            )
        _out = buf.getvalue()

    assert "could not be restored: the pipeline was not found in " in _out
    assert "Selected resource(s): ['players_profiles']" in _out

    # Command was executed
    pipeline = dlt.attach(pipeline_name="chess_pipeline")
    assert "players_profiles" not in pipeline.default_schema.tables

def test_pipeline_command_failed_jobs(repo_dir: str) -> None:
    _init_command.init_command("chess", "dummy", repo_dir)

    try:
        pipeline = dlt.attach(pipeline_name="chess_pipeline")
        pipeline.drop()
    except Exception as e:
        print(e)

    # now run the pipeline
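    # with FAIL_PROB=1.0 the dummy destination fails every load job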
    os.environ["FAIL_PROB"] = "1.0"
    # let it fail without raising an exception
    os.environ["RAISE_ON_FAILED_JOBS"] = "false"
    venv = Venv.restore_current()
    try:
        print(venv.run_script("chess_pipeline.py"))
    except CalledProcessError as cpe:
        print(cpe.stdout)
        print(cpe.stderr)
        raise

    # disable logging output of the discovery cache for this test
    logging.getLogger("googleapiclient.discovery_cache").setLevel(logging.ERROR)

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command("trace", "chess_pipeline", None, 0)
        _out = buf.getvalue()
    # trace has LoadInfo with a failed job
    assert "1 FAILED job(s)" in _out

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command("failed-jobs", "chess_pipeline", None, 0)
        _out = buf.getvalue()
    # actual failed job data
    assert "JOB file type: jsonl" in _out

def test_pipeline_command_drop_partial_loads(repo_dir: str) -> None:
    _init_command.init_command("chess", "dummy", repo_dir)
    os.environ["EXCEPTION_PROB"] = "1.0"

    try:
        pipeline = dlt.attach(pipeline_name="chess_pipeline")
        pipeline.drop()
    except Exception as e:
        print(e)

    venv = Venv.restore_current()
    with pytest.raises(CalledProcessError) as cpe:
        print(venv.run_script("chess_pipeline.py"))
    assert "PipelineStepFailed" in cpe.value.stdout
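    # EXCEPTION_PROB=1.0 made the dummy destination raise during load, leaving
    # behind a normalized package that was never completed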
    # complete one job manually to produce a partially loaded package
    pipeline = dlt.attach(pipeline_name="chess_pipeline")
    load_storage = pipeline._get_load_storage()
    load_id = load_storage.normalized_packages.list_packages()[0]
    job = load_storage.normalized_packages.list_new_jobs(load_id)[0]
    load_storage.normalized_packages.start_job(
        load_id, FileStorage.get_file_name_from_file_path(job)
    )
    load_storage.normalized_packages.complete_job(
        load_id, FileStorage.get_file_name_from_file_path(job)
    )

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command("info", "chess_pipeline", None, 1)
        _out = buf.getvalue()
    # one package is partially loaded
    assert "This package is partially loaded" in _out
    print(_out)

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        with echo.always_choose(False, True):
            _pipeline_command.pipeline_command("drop-pending-packages", "chess_pipeline", None, 1)
        _out = buf.getvalue()
    assert "Pending packages deleted" in _out
    print(_out)

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command("drop-pending-packages", "chess_pipeline", None, 1)
        _out = buf.getvalue()
    assert "No pending packages found" in _out
    print(_out)

def test_drop_from_wrong_dir(repo_dir: str) -> None:
    _init_command.init_command("chess", "duckdb", repo_dir)

    os.environ.pop(
        "DESTINATION__DUCKDB__CREDENTIALS", None
    )  # settings from local project (secrets.toml etc.)
    venv = Venv.restore_current()
    try:
        print(venv.run_script("chess_pipeline.py"))
    except CalledProcessError as cpe:
        print(cpe.stdout)
        print(cpe.stderr)
        raise
    # Running from the correct location should not raise the warning
    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command(
            "drop", "chess_pipeline", None, 0, resources=["players_games"]
        )
        _out = buf.getvalue()
    assert (
        "WARNING: You should run this from the same directory as the pipeline script"
        not in _out
    )

    # load the pipeline and point its last run dir at a wrong location to trigger the warning
    pipeline = dlt.attach("chess_pipeline")
    last_run_context = pipeline.get_local_state_val("last_run_context")
    last_run_context["run_dir"] = "wrong_dir"
    pipeline.set_local_state_val("last_run_context", last_run_context)

    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        _pipeline_command.pipeline_command(
            "drop", "chess_pipeline", None, 0, resources=["players_games"]
        )
        _out = buf.getvalue()
    assert "WARNING: You should run this from the same directory as the pipeline script" in _out

def test_pipeline_command_drop_with_global_args(repo_dir: str) -> None:
    """Test that global CLI arguments don't cause errors in the pipeline drop command."""
    _init_command.init_command("chess", "duckdb", repo_dir)

    os.environ.pop("DESTINATION__DUCKDB__CREDENTIALS", None)
    venv = Venv.restore_current()
    try:
        print(venv.run_script("chess_pipeline.py"))
    except CalledProcessError as cpe:
        print(cpe.stdout)
        print(cpe.stderr)
        raise

    # Test drop command with global arguments that should be ignored
    with io.StringIO() as buf, contextlib.redirect_stdout(buf):
        with echo.always_choose(False, True):
            _pipeline_command.pipeline_command(
                "drop",
                "chess_pipeline",
                None,
                0,
                resources=["players_games"],
                no_pwd=False,  # global arg that should be ignored
                debug=False,  # another global arg
            )
        _out = buf.getvalue()
    assert "Selected resource(s): ['players_games']" in _out

    # Verify the command actually executed
    pipeline = dlt.attach(pipeline_name="chess_pipeline")
    assert "players_games" not in pipeline.default_schema.tables