moves dlt core in

Marcin Rudolf
2022-06-03 18:48:12 +02:00
parent ed85c90f03
commit 892e1e3139
154 changed files with 26239 additions and 212 deletions

13
.dockerignore Normal file

@@ -0,0 +1,13 @@
.idea
.direnv
.mypy_cache
.pytest_cache
htmlcov
.coverage
__pycache__
.eggs
*.egg-info
_storage
_test_storage
Dockerfile
*.md

203
LICENSE.txt Normal file

@@ -0,0 +1,203 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2022 ScaleVector
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Makefile

@@ -1,3 +1,28 @@
PYV=$(shell python3 -c "import sys;t='{v[0]}.{v[1]}'.format(v=list(sys.version_info[:2]));sys.stdout.write(t)")
.SILENT:has-poetry
# pipeline version info
AUTV=$(shell python3 -c "from dlt import __version__;print(__version__)")
AUTVMINMAJ=$(shell python3 -c "from dlt import __version__;print('.'.join(__version__.split('.')[:-1]))")
NAME := scalevector/dlt
TAG := $(shell git log -1 --pretty=%h)
IMG := ${NAME}:${TAG}
LATEST := ${NAME}:latest${VERSION_SUFFIX}
VERSION := ${AUTV}${VERSION_SUFFIX}
VERSION_MM := ${AUTVMINMAJ}${VERSION_SUFFIX}
# dbt runner version info
DBT_AUTV=$(shell python3 -c "from dlt.dbt_runner._version import __version__;print(__version__)")
DBT_AUTVMINMAJ=$(shell python3 -c "from dlt.dbt_runner._version import __version__;print('.'.join(__version__.split('.')[:-1]))")
DBT_NAME := scalevector/dlt-dbt-runner
DBT_IMG := ${DBT_NAME}:${TAG}
DBT_LATEST := ${DBT_NAME}:latest${VERSION_SUFFIX}
DBT_VERSION := ${DBT_AUTV}${VERSION_SUFFIX}
DBT_VERSION_MM := ${DBT_AUTVMINMAJ}${VERSION_SUFFIX}
install-poetry:
ifneq ($(VIRTUAL_ENV),)
$(error you cannot be under virtual environment $(VIRTUAL_ENV))
@@ -8,14 +33,70 @@ has-poetry:
poetry --version
dev: has-poetry
# will install itself as editable module
poetry install
poetry run pip install -e ../rasa_data_ingestion
# will install itself as editable module with all the extras
poetry install -E "postgres redshift dbt gcp"
lint:
poetry run mypy --config-file mypy.ini dlt examples
poetry run flake8 --max-line-length=200 dlt examples
# poetry run flake8 --max-line-length=200 dlt examples tests
$(MAKE) lint-security
lint-security:
poetry run bandit -r autopoiesis/ -n 3 -ll
poetry run bandit -r dlt/ -n 3 -l
reset-test-storage:
-rm -r _storage
mkdir _storage
python3 test/tools/create_storages.py
recreate-compiled-deps:
poetry export -f requirements.txt --output _gen_requirements.txt --without-hashes --extras gcp --extras redshift
grep `cat compiled_packages.txt` _gen_requirements.txt > compiled_requirements.txt
publish-library:
poetry version ${VERSION}
poetry build
poetry publish -u __token__
build-image-tags:
@echo ${IMG}
@echo ${LATEST}
@echo ${NAME}:${VERSION_MM}
@echo ${NAME}:${VERSION}
build-image-no-version-tags:
poetry export -f requirements.txt --output _gen_requirements.txt --without-hashes --extras gcp --extras redshift
docker build -f deploy/dlt/Dockerfile --build-arg=COMMIT_SHA=${TAG} --build-arg=IMAGE_VERSION="${VERSION}" . -t ${IMG}
build-image: build-image-no-version-tags
docker tag ${IMG} ${LATEST}
docker tag ${IMG} ${NAME}:${VERSION_MM}
docker tag ${IMG} ${NAME}:${VERSION}
push-image:
docker push ${IMG}
docker push ${LATEST}
docker push ${NAME}:${VERSION_MM}
docker push ${NAME}:${VERSION}
dbt-build-image-tags:
@echo ${DBT_IMG}
@echo ${DBT_LATEST}
@echo ${DBT_VERSION_MM}
@echo ${DBT_VERSION}
dbt-build-image:
poetry export -f requirements.txt --output _gen_requirements_dbt.txt --without-hashes --extras dbt
docker build -f dlt/dbt_runner/Dockerfile --build-arg=COMMIT_SHA=${TAG} --build-arg=IMAGE_VERSION="${DBT_VERSION}" . -t ${DBT_IMG}
docker tag ${DBT_IMG} ${DBT_LATEST}
docker tag ${DBT_IMG} ${DBT_NAME}:${DBT_VERSION_MM}
docker tag ${DBT_IMG} ${DBT_NAME}:${DBT_VERSION}
dbt-push-image:
docker push ${DBT_IMG}
docker push ${DBT_LATEST}
docker push ${DBT_NAME}:${DBT_VERSION_MM}
docker push ${DBT_NAME}:${DBT_VERSION}
docker-login:
docker login -u scalevector -p ${DOCKER_PASS}

1
compiled_packages.txt Normal file

@@ -0,0 +1 @@
cffi\|idna\|simplejson\|pendulum\|grpcio\|google-crc32c

compiled_requirements.txt Normal file

@@ -0,0 +1,6 @@
google-crc32c==1.3.0; python_version >= "3.6" and python_version < "3.11"
grpcio-status==1.43.0; python_version >= "3.6" and python_version < "3.11"
grpcio==1.43.0; python_version >= "3.6"
idna==3.3; python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "3.11" or python_full_version >= "3.6.0" and python_version >= "3.6" and python_version < "3.11"
pendulum==2.1.2; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
simplejson==3.17.6; (python_version >= "2.5" and python_full_version < "3.0.0") or (python_full_version >= "3.3.0")


@@ -0,0 +1,54 @@
FROM python:3.8-slim-bullseye as base
# Metadata
LABEL org.label-schema.vendor="ScaleVector" \
org.label-schema.url="https://scalevector.ai" \
org.label-schema.name="dbt_runner" \
org.label-schema.description="DBT Package Runner for DLT"
# prepare dirs to install autopoiesis
RUN mkdir -p /usr/src/app && mkdir /var/local/app && mkdir /usr/src/app/autopoiesis
WORKDIR /usr/src/app
# System setup for DBT
RUN apt-get update \
&& apt-get dist-upgrade -y \
&& apt-get install -y --no-install-recommends \
git \
ssh-client \
software-properties-common \
make \
build-essential \
ca-certificates \
libpq-dev \
&& apt-get clean \
&& rm -rf \
/var/lib/apt/lists/* \
/tmp/* \
/var/tmp/*
# Env vars
ENV PYTHONIOENCODING=utf-8
ENV LANG=C.UTF-8
# Update python
RUN python -m pip install --upgrade pip setuptools wheel --no-cache-dir
ENV PYTHONPATH $PYTHONPATH:/usr/src/app
ADD _gen_requirements_dbt.txt .
RUN pip3 install -r _gen_requirements_dbt.txt
COPY autopoiesis/common autopoiesis/common
COPY autopoiesis/dbt_runner autopoiesis/dbt_runner
COPY autopoiesis/*.py autopoiesis/
# add build labels and envs
ARG COMMIT_SHA=""
ARG IMAGE_VERSION=""
LABEL commit_sha=${COMMIT_SHA}
LABEL version=${IMAGE_VERSION}
ENV COMMIT_SHA=${COMMIT_SHA}
ENV IMAGE_VERSION=${IMAGE_VERSION}

43
deploy/dlt/Dockerfile Normal file

@@ -0,0 +1,43 @@
# Python 3.8 required
FROM alpine:3.15
# Metadata
LABEL org.label-schema.vendor="ScaleVector" \
org.label-schema.url="https://scalevector.ai" \
org.label-schema.name="DLT" \
org.label-schema.description="DLT is an open-source, Python-native, scalable data loading framework that does not require any DevOps effort to run."
# prepare dirs to install autopoiesis
RUN mkdir -p /tmp/pydlt
WORKDIR /tmp/pydlt
# generated by make recreate-compiled-deps to install packages requiring compiler
# recreate only when you have new deps requiring compilation - step below is very slow
ADD compiled_requirements.txt .
# install alpine deps
RUN apk update &&\
apk add --no-cache python3 ca-certificates curl postgresql &&\
apk add --no-cache --virtual build-deps build-base automake autoconf libtool python3-dev postgresql-dev libffi-dev linux-headers gcc musl-dev &&\
ln -s /usr/bin/python3 /usr/bin/python &&\
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py &&\
rm get-pip.py &&\
pip3 install --upgrade setuptools wheel &&\
rm -r /usr/lib/python*/ensurepip &&\
pip3 install -r compiled_requirements.txt &&\
apk del --purge build-deps
#rm -r /root/.cache
# add build labels and envs
ARG COMMIT_SHA=""
ARG IMAGE_VERSION=""
LABEL commit_sha=${COMMIT_SHA}
LABEL version=${IMAGE_VERSION}
ENV COMMIT_SHA=${COMMIT_SHA}
ENV IMAGE_VERSION=${IMAGE_VERSION}
# install exactly the same version of the library we used to build
RUN pip3 install python-dlt==${IMAGE_VERSION}[gcp,redshift]
RUN rm -r /tmp/pydlt

dlt/__init__.py Normal file

@@ -0,0 +1 @@
from dlt._version import common_version as __version__

3
dlt/_version.py Normal file

@@ -0,0 +1,3 @@
common_version = "0.1.0"
loader_version = "0.1.0"
unpacker_version = "0.1.0"

5
dlt/common/__init__.py Normal file

@@ -0,0 +1,5 @@
from .pendulum import pendulum # noqa: F401
from .json import json # noqa: F401, I251
from .time import sleep # noqa: F401
from .arithmetics import Decimal # noqa: F401
from dlt._version import common_version as __version__

32
dlt/common/arithmetics.py Normal file

@@ -0,0 +1,32 @@
import decimal
from contextlib import contextmanager
from typing import Iterator
from decimal import ROUND_HALF_UP, Decimal, DefaultContext, DivisionByZero, InvalidOperation, localcontext, Context, ConversionSyntax
DefaultContext.rounding = ROUND_HALF_UP
# use a lowercase letter for the exponent
DefaultContext.capitals = 0
# prevent NaN from being returned
DefaultContext.traps[InvalidOperation] = True
# prevent Inf from being returned
DefaultContext.traps[DivisionByZero] = True
decimal.setcontext(DefaultContext)
DEFAULT_NUMERIC_PRECISION = 38
DEFAULT_NUMERIC_SCALE = 9
NUMERIC_DEFAULT_QUANTIZER = Decimal("1." + "0" * DEFAULT_NUMERIC_SCALE)
@contextmanager
def numeric_default_context() -> Iterator[Context]:
with localcontext() as c:
c.prec=DEFAULT_NUMERIC_PRECISION
yield c
def numeric_default_quantize(v: Decimal) -> Decimal:
if v == 0:
return v
return v.quantize(NUMERIC_DEFAULT_QUANTIZER)
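A quick illustration of how these helpers might be used (a minimal sketch, not part of this commit; it relies only on the names defined above):

from dlt.common.arithmetics import Decimal, numeric_default_context, numeric_default_quantize

with numeric_default_context():
    # computed with the default 38-digit precision
    v = Decimal(1) / Decimal(3)
# rounded half-up to the default scale of 9 fractional digits
print(numeric_default_quantize(v))  # 0.333333333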

dlt/common/configuration/__init__.py Normal file

@@ -0,0 +1,11 @@
from .basic_configuration import BasicConfiguration # noqa: F401
from .unpacking_volume_configuration import UnpackingVolumeConfiguration, ProductionUnpackingVolumeConfiguration # noqa: F401
from .loading_volume_configuration import LoadingVolumeConfiguration, ProductionLoadingVolumeConfiguration # noqa: F401
from .schema_volume_configuration import SchemaVolumeConfiguration, ProductionSchemaVolumeConfiguration # noqa: F401
from .pool_runner_configuration import PoolRunnerConfiguration, TPoolType # noqa: F401
from .gcp_client_configuration import GcpClientConfiguration, GcpClientProductionConfiguration # noqa: F401
from .postgres_configuration import PostgresConfiguration, PostgresProductionConfiguration # noqa: F401
from .utils import make_configuration, TConfigSecret, open_configuration_file # noqa: F401
from .exceptions import ( # noqa: F401
ConfigEntryMissingException, ConfigEnvValueCannotBeCoercedException, ConfigIntegrityException, ConfigFileNotFoundException)

dlt/common/configuration/basic_configuration.py Normal file

@@ -0,0 +1,21 @@
from typing import Optional, Tuple
DEVELOPMENT_CONFIG_FILES_STORAGE_PATH = "_storage/config/%s"
PRODUCTION_CONFIG_FILES_STORAGE_PATH = "/run/config/%s"
class BasicConfiguration:
NAME: str = None # the name of the component, must be supplied
SENTRY_DSN: Optional[str] = None # keep None to disable Sentry
PROMETHEUS_PORT: Optional[int] = None # keep None to disable Prometheus
LOG_FORMAT: str = '{asctime}|[{levelname:<21}]|{process}|{name}|{filename}|{funcName}:{lineno}|{message}'
LOG_LEVEL: str = "DEBUG"
IS_DEVELOPMENT_CONFIG: bool = True
REQUEST_TIMEOUT: Tuple[int, int] = (15, 300) # default request timeout for all http clients
CONFIG_FILES_STORAGE_PATH: str = DEVELOPMENT_CONFIG_FILES_STORAGE_PATH
@classmethod
def check_integrity(cls) -> None:
# if CONFIG_FILES_STORAGE_PATH not overwritten and we are in production mode
if cls.CONFIG_FILES_STORAGE_PATH == DEVELOPMENT_CONFIG_FILES_STORAGE_PATH and not cls.IS_DEVELOPMENT_CONFIG:
# set to mount where config files will be present
cls.CONFIG_FILES_STORAGE_PATH = PRODUCTION_CONFIG_FILES_STORAGE_PATH

dlt/common/configuration/exceptions.py Normal file

@@ -0,0 +1,43 @@
from typing import Iterable, Union
from dlt.common.exceptions import DltException
class ConfigurationException(DltException):
def __init__(self, msg: str) -> None:
super().__init__(msg)
class ConfigEntryMissingException(ConfigurationException):
"""thrown when not all required config elements are present"""
def __init__(self, missing_set: Iterable[str]) -> None:
self.missing_set = missing_set
super().__init__('Missing config keys: ' + str(missing_set))
class ConfigEnvValueCannotBeCoercedException(ConfigurationException):
"""thrown when value from ENV cannot be coerced to hinted type"""
def __init__(self, attr_name: str, env_value: str, hint: type) -> None:
self.attr_name = attr_name
self.env_value = env_value
self.hint = hint
super().__init__('env value %s cannot be coerced into type %s in attr %s' % (env_value, str(hint), attr_name))
class ConfigIntegrityException(ConfigurationException):
"""thrown when value from ENV cannot be coerced to hinted type"""
def __init__(self, attr_name: str, env_value: str, info: Union[type, str]) -> None:
self.attr_name = attr_name
self.env_value = env_value
self.info = info
super().__init__('integrity error for attr %s with value %s. %s.' % (attr_name, env_value, info))
class ConfigFileNotFoundException(ConfigurationException):
"""thrown when configuration file cannot be found in config folder"""
def __init__(self, path: str) -> None:
super().__init__(f"Missing config file in {path}")

dlt/common/configuration/gcp_client_configuration.py Normal file

@@ -0,0 +1,34 @@
from dlt.common.typing import StrStr
from dlt.common.configuration.utils import TConfigSecret
class GcpClientConfiguration:
PROJECT_ID: str = None
DATASET: str = None
TIMEOUT: float = 30.0
BQ_CRED_TYPE: str = "service_account"
BQ_CRED_PRIVATE_KEY: TConfigSecret = None
BQ_CRED_TOKEN_URI: str = "https://oauth2.googleapis.com/token"
BQ_CRED_CLIENT_EMAIL: str = None
@classmethod
def check_integrity(cls) -> None:
if cls.BQ_CRED_PRIVATE_KEY and cls.BQ_CRED_PRIVATE_KEY[-1] != "\n":
# must end with new line, otherwise won't be parsed by Crypto
cls.BQ_CRED_PRIVATE_KEY = TConfigSecret(cls.BQ_CRED_PRIVATE_KEY + "\n")
@classmethod
def to_service_credentials(cls) -> StrStr:
return {
"type": cls.BQ_CRED_TYPE,
"project_id": cls.PROJECT_ID,
"private_key": cls.BQ_CRED_PRIVATE_KEY,
"token_uri": cls.BQ_CRED_TOKEN_URI,
"client_email": cls.BQ_CRED_CLIENT_EMAIL
}
class GcpClientProductionConfiguration(GcpClientConfiguration):
PROJECT_ID: str = None
DATASET: str = None
BQ_CRED_PRIVATE_KEY: TConfigSecret = None
BQ_CRED_CLIENT_EMAIL: str = None

dlt/common/configuration/loading_volume_configuration.py Normal file

@@ -0,0 +1,6 @@
class LoadingVolumeConfiguration:
LOADING_VOLUME_PATH: str = "_storage/loading" # path to volume where files to be loaded to analytical storage are stored
DELETE_COMPLETED_JOBS: bool = False # if set to true the folder with completed jobs will be deleted
class ProductionLoadingVolumeConfiguration(LoadingVolumeConfiguration):
LOADING_VOLUME_PATH: str = None

dlt/common/configuration/pool_runner_configuration.py Normal file

@@ -0,0 +1,13 @@
from typing import Literal, Optional
from dlt.common.configuration import BasicConfiguration
TPoolType = Literal["process", "thread", "none"]
class PoolRunnerConfiguration(BasicConfiguration):
MAX_PARALLELISM: Optional[int] = None # how many threads/processes in the pool
EXIT_ON_EXCEPTION: bool = False # should exit on exception
STOP_AFTER_RUNS: int = 10000 # stop the runner with exit code -2 after this many runs; prevents memory fragmentation
POOL_TYPE: TPoolType = None # type of pool to run, must be set in derived configs
RUN_SLEEP: float = 0.5 # how long to sleep between runs with workload, seconds
RUN_SLEEP_IDLE: float = 1.0 # how long to sleep when no more items are pending, seconds
RUN_SLEEP_WHEN_FAILED: float = 1.0 # how long to sleep between the runs when failed

dlt/common/configuration/postgres_configuration.py Normal file

@@ -0,0 +1,25 @@
from dlt.common.configuration.utils import TConfigSecret
class PostgresConfiguration:
PG_DATABASE_NAME: str = None
PG_SCHEMA_PREFIX: str = None
PG_PASSWORD: TConfigSecret = None
PG_USER: str = None
PG_HOST: str = None
PG_PORT: int = 5439
PG_CONNECTION_TIMEOUT: int = 15
@classmethod
def check_integrity(cls) -> None:
cls.PG_DATABASE_NAME = cls.PG_DATABASE_NAME.lower()
cls.PG_SCHEMA_PREFIX = cls.PG_SCHEMA_PREFIX.lower()
cls.PG_PASSWORD = TConfigSecret(cls.PG_PASSWORD.strip())
class PostgresProductionConfiguration(PostgresConfiguration):
PG_DATABASE_NAME: str = None
PG_SCHEMA_PREFIX: str = None
PG_PASSWORD: TConfigSecret = None
PG_USER: str = None
PG_HOST: str = None

dlt/common/configuration/schema_volume_configuration.py Normal file

@@ -0,0 +1,6 @@
class SchemaVolumeConfiguration:
SCHEMA_VOLUME_PATH: str = "_storage/schemas" # path to volume with default schemas
class ProductionSchemaVolumeConfiguration:
SCHEMA_VOLUME_PATH: str = None

dlt/common/configuration/unpacking_volume_configuration.py Normal file

@@ -0,0 +1,6 @@
class UnpackingVolumeConfiguration:
UNPACKING_VOLUME_PATH: str = "_storage/unpacking" # path to volume where unpacking will happen
class ProductionUnpackingVolumeConfiguration:
UNPACKING_VOLUME_PATH: str = None

dlt/common/configuration/utils.py Normal file

@@ -0,0 +1,214 @@
import sys
import semver
from os import environ
from os.path import isdir, isfile
from typing import Any, Dict, List, Mapping, NewType, Optional, Type, TypeVar, Union, Literal, IO, cast
from dlt.common.typing import StrAny
from dlt.common.configuration import BasicConfiguration
from dlt.common.configuration.exceptions import (ConfigEntryMissingException,
ConfigEnvValueCannotBeCoercedException, ConfigFileNotFoundException)
from dlt.common.utils import uniq_id
SIMPLE_TYPES: List[Any] = [int, bool, list, dict, tuple, bytes, set, float]
# those types and Optionals of those types should not be passed to eval function
NON_EVAL_TYPES = [str, None, Any]
# allowed coercions: (target type, source type)
ALLOWED_TYPE_COERCIONS = [(float, int), (str, int), (str, float)]
IS_DEVELOPMENT_CONFIG_KEY: str = "IS_DEVELOPMENT_CONFIG"
CHECK_INTEGRITY_F: str = "check_integrity"
SECRET_STORAGE_PATH: str = "/run/secrets/%s"
TConfiguration = TypeVar("TConfiguration", bound=Type[BasicConfiguration])
TProductionConfiguration = TypeVar("TProductionConfiguration", bound=Type[BasicConfiguration])
TConfigSecret = NewType("TConfigSecret", str)
def make_configuration(config: TConfiguration,
production_config: TProductionConfiguration,
initial_values: StrAny = None,
accept_partial: bool = False,
skip_subclass_check: bool = False) -> TConfiguration:
if not skip_subclass_check:
assert issubclass(production_config, config)
final_config: TConfiguration = config if _is_development_config() else production_config
possible_keys_in_config = _get_config_attrs_with_hints(final_config)
# create dynamic class type to not touch original config variables
derived_config: TConfiguration = cast(TConfiguration,
type(final_config.__name__ + "_" + uniq_id(), (final_config, ), {})
)
# apply initial values while preserving hints
if initial_values:
for k, v in initial_values.items():
setattr(derived_config, k, v)
_apply_environ_to_config(derived_config, possible_keys_in_config)
try:
_is_config_bounded(derived_config, possible_keys_in_config)
_check_configuration_integrity(derived_config)
except ConfigEntryMissingException:
if not accept_partial:
raise
_add_module_version(derived_config)
return derived_config
def has_configuration_file(name: str, config: TConfiguration) -> bool:
return isfile(get_configuration_file_path(name, config))
def open_configuration_file(name: str, mode: str, config: TConfiguration) -> IO[Any]:
path = get_configuration_file_path(name, config)
if not has_configuration_file(name, config):
raise ConfigFileNotFoundException(path)
return open(path, mode)
def get_configuration_file_path(name: str, config: TConfiguration) -> str:
return config.CONFIG_FILES_STORAGE_PATH % name
def is_direct_descendant(child: Type[Any], base: Type[Any]) -> bool:
# TODO: there may be a faster way to get the direct descendant than mro
# note: at index zero there's child
return base == type.mro(child)[1]
def _is_development_config() -> bool:
is_dev_config = True
# get from environment
if IS_DEVELOPMENT_CONFIG_KEY in environ:
is_dev_config = _coerce_single_value(IS_DEVELOPMENT_CONFIG_KEY, environ[IS_DEVELOPMENT_CONFIG_KEY], bool)
return is_dev_config
def _add_module_version(config: TConfiguration) -> None:
try:
v = sys._getframe(1).f_back.f_globals["__version__"]
semver.VersionInfo.parse(v)
setattr(config, "_VERSION", v) # noqa: B010
except KeyError:
pass
def _apply_environ_to_config(config: TConfiguration, keys_in_config: Mapping[str, type]) -> None:
for key, hint in keys_in_config.items():
value = _get_key_value(key, hint)
if value is not None:
value_from_environment_variable = _coerce_single_value(key, value, hint)
# set value
setattr(config, key, value_from_environment_variable)
def _get_key_value(key: str, hint: Type[Any]) -> Optional[str]:
if hint is TConfigSecret:
# try secret storage
try:
# must conform to RFC1123
secret_name = key.lower().replace("_", "-")
secret_path = SECRET_STORAGE_PATH % secret_name
# kubernetes stores secrets as files in a dir, docker compose as plain files
if isdir(secret_path):
secret_path += "/" + secret_name
with open(secret_path, "r") as f:
secret = f.read()
# add secret to environ so forks have access
# TODO: stripping newlines is not always right: it is fine for passwords but not for PEM keys
# TODO: for such secrets this is currently handled in the specific configuration logic
environ[key] = secret.strip()
# do not strip returned secret
return secret
except FileNotFoundError:
pass
return environ.get(key, None)
def _is_config_bounded(config: TConfiguration, keys_in_config: Mapping[str, type]) -> None:
_unbound_attrs = [
key for key in keys_in_config if getattr(config, key) is None and not _is_optional_type(keys_in_config[key])
]
if len(_unbound_attrs) > 0:
raise ConfigEntryMissingException(_unbound_attrs)
def _check_configuration_integrity(config: TConfiguration) -> None:
# python multi-inheritance is cooperative and this would require that all configurations cooperatively
# call each other's check_integrity. this is not possible as we do not know which configs will be
# mixed together in the end.
# get base classes in order of derivation
mro = type.mro(config)
for c in mro:
# check if this class implements check_integrity (skip pure inheritance to not do double work)
if CHECK_INTEGRITY_F in c.__dict__ and callable(getattr(c, CHECK_INTEGRITY_F)):
# access the unbound __func__ to pass the right class type so we check the settings at the tip of the mro
c.__dict__[CHECK_INTEGRITY_F].__func__(config)
def _coerce_single_value(key: str, value: str, hint: Type[Any]) -> Any:
try:
hint_primitive_type = _extract_simple_type(hint)
if hint_primitive_type not in NON_EVAL_TYPES:
# create primitive types out of strings
typed_value = eval(value) # nosec
# for primitive types check coercion
if hint_primitive_type in SIMPLE_TYPES and type(typed_value) != hint_primitive_type:
# allow some exceptions
coerce_exception = next(
(e for e in ALLOWED_TYPE_COERCIONS if e == (hint_primitive_type, type(typed_value))), None)
if coerce_exception:
return hint_primitive_type(typed_value)
else:
raise ConfigEnvValueCannotBeCoercedException(key, typed_value, hint)
return typed_value
else:
return value
except ConfigEnvValueCannotBeCoercedException:
raise
except Exception as exc:
raise ConfigEnvValueCannotBeCoercedException(key, value, hint) from exc
def _extract_simple_type(hint: Type[Any]) -> Type[Any]:
# extract optional type and call recursively
if _is_literal_type(hint):
# assume that all literals are of the same type
return _extract_simple_type(type(hint.__args__[0]))
if _is_optional_type(hint):
# todo: use `get_args` in python 3.8
return _extract_simple_type(hint.__args__[0])
if not hasattr(hint, "__supertype__"):
return hint
# descend into supertypes of NewType
return _extract_simple_type(hint.__supertype__)
def _get_config_attrs_with_hints(config: TConfiguration) -> Dict[str, type]:
keys: Dict[str, type] = {}
mro = type.mro(config)
for cls in reversed(mro):
# update in reverse derivation order so derived classes overwrite hints from base classes
if cls is not object:
keys.update(
[(attr, cls.__annotations__.get(attr, None))
# if hasattr(config, '__annotations__') and attr in config.__annotations__ else None)
for attr in cls.__dict__.keys() if not callable(getattr(cls, attr)) and not attr.startswith("__")
])
return keys
def _is_optional_type(hint: Type[Any]) -> bool:
# todo: use typing get_args and get_origin in python 3.8
if hasattr(hint, "__origin__"):
return hint.__origin__ is Union and type(None) in hint.__args__
return False
def _is_literal_type(hint: Type[Any]) -> bool:
return hasattr(hint, "__origin__") and hint.__origin__ is Literal
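A minimal usage sketch of make_configuration (not part of this commit; LoaderConfiguration is a hypothetical config class used only for illustration):

import os
from dlt.common.configuration import BasicConfiguration, make_configuration

class LoaderConfiguration(BasicConfiguration):
    NAME: str = "loader"
    MAX_RETRIES: int = 3  # the attribute name doubles as the environment variable name

os.environ["MAX_RETRIES"] = "5"  # string values from the environment are coerced to the annotated type
C = make_configuration(LoaderConfiguration, LoaderConfiguration)
assert C.MAX_RETRIES == 5 and C.NAME == "loader"

Note that make_configuration returns a dynamically derived class, so the attributes of the original LoaderConfiguration are left untouched.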


@@ -0,0 +1,58 @@
import jsonlines
from typing import Any, Iterable, Literal, Sequence, IO
from dlt.common import json
from dlt.common.typing import StrAny
TWriterType = Literal["jsonl", "insert_values"]
def write_jsonl(f: IO[Any], rows: Sequence[Any]) -> None:
# use jsonl to write load files https://jsonlines.org/
with jsonlines.Writer(f, dumps=json.dumps) as w:
w.write_all(rows)
def write_insert_values(f: IO[Any], rows: Sequence[StrAny], headers: Iterable[str]) -> None:
# dict lookup is always faster
headers_lookup = {v: i for i, v in enumerate(headers)}
# do not write INSERT INTO command, this must be added together with table name by the loader
f.write("INSERT INTO {}(")
f.write(",".join(map(escape_redshift_identifier, headers)))
f.write(")\nVALUES\n")
def stringify(v: Any) -> str:
if type(v) is bytes:
return f"from_hex('{v.hex()}')"
else:
return str(v)
def write_row(row: StrAny) -> None:
output = ["NULL" for _ in range(len(headers_lookup))]
for n,v in row.items():
output[headers_lookup[n]] = escape_redshift_literal(v) if type(v) is str else stringify(v)
f.write("(")
f.write(",".join(output))
f.write(")")
for row in rows[:-1]:
write_row(row)
f.write(",\n")
write_row(rows[-1])
f.write(";")
def escape_redshift_literal(v: str) -> str:
# https://www.postgresql.org/docs/9.3/sql-syntax-lexical.html
# looks like this is the only thing we need to escape for Postgres > 9.1
# redshift keeps \ as escape character which is pre 9 behavior
return "'" + v.replace("'", "''").replace("\\", "\\\\") + "'"
def escape_redshift_identifier(v: str) -> str:
return '"' + v.replace('"', '""').replace("\\", "\\\\") + '"'
def escape_bigquery_identifier(v: str) -> str:
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical
return "`" + v.replace("\\", "\\\\").replace("`","\\`") + "`"

58
dlt/common/exceptions.py Normal file

@@ -0,0 +1,58 @@
class DltException(Exception):
pass
class SignalReceivedException(DltException):
def __init__(self, signal_code: int) -> None:
self.signal_code = signal_code
super().__init__(f"Signal {signal_code} received")
class PoolException(DltException):
"""
Thrown by a worker pool to pass information about an exception raised while processing an item
"""
def __init__(self, pool_name: str = None, item: str = None, internal_exception: Exception = None) -> None:
# we need it to make it pickle compatible
if pool_name:
self.pool_name = pool_name
self.item = item
self.internal_exception = internal_exception
super().__init__(f"Pool {pool_name} raised on item {item} with {str(internal_exception)}")
class UnsupportedProcessStartMethodException(DltException):
def __init__(self, method: str) -> None:
self.method = method
super().__init__(f"Process pool supports only fork start method, {method} not supported. Switch the pool type to threading")
class TerminalException(Exception):
"""
Marks an exception that cannot be recovered from; should be mixed into a concrete exception class
"""
pass
class TransientException(Exception):
"""
Marks an exception in an operation that can be retried; should be mixed into a concrete exception class
"""
pass
class TerminalValueError(ValueError, TerminalException):
"""
ValueError that is unrecoverable
"""
pass
class TimeRangeExhaustedException(DltException):
"""
Raised when backfilling is complete and no more time ranges can be generated
"""
def __init__(self, start_ts: float, end_ts: float) -> None:
self.start_ts = start_ts
self.end_ts = end_ts
super().__init__(f"Timerange ({start_ts} to {end_ts}> exhausted")

135
dlt/common/file_storage.py Normal file

@@ -0,0 +1,135 @@
import os
import tempfile
import shutil
from pathlib import Path
from typing import IO, Any, List
class FileStorage:
def __init__(self,
storage_path: str,
file_type: str = "t",
makedirs: bool = False) -> None:
# make it absolute path
self.storage_path = os.path.join(os.path.realpath(storage_path), '')
self.file_type = file_type
if makedirs:
os.makedirs(storage_path, exist_ok=True)
@classmethod
def from_file(cls, file_path: str, file_type: str = "t",) -> "FileStorage":
return cls(os.path.dirname(file_path), file_type)
def save(self, relative_path: str, data: Any) -> str:
return self.save_atomic(self.storage_path, relative_path, data, file_type=self.file_type)
@staticmethod
def save_atomic(storage_path: str, relative_path: str, data: Any, file_type: str = "t") -> str:
with tempfile.NamedTemporaryFile(dir=storage_path, mode="w" + file_type, delete=False) as f:
tmp_path = f.name
f.write(data)
try:
dest_path = os.path.join(storage_path, relative_path)
os.rename(tmp_path, dest_path)
return dest_path
except Exception:
if os.path.isfile(tmp_path):
os.remove(tmp_path)
raise
def load(self, relative_path: str) -> Any:
# raises on file not existing
with self.open(relative_path) as text_file:
return text_file.read()
def delete(self, relative_path: str) -> None:
file_path = self._make_path(relative_path)
if os.path.isfile(file_path):
os.remove(file_path)
else:
raise FileNotFoundError(file_path)
def delete_folder(self, relative_path: str, recursively: bool = False) -> None:
folder_path = self._make_path(relative_path)
if os.path.isdir(folder_path):
if recursively:
shutil.rmtree(folder_path)
else:
os.rmdir(folder_path)
else:
raise NotADirectoryError(folder_path)
def open(self, relative_path: str, mode: str = "r") -> IO[Any]:
return open(self._make_path(relative_path), mode + self.file_type)
def open_temp(self, delete: bool = False, mode: str = "w", file_type: str = None) -> IO[Any]:
ft = file_type or self.file_type
return tempfile.NamedTemporaryFile(dir=self.storage_path, mode=mode + ft, delete=delete)
def has_file(self, relative_path: str) -> bool:
return os.path.isfile(self._make_path(relative_path))
def has_folder(self, relative_path: str) -> bool:
return os.path.isdir(self._make_path(relative_path))
def list_folder_files(self, relative_path: str, to_root: bool = True) -> List[str]:
scan_path = self._make_path(relative_path)
if to_root:
# list files in relative path, returning paths relative to storage root
return [os.path.join(relative_path, e.name) for e in os.scandir(scan_path) if e.is_file()]
else:
# or to the folder
return [e.name for e in os.scandir(scan_path) if e.is_file()]
def list_folder_dirs(self, relative_path: str, to_root: bool = True) -> List[str]:
# list content of relative path, returning paths relative to storage root
scan_path = self._make_path(relative_path)
if to_root:
# list folders in relative path, returning paths relative to storage root
return [os.path.join(relative_path, e.name) for e in os.scandir(scan_path) if e.is_dir()]
else:
# or to the folder
return [e.name for e in os.scandir(scan_path) if e.is_dir()]
def create_folder(self, relative_path: str, exists_ok: bool = False) -> None:
os.makedirs(self._make_path(relative_path), exist_ok=exists_ok)
def copy_cross_storage_atomically(self, dest_volume_root: str, dest_relative_path: str, source_path: str, dest_name: str) -> None:
external_tmp_file = tempfile.mktemp(dir=dest_volume_root)
# first copy to temp file
shutil.copy(self._make_path(source_path), external_tmp_file)
# then rename to dest name
external_dest = os.path.join(dest_volume_root, dest_relative_path, dest_name)
try:
os.rename(external_tmp_file, external_dest)
except Exception:
if os.path.isfile(external_tmp_file):
os.remove(external_tmp_file)
raise
def atomic_rename(self, from_relative_path: str, to_relative_path: str) -> None:
os.rename(
self._make_path(from_relative_path),
self._make_path(to_relative_path)
)
def in_storage(self, path: str) -> bool:
file = os.path.realpath(path)
# return true, if the common prefix of both is equal to directory
# e.g. /a/b/c/d.rst and directory is /a/b, the common prefix is /a/b
return os.path.commonprefix([file, self.storage_path]) == self.storage_path
def to_relative_path(self, path: str) -> str:
if not self.in_storage(path):
raise ValueError(path)
return os.path.relpath(path, start=self.storage_path)
def get_file_stem(self, path: str) -> str:
return Path(os.path.basename(path)).stem
def get_file_name(self, path: str) -> str:
return Path(path).name
def _make_path(self, relative_path: str) -> str:
return os.path.join(self.storage_path, relative_path)
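A minimal usage sketch of FileStorage (paths are illustrative; not part of this commit):

from dlt.common.file_storage import FileStorage

storage = FileStorage("_storage/example", makedirs=True)
storage.save("hello.txt", "hi!")            # atomic write: temp file + rename
print(storage.load("hello.txt"))            # -> hi!
print(storage.list_folder_files("."))       # paths relative to the storage root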

46
dlt/common/json.py Normal file

@@ -0,0 +1,46 @@
import base64
from datetime import date, datetime # noqa: I251
from functools import partial
from typing import Any, Union
from uuid import UUID
from hexbytes import HexBytes
import simplejson
from simplejson.raw_json import RawJSON
from dlt.common.arithmetics import Decimal
# simplejson._toggle_speedups(False)
def custom_encode(obj: Any) -> Union[RawJSON, str]:
if isinstance(obj, Decimal):
# always return decimals as string (not RawJSON) so they are not deserialized back to float
return str(obj.normalize())
# this works both for standard datetime and pendulum
elif isinstance(obj, datetime):
# See "Date Time String Format" in the ECMA-262 specification.
r = obj.isoformat()
# leave microseconds alone
# if obj.microsecond:
# r = r[:23] + r[26:]
if r.endswith('+00:00'):
r = r[:-6] + 'Z'
return r
elif isinstance(obj, date):
return obj.isoformat()
elif isinstance(obj, UUID):
return str(obj)
elif isinstance(obj, HexBytes):
return obj.hex()
elif isinstance(obj, bytes):
return base64.b64encode(obj).decode('ascii')
raise TypeError(repr(obj) + " is not JSON serializable")
simplejson.loads = partial(simplejson.loads, use_decimal=False)
simplejson.load = partial(simplejson.load, use_decimal=False)
# prevent default decimal serializer (use_decimal=False) and binary serializer (encoding=None)
simplejson.dumps = partial(simplejson.dumps, use_decimal=False, default=custom_encode, encoding=None)
simplejson.dump = partial(simplejson.dump, use_decimal=False, default=custom_encode, encoding=None)
# provide drop-in replacement
json = simplejson
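A short sketch of the customized serializer in action (not part of this commit):

from dlt.common import json, pendulum, Decimal

doc = {"amount": Decimal("10.50"), "at": pendulum.datetime(2022, 6, 3)}
print(json.dumps(doc))
# {"amount": "10.5", "at": "2022-06-03T00:00:00Z"}

Decimals are rendered as strings on purpose so that they are not deserialized back into floats.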

207
dlt/common/logger.py Normal file

@@ -0,0 +1,207 @@
import logging
import json_logging
import traceback
import sentry_sdk
from sentry_sdk.transport import HttpTransport
from logging import LogRecord, Logger
from typing import Any, Callable, Dict, Type
from dlt.common.json import json
from dlt.common.typing import DictStrAny, DictStrStr, StrStr
from dlt.common.configuration import BasicConfiguration
from dlt.common.utils import filter_env_vars
from dlt._version import common_version as __version__
DLT_LOGGER_NAME = "sv-dlt"
LOGGER: Logger = None
def _add_logging_level(level_name: str, level: int, method_name: str = None) -> None:
"""
Comprehensively adds a new logging level to the `logging` module and the
currently configured logging class.
`level_name` becomes an attribute of the `logging` module with the value
`level`. `method_name` becomes a convenience method for both `logging`
itself and the class returned by `logging.getLoggerClass()` (usually just
`logging.Logger`). If `method_name` is not specified, `level_name.lower()` is
used.
To avoid accidental clobbering of existing attributes, this method will
raise an `AttributeError` if the level name is already an attribute of the
`logging` module or if the method name is already present.
"""
if not method_name:
method_name = level_name.lower()
if hasattr(logging, level_name):
raise AttributeError('{} already defined in logging module'.format(level_name))
if hasattr(logging, method_name):
raise AttributeError('{} already defined in logging module'.format(method_name))
if hasattr(logging.getLoggerClass(), method_name):
raise AttributeError('{} already defined in logger class'.format(method_name))
# This method was inspired by the answers to Stack Overflow post
# http://stackoverflow.com/q/2183233/2988730, especially
# http://stackoverflow.com/a/13638084/2988730
def logForLevel(self: logging.Logger, message: str, *args: Any, **kwargs: Any) -> None:
if self.isEnabledFor(level):
self._log(level, message, args, **kwargs)
def logToRoot(message: str, *args: Any, **kwargs: Any) -> None:
logging.root._log(level, message, args, **kwargs)
logging.addLevelName(level, level_name)
setattr(logging, level_name, level)
setattr(logging.getLoggerClass(), method_name, logForLevel)
setattr(logging, method_name, logToRoot)
class _MetricsFormatter(logging.Formatter):
def format(self, record: LogRecord) -> str:
s = super(_MetricsFormatter, self).format(record)
if record.exc_text:
s = s + '|'
# dump metrics dictionary nicely
if "metrics" in record.__dict__:
s = s + ": " + json.dumps(record.__dict__["metrics"])
return s
class _CustomJsonFormatter(json_logging.JSONLogFormatter):
version: StrStr = None
def _format_log_object(self, record: LogRecord, request_util: Any) -> Any:
json_log_object = super(_CustomJsonFormatter, self)._format_log_object(record, request_util)
if self.version:
json_log_object.update({"version": self.version})
return json_log_object
def _init_logging(logger_name: str, level: str, format: str, component: str, version: StrStr) -> Logger:
if logger_name == "root":
logging.basicConfig(level=level)
handler = logging.getLogger().handlers[0]
# handler.setFormatter(_MetricsFormatter(fmt=format, style='{'))
logger = logging.getLogger()
else:
logger = logging.getLogger(DLT_LOGGER_NAME)
logger.propagate = False
logger.setLevel(level)
handler = logging.StreamHandler()
# handler.setFormatter(_MetricsFormatter(fmt=format, style='{'))
logger.addHandler(handler)
# set right formatter
if is_json_logging(format):
json_logging.COMPONENT_NAME = component
json_logging.JSON_SERIALIZER = json.dumps
json_logging.RECORD_ATTR_SKIP_LIST.remove("process")
# set version as class variable as we cannot pass custom constructor parameters
_CustomJsonFormatter.version = version
# the only thing method above effectively does is to replace the formatter
json_logging.init_non_web(enable_json=True, custom_formatter=_CustomJsonFormatter)
if logger_name == "root":
json_logging.config_root_logger()
else:
handler.setFormatter(_MetricsFormatter(fmt=format, style='{'))
return logger
def __getattr__(name: str) -> Callable[..., Any]:
# a catch all function for a module that forwards calls to unknown methods to LOGGER
def wrapper(msg: str, *args: Any, **kwargs: Any) -> None:
if LOGGER:
getattr(LOGGER, name)(msg, *args, **kwargs, stacklevel=2)
return wrapper
def _extract_version_info(config: Type[BasicConfiguration]) -> StrStr:
version_info = {"version": __version__, "component_name": config.NAME}
version = getattr(config, "_VERSION", None)
if version:
version_info["component_version"] = version
# extract envs with build info
version_info.update(filter_env_vars(["COMMIT_SHA", "IMAGE_VERSION"]))
return version_info
def _extract_pod_info() -> StrStr:
return filter_env_vars(["KUBE_NODE_NAME", "KUBE_POD_NAME", "KUBE_POD_NAMESPACE"])
class _SentryHttpTransport(HttpTransport):
timeout: int = 0
def _get_pool_options(self, *a: Any, **kw: Any) -> DictStrAny:
rv = HttpTransport._get_pool_options(self, *a, **kw)
rv['timeout'] = self.timeout
return rv
def _init_sentry(config: Type[BasicConfiguration], version: StrStr) -> None:
if config.SENTRY_DSN:
global sentry_client
sys_ver = version["version"]
release = sys_ver + "_" + version.get("commit_sha", "")
_SentryHttpTransport.timeout = config.REQUEST_TIMEOUT[0]
# TODO: set up automatic sending of log messages by log level (i.e. we send a lot of noisy dbt logs)
# https://docs.sentry.io/platforms/python/guides/logging/
sentry_sdk.init(config.SENTRY_DSN, release=release, transport=_SentryHttpTransport)
# add version tags
for k, v in version.items():
sentry_sdk.set_tag(k, v)
# add kubernetes tags
pod_tags = _extract_pod_info()
for k, v in pod_tags.items():
sentry_sdk.set_tag(k, v)
def init_telemetry(config: Type[BasicConfiguration]) -> None:
if config.PROMETHEUS_PORT:
from prometheus_client import start_http_server, Info
logging.info(f"Starting prometheus server port {config.PROMETHEUS_PORT}")
start_http_server(config.PROMETHEUS_PORT)
# collect info
Info("runs_component_name", "Name of the executing component").info(_extract_version_info(config))
def init_logging_from_config(config: Type[BasicConfiguration]) -> None:
global LOGGER
# add HEALTH and METRICS log levels
_add_logging_level("HEALTH", logging.WARNING - 1, "health")
_add_logging_level("METRICS", logging.WARNING - 2, "metrics")
version = _extract_version_info(config)
LOGGER = _init_logging(
DLT_LOGGER_NAME,
# "root",
config.LOG_LEVEL,
config.LOG_FORMAT,
config.NAME,
version)
_init_sentry(config, version)
def is_json_logging(log_format: str) -> bool:
return log_format == "JSON"
def process_internal_exception(msg: str, exc_info: Any = True) -> None:
# Passing default True value will cause implementation to use data provided by sys.exc_info
if LOGGER:
LOGGER.error(msg, exc_info=exc_info, stacklevel=2)
report_exception()
def report_exception() -> None:
if sentry_sdk.Hub.current:
sentry_sdk.capture_exception()
def pretty_format_exception() -> str:
return traceback.format_exc()
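A minimal sketch of initializing the logger for a component (UnpackerConfiguration below is hypothetical; any BasicConfiguration subclass with NAME set works the same way):

from dlt.common import logger
from dlt.common.configuration import BasicConfiguration

class UnpackerConfiguration(BasicConfiguration):
    NAME: str = "unpacker"

logger.init_logging_from_config(UnpackerConfiguration)
logger.info("runner started")  # forwarded to LOGGER by the module-level __getattr__
logger.metrics("run finished", extra={"metrics": {"processed": 10}})  # custom METRICS level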

147
dlt/common/parser.py Normal file

@@ -0,0 +1,147 @@
import re
from typing import Iterator, Optional, Tuple, Callable, cast
from dlt.common import json
from dlt.common.schema import Schema
from dlt.common.utils import uniq_id, digest128
from dlt.common.typing import TEvent, TEventRowChild, TEventRowRoot, StrAny
# iterator of (table name, row data) tuples
TUnpackedRowIterator = Iterator[Tuple[str, StrAny]]
TExtractFunc = Callable[[Schema, TEvent, str, bool], TUnpackedRowIterator]
RE_UNDERSCORES = re.compile("_+")
RE_LEADING_DIGITS = re.compile(r"^\d+")
INVALID_SQL_IDENT_CHARS = "- *!:,.'\\\"`"
INVALID_SQL_TX = str.maketrans(INVALID_SQL_IDENT_CHARS, "_" * len(INVALID_SQL_IDENT_CHARS))
# subsequent nested fields will be separated with the string below, applies both to field and table names
PATH_SEPARATOR = "__"
# for those paths the complex nested objects should be left in place
# current use case: we want to preserve event_slot__value in db even if it's an object
# TODO: pass table definition and accept complex type
def _should_preserve_complex_value(table: str, field_name: str) -> bool:
path = f"{table}{PATH_SEPARATOR}{field_name}"
return path in ["event_slot__value"]
def _fix_field_name(name: str) -> str:
def camel_to_snake(name: str) -> str:
name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()
# fix the field name so it's an acceptable name for a database column:
# characters from INVALID_SQL_IDENT_CHARS are replaced with underscores and camelCase becomes snake_case
name = camel_to_snake(name.translate(INVALID_SQL_TX))
name = RE_LEADING_DIGITS.sub("_", name)
# replace consecutive underscores with a single one to prevent name clashes with the parent/child separator
return RE_UNDERSCORES.sub("_", name)
def _flatten(table: str, dict_row: TEventRowChild) -> TEventRowChild:
out_rec_row: TEventRowChild = {}
def unpack_row_dicts(dict_row: StrAny, parent_name: Optional[str]) -> None:
for k, v in dict_row.items():
corrected_k = _fix_field_name(k)
child_name = corrected_k if not parent_name else f'{parent_name}{PATH_SEPARATOR}{corrected_k}'
if type(v) is dict:
unpack_row_dicts(v, parent_name=child_name)
if _should_preserve_complex_value(table, child_name):
out_rec_row[child_name] = v # type: ignore
else:
out_rec_row[child_name] = v # type: ignore
unpack_row_dicts(dict_row, None)
return out_rec_row
def _get_child_row_hash(parent_hash: str, child_table: str, list_pos: int) -> str:
# create deterministic unique id of the child row taking into account that all lists are ordered
# and all child tables must be lists
return digest128(f"{parent_hash}_{child_table}_{list_pos}")
def _unpack_row(
schema: Schema,
dict_row: TEventRowChild,
extend: TEventRowChild,
table: str,
parent_hash: Optional[str] = None,
pos: Optional[int] = None
) -> TUnpackedRowIterator:
def _append_child_meta(_row: TEventRowChild, _hash: str, _p_hash: str, _p_pos: int) -> TEventRowChild:
_row["_parent_hash"] = _p_hash
_row["_pos"] = _p_pos
_row.update(extend)
return _row
is_top_level = parent_hash is None
# flatten current row
new_dict_row = _flatten(table, dict_row)
# infer record hash or leave existing primary key if present
record_hash = new_dict_row.get("_record_hash", None)
if not record_hash:
# check if we have primary key: if so use it
primary_key = schema.filter_hints_in_row(table, "primary_key", new_dict_row)
if primary_key:
# create row id from primary key
record_hash = digest128("_".join(map(lambda v: str(v), primary_key.values())))
elif not is_top_level:
# child table row deterministic hash
record_hash = _get_child_row_hash(parent_hash, table, pos)
# link to parent table
_append_child_meta(new_dict_row, record_hash, parent_hash, pos)
else:
# create random row id, note that incremental loads will not work with such tables
record_hash = uniq_id()
new_dict_row["_record_hash"] = record_hash
# if _root_hash propagation requested and we are at the top level then update extend
if "_root_hash" in extend and extend["_root_hash"] is None and is_top_level:
extend["_root_hash"] = record_hash
# generate child tables only for lists
children = [k for k in new_dict_row if type(new_dict_row[k]) is list] # type: ignore
for k in children:
child_table = f"{table}{PATH_SEPARATOR}{k}"
# this will skip empty lists
v: TEventRowChild
for idx, v in enumerate(new_dict_row[k]): # type: ignore
# yield child table row
if type(v) is dict:
yield from _unpack_row(schema, v, extend, child_table, record_hash, idx)
elif type(v) is list:
# unpack lists of lists
raise ValueError(v)
else:
# list of simple types
child_row_hash = _get_child_row_hash(record_hash, child_table, idx)
e = _append_child_meta({"value": v, "_record_hash": child_row_hash}, child_row_hash, record_hash, idx)
yield child_table, e
if not _should_preserve_complex_value(table, k):
# remove child list
del new_dict_row[k] # type: ignore
yield table, new_dict_row
def extract(schema: Schema, source_event: TEvent, load_id: str, add_json: bool) -> TUnpackedRowIterator:
# we will extend event with all the fields necessary to load it as root row
event = cast(TEventRowRoot, source_event)
# identify load id if loaded data must be processed after loading incrementally
event["_load_id"] = load_id
# add original json field, mostly useful for debugging
if add_json:
event["_event_json"] = json.dumps(event)
# find table name
table_name = event.pop("_event_type", None) or schema.schema_name
# TODO: if table_name exist get "_dist_key" and "_timestamp" from the table definition in schema and propagate, if not take them from global hints
# use event type or schema name as table name, request _root_hash propagation
yield from _unpack_row(schema, cast(TEventRowChild, event), {"_root_hash": None}, table_name)
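A small illustration of the column-name normalization applied by _flatten (these are private helpers; shown only to document the naming convention, not part of this commit):

from dlt.common.parser import PATH_SEPARATOR, _fix_field_name

print(_fix_field_name("userID"))          # -> user_id
print(_fix_field_name("Purchase Order"))  # -> purchase_order
# nested fields and child tables are joined with PATH_SEPARATOR, e.g. event_slot__value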

16
dlt/common/pendulum.py Normal file

@@ -0,0 +1,16 @@
import pendulum # noqa: I251
# force UTC as the local timezone to prevent local dates from being written to dbs
pendulum.set_local_timezone(pendulum.timezone('UTC')) # type: ignore
def __utcnow() -> pendulum.DateTime:
"""
Use this function instead of datetime.now
Returns:
pendulum.DateTime -- current time in UTC timezone
"""
return pendulum.now()
pendulum.utcnow = __utcnow # type: ignore
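A quick check of the override above (not part of this commit):

from dlt.common import pendulum

now = pendulum.now()              # "local" time is forced to UTC by the override above
print(now.timezone_name)          # -> UTC
print(pendulum.utcnow() >= now)   # the patched utcnow() helper, also UTC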

181
dlt/common/runners.py Normal file

@@ -0,0 +1,181 @@
import argparse
import multiprocessing
from prometheus_client import Counter, Gauge, Summary, CollectorRegistry, REGISTRY
from typing import Callable, Dict, NamedTuple, Optional, Type, TypeVar, Union, cast
from multiprocessing.pool import ThreadPool, Pool
from dlt.common import logger, signals
from dlt.common.configuration.basic_configuration import BasicConfiguration
from dlt.common.time import sleep
from dlt.common.telemetry import TRunHealth, TRunMetrics, get_logging_extras, get_metrics_from_prometheus
from dlt.common.logger import init_logging_from_config, init_telemetry, process_internal_exception
from dlt.common.signals import register_signals
from dlt.common.utils import str2bool
from dlt.common.exceptions import SignalReceivedException, TimeRangeExhaustedException, UnsupportedProcessStartMethodException
from dlt.common.configuration import PoolRunnerConfiguration
TPool = TypeVar("TPool", bound=Pool)
class TRunArgs(NamedTuple):
single_run: bool
wait_runs: int
RUN_ARGS = TRunArgs(False, 0)
HEALTH_PROPS_GAUGES: Dict[str, Union[Counter, Gauge]] = None
RUN_DURATION_GAUGE: Gauge = None
RUN_DURATION_SUMMARY: Summary = None
LAST_RUN_METRICS: TRunMetrics = None
LAST_RUN_EXCEPTION: BaseException = None
def create_gauges(registry: CollectorRegistry) -> None:
global HEALTH_PROPS_GAUGES, RUN_DURATION_GAUGE, RUN_DURATION_SUMMARY
HEALTH_PROPS_GAUGES = {
"runs_count": Counter("runs_count", "Count runs", registry=registry),
"runs_not_idle_count": Counter("runs_not_idle_count", "Count not idle runs", registry=registry),
"runs_healthy_count": Counter("runs_healthy_count", "Count healthy runs", registry=registry),
"runs_cs_healthy_gauge": Gauge("runs_cs_healthy_gauge", "Count consecutive healthy runs, reset on failed run", registry=registry),
"runs_failed_count": Counter("runs_failed_count", "Count failed runs", registry=registry),
"runs_cs_failed_gauge": Gauge("runs_cs_failed_gauge", "Count consecutive failed runs, reset on healthy run", registry=registry),
"runs_pending_items_gauge": Gauge("runs_pending_items_gauge", "Number of items pending at the end of the run", registry=registry),
}
RUN_DURATION_GAUGE = Gauge("runs_duration_seconds", "Duration of the run", registry=registry)
RUN_DURATION_SUMMARY = Summary("runs_duration_summary", "Summary of the run duration", registry=registry)
def update_gauges() -> TRunHealth:
return get_metrics_from_prometheus(HEALTH_PROPS_GAUGES.values()) # type: ignore
def str2bool_a(v: str) -> bool:
try:
return str2bool(v)
except ValueError:
raise argparse.ArgumentTypeError('Boolean value expected.')
def create_default_args(C: Type[PoolRunnerConfiguration]) -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description=f"Default runner for {C.NAME}")
add_pool_cli_arguments(parser)
return parser
def add_pool_cli_arguments(parser: argparse.ArgumentParser) -> None:
parser.add_argument("--single-run", type=str2bool_a, nargs='?', const=True, default=False, help="exit when all pending items are processed")
parser.add_argument("--wait-runs", type=int, nargs='?', const=True, default=1, help="maximum idle runs to wait for incoming data")
def initialize_runner(C: Type[BasicConfiguration], run_args: Optional[TRunArgs] = None) -> None:
global RUN_ARGS
init_logging_from_config(C)
init_telemetry(C)
create_gauges(REGISTRY)
register_signals()
if run_args is not None:
RUN_ARGS = run_args
def pool_runner(C: Type[PoolRunnerConfiguration], run_f: Callable[[TPool], TRunMetrics]) -> int:
# start pool
pool: Pool = None
if C.POOL_TYPE == "process":
# our pool implementation does not work with the spawn start method
if multiprocessing.get_start_method() != "fork":
raise UnsupportedProcessStartMethodException(multiprocessing.get_start_method())
pool = Pool(processes=C.MAX_PARALLELISM)
elif C.POOL_TYPE == "thread":
pool = ThreadPool(processes=C.MAX_PARALLELISM)
else:
pool = None
logger.info(f"Created {C.POOL_TYPE} pool with {C.MAX_PARALLELISM or 'default no.'} workers")
try:
while True:
run_metrics: TRunMetrics = None
try:
HEALTH_PROPS_GAUGES["runs_count"].inc()
# run pool logic
with RUN_DURATION_SUMMARY.time(), RUN_DURATION_GAUGE.time():
run_metrics = run_f(cast(TPool, pool))
except Exception as exc:
if (type(exc) is SignalReceivedException) or (type(exc) is TimeRangeExhaustedException):
# always exit
raise
else:
process_internal_exception("run")
# the run failed
run_metrics = TRunMetrics(True, True, -1)
# preserve exception
global LAST_RUN_EXCEPTION
LAST_RUN_EXCEPTION = exc
# gather and emit metrics
if not run_metrics.was_idle:
HEALTH_PROPS_GAUGES["runs_not_idle_count"].inc()
if run_metrics.has_failed:
HEALTH_PROPS_GAUGES["runs_failed_count"].inc()
HEALTH_PROPS_GAUGES["runs_cs_failed_gauge"].inc()
HEALTH_PROPS_GAUGES["runs_cs_healthy_gauge"].set(0)
else:
HEALTH_PROPS_GAUGES["runs_healthy_count"].inc()
HEALTH_PROPS_GAUGES["runs_cs_healthy_gauge"].inc()
HEALTH_PROPS_GAUGES["runs_cs_failed_gauge"].set(0)
HEALTH_PROPS_GAUGES["runs_pending_items_gauge"].set(run_metrics.pending_items)
health_props = update_gauges()
logger.health("run health counters", extra={"metrics": health_props})
logger.metrics("run metrics", extra=get_logging_extras([RUN_DURATION_GAUGE, RUN_DURATION_SUMMARY]))
# preserve last run metrics
global LAST_RUN_METRICS
LAST_RUN_METRICS = run_metrics
# exit due to signal
signals.raise_if_signalled()
# exit due to exception and flag
if run_metrics.has_failed and C.EXIT_ON_EXCEPTION:
logger.warning(f"Exiting runner due to EXIT_ON_EXCEPTION flag set")
return -1
# single run may be forced but at least wait_runs must pass
if RUN_ARGS.single_run and (health_props["runs_count"] >= RUN_ARGS.wait_runs and
# and it was idle the whole time, or it was not idle but nothing is pending now
(health_props["runs_not_idle_count"] == 0 or run_metrics.pending_items == 0)):
logger.warning(f"Stopping runner due to single run override")
return 0
if run_metrics.has_failed:
sleep(C.RUN_SLEEP_WHEN_FAILED)
elif run_metrics.pending_items == 0:
# nothing is pending so we can sleep longer
sleep(C.RUN_SLEEP_IDLE)
else:
# more items are pending, sleep (typically) shorter
sleep(C.RUN_SLEEP)
# this allows recycling of long-living processes whose memory gets fragmented
# exit after the runner sleeps so the run period is preserved
if health_props["runs_count"] == C.STOP_AFTER_RUNS:
logger.warning(f"Stopping runner due to max runs {health_props['runs_count']} exceeded")
return -2
except SignalReceivedException as sigex:
# sleep() above may raise SignalReceivedException, which is handled here
logger.warning(f"Exiting runner due to signal {sigex.signal_code}")
return sigex.signal_code
except TimeRangeExhaustedException as tre:
logger.info(f"{str(tre)}, not further processing will be done")
return 0
finally:
if pool:
logger.info("Closing processing pool")
pool.close()
pool.join()
pool = None
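A hedged sketch of wiring a custom runner through the helpers above. It assumes PoolRunnerConfiguration supplies usable defaults for the remaining logging, telemetry and sleep settings, and that "thread" is a valid TPoolType value; the subclass and the run function are illustrative only:

from multiprocessing.pool import ThreadPool

from dlt.common.configuration import PoolRunnerConfiguration, TPoolType
from dlt.common.runners import TRunArgs, initialize_runner, pool_runner
from dlt.common.telemetry import TRunMetrics

class SingleRunConfiguration(PoolRunnerConfiguration):
    # hypothetical subclass: only the pool settings are overridden here
    POOL_TYPE: TPoolType = "thread"
    MAX_PARALLELISM: int = 2

def run(pool: ThreadPool) -> TRunMetrics:
    # one unit of work would go here; report "not idle, not failed, nothing pending"
    return TRunMetrics(was_idle=False, has_failed=False, pending_items=0)

if __name__ == "__main__":
    initialize_runner(SingleRunConfiguration, TRunArgs(single_run=True, wait_runs=1))
    exit(pool_runner(SingleRunConfiguration, run))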

575
dlt/common/schema.py Normal file
View File

@@ -0,0 +1,575 @@
import base64
import binascii
import yaml
import re
from re import Pattern
from copy import deepcopy
from dateutil.parser import isoparse
from typing import Dict, List, Set, Mapping, Optional, Sequence, Tuple, Type, TypedDict, Literal, Any, cast
from dlt.common import pendulum, json, Decimal
from dlt.common.typing import DictStrAny, StrAny, StrStr
from dlt.common.arithmetics import ConversionSyntax
from dlt.common.exceptions import DltException
DataType = Literal["text", "double", "bool", "timestamp", "bigint", "binary", "complex", "decimal", "wei"]
HintType = Literal["not_null", "partition", "cluster", "primary_key", "foreign_key", "sort", "unique"]
ColumnProp = Literal["name", "data_type", "nullable", "partition", "cluster", "primary_key", "foreign_key", "sort", "unique"]
DATA_TYPES: Set[DataType] = set(["text", "double", "bool", "timestamp", "bigint", "binary", "complex", "decimal", "wei"])
COLUMN_PROPS: Set[ColumnProp] = set(["name", "data_type", "nullable", "partition", "cluster", "primary_key", "foreign_key", "sort", "unique"])
COLUMN_HINTS: Set[HintType] = set(["partition", "cluster", "primary_key", "foreign_key", "sort", "unique"])
class ColumnBase(TypedDict, total=True):
name: str
data_type: DataType
nullable: bool
class Column(ColumnBase, total=True):
partition: bool
cluster: bool
unique: bool
sort: bool
primary_key: bool
foreign_key: bool
Table = Dict[str, Column]
SchemaTables = Dict[str, Table]
SchemaUpdate = Dict[str, List[Column]]
class StoredSchema(TypedDict, total=True):
version: int
engine_version: int
name: str
tables: SchemaTables
preferred_types: Mapping[str, DataType]
hints: Mapping[HintType, Sequence[str]]
excludes: Sequence[str]
includes: Sequence[str]
class Schema:
VERSION_TABLE_NAME = "_version"
VERSION_COLUMN_NAME = "version"
LOADS_TABLE_NAME = "_loads"
ENGINE_VERSION = 2
def __init__(self, name: str) -> None:
self._schema_tables: SchemaTables = {}
self._schema_name: str = name
self._version = 1
# list of preferred types: map regex on columns into types
self._preferred_types: Mapping[str, DataType] = {}
# compiled regexes
self._compiled_preferred_types: List[Tuple[Pattern[str], DataType]] = []
# table hints
self._hints: Mapping[HintType, Sequence[str]] = {}
self._compiled_hints: Dict[HintType, Sequence[Pattern[str]]] = {}
# excluded paths
self._excludes: Sequence[str] = []
self._compiled_excludes: Sequence[Pattern[str]] = []
# included paths
self._includes: Sequence[str] = []
self._compiled_includes: Sequence[Pattern[str]] = []
# add version table
self._add_standard_tables()
# add standard hints
self._add_standard_hints()
# compile hints
self._compile_regexes()
@classmethod
def from_dict(cls, stored_schema: StoredSchema) -> "Schema":
# upgrade engine if needed
cls._upgrade_engine_version(stored_schema, stored_schema["engine_version"], cls.ENGINE_VERSION)
# create new instance from dict
self: Schema = cls(stored_schema["name"])
self._schema_tables = stored_schema["tables"]
# TODO: generate difference if STANDARD SCHEMAS are different than those and increase schema version
if Schema.VERSION_TABLE_NAME not in self._schema_tables:
raise SchemaCorruptedException(f"Schema must contain table {Schema.VERSION_TABLE_NAME}")
if Schema.LOADS_TABLE_NAME not in self._schema_tables:
raise SchemaCorruptedException(f"Schema must contain table {Schema.LOADS_TABLE_NAME}")
# verify table schemas
for table_name, table in self._schema_tables.items():
for column_name in table:
# add default hints to tables
column = self._add_missing_hints(table[column_name])
# overwrite column name
column["name"] = column_name
# verify column
self._verify_column(table_name, column_name, column)
table[column_name] = column
self._version = stored_schema["version"]
self._preferred_types = stored_schema["preferred_types"]
self._hints = stored_schema["hints"]
self._excludes = stored_schema["excludes"]
self._includes = stored_schema["includes"]
# compile regexes
self._compile_regexes()
return self
def filter_row(self, table_name: str, row: StrAny, path_separator: str) -> StrAny:
# include and exclude paths follow the naming convention of the unpacker and correspond to json document nesting
# current version of the unpacker separates json elements with __
def _exclude(path: str) -> bool:
is_included = False
is_excluded = any(exclude.search(path) for exclude in self._compiled_excludes)
if is_excluded:
# we may have an exception if explicitly included
is_included = any(include.search(path) for include in self._compiled_includes)
return is_excluded and not is_included
# check if any of the fields in the row is excluded
for field_name in list(row.keys()):
path = f"{table_name}{path_separator}{field_name}"
# excluded if any rule matches
if _exclude(path):
# TODO: copy to new instance
del row[field_name] # type: ignore
return row
def coerce_row(self, table_name: str, row: StrAny) -> Tuple[StrAny, List[Column]]:
table_schema: Table = self._schema_tables.get(table_name, {})
new_columns: List[Column] = []
new_row: DictStrAny = {}
for col_name, v in row.items():
# skip None values, we should infer the types later
if v is None:
# just check if column is nullable if exists
self._coerce_null_value(table_schema, table_name, col_name)
else:
new_col_name, new_col_def, new_v = self._coerce_non_null_value(table_schema, table_name, col_name, v)
new_row[new_col_name] = new_v
if new_col_def:
new_columns.append(new_col_def)
return new_row, new_columns
def filter_hints_in_row(self, table_name: str, hint_type: HintType, row: StrAny) -> StrAny:
rv_row: DictStrAny = {}
column_prop: ColumnProp = self._hint_to_column_prop(hint_type)
try:
table = self.get_table(table_name)
for column_name in table:
if column_name in row:
hint_value = table[column_name][column_prop]
if (hint_value and column_prop != "nullable") or (column_prop == "nullable" and not hint_value):
rv_row[column_name] = row[column_name]
except KeyError:
for k, v in row.items():
if self._infer_hint(hint_type, v, k):
rv_row[k] = v
# dicts are ordered and we will return the rows with hints in the same order as they appear in the columns
return rv_row
def update_schema(self, table_name: str, updated_columns: List[Column]) -> None:
# all tables in the schema must start with the schema name
# if not table_name.startswith(f"{self._schema_name}"):
# raise InvalidTableNameException(self._schema_name, table_name)
if table_name not in self._schema_tables:
# add the whole new table to SchemaTables
self._schema_tables[table_name] = {c["name"]: c for c in updated_columns}
else:
# add several columns to existing table
table_schema = self._schema_tables[table_name]
for column in updated_columns:
column_name = column["name"]
if column_name in table_schema:
# we do not support changing existing columns
if not Schema._compare_columns(table_schema[column_name], column):
# attempt to update to incompatible columns
raise CannotCoerceColumnException(table_name, column_name, table_schema[column_name]["data_type"], column["data_type"], None)
else:
table_schema[column_name] = column
# bump schema version
self._version += 1
def get_schema_update_for(self, table_name: str, t: Table) -> List[Column]:
# gets new columns to be added to "t" to bring it up to date with the stored schema
diff_c: List[Column] = []
s_t = self.get_table(table_name)
for c in s_t.values():
if c["name"] not in t:
diff_c.append(c)
return diff_c
def get_table(self, table_name: str) -> Table:
return self._schema_tables[table_name]
def to_dict(self) -> StoredSchema:
return {
"tables": self._schema_tables,
"name": self._schema_name,
"version": self._version,
"preferred_types": self._preferred_types,
"hints": self._hints,
"excludes": self._excludes,
"includes": self._includes,
"engine_version": Schema.ENGINE_VERSION
}
@property
def schema_version(self) -> int:
return self._version
@property
def schema_name(self) -> str:
return self._schema_name
@property
def schema_tables(self) -> SchemaTables:
return self._schema_tables
def as_yaml(self, remove_default_hints: bool = False) -> str:
d = self.to_dict()
clean_tables = deepcopy(d["tables"])
for t in clean_tables.values():
for c in t.values():
# do not save names
del c["name"] # type: ignore
# remove hints with default values
if remove_default_hints:
for h in list(c.keys()):
if type(c[h]) is bool and c[h] is False and h != "nullable": # type: ignore
del c[h] # type: ignore
d["tables"] = clean_tables
return cast(str, yaml.dump(d, allow_unicode=True, default_flow_style=False, sort_keys=False))
def _infer_column(self, k: str, v: Any) -> Column:
return Column(
name=k,
data_type=self._map_value_to_column_type(v, k),
nullable=not self._infer_hint("not_null", v, k),
partition=self._infer_hint("partition", v, k),
cluster=self._infer_hint("cluster", v, k),
sort=self._infer_hint("sort", v, k),
unique=self._infer_hint("unique", v, k),
primary_key=self._infer_hint("primary_key", v, k),
foreign_key=self._infer_hint("foreign_key", v, k)
)
def _coerce_null_value(self, table_schema: Table, table_name: str, col_name: str) -> None:
if col_name in table_schema:
existing_column = table_schema[col_name]
if not existing_column["nullable"]:
raise CannotCoerceNullException(table_name, col_name)
def _coerce_non_null_value(self, table_schema: Table, table_name: str, col_name: str, v: Any) -> Tuple[str, Column, Any]:
new_column: Column = None
rv = v
variant_col_name = col_name
if col_name in table_schema:
existing_column = table_schema[col_name]
# existing columns cannot be changed so we must coerce the value in the row
py_data_type = Schema._py_type_to_sc_type(type(v))
if existing_column["data_type"] != py_data_type:
# first try to coerce existing value into destination type
try:
rv = Schema._coerce_type(existing_column["data_type"], py_data_type, v)
except (ValueError, SyntaxError):
# for complex types we must coerce to text
if py_data_type == "complex":
py_data_type = "text"
rv = Schema._coerce_type("text", "complex", v)
# if that does not work we must create variant extension to the table
variant_col_name = f"{col_name}_v_{py_data_type}"
# if variant exists check type, coercions are not required
if variant_col_name in table_schema:
if table_schema[variant_col_name]["data_type"] != py_data_type:
raise CannotCoerceColumnException(table_name, variant_col_name, table_schema[variant_col_name]["data_type"], py_data_type, v)
else:
# add a new variant column
new_column = self._infer_column(variant_col_name, v)
# must have variant type, not preferred or coerced type
new_column["data_type"] = py_data_type
else:
# just copy row: types match
pass
else:
# infer new column
new_column = self._infer_column(col_name, v)
# and coerce type if inference changed the python type
py_type = Schema._py_type_to_sc_type(type(v))
rv = Schema._coerce_type(new_column["data_type"], py_type, v)
return variant_col_name, new_column, rv
def _map_value_to_column_type(self, v: Any, k: str) -> DataType:
mapped_type = Schema._py_type_to_sc_type(type(v))
# if complex type was detected we must coerce to string
if mapped_type == "complex":
mapped_type = "text"
# get preferred type based on column name
preferred_type = self._get_preferred_type(k)
# try to match python type to preferred
if preferred_type:
# try to coerce to destination type
try:
Schema._coerce_type(preferred_type, mapped_type, v)
# coercion possible so preferred type may be used
mapped_type = preferred_type
except ValueError:
# coercion not possible
pass
return mapped_type
def _get_preferred_type(self, col_name: str) -> Optional[DataType]:
return next((m[1] for m in self._compiled_preferred_types if m[0].search(col_name)), None)
def _infer_hint(self, hint_type: HintType, _: Any, k: str) -> bool:
if hint_type in self._compiled_hints:
return any(h.search(k) for h in self._compiled_hints[hint_type])
else:
return False
def _add_standard_tables(self) -> None:
version_table: Table = {
"version": self._add_missing_hints({
"name": "version",
"data_type": "bigint",
"nullable": False,
}),
"engine_version": self._add_missing_hints({
"name": "engine_version",
"data_type": "bigint",
"nullable": False
}),
"inserted_at": self._add_missing_hints({
"name": "inserted_at",
"data_type": "timestamp",
"nullable": False
})
}
self._schema_tables[Schema.VERSION_TABLE_NAME] = version_table
load_table: Table = {
"load_id": self._add_missing_hints({
"name": "load_id",
"data_type": "text",
"nullable": False
}),
"status": self._add_missing_hints({
"name": "status",
"data_type": "bigint",
"nullable": False
}),
"inserted_at": self._add_missing_hints({
"name": "inserted_at",
"data_type": "timestamp",
"nullable": False
})
}
self._schema_tables[Schema.LOADS_TABLE_NAME] = load_table
def _add_standard_hints(self) -> None:
self._hints = {
"not_null": ["^_record_hash$", "^_root_hash$", "^_parent_hash$", "^_pos$", "_load_id"],
"foreign_key": ["^_parent_hash$"],
"unique": ["^_record_hash$"]
}
def _compile_regexes(self) -> None:
for pattern, dt in self._preferred_types.items():
# add tuples to be searched in coercions
self._compiled_preferred_types.append((re.compile(pattern), dt))
for hint_name, hint_list in self._hints.items():
# compile hints which are column matching regexes
self._compiled_hints[hint_name] = list(map(lambda hint: re.compile(hint), hint_list))
self._compiled_excludes = list(map(lambda exclude: re.compile(exclude), self._excludes))
self._compiled_includes = list(map(lambda include: re.compile(include), self._includes))
@staticmethod
def _verify_column(table_name: str, column_name: str, column: Column) -> None:
existing_props = set(column.keys())
missing_props = COLUMN_PROPS.difference(existing_props)
if len(missing_props) > 0:
raise SchemaCorruptedException(f"In table {table_name} column {column_name}: Column definition is missing following properties {missing_props}")
data_type = column["data_type"]
if data_type not in DATA_TYPES:
raise SchemaCorruptedException(f"In table {table_name} column {column_name}: {data_type} is not one of available types: {DATA_TYPES}")
for p, v in column.items():
if p in COLUMN_HINTS and not type(v) is bool:
raise SchemaCorruptedException(f"In table {table_name} column {column_name}: hint {p} is not boolean.")
@staticmethod
def _upgrade_engine_version(schema_dict: StoredSchema, from_engine: int, to_engine: int) -> None:
if from_engine == 1:
schema_dict["engine_version"] = 2
schema_dict["includes"] = []
schema_dict["excludes"] = []
from_engine = 2
if from_engine == 2:
pass
if from_engine != to_engine:
raise SchemaEngineNoUpgradePathException(schema_dict["name"], schema_dict["engine_version"], from_engine, to_engine)
@staticmethod
def _add_missing_hints(column: ColumnBase) -> Column:
return {
**{ # type:ignore
"partition": False,
"cluster": False,
"unique": False,
"sort": False,
"primary_key": False,
"foreign_key": False,
},
**column
}
@staticmethod
def _py_type_to_sc_type(t: Type[Any]) -> DataType:
if t is float:
return "double"
elif t is int:
return "bigint"
elif t is bool:
return "bool"
elif t is bytes:
return "binary"
elif t in [dict, list]:
return "complex"
elif t is Decimal:
return "decimal"
else:
return "text"
@staticmethod
def _coerce_type(to_type: DataType, from_type: DataType, value: Any) -> Any:
if to_type == from_type:
return value
if to_type == "text":
if from_type == "complex":
return json.dumps(value)
else:
return str(value)
if to_type == "binary":
if from_type == "text":
if value.startswith("0x"):
return bytes.fromhex(value[2:])
try:
return base64.b64decode(value, validate=True)
except binascii.Error:
raise ValueError(value)
if from_type == "bigint":
return value.to_bytes((value.bit_length() + 7) // 8, 'little')
if to_type in ["wei", "bigint"]:
if from_type == "bigint":
return value
if from_type in ["decimal", "double"]:
if value % 1 != 0:
# only integer decimals and floats can be coerced
raise ValueError(value)
return int(value)
if from_type == "text":
trim_value = value.strip()
if trim_value.startswith("0x"):
return int(trim_value[2:], 16)
else:
return int(trim_value)
if to_type == "double":
if from_type in ["bigint", "wei", "decimal"]:
return float(value)
if from_type == "text":
trim_value = value.strip()
if trim_value.startswith("0x"):
return float(int(trim_value[2:], 16))
else:
return float(trim_value)
if to_type == "decimal":
if from_type in ["bigint", "wei"]:
return value
if from_type == "double":
return Decimal(value)
if from_type == "text":
trim_value = value.strip()
if trim_value.startswith("0x"):
return int(trim_value[2:], 16)
elif "." not in trim_value and "e" not in trim_value:
return int(trim_value)
else:
try:
return Decimal(trim_value)
except ConversionSyntax:
raise ValueError(trim_value)
if to_type == "timestamp":
if from_type in ["bigint", "double"]:
# returns ISO datetime with timezone
return str(pendulum.from_timestamp(value))
if from_type == "text":
# if parses as ISO date then pass it
try:
isoparse(value)
return value
except ValueError:
# try to convert string to integer, or float
try:
value = int(value)
except ValueError:
# raises ValueError if not parsing correctly
value = float(value)
return str(pendulum.from_timestamp(value))
raise ValueError(value)
@staticmethod
def _compare_columns(a: Column, b: Column) -> bool:
return a["data_type"] == b["data_type"] and a["nullable"] == b["nullable"]
@staticmethod
def _hint_to_column_prop(h: HintType) -> ColumnProp:
if h == "not_null":
return "nullable"
return h
class SchemaException(DltException):
pass
class CannotCoerceColumnException(SchemaException):
def __init__(self, table_name: str, column_name: str, from_type: DataType, to_type: DataType, value: Any) -> None:
super().__init__(f"Cannot coerce type in table {table_name} column {column_name} existing type {from_type} coerced type {to_type} value: {value}")
class CannotCoerceNullException(SchemaException):
def __init__(self, table_name: str, column_name: str) -> None:
super().__init__(f"Cannot coerce NULL in table {table_name} column {column_name} which is not nullable")
class InvalidTableNameException(SchemaException):
def __init__(self, schema_name: str, table_name: str) -> None:
self.schema_name = schema_name
self.table_name = table_name
super().__init__(f"All table names must start with '{schema_name}' so {table_name} is invalid")
class SchemaCorruptedException(SchemaException):
pass
class SchemaEngineNoUpgradePathException(SchemaException):
def __init__(self, schema_name: str, init_engine: int, from_engine: int, to_engine: int) -> None:
self.schema_name = schema_name
self.init_engine = init_engine
self.from_engine = from_engine
self.to_engine = to_engine
super().__init__(f"No engine upgrade path in schema {schema_name} from {init_engine} to {to_engine}, stopped at {from_engine}")

35
dlt/common/signals.py Normal file
View File

@@ -0,0 +1,35 @@
import signal
from threading import Event
from typing import Any
from dlt.common import logger
from dlt.common.exceptions import SignalReceivedException
_received_signal: int = 0
exit_event = Event()
def signal_receiver(signal: int, frame: Any) -> None:
global _received_signal
logger.info(f"Signal {signal} received")
if _received_signal > 0:
logger.info(f"Another signal received after {_received_signal}")
return
_received_signal = signal
# awake all threads sleeping on event
exit_event.set()
logger.info(f"Sleeping threads signalled")
def raise_if_signalled() -> None:
if _received_signal:
raise SignalReceivedException(_received_signal)
def register_signals() -> None:
signal.signal(signal.SIGINT, signal_receiver)
signal.signal(signal.SIGTERM, signal_receiver)
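An illustrative loop built on the helpers above; the sleep stands in for real work:

import time

from dlt.common import signals
from dlt.common.exceptions import SignalReceivedException

signals.register_signals()  # install the SIGINT/SIGTERM receiver above
try:
    while True:
        signals.raise_if_signalled()  # stop promptly once a signal has arrived
        time.sleep(1)  # real work would go here
except SignalReceivedException as sig:
    print(f"stopping on signal {sig.signal_code}")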

View File

@@ -0,0 +1 @@
from .schema_storage import SchemaStorage # noqa: F401

View File

@@ -0,0 +1,23 @@
import semver
from dlt.common.exceptions import DltException
class StorageException(DltException):
def __init__(self, msg: str) -> None:
super().__init__(msg)
class NoMigrationPathException(StorageException):
def __init__(self, storage_path: str, initial_version: semver.VersionInfo, migrated_version: semver.VersionInfo, target_version: semver.VersionInfo) -> None:
self.storage_path = storage_path
self.initial_version = initial_version
self.migrated_version = migrated_version
self.target_version = target_version
super().__init__(f"Could not find migration path for {storage_path} from v {initial_version} to {target_version}, stopped at {migrated_version}")
class WrongStorageVersionException(StorageException):
def __init__(self, storage_path: str, initial_version: semver.VersionInfo, target_version: semver.VersionInfo) -> None:
self.storage_path = storage_path
self.initial_version = initial_version
self.target_version = target_version
super().__init__(f"Expected storage {storage_path} with v {target_version} but found {initial_version}")

View File

@@ -0,0 +1,181 @@
import os
from pathlib import Path
from typing import List, Literal, Optional, Sequence, Tuple, Type
from dlt.common import json, pendulum
from dlt.common.file_storage import FileStorage
from dlt.common.dataset_writers import TWriterType, write_jsonl, write_insert_values
from dlt.common.configuration import LoadingVolumeConfiguration
from dlt.common.exceptions import TerminalValueError
from dlt.common.schema import SchemaUpdate, Table
from dlt.common.storages.versioned_storage import VersionedStorage
from dlt.common.typing import StrAny
from dlt.common.storages.exceptions import StorageException
# folders to manage load jobs in a single load package
TWorkingFolder = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"]
class LoaderStorage(VersionedStorage):
STORAGE_VERSION = "1.0.0"
LOADING_FOLDER = "loading" # folder within the volume where load packages are stored
LOADED_FOLDER = "loaded" # folder to keep the loads that were completely processed
NEW_JOBS_FOLDER: TWorkingFolder = "new_jobs"
FAILED_JOBS_FOLDER: TWorkingFolder = "failed_jobs"
STARTED_JOBS_FOLDER: TWorkingFolder = "started_jobs"
COMPLETED_JOBS_FOLDER: TWorkingFolder = "completed_jobs"
LOAD_SCHEMA_UPDATE_FILE_NAME = "schema_updates.json"
SUPPORTED_WRITERS: List[TWriterType] = ["jsonl", "insert_values"]
def __init__(self, is_owner: bool, C: Type[LoadingVolumeConfiguration], writer_type: TWriterType) -> None:
if writer_type not in LoaderStorage.SUPPORTED_WRITERS:
raise TerminalValueError(writer_type)
self.writer_type = writer_type
self.delete_completed_jobs = C.DELETE_COMPLETED_JOBS
super().__init__(LoaderStorage.STORAGE_VERSION, is_owner, FileStorage(C.LOADING_VOLUME_PATH, "t", makedirs=is_owner))
def initialize_storage(self) -> None:
self.storage.create_folder(LoaderStorage.LOADED_FOLDER, exists_ok=True)
self.storage.create_folder(LoaderStorage.LOADING_FOLDER, exists_ok=True)
def create_temp_load_folder(self, load_id: str) -> None:
# delete previous version
if self.storage.has_folder(load_id):
self.storage.delete_folder(load_id, recursively=True)
self.storage.create_folder(load_id)
# create processing directories
self.storage.create_folder(f"{load_id}/{LoaderStorage.NEW_JOBS_FOLDER}")
self.storage.create_folder(f"{load_id}/{LoaderStorage.COMPLETED_JOBS_FOLDER}")
self.storage.create_folder(f"{load_id}/{LoaderStorage.FAILED_JOBS_FOLDER}")
self.storage.create_folder(f"{load_id}/{LoaderStorage.STARTED_JOBS_FOLDER}")
def write_temp_loading_file(self, load_id: str, table_name: str, table: Table, file_id: str, rows: Sequence[StrAny]) -> str:
file_name = self.build_loading_file_name(load_id, table_name, file_id)
with self.storage.open(file_name, mode = "w") as f:
if self.writer_type == "jsonl":
write_jsonl(f, rows)
elif self.writer_type == "insert_values":
write_insert_values(f, rows, table.keys())
return Path(file_name).name
def save_schema_updates(self, load_id: str, schema_updates: Sequence[SchemaUpdate]) -> None:
with self.storage.open(f"{load_id}/{LoaderStorage.LOAD_SCHEMA_UPDATE_FILE_NAME}", mode="w") as f:
json.dump(schema_updates, f)
def commit_temp_load_folder(self, load_id: str) -> None:
self.storage.atomic_rename(load_id, self.get_load_path(load_id))
def list_loads(self) -> Sequence[str]:
loads = self.storage.list_folder_dirs(LoaderStorage.LOADING_FOLDER, to_root=False)
# start from the oldest packages
return sorted(loads)
def list_completed_loads(self) -> Sequence[str]:
loads = self.storage.list_folder_dirs(LoaderStorage.LOADED_FOLDER, to_root=False)
# start from the oldest packages
return sorted(loads)
def list_new_jobs(self, load_id: str) -> Sequence[str]:
new_jobs = self.storage.list_folder_files(f"{self.get_load_path(load_id)}/{LoaderStorage.NEW_JOBS_FOLDER}")
# make sure all jobs have supported writers
wrong_job = next((j for j in new_jobs if LoaderStorage.parse_load_file_name(j)[1] != self.writer_type), None)
if wrong_job is not None:
raise JobWithUnsupportedWriterException(load_id, self.writer_type, wrong_job)
return new_jobs
def list_started_jobs(self, load_id: str) -> Sequence[str]:
return self.storage.list_folder_files(f"{self.get_load_path(load_id)}/{LoaderStorage.STARTED_JOBS_FOLDER}")
def list_failed_jobs(self, load_id: str) -> Sequence[str]:
return self.storage.list_folder_files(f"{self.get_load_path(load_id)}/{LoaderStorage.FAILED_JOBS_FOLDER}")
def list_archived_failed_jobs(self, load_id: str) -> Sequence[str]:
return self.storage.list_folder_files(f"{self.get_archived_path(load_id)}/{LoaderStorage.FAILED_JOBS_FOLDER}")
def begin_schema_update(self, load_id: str) -> Optional[SchemaUpdate]:
schema_update_file = f"{self.get_load_path(load_id)}/{LoaderStorage.LOAD_SCHEMA_UPDATE_FILE_NAME}"
if self.storage.has_file(schema_update_file):
schema_update: SchemaUpdate = json.loads(self.storage.load(schema_update_file))
return schema_update
else:
return None
def commit_schema_update(self, load_id: str) -> None:
load_path = self.get_load_path(load_id)
schema_update_file = f"{load_path}/{LoaderStorage.LOAD_SCHEMA_UPDATE_FILE_NAME}"
self.storage.atomic_rename(schema_update_file, f"{load_path}/{LoaderStorage.COMPLETED_JOBS_FOLDER}/{LoaderStorage.LOAD_SCHEMA_UPDATE_FILE_NAME}")
def start_job(self, load_id: str, file_name: str) -> str:
return self._move_file(load_id, LoaderStorage.NEW_JOBS_FOLDER, LoaderStorage.STARTED_JOBS_FOLDER, file_name)
def fail_job(self, load_id: str, file_name: str, failed_message: Optional[str]) -> str:
load_path = self.get_load_path(load_id)
if failed_message:
self.storage.save(f"{load_path}/{LoaderStorage.FAILED_JOBS_FOLDER}/{file_name}.exception", failed_message)
# move to failed jobs
return self._move_file(load_id, LoaderStorage.STARTED_JOBS_FOLDER, LoaderStorage.FAILED_JOBS_FOLDER, file_name)
def retry_job(self, load_id: str, file_name: str) -> str:
return self._move_file(load_id, LoaderStorage.STARTED_JOBS_FOLDER, LoaderStorage.NEW_JOBS_FOLDER, file_name)
def complete_job(self, load_id: str, file_name: str) -> str:
return self._move_file(load_id, LoaderStorage.STARTED_JOBS_FOLDER, LoaderStorage.COMPLETED_JOBS_FOLDER, file_name)
def archive_load(self, load_id: str) -> None:
load_path = self.get_load_path(load_id)
has_failed_jobs = len(self.list_failed_jobs(load_id)) > 0
# delete load that does not contain failed jobs
if self.delete_completed_jobs and not has_failed_jobs:
self.storage.delete_folder(load_path, recursively=True)
else:
archive_path = self.get_archived_path(load_id)
self.storage.atomic_rename(load_path, archive_path)
def get_load_path(self, load_id: str) -> str:
return f"{LoaderStorage.LOADING_FOLDER}/{load_id}"
def get_archived_path(self, load_id: str) -> str:
return f"{LoaderStorage.LOADED_FOLDER}/{load_id}"
def build_loading_file_name(self, load_id: str, table_name: str, file_id: str) -> str:
file_name = f"{table_name}.{file_id}.{self.writer_type}"
return f"{load_id}/{LoaderStorage.NEW_JOBS_FOLDER}/{file_name}"
def _move_file(self, load_id: str, source_folder: TWorkingFolder, dest_folder: TWorkingFolder, file_name: str) -> str:
load_path = self.get_load_path(load_id)
dest_path = f"{load_path}/{dest_folder}/{file_name}"
self.storage.atomic_rename(f"{load_path}/{source_folder}/{file_name}", dest_path)
return self.storage._make_path(dest_path)
def job_elapsed_time_seconds(self, file_path: str) -> float:
return pendulum.now().timestamp() - os.path.getmtime(file_path) # type: ignore
def _get_file_path(self, load_id: str, folder: TWorkingFolder, file_name: str) -> str:
load_path = self.get_load_path(load_id)
return f"{load_path}/{folder}/{file_name}"
@staticmethod
def parse_load_file_name(file_name: str) -> Tuple[str, TWriterType]:
p = Path(file_name)
ext: TWriterType = p.suffix[1:] # type: ignore
if ext not in LoaderStorage.SUPPORTED_WRITERS:
raise TerminalValueError(ext)
parts = p.stem.split(".")
return (parts[0], ext)
class LoaderStorageException(StorageException):
pass
class JobWithUnsupportedWriterException(LoaderStorageException):
def __init__(self, load_id: str, expected_writer_type: TWriterType, wrong_job: str) -> None:
self.load_id = load_id
self.expected_writer_type = expected_writer_type
self.wrong_job = wrong_job
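A sketch of one load package going through the folders above, assuming LoadingVolumeConfiguration provides usable defaults for LOADING_VOLUME_PATH and DELETE_COMPLETED_JOBS; the table name and row contents are illustrative:

from dlt.common.configuration import LoadingVolumeConfiguration
from dlt.common.utils import uniq_id

storage = LoaderStorage(is_owner=True, C=LoadingVolumeConfiguration, writer_type="jsonl")
storage.initialize_storage()

load_id = uniq_id()
storage.create_temp_load_folder(load_id)
# returns just the file name, which is what the job methods below expect
job_name = storage.write_temp_loading_file(load_id, "event_user", {}, uniq_id(), [{"value": 1}])
storage.commit_temp_load_folder(load_id)

storage.start_job(load_id, job_name)
storage.complete_job(load_id, job_name)
storage.archive_load(load_id)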

View File

@@ -0,0 +1,49 @@
import os
from typing import Optional
from dlt.common import json
from dlt.common.file_storage import FileStorage
from dlt.common.schema import Schema, StoredSchema
class SchemaStorage:
STORE_SCHEMA_FILE_PATTERN = "%s_schema.json"
FOLDER_SCHEMA_FILE = "schema.json"
def __init__(self, schema_storage_root: str, makedirs: bool = False) -> None:
self.storage = FileStorage(schema_storage_root, makedirs=makedirs)
def load_store_schema(self, name: str) -> Schema:
# loads a schema from a store holding many schemas
schema_file = self._get_file_by_name(name)
stored_schema: StoredSchema = json.loads(self.storage.load(schema_file))
return Schema.from_dict(stored_schema)
def load_folder_schema(self, from_folder: str) -> Schema:
# loads schema from a folder containing one default schema
schema_path = self._get_file_in_folder(from_folder)
stored_schema: StoredSchema = json.loads(self.storage.load(schema_path))
return Schema.from_dict(stored_schema)
def save_store_schema(self, schema: Schema) -> str:
# save a schema to schema store
dump = json.dumps(schema.to_dict(), indent=2)
schema_file = self._get_file_by_name(schema.schema_name)
return self.storage.save(schema_file, dump)
def save_folder_schema(self, schema: Schema, in_folder: str) -> str:
# save a schema to a folder holding one schema
dump = json.dumps(schema.to_dict())
schema_file = self._get_file_in_folder(in_folder)
return self.storage.save(schema_file, dump)
def has_store_schema(self, name: str) -> bool:
schema_file = self._get_file_by_name(name)
return self.storage.has_file(schema_file)
def _get_file_by_name(self, name: str) -> str:
return SchemaStorage.STORE_SCHEMA_FILE_PATTERN % name
def _get_file_in_folder(self, folder: str) -> str:
return os.path.join(folder, SchemaStorage.FOLDER_SCHEMA_FILE) # if folder is None else os.path.join(folder, SchemaStorage.SCHEMA_FILE)
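An illustrative round trip through the schema store; the storage path is made up:

from dlt.common.schema import Schema

store = SchemaStorage("_storage/schemas", makedirs=True)
store.save_store_schema(Schema("event"))
assert store.has_store_schema("event")
restored = store.load_store_schema("event")
assert restored.schema_name == "event"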

View File

@@ -0,0 +1,73 @@
from typing import List, Sequence, Tuple, Type
from itertools import groupby
from pathlib import Path
from dlt.common.utils import chunks
from dlt.common.file_storage import FileStorage
from dlt.common.configuration import UnpackingVolumeConfiguration
from dlt.common.storages.versioned_storage import VersionedStorage
class UnpackerStorage(VersionedStorage):
STORAGE_VERSION = "1.0.0"
UNPACKING_FOLDER: str = "unpacking" # folder within the volume where files to be unpacked are stored
UNPACK_FILE_EXTENSION = ".unpack.json"
UNPACK_FILE_EXTENSION_LEN = len(UNPACK_FILE_EXTENSION)
def __init__(self, is_owner: bool, C: Type[UnpackingVolumeConfiguration]) -> None:
super().__init__(UnpackerStorage.STORAGE_VERSION, is_owner, FileStorage(C.UNPACKING_VOLUME_PATH, "t", makedirs=is_owner))
def initialize_storage(self) -> None:
self.storage.create_folder(UnpackerStorage.UNPACKING_FOLDER, exists_ok=True)
def list_files_to_unpack_sorted(self) -> Sequence[str]:
return sorted(self.storage.list_folder_files(UnpackerStorage.UNPACKING_FOLDER))
def get_grouped_iterator(self, files: Sequence[str]) -> "groupby[str, str]":
return groupby(files, lambda f: UnpackerStorage.get_schema_name(f))
@staticmethod
def chunk_by_events(files: Sequence[str], max_events: int, processing_cores: int) -> List[Sequence[str]]:
# should distribute ~ N events evenly among m cores with fallback for small amounts of events
def count_events(file_name : str) -> int:
# return event count from file name
return UnpackerStorage.get_events_count(file_name)
counts = list(map(count_events, files))
# make a list of files containing ~max_events
events_count = 0
m = 0
while events_count < max_events and m < len(files):
events_count += counts[m]
m += 1
processing_chunks = round(m / processing_cores)
if processing_chunks == 0:
# return one small chunk
return [files]
else:
# should return ~ amount of chunks to fill all the cores
return list(chunks(files[:m], processing_chunks))
@staticmethod
def get_events_count(file_name: str) -> int:
return UnpackerStorage._parse_unpack_file_name(file_name)[0]
@staticmethod
def get_schema_name(file_name: str) -> str:
return UnpackerStorage._parse_unpack_file_name(file_name)[2]
@staticmethod
def build_unpack_file_name(schema_name: str, stem: str, event_count: int, load_id: str) -> str:
# builds file name of the unpack file for the tracker
return f"{schema_name}_{stem}_{load_id}_{event_count}{UnpackerStorage.UNPACK_FILE_EXTENSION}"
@staticmethod
def _parse_unpack_file_name(file_name: str) -> Tuple[int, str, str]:
# parses the unpack tracker file name and returns (events found, load id, schema_name)
if not file_name.endswith(UnpackerStorage.UNPACK_FILE_EXTENSION):
raise ValueError(file_name)
parts = Path(file_name[:-UnpackerStorage.UNPACK_FILE_EXTENSION_LEN]).stem.split("_")
return (int(parts[-1]), parts[-2], parts[0])
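A quick check of the tracker file naming round trip above; the schema name, stem and load id are illustrative and must not contain underscores or dots for the parser to split them back correctly:

name = UnpackerStorage.build_unpack_file_name("event", "queue", 120, "9c4ae7")
assert name == "event_queue_9c4ae7_120.unpack.json"
assert UnpackerStorage.get_events_count(name) == 120
assert UnpackerStorage.get_schema_name(name) == "event"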

View File

@@ -0,0 +1,54 @@
import semver
from dlt.common.file_storage import FileStorage
from dlt.common.storages.exceptions import NoMigrationPathException, WrongStorageVersionException
class VersionedStorage:
VERSION_FILE = ".version"
def __init__(self, version: semver.VersionInfo, is_owner: bool, storage: FileStorage) -> None:
self.storage = storage
# read current version
if self.storage.has_file(VersionedStorage.VERSION_FILE):
existing_version = self._load_version()
if existing_version != version:
if existing_version > version:
# version cannot be downgraded
raise NoMigrationPathException(storage.storage_path, existing_version, existing_version, version)
if is_owner:
# only owner can migrate storage
self.migrate_storage(existing_version, version)
# storage should be migrated to desired version
migrated_version = self._load_version()
if version != migrated_version:
raise NoMigrationPathException(storage.storage_path, existing_version, migrated_version, version)
else:
# we cannot use storage and we must wait for owner to upgrade it
raise WrongStorageVersionException(storage.storage_path, existing_version, version)
else:
if is_owner:
self._save_version(version)
else:
raise WrongStorageVersionException(storage.storage_path, semver.VersionInfo.parse("0.0.0"), version)
def migrate_storage(self, from_version: semver.VersionInfo, to_version: semver.VersionInfo) -> None:
# migration example:
# # semver lib supports comparing both to string and other semvers
# if from_version == "1.0.0" and from_version < to_version:
# # do migration
# # save migrated version
# from_version = semver.VersionInfo.parse("1.1.0")
# self._save_version(from_version)
pass
@property
def version(self) -> semver.VersionInfo:
return self._load_version()
def _load_version(self) -> semver.VersionInfo:
return semver.VersionInfo.parse(self.storage.load(VersionedStorage.VERSION_FILE))
def _save_version(self, version: semver.VersionInfo) -> None:
self.storage.save(VersionedStorage.VERSION_FILE, str(version))
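A hedged sketch of a concrete storage that knows how to migrate itself, following the commented example above; the folder name and the 1.0.0 to 1.1.0 step are illustrative:

import semver

from dlt.common.file_storage import FileStorage

class JobsStorage(VersionedStorage):
    STORAGE_VERSION = "1.1.0"

    def __init__(self, is_owner: bool) -> None:
        super().__init__(JobsStorage.STORAGE_VERSION, is_owner, FileStorage("_storage/jobs", makedirs=is_owner))

    def migrate_storage(self, from_version: semver.VersionInfo, to_version: semver.VersionInfo) -> None:
        if from_version == "1.0.0" and from_version < to_version:
            # ... move files / rewrite metadata for 1.1.0 here ...
            self._save_version(semver.VersionInfo.parse("1.1.0"))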

65
dlt/common/telemetry.py Normal file
View File

@@ -0,0 +1,65 @@
from typing import Iterable, Sequence, TypedDict, NamedTuple
from prometheus_client import Gauge
from prometheus_client.metrics import MetricWrapperBase
from dlt.common.typing import DictStrAny, StrAny
class TRunHealth(TypedDict):
# count runs
runs_count: int
# count not idle runs
runs_not_idle_count: int
# count successful runs
runs_healthy_count: int
# count consecutive successful runs
runs_cs_healthy_gauge: int
# count failed runs
runs_failed_count: int
# count consecutive failed runs
runs_cs_failed_gauge: int
# number of items pending at the end of the run
runs_pending_items_gauge: int
class TRunMetrics(NamedTuple):
was_idle: bool
has_failed: bool
pending_items: int
def get_metrics_from_prometheus(gauges: Iterable[MetricWrapperBase]) -> StrAny:
metrics: DictStrAny = {}
for g in gauges:
name = g._name
if g._is_parent():
# for gauges containing many label values, enumerate all
metrics.update(get_metrics_from_prometheus([g.labels(*l) for l in g._metrics.keys()]))
continue
# for gauges with labels: add the label to the name and enumerate samples
if g._labelvalues:
name += "_" + "_".join(g._labelvalues)
for m in g._child_samples():
k = name
if m[0] == "_created":
continue
if m[0] != "_total":
k += m[0]
if g._type == "info":
# actual descriptive value is held in [1], [2] is a placeholder in info
metrics[k] = m[1]
else:
metrics[k] = m[2]
return metrics
def set_gauge_all_labels(gauge: Gauge, value: float) -> None:
if gauge._is_parent():
for l in gauge._metrics.keys():
set_gauge_all_labels(gauge.labels(*l), value)
else:
gauge.set(value)
def get_logging_extras(gauges: Iterable[MetricWrapperBase]) -> StrAny:
return {"metrics": get_metrics_from_prometheus(gauges)}

30
dlt/common/time.py Normal file
View File

@@ -0,0 +1,30 @@
from typing import Optional # noqa
from dlt.common import signals
PAST_TIMESTAMP: float = 0.0
FUTURE_TIMESTAMP: float = 9999999999.0
DAY_DURATION_SEC: float = 24 * 60 * 60.0
def timestamp_within(timestamp: float, min_exclusive: Optional[float], max_inclusive: Optional[float]) -> bool:
"""
check if timestamp is within the given range; None bounds mean unbounded, min is exclusive and max is inclusive
"""
return timestamp > (min_exclusive or PAST_TIMESTAMP) and timestamp <= (max_inclusive or FUTURE_TIMESTAMP)
def timestamp_before(timestamp: float, max_inclusive: Optional[float]) -> bool:
"""
check if timestamp is before max timestamp, inclusive
"""
return timestamp <= (max_inclusive or FUTURE_TIMESTAMP)
def sleep(sleep_seconds: float) -> None:
# do not allow sleeping if signal was received
signals.raise_if_signalled()
# sleep or wait for signal
signals.exit_event.wait(sleep_seconds)
# if signal then raise
signals.raise_if_signalled()
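A few illustrative checks of the range helpers above:

assert timestamp_within(100.0, None, None)        # both bounds open
assert timestamp_within(100.0, 99.0, 100.0)       # max bound is inclusive
assert not timestamp_within(100.0, 100.0, None)   # min bound is exclusive
assert timestamp_before(100.0, None) and timestamp_before(100.0, 100.0)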

32
dlt/common/typing.py Normal file
View File

@@ -0,0 +1,32 @@
from typing import Dict, Any, List, Literal, Mapping, Sequence, TypedDict, Optional, Union
DictStrAny = Dict[str, Any]
DictStrStr = Dict[str, str]
StrAny = Mapping[str, Any] # immutable, covariant entity
StrStr = Mapping[str, str] # immutable, covariant entity
StrStrStr = Mapping[str, Mapping[str, str]] # immutable, covariant entity
class TEventRow(TypedDict, total=False):
_timestamp: float # used for partitioning
_dist_key: str # distribution key used for clustering
_record_hash: str # unique id of current row
_root_hash: str # unique id of top level parent
class TEventRowRoot(TEventRow, total=False):
_load_id: str # load id identifying records loaded together, e.g. so they can be processed incrementally
_event_json: str # dump of the original event
_event_type: str # sets event type which will be translated to table
class TEventRowChild(TEventRow, total=False):
_parent_hash: str # unique id of parent row
_pos: int # position in the list of rows
value: Any # for lists of simple types
class TEvent(TypedDict, total=False):
pass
class TTimestampEvent(TEvent, total=False):
timestamp: float # timestamp of event
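For illustration, a child row shaped the way the unpacker produces it for a list element; the hash values are placeholders:

child: TEventRowChild = {
    "_record_hash": "9c3e0f0b2f9a4d1e8a7b6c5d4e3f2a1b",  # deterministic child hash (placeholder)
    "_parent_hash": "1a2b3c4d5e6f708192a3b4c5d6e7f809",  # hash of the parent row (placeholder)
    "_pos": 0,       # position within the parent list
    "value": "vip",  # payload for lists of simple types
}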

117
dlt/common/utils.py Normal file
View File

@@ -0,0 +1,117 @@
import hashlib
from os import environ
from uuid import uuid4
from typing import Any, Iterator, Sequence, TypeVar, Mapping, List, Union
from dlt.common.typing import StrAny, DictStrAny, StrStr
T = TypeVar("T")
def chunks(list: Sequence[T], n: int) -> Iterator[Sequence[T]]:
for i in range(0, len(list), n):
yield list[i:i + n]
def uniq_id() -> str:
return uuid4().hex
def digest128(v: str) -> str:
return hashlib.shake_128(v.encode("utf-8")).hexdigest(16)
def str2bool(v: str) -> bool:
if isinstance(v, bool):
return v
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise ValueError('Boolean value expected.')
def flatten_list_of_dicts(dicts: Sequence[StrAny]) -> StrAny:
"""
Transforms a list of objects [{K: {...}}, {L: {....}}, ...] -> {K: {...}, L: {...}...}
"""
o: DictStrAny = {}
for d in dicts:
for k,v in d.items():
if k in o:
raise KeyError(f"Cannot flatten with duplicate key {k}")
o[k] = v
return o
def flatten_list_of_str_or_dicts(seq: Sequence[Union[StrAny, str]]) -> StrAny:
"""
Transforms a list of objects or strings [{K: {...}}, L, ...] -> {K: {...}, L: None, ...}
"""
o: DictStrAny = {}
for e in seq:
if type(e) is dict:
for k,v in e.items():
if k in o:
raise KeyError(f"Cannot flatten with duplicate key {k}")
o[k] = v
else:
key = str(e)
if key in o:
raise KeyError(f"Cannot flatten with duplicate key {k}")
o[key] = None
return o
def flatten_dicts_of_dicts(dicts: Mapping[str, Any]) -> Sequence[Any]:
"""
Transforms an object {K: {...}, L: {...}...} -> [{key:K, ....}, {key: L, ...}, ...]
"""
o: List[Any] = []
for k, v in dicts.items():
if type(v) is list:
# if v is a list then add "key" to each list element
for lv in v:
lv["key"] = k
else:
# add as "key" to dict
v["key"] = k
o.append(v)
return o
def tuplify_list_of_dicts(dicts: Sequence[DictStrAny]) -> Sequence[DictStrAny]:
"""
Transform dicts with single key into {"key": orig_key, "value": orig_value}
"""
for d in dicts:
if len(d) > 1:
raise ValueError(f"Tuplify requires one key dicts {d}")
if len(d) == 1:
key = next(iter(d))
# delete key first to avoid name clashes
value = d[key]
del d[key]
d["key"] = key
d["value"] = value
return dicts
def filter_env_vars(vars: List[str]) -> StrStr:
return {k.lower(): environ[k] for k in vars if k in environ}
def update_dict_with_prune(dest: DictStrAny, update: StrAny) -> None:
for k, v in update.items():
if v is not None:
dest[k] = v
elif k in dest:
del dest[k]
def is_interactive() -> bool:
import __main__ as main
return not hasattr(main, '__file__')
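A few quick, illustrative checks of the helpers above:

assert list(chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]
assert flatten_list_of_dicts([{"a": {"x": 1}}, {"b": {"y": 2}}]) == {"a": {"x": 1}, "b": {"y": 2}}
assert str2bool("yes") is True and str2bool("0") is False
assert len(digest128("some value")) == 32  # 128-bit shake digest rendered as 32 hex chars

d = {"status": None, "name": "alice"}
update_dict_with_prune(d, {"status": None, "name": "bob"})
assert d == {"name": "bob"}  # None values prune the key from the destination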

17
dlt/dbt_runner/README.md Normal file
View File

@@ -0,0 +1,17 @@
https://github.com/davidgasquez/kubedbt
https://discourse.getdbt.com/t/running-dbt-in-kubernetes/92
https://github.com/godatadriven/pytest-dbt-core
https://github.com/great-expectations/great_expectations
https://github.com/fal-ai/fal (attach python scripts to models)
https://blog.getdbt.com/how-great-data-teams-test-their-data-models/
PG_DATABASE_NAME=chat_analytics_rasa PG_PASSWORD=8P5gyDPNo9zo582rQG6a PG_USER=loader PG_HOST=3.66.204.141 PG_PORT=5439 dbt list --profiles-dir . --vars '{source_schema_prefix: "unk"}' --resource-type test -s source:*
https://docs.getdbt.com/reference/node-selection/test-selection-examples
# list tests with selectors
PG_DATABASE_NAME=chat_analytics_rasa PG_PASSWORD=8P5gyDPNo9zo582rQG6a PG_USER=loader PG_HOST=3.66.204.141 PG_PORT=5439 dbt list --profiles-dir . --vars '{source_schema_prefix: "unk"}' --resource-type test -s views

View File

@@ -0,0 +1 @@
from ._version import __version__

View File

@@ -0,0 +1 @@
__version__ = "1.0.0"

View File

@@ -0,0 +1,69 @@
from typing import List, Optional, Type
from dlt.common.typing import StrAny
from dlt.common.configuration.utils import TConfigSecret, make_configuration, _get_key_value
from dlt.common.configuration import PoolRunnerConfiguration, TPoolType, PostgresConfiguration, PostgresProductionConfiguration, GcpClientConfiguration, GcpClientProductionConfiguration
from . import __version__
class DBTRunnerConfiguration(PoolRunnerConfiguration):
POOL_TYPE: TPoolType = "none"
STOP_AFTER_RUNS: int = 1
PACKAGE_VOLUME_PATH: str = "_storage/dbt_runner"
PACKAGE_REPOSITORY_URL: str = "https://github.com/scale-vector/rasa_semantic_schema_customization.git"
PACKAGE_REPOSITORY_BRANCH: Optional[str] = None
PACKAGE_REPOSITORY_SSH_KEY: TConfigSecret = TConfigSecret("") # the default is empty value which will disable custom SSH KEY
PACKAGE_PROFILES_DIR: str = "."
PACKAGE_PROFILE_PREFIX: str = "rasa_semantic_schema"
PACKAGE_SOURCE_TESTS_SELECTOR: str = "tag:prerequisites"
PACKAGE_ADDITIONAL_VARS: Optional[StrAny] = None
PACKAGE_RUN_PARAMS: List[str] = ["--fail-fast"]
AUTO_FULL_REFRESH_WHEN_OUT_OF_SYNC: bool = True
SOURCE_SCHEMA_PREFIX: str = None
DEST_SCHEMA_PREFIX: Optional[str] = None
@classmethod
def check_integrity(cls) -> None:
if cls.PACKAGE_REPOSITORY_SSH_KEY and cls.PACKAGE_REPOSITORY_SSH_KEY[-1] != "\n":
# must end with new line, otherwise won't be parsed by Crypto
cls.PACKAGE_REPOSITORY_SSH_KEY = TConfigSecret(cls.PACKAGE_REPOSITORY_SSH_KEY + "\n")
if cls.STOP_AFTER_RUNS != 1:
# always stop after one run
cls.STOP_AFTER_RUNS = 1
class DBTRunnerProductionConfiguration(DBTRunnerConfiguration):
PACKAGE_VOLUME_PATH: str = "/var/local/app" # this is actually not exposed as volume
PACKAGE_REPOSITORY_URL: str = None
def gen_configuration_variant(initial_values: StrAny = None) -> Type[DBTRunnerConfiguration]:
# derive concrete config depending on env vars present
DBTRunnerConfigurationImpl: Type[DBTRunnerConfiguration]
DBTRunnerProductionConfigurationImpl: Type[DBTRunnerProductionConfiguration]
if _get_key_value("PG_SCHEMA_PREFIX", type(str)):
source_schema_prefix = _get_key_value("PG_SCHEMA_PREFIX", type(str))
class DBTRunnerConfigurationPostgress(PostgresConfiguration, DBTRunnerConfiguration):
SOURCE_SCHEMA_PREFIX: str = source_schema_prefix
DBTRunnerConfigurationImpl = DBTRunnerConfigurationPostgress
class DBTRunnerProductionConfigurationPostgress(DBTRunnerProductionConfiguration, PostgresProductionConfiguration, DBTRunnerConfigurationPostgress):
pass
# SOURCE_SCHEMA_PREFIX: str = source_schema_prefix
DBTRunnerProductionConfigurationImpl = DBTRunnerProductionConfigurationPostgress
else:
source_schema_prefix = _get_key_value("DATASET", type(str))
class DBTRunnerConfigurationGcp(GcpClientConfiguration, DBTRunnerConfiguration):
SOURCE_SCHEMA_PREFIX: str = source_schema_prefix
DBTRunnerConfigurationImpl = DBTRunnerConfigurationGcp
class DBTRunnerProductionConfigurationGcp(DBTRunnerProductionConfiguration, GcpClientProductionConfiguration, DBTRunnerConfigurationGcp):
pass
# SOURCE_SCHEMA_PREFIX: str = source_schema_prefix
DBTRunnerProductionConfigurationImpl = DBTRunnerProductionConfigurationGcp
return make_configuration(DBTRunnerConfigurationImpl, DBTRunnerProductionConfigurationImpl, initial_values=initial_values)

View File

@@ -0,0 +1,9 @@
from dlt.common.exceptions import DltException
class DBTRunnerException(DltException):
pass
class PrerequisitesException(DBTRunnerException):
pass

187
dlt/dbt_runner/runner.py Normal file
View File

@@ -0,0 +1,187 @@
from typing import Optional, Sequence, Tuple, Type
from git import GitError
from prometheus_client import REGISTRY, Gauge, CollectorRegistry, Info
from prometheus_client.metrics import MetricWrapperBase
from dlt.common.configuration import GcpClientConfiguration
from dlt.common import logger
from dlt.common.typing import DictStrAny, DictStrStr, StrAny
from dlt.common.logger import process_internal_exception, is_json_logging
from dlt.common.telemetry import get_logging_extras
from dlt.common.file_storage import FileStorage
from dlt.common.runners import TRunArgs, create_default_args, initialize_runner, pool_runner
from dlt.common.telemetry import TRunMetrics
from dlt.dbt_runner.configuration import DBTRunnerConfiguration, gen_configuration_variant
from dlt.dbt_runner.utils import DBTProcessingError, clone_repo, dbt_results, ensure_remote_head, git_custom_key_command, initialize_dbt_logging, is_incremental_schema_out_of_sync_error, run_dbt_command
from dlt.dbt_runner.exceptions import PrerequisitesException
CLONED_PACKAGE_NAME = "dbt_package"
CONFIG: Type[DBTRunnerConfiguration] = None
storage: FileStorage = None
dbt_package_vars: StrAny = None
global_args: Sequence[str] = None
repo_path: str = None
profile_name: str = None
model_elapsed_gauge: Gauge = None
model_exec_info: Info = None
def create_folders() -> Tuple[FileStorage, StrAny, Sequence[str], str, str]:
storage = FileStorage(CONFIG.PACKAGE_VOLUME_PATH, makedirs=True)
dbt_package_vars: DictStrAny = {
"source_schema_prefix": CONFIG.SOURCE_SCHEMA_PREFIX
}
if CONFIG.DEST_SCHEMA_PREFIX:
dbt_package_vars["dest_schema_prefix"] = CONFIG.DEST_SCHEMA_PREFIX
if CONFIG.PACKAGE_ADDITIONAL_VARS:
dbt_package_vars.update(CONFIG.PACKAGE_ADDITIONAL_VARS)
# initialize dbt logging, returns global parameters to dbt command
global_args = initialize_dbt_logging(CONFIG.LOG_LEVEL, is_json_logging(CONFIG.LOG_FORMAT))
# generate path for the dbt package repo
repo_path = storage._make_path(CLONED_PACKAGE_NAME)
# generate profile name
profile_name: str = None
if CONFIG.PACKAGE_PROFILE_PREFIX:
if issubclass(CONFIG, GcpClientConfiguration):
profile_name = "%s_bigquery" % (CONFIG.PACKAGE_PROFILE_PREFIX)
else:
profile_name = "%s_redshift" % (CONFIG.PACKAGE_PROFILE_PREFIX)
return storage, dbt_package_vars, global_args, repo_path, profile_name
def create_gauges(registry: CollectorRegistry) -> Tuple[MetricWrapperBase, MetricWrapperBase]:
return (
Gauge("dbtrunner_model_elapsed_seconds", "Last model processing time", ["model"], registry=registry),
Info("dbtrunner_model_status", "Last execution status of the model", registry=registry)
)
def run_dbt(command: str, command_args: Sequence[str] = None) -> Sequence[dbt_results.BaseResult]:
logger.info(f"Exec dbt command: {global_args} {command} {command_args} {dbt_package_vars} on profile {profile_name or '<project_default>'}")
return run_dbt_command(
repo_path, command,
CONFIG.PACKAGE_PROFILES_DIR,
profile_name=profile_name,
command_args=command_args,
global_args=global_args,
vars=dbt_package_vars
)
def log_dbt_run_results(results: dbt_results.RunExecutionResult) -> None:
# run may return RunResult or something different depending on the error
if issubclass(type(results), dbt_results.BaseResult):
results = [results] # make it iterable
elif issubclass(type(results), dbt_results.ExecutionResult):
pass
else:
logger.warning(f"{type(results)} is unknown and cannot be logged")
return
info: DictStrStr = {}
for res in results:
name = res.node.name
message = res.message
time = res.execution_time
if res.status == dbt_results.RunStatus.Error:
logger.error(f"Model {name} errored! Error: {message}")
else:
logger.info(f"Model {name} {res.status} in {time} seconds with {message}")
model_elapsed_gauge.labels(name).set(time)
info[name] = message
# log execution
model_exec_info.info(info)
logger.metrics("Executed models", extra=get_logging_extras([model_elapsed_gauge, model_exec_info]))
def initialize_package(with_git_command: Optional[str]) -> None:
try:
# cleanup package folder
if storage.has_folder(CLONED_PACKAGE_NAME):
storage.delete_folder(CLONED_PACKAGE_NAME, recursively=True)
logger.info(f"Will clone {CONFIG.PACKAGE_REPOSITORY_URL} head {CONFIG.PACKAGE_REPOSITORY_BRANCH} into {repo_path}")
clone_repo(CONFIG.PACKAGE_REPOSITORY_URL, repo_path, branch=CONFIG.PACKAGE_REPOSITORY_BRANCH, with_git_command=with_git_command)
run_dbt("deps")
except Exception as e:
# delete folder so we start clean next time
if storage.has_folder(CLONED_PACKAGE_NAME):
storage.delete_folder(CLONED_PACKAGE_NAME, recursively=True)
raise
def ensure_newest_package() -> None:
with git_custom_key_command(CONFIG.PACKAGE_REPOSITORY_SSH_KEY) as ssh_command:
try:
ensure_remote_head(repo_path, with_git_command=ssh_command)
except GitError as err:
# cleanup package folder
logger.info(f"Package will be cloned due to {type(err).__name__}:{str(err)}")
initialize_package(with_git_command=ssh_command)
def run_db_steps() -> Sequence[dbt_results.BaseResult]:
# make sure we use package from the remote head
ensure_newest_package()
# check if raw schema exists
try:
if CONFIG.PACKAGE_SOURCE_TESTS_SELECTOR:
run_dbt("test", ["-s", CONFIG.PACKAGE_SOURCE_TESTS_SELECTOR])
except DBTProcessingError as err:
raise PrerequisitesException() from err
# always run seeds
run_dbt("seed")
# throws DBTProcessingError
try:
return run_dbt("run", CONFIG.PACKAGE_RUN_PARAMS)
except DBTProcessingError as e:
# detect incremental model out of sync
if is_incremental_schema_out_of_sync_error(e.results) and CONFIG.AUTO_FULL_REFRESH_WHEN_OUT_OF_SYNC:
logger.warning(f"Attempting full refresh due to incremental model out of sync on {e.results.message}")
return run_dbt("run", CONFIG.PACKAGE_RUN_PARAMS + ["--full-refresh"])
else:
raise
def run(_: None) -> TRunMetrics:
try:
# there were many issues with running the method below with pool.apply
# 1 - some exceptions are not serialized well on process boundary and queue hangs
# 2 - random hangs even if there's no exception, probably issues with DBT spawning its own workers
# instead the runner host was configured to recycle each run
results = run_db_steps()
log_dbt_run_results(results)
return TRunMetrics(False, False, 0)
except PrerequisitesException:
logger.warning("Raw schema test failed, it may not be created yet")
# run failed and loads possibly still pending
return TRunMetrics(False, True, 1)
except DBTProcessingError as runerr:
log_dbt_run_results(runerr.results)
# pass exception to the runner
raise
if __name__ == '__main__':
CONFIG = gen_configuration_variant()
parser = create_default_args(CONFIG)
args = parser.parse_args()
# we should force single run
initialize_runner(CONFIG, TRunArgs(args.single_run, args.wait_runs))
try:
storage, dbt_package_vars, global_args, repo_path, profile_name = create_folders()
model_elapsed_gauge, model_exec_info = create_gauges(REGISTRY)
except Exception:
process_internal_exception("init module")
exit(-1)
exit(pool_runner(CONFIG, run))

130
dlt/dbt_runner/utils.py Normal file

@@ -0,0 +1,130 @@
import os
import logging
import tempfile
from typing import Any, Iterator, List, Sequence
from git import Repo, Git, RepositoryDirtyError
from contextlib import contextmanager
from dlt.common import json
from dlt.common.utils import uniq_id
from dlt.common.typing import StrAny, Optional
from dlt.dbt_runner.exceptions import DBTRunnerException
# block logbook from disabling the root logger
import logbook.compat
logbook.compat.redirect_logging = lambda: None
# can only import DBT after redirect is disabled
import dbt.main
import dbt.logger
from dbt.events import functions
from dbt.contracts import results as dbt_results
from dbt.exceptions import FailFastException
# keep this exception definition here due to mock of logbook
class DBTProcessingError(DBTRunnerException):
def __init__(self, command: str, results: Any) -> None:
self.command = command
# the results from DBT may be anything
self.results = results
super().__init__(f"DBT command {command} could not be executed")
@contextmanager
def git_custom_key_command(private_key: Optional[str]) -> Iterator[str]:
if private_key:
key_file = tempfile.mktemp(prefix=uniq_id())
with open(key_file, "w") as f:
f.write(private_key)
try:
# permissions so SSH does not complain
os.chmod(key_file, 0o600)
yield 'ssh -o "StrictHostKeyChecking accept-new" -i %s' % key_file
finally:
os.remove(key_file)
else:
yield 'ssh -o "StrictHostKeyChecking accept-new"'
def ensure_remote_head(repo_path: str, with_git_command: Optional[str] = None) -> None:
# update remotes and check if heads are same. ignores locally modified files
repo = Repo(repo_path)
# use custom environment if specified
with repo.git.custom_environment(GIT_SSH_COMMAND=with_git_command):
# update origin
repo.remote().update()
# get branch status
status: str = repo.git.status("--short", "--branch", "-uno")
# we expect the first status line to look like: ## main...origin/main
status_line = status.split("\n")[0]
if not (status_line.startswith("##") and not status_line.endswith("]")):
raise RepositoryDirtyError(repo, status)
def clone_repo(repository_url: str, clone_path: str, branch: Optional[str] = None, with_git_command: Optional[str] = None) -> None:
repo = Repo.clone_from(repository_url, clone_path, env=dict(GIT_SSH_COMMAND=with_git_command))
if branch:
repo.git.checkout(branch)
def initialize_dbt_logging(level: str, is_json_logging: bool) -> Sequence[str]:
int_level = logging._nameToLevel[level]
# wrap log setup to force our log level
def setup_event_logger_wrapper(log_path: str, level_override: str = None) -> None:
functions.setup_event_logger(log_path, level)
# force log level as file is debug only
functions.this.FILE_LOG.setLevel(level)
functions.this.FILE_LOG.handlers[0].setLevel(level)
dbt.main.setup_event_logger = setup_event_logger_wrapper
globs = []
if int_level <= logging.DEBUG:
globs = ["--debug"]
# return global parameters to be passed to setup logging
if is_json_logging:
return ["--log-format", "json"] + globs
else:
return globs
def is_incremental_schema_out_of_sync_error(error: dbt_results.RunResult) -> bool:
return issubclass(type(error), dbt_results.RunResult) and error.status == dbt_results.RunStatus.Error and\
"The source and target schemas on this incremental model are out of sync" in error.message
def run_dbt_command(package_path: str, command: str, profiles_dir: str, profile_name: Optional[str] = None,
global_args: Sequence[str] = None, command_args: Sequence[str] = None, vars: StrAny = None) -> Sequence[dbt_results.BaseResult]:
args = ["--profiles-dir", profiles_dir]
# add profile name if provided
if profile_name:
args += ["--profile", profile_name]
# serialize dbt variables to pass to package
if vars:
args += ["--vars", json.dumps(vars)]
if command_args:
args += command_args
# cwd to package dir
working_dir = os.getcwd()
os.chdir(package_path)
try:
results: List[dbt_results.BaseResult] = None
success: bool = None
results, success = dbt.main.handle_and_check((global_args or []) + [command] + args) # type: ignore
assert type(success) is bool
if not success:
raise DBTProcessingError(command, results)
return results
except FailFastException as ff:
raise DBTProcessingError(command, ff.result) from ff
finally:
# unblock logger manager to run next command
dbt.logger.log_manager.reset_handlers()
# go back to working dir
os.chdir(working_dir)
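A brief usage sketch of the helpers above, outside of the runner module. The repository URL, clone path and key below are placeholders; the real wiring lives in `initialize_package` and `ensure_newest_package` in the runner:

```python
# Illustrative sketch only - the values below are placeholders, not project settings.
from dlt.dbt_runner.utils import git_custom_key_command, clone_repo, ensure_remote_head

repo_url = "git@example.com:acme/dbt_package.git"  # hypothetical repository
repo_path = "/tmp/dbt_package"                     # hypothetical clone target
private_key = None                                 # or the PEM contents of a deploy key

with git_custom_key_command(private_key) as ssh_command:
    # first run: clone the package, optionally checking out a branch
    clone_repo(repo_url, repo_path, branch="master", with_git_command=ssh_command)
    # subsequent runs: verify the local head matches origin, raises RepositoryDirtyError otherwise
    ensure_remote_head(repo_path, with_git_command=ssh_command)
```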


@@ -0,0 +1,5 @@
from dlt.common.exceptions import DltException
class ExtractorException(DltException):
pass


@@ -0,0 +1,34 @@
import semver
from dlt.common.utils import uniq_id
from dlt.common.file_storage import FileStorage
from dlt.common.storages.versioned_storage import VersionedStorage
from dlt.common.storages.unpacker_storage import UnpackerStorage
class ExtractorStorageBase(VersionedStorage):
def __init__(self, version: semver.VersionInfo, is_owner: bool, storage: FileStorage, unpacker_storage: UnpackerStorage) -> None:
self.unpacker_storage = unpacker_storage
super().__init__(version, is_owner, storage)
def create_temp_folder(self) -> str:
tf_name = uniq_id()
self.storage.create_folder(tf_name)
return tf_name
def commit_events(self, schema_name: str, processed_file_path: str, dest_file_stem: str, no_processed_events: int, load_id: str, with_delete: bool = True) -> str:
# schema name cannot contain underscores
if "_" in schema_name:
raise ValueError(schema_name)
dest_name = UnpackerStorage.build_unpack_file_name(schema_name, dest_file_stem, no_processed_events, load_id)
# if no events extracted from tracker, file is not saved
if no_processed_events > 0:
# moves the file to possibly external storage and places it in the dest folder atomically
self.storage.copy_cross_storage_atomically(
self.unpacker_storage.storage.storage_path, UnpackerStorage.UNPACKING_FOLDER, processed_file_path, dest_name)
if with_delete:
self.storage.delete(processed_file_path)
return dest_name


1
dlt/loaders/__init__.py Normal file

@@ -0,0 +1 @@
from dlt._version import loader_version as __version__

150
dlt/loaders/client_base.py Normal file

@@ -0,0 +1,150 @@
from abc import ABC, abstractmethod
from types import TracebackType
from typing import Any, Literal, Sequence, Type, TypeVar, AnyStr
from pathlib import Path
from dlt.common import pendulum, logger
from dlt.common.schema import Column, Schema, Table
# from dlt.common.file_storage import FileStorage
from dlt.loaders.local_types import LoadJobStatus
from dlt.loaders.exceptions import LoadClientSchemaVersionCorrupted, LoadUnknownTableException
# typing for context manager
TClient = TypeVar("TClient", bound="ClientBase")
class LoadJob:
def __init__(self, file_name: str) -> None:
"""
File name is also a job id (or job id is deterministically derived) so it must be globally unique
"""
self._file_name = file_name
@abstractmethod
def status(self) -> LoadJobStatus:
pass
@abstractmethod
def file_name(self) -> str:
pass
@abstractmethod
def exception(self) -> str:
pass
class LoadEmptyJob(LoadJob):
def __init__(self, file_name: str, status: LoadJobStatus, exception: str = None) -> None:
self._status = status
self._exception = exception
super().__init__(file_name)
def status(self) -> LoadJobStatus:
return self._status
def file_name(self) -> str:
return self._file_name
def exception(self) -> str:
return self._exception
class ClientBase(ABC):
def __init__(self, schema: Schema) -> None:
self.schema = schema
@abstractmethod
def initialize_storage(self) -> None:
pass
@abstractmethod
def update_storage_schema(self) -> None:
pass
@abstractmethod
def start_file_load(self, table_name: str, file_path: str) -> LoadJob:
pass
@abstractmethod
def get_file_load(self, file_path: str) -> LoadJob:
pass
@abstractmethod
def complete_load(self, load_id: str) -> None:
pass
@abstractmethod
def _open_connection(self) -> None:
pass
@abstractmethod
def _close_connection(self) -> None:
pass
def __enter__(self: TClient) -> TClient:
self._open_connection()
return self
def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType) -> None:
self._close_connection()
def _get_table_by_name(self, table_name: str, file_name: str) -> Table:
try:
return self.schema.get_table(table_name)
except KeyError:
raise LoadUnknownTableException(table_name, file_name)
@staticmethod
def get_file_name_from_file_path(file_path: str) -> str:
return Path(file_path).name
@staticmethod
def make_job_with_status(file_path: str, status: LoadJobStatus, message: str = None) -> LoadJob:
return LoadEmptyJob(ClientBase.get_file_name_from_file_path(file_path), status, exception=message)
@staticmethod
def make_absolute_path(file_path: str) -> str:
return str(Path(file_path).absolute())
class SqlClientBase(ClientBase):
def __init__(self, schema: Schema) -> None:
super().__init__(schema)
def complete_load(self, load_id: str) -> None:
name = self._to_canonical_table_name(Schema.LOADS_TABLE_NAME)
now_ts = str(pendulum.now())
self._execute_sql(f"INSERT INTO {name}(load_id, status, inserted_at) VALUES('{load_id}', 0, '{now_ts}');")
@abstractmethod
def _execute_sql(self, query: AnyStr) -> Any:
pass
@abstractmethod
def _to_canonical_schema_name(self) -> str:
pass
def _create_table_update(self, table_name: str, storage_table: Table) -> Sequence[Column]:
# compare table with stored schema and produce delta
l = self.schema.get_schema_update_for(table_name, storage_table)
logger.info(f"Found {len(l)} updates for {table_name} in {self.schema.schema_name}")
return l
def _to_canonical_table_name(self, table_name: str) -> str:
return f"{self._to_canonical_schema_name()}.{table_name}"
def _get_schema_version_from_storage(self) -> int:
name = self._to_canonical_table_name(Schema.VERSION_TABLE_NAME)
rows = list(self._execute_sql(f"SELECT {Schema.VERSION_COLUMN_NAME} FROM {name} ORDER BY inserted_at DESC LIMIT 1;"))
if len(rows) > 1:
raise LoadClientSchemaVersionCorrupted(self._to_canonical_schema_name())
if len(rows) == 0:
return 0
return int(rows[0][0])
def _update_schema_version(self, new_version: int) -> None:
now_ts = str(pendulum.now())
name = self._to_canonical_table_name(Schema.VERSION_TABLE_NAME)
self._execute_sql(f"INSERT INTO {name}({Schema.VERSION_COLUMN_NAME}, engine_version, inserted_at) VALUES ({new_version}, {Schema.ENGINE_VERSION}, '{now_ts}');")
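The classes above define the client lifecycle used by the loader: clients are context managers that open a connection on entry and close it on exit. A minimal sketch of the intended call sequence, assuming any concrete `ClientBase` implementation (the concrete clients follow later in this commit); table name, file path and load id are hypothetical:

```python
# Sketch of the intended client lifecycle for any concrete ClientBase implementation.
import time
from dlt.loaders.client_base import ClientBase

def load_one_file(client: ClientBase, table_name: str, file_path: str, load_id: str) -> str:
    # the with block calls _open_connection() on entry and _close_connection() on exit
    with client:
        client.initialize_storage()     # create the dataset / schema if missing
        client.update_storage_schema()  # apply schema deltas and bump the stored version
        job = client.start_file_load(table_name, file_path)
        while job.status() == "running":
            time.sleep(1)               # poll until completed / failed / retry
        client.complete_load(load_id)
        return job.status()
```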


@@ -0,0 +1,51 @@
from typing import Any, Type
from dlt.common.utils import uniq_id
from dlt.common.typing import StrAny
from dlt.common.configuration import (PoolRunnerConfiguration,
LoadingVolumeConfiguration,
ProductionLoadingVolumeConfiguration,
PostgresConfiguration, PostgresProductionConfiguration,
GcpClientConfiguration, GcpClientProductionConfiguration,
TPoolType, make_configuration)
from dlt.loaders.dummy.configuration import DummyClientConfiguration
from . import __version__
class LoaderConfiguration(PoolRunnerConfiguration, LoadingVolumeConfiguration):
CLIENT_TYPE: str = "dummy" # which analytical storage to use
MAX_PARALLEL_LOADS: int = 20 # how many parallel loads can be executed
MAX_PARALLELISM: int = 20 # in 20 separate threads
POOL_TYPE: TPoolType = "thread" # mostly i/o (upload) so may be thread pool
class ProductionLoaderConfiguration(ProductionLoadingVolumeConfiguration, LoaderConfiguration):
pass
def configuration(initial_values: StrAny = None) -> Type[LoaderConfiguration]:
# synthesize right configuration
C = make_configuration(LoaderConfiguration, ProductionLoaderConfiguration, initial_values=initial_values)
T: Type[Any] = None
T_P: Type[Any] = None
if C.CLIENT_TYPE == "dummy":
T = DummyClientConfiguration
T_P = DummyClientConfiguration
elif C.CLIENT_TYPE == "gcp":
T = GcpClientConfiguration
T_P = GcpClientProductionConfiguration
elif C.CLIENT_TYPE == "redshift":
T = PostgresConfiguration
T_P = PostgresProductionConfiguration
else:
raise ValueError(C.CLIENT_TYPE)
ST = type(LoaderConfiguration.__name__ + "_" + T.__name__ + "_" + uniq_id(), (T, LoaderConfiguration), {})
ST_P = type(ProductionLoaderConfiguration.__name__ + "_" + T_P.__name__ + "_" + uniq_id(), (T_P, ProductionLoaderConfiguration), {})
return make_configuration(
ST,
ST_P,
initial_values=initial_values,
skip_subclass_check=True
)
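The factory above builds the final configuration type at runtime with `type()`, so client settings and runner settings end up on one class. A stripped-down sketch of the same pattern with hypothetical configuration classes (the real code additionally routes the synthesized types through `make_configuration`):

```python
# Minimal illustration of synthesizing a combined configuration class with type().
class ClientConfig:            # hypothetical stand-in for e.g. GcpClientConfiguration
    TIMEOUT: float = 30.0

class RunnerConfig:            # hypothetical stand-in for LoaderConfiguration
    MAX_PARALLEL_LOADS: int = 20

# the synthesized class inherits attributes from both bases via the MRO
Combined = type("RunnerConfig_ClientConfig", (ClientConfig, RunnerConfig), {})
assert Combined.TIMEOUT == 30.0 and Combined.MAX_PARALLEL_LOADS == 20
```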


134
dlt/loaders/dummy/client.py Normal file

@@ -0,0 +1,134 @@
from datetime import time
import random
from typing import Dict, Literal, Type
from dlt.common.dataset_writers import TWriterType
from dlt.common import pendulum
from dlt.common.schema import Schema
from dlt.common.typing import StrAny
from dlt.loaders.client_base import ClientBase, LoadJob
from dlt.loaders.local_types import LoadJobStatus
from dlt.loaders.exceptions import (LoadJobNotExistsException, LoadJobInvalidStateTransitionException,
LoadClientTerminalException, LoadClientTransientException)
from dlt.loaders.dummy.configuration import DummyClientConfiguration
class LoadDummyJob(LoadJob):
def __init__(self, file_name: str, fail_prob: float = 0.0, retry_prob: float = 0.0, completed_prob: float = 1.0, timeout: float = 10.0) -> None:
self.fail_prob = fail_prob
self.retry_prob = retry_prob
self.completed_prob = completed_prob
self.timeout = timeout
self._status: LoadJobStatus = "running"
self._exception: str = None
self.start_time: float = pendulum.now().timestamp()
super().__init__(file_name)
s = self.status()
if s == "failed":
raise LoadClientTerminalException(self._exception)
if s == "retry":
raise LoadClientTransientException(self._exception)
def status(self) -> LoadJobStatus:
# this should poll the server for a job status, here we simulate various outcomes
if self._status == "running":
n = pendulum.now().timestamp()
if n - self.start_time > self.timeout:
self._status = "failed"
self._exception = "failed due to timeout"
else:
c_r = random.random()
if self.completed_prob >= c_r:
self._status = "completed"
else:
c_r = random.random()
if self.retry_prob >= c_r:
self._status = "retry"
self._exception = "a random retry occured"
else:
c_r = random.random()
if self.fail_prob >= c_r:
self._status = "failed"
self._exception = "a random fail occured"
return self._status
def file_name(self) -> str:
return self._file_name
def exception(self) -> str:
# this will typically call server for error messages
return self._exception
def retry(self) -> None:
if self._status != "retry":
raise LoadJobInvalidStateTransitionException(self._status, "retry")
self._status = "retry"
JOBS: Dict[str, LoadDummyJob] = {}
class DummyClient(ClientBase):
"""
dummy client storing jobs in memory
"""
def __init__(self, schema: Schema, CONFIG: Type[DummyClientConfiguration]) -> None:
self.C = CONFIG
super().__init__(schema)
def initialize_storage(self) -> None:
pass
def update_storage_schema(self) -> None:
pass
def start_file_load(self, table_name: str, file_path: str) -> LoadJob:
self._get_table_by_name(table_name, file_path)
job_id = ClientBase.get_file_name_from_file_path(file_path)
file_name = ClientBase.get_file_name_from_file_path(file_path)
# return existing job if already there
if job_id not in JOBS:
JOBS[job_id] = self._create_job(file_name)
else:
job = JOBS[job_id]
if job.status() == "retry":
job.retry()
return JOBS[job_id]
def get_file_load(self, file_path: str) -> LoadJob:
job_id = ClientBase.get_file_name_from_file_path(file_path)
if job_id not in JOBS:
raise LoadJobNotExistsException(job_id)
return JOBS[job_id]
def complete_load(self, load_id: str) -> None:
pass
def _open_connection(self) -> None:
pass
def _close_connection(self) -> None:
pass
def _create_job(self, job_id: str) -> LoadDummyJob:
return LoadDummyJob(
job_id,
fail_prob=self.C.FAIL_PROB,
retry_prob=self.C.RETRY_PROB,
completed_prob=self.C.COMPLETED_PROB,
timeout=self.C.TIMEOUT
)
def make_client(schema: Schema, C: Type[DummyClientConfiguration]) -> ClientBase:
return DummyClient(schema, C)
def supported_writer(C: Type[DummyClientConfiguration]) -> TWriterType:
return C.WRITER_TYPE


@@ -0,0 +1,8 @@
from dlt.common.dataset_writers import TWriterType
class DummyClientConfiguration:
WRITER_TYPE: TWriterType = "jsonl"
FAIL_PROB: float = 0.0
RETRY_PROB: float = 0.0
COMPLETED_PROB: float = 0.0
TIMEOUT: float = 10.0

72
dlt/loaders/exceptions.py Normal file

@@ -0,0 +1,72 @@
from typing import Sequence
from dlt.common.exceptions import DltException, TerminalException, TransientException
from dlt.loaders.local_types import LoadJobStatus
class LoadException(DltException):
def __init__(self, msg: str) -> None:
super().__init__(msg)
class LoadClientTerminalException(LoadException, TerminalException):
def __init__(self, msg: str) -> None:
super().__init__(msg)
class LoadClientTransientException(LoadException, TransientException):
def __init__(self, msg: str) -> None:
super().__init__(msg)
class LoadClientTerminalInnerException(LoadClientTerminalException):
def __init__(self, msg: str, inner_exc: Exception) -> None:
self.inner_exc = inner_exc
super().__init__(msg)
class LoadClientTransientInnerException(LoadClientTransientException):
def __init__(self, msg: str, inner_exc: Exception) -> None:
self.inner_exc = inner_exc
super().__init__(msg)
class LoadJobNotExistsException(LoadClientTerminalException):
def __init__(self, job_id: str) -> None:
super().__init__(f"Job with id/file name {job_id} not found")
class LoadUnknownTableException(LoadClientTerminalException):
def __init__(self, table_name: str, file_name: str) -> None:
self.table_name = table_name
super().__init__(f"Client does not know table {table_name} for load file {file_name}")
class LoadJobInvalidStateTransitionException(LoadClientTerminalException):
def __init__(self, from_state: LoadJobStatus, to_state: LoadJobStatus) -> None:
self.from_state = from_state
self.to_state = to_state
super().__init__(f"Load job cannot transition form {from_state} to {to_state}")
class LoadJobServerTerminalException(LoadClientTerminalException):
def __init__(self, file_path: str) -> None:
super().__init__(f"Job with id/file name {file_path} encountered unrecoverable problem")
class LoadClientSchemaVersionCorrupted(LoadClientTerminalException):
def __init__(self, dataset_name: str) -> None:
self.dataset_name = dataset_name
super().__init__(f"Schema _version table contains too many rows in {dataset_name}")
class LoadClientSchemaWillNotUpdate(LoadClientTerminalException):
def __init__(self, table_name: str, columns: Sequence[str], msg: str) -> None:
self.table_name = table_name
self.columns = columns
super().__init__(f"Schema for table {table_name} column(s) {columns} will not update: {msg}")
class LoadFileTooBig(LoadClientTerminalException):
def __init__(self, file_name: str, max_size: int) -> None:
super().__init__(f"File {file_name} exceedes {max_size} and cannot be loaded. Split the file and try again.")


@@ -0,0 +1,6 @@
# Loader account setup
1. Create a new service account, add a private key to it and download the `services.json` file
2. Make sure this newly created account has access to the BigQuery API
3. Add the following roles to the account above: `BigQuery Data Editor` and `BigQuery Job User`
4. Roles are assigned in IAM: https://console.cloud.google.com/iam-admin/iam?project=chat-analytics-rasa-ci
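A hedged sanity check, once `services.json` is downloaded, that the key works and the account can reach BigQuery. This uses plain `google-cloud-bigquery` calls and is independent of the loader code:

```python
# Quick check that the downloaded service account key is valid and BigQuery is reachable.
from google.oauth2 import service_account
import google.cloud.bigquery as bigquery

credentials = service_account.Credentials.from_service_account_file("services.json")
client = bigquery.Client(project=credentials.project_id, credentials=credentials)
# listing datasets only needs basic access; role problems surface when loading data
print([dataset.dataset_id for dataset in client.list_datasets()])
```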


324
dlt/loaders/gcp/client.py Normal file

@@ -0,0 +1,324 @@
from pathlib import Path
from typing import Any, AnyStr, Dict, List, Literal, Optional, Tuple, Type
import google.cloud.bigquery as bigquery
from google.cloud import exceptions as gcp_exceptions
from google.oauth2 import service_account
from google.api_core import exceptions as api_core_exceptions
from dlt.common import json, logger
from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE
from dlt.common.configuration import GcpClientConfiguration
from dlt.common.dataset_writers import TWriterType, escape_bigquery_identifier
from dlt.loaders.local_types import LoadJobStatus
from dlt.common.schema import Column, DataType, Schema, Table
from dlt.loaders.client_base import SqlClientBase, LoadJob
from dlt.loaders.exceptions import LoadClientSchemaWillNotUpdate, LoadJobNotExistsException, LoadJobServerTerminalException, LoadUnknownTableException
SCT_TO_BQT: Dict[DataType, str] = {
"text": "STRING",
"double": "FLOAT64",
"bool": "BOOLEAN",
"timestamp": "TIMESTAMP",
"bigint": "INTEGER",
"binary": "BYTES",
"decimal": f"NUMERIC({DEFAULT_NUMERIC_PRECISION},{DEFAULT_NUMERIC_SCALE})",
"wei": "BIGNUMERIC" # non parametrized should hold wei values
}
BQT_TO_SCT: Dict[str, DataType] = {
"STRING": "text",
"FLOAT": "double",
"BOOLEAN": "bool",
"TIMESTAMP": "timestamp",
"INTEGER": "bigint",
"BYTES": "binary",
"NUMERIC": "decimal",
"BIGNUMERIC": "decimal"
}
class BigQueryLoadJob(LoadJob):
def __init__(self, file_name: str, bq_load_job: bigquery.LoadJob, CONFIG: Type[GcpClientConfiguration]) -> None:
self.bq_load_job = bq_load_job
self.C = CONFIG
self.default_retry = bigquery.DEFAULT_RETRY.with_deadline(CONFIG.TIMEOUT)
super().__init__(file_name)
def status(self) -> LoadJobStatus:
# check server if done
done = self.bq_load_job.done(retry=self.default_retry, timeout=self.C.TIMEOUT)
if done:
# rows processed
if self.bq_load_job.output_rows is not None and self.bq_load_job.error_result is None:
return "completed"
else:
return "failed"
else:
return "running"
def file_name(self) -> str:
return self._file_name
def exception(self) -> str:
exception: str = json.dumps({
"error_result": self.bq_load_job.error_result,
"errors": self.bq_load_job.errors,
"job_start": self.bq_load_job.started,
"job_end": self.bq_load_job.ended,
"job_id": self.bq_load_job.job_id
})
return exception
class BigQueryClient(SqlClientBase):
def __init__(self, schema: Schema, CONFIG: Type[GcpClientConfiguration]) -> None:
self._client: bigquery.Client = None
self.C = CONFIG
self.default_retry = bigquery.DEFAULT_RETRY.with_deadline(CONFIG.TIMEOUT)
super().__init__(schema)
def initialize_storage(self) -> None:
dataset_name = self._to_canonical_schema_name()
try:
self._client.get_dataset(dataset_name, retry=self.default_retry, timeout=self.C.TIMEOUT)
except gcp_exceptions.NotFound:
self._client.create_dataset(dataset_name, exists_ok=False, retry=self.default_retry, timeout=self.C.TIMEOUT)
def get_file_load(self, file_path: str) -> LoadJob:
try:
return BigQueryLoadJob(
SqlClientBase.get_file_name_from_file_path(file_path),
self._retrieve_load_job(file_path),
self.C
)
except api_core_exceptions.NotFound:
raise LoadJobNotExistsException(file_path)
except (api_core_exceptions.BadRequest, api_core_exceptions.NotFound):
raise LoadJobServerTerminalException(file_path)
def start_file_load(self, table_name: str, file_path: str) -> LoadJob:
# verify that table exists in the schema
self._get_table_by_name(table_name, file_path)
try:
return BigQueryLoadJob(
SqlClientBase.get_file_name_from_file_path(file_path),
self._create_load_job(table_name, file_path),
self.C
)
except api_core_exceptions.NotFound:
# google.api_core.exceptions.NotFound: 404 - table not found
raise LoadUnknownTableException(table_name, file_path)
except (api_core_exceptions.BadRequest, api_core_exceptions.NotFound):
# google.api_core.exceptions.BadRequest - will not be processed, i.e. bad job name
raise LoadJobServerTerminalException(file_path)
except api_core_exceptions.Conflict:
# google.api_core.exceptions.Conflict: 409 PUT - already exists
return self.get_file_load(file_path)
def update_storage_schema(self) -> None:
storage_version = self._get_schema_version_from_storage()
if storage_version < self.schema.schema_version:
for sql in self._build_schema_update_sql():
self._execute_sql(sql)
self._update_schema_version(self.schema.schema_version)
def _open_connection(self) -> None:
credentials = service_account.Credentials.from_service_account_info(self.C.to_service_credentials())
self._client = bigquery.Client(self.C.PROJECT_ID, credentials=credentials)
def _close_connection(self) -> None:
if self._client:
self._client.close()
self._client = None
def _get_schema_version_from_storage(self) -> int:
try:
return super()._get_schema_version_from_storage()
except api_core_exceptions.NotFound:
# there's no table so there's no schema
return 0
def _build_schema_update_sql(self) -> List[str]:
sql_updates = []
for table_name in self.schema.schema_tables:
exists, storage_table = self._get_storage_table(table_name)
sql = self._get_table_update_sql(table_name, storage_table, exists)
if sql:
sql_updates.append(sql)
return sql_updates
def _get_table_update_sql(self, table_name: str, storage_table: Table, exists: bool) -> str:
new_columns = self._create_table_update(table_name, storage_table)
if len(new_columns) == 0:
# no changes
return None
# build sql
canonical_name = self._to_canonical_table_name(table_name)
if not exists:
# build CREATE
sql = f"CREATE TABLE {canonical_name} (\n"
sql += ",\n".join([self._get_column_def_sql(c) for c in new_columns])
sql += ")"
else:
# build ALTER
sql = f"ALTER TABLE {canonical_name}\n"
sql += ",\n".join(["ADD COLUMN " + self._get_column_def_sql(c) for c in new_columns])
# scan columns to get hints
cluster_list = [escape_bigquery_identifier(c["name"]) for c in new_columns if c.get("cluster", False)]
partition_list = [escape_bigquery_identifier(c["name"]) for c in new_columns if c.get("partition", False)]
# partition by must be added first
if len(partition_list) > 0:
if exists:
raise LoadClientSchemaWillNotUpdate(canonical_name, partition_list, "Partition requested after table was created")
elif len(partition_list) > 1:
raise LoadClientSchemaWillNotUpdate(canonical_name, partition_list, "Partition requested for more than one column")
else:
sql += f"\nPARTITION BY DATE({partition_list[0]})"
if len(cluster_list) > 0:
if exists:
raise LoadClientSchemaWillNotUpdate(canonical_name, cluster_list, "Clustering requested after table was created")
else:
sql += "\nCLUSTER BY " + ",".join(cluster_list)
return sql
def _get_column_def_sql(self, c: Column) -> str:
name = escape_bigquery_identifier(c["name"])
return f"{name} {self._sc_t_to_bq_t(c['data_type'])} {self._gen_not_null(c['nullable'])}"
def _get_storage_table(self, table_name: str) -> Tuple[bool, Table]:
schema_table: Table = {}
try:
table = self._client.get_table(self._to_canonical_table_name(table_name), retry=self.default_retry, timeout=self.C.TIMEOUT)
partition_field = table.time_partitioning.field if table.time_partitioning else None
for c in table.schema:
schema_c: Column = {
"name": c.name,
"nullable": c.is_nullable,
"data_type": self._bq_t_to_sc_t(c.field_type, c.precision, c.scale),
"unique": False,
"sort": False,
"primary_key": False,
"foreign_key": False,
"cluster": c.name in (table.clustering_fields or []),
"partition": c.name == partition_field
}
schema_table[c.name] = schema_c
return True, schema_table
except gcp_exceptions.NotFound:
return False, schema_table
def _execute_sql(self, query: AnyStr) -> Any:
logger.debug(f"Will execute query {query}") # type: ignore
return self._client.query(query, job_retry=self.default_retry, timeout=self.C.TIMEOUT).result()
def _to_canonical_schema_name(self) -> str:
return f"{self.C.PROJECT_ID}.{self.C.DATASET}_{self.schema.schema_name}"
def _create_load_job(self, table_name: str, file_path: str) -> bigquery.LoadJob:
job_id = BigQueryClient._get_job_id_from_file_path(file_path)
job_config = bigquery.LoadJobConfig(
autodetect=False,
write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
create_disposition=bigquery.CreateDisposition.CREATE_NEVER,
source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
ignore_unknown_values=False,
max_bad_records=0,
)
with open(file_path, "rb") as f:
return self._client.load_table_from_file(f,
self._to_canonical_table_name(table_name),
job_id=job_id,
job_config=job_config,
timeout=self.C.TIMEOUT
)
def _retrieve_load_job(self, file_path: str) -> bigquery.LoadJob:
job_id = BigQueryClient._get_job_id_from_file_path(file_path)
return self._client.get_job(job_id)
@staticmethod
def _get_job_id_from_file_path(file_path: str) -> str:
return Path(file_path).name.replace(".", "_")
@staticmethod
def _gen_not_null(v: bool) -> str:
return "NOT NULL" if not v else ""
@staticmethod
def _sc_t_to_bq_t(sc_t: DataType) -> str:
return SCT_TO_BQT[sc_t]
@staticmethod
def _bq_t_to_sc_t(bq_t: str, precision: Optional[int], scale: Optional[int]) -> DataType:
if bq_t == "BIGNUMERIC":
if precision is None: # biggest numeric possible
return "wei"
return BQT_TO_SCT.get(bq_t, "text")
def make_client(schema: Schema, C: Type[GcpClientConfiguration]) -> BigQueryClient:
return BigQueryClient(schema, C)
def supported_writer(C: Type[GcpClientConfiguration]) -> TWriterType:
return "jsonl"
# cred = service_account.Credentials.from_service_account_info(_credentials)
# project_id = cred.get('project_id')
# client = bigquery.Client(project_id, credentials=cred)
# print(client.get_dataset("carbon_bot_extract_7"))
# exit(0)
# from dlt.common.configuration import SchemaStoreConfiguration
# from dlt.common.logger import init_logging_from_config
# init_logging_from_config(CLIENT_CONFIG)
# schema = Schema(SchemaStoreConfiguration.TRACKER_SCHEMA_FILE_PATH)
# schema.load_schema()
# import pprint
# # pprint.pprint(schema.as_yaml())
# with make_client(schema) as client:
# client.initialize_storage()
# # job = client._create_load_job("tracker", "_storage/loaded/1630949263.574516/completed_jobs/tracker.1c31ff1b-c250-4690-8973-14f0ee9ae355.jsonl")
# # unk table
# # job = client._create_load_job("trackerZ", "_storage/loaded/1630949263.574516/completed_jobs/tracker.4876f905-aefe-4262-a440-d29ed2643c3a.jsonl")
# # job = client._create_load_job("tracker", "_storage/loaded/1630949263.574516/completed_jobs/event_bot.c9105079-2d1d-4ad3-8613-a5dff790889d.jsonl")
# # failed
# # job = client._retrieve_load_job("_storage/loaded/1630949263.574516/completed_jobs/event_bot.c9105079-2d1d-4ad3-8613-a5dff790889d.jsonl")
# # OK
# job = client._retrieve_load_job("_storage/loaded/1630949263.574516/completed_jobs/tracker.1c31ff1b-c250-4690-8973-14f0ee9ae355.jsonl")
# while True:
# try:
# # this does not throw
# done = job.done()
# print(f"DONE: {job.done(reload=False)}")
# except Exception as e:
# logger.exception("DONE")
# done = True
# if done:
# break;
# # done is not self running
# # print(job.running())
# sleep(1)
# try:
# print(f"status: {job.state}")
# print(f"error: {job.error_result}")
# print(f"errors: {job.errors}")
# print(f"line count: {job.output_rows}")
# print(job.exception())
# except:
# logger.exception("EXCEPTION")
# try:
# print(job.result())
# except:
# logger.exception("RESULT")
# non existing table
# wrong data - unknown column

238
dlt/loaders/loader.py Normal file

@@ -0,0 +1,238 @@
from types import ModuleType
from typing import Any, Iterator, List, Dict, Literal, Optional, Tuple, Type
from multiprocessing.pool import ThreadPool
from importlib import import_module
from prometheus_client import REGISTRY, Counter, Gauge, CollectorRegistry, Summary
from prometheus_client.metrics import MetricWrapperBase
from dlt.common import sleep, logger
from dlt.common.runners import TRunArgs, TRunMetrics, create_default_args, initialize_runner, pool_runner
from dlt.common.logger import process_internal_exception, pretty_format_exception
from dlt.common.exceptions import TerminalValueError
from dlt.common.dataset_writers import TWriterType
from dlt.common.schema import Schema
from dlt.common.storages import SchemaStorage
from dlt.common.storages.loader_storage import LoaderStorage
from dlt.common.telemetry import get_logging_extras, set_gauge_all_labels
from dlt.loaders.exceptions import LoadClientTerminalException, LoadClientTransientException, LoadJobNotExistsException
from dlt.loaders.client_base import ClientBase, LoadJob
from dlt.loaders.local_types import LoadJobStatus
from dlt.loaders.configuration import configuration, LoaderConfiguration
CONFIG: Type[LoaderConfiguration] = None
load_storage: LoaderStorage = None
client_module: ModuleType = None
load_counter: Counter = None
job_gauge: Gauge = None
job_counter: Counter = None
job_wait_summary: Summary = None
def client_impl(client_type: str) -> ModuleType:
return import_module(f".{client_type}.client", "dlt.loaders")
def create_client(schema: Schema) -> ClientBase:
return client_module.make_client(schema, CONFIG) # type: ignore
def supported_writer() -> TWriterType:
return client_module.supported_writer(CONFIG) # type: ignore
def create_folders() -> LoaderStorage:
load_storage = LoaderStorage(False, CONFIG, supported_writer())
load_storage.initialize_storage()
return load_storage
def create_gauges(registry: CollectorRegistry) -> Tuple[MetricWrapperBase, MetricWrapperBase, MetricWrapperBase, MetricWrapperBase]:
return (
Counter("loader_load_package_counter", "Counts load package processed", registry=registry),
Gauge("loader_last_package_jobs_counter", "Counts jobs in last package per status", ["status"], registry=registry),
Counter("loader_jobs_counter", "Counts jobs per job status", ["status"], registry=registry),
Summary("loader_jobs_wait_seconds", "Counts jobs total wait until completion", registry=registry)
)
def spool_job(file_path: str, load_id: str, schema: Schema) -> Optional[LoadJob]:
# open new connection for each upload
job: LoadJob = None
try:
with create_client(schema) as client:
table_name, _ = load_storage.parse_load_file_name(file_path)
logger.info(f"Will load file {file_path} with table name {table_name}")
job = client.start_file_load(table_name, load_storage.storage._make_path(file_path))
except (LoadClientTerminalException, TerminalValueError):
# if the job cannot be started due to a terminal problem, mark it as failed
process_internal_exception(f"Terminal problem with spooling job {file_path}")
job = ClientBase.make_job_with_status(file_path, "failed", pretty_format_exception())
except (LoadClientTransientException, Exception):
# return no job so file stays in new jobs (root) folder
process_internal_exception(f"Temporary problem with spooling job {file_path}")
return None
load_storage.start_job(load_id, job.file_name())
return job
def spool_new_jobs(pool: ThreadPool, load_id: str, schema: Schema) -> Tuple[int, List[LoadJob]]:
# TODO: validate file type, combine files, finalize etc., this is client specific, jsonl for single table
# can just be combined, insert_values must be finalized and then combined
# use thread based pool as jobs processing is mostly I/O and we do not want to pickle jobs
# TODO: combine files by providing a list of files pertaining to same table into job, so job must be
# extended to accept a list
load_files = load_storage.list_new_jobs(load_id)[:CONFIG.MAX_PARALLEL_LOADS]
file_count = len(load_files)
if file_count == 0:
logger.info(f"No new jobs found in {load_id}")
return 0, []
logger.info(f"Will load {file_count}, creating jobs")
param_chunk = [(file, load_id, schema) for file in load_files]
# exceptions should not be raised, None as job is a temporary failure
# other jobs should not be affected
jobs: List[LoadJob] = pool.starmap(spool_job, param_chunk)
# remove None jobs and check the rest
return file_count, [job for job in jobs if job is not None]
def retrieve_jobs(client: ClientBase, load_id: str) -> Tuple[int, List[LoadJob]]:
jobs: List[LoadJob] = []
# list all files that were started but not yet completed
started_jobs = load_storage.list_started_jobs(load_id)
logger.info(f"Found {len(started_jobs)} that are already started and should be continued")
if len(started_jobs) == 0:
return 0, jobs
for file_path in started_jobs:
try:
logger.info(f"Will retrieve {file_path}")
job = client.get_file_load(file_path)
except LoadClientTerminalException:
process_internal_exception(f"Job retrieval for {file_path} failed, job will be terminated")
job = ClientBase.make_job_with_status(file_path, "failed", pretty_format_exception())
# proceed to appending job, do not reraise
except (LoadClientTransientException, Exception) as e:
# raise on all temporary exceptions, typically network / server problems
raise
jobs.append(job)
job_gauge.labels("retrieved").inc()
job_counter.labels("retrieved").inc()
logger.metrics("Retrieve jobs metrics",
extra=get_logging_extras([job_gauge.labels("retrieved"), job_counter.labels("retrieved")])
)
return len(jobs), jobs
def complete_jobs(load_id: str, jobs: List[LoadJob]) -> List[LoadJob]:
remaining_jobs: List[LoadJob] = []
logger.info(f"Will complete {len(jobs)} for {load_id}")
for ii in range(len(jobs)):
job = jobs[ii]
logger.debug(f"Checking status for job {job.file_name()}")
status: LoadJobStatus = job.status()
final_location: str = None
if status == "running":
# ask again
logger.debug(f"job {job.file_name()} still running")
remaining_jobs.append(job)
elif status == "failed":
# try to get exception message from job
failed_message = job.exception()
final_location = load_storage.fail_job(load_id, job.file_name(), failed_message)
logger.error(f"Job for {job.file_name()} failed terminally in load {load_id} with message {failed_message}")
elif status == "retry":
# try to get exception message from job
retry_message = job.exception()
# move back to new folder to try again
final_location = load_storage.retry_job(load_id, job.file_name())
logger.error(f"Job for {job.file_name()} retried in load {load_id} with message {retry_message}")
elif status == "completed":
# move to completed folder
final_location = load_storage.complete_job(load_id, job.file_name())
logger.info(f"Job for {job.file_name()} completed in load {load_id}")
if status != "running":
job_gauge.labels(status).inc()
job_counter.labels(status).inc()
job_wait_summary.observe(load_storage.job_elapsed_time_seconds(final_location))
logger.metrics("Completing jobs metrics", extra=get_logging_extras([job_counter, job_gauge, job_wait_summary]))
return remaining_jobs
def run(pool: ThreadPool) -> TRunMetrics:
logger.info(f"Running file loading")
# get list of loads and order by name ASC to execute schema updates
loads = load_storage.list_loads()
logger.info(f"Found {len(loads)} load packages")
if len(loads) == 0:
return TRunMetrics(True, False, 0)
load_id = loads[0]
logger.info(f"Loading schema from load package in {load_id}")
# one load package contains tables from one schema
schema_storage = SchemaStorage(load_storage.storage.storage_path)
# get relative path to load schema from load package
schema = schema_storage.load_folder_schema(load_storage.get_load_path(load_id))
logger.info(f"Loaded schema name {schema.schema_name} and version {schema.schema_version}")
# initialize analytical storage, i.e. create the dataset required by the passed schema
with create_client(schema) as client:
logger.info(f"Client {CONFIG.CLIENT_TYPE} will start load")
client.initialize_storage()
schema_update = load_storage.begin_schema_update(load_id)
if schema_update:
logger.info(f"Client {CONFIG.CLIENT_TYPE} will update schema to package schema")
client.update_storage_schema()
load_storage.commit_schema_update(load_id)
# spool or retrieve unfinished jobs
jobs_count, jobs = retrieve_jobs(client, load_id)
if not jobs:
# jobs count is a total number of jobs including those that could not be initialized
jobs_count, jobs = spool_new_jobs(pool, load_id, schema)
if jobs_count > 0:
# this is a new load package
set_gauge_all_labels(job_gauge, 0)
job_gauge.labels("running").inc(len(jobs))
job_counter.labels("running").inc(len(jobs))
logger.metrics("New jobs metrics",
extra=get_logging_extras([job_counter.labels("running"), job_gauge.labels("running")])
)
# if there are no existing or new jobs we archive the package
if jobs_count == 0:
with create_client(schema) as client:
remaining_jobs = client.complete_load(load_id)
load_storage.archive_load(load_id)
logger.info(f"All jobs completed, archiving package {load_id}")
load_counter.inc()
logger.metrics("Load package metrics", extra=get_logging_extras([load_counter]))
else:
while True:
remaining_jobs = complete_jobs(load_id, jobs)
if len(remaining_jobs) == 0:
break
# process remaining jobs again
jobs = remaining_jobs
# this will raise on signal
sleep(1)
return TRunMetrics(False, False, len(load_storage.list_loads()))
if __name__ == '__main__':
CONFIG = configuration()
parser = create_default_args(CONFIG)
args = parser.parse_args()
initialize_runner(CONFIG, TRunArgs(args.single_run, args.wait_runs))
try:
client_module = client_impl(CONFIG.CLIENT_TYPE)
load_counter, job_gauge, job_counter, job_wait_summary = create_gauges(REGISTRY)
load_storage = create_folders()
except Exception:
process_internal_exception("run")
exit(-1)
exit(pool_runner(CONFIG, run))


@@ -0,0 +1,4 @@
from typing import Literal
LoadJobStatus = Literal["running", "failed", "retry", "completed"]


@@ -0,0 +1,23 @@
# Public Access setup
Each Redshift cluster has *Modify publicly accessible settings* under Actions. Add your IP address there.
# Runtime optimization
https://www.intermix.io/blog/top-14-performance-tuning-techniques-for-amazon-redshift/
1. we should use a separate work queue for the loader user
2. they suggest not using dist keys
3. data must be inserted in sortkey order
# loader account setup
1. Create a new database: `CREATE DATABASE chat_analytics_rasa_ci`
2. Create a new user and set a password
3. Make it the database owner (we could grant a lower permission instead): `ALTER DATABASE chat_analytics_rasa_ci OWNER TO loader`
# Public access setup for Serverless
Follow https://docs.aws.amazon.com/redshift/latest/mgmt/serverless-connecting.html `Connecting from the public subnet to the Amazon Redshift Serverless endpoint using Network Load Balancer`
That uses a Terraform template to create a load balancer endpoint and assign a public IP. The load balancer costs ~$16/month plus the cost of the IP.
It seems that port 5439 is closed in the VPC that Redshift Serverless creates for itself. In the cluster panel, under Data Access : VPC security group, add an inbound rule allowing port 5439 from any subnet (0.0.0.0/0).
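Once the endpoint is reachable, a minimal connectivity check with `psycopg2` (the same driver the loader uses). Host and credentials below are placeholders:

```python
# Confirm that the cluster / Serverless endpoint accepts connections on port 5439.
import psycopg2

conn = psycopg2.connect(
    dbname="chat_analytics_rasa_ci",      # database created in the steps above
    user="loader",
    password="<password>",                # placeholder
    host="<public-endpoint-or-nlb-dns>",  # placeholder
    port=5439,
    connect_timeout=15,
)
with conn.cursor() as cur:
    cur.execute("SELECT 1;")
    print(cur.fetchone())
conn.close()
```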


@@ -0,0 +1,282 @@
import os
import psycopg2
from psycopg2.sql import SQL, Identifier, Composed, Literal as SQLLiteral
from typing import Any, AnyStr, Dict, List, Literal, Optional, Tuple, Type
from dlt.common.typing import StrAny
from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE
from dlt.common.configuration import PostgresConfiguration
from dlt.common.dataset_writers import TWriterType, escape_redshift_identifier
from dlt.common.schema import COLUMN_HINTS, Column, ColumnBase, DataType, HintType, Schema, SchemaUpdate, Table
from dlt.loaders.exceptions import (LoadClientSchemaWillNotUpdate, LoadClientTerminalInnerException,
LoadClientTransientInnerException, LoadFileTooBig)
from dlt.loaders.local_types import LoadJobStatus
from dlt.loaders.client_base import ClientBase, SqlClientBase, LoadJob
SCT_TO_PGT: Dict[DataType, str] = {
"text": "varchar(max)",
"double": "double precision",
"bool": "boolean",
"timestamp": "timestamp with time zone",
"bigint": "bigint",
"binary": "varbinary",
"decimal": f"numeric({DEFAULT_NUMERIC_PRECISION},{DEFAULT_NUMERIC_SCALE})"
}
PGT_TO_SCT: Dict[str, DataType] = {
"varchar(max)": "text",
"double precision": "double",
"boolean": "bool",
"timestamp with time zone": "timestamp",
"bigint": "bigint",
"binary varying": "binary",
"numeric": "decimal"
}
HINT_TO_REDSHIFT_ATTR: Dict[HintType, str] = {
"cluster": "DISTKEY",
# it is better to not enforce constraints in redshift
# "primary_key": "PRIMARY KEY",
"sort": "SORTKEY"
}
class SqlClientMixin:
MAX_STATEMENT_SIZE = 16 * 1024 * 1024
def __init__(self, CONFIG: Type[PostgresConfiguration], *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self._conn: psycopg2.connection = None
self.C = CONFIG
def _open_connection(self) -> None:
self._conn = psycopg2.connect(dbname=self.C.PG_DATABASE_NAME,
user=self.C.PG_USER,
host=self.C.PG_HOST,
port=self.C.PG_PORT,
password=self.C.PG_PASSWORD,
connect_timeout=self.C.PG_CONNECTION_TIMEOUT
)
# we'll provide explicit transactions
self._conn.set_session(autocommit=True)
def _close_connection(self) -> None:
if self._conn:
self._conn.close()
self._conn = None
def _execute_sql(self, query: AnyStr) -> Any:
curr: psycopg2.cursor
with self._conn.cursor() as curr:
try:
curr.execute(query)
except psycopg2.Error as outer:
try:
self._conn.rollback()
self._conn.reset()
except psycopg2.Error:
self._close_connection()
self._open_connection()
raise outer
if curr.description is None:
return None
else:
f = curr.fetchall()
return f
class RedshiftInsertLoadJob(SqlClientMixin, LoadJob):
def __init__(self, canonical_table_name: str, file_path: str, conn: Any, CONFIG: Type[PostgresConfiguration]) -> None:
super().__init__(CONFIG, ClientBase.get_file_name_from_file_path(file_path))
self._conn = conn
# insert file content immediately
self._insert(canonical_table_name, file_path)
def status(self) -> LoadJobStatus:
# this job is always done
return "completed"
def file_name(self) -> str:
return self._file_name
def exception(self) -> str:
# this part of the code should never be reached
raise NotImplementedError()
def _insert(self, canonical_table_name: str, file_path: str) -> None:
# TODO: implement tracking of jobs in storage, both completed and failed
# WARNING: maximum redshift statement is 16MB https://docs.aws.amazon.com/redshift/latest/dg/c_redshift-sql.html
# in case of postgres: 2GiB
if os.stat(file_path).st_size >= SqlClientMixin.MAX_STATEMENT_SIZE:
# terminal exception
raise LoadFileTooBig(file_path, SqlClientMixin.MAX_STATEMENT_SIZE)
with open(file_path, "r") as f:
header = f.readline()
content = f.read()
sql = Composed(
[SQL("BEGIN TRANSACTION;"),
SQL(header).format(SQL(canonical_table_name)),
SQL(content),
SQL("COMMIT TRANSACTION;")]
)
self._execute_sql(sql)
class RedshiftClient(SqlClientMixin, SqlClientBase):
def __init__(self, schema: Schema, CONFIG: Type[PostgresConfiguration]) -> None:
super().__init__(CONFIG, schema)
def initialize_storage(self) -> None:
schema_name = self._to_canonical_schema_name()
query = """
SELECT 1
FROM INFORMATION_SCHEMA.SCHEMATA
WHERE schema_name = {};
"""
rows = self._execute_sql(SQL(query).format(SQLLiteral(schema_name)))
if len(rows) == 0:
self._execute_sql(SQL("CREATE SCHEMA {};").format(Identifier(schema_name)))
def get_file_load(self, file_path: str) -> LoadJob:
# always returns completed jobs as RedshiftInsertLoadJob is executed
# atomically in start_file_load so any jobs that should be recreated are already completed
# in case of bugs in loader (asking for jobs that were never created) we are not able to detect that
return ClientBase.make_job_with_status(file_path, "completed")
def start_file_load(self, table_name: str, file_path: str) -> LoadJob:
# verify that table exists in the schema
self._get_table_by_name(table_name, file_path)
try:
return RedshiftInsertLoadJob(self._to_canonical_table_name(table_name), file_path, self._conn, self.C)
except (psycopg2.OperationalError, psycopg2.InternalError) as tr_ex:
if tr_ex.pgerror is not None:
if "Cannot insert a NULL value into column" in tr_ex.pgerror:
# a NULL violation is an internal error, probably a redshift thing
raise LoadClientTerminalInnerException("Terminal error, file will not load", tr_ex)
if "Numeric data overflow" in tr_ex.pgerror:
raise LoadClientTerminalInnerException("Terminal error, file will not load", tr_ex)
if "Precision exceeds maximum":
raise LoadClientTerminalInnerException("Terminal error, file will not load", tr_ex)
raise LoadClientTransientInnerException("Error may go away, will retry", tr_ex)
except (psycopg2.DataError, psycopg2.ProgrammingError, psycopg2.IntegrityError) as ter_ex:
raise LoadClientTerminalInnerException("Terminal error, file will not load", ter_ex)
def update_storage_schema(self) -> None:
storage_version = self._get_schema_version_from_storage()
if storage_version < self.schema.schema_version:
for sql in self._build_schema_update_sql():
self._execute_sql(sql)
self._update_schema_version(self.schema.schema_version)
def _get_schema_version_from_storage(self) -> int:
try:
return super()._get_schema_version_from_storage()
except psycopg2.ProgrammingError:
# there's no table so there's no schema
return 0
def _build_schema_update_sql(self) -> List[str]:
sql_updates = []
for table_name in self.schema.schema_tables:
exists, storage_table = self._get_storage_table(table_name)
sql = self._get_table_update_sql(table_name, storage_table, exists)
if sql:
sql_updates.append(sql)
return sql_updates
def _get_table_update_sql(self, table_name: str, storage_table: Table, exists: bool) -> str:
new_columns = self._create_table_update(table_name, storage_table)
if len(new_columns) == 0:
# no changes
return None
# build sql
canonical_name = self._to_canonical_table_name(table_name)
sql = "BEGIN TRANSACTION;\n"
if not exists:
# build CREATE
sql += f"CREATE TABLE {canonical_name} (\n"
sql += ",\n".join([self._get_column_def_sql(c) for c in new_columns])
sql += ");"
else:
# build ALTER as separate statement for each column (redshift limitation)
sql += "\n".join([f"ALTER TABLE {canonical_name}\nADD COLUMN {self._get_column_def_sql(c)};" for c in new_columns])
# scan columns to get hints
if exists:
# no hints may be specified on added columns
for hint in COLUMN_HINTS:
if any(c.get(hint, False) is True for c in new_columns):
hint_columns = [c["name"] for c in new_columns if c.get(hint, False)]
raise LoadClientSchemaWillNotUpdate(canonical_name, hint_columns, f"{hint} requested after table was created")
# TODO: add FK relations
sql += "\nCOMMIT TRANSACTION;"
return sql
def _get_column_def_sql(self, c: Column) -> str:
hints_str = " ".join(HINT_TO_REDSHIFT_ATTR.get(h, "") for h in HINT_TO_REDSHIFT_ATTR.keys() if c.get(h, False) is True)
column_name = escape_redshift_identifier(c["name"])
return f"{column_name} {self._sc_t_to_pq_t(c['data_type'])} {hints_str} {self._gen_not_null(c['nullable'])}"
def _get_storage_table(self, table_name: str) -> Tuple[bool, Table]:
schema_table: Table = {}
query = f"""
SELECT column_name, data_type, is_nullable, numeric_precision, numeric_scale
FROM INFORMATION_SCHEMA.COLUMNS
WHERE table_schema = '{self._to_canonical_schema_name()}' AND table_name = '{table_name}'
ORDER BY ordinal_position;
"""
rows = self._execute_sql(query)
# if no rows we assume that table does not exist
if len(rows) == 0:
# TODO: additionally check if table exists
return False, schema_table
# TODO: pull more data to infer DISTKEY, PK and SORTKEY attributes/constraints
for c in rows:
schema_c: ColumnBase = {
"name": c[0],
"nullable": self._null_to_bool(c[2]),
"data_type": self._pq_t_to_sc_t(c[1], c[3], c[4]),
}
schema_table[c[0]] = Schema._add_missing_hints(schema_c)
return True, schema_table
def _to_canonical_schema_name(self) -> str:
return f"{self.C.PG_SCHEMA_PREFIX}_{self.schema.schema_name}"
def _to_canonical_table_name(self, table_name: str) -> str:
return f"{self._to_canonical_schema_name()}.{table_name}"
@staticmethod
def _null_to_bool(v: str) -> bool:
if v == "NO":
return False
elif v == "YES":
return True
raise ValueError(v)
@staticmethod
def _gen_not_null(v: bool) -> str:
return "NOT NULL" if not v else ""
@staticmethod
def _sc_t_to_pq_t(sc_t: DataType) -> str:
if sc_t == "wei":
return f"numeric({DEFAULT_NUMERIC_PRECISION},0)"
return SCT_TO_PGT[sc_t]
@staticmethod
def _pq_t_to_sc_t(pq_t: str, precision: Optional[int], scale: Optional[int]) -> DataType:
if pq_t == "numeric":
if precision == DEFAULT_NUMERIC_PRECISION and scale == 0:
return "wei"
return PGT_TO_SCT.get(pq_t, "text")
def make_client(schema: Schema, C: Type[PostgresConfiguration]) -> RedshiftClient:
return RedshiftClient(schema, C)
def supported_writer(C: Type[PostgresConfiguration]) -> TWriterType:
return "insert_values"


@@ -5,21 +5,21 @@ import os.path
from typing import Callable, Dict, Iterator, List, Literal, Sequence, Tuple
from prometheus_client import REGISTRY
-from autopoiesis.common import json, runners
-from autopoiesis.common.configuration import BasicConfiguration, make_configuration
-from autopoiesis.common.configuration.utils import TConfigSecret
-from autopoiesis.common.file_storage import FileStorage
-from autopoiesis.common.logger import process_internal_exception
-from autopoiesis.common.runners import TRunArgs, TRunMetrics
-from autopoiesis.common.schema import Schema, StoredSchema
-from autopoiesis.common.typing import DictStrAny, StrAny
-from autopoiesis.common.utils import uniq_id, is_interactive
+from dlt.common import json, runners
+from dlt.common.configuration import BasicConfiguration, make_configuration
+from dlt.common.configuration.utils import TConfigSecret
+from dlt.common.file_storage import FileStorage
+from dlt.common.logger import process_internal_exception
+from dlt.common.runners import TRunArgs, TRunMetrics
+from dlt.common.schema import Schema, StoredSchema
+from dlt.common.typing import DictStrAny, StrAny
+from dlt.common.utils import uniq_id, is_interactive
-from autopoiesis.extractors.extractor_storage import ExtractorStorageBase
-from autopoiesis.unpacker.configuration import configuration as unpacker_configuration
-from autopoiesis.loaders.configuration import configuration as loader_configuration
-from autopoiesis.unpacker import unpacker
-from autopoiesis.loaders import loader
+from dlt.extractors.extractor_storage import ExtractorStorageBase
+from dlt.unpacker.configuration import configuration as unpacker_configuration
+from dlt.loaders.configuration import configuration as loader_configuration
+from dlt.unpacker import unpacker
+from dlt.loaders import loader
TClientType = Literal["gcp", "redshift"]

0
dlt/py.typed Normal file

1
dlt/unpacker/__init__.py Normal file

@@ -0,0 +1 @@
from dlt._version import unpacker_version as __version__


@@ -0,0 +1,29 @@
from typing import Type
from dlt.common.typing import StrAny
from dlt.common.configuration.pool_runner_configuration import TPoolType
from dlt.common.dataset_writers import TWriterType
from dlt.common.configuration import (PoolRunnerConfiguration, UnpackingVolumeConfiguration,
LoadingVolumeConfiguration, SchemaVolumeConfiguration,
ProductionLoadingVolumeConfiguration, ProductionUnpackingVolumeConfiguration,
ProductionSchemaVolumeConfiguration,
TPoolType, make_configuration)
from . import __version__
class UnpackerConfiguration(PoolRunnerConfiguration, UnpackingVolumeConfiguration, LoadingVolumeConfiguration, SchemaVolumeConfiguration):
MAX_EVENTS_IN_CHUNK: int = 40000 # maximum events to be processed in single chunk
WRITER_TYPE: TWriterType = "jsonl"  # jsonl or insert_values files will be generated
ADD_EVENT_JSON: bool = True # add event json to "event" table, useful for debugging or recreating tracker
POOL_TYPE: TPoolType = "process"
class ProductionUnpackerConfiguration(ProductionUnpackingVolumeConfiguration, ProductionLoadingVolumeConfiguration,
ProductionSchemaVolumeConfiguration, UnpackerConfiguration):
pass
def configuration(initial_values: StrAny = None) -> Type[UnpackerConfiguration]:
return make_configuration(UnpackerConfiguration, ProductionUnpackerConfiguration, initial_values=initial_values)
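A small usage sketch for this configuration module, assuming make_configuration lays initial_values over the class defaults (and any environment overrides) when building the returned type:

# hypothetical override of the defaults declared above; key names follow the
# class attributes, the exact precedence (env vs. initial_values) is assumed
from dlt.unpacker.configuration import configuration

C = configuration(initial_values={"MAX_EVENTS_IN_CHUNK": 10_000, "WRITER_TYPE": "insert_values"})
print(C.MAX_EVENTS_IN_CHUNK)  # 10000, assuming no environment override
print(C.POOL_TYPE)            # "process" - untouched default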

View File

249
dlt/unpacker/unpacker.py Normal file
View File

@@ -0,0 +1,249 @@
from typing import Any, Callable, Type, List, Dict, Optional, Sequence, Tuple
from multiprocessing.pool import Pool as ProcessPool
from itertools import chain
from prometheus_client import Counter, CollectorRegistry, REGISTRY, Gauge
from prometheus_client.metrics import MetricWrapperBase
from dlt.common import pendulum, signals, json, logger
from dlt.common.runners import TRunArgs, TRunMetrics, create_default_args, pool_runner, initialize_runner
from dlt.common.storages.unpacker_storage import UnpackerStorage
from dlt.common.telemetry import get_logging_extras
from dlt.common.utils import uniq_id
from dlt.common.typing import TEvent
from dlt.common.logger import process_internal_exception
from dlt.common.exceptions import PoolException
from dlt.common.storages import SchemaStorage
from dlt.common.schema import CannotCoerceColumnException, SchemaUpdate, Schema
from dlt.common.parser import PATH_SEPARATOR
from dlt.common.storages.loader_storage import LoaderStorage
from dlt.common.parser import extract, TExtractFunc
from dlt.unpacker.configuration import configuration, UnpackerConfiguration
extract_func: TExtractFunc = extract
CONFIG: Type[UnpackerConfiguration] = None
unpack_storage: UnpackerStorage = None
load_storage: LoaderStorage = None
schema_storage: SchemaStorage = None
load_schema_storage: SchemaStorage = None
event_counter: Counter = None
event_gauge: Gauge = None
schema_version_gauge: Gauge = None
load_package_counter: Counter = None
def create_gauges(registry: CollectorRegistry) -> Tuple[MetricWrapperBase, MetricWrapperBase, MetricWrapperBase, MetricWrapperBase]:
return (
Counter("unpacker_event_count", "Events processed in unpacker", ["schema"], registry=registry),
Gauge("unpacker_last_events", "Number of events processed in last run", ["schema"], registry=registry),
Gauge("unpacker_schema_version", "Current schema version", ["schema"], registry=registry),
Gauge("unpacker_load_packages_created_count", "Count of load package created", ["schema"], registry=registry)
)
def create_folders() -> Tuple[UnpackerStorage, LoaderStorage, SchemaStorage, SchemaStorage]:
unpack_storage = UnpackerStorage(True, CONFIG)
schema_storage = SchemaStorage(CONFIG.SCHEMA_VOLUME_PATH, makedirs=True)
load_schema_storage = SchemaStorage(CONFIG.LOADING_VOLUME_PATH, makedirs=False)
load_storage = LoaderStorage(True, CONFIG, CONFIG.WRITER_TYPE)
unpack_storage.initialize_storage()
load_storage.initialize_storage()
return unpack_storage, load_storage, schema_storage, load_schema_storage
def install_schemas(default_schemas_path: str, schema_names: List[str]) -> None:
# copy default schemas if not present
default_schemas = SchemaStorage(default_schemas_path)
logger.info(f"Checking default schemas in {schema_storage.storage.storage_path}")
for name in schema_names:
if not schema_storage.has_store_schema(name):
logger.info(f"Schema {name} not present in {schema_storage.storage.storage_path}, installing...")
schema = default_schemas.load_store_schema(name)
schema_storage.save_store_schema(schema)
def load_or_create_schema(schema_name: str) -> Schema:
try:
schema = schema_storage.load_store_schema(schema_name)
logger.info(f"Loaded schema with name {schema_name} with version {schema.schema_version}")
except FileNotFoundError:
schema = Schema(schema_name)
logger.info(f"Created new schema with name {schema_name}")
return schema
# this is a worker process
def w_unpack_files(schema_name: str, load_id: str, events_files: Sequence[str]) -> SchemaUpdate:
unpacked_data: Dict[str, List[Any]] = {}
schema_update: SchemaUpdate = {}
schema = load_or_create_schema(schema_name)
file_id = uniq_id()
# process all event files and store rows in memory
for events_file in events_files:
try:
logger.debug(f"Processing events file {events_file}")
with unpack_storage.storage.open(events_file) as f:
events: Sequence[TEvent] = json.load(f)
for event in events:
for table_name, row in extract_func(schema, event, load_id, CONFIG.ADD_EVENT_JSON):
# filter row, may eliminate some or all fields
row = schema.filter_row(table_name, row, PATH_SEPARATOR)
# do not process empty rows
if row:
# check if schema can be updated
row, table_update = schema.coerce_row(table_name, row)
if len(table_update) > 0:
# update schema and save the change
schema.update_schema(table_name, table_update)
table_updates = schema_update.setdefault(table_name, [])
table_updates.extend(table_update)
# store row
rows = unpacked_data.setdefault(table_name, [])
rows.append(row)
except Exception:
process_internal_exception(f"Exception when processing file {events_file}")
raise PoolException("unpack_files", events_file)
# save rows and return schema changes to be gathered in parent process
for table_name, rows in unpacked_data.items():
# save rows into new load files to be processed by the loader
table = schema.get_table(table_name)
load_storage.write_temp_loading_file(load_id, table_name, table, file_id, rows)
return schema_update
TMapFuncRV = Tuple[List[SchemaUpdate], List[Sequence[str]]]
TMapFuncType = Callable[[ProcessPool, str, str, Sequence[str]], TMapFuncRV]
def map_parallel(pool: ProcessPool, schema_name: str, load_id: str, files: Sequence[str]) -> TMapFuncRV:
# we chunk files in a way to not exceed MAX_EVENTS_IN_CHUNK and split them equally
# between processors
configured_processes = pool._processes # type: ignore
chunk_files = UnpackerStorage.chunk_by_events(files, CONFIG.MAX_EVENTS_IN_CHUNK, configured_processes)
logger.info(f"Obtained {len(chunk_files)} processing chunks")
param_chunk = [(schema_name, load_id, files) for files in chunk_files]
return pool.starmap(w_unpack_files, param_chunk), chunk_files
def map_single(_: ProcessPool, schema_name: str, load_id: str, files: Sequence[str]) -> TMapFuncRV:
chunk_files = UnpackerStorage.chunk_by_events(files, CONFIG.MAX_EVENTS_IN_CHUNK, 1)
# get in one chunk
assert len(chunk_files) == 1
logger.info(f"Obtained {len(chunk_files)} processing chunks")
return [w_unpack_files(schema_name, load_id, chunk_files[0])], chunk_files
def update_schema(schema_name: str, schema_updates: List[SchemaUpdate]) -> Schema:
schema = load_or_create_schema(schema_name)
# gather schema from all manifests, validate consistency and combine
for schema_update in schema_updates:
for table_name, table_updates in schema_update.items():
logger.debug(f"Updating schema for table {table_name} with {len(table_updates)} deltas")
schema.update_schema(table_name, table_updates)
return schema
def spool_files(pool: ProcessPool, schema_name: str, load_id: str, map_f: TMapFuncType, files: Sequence[str]) -> None:
# process files in parallel or in single thread, depending on map_f
schema_updates, chunk_files = map_f(pool, schema_name, load_id, files)
schema = update_schema(schema_name, schema_updates)
schema_version_gauge.labels(schema_name).set(schema._version)
logger.metrics("Unpacker metrics", extra=get_logging_extras([schema_version_gauge.labels(schema_name)]))
logger.info(f"Saving schema {schema_name} with version {schema._version}, writing manifest files")
# schema is updated, save it to schema volume
schema_storage.save_store_schema(schema)
# save schema and schema updates to temp load folder
load_schema_storage.save_folder_schema(schema, load_id)
load_storage.save_schema_updates(load_id, schema_updates)
# files must be renamed and deleted together so do not attempt that when process is about to be terminated
signals.raise_if_signalled()
logger.info("Committing storage, do not kill this process")
# rename temp folder to processing
load_storage.commit_temp_load_folder(load_id)
# delete event files and count events to provide metrics
total_events = 0
for event_file in chain.from_iterable(chunk_files): # flatten chunks
unpack_storage.storage.delete(event_file)
total_events += UnpackerStorage.get_events_count(event_file)
# log and update metrics
logger.info(f"Chunk {load_id} processed")
load_package_counter.labels(schema_name).inc()
event_counter.labels(schema_name).inc(total_events)
event_gauge.labels(schema_name).set(total_events)
logger.metrics("Unpacker metrics", extra=get_logging_extras(
[load_package_counter.labels(schema_name), event_counter.labels(schema_name), event_gauge.labels(schema_name)]))
def spool_schema_files(pool: ProcessPool, schema_name: str, files: Sequence[str]) -> str:
# unpacked files will go here before being atomically renamed
load_id = str(pendulum.now().timestamp())
load_storage.create_temp_load_folder(load_id)
logger.info(f"Created temp load folder {load_id} on loading volume")
try:
# process parallel
spool_files(pool, schema_name, load_id, map_parallel, files)
except CannotCoerceColumnException as exc:
# schema conflicts resulting from parallel execution
logger.warning(f"Parallel schema update conflict, switching to single thread ({str(exc)})")
# start from scratch
load_storage.create_temp_load_folder(load_id)
spool_files(pool, schema_name, load_id, map_single, files)
return load_id
def run(pool: ProcessPool) -> TRunMetrics:
logger.info("Running file unpacking")
# list files and group by schema name, list must be sorted for group by to actually work
files = unpack_storage.list_files_to_unpack_sorted()
logger.info(f"Found {len(files)} files, will process in chunks of {CONFIG.MAX_EVENTS_IN_CHUNK} events")
if len(files) == 0:
return TRunMetrics(True, False, 0)
# group files by schema
for schema_name, files_in_schema in unpack_storage.get_grouped_iterator(files):
logger.info(f"Found files in schema {schema_name}")
spool_schema_files(pool, schema_name, list(files_in_schema))
# return info on still pending files (if extractor saved something in the meantime)
return TRunMetrics(False, False, len(unpack_storage.list_files_to_unpack_sorted()))
def configure(C: Type[UnpackerConfiguration], collector: CollectorRegistry, extract_f: TExtractFunc, default_schemas_path: str = None, schema_names: List[str] = None) -> bool:
global CONFIG
global unpack_storage, load_storage, schema_storage, load_schema_storage
global event_counter, event_gauge, schema_version_gauge, load_package_counter
global extract_func
CONFIG = C
# set extracting parser function
extract_func = extract_f
try:
unpack_storage, load_storage, schema_storage, load_schema_storage = create_folders()
event_counter, event_gauge, schema_version_gauge, load_package_counter = create_gauges(collector)
if default_schemas_path and schema_names:
install_schemas(default_schemas_path, schema_names)
return True
except Exception:
process_internal_exception("init module")
return False
def main(extract_f: TExtractFunc, default_schemas_path: str = None, schema_names: List[str] = None) -> None:
# initialize runner
C = configuration()
parser = create_default_args(C)
args = parser.parse_args()
initialize_runner(C, TRunArgs(args.single_run, args.wait_runs))
if not configure(C, REGISTRY, extract_f, default_schemas_path, schema_names):
exit(-1)
# run
exit(pool_runner(C, run))
if __name__ == '__main__':
main(extract)
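main() takes the extract function plus optional default schemas, so the runner can be reused with a custom extractor. A hypothetical embedding; my_extract and its single events table are illustrative, only the (table_name, row) yield shape is taken from the loop in w_unpack_files:

# hypothetical wrapper module reusing the unpacker runner with a custom extractor
from typing import Any, Iterator, Tuple
from dlt.common.schema import Schema
from dlt.unpacker import unpacker

def my_extract(schema: Schema, event: Any, load_id: str, add_event_json: bool) -> Iterator[Tuple[str, Any]]:
    # yield (table_name, row) pairs exactly as w_unpack_files consumes them
    yield "events", {"sender_id": event.get("sender_id"), "event": event.get("event")}

if __name__ == "__main__":
    # runs configuration parsing, storage setup and the pool runner loop
    unpacker.main(my_extract, default_schemas_path=None, schema_names=None)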

View File

@@ -7,9 +7,9 @@
from typing import Sequence
from autopoiesis.common.typing import StrAny
from autopoiesis.common import json
from autopoiesis.common.schema import Schema
from dlt.common.typing import StrAny
from dlt.common import json
from dlt.common.schema import Schema
from dlt.pipeline import Pipeline
# the load schema will be named {pipeline_name}_{source_name}

View File

@@ -1,6 +1,6 @@
from autopoiesis.common import json
from autopoiesis.common.schema import Schema
from autopoiesis.common.typing import DictStrAny, StrAny
from dlt.common import json
from dlt.common.schema import Schema
from dlt.common.typing import DictStrAny, StrAny
from dlt.pipeline import Pipeline, PostgresPipelineCredentials
@@ -17,7 +17,6 @@ from dlt.pipeline import Pipeline, PostgresPipelineCredentials
# credentials = Pipeline.load_gcp_credentials("_secrets/project1234_service.json", "gamma_guild")
import multiprocessing
multiprocessing.set_start_method("spawn", force=True)
if __name__ == '__main__':
# working redshift creds, you can pass password as last parameter or via PG_PASSWORD env variable ie.
@@ -70,7 +69,7 @@ if __name__ == '__main__':
# from now on each pipeline does more or less the same thing: unpack and load data
# now create loading packages and infer the schema
m = pipeline.unpack(workers=2)
m = pipeline.unpack()
if m.has_failed:
print("Unpacking failed")
print(pipeline.last_run_exception)

View File

@@ -2,10 +2,11 @@ import requests
from typing import Iterator, Sequence, cast
from web3 import Web3, HTTPProvider
from autopoiesis.common import Decimal
from autopoiesis.common.arithmetics import numeric_default_context, numeric_default_quantize
from autopoiesis.common.schema import Schema
from autopoiesis.common.typing import DictStrAny, StrAny
from dlt.common import json
from dlt.common import Decimal
from dlt.common.arithmetics import numeric_default_context, numeric_default_quantize
from dlt.common.schema import Schema
from dlt.common.typing import DictStrAny, StrAny
from dlt.pipeline import Pipeline, TExtractorItemWithTable, TExtractorItem
@@ -117,6 +118,10 @@ schema: Schema = None
# in case of ethereum data the fundamental problem is the 2^256 integer size, which does not fit in any BIGINT
# type; that is fixed in the schema loaded below
schema = Pipeline.load_schema_from_file("examples/schemas/ethereum_schema.yml")
# jschema = schema.to_dict()
# with open("examples/schemas/ethereum_schema.json", "w") as f:
# json.dump(jschema, f)
# exit(-1)
pipeline.create_pipeline(credentials, schema=schema)
print(pipeline.root_path)
@@ -124,13 +129,12 @@ m = pipeline.extract_generator(block_generator)
if m.has_failed:
print("Extracting failed")
print(pipeline.last_run_exception)
exit(0)
exit(0)
m = pipeline.unpack()
if m.has_failed:
print("Unpacking failed")
print(pipeline.last_run_exception)
exit(0)
# get inferred schema
schema = pipeline.get_current_schema()

View File

@@ -5,8 +5,8 @@ import io
from typing import Any, Iterator
import csv
from autopoiesis.common.typing import StrAny
from autopoiesis.common.schema import Schema
from dlt.common.typing import StrAny
from dlt.common.schema import Schema
from dlt.pipeline import Pipeline
SCOPES = ['https://www.googleapis.com/auth/drive']
@@ -15,19 +15,19 @@ SCOPES = ['https://www.googleapis.com/auth/drive']
KEY_FILE_LOCATION = '_secrets/project1234_service.json'
def _initialize_drive() -> Any:
"""Initializes an drive service object.
# def _initialize_drive() -> Any:
# """Initializes an drive service object.
Returns:
An authorized drive service object.
"""
credentials = ServiceAccountCredentials.from_json_keyfile_name(
KEY_FILE_LOCATION, SCOPES)
# Returns:
# An authorized drive service object.
# """
# credentials = ServiceAccountCredentials.from_json_keyfile_name(
# KEY_FILE_LOCATION, SCOPES)
# Build the service object.
service = build('drive', 'v3', credentials=credentials)
# # Build the service object.
# service = build('drive', 'v3', credentials=credentials)
return service
# return service
def _initialize_sheets() -> Any:
@@ -41,20 +41,20 @@ def _initialize_sheets() -> Any:
return service
def download_csv_as_json(file_id: str, csv_options: StrAny = None) -> Iterator[StrAny]:
if csv_options is None:
csv_options = {}
# def download_csv_as_json(file_id: str, csv_options: StrAny = None) -> Iterator[StrAny]:
# if csv_options is None:
# csv_options = {}
drive_service = _initialize_drive()
request = drive_service.files().get_media(fileId=file_id)
fh = io.BytesIO()
downloader = MediaIoBaseDownload(fh, request)
done = False
while done is False:
status, done = downloader.next_chunk()
print("Download %d%%." % int(status.progress() * 100))
rows = fh.getvalue().decode("utf-8")
return csv.DictReader(io.StringIO(rows), **csv_options)
# drive_service = _initialize_drive()
# request = drive_service.files().get_media(fileId=file_id)
# fh = io.BytesIO()
# downloader = MediaIoBaseDownload(fh, request)
# done = False
# while done is False:
# status, done = downloader.next_chunk()
# print("Download %d%%." % int(status.progress() * 100))
# rows = fh.getvalue().decode("utf-8")
# return csv.DictReader(io.StringIO(rows), **csv_options)
def download_sheet_to_csv(spreadsheet_id: str, sheet_name: str) -> Iterator[StrAny]:

View File

@@ -0,0 +1,911 @@
{
"tables": {
"_loads": {
"inserted_at": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "timestamp",
"name": "inserted_at",
"nullable": false
},
"load_id": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "load_id",
"nullable": false
},
"status": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "status",
"nullable": false
}
},
"_version": {
"engine_version": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "engine_version",
"nullable": false
},
"inserted_at": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "timestamp",
"name": "inserted_at",
"nullable": false
},
"version": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "version",
"nullable": false
}
},
"blocks": {
"_load_id": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "_load_id",
"nullable": false
},
"_record_hash": {
"partition": false,
"cluster": false,
"unique": true,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "_record_hash",
"nullable": false
},
"number": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": true,
"foreign_key": false,
"data_type": "bigint",
"name": "number",
"nullable": false
},
"parent_hash": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "parent_hash",
"nullable": true
},
"hash": {
"partition": false,
"cluster": true,
"unique": true,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "hash",
"nullable": false
},
"base_fee_per_gas": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "wei",
"name": "base_fee_per_gas",
"nullable": false
},
"difficulty": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "wei",
"name": "difficulty",
"nullable": false
},
"extra_data": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "extra_data",
"nullable": true
},
"gas_limit": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "gas_limit",
"nullable": false
},
"gas_used": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "gas_used",
"nullable": false
},
"logs_bloom": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "logs_bloom",
"nullable": true
},
"miner": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "miner",
"nullable": true
},
"mix_hash": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "mix_hash",
"nullable": true
},
"nonce": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "nonce",
"nullable": true
},
"receipts_root": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "receipts_root",
"nullable": true
},
"sha3_uncles": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "sha3_uncles",
"nullable": true
},
"size": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "size",
"nullable": true
},
"state_root": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "state_root",
"nullable": false
},
"timestamp": {
"partition": false,
"cluster": false,
"unique": true,
"sort": true,
"primary_key": false,
"foreign_key": false,
"data_type": "timestamp",
"name": "timestamp",
"nullable": false
},
"total_difficulty": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "wei",
"name": "total_difficulty",
"nullable": true
},
"transactions_root": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "transactions_root",
"nullable": false
}
},
"blocks__transactions": {
"_record_hash": {
"partition": false,
"cluster": false,
"unique": true,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "_record_hash",
"nullable": false
},
"block_number": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": true,
"foreign_key": false,
"data_type": "bigint",
"name": "block_number",
"nullable": false
},
"transaction_index": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": true,
"foreign_key": false,
"data_type": "bigint",
"name": "transaction_index",
"nullable": false
},
"hash": {
"partition": false,
"cluster": false,
"unique": true,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "hash",
"nullable": false
},
"block_hash": {
"partition": false,
"cluster": true,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "block_hash",
"nullable": false
},
"block_timestamp": {
"partition": false,
"cluster": false,
"unique": false,
"sort": true,
"primary_key": false,
"foreign_key": false,
"data_type": "timestamp",
"name": "block_timestamp",
"nullable": false
},
"chain_id": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "chain_id",
"nullable": true
},
"from": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "from",
"nullable": true
},
"gas": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "gas",
"nullable": true
},
"gas_price": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "gas_price",
"nullable": true
},
"input": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "input",
"nullable": true
},
"max_fee_per_gas": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "wei",
"name": "max_fee_per_gas",
"nullable": true
},
"max_priority_fee_per_gas": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "wei",
"name": "max_priority_fee_per_gas",
"nullable": true
},
"nonce": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "nonce",
"nullable": true
},
"r": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "r",
"nullable": true
},
"s": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "s",
"nullable": true
},
"status": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "status",
"nullable": true
},
"to": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "to",
"nullable": true
},
"type": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "type",
"nullable": true
},
"v": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "v",
"nullable": true
},
"value": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "wei",
"name": "value",
"nullable": false
},
"eth_value": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "decimal",
"name": "eth_value",
"nullable": true
}
},
"blocks__transactions__logs": {
"_record_hash": {
"partition": false,
"cluster": false,
"unique": true,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "_record_hash",
"nullable": false
},
"address": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "address",
"nullable": false
},
"block_timestamp": {
"partition": false,
"cluster": false,
"unique": false,
"sort": true,
"primary_key": false,
"foreign_key": false,
"data_type": "timestamp",
"name": "block_timestamp",
"nullable": false
},
"block_hash": {
"partition": false,
"cluster": true,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "block_hash",
"nullable": false
},
"block_number": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": true,
"foreign_key": false,
"data_type": "bigint",
"name": "block_number",
"nullable": false
},
"transaction_index": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": true,
"foreign_key": false,
"data_type": "bigint",
"name": "transaction_index",
"nullable": false
},
"log_index": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": true,
"foreign_key": false,
"data_type": "bigint",
"name": "log_index",
"nullable": false
},
"data": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "data",
"nullable": true
},
"removed": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bool",
"name": "removed",
"nullable": true
},
"transaction_hash": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "transaction_hash",
"nullable": false
}
},
"blocks__transactions__logs__topics": {
"_parent_hash": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": true,
"data_type": "text",
"name": "_parent_hash",
"nullable": false
},
"_pos": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "_pos",
"nullable": false
},
"_record_hash": {
"partition": false,
"cluster": false,
"unique": true,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "_record_hash",
"nullable": false
},
"_root_hash": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "_root_hash",
"nullable": false
},
"value": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "value",
"nullable": true
}
},
"blocks__transactions__access_list": {
"_parent_hash": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": true,
"data_type": "text",
"name": "_parent_hash",
"nullable": false
},
"_pos": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "_pos",
"nullable": false
},
"_record_hash": {
"partition": false,
"cluster": false,
"unique": true,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "_record_hash",
"nullable": false
},
"_root_hash": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "_root_hash",
"nullable": false
},
"address": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "address",
"nullable": true
}
},
"blocks__transactions__access_list__storage_keys": {
"_parent_hash": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": true,
"data_type": "text",
"name": "_parent_hash",
"nullable": false
},
"_pos": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "_pos",
"nullable": false
},
"_record_hash": {
"partition": false,
"cluster": false,
"unique": true,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "_record_hash",
"nullable": false
},
"_root_hash": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "_root_hash",
"nullable": false
},
"value": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "value",
"nullable": true
}
},
"blocks__uncles": {
"_parent_hash": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": true,
"data_type": "text",
"name": "_parent_hash",
"nullable": false
},
"_pos": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "bigint",
"name": "_pos",
"nullable": false
},
"_record_hash": {
"partition": false,
"cluster": false,
"unique": true,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "_record_hash",
"nullable": false
},
"_root_hash": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "_root_hash",
"nullable": false
},
"value": {
"partition": false,
"cluster": false,
"unique": false,
"sort": false,
"primary_key": false,
"foreign_key": false,
"data_type": "text",
"name": "value",
"nullable": true
}
}
},
"name": "ethereum",
"version": 8,
"preferred_types": {},
"hints": {
"foreign_key": [
"^_parent_hash$"
],
"not_null": [
"^_record_hash$",
"^_root_hash$",
"^_parent_hash$",
"^_pos$"
],
"unique": [
"^_record_hash$"
]
},
"excludes": [],
"includes": [],
"engine_version": 2
}
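The hints block assigns column properties by regex on the column name. A small sketch of how such hints could be resolved for a new column, assuming a simple any-match lookup rather than the exact Schema implementation:

# a sketch of regex-based hint resolution; the hint table mirrors the "hints"
# section above, the helper itself is illustrative rather than the Schema method
import re
from typing import Dict, List

HINTS: Dict[str, List[str]] = {
    "foreign_key": [r"^_parent_hash$"],
    "not_null": [r"^_record_hash$", r"^_root_hash$", r"^_parent_hash$", r"^_pos$"],
    "unique": [r"^_record_hash$"],
}

def column_hints(column_name: str) -> Dict[str, bool]:
    # a hint is switched on when any of its regexes matches the column name
    return {hint: any(re.search(p, column_name) for p in patterns) for hint, patterns in HINTS.items()}

print(column_hints("_parent_hash"))  # {'foreign_key': True, 'not_null': True, 'unique': False}
print(column_hints("timestamp"))     # all False - hints only fire for the technical _-prefixed columns here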

View File

@@ -0,0 +1,936 @@
tables:
_version:
version:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
engine_version:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
inserted_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: timestamp
nullable: false
_loads:
load_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
status:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
inserted_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: timestamp
nullable: false
model_annotations:
sender_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
message_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: true
annotation:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
confidence:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: double
nullable: true
count:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: true
added_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
reviewed:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bool
nullable: true
_load_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
_record_hash:
partition: false
cluster: false
unique: true
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
name: csv
version: 2
preferred_types: {}
hints:
not_null:
- ^_record_hash$
- ^_root_hash$
- ^_parent_hash$
- ^_pos$
- _load_id
foreign_key:
- ^_parent_hash$
unique:
- ^_record_hash$
excludes: []
includes: []
engine_version: 2
tables:
_version:
version:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
engine_version:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
inserted_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: timestamp
nullable: false
_loads:
load_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
status:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
inserted_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: timestamp
nullable: false
model_annotations:
sender_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
message_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: true
annotation:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
confidence:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: double
nullable: true
count:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: true
added_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
reviewed:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bool
nullable: true
_load_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
_record_hash:
partition: false
cluster: false
unique: true
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
name: csv
version: 2
preferred_types: {}
hints:
not_null:
- ^_record_hash$
- ^_root_hash$
- ^_parent_hash$
- ^_pos$
- _load_id
foreign_key:
- ^_parent_hash$
unique:
- ^_record_hash$
excludes: []
includes: []
engine_version: 2
tables:
_version:
version:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
engine_version:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
inserted_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: timestamp
nullable: false
_loads:
load_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
status:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
inserted_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: timestamp
nullable: false
model_annotations:
sender_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
message_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: true
annotation:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
confidence:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: double
nullable: true
count:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: true
added_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
reviewed:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bool
nullable: true
_load_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
_record_hash:
partition: false
cluster: false
unique: true
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
name: csv
version: 2
preferred_types: {}
hints:
not_null:
- ^_record_hash$
- ^_root_hash$
- ^_parent_hash$
- ^_pos$
- _load_id
foreign_key:
- ^_parent_hash$
unique:
- ^_record_hash$
excludes: []
includes: []
engine_version: 2
tables:
_version:
version:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
engine_version:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
inserted_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: timestamp
nullable: false
_loads:
load_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
status:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
inserted_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: timestamp
nullable: false
model_annotations:
sender_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
message_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: true
annotation:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
confidence:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: double
nullable: true
count:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: true
added_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
reviewed:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bool
nullable: true
_load_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
_record_hash:
partition: false
cluster: false
unique: true
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
name: csv
version: 2
preferred_types: {}
hints:
not_null:
- ^_record_hash$
- ^_root_hash$
- ^_parent_hash$
- ^_pos$
- _load_id
foreign_key:
- ^_parent_hash$
unique:
- ^_record_hash$
excludes: []
includes: []
engine_version: 2
tables:
_version:
version:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
engine_version:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
inserted_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: timestamp
nullable: false
_loads:
load_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
status:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
inserted_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: timestamp
nullable: false
model_annotations:
sender_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
message_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: true
annotation:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
confidence:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: double
nullable: true
count:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: true
added_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
reviewed:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bool
nullable: true
_load_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
_record_hash:
partition: false
cluster: false
unique: true
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
name: csv
version: 2
preferred_types: {}
hints:
not_null:
- ^_record_hash$
- ^_root_hash$
- ^_parent_hash$
- ^_pos$
- _load_id
foreign_key:
- ^_parent_hash$
unique:
- ^_record_hash$
excludes: []
includes: []
engine_version: 2
tables:
_version:
version:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
engine_version:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
inserted_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: timestamp
nullable: false
_loads:
load_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
status:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: false
inserted_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: timestamp
nullable: false
model_annotations:
sender_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
message_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: true
annotation:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
confidence:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: double
nullable: true
count:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bigint
nullable: true
added_at:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: true
reviewed:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: bool
nullable: true
_load_id:
partition: false
cluster: false
unique: false
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
_record_hash:
partition: false
cluster: false
unique: true
sort: false
primary_key: false
foreign_key: false
data_type: text
nullable: false
name: csv
version: 2
preferred_types: {}
hints:
not_null:
- ^_record_hash$
- ^_root_hash$
- ^_parent_hash$
- ^_pos$
- _load_id
foreign_key:
- ^_parent_hash$
unique:
- ^_record_hash$
excludes: []
includes: []
engine_version: 2

1110
poetry.lock generated

File diff suppressed because it is too large

View File

@@ -1,27 +1,61 @@
[tool.poetry]
name = "python-dlt"
version = "0.0.1"
version = "0.1.0.dev0"
description = "DLT is an open-source python-native scalable data loading framework that does not require any devops efforts to run."
authors = ["Marcin Rudolf <rudolfix@rudolfix.org>"]
license = "MIT"
authors = ["ScaleVector <services@scalevector.ai>"]
maintainers = [ "Marcin Rudolf <marcin@scalevector.ai>", "Adrian Brudaru <adrian@scalevector.ai>",]
readme = "README.md"
license = "Apache-2.0"
homepage = "https://github.com/scale-vector"
repository = "https://github.com/scale-vector/dlt"
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: Apache Software License",
"Topic :: Software Development :: Libraries",
"Operating System :: MacOS :: MacOS X",
"Operating System :: POSIX :: Linux",]
keywords = [ "etl" ]
include = [ "LICENSE.txt", "README.md"]
packages = [
{ include = "dlt" },
]
[tool.poetry.dependencies]
python = "^3.8,<3.11"
# autopoiesis = {path = "../rasa_data_ingestion"}
requests = "^2.26.0"
pendulum = "^2.1.2"
simplejson = "^3.17.5"
jsonlines = "^2.0.0"
PyYAML = "^5.4.1"
json-logging = "1.4.1rc0"
prometheus-client = "^0.11.0"
semver = "^2.13.0"
sentry-sdk = "^1.4.3"
hexbytes = "^0.2.2"
cachetools = "^5.2.0"
psycopg2-binary = {version = "^2.9.1", optional = true, extras = ["redshift", "postgres"]}
grpcio = {version = "1.43.0", optional = true, extras = ["gcp"]}
google-cloud-bigquery = {version = "^2.26.0", optional = true, extras = ["gcp"]}
GitPython = {version = "^3.1.26", optional = true, extras = ["dbt"]}
dbt-core = {version = "1.0.6", optional = true, extras = ["dbt"]}
dbt-redshift = {version = "1.0.1", optional = true, extras = ["dbt"]}
dbt-bigquery = {version = "1.0.0", optional = true, extras = ["dbt"]}
[tool.poetry.dev-dependencies]
pytest = "6.2.4"
pytest = "^6.2.4"
mypy = "0.931"
flake8 = "3.9.2"
bandit = "1.7.0"
flake8-bugbear = "21.4.3"
pytest-pythonpath = "0.7.3"
bandit = "^1.7.0"
flake8-bugbear = "^21.4.3"
pytest-pythonpath = "^0.7.3"
pytest-order = "^1.0.0"
pytest-cases = "^3.6.9"
pytest-forked = "^1.3.0"
types-PyYAML = "^6.0.7"
types-cachetools = "^4.2.9"
types-protobuf = "^3.19.8"
@@ -29,6 +63,12 @@ types-simplejson = "^3.17.0"
types-requests = "^2.25.6"
types-python-dateutil = "^2.8.15"
[tool.poetry.extras]
dbt = ["dbt-core", "GitPython", "dbt-redshift", "dbt-bigquery"]
gcp = ["grpcio", "google-cloud-bigquery"]
postgres = ["psycopg2-binary"]
redshift = ["psycopg2-binary"]
[build-system]
requires = ["poetry-core>=1.0.8"]
build-backend = "poetry.core.masonry.api"

7
pytest.ini Normal file
View File

@@ -0,0 +1,7 @@
[pytest]
python_paths= autopoiesis
norecursedirs= .direnv .eggs build dist
addopts= -v --showlocals --durations 10
xfail_strict= true
log_cli= 1
log_cli_level= INFO

18
tests/.example.env Normal file
View File

@@ -0,0 +1,18 @@
# copy to .env and run with (set -a && . tests/.env && pytest tests)
# for tests that do not involve any secrets you may run (set -a && . tests/.example.env && pytest tests)
PROJECT_ID=chat-analytics-317513
DATASET=carbon_bot_3
BQ_CRED_PRIVATE_KEY="-----BEGIN PRIVATE KEY-----
paste key here
-----END PRIVATE KEY-----
"
BQ_CRED_CLIENT_EMAIL=loader@chat-analytics-317513.iam.gserviceaccount.com
PG_DATABASE_NAME=chat_analytics_rasa
PG_SCHEMA_PREFIX=carbon_bot_3
PG_USER=loader
PG_HOST=3.73.90.3
PG_PASSWORD=set-me-up

0
tests/__init__.py Normal file
View File

0
tests/common/__init__.py Normal file
View File

View File

@@ -0,0 +1,32 @@
{
"event": "bot",
"timestamp": 1624001210.7276764,
"metadata": {
"rasa_x_flagged": false,
"rasa_x_id": 60304
},
"text": "Hello! Just a heads up - this bot is part of a research project and we intend to make the conversations publicly available to researchers. So please don't share any personal information! [Privacy Policy](https://rasa.com/carbon-bot-privacy-policy/)",
"data": {
"elements": null,
"quick_replies": null,
"buttons": null,
"attachment": null,
"image": null,
"custom": null
},
"data__custom": "remains",
"data__custom__goes": "goes",
"custom_data": {
"excluded_path": {
"prop1": "str1"
},
"included_object": {
"included_exception": "exception",
"eliminated": true
}
},
"is_flagged": false,
"sender_id": "411b44bdfcc545f282fb4aa15282b73f",
"model_id": "__unknown",
"environment": "__unknown"
}
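The data__custom and data__custom__goes keys show nested values flattened into double-underscore column paths. A rough sketch of that flattening, assuming the separator matches dlt.common.parser.PATH_SEPARATOR and ignoring the list handling and hash generation done by the real parser:

# sketch of nested-dict flattening into double-underscore column paths
from typing import Any, Dict, Iterator, Tuple

PATH_SEPARATOR = "__"  # assumed to match dlt.common.parser.PATH_SEPARATOR

def flatten(row: Dict[str, Any], parent: str = "") -> Iterator[Tuple[str, Any]]:
    for key, value in row.items():
        path = f"{parent}{PATH_SEPARATOR}{key}" if parent else key
        if isinstance(value, dict):
            yield from flatten(value, path)
        else:
            yield path, value

event = {"event": "bot", "data": {"custom": {"goes": "goes"}}}
print(dict(flatten(event)))  # {'event': 'bot', 'data__custom__goes': 'goes'}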

Binary file not shown.

File diff suppressed because it is too large

View File

@@ -0,0 +1,63 @@
{
"tables": {
"_version": {
"version": {
"name": "version",
"data_type": "bigint",
"nullable": false
},
"engine_version": {
"name": "engine_version",
"data_type": "bigint",
"nullable": false
},
"inserted_at": {
"name": "inserted_at",
"data_type": "timestamp",
"nullable": false
}
},
"_loads": {
"load_id": {
"name": "load_id",
"data_type": "text",
"nullable": false
},
"status": {
"name": "status",
"data_type": "bigint",
"nullable": false
},
"inserted_at": {
"name": "inserted_at",
"data_type": "timestamp",
"nullable": false
}
}
},
"name": "model",
"version": 1,
"preferred_types": {
"^timestamp$": "timestamp",
"trained_at$": "timestamp",
"^inserted_at$": "timestamp",
"^_pos$": "bigint"
},
"hints": {
"not_null": [
"^timestamp$",
"^_record_hash$",
"^_root_hash$",
"^_load_id$",
"^_parent_hash$",
"^_pos$"
],
"primary_key": [
"^_record_hash$"
],
"foreign_key": [
"^_parent_hash$"
]
},
"engine_version": 1
}

View File

@@ -0,0 +1,59 @@
{
"tables": {
"_version": {
"version": {
"name": "version",
"data_type": "bigint",
"nullable": false
},
"engine_version": {
"name": "engine_version",
"data_type": "bigint",
"nullable": false
},
"inserted_at": {
"name": "inserted_at",
"data_type": "timestamp",
"nullable": false
}
},
"_loads": {
"load_id": {
"name": "load_id",
"data_type": "text",
"nullable": false
},
"status": {
"name": "status",
"data_type": "bigint",
"nullable": false
},
"inserted_at": {
"name": "inserted_at",
"data_type": "timestamp",
"nullable": false
}
}
},
"version": 1,
"engine_version": 2,
"name": "event",
"preferred_types": {
"^timestamp$": "timestamp",
"^_timestamp$": "timestamp",
"^inserted_at$": "timestamp",
"confidence": "double",
"^_pos$": "bigint"
},
"hints": {
"not_null": ["^timestamp$", "^_timestamp$", "^_dist_key$", "^_record_hash$", "^_root_hash$", "^_load_id$", "^_parent_hash$", "^_pos$", "^sender_id$"],
"partition": ["^_timestamp$", "^timestamp$"],
"cluster": ["^_dist_key$", "^sender_id$"],
"primary_key": [],
"foreign_key": ["^_parent_hash$"],
"sort": ["^timestamp$", "^_timestamp$"],
"unique": ["^_record_hash$"]
},
"excludes": ["^event_user__parse_data", "^event_bot__data", "^event_bot__metadata"],
"includes": ["^event_user__parse_data__(intent|entities|message_id$|text$)", "^event_bot__metadata__(utter_action|template_name|rasa_x_[a-z]+)$"]
}
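The excludes and includes lists are regexes over flattened column paths; judging by the fixture with excluded_path and included_exception, an include re-admits a path that an exclude would drop. A sketch of that filter under this assumption:

# a sketch of the excludes/includes filter, assuming includes re-admit column
# paths that an exclude pattern would otherwise drop (see schema.filter_row)
import re
from typing import Sequence

EXCLUDES = [r"^event_user__parse_data", r"^event_bot__data", r"^event_bot__metadata"]
INCLUDES = [r"^event_user__parse_data__(intent|entities|message_id$|text$)",
            r"^event_bot__metadata__(utter_action|template_name|rasa_x_[a-z]+)$"]

def keep_path(path: str, excludes: Sequence[str] = EXCLUDES, includes: Sequence[str] = INCLUDES) -> bool:
    if not any(re.search(p, path) for p in excludes):
        return True
    # excluded paths survive only when an include pattern matches them back in
    return any(re.search(p, path) for p in includes)

assert keep_path("event_user__text")
assert not keep_path("event_bot__data__custom")
assert keep_path("event_bot__metadata__rasa_x_id")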

View File

@@ -0,0 +1,54 @@
{
"tables": {
"_version": {
"version": {
"name": "version",
"data_type": "bigint",
"nullable": false
},
"engine_version": {
"name": "engine_version",
"data_type": "bigint",
"nullable": false
},
"inserted_at": {
"name": "inserted_at",
"data_type": "timestamp",
"nullable": false
}
},
"_loads": {
"load_id": {
"name": "load_id",
"data_type": "text",
"nullable": false
},
"status": {
"name": "status",
"data_type": "bigint",
"nullable": false
},
"inserted_at": {
"name": "inserted_at",
"data_type": "timestamp",
"nullable": false
}
}
},
"version": 1,
"engine_version": 2,
"name": "model",
"preferred_types": {
"^timestamp$": "timestamp",
"trained_at$": "timestamp",
"^inserted_at$": "timestamp",
"^_pos$": "bigint"
},
"hints": {
"not_null": ["^timestamp$", "^_record_hash$", "^_root_hash$", "^_load_id$", "^_parent_hash$", "^_pos$"],
"unique": ["^_record_hash$"],
"foreign_key": ["^_parent_hash$"]
},
"excludes": [],
"includes": []
}

View File

@@ -0,0 +1 @@
kube

View File

@@ -0,0 +1 @@
BANANA

View File

@@ -0,0 +1,16 @@
[
{
"f_int": 7817289712,
"f_float": 92898e37,
"f_timestamp": "2021-10-13T13:49:32.901899+00:00",
"f_bool": true,
"f_bool_2": false,
"f_str": "some string"
},
{
"f_int": 7817289713,
"f_float": 878172.8292,
"f_timestamp": "2021-10-13T13:49:32.901899+00:00",
"f_bool_2": false
}
]

View File

@@ -0,0 +1,14 @@
[
{
"idx": 1,
"str": ", NULL'); DROP SCHEMA Public --"
},
{
"idx": 2,
"str": "イロハニホヘト チリヌルヲ 'ワカヨタレソ ツネナラム"
},
{
"idx": 3,
"str": "ऄअआइ'ईउऊऋऌऍऎए"
}
]

View File

View File

@@ -0,0 +1,84 @@
import pytest
from typing import Sequence, Tuple
from dlt.common.file_storage import FileStorage
from dlt.common.storages.loader_storage import LoaderStorage
from dlt.common.configuration import LoadingVolumeConfiguration, make_configuration
from dlt.common.storages.exceptions import NoMigrationPathException
from dlt.common.typing import StrAny
from dlt.common.utils import uniq_id
from tests.utils import write_version, autouse_root_storage
@pytest.fixture
def storage() -> LoaderStorage:
C = make_configuration(LoadingVolumeConfiguration, LoadingVolumeConfiguration)
s = LoaderStorage(True, C, "jsonl")
s.initialize_storage()
return s
def test_archive_completed(storage: LoaderStorage) -> None:
# should delete archive in full
storage.delete_completed_jobs = True
load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}])
assert storage.storage.has_folder(storage.get_load_path(load_id))
storage.complete_job(load_id, file_name)
storage.archive_load(load_id)
# deleted from loading
assert not storage.storage.has_folder(storage.get_load_path(load_id))
# deleted from archive
assert not storage.storage.has_folder(storage.get_archived_path(load_id))
# do not delete completed jobs
storage.delete_completed_jobs = False
load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}])
storage.complete_job(load_id, file_name)
storage.archive_load(load_id)
# deleted from loading
assert not storage.storage.has_folder(storage.get_load_path(load_id))
# has load archived
assert storage.storage.has_folder(storage.get_archived_path(load_id))
def test_archive_failed(storage: LoaderStorage) -> None:
# loads with failed jobs are always archived
storage.delete_completed_jobs = True
load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}])
assert storage.storage.has_folder(storage.get_load_path(load_id))
storage.fail_job(load_id, file_name, "EXCEPTION")
storage.archive_load(load_id)
# deleted from loading
assert not storage.storage.has_folder(storage.get_load_path(load_id))
# present in archive
assert storage.storage.has_folder(storage.get_archived_path(load_id))


def test_full_migration_path() -> None:
# create directory structure
s = LoaderStorage(True, LoadingVolumeConfiguration, "jsonl")
# overwrite known initial version
write_version(s.storage, "1.0.0")
# must be able to migrate to current version
s = LoaderStorage(False, LoadingVolumeConfiguration, "jsonl")
assert s.version == LoaderStorage.STORAGE_VERSION


def test_unknown_migration_path() -> None:
# create directory structure
s = LoaderStorage(True, LoadingVolumeConfiguration, "jsonl")
# overwrite known initial version
write_version(s.storage, "10.0.0")
    # there is no migration path from 10.0.0 to the current version
with pytest.raises(NoMigrationPathException):
LoaderStorage(False, LoadingVolumeConfiguration, "jsonl")


def start_loading_file(s: LoaderStorage, content: Sequence[StrAny]) -> Tuple[str, str]:
load_id = uniq_id()
s.create_temp_load_folder(load_id)
file_name = s.write_temp_loading_file(load_id, "mock_table", None, uniq_id(), content)
s.commit_temp_load_folder(load_id)
s.start_job(load_id, file_name)
return load_id, file_name

View File

@@ -0,0 +1,40 @@
import pytest
from dlt.common.file_storage import FileStorage
from dlt.common.storages.exceptions import NoMigrationPathException
from dlt.common.storages.unpacker_storage import UnpackerStorage
from dlt.common.configuration import UnpackingVolumeConfiguration
from tests.utils import TEST_STORAGE, write_version, autouse_root_storage


@pytest.mark.skip()
def test_load_events_and_group_by_sender() -> None:
# TODO: create fixture with two sender ids and 3 files and check the result
pass


@pytest.mark.skip()
def test_chunk_by_events() -> None:
    # TODO: should distribute ~ N events evenly among m cores, with a fallback for small numbers of events
pass


def test_full_migration_path() -> None:
# create directory structure
s = UnpackerStorage(True, UnpackingVolumeConfiguration)
# overwrite known initial version
write_version(s.storage, "1.0.0")
# must be able to migrate to current version
s = UnpackerStorage(True, UnpackingVolumeConfiguration)
assert s.version == UnpackerStorage.STORAGE_VERSION


def test_unknown_migration_path() -> None:
# create directory structure
s = UnpackerStorage(True, UnpackingVolumeConfiguration)
# overwrite known initial version
write_version(s.storage, "10.0.0")
    # there is no migration path from 10.0.0 to the current version
with pytest.raises(NoMigrationPathException):
UnpackerStorage(False, UnpackingVolumeConfiguration)

View File

@@ -0,0 +1,59 @@
import pytest
import semver
from dlt.common.file_storage import FileStorage
from dlt.common.storages.exceptions import NoMigrationPathException, WrongStorageVersionException
from dlt.common.storages.versioned_storage import VersionedStorage
from tests.utils import write_version, root_storage


class MigratedStorage(VersionedStorage):
def migrate_storage(self, from_version: semver.VersionInfo, to_version: semver.VersionInfo) -> None:
# migration example:
if from_version == "1.0.0" and from_version < to_version:
from_version = semver.VersionInfo.parse("1.1.0")
self._save_version(from_version)
if from_version == "1.1.0" and from_version < to_version:
from_version = semver.VersionInfo.parse("1.2.0")
self._save_version(from_version)


def test_new_versioned_storage(root_storage: FileStorage) -> None:
v = VersionedStorage("1.0.1", True, root_storage)
assert v.version == "1.0.1"


def test_new_versioned_storage_non_owner(root_storage: FileStorage) -> None:
with pytest.raises(WrongStorageVersionException) as wsve:
VersionedStorage("1.0.1", False, root_storage)
assert wsve.value.storage_path == root_storage.storage_path
assert wsve.value.target_version == "1.0.1"
assert wsve.value.initial_version == "0.0.0"


def test_migration(root_storage: FileStorage) -> None:
write_version(root_storage, "1.0.0")
v = MigratedStorage("1.2.0", True, root_storage)
assert v.version == "1.2.0"


def test_unknown_migration_path(root_storage: FileStorage) -> None:
write_version(root_storage, "1.0.0")
with pytest.raises(NoMigrationPathException) as wmpe:
MigratedStorage("1.3.0", True, root_storage)
assert wmpe.value.migrated_version == "1.2.0"


def test_only_owner_migrates(root_storage: FileStorage) -> None:
write_version(root_storage, "1.0.0")
with pytest.raises(WrongStorageVersionException) as wmpe:
MigratedStorage("1.2.0", False, root_storage)
assert wmpe.value.initial_version == "1.0.0"


def test_downgrade_not_possible(root_storage: FileStorage) -> None:
write_version(root_storage, "1.2.0")
with pytest.raises(NoMigrationPathException) as wmpe:
MigratedStorage("1.1.0", True, root_storage)
assert wmpe.value.migrated_version == "1.2.0"

Some files were not shown because too many files have changed in this diff