Mirror of https://github.com/dlt-hub/dlt.git (synced 2025-12-17 19:31:30 +00:00)
moves dlt core in
13
.dockerignore
Normal file
@@ -0,0 +1,13 @@
.idea
.direnv
.mypy_cache
.pytest_cache
htmlcov
.coverage
__pycache__
.eggs
.egg-info
_storage
_test_storage
Dockerfile
.md
203
LICENSE.txt
Normal file
@@ -0,0 +1,203 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright 2022 ScaleVector

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
91
Makefile
@@ -1,3 +1,28 @@
PYV=$(shell python3 -c "import sys;t='{v[0]}.{v[1]}'.format(v=list(sys.version_info[:2]));sys.stdout.write(t)")
.SILENT:has-poetry

# pipeline version info
AUTV=$(shell python3 -c "from dlt import __version__;print(__version__)")
AUTVMINMAJ=$(shell python3 -c "from dlt import __version__;print('.'.join(__version__.split('.')[:-1]))")

NAME := scalevector/dlt
TAG := $(shell git log -1 --pretty=%h)
IMG := ${NAME}:${TAG}
LATEST := ${NAME}:latest${VERSION_SUFFIX}
VERSION := ${AUTV}${VERSION_SUFFIX}
VERSION_MM := ${AUTVMINMAJ}${VERSION_SUFFIX}


# dbt runner version info
DBT_AUTV=$(shell python3 -c "from dlt.dbt_runner._version import __version__;print(__version__)")
DBT_AUTVMINMAJ=$(shell python3 -c "from dlt.dbt_runner._version import __version__;print('.'.join(__version__.split('.')[:-1]))")

DBT_NAME := scalevector/dlt-dbt-runner
DBT_IMG := ${DBT_NAME}:${TAG}
DBT_LATEST := ${DBT_NAME}:latest${VERSION_SUFFIX}
DBT_VERSION := ${DBT_AUTV}${VERSION_SUFFIX}
DBT_VERSION_MM := ${DBT_AUTVMINMAJ}${VERSION_SUFFIX}

install-poetry:
ifneq ($(VIRTUAL_ENV),)
	$(error you cannot be under virtual environment $(VIRTUAL_ENV))
@@ -8,14 +33,70 @@ has-poetry:
	poetry --version

dev: has-poetry
	# will install itself as editable module
	poetry install
	poetry run pip install -e ../rasa_data_ingestion
	# will install itself as editable module with all the extras
	poetry install -E "postgres redshift dbt gcp"

lint:
	poetry run mypy --config-file mypy.ini dlt examples
	poetry run flake8 --max-line-length=200 dlt examples
	# poetry run flake8 --max-line-length=200 dlt examples tests
	$(MAKE) lint-security

lint-security:
	poetry run bandit -r autopoiesis/ -n 3 -ll
	poetry run bandit -r dlt/ -n 3 -l

reset-test-storage:
	-rm -r _storage
	mkdir _storage
	python3 test/tools/create_storages.py

recreate-compiled-deps:
	poetry export -f requirements.txt --output _gen_requirements.txt --without-hashes --extras gcp --extras redshift
	grep `cat compiled_packages.txt` _gen_requirements.txt > compiled_requirements.txt

publish-library:
	poetry version ${VERSION}
	poetry build
	poetry publish -u __token__

build-image-tags:
	@echo ${IMG}
	@echo ${LATEST}
	@echo ${NAME}:${VERSION_MM}
	@echo ${NAME}:${VERSION}

build-image-no-version-tags:
	poetry export -f requirements.txt --output _gen_requirements.txt --without-hashes --extras gcp --extras redshift
	docker build -f deploy/dlt/Dockerfile --build-arg=COMMIT_SHA=${TAG} --build-arg=IMAGE_VERSION="${VERSION}" . -t ${IMG}

build-image: build-image-no-version-tags
	docker tag ${IMG} ${LATEST}
	docker tag ${IMG} ${NAME}:${VERSION_MM}
	docker tag ${IMG} ${NAME}:${VERSION}

push-image:
	docker push ${IMG}
	docker push ${LATEST}
	docker push ${NAME}:${VERSION_MM}
	docker push ${NAME}:${VERSION}

dbt-build-image-tags:
	@echo ${DBT_IMG}
	@echo ${DBT_LATEST}
	@echo ${DBT_VERSION_MM}
	@echo ${DBT_VERSION}

dbt-build-image:
	poetry export -f requirements.txt --output _gen_requirements_dbt.txt --without-hashes --extras dbt
	docker build -f dlt/dbt_runner/Dockerfile --build-arg=COMMIT_SHA=${TAG} --build-arg=IMAGE_VERSION="${DBT_VERSION}" . -t ${DBT_IMG}
	docker tag ${DBT_IMG} ${DBT_LATEST}
	docker tag ${DBT_IMG} ${DBT_NAME}:${DBT_VERSION_MM}
	docker tag ${DBT_IMG} ${DBT_NAME}:${DBT_VERSION}

dbt-push-image:
	docker push ${DBT_IMG}
	docker push ${DBT_LATEST}
	docker push ${DBT_NAME}:${DBT_VERSION_MM}
	docker push ${DBT_NAME}:${DBT_VERSION}

docker-login:
	docker login -u scalevector -p ${DOCKER_PASS}
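For reference, the two `python3 -c` one-liners that fill AUTV and AUTVMINMAJ above boil down to the following sketch; the version value is only an example and the variable names are illustrative, not part of the Makefile.

# illustrative Python equivalent of the AUTV / AUTVMINMAJ shell expansions above
__version__ = "0.1.0"                                # what `from dlt import __version__` returns in this commit
autv = __version__                                   # AUTV        -> "0.1.0"
autvminmaj = ".".join(__version__.split(".")[:-1])   # AUTVMINMAJ  -> "0.1"
print(autv, autvminmaj)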
1
compiled_packages.txt
Normal file
@@ -0,0 +1 @@
cffi\|idna\|simplejson\|pendulum\|grpcio\|google-crc32c
6
compiled_requirements.txt
Normal file
@@ -0,0 +1,6 @@
google-crc32c==1.3.0; python_version >= "3.6" and python_version < "3.11"
grpcio-status==1.43.0; python_version >= "3.6" and python_version < "3.11"
grpcio==1.43.0; python_version >= "3.6"
idna==3.3; python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "3.11" or python_full_version >= "3.6.0" and python_version >= "3.6" and python_version < "3.11"
pendulum==2.1.2; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
simplejson==3.17.6; (python_version >= "2.5" and python_full_version < "3.0.0") or (python_full_version >= "3.3.0")
54
deploy/dbt_runner/Dockerfile
Normal file
@@ -0,0 +1,54 @@
FROM python:3.8-slim-bullseye as base

# Metadata
LABEL org.label-schema.vendor="ScaleVector" \
      org.label-schema.url="https://scalevector.ai" \
      org.label-schema.name="dbt_runner" \
      org.label-schema.description="DBT Package Runner for DLT"

# prepare dirs to install autopoiesis
RUN mkdir -p /usr/src/app && mkdir /var/local/app && mkdir /usr/src/app/autopoiesis

WORKDIR /usr/src/app

# System setup for DBT
RUN apt-get update \
  && apt-get dist-upgrade -y \
  && apt-get install -y --no-install-recommends \
    git \
    ssh-client \
    software-properties-common \
    make \
    build-essential \
    ca-certificates \
    libpq-dev \
  && apt-get clean \
  && rm -rf \
    /var/lib/apt/lists/* \
    /tmp/* \
    /var/tmp/*

# Env vars
ENV PYTHONIOENCODING=utf-8
ENV LANG=C.UTF-8

# Update python
RUN python -m pip install --upgrade pip setuptools wheel --no-cache-dir

ENV PYTHONPATH $PYTHONPATH:/usr/src/app

ADD _gen_requirements_dbt.txt .
RUN pip3 install -r _gen_requirements_dbt.txt

COPY autopoiesis/common autopoiesis/common
COPY autopoiesis/dbt_runner autopoiesis/dbt_runner
COPY autopoiesis/*.py autopoiesis/

# add build labels and envs
ARG COMMIT_SHA=""
ARG IMAGE_VERSION=""
LABEL commit_sha=${COMMIT_SHA}
LABEL version=${IMAGE_VERSION}
ENV COMMIT_SHA=${COMMIT_SHA}
ENV IMAGE_VERSION=${IMAGE_VERSION}
43
deploy/dlt/Dockerfile
Normal file
@@ -0,0 +1,43 @@
# Python 3.8 required
FROM alpine:3.15

# Metadata
LABEL org.label-schema.vendor="ScaleVector" \
      org.label-schema.url="https://scalevector.ai" \
      org.label-schema.name="DLT" \
      org.label-schema.description="DLT is an open-source python-native scalable data loading framework that does not require any devops efforts to run."

# prepare temp dir used during the build
RUN mkdir -p /tmp/pydlt

WORKDIR /tmp/pydlt

# generated by make recreate-compiled-deps to install packages requiring a compiler
# recreate only when you have new deps requiring compilation - the step below is very slow
ADD compiled_requirements.txt .

# install alpine deps
RUN apk update &&\
    apk add --no-cache python3 ca-certificates curl postgresql &&\
    apk add --no-cache --virtual build-deps build-base automake autoconf libtool python3-dev postgresql-dev libffi-dev linux-headers gcc musl-dev &&\
    ln -s /usr/bin/python3 /usr/bin/python &&\
    curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py &&\
    rm get-pip.py &&\
    pip3 install --upgrade setuptools wheel &&\
    rm -r /usr/lib/python*/ensurepip &&\
    pip3 install -r compiled_requirements.txt &&\
    apk del --purge build-deps
#rm -r /root/.cache

# add build labels and envs
ARG COMMIT_SHA=""
ARG IMAGE_VERSION=""
LABEL commit_sha=${COMMIT_SHA}
LABEL version=${IMAGE_VERSION}
ENV COMMIT_SHA=${COMMIT_SHA}
ENV IMAGE_VERSION=${IMAGE_VERSION}

# install exactly the same version of the library we used to build
RUN pip3 install python-dlt==${IMAGE_VERSION}[gcp,redshift]

RUN rm -r /tmp/pydlt
@@ -0,0 +1 @@
from dlt._version import common_version as __version__
3
dlt/_version.py
Normal file
@@ -0,0 +1,3 @@
common_version = "0.1.0"
loader_version = "0.1.0"
unpacker_version = "0.1.0"
5
dlt/common/__init__.py
Normal file
@@ -0,0 +1,5 @@
from .pendulum import pendulum  # noqa: F401
from .json import json  # noqa: F401, I251
from .time import sleep  # noqa: F401
from .arithmetics import Decimal  # noqa: F401
from dlt._version import common_version as __version__
32
dlt/common/arithmetics.py
Normal file
@@ -0,0 +1,32 @@
import decimal
from contextlib import contextmanager
from typing import Iterator
from decimal import ROUND_HALF_UP, Decimal, DefaultContext, DivisionByZero, InvalidOperation, localcontext, Context, ConversionSyntax


DefaultContext.rounding = ROUND_HALF_UP
# use a lowercase "e" for the exponent
DefaultContext.capitals = 0
# prevent NaN from being returned
DefaultContext.traps[InvalidOperation] = True
# prevent Inf from being returned
DefaultContext.traps[DivisionByZero] = True
decimal.setcontext(DefaultContext)

DEFAULT_NUMERIC_PRECISION = 38
DEFAULT_NUMERIC_SCALE = 9

NUMERIC_DEFAULT_QUANTIZER = Decimal("1." + "0" * DEFAULT_NUMERIC_SCALE)


@contextmanager
def numeric_default_context() -> Iterator[Context]:
    with localcontext() as c:
        c.prec = DEFAULT_NUMERIC_PRECISION
        yield c


def numeric_default_quantize(v: Decimal) -> Decimal:
    if v == 0:
        return v
    return v.quantize(NUMERIC_DEFAULT_QUANTIZER)
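A minimal usage sketch of the two helpers above, assuming the module is importable as committed here; the division example is purely illustrative.

from decimal import Decimal
from dlt.common.arithmetics import numeric_default_context, numeric_default_quantize

with numeric_default_context():              # 38 significant digits, ROUND_HALF_UP
    v = Decimal("10") / Decimal("3")
print(numeric_default_quantize(v))           # quantized to 9 decimal places: 3.333333333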
11
dlt/common/configuration/__init__.py
Normal file
@@ -0,0 +1,11 @@
from .basic_configuration import BasicConfiguration  # noqa: F401
from .unpacking_volume_configuration import UnpackingVolumeConfiguration, ProductionUnpackingVolumeConfiguration  # noqa: F401
from .loading_volume_configuration import LoadingVolumeConfiguration, ProductionLoadingVolumeConfiguration  # noqa: F401
from .schema_volume_configuration import SchemaVolumeConfiguration, ProductionSchemaVolumeConfiguration  # noqa: F401
from .pool_runner_configuration import PoolRunnerConfiguration, TPoolType  # noqa: F401
from .gcp_client_configuration import GcpClientConfiguration, GcpClientProductionConfiguration  # noqa: F401
from .postgres_configuration import PostgresConfiguration, PostgresProductionConfiguration  # noqa: F401
from .utils import make_configuration, TConfigSecret, open_configuration_file  # noqa: F401

from .exceptions import (  # noqa: F401
    ConfigEntryMissingException, ConfigEnvValueCannotBeCoercedException, ConfigIntegrityException, ConfigFileNotFoundException)
21
dlt/common/configuration/basic_configuration.py
Normal file
@@ -0,0 +1,21 @@
from typing import Optional, Tuple

DEVELOPMENT_CONFIG_FILES_STORAGE_PATH = "_storage/config/%s"
PRODUCTION_CONFIG_FILES_STORAGE_PATH = "/run/config/%s"


class BasicConfiguration:
    NAME: str = None  # the name of the component, must be supplied
    SENTRY_DSN: Optional[str] = None  # keep None to disable Sentry
    PROMETHEUS_PORT: Optional[int] = None  # keep None to disable Prometheus
    LOG_FORMAT: str = '{asctime}|[{levelname:<21}]|{process}|{name}|{filename}|{funcName}:{lineno}|{message}'
    LOG_LEVEL: str = "DEBUG"
    IS_DEVELOPMENT_CONFIG: bool = True
    REQUEST_TIMEOUT: Tuple[int, int] = (15, 300)  # default request timeout for all http clients
    CONFIG_FILES_STORAGE_PATH: str = DEVELOPMENT_CONFIG_FILES_STORAGE_PATH

    @classmethod
    def check_integrity(cls) -> None:
        # if CONFIG_FILES_STORAGE_PATH not overwritten and we are in production mode
        if cls.CONFIG_FILES_STORAGE_PATH == DEVELOPMENT_CONFIG_FILES_STORAGE_PATH and not cls.IS_DEVELOPMENT_CONFIG:
            # set to mount where config files will be present
            cls.CONFIG_FILES_STORAGE_PATH = PRODUCTION_CONFIG_FILES_STORAGE_PATH
43
dlt/common/configuration/exceptions.py
Normal file
@@ -0,0 +1,43 @@
from typing import Iterable, Union

from dlt.common.exceptions import DltException


class ConfigurationException(DltException):
    def __init__(self, msg: str) -> None:
        super().__init__(msg)


class ConfigEntryMissingException(ConfigurationException):
    """thrown when not all required config elements are present"""

    def __init__(self, missing_set: Iterable[str]) -> None:
        self.missing_set = missing_set
        super().__init__('Missing config keys: ' + str(missing_set))


class ConfigEnvValueCannotBeCoercedException(ConfigurationException):
    """thrown when a value from ENV cannot be coerced to the hinted type"""

    def __init__(self, attr_name: str, env_value: str, hint: type) -> None:
        self.attr_name = attr_name
        self.env_value = env_value
        self.hint = hint
        super().__init__('env value %s cannot be coerced into type %s in attr %s' % (env_value, str(hint), attr_name))


class ConfigIntegrityException(ConfigurationException):
    """thrown when a configuration value fails the check_integrity validation"""

    def __init__(self, attr_name: str, env_value: str, info: Union[type, str]) -> None:
        self.attr_name = attr_name
        self.env_value = env_value
        self.info = info
        super().__init__('integrity error for attr %s with value %s. %s.' % (attr_name, env_value, info))


class ConfigFileNotFoundException(ConfigurationException):
    """thrown when a configuration file cannot be found in the config folder"""

    def __init__(self, path: str) -> None:
        super().__init__(f"Missing config file in {path}")
34
dlt/common/configuration/gcp_client_configuration.py
Normal file
@@ -0,0 +1,34 @@
from dlt.common.typing import StrStr
from dlt.common.configuration.utils import TConfigSecret


class GcpClientConfiguration:
    PROJECT_ID: str = None
    DATASET: str = None
    TIMEOUT: float = 30.0
    BQ_CRED_TYPE: str = "service_account"
    BQ_CRED_PRIVATE_KEY: TConfigSecret = None
    BQ_CRED_TOKEN_URI: str = "https://oauth2.googleapis.com/token"
    BQ_CRED_CLIENT_EMAIL: str = None

    @classmethod
    def check_integrity(cls) -> None:
        if cls.BQ_CRED_PRIVATE_KEY and cls.BQ_CRED_PRIVATE_KEY[-1] != "\n":
            # must end with new line, otherwise won't be parsed by Crypto
            cls.BQ_CRED_PRIVATE_KEY = TConfigSecret(cls.BQ_CRED_PRIVATE_KEY + "\n")

    @classmethod
    def to_service_credentials(cls) -> StrStr:
        return {
            "type": cls.BQ_CRED_TYPE,
            "project_id": cls.PROJECT_ID,
            "private_key": cls.BQ_CRED_PRIVATE_KEY,
            "token_uri": cls.BQ_CRED_TOKEN_URI,
            "client_email": cls.BQ_CRED_CLIENT_EMAIL
        }


class GcpClientProductionConfiguration(GcpClientConfiguration):
    PROJECT_ID: str = None
    DATASET: str = None
    BQ_CRED_PRIVATE_KEY: TConfigSecret = None
    BQ_CRED_CLIENT_EMAIL: str = None
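A hedged usage sketch of the class above: every value is a placeholder, and in practice the attributes would be injected by make_configuration from the environment or secret files rather than assigned by hand.

from dlt.common.configuration import GcpClientConfiguration

GcpClientConfiguration.PROJECT_ID = "my-project"                                            # placeholder
GcpClientConfiguration.DATASET = "my_dataset"                                               # placeholder
GcpClientConfiguration.BQ_CRED_CLIENT_EMAIL = "loader@my-project.iam.gserviceaccount.com"   # placeholder
GcpClientConfiguration.BQ_CRED_PRIVATE_KEY = "-----BEGIN PRIVATE KEY-----\n..."             # truncated placeholder
GcpClientConfiguration.check_integrity()      # appends the trailing newline to the key if it is missing
print(GcpClientConfiguration.to_service_credentials())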
6
dlt/common/configuration/loading_volume_configuration.py
Normal file
@@ -0,0 +1,6 @@
class LoadingVolumeConfiguration:
    LOADING_VOLUME_PATH: str = "_storage/loading"  # path to volume where files to be loaded to analytical storage are stored
    DELETE_COMPLETED_JOBS: bool = False  # if set to true the folder with completed jobs will be deleted


class ProductionLoadingVolumeConfiguration(LoadingVolumeConfiguration):
    LOADING_VOLUME_PATH: str = None
13
dlt/common/configuration/pool_runner_configuration.py
Normal file
@@ -0,0 +1,13 @@
from typing import Literal, Optional
from dlt.common.configuration import BasicConfiguration

TPoolType = Literal["process", "thread", "none"]


class PoolRunnerConfiguration(BasicConfiguration):
    MAX_PARALLELISM: Optional[int] = None  # how many threads/processes in the pool
    EXIT_ON_EXCEPTION: bool = False  # should exit on exception
    STOP_AFTER_RUNS: int = 10000  # will stop runner with exit code -2 after so many runs, that prevents memory fragmentation
    POOL_TYPE: TPoolType = None  # type of pool to run, must be set in derived configs
    RUN_SLEEP: float = 0.5  # how long to sleep between runs with workload, seconds
    RUN_SLEEP_IDLE: float = 1.0  # how long to sleep when no more items are pending, seconds
    RUN_SLEEP_WHEN_FAILED: float = 1.0  # how long to sleep between the runs when failed
25
dlt/common/configuration/postgres_configuration.py
Normal file
@@ -0,0 +1,25 @@
from dlt.common.configuration.utils import TConfigSecret


class PostgresConfiguration:
    PG_DATABASE_NAME: str = None
    PG_SCHEMA_PREFIX: str = None
    PG_PASSWORD: TConfigSecret = None
    PG_USER: str = None
    PG_HOST: str = None
    PG_PORT: int = 5439
    PG_CONNECTION_TIMEOUT: int = 15

    @classmethod
    def check_integrity(cls) -> None:
        cls.PG_DATABASE_NAME = cls.PG_DATABASE_NAME.lower()
        cls.PG_SCHEMA_PREFIX = cls.PG_SCHEMA_PREFIX.lower()
        cls.PG_PASSWORD = TConfigSecret(cls.PG_PASSWORD.strip())


class PostgresProductionConfiguration(PostgresConfiguration):
    PG_DATABASE_NAME: str = None
    PG_SCHEMA_PREFIX: str = None
    PG_PASSWORD: TConfigSecret = None
    PG_USER: str = None
    PG_HOST: str = None
6
dlt/common/configuration/schema_volume_configuration.py
Normal file
@@ -0,0 +1,6 @@
class SchemaVolumeConfiguration:
    SCHEMA_VOLUME_PATH: str = "_storage/schemas"  # path to volume with default schemas


class ProductionSchemaVolumeConfiguration:
    SCHEMA_VOLUME_PATH: str = None
@@ -0,0 +1,6 @@
class UnpackingVolumeConfiguration:
    UNPACKING_VOLUME_PATH: str = "_storage/unpacking"  # path to volume where unpacking will happen


class ProductionUnpackingVolumeConfiguration:
    UNPACKING_VOLUME_PATH: str = None
214
dlt/common/configuration/utils.py
Normal file
@@ -0,0 +1,214 @@
import sys
import semver
from os import environ
from os.path import isdir, isfile
from typing import Any, Dict, List, Mapping, NewType, Optional, Type, TypeVar, Union, Literal, IO, cast

from dlt.common.typing import StrAny
from dlt.common.configuration import BasicConfiguration
from dlt.common.configuration.exceptions import (ConfigEntryMissingException,
                                                 ConfigEnvValueCannotBeCoercedException, ConfigFileNotFoundException)
from dlt.common.utils import uniq_id

SIMPLE_TYPES: List[Any] = [int, bool, list, dict, tuple, bytes, set, float]
# those types and Optionals of those types should not be passed to the eval function
NON_EVAL_TYPES = [str, None, Any]
# allowed coercions as (to type, from type) pairs
ALLOWED_TYPE_COERCIONS = [(float, int), (str, int), (str, float)]
IS_DEVELOPMENT_CONFIG_KEY: str = "IS_DEVELOPMENT_CONFIG"
CHECK_INTEGRITY_F: str = "check_integrity"
SECRET_STORAGE_PATH: str = "/run/secrets/%s"

TConfiguration = TypeVar("TConfiguration", bound=Type[BasicConfiguration])
TProductionConfiguration = TypeVar("TProductionConfiguration", bound=Type[BasicConfiguration])
TConfigSecret = NewType("TConfigSecret", str)


def make_configuration(config: TConfiguration,
                       production_config: TProductionConfiguration,
                       initial_values: StrAny = None,
                       accept_partial: bool = False,
                       skip_subclass_check: bool = False) -> TConfiguration:
    if not skip_subclass_check:
        assert issubclass(production_config, config)

    final_config: TConfiguration = config if _is_development_config() else production_config
    possible_keys_in_config = _get_config_attrs_with_hints(final_config)
    # create a dynamic class type so the original config variables are not touched
    derived_config: TConfiguration = cast(TConfiguration,
                                          type(final_config.__name__ + "_" + uniq_id(), (final_config, ), {})
                                          )
    # apply initial values while preserving hints
    if initial_values:
        for k, v in initial_values.items():
            setattr(derived_config, k, v)

    _apply_environ_to_config(derived_config, possible_keys_in_config)
    try:
        _is_config_bounded(derived_config, possible_keys_in_config)
        _check_configuration_integrity(derived_config)
    except ConfigEntryMissingException:
        if not accept_partial:
            raise
    _add_module_version(derived_config)

    return derived_config


def has_configuration_file(name: str, config: TConfiguration) -> bool:
    return isfile(get_configuration_file_path(name, config))


def open_configuration_file(name: str, mode: str, config: TConfiguration) -> IO[Any]:
    path = get_configuration_file_path(name, config)
    if not has_configuration_file(name, config):
        raise ConfigFileNotFoundException(path)
    return open(path, mode)


def get_configuration_file_path(name: str, config: TConfiguration) -> str:
    return config.CONFIG_FILES_STORAGE_PATH % name


def is_direct_descendant(child: Type[Any], base: Type[Any]) -> bool:
    # TODO: there may be a faster way to get the direct descendant than mro
    # note: at index zero there's the child itself
    return base == type.mro(child)[1]


def _is_development_config() -> bool:
    is_dev_config = True

    # get from environment
    if IS_DEVELOPMENT_CONFIG_KEY in environ:
        is_dev_config = _coerce_single_value(IS_DEVELOPMENT_CONFIG_KEY, environ[IS_DEVELOPMENT_CONFIG_KEY], bool)
    return is_dev_config


def _add_module_version(config: TConfiguration) -> None:
    try:
        v = sys._getframe(1).f_back.f_globals["__version__"]
        semver.VersionInfo.parse(v)
        setattr(config, "_VERSION", v)  # noqa: B010
    except KeyError:
        pass


def _apply_environ_to_config(config: TConfiguration, keys_in_config: Mapping[str, type]) -> None:
    for key, hint in keys_in_config.items():
        value = _get_key_value(key, hint)
        if value is not None:
            value_from_environment_variable = _coerce_single_value(key, value, hint)
            # set value
            setattr(config, key, value_from_environment_variable)


def _get_key_value(key: str, hint: Type[Any]) -> Optional[str]:
    if hint is TConfigSecret:
        # try secret storage
        try:
            # must conform to RFC1123
            secret_name = key.lower().replace("_", "-")
            secret_path = SECRET_STORAGE_PATH % secret_name
            # kubernetes stores secrets as files in a dir, docker compose plainly
            if isdir(secret_path):
                secret_path += "/" + secret_name
            with open(secret_path, "r") as f:
                secret = f.read()
            # add the secret to environ so forks have access
            # TODO: removing new lines is not always good: OK for passwords, not for PEM keys;
            # TODO: for regular secrets that is dealt with in the particular configuration logic
            environ[key] = secret.strip()
            # do not strip the returned secret
            return secret

        except FileNotFoundError:
            pass
    return environ.get(key, None)


def _is_config_bounded(config: TConfiguration, keys_in_config: Mapping[str, type]) -> None:
    _unbound_attrs = [
        key for key in keys_in_config if getattr(config, key) is None and not _is_optional_type(keys_in_config[key])
    ]

    if len(_unbound_attrs) > 0:
        raise ConfigEntryMissingException(_unbound_attrs)


def _check_configuration_integrity(config: TConfiguration) -> None:
    # python multi-inheritance is cooperative and this would require that all configurations cooperatively
    # call each other's check_integrity. this is not at all possible as we do not know which configs in the end will
    # be mixed together.

    # get base classes in order of derivation
    mro = type.mro(config)
    for c in mro:
        # check if this class implements check_integrity (skip pure inheritance to not do double work)
        if CHECK_INTEGRITY_F in c.__dict__ and callable(getattr(c, CHECK_INTEGRITY_F)):
            # access the unbound __func__ to pass the right class type so we check settings of the tip of the mro
            c.__dict__[CHECK_INTEGRITY_F].__func__(config)


def _coerce_single_value(key: str, value: str, hint: Type[Any]) -> Any:
    try:
        hint_primitive_type = _extract_simple_type(hint)
        if hint_primitive_type not in NON_EVAL_TYPES:
            # create primitive types out of strings
            typed_value = eval(value)  # nosec
            # for primitive types check coercion
            if hint_primitive_type in SIMPLE_TYPES and type(typed_value) != hint_primitive_type:
                # allow some exceptions
                coerce_exception = next(
                    (e for e in ALLOWED_TYPE_COERCIONS if e == (hint_primitive_type, type(typed_value))), None)
                if coerce_exception:
                    return hint_primitive_type(typed_value)
                else:
                    raise ConfigEnvValueCannotBeCoercedException(key, typed_value, hint)
            return typed_value
        else:
            return value
    except ConfigEnvValueCannotBeCoercedException:
        raise
    except Exception as exc:
        raise ConfigEnvValueCannotBeCoercedException(key, value, hint) from exc


def _extract_simple_type(hint: Type[Any]) -> Type[Any]:
    # extract optional type and call recursively
    if _is_literal_type(hint):
        # assume that all literals are of the same type
        return _extract_simple_type(type(hint.__args__[0]))
    if _is_optional_type(hint):
        # todo: use `get_args` in python 3.8
        return _extract_simple_type(hint.__args__[0])
    if not hasattr(hint, "__supertype__"):
        return hint
    # descend into supertypes of NewType
    return _extract_simple_type(hint.__supertype__)


def _get_config_attrs_with_hints(config: TConfiguration) -> Dict[str, type]:
    keys: Dict[str, type] = {}
    mro = type.mro(config)
    for cls in reversed(mro):
        # update in reverse derivation order so derived classes overwrite hints from base classes
        if cls is not object:
            keys.update(
                [(attr, cls.__annotations__.get(attr, None))
                 # if hasattr(config, '__annotations__') and attr in config.__annotations__ else None)
                 for attr in cls.__dict__.keys() if not callable(getattr(cls, attr)) and not attr.startswith("__")
                 ])
    return keys


def _is_optional_type(hint: Type[Any]) -> bool:
    # todo: use typing get_args and get_origin in python 3.8
    if hasattr(hint, "__origin__"):
        return hint.__origin__ is Union and type(None) in hint.__args__
    return False


def _is_literal_type(hint: Type[Any]) -> bool:
    return hasattr(hint, "__origin__") and hint.__origin__ is Literal
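A minimal sketch of how make_configuration is meant to be called; the RunnerConfiguration / ProductionRunnerConfiguration pair is hypothetical and not part of this commit, it only illustrates the derived-class and environment-override mechanics implemented above.

import os
from dlt.common.configuration import PoolRunnerConfiguration, make_configuration

class RunnerConfiguration(PoolRunnerConfiguration):         # hypothetical example config
    NAME: str = "my_runner"
    POOL_TYPE = "thread"

class ProductionRunnerConfiguration(RunnerConfiguration):   # hypothetical production variant
    pass

os.environ["MAX_PARALLELISM"] = "4"    # picked up by _apply_environ_to_config and eval'ed to int
CONFIG = make_configuration(RunnerConfiguration, ProductionRunnerConfiguration)
print(CONFIG.MAX_PARALLELISM, CONFIG.POOL_TYPE)             # -> 4 thread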
58
dlt/common/dataset_writers.py
Normal file
@@ -0,0 +1,58 @@
import jsonlines
from typing import Any, Iterable, Literal, Sequence, IO

from dlt.common import json
from dlt.common.typing import StrAny

TWriterType = Literal["jsonl", "insert_values"]


def write_jsonl(f: IO[Any], rows: Sequence[Any]) -> None:
    # use jsonl to write load files https://jsonlines.org/
    with jsonlines.Writer(f, dumps=json.dumps) as w:
        w.write_all(rows)


def write_insert_values(f: IO[Any], rows: Sequence[StrAny], headers: Iterable[str]) -> None:
    # dict lookup is always faster
    headers_lookup = {v: i for i, v in enumerate(headers)}
    # do not write INSERT INTO command, this must be added together with table name by the loader
    f.write("INSERT INTO {}(")
    f.write(",".join(map(escape_redshift_identifier, headers)))
    f.write(")\nVALUES\n")

    def stringify(v: Any) -> str:
        if type(v) is bytes:
            return f"from_hex('{v.hex()}')"
        else:
            return str(v)

    def write_row(row: StrAny) -> None:
        output = ["NULL" for _ in range(len(headers_lookup))]
        for n, v in row.items():
            output[headers_lookup[n]] = escape_redshift_literal(v) if type(v) is str else stringify(v)
        f.write("(")
        f.write(",".join(output))
        f.write(")")

    for row in rows[:-1]:
        write_row(row)
        f.write(",\n")

    write_row(rows[-1])
    f.write(";")


def escape_redshift_literal(v: str) -> str:
    # https://www.postgresql.org/docs/9.3/sql-syntax-lexical.html
    # looks like this is the only thing we need to escape for Postgres > 9.1
    # redshift keeps \ as escape character which is pre 9 behavior
    return "'" + v.replace("'", "''").replace("\\", "\\\\") + "'"


def escape_redshift_identifier(v: str) -> str:
    return '"' + v.replace('"', '""').replace("\\", "\\\\") + '"'


def escape_bigquery_identifier(v: str) -> str:
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical
    return "`" + v.replace("\\", "\\\\").replace("`", "\\`") + "`"
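A quick sketch of the two writers above against an in-memory buffer; the rows and the output file name are made up for illustration.

import io
from dlt.common.dataset_writers import write_insert_values, write_jsonl

rows = [{"id": 1, "name": "alpha"}, {"id": 2, "name": "o'hara"}]

buf = io.StringIO()
write_insert_values(buf, rows, ["id", "name"])
print(buf.getvalue())
# INSERT INTO {}("id","name")
# VALUES
# (1,'alpha'),
# (2,'o''hara');

with open("rows.jsonl", "w") as f:     # file name is just an example
    write_jsonl(f, rows)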
58
dlt/common/exceptions.py
Normal file
@@ -0,0 +1,58 @@
class DltException(Exception):
    pass


class SignalReceivedException(DltException):
    def __init__(self, signal_code: int) -> None:
        self.signal_code = signal_code
        super().__init__(f"Signal {signal_code} received")


class PoolException(DltException):
    """
    Thrown by the worker pool to pass information when an exception is raised while processing an item
    """
    def __init__(self, pool_name: str = None, item: str = None, internal_exception: Exception = None) -> None:
        # we need it to make it pickle compatible
        if pool_name:
            self.pool_name = pool_name
            self.item = item
            self.internal_exception = internal_exception
            super().__init__(f"Pool {pool_name} raised on item {item} with {str(internal_exception)}")


class UnsupportedProcessStartMethodException(DltException):
    def __init__(self, method: str) -> None:
        self.method = method
        super().__init__(f"Process pool supports only fork start method, {method} not supported. Switch the pool type to threading")


class TerminalException(Exception):
    """
    Marks an exception that cannot be recovered from, should be mixed in into concrete exception class
    """
    pass


class TransientException(Exception):
    """
    Marks an exception in operation that can be retried, should be mixed in into concrete exception class
    """
    pass


class TerminalValueError(ValueError, TerminalException):
    """
    ValueError that is unrecoverable
    """
    pass


class TimeRangeExhaustedException(DltException):
    """
    Raised when backfilling is complete and no more time ranges can be generated
    """
    def __init__(self, start_ts: float, end_ts: float) -> None:
        self.start_ts = start_ts
        self.end_ts = end_ts
        super().__init__(f"Timerange ({start_ts} to {end_ts}) exhausted")
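A small sketch of how the mix-ins above are intended to be combined with concrete errors, per their docstrings; both exception class names are hypothetical, not part of this commit.

from dlt.common.exceptions import DltException, TerminalException, TransientException

class SchemaCorruptedException(DltException, TerminalException):     # unrecoverable, hypothetical name
    pass

class LoadClientTimeoutException(DltException, TransientException):  # retryable, hypothetical name
    pass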
135
dlt/common/file_storage.py
Normal file
@@ -0,0 +1,135 @@
import os
import tempfile
import shutil
from pathlib import Path
from typing import IO, Any, List


class FileStorage:
    def __init__(self,
                 storage_path: str,
                 file_type: str = "t",
                 makedirs: bool = False) -> None:
        # make it an absolute path
        self.storage_path = os.path.join(os.path.realpath(storage_path), '')
        self.file_type = file_type
        if makedirs:
            os.makedirs(storage_path, exist_ok=True)

    @classmethod
    def from_file(cls, file_path: str, file_type: str = "t") -> "FileStorage":
        return cls(os.path.dirname(file_path), file_type)

    def save(self, relative_path: str, data: Any) -> str:
        return self.save_atomic(self.storage_path, relative_path, data, file_type=self.file_type)

    @staticmethod
    def save_atomic(storage_path: str, relative_path: str, data: Any, file_type: str = "t") -> str:
        with tempfile.NamedTemporaryFile(dir=storage_path, mode="w" + file_type, delete=False) as f:
            tmp_path = f.name
            f.write(data)
        try:
            dest_path = os.path.join(storage_path, relative_path)
            os.rename(tmp_path, dest_path)
            return dest_path
        except Exception:
            if os.path.isfile(tmp_path):
                os.remove(tmp_path)
            raise

    def load(self, relative_path: str) -> Any:
        # raises on file not existing
        with self.open(relative_path) as text_file:
            return text_file.read()

    def delete(self, relative_path: str) -> None:
        file_path = self._make_path(relative_path)
        if os.path.isfile(file_path):
            os.remove(file_path)
        else:
            raise FileNotFoundError(file_path)

    def delete_folder(self, relative_path: str, recursively: bool = False) -> None:
        folder_path = self._make_path(relative_path)
        if os.path.isdir(folder_path):
            if recursively:
                shutil.rmtree(folder_path)
            else:
                os.rmdir(folder_path)
        else:
            raise NotADirectoryError(folder_path)

    def open(self, relative_path: str, mode: str = "r") -> IO[Any]:
        return open(self._make_path(relative_path), mode + self.file_type)

    def open_temp(self, delete: bool = False, mode: str = "w", file_type: str = None) -> IO[Any]:
        ft = file_type or self.file_type
        return tempfile.NamedTemporaryFile(dir=self.storage_path, mode=mode + ft, delete=delete)

    def has_file(self, relative_path: str) -> bool:
        return os.path.isfile(self._make_path(relative_path))

    def has_folder(self, relative_path: str) -> bool:
        return os.path.isdir(self._make_path(relative_path))

    def list_folder_files(self, relative_path: str, to_root: bool = True) -> List[str]:
        scan_path = self._make_path(relative_path)
        if to_root:
            # list files in relative path, returning paths relative to storage root
            return [os.path.join(relative_path, e.name) for e in os.scandir(scan_path) if e.is_file()]
        else:
            # or relative to the scanned folder
            return [e.name for e in os.scandir(scan_path) if e.is_file()]

    def list_folder_dirs(self, relative_path: str, to_root: bool = True) -> List[str]:
        # list content of relative path, returning paths relative to storage root
        scan_path = self._make_path(relative_path)
        if to_root:
            # list folders in relative path, returning paths relative to storage root
            return [os.path.join(relative_path, e.name) for e in os.scandir(scan_path) if e.is_dir()]
        else:
            # or relative to the scanned folder
            return [e.name for e in os.scandir(scan_path) if e.is_dir()]

    def create_folder(self, relative_path: str, exists_ok: bool = False) -> None:
        os.makedirs(self._make_path(relative_path), exist_ok=exists_ok)

    def copy_cross_storage_atomically(self, dest_volume_root: str, dest_relative_path: str, source_path: str, dest_name: str) -> None:
        external_tmp_file = tempfile.mktemp(dir=dest_volume_root)
        # first copy to a temp file
        shutil.copy(self._make_path(source_path), external_tmp_file)
        # then rename to the dest name
        external_dest = os.path.join(dest_volume_root, dest_relative_path, dest_name)
        try:
            os.rename(external_tmp_file, external_dest)
        except Exception:
            if os.path.isfile(external_tmp_file):
                os.remove(external_tmp_file)
            raise

    def atomic_rename(self, from_relative_path: str, to_relative_path: str) -> None:
        os.rename(
            self._make_path(from_relative_path),
            self._make_path(to_relative_path)
        )

    def in_storage(self, path: str) -> bool:
        file = os.path.realpath(path)
        # return true if the common prefix of both is equal to the storage directory
        # e.g. for /a/b/c/d.rst and directory /a/b, the common prefix is /a/b
        return os.path.commonprefix([file, self.storage_path]) == self.storage_path

    def to_relative_path(self, path: str) -> str:
        if not self.in_storage(path):
            raise ValueError(path)
        return os.path.relpath(path, start=self.storage_path)

    def get_file_stem(self, path: str) -> str:
        return Path(os.path.basename(path)).stem

    def get_file_name(self, path: str) -> str:
        return Path(path).name

    def _make_path(self, relative_path: str) -> str:
        return os.path.join(self.storage_path, relative_path)
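A brief usage sketch of FileStorage against a local folder; the folder and file names are examples only.

from dlt.common.file_storage import FileStorage

storage = FileStorage("_storage/demo", makedirs=True)    # example folder
storage.save("hello.txt", "hello world")                 # atomic write: temp file + rename
print(storage.load("hello.txt"))                         # -> hello world
print(storage.list_folder_files("."))                    # paths relative to the storage root
storage.delete("hello.txt")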
46
dlt/common/json.py
Normal file
@@ -0,0 +1,46 @@
import base64
from datetime import date, datetime  # noqa: I251
from functools import partial
from typing import Any, Union
from uuid import UUID
from hexbytes import HexBytes
import simplejson
from simplejson.raw_json import RawJSON

from dlt.common.arithmetics import Decimal

# simplejson._toggle_speedups(False)


def custom_encode(obj: Any) -> Union[RawJSON, str]:
    if isinstance(obj, Decimal):
        # always return decimals as string (not RawJSON) so they are not deserialized back to float
        return str(obj.normalize())
    # this works both for standard datetime and pendulum
    elif isinstance(obj, datetime):
        # See "Date Time String Format" in the ECMA-262 specification.
        r = obj.isoformat()
        # leave microseconds alone
        # if obj.microsecond:
        #     r = r[:23] + r[26:]
        if r.endswith('+00:00'):
            r = r[:-6] + 'Z'
        return r
    elif isinstance(obj, date):
        return obj.isoformat()
    elif isinstance(obj, UUID):
        return str(obj)
    elif isinstance(obj, HexBytes):
        return obj.hex()
    elif isinstance(obj, bytes):
        return base64.b64encode(obj).decode('ascii')
    raise TypeError(repr(obj) + " is not JSON serializable")


simplejson.loads = partial(simplejson.loads, use_decimal=False)
simplejson.load = partial(simplejson.load, use_decimal=False)
# prevent default decimal serializer (use_decimal=False) and binary serializer (encoding=None)
simplejson.dumps = partial(simplejson.dumps, use_decimal=False, default=custom_encode, encoding=None)
simplejson.dump = partial(simplejson.dump, use_decimal=False, default=custom_encode, encoding=None)

# provide drop-in replacement
json = simplejson
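A short sketch of what the patched module produces for the custom-encoded types above; the document values are examples.

from datetime import datetime, timezone
from decimal import Decimal
from dlt.common.json import json

doc = {"amount": Decimal("1.23"), "ts": datetime(2022, 1, 1, tzinfo=timezone.utc), "blob": b"\x01\x02"}
print(json.dumps(doc))
# {"amount": "1.23", "ts": "2022-01-01T00:00:00Z", "blob": "AQI="}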
207
dlt/common/logger.py
Normal file
@@ -0,0 +1,207 @@
|
||||
import logging
|
||||
import json_logging
|
||||
import traceback
|
||||
import sentry_sdk
|
||||
from sentry_sdk.transport import HttpTransport
|
||||
from logging import LogRecord, Logger
|
||||
from typing import Any, Callable, Dict, Type
|
||||
|
||||
from dlt.common.json import json
|
||||
from dlt.common.typing import DictStrAny, DictStrStr, StrStr
|
||||
from dlt.common.configuration import BasicConfiguration
|
||||
from dlt.common.utils import filter_env_vars
|
||||
from dlt._version import common_version as __version__
|
||||
|
||||
DLT_LOGGER_NAME = "sv-dlt"
|
||||
LOGGER: Logger = None
|
||||
|
||||
def _add_logging_level(level_name: str, level: int, method_name:str = None) -> None:
|
||||
"""
|
||||
Comprehensively adds a new logging level to the `logging` module and the
|
||||
currently configured logging class.
|
||||
|
||||
`levelName` becomes an attribute of the `logging` module with the value
|
||||
`levelNum`. `methodName` becomes a convenience method for both `logging`
|
||||
itself and the class returned by `logging.getLoggerClass()` (usually just
|
||||
`logging.Logger`). If `methodName` is not specified, `levelName.lower()` is
|
||||
used.
|
||||
|
||||
To avoid accidental clobberings of existing attributes, this method will
|
||||
raise an `AttributeError` if the level name is already an attribute of the
|
||||
`logging` module or if the method name is already present
|
||||
|
||||
"""
|
||||
if not method_name:
|
||||
method_name = level_name.lower()
|
||||
|
||||
if hasattr(logging, level_name):
|
||||
raise AttributeError('{} already defined in logging module'.format(level_name))
|
||||
if hasattr(logging, method_name):
|
||||
raise AttributeError('{} already defined in logging module'.format(method_name))
|
||||
if hasattr(logging.getLoggerClass(), method_name):
|
||||
raise AttributeError('{} already defined in logger class'.format(method_name))
|
||||
|
||||
# This method was inspired by the answers to Stack Overflow post
|
||||
# http://stackoverflow.com/q/2183233/2988730, especially
|
||||
# http://stackoverflow.com/a/13638084/2988730
|
||||
def logForLevel(self: logging.Logger, message: str, *args: Any, **kwargs: Any) -> None:
|
||||
if self.isEnabledFor(level):
|
||||
self._log(level, message, args, **kwargs)
|
||||
def logToRoot(message: str, *args: Any, **kwargs: Any) -> None:
|
||||
logging.root._log(level, message, args, **kwargs)
|
||||
|
||||
logging.addLevelName(level, level_name)
|
||||
setattr(logging, level_name, level)
|
||||
setattr(logging.getLoggerClass(), method_name, logForLevel)
|
||||
setattr(logging, method_name, logToRoot)
|
||||
|
||||
|
||||
class _MetricsFormatter(logging.Formatter):
|
||||
def format(self, record: LogRecord) -> str:
|
||||
s = super(_MetricsFormatter, self).format(record)
|
||||
if record.exc_text:
|
||||
s = s + '|'
|
||||
# dump metrics dictionary nicely
|
||||
if "metrics" in record.__dict__:
|
||||
s = s + ": " + json.dumps(record.__dict__["metrics"])
|
||||
return s
|
||||
|
||||
|
||||
class _CustomJsonFormatter(json_logging.JSONLogFormatter):
|
||||
|
||||
version: StrStr = None
|
||||
|
||||
def _format_log_object(self, record: LogRecord, request_util: Any) -> Any:
|
||||
json_log_object = super(_CustomJsonFormatter, self)._format_log_object(record, request_util)
|
||||
if self.version:
|
||||
json_log_object.update({"version": self.version})
|
||||
return json_log_object
|
||||
|
||||
|
||||
def _init_logging(logger_name: str, level: str, format: str, component: str, version: StrStr) -> Logger:
|
||||
if logger_name == "root":
|
||||
logging.basicConfig(level=level)
|
||||
handler = logging.getLogger().handlers[0]
|
||||
# handler.setFormatter(_MetricsFormatter(fmt=format, style='{'))
|
||||
logger = logging.getLogger()
|
||||
else:
|
||||
logger = logging.getLogger(DLT_LOGGER_NAME)
|
||||
logger.propagate = False
|
||||
logger.setLevel(level)
|
||||
handler = logging.StreamHandler()
|
||||
# handler.setFormatter(_MetricsFormatter(fmt=format, style='{'))
|
||||
logger.addHandler(handler)
|
||||
|
||||
# set right formatter
|
||||
if is_json_logging(format):
|
||||
json_logging.COMPONENT_NAME = component
|
||||
json_logging.JSON_SERIALIZER = json.dumps
|
||||
json_logging.RECORD_ATTR_SKIP_LIST.remove("process")
|
||||
# set version as class variable as we cannot pass custom constructor parameters
|
||||
_CustomJsonFormatter.version = version
|
||||
# the only thing the method above effectively does is to replace the formatter
|
||||
json_logging.init_non_web(enable_json=True, custom_formatter=_CustomJsonFormatter)
|
||||
if logger_name == "root":
|
||||
json_logging.config_root_logger()
|
||||
else:
|
||||
handler.setFormatter(_MetricsFormatter(fmt=format, style='{'))
|
||||
|
||||
return logger
|
||||
|
||||
|
||||
def __getattr__(name: str) -> Callable[..., Any]:
|
||||
# a catch-all module __getattr__ that forwards calls to unknown functions to LOGGER
|
||||
def wrapper(msg: str, *args: Any, **kwargs: Any) -> None:
|
||||
if LOGGER:
|
||||
getattr(LOGGER, name)(msg, *args, **kwargs, stacklevel=2)
|
||||
return wrapper
|
||||
|
||||
|
||||
def _extract_version_info(config: Type[BasicConfiguration]) -> StrStr:
|
||||
version_info = {"version": __version__, "component_name": config.NAME}
|
||||
version = getattr(config, "_VERSION", None)
|
||||
if version:
|
||||
version_info["component_version"] = version
|
||||
# extract envs with build info
|
||||
version_info.update(filter_env_vars(["COMMIT_SHA", "IMAGE_VERSION"]))
|
||||
return version_info
|
||||
|
||||
|
||||
def _extract_pod_info() -> StrStr:
|
||||
return filter_env_vars(["KUBE_NODE_NAME", "KUBE_POD_NAME", "KUBE_POD_NAMESPACE"])
|
||||
|
||||
class _SentryHttpTransport(HttpTransport):
|
||||
|
||||
timeout: int = 0
|
||||
|
||||
def _get_pool_options(self, *a: Any, **kw: Any) -> DictStrAny:
|
||||
rv = HttpTransport._get_pool_options(self, *a, **kw)
|
||||
rv['timeout'] = self.timeout
|
||||
return rv
|
||||
|
||||
|
||||
def _init_sentry(config: Type[BasicConfiguration], version: StrStr) -> None:
|
||||
if config.SENTRY_DSN:
|
||||
global sentry_client
|
||||
|
||||
sys_ver = version["version"]
|
||||
release = sys_ver + "_" + version.get("commit_sha", "")
|
||||
_SentryHttpTransport.timeout = config.REQUEST_TIMEOUT[0]
|
||||
# TODO: set up automatic sending of log messages by log level (i.e. we send a lot of dbt trash logs)
|
||||
# https://docs.sentry.io/platforms/python/guides/logging/
|
||||
sentry_sdk.init(config.SENTRY_DSN, release=release, transport=_SentryHttpTransport)
|
||||
# add version tags
|
||||
for k, v in version.items():
|
||||
sentry_sdk.set_tag(k, v)
|
||||
# add kubernetes tags
|
||||
pod_tags = _extract_pod_info()
|
||||
for k, v in pod_tags.items():
|
||||
sentry_sdk.set_tag(k, v)
|
||||
|
||||
|
||||
def init_telemetry(config: Type[BasicConfiguration]) -> None:
|
||||
if config.PROMETHEUS_PORT:
|
||||
from prometheus_client import start_http_server, Info
|
||||
|
||||
logging.info(f"Starting prometheus server port {config.PROMETHEUS_PORT}")
|
||||
start_http_server(config.PROMETHEUS_PORT)
|
||||
# collect info
|
||||
Info("runs_component_name", "Name of the executing component").info(_extract_version_info(config))
|
||||
|
||||
|
||||
def init_logging_from_config(config: Type[BasicConfiguration]) -> None:
|
||||
global LOGGER
|
||||
|
||||
# add HEALTH and METRICS log levels
|
||||
_add_logging_level("HEALTH", logging.WARNING - 1, "health")
|
||||
_add_logging_level("METRICS", logging.WARNING - 2, "metrics")
|
||||
|
||||
version = _extract_version_info(config)
|
||||
LOGGER = _init_logging(
|
||||
DLT_LOGGER_NAME,
|
||||
# "root",
|
||||
config.LOG_LEVEL,
|
||||
config.LOG_FORMAT,
|
||||
config.NAME,
|
||||
version)
|
||||
_init_sentry(config, version)
|
||||
|
||||
|
||||
def is_json_logging(log_format: str) -> bool:
|
||||
return log_format == "JSON"
|
||||
|
||||
|
||||
def process_internal_exception(msg: str, exc_info: Any = True) -> None:
|
||||
# Passing the default True value causes the implementation to use the data provided by sys.exc_info
|
||||
if LOGGER:
|
||||
LOGGER.error(msg, exc_info=exc_info, stacklevel=2)
|
||||
report_exception()
|
||||
|
||||
|
||||
def report_exception() -> None:
|
||||
if sentry_sdk.Hub.current:
|
||||
sentry_sdk.capture_exception()
|
||||
|
||||
|
||||
def pretty_format_exception() -> str:
|
||||
return traceback.format_exc()
|
||||
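
A hedged sketch of how a component wires this module up; `DemoConfiguration` is hypothetical and assumes `BasicConfiguration` exposes `NAME`, `LOG_LEVEL` and `LOG_FORMAT` as class attributes (with sane defaults for the remaining fields), which is not shown in this commit:

from dlt.common import logger
from dlt.common.configuration import BasicConfiguration


class DemoConfiguration(BasicConfiguration):  # hypothetical component configuration
    NAME = "demo"
    LOG_LEVEL = "INFO"
    LOG_FORMAT = "{asctime}|{name}|{levelname}|{message}"


logger.init_logging_from_config(DemoConfiguration)
logger.info("component started")  # unknown attributes are forwarded to LOGGER via __getattr__
logger.metrics("tick", extra={"metrics": {"rows": 10}})  # METRICS level added during init
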
147
dlt/common/parser.py
Normal file
@@ -0,0 +1,147 @@
|
||||
import re
|
||||
from typing import Iterator, Optional, Tuple, Callable, cast
|
||||
|
||||
from dlt.common import json
|
||||
from dlt.common.schema import Schema
|
||||
from dlt.common.utils import uniq_id, digest128
|
||||
from dlt.common.typing import TEvent, TEventRowChild, TEventRowRoot, StrAny
|
||||
|
||||
|
||||
# iterator of (table name, row data) tuples
|
||||
TUnpackedRowIterator = Iterator[Tuple[str, StrAny]]
|
||||
TExtractFunc = Callable[[Schema, TEvent, str, bool], TUnpackedRowIterator]
|
||||
|
||||
RE_UNDERSCORES = re.compile("_+")
|
||||
RE_LEADING_DIGITS = re.compile(r"^\d+")
|
||||
INVALID_SQL_IDENT_CHARS = "- *!:,.'\\\"`"
|
||||
INVALID_SQL_TX = str.maketrans(INVALID_SQL_IDENT_CHARS, "_" * len(INVALID_SQL_IDENT_CHARS))
|
||||
|
||||
# subsequent nested fields will be separated with the string below, applies both to field and table names
|
||||
PATH_SEPARATOR = "__"
|
||||
|
||||
# for those paths the complex nested objects should be left in place
|
||||
# current use case: we want to preserve event_slot__value in db even if it's an object
|
||||
# TODO: pass table definition and accept complex type
|
||||
def _should_preserve_complex_value(table: str, field_name: str) -> bool:
|
||||
path = f"{table}{PATH_SEPARATOR}{field_name}"
|
||||
return path in ["event_slot__value"]
|
||||
|
||||
|
||||
def _fix_field_name(name: str) -> str:
|
||||
|
||||
def camel_to_snake(name: str) -> str:
|
||||
name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
|
||||
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()
|
||||
|
||||
# fix field name so it's an acceptable name for a database column
|
||||
# characters that are not acceptable in SQL identifiers are replaced with underscores
|
||||
name = camel_to_snake(name.translate(INVALID_SQL_TX))
|
||||
name = RE_LEADING_DIGITS.sub("_", name)
|
||||
# replace consecutive underscores with a single one to prevent name clashes with the parent/child path separator
|
||||
return RE_UNDERSCORES.sub("_", name)
|
||||
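
A few illustrative inputs and the names the normalization above produces (the inputs are chosen for this sketch):

from dlt.common.parser import _fix_field_name

assert _fix_field_name("camelCaseName") == "camel_case_name"   # camel case split and lowered
assert _fix_field_name("with space!") == "with_space_"         # invalid SQL identifier chars replaced
assert _fix_field_name("1column") == "_column"                 # leading digits replaced
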
|
||||
|
||||
def _flatten(table: str, dict_row: TEventRowChild) -> TEventRowChild:
|
||||
out_rec_row: TEventRowChild = {}
|
||||
|
||||
def unpack_row_dicts(dict_row: StrAny, parent_name: Optional[str]) -> None:
|
||||
for k, v in dict_row.items():
|
||||
corrected_k = _fix_field_name(k)
|
||||
child_name = corrected_k if not parent_name else f'{parent_name}{PATH_SEPARATOR}{corrected_k}'
|
||||
if type(v) is dict:
|
||||
unpack_row_dicts(v, parent_name=child_name)
|
||||
if _should_preserve_complex_value(table, child_name):
|
||||
out_rec_row[child_name] = v # type: ignore
|
||||
else:
|
||||
out_rec_row[child_name] = v # type: ignore
|
||||
|
||||
unpack_row_dicts(dict_row, None)
|
||||
return out_rec_row
|
||||
|
||||
|
||||
def _get_child_row_hash(parent_hash: str, child_table: str, list_pos: int) -> str:
|
||||
# create deterministic unique id of the child row taking into account that all lists are ordered
|
||||
# and all child tables must be lists
|
||||
return digest128(f"{parent_hash}_{child_table}_{list_pos}")
|
||||
|
||||
|
||||
def _unpack_row(
|
||||
schema: Schema,
|
||||
dict_row: TEventRowChild,
|
||||
extend: TEventRowChild,
|
||||
table: str,
|
||||
parent_hash: Optional[str] = None,
|
||||
pos: Optional[int] = None
|
||||
) -> TUnpackedRowIterator:
|
||||
|
||||
def _append_child_meta(_row: TEventRowChild, _hash: str, _p_hash: str, _p_pos: int) -> TEventRowChild:
|
||||
_row["_parent_hash"] = _p_hash
|
||||
_row["_pos"] = _p_pos
|
||||
_row.update(extend)
|
||||
|
||||
return _row
|
||||
|
||||
is_top_level = parent_hash is None
|
||||
|
||||
# flatten current row
|
||||
new_dict_row = _flatten(table, dict_row)
|
||||
# infer record hash or leave existing primary key if present
|
||||
record_hash = new_dict_row.get("_record_hash", None)
|
||||
if not record_hash:
|
||||
# check if we have primary key: if so use it
|
||||
primary_key = schema.filter_hints_in_row(table, "primary_key", new_dict_row)
|
||||
if primary_key:
|
||||
# create row id from primary key
|
||||
record_hash = digest128("_".join(map(lambda v: str(v), primary_key.values())))
|
||||
elif not is_top_level:
|
||||
# child table row deterministic hash
|
||||
record_hash = _get_child_row_hash(parent_hash, table, pos)
|
||||
# link to parent table
|
||||
_append_child_meta(new_dict_row, record_hash, parent_hash, pos)
|
||||
else:
|
||||
# create random row id, note that incremental loads will not work with such tables
|
||||
record_hash = uniq_id()
|
||||
new_dict_row["_record_hash"] = record_hash
|
||||
|
||||
# if _root_hash propagation requested and we are at the top level then update extend
|
||||
if "_root_hash" in extend and extend["_root_hash"] is None and is_top_level:
|
||||
extend["_root_hash"] = record_hash
|
||||
|
||||
# generate child tables only for lists
|
||||
children = [k for k in new_dict_row if type(new_dict_row[k]) is list] # type: ignore
|
||||
for k in children:
|
||||
child_table = f"{table}{PATH_SEPARATOR}{k}"
|
||||
# this will skip empty lists
|
||||
v: TEventRowChild
|
||||
for idx, v in enumerate(new_dict_row[k]): # type: ignore
|
||||
# yield child table row
|
||||
if type(v) is dict:
|
||||
yield from _unpack_row(schema, v, extend, child_table, record_hash, idx)
|
||||
elif type(v) is list:
|
||||
# lists of lists are not supported
|
||||
raise ValueError(v)
|
||||
else:
|
||||
# list of simple types
|
||||
child_row_hash = _get_child_row_hash(record_hash, child_table, idx)
|
||||
e = _append_child_meta({"value": v, "_record_hash": child_row_hash}, child_row_hash, record_hash, idx)
|
||||
yield child_table, e
|
||||
if not _should_preserve_complex_value(table, k):
|
||||
# remove child list
|
||||
del new_dict_row[k] # type: ignore
|
||||
|
||||
yield table, new_dict_row
|
||||
|
||||
|
||||
def extract(schema: Schema, source_event: TEvent, load_id: str, add_json: bool) -> TUnpackedRowIterator:
|
||||
# we will extend event with all the fields necessary to load it as root row
|
||||
event = cast(TEventRowRoot, source_event)
|
||||
# attach the load id so loaded data can be processed incrementally after loading
|
||||
event["_load_id"] = load_id
|
||||
# add original json field, mostly useful for debugging
|
||||
if add_json:
|
||||
event["_event_json"] = json.dumps(event)
|
||||
# find table name
|
||||
table_name = event.pop("_event_type", None) or schema.schema_name
|
||||
# TODO: if table_name exist get "_dist_key" and "_timestamp" from the table definition in schema and propagate, if not take them from global hints
|
||||
# use event type or schema name as table name, request _root_hash propagation
|
||||
yield from _unpack_row(schema, cast(TEventRowChild, event), {"_root_hash": None}, table_name)
|
||||
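
An illustrative run through `extract` for a nested event (the schema name, event fields and values here are invented); note that child table rows are yielded before the root row:

from dlt.common.schema import Schema
from dlt.common.parser import extract

schema = Schema("event")
doc = {"_event_type": "event_user", "name": "rfix", "labels": ["alpha", "beta"]}

for table, row in extract(schema, doc, load_id="load_1", add_json=False):
    print(table, row)

# expected shape (hash values vary between runs):
#   event_user__labels {"value": "alpha", "_record_hash": ..., "_parent_hash": ..., "_pos": 0, "_root_hash": ...}
#   event_user__labels {"value": "beta", ...}
#   event_user {"name": "rfix", "_load_id": "load_1", "_record_hash": ...}
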
16
dlt/common/pendulum.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import pendulum # noqa: I251
|
||||
|
||||
# force UTC as the local timezone to prevent local dates from being written to dbs
|
||||
pendulum.set_local_timezone(pendulum.timezone('UTC')) # type: ignore
|
||||
|
||||
|
||||
def __utcnow() -> pendulum.DateTime:
|
||||
"""
|
||||
Use this function instead of datetime.now
|
||||
Returns:
|
||||
pendulum.DateTime -- current time in UTC timezone
|
||||
"""
|
||||
return pendulum.now()
|
||||
|
||||
|
||||
pendulum.utcnow = __utcnow # type: ignore
|
||||
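
A quick illustration of the effect of the override above:

from dlt.common import pendulum

now = pendulum.now()        # local timezone is forced to UTC by the module above
print(now.timezone_name)    # 'UTC'
print(pendulum.utcnow())    # patched helper, same UTC clock
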
181
dlt/common/runners.py
Normal file
@@ -0,0 +1,181 @@
|
||||
import argparse
|
||||
import multiprocessing
|
||||
from prometheus_client import Counter, Gauge, Summary, CollectorRegistry, REGISTRY
|
||||
from typing import Callable, Dict, NamedTuple, Optional, Type, TypeVar, Union, cast
|
||||
from multiprocessing.pool import ThreadPool, Pool
|
||||
|
||||
from dlt.common import logger, signals
|
||||
from dlt.common.configuration.basic_configuration import BasicConfiguration
|
||||
from dlt.common.time import sleep
|
||||
from dlt.common.telemetry import TRunHealth, TRunMetrics, get_logging_extras, get_metrics_from_prometheus
|
||||
from dlt.common.logger import init_logging_from_config, init_telemetry, process_internal_exception
|
||||
from dlt.common.signals import register_signals
|
||||
from dlt.common.utils import str2bool
|
||||
from dlt.common.exceptions import SignalReceivedException, TimeRangeExhaustedException, UnsupportedProcessStartMethodException
|
||||
from dlt.common.configuration import PoolRunnerConfiguration
|
||||
|
||||
TPool = TypeVar("TPool", bound=Pool)
|
||||
|
||||
|
||||
class TRunArgs(NamedTuple):
|
||||
single_run: bool
|
||||
wait_runs: int
|
||||
|
||||
|
||||
RUN_ARGS = TRunArgs(False, 0)
|
||||
|
||||
HEALTH_PROPS_GAUGES: Dict[str, Union[Counter, Gauge]] = None
|
||||
RUN_DURATION_GAUGE: Gauge = None
|
||||
RUN_DURATION_SUMMARY: Summary = None
|
||||
|
||||
LAST_RUN_METRICS: TRunMetrics = None
|
||||
LAST_RUN_EXCEPTION: BaseException = None
|
||||
|
||||
def create_gauges(registry: CollectorRegistry) -> None:
|
||||
global HEALTH_PROPS_GAUGES, RUN_DURATION_GAUGE, RUN_DURATION_SUMMARY
|
||||
|
||||
HEALTH_PROPS_GAUGES = {
|
||||
"runs_count": Counter("runs_count", "Count runs", registry=registry),
|
||||
"runs_not_idle_count": Counter("runs_not_idle_count", "Count not idle runs", registry=registry),
|
||||
"runs_healthy_count": Counter("runs_healthy_count", "Count healthy runs", registry=registry),
|
||||
"runs_cs_healthy_gauge": Gauge("runs_cs_healthy_gauge", "Count consecutive healthy runs, reset on failed run", registry=registry),
|
||||
"runs_failed_count": Counter("runs_failed_count", "Count failed runs", registry=registry),
|
||||
"runs_cs_failed_gauge": Gauge("runs_cs_failed_gauge", "Count consecutive failed runs, reset on healthy run", registry=registry),
|
||||
"runs_pending_items_gauge": Gauge("runs_pending_items_gauge", "Number of items pending at the end of the run", registry=registry),
|
||||
}
|
||||
|
||||
RUN_DURATION_GAUGE = Gauge("runs_duration_seconds", "Duration of the run", registry=registry)
|
||||
RUN_DURATION_SUMMARY = Summary("runs_duration_summary", "Summary of the run duration", registry=registry)
|
||||
|
||||
|
||||
def update_gauges() -> TRunHealth:
|
||||
return get_metrics_from_prometheus(HEALTH_PROPS_GAUGES.values()) # type: ignore
|
||||
|
||||
|
||||
def str2bool_a(v: str) -> bool:
|
||||
try:
|
||||
return str2bool(v)
|
||||
except ValueError:
|
||||
raise argparse.ArgumentTypeError('Boolean value expected.')
|
||||
|
||||
|
||||
def create_default_args(C: Type[PoolRunnerConfiguration]) -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(description=f"Default runner for {C.NAME}")
|
||||
add_pool_cli_arguments(parser)
|
||||
return parser
|
||||
|
||||
|
||||
def add_pool_cli_arguments(parser: argparse.ArgumentParser) -> None:
|
||||
parser.add_argument("--single-run", type=str2bool_a, nargs='?', const=True, default=False, help="exit when all pending items are processed")
|
||||
parser.add_argument("--wait-runs", type=int, nargs='?', const=True, default=1, help="maximum idle runs to wait for incoming data")
|
||||
|
||||
|
||||
|
||||
def initialize_runner(C: Type[BasicConfiguration], run_args: Optional[TRunArgs] = None) -> None:
|
||||
global RUN_ARGS
|
||||
|
||||
init_logging_from_config(C)
|
||||
init_telemetry(C)
|
||||
create_gauges(REGISTRY)
|
||||
register_signals()
|
||||
if run_args is not None:
|
||||
RUN_ARGS = run_args
|
||||
|
||||
|
||||
def pool_runner(C: Type[PoolRunnerConfiguration], run_f: Callable[[TPool], TRunMetrics]) -> int:
|
||||
# start pool
|
||||
pool: Pool = None
|
||||
if C.POOL_TYPE == "process":
|
||||
# our pool implementation does not work with the spawn start method
|
||||
if multiprocessing.get_start_method() != "fork":
|
||||
raise UnsupportedProcessStartMethodException(multiprocessing.get_start_method())
|
||||
pool = Pool(processes=C.MAX_PARALLELISM)
|
||||
elif C.POOL_TYPE == "thread":
|
||||
pool = ThreadPool(processes=C.MAX_PARALLELISM)
|
||||
else:
|
||||
pool = None
|
||||
logger.info(f"Created {C.POOL_TYPE} pool with {C.MAX_PARALLELISM or 'default no.'} workers")
|
||||
|
||||
|
||||
try:
|
||||
while True:
|
||||
run_metrics: TRunMetrics = None
|
||||
try:
|
||||
HEALTH_PROPS_GAUGES["runs_count"].inc()
|
||||
# run pool logic
|
||||
with RUN_DURATION_SUMMARY.time(), RUN_DURATION_GAUGE.time():
|
||||
run_metrics = run_f(cast(TPool, pool))
|
||||
except Exception as exc:
|
||||
if (type(exc) is SignalReceivedException) or (type(exc) is TimeRangeExhaustedException):
|
||||
# always exit
|
||||
raise
|
||||
else:
|
||||
process_internal_exception("run")
|
||||
# the run failed
|
||||
run_metrics = TRunMetrics(True, True, -1)
|
||||
# preserve exception
|
||||
global LAST_RUN_EXCEPTION
|
||||
LAST_RUN_EXCEPTION = exc
|
||||
|
||||
# gather and emit metrics
|
||||
if not run_metrics.was_idle:
|
||||
HEALTH_PROPS_GAUGES["runs_not_idle_count"].inc()
|
||||
if run_metrics.has_failed:
|
||||
HEALTH_PROPS_GAUGES["runs_failed_count"].inc()
|
||||
HEALTH_PROPS_GAUGES["runs_cs_failed_gauge"].inc()
|
||||
HEALTH_PROPS_GAUGES["runs_cs_healthy_gauge"].set(0)
|
||||
else:
|
||||
HEALTH_PROPS_GAUGES["runs_healthy_count"].inc()
|
||||
HEALTH_PROPS_GAUGES["runs_cs_healthy_gauge"].inc()
|
||||
HEALTH_PROPS_GAUGES["runs_cs_failed_gauge"].set(0)
|
||||
HEALTH_PROPS_GAUGES["runs_pending_items_gauge"].set(run_metrics.pending_items)
|
||||
health_props = update_gauges()
|
||||
logger.health("run health counters", extra={"metrics": health_props})
|
||||
logger.metrics("run metrics", extra=get_logging_extras([RUN_DURATION_GAUGE, RUN_DURATION_SUMMARY]))
|
||||
|
||||
# preserve last run metrics
|
||||
global LAST_RUN_METRICS
|
||||
LAST_RUN_METRICS = run_metrics
|
||||
|
||||
# exit due to signal
|
||||
signals.raise_if_signalled()
|
||||
|
||||
# exit due to exception and flag
|
||||
if run_metrics.has_failed and C.EXIT_ON_EXCEPTION:
|
||||
logger.warning(f"Exiting runner due to EXIT_ON_EXCEPTION flag set")
|
||||
return -1
|
||||
|
||||
# single run may be forced but at least wait_runs must pass
|
||||
if RUN_ARGS.single_run and (health_props["runs_count"] >= RUN_ARGS.wait_runs and
|
||||
# and it was idle the whole time or (it was not idle but nothing is pending now)
|
||||
(health_props["runs_not_idle_count"] == 0 or run_metrics.pending_items == 0)):
|
||||
logger.warning(f"Stopping runner due to single run override")
|
||||
return 0
|
||||
|
||||
if run_metrics.has_failed:
|
||||
sleep(C.RUN_SLEEP_WHEN_FAILED)
|
||||
elif run_metrics.pending_items == 0:
|
||||
# nothing is pending so we can sleep longer
|
||||
sleep(C.RUN_SLEEP_IDLE)
|
||||
else:
|
||||
# more items are pending, sleep (typically) shorter
|
||||
sleep(C.RUN_SLEEP)
|
||||
|
||||
# this allows recycling of long-living processes whose memory gets fragmented
|
||||
# exit after the runner sleeps so the run period is preserved
|
||||
if health_props["runs_count"] == C.STOP_AFTER_RUNS:
|
||||
logger.warning(f"Stopping runner due to max runs {health_props['runs_count']} exceeded")
|
||||
return -2
|
||||
except SignalReceivedException as sigex:
|
||||
# the sleep calls above may also raise SignalReceivedException
|
||||
logger.warning(f"Exiting runner due to signal {sigex.signal_code}")
|
||||
return sigex.signal_code
|
||||
except TimeRangeExhaustedException as tre:
|
||||
logger.info(f"{str(tre)}, not further processing will be done")
|
||||
return 0
|
||||
finally:
|
||||
if pool:
|
||||
logger.info("Closing processing pool")
|
||||
pool.close()
|
||||
pool.join()
|
||||
pool = None
|
||||
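
A hedged sketch of a component entry point built on the runner; `DemoRunnerConfiguration` is hypothetical and assumes `PoolRunnerConfiguration` provides defaults for the remaining settings referenced above (sleep intervals, logging, telemetry, etc.):

from multiprocessing.pool import ThreadPool

from dlt.common.configuration import PoolRunnerConfiguration
from dlt.common.runners import TRunArgs, initialize_runner, pool_runner
from dlt.common.telemetry import TRunMetrics


class DemoRunnerConfiguration(PoolRunnerConfiguration):  # hypothetical configuration
    NAME = "demo"
    POOL_TYPE = "thread"
    MAX_PARALLELISM = 4


def run(pool: ThreadPool) -> TRunMetrics:
    # do one unit of work and report whether the run was idle, failed and how much is pending
    return TRunMetrics(was_idle=False, has_failed=False, pending_items=0)


initialize_runner(DemoRunnerConfiguration, TRunArgs(single_run=True, wait_runs=1))
exit_code = pool_runner(DemoRunnerConfiguration, run)
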
575
dlt/common/schema.py
Normal file
@@ -0,0 +1,575 @@
|
||||
import base64
|
||||
import binascii
|
||||
import yaml
|
||||
import re
|
||||
from re import Pattern
|
||||
from copy import deepcopy
|
||||
from dateutil.parser import isoparse
|
||||
from typing import Dict, List, Set, Mapping, Optional, Sequence, Tuple, Type, TypedDict, Literal, Any, cast
|
||||
|
||||
from dlt.common import pendulum, json, Decimal
|
||||
from dlt.common.typing import DictStrAny, StrAny, StrStr
|
||||
from dlt.common.arithmetics import ConversionSyntax
|
||||
from dlt.common.exceptions import DltException
|
||||
|
||||
DataType = Literal["text", "double", "bool", "timestamp", "bigint", "binary", "complex", "decimal", "wei"]
|
||||
HintType = Literal["not_null", "partition", "cluster", "primary_key", "foreign_key", "sort", "unique"]
|
||||
ColumnProp = Literal["name", "data_type", "nullable", "partition", "cluster", "primary_key", "foreign_key", "sort", "unique"]
|
||||
|
||||
DATA_TYPES: Set[DataType] = set(["text", "double", "bool", "timestamp", "bigint", "binary", "complex", "decimal", "wei"])
|
||||
COLUMN_PROPS: Set[ColumnProp] = set(["name", "data_type", "nullable", "partition", "cluster", "primary_key", "foreign_key", "sort", "unique"])
|
||||
COLUMN_HINTS: Set[HintType] = set(["partition", "cluster", "primary_key", "foreign_key", "sort", "unique"])
|
||||
|
||||
class ColumnBase(TypedDict, total=True):
|
||||
name: str
|
||||
data_type: DataType
|
||||
nullable: bool
|
||||
|
||||
class Column(ColumnBase, total=True):
|
||||
partition: bool
|
||||
cluster: bool
|
||||
unique: bool
|
||||
sort: bool
|
||||
primary_key: bool
|
||||
foreign_key: bool
|
||||
|
||||
Table = Dict[str, Column]
|
||||
SchemaTables = Dict[str, Table]
|
||||
SchemaUpdate = Dict[str, List[Column]]
|
||||
|
||||
|
||||
class StoredSchema(TypedDict, total=True):
|
||||
version: int
|
||||
engine_version: int
|
||||
name: str
|
||||
tables: SchemaTables
|
||||
preferred_types: Mapping[str, DataType]
|
||||
hints: Mapping[HintType, Sequence[str]]
|
||||
excludes: Sequence[str]
|
||||
includes: Sequence[str]
|
||||
|
||||
|
||||
class Schema:
|
||||
|
||||
VERSION_TABLE_NAME = "_version"
|
||||
VERSION_COLUMN_NAME = "version"
|
||||
LOADS_TABLE_NAME = "_loads"
|
||||
ENGINE_VERSION = 2
|
||||
|
||||
def __init__(self, name: str) -> None:
|
||||
self._schema_tables: SchemaTables = {}
|
||||
self._schema_name: str = name
|
||||
self._version = 1
|
||||
# list of preferred types: map regex on columns into types
|
||||
self._preferred_types: Mapping[str, DataType] = {}
|
||||
# compiled regexes
|
||||
self._compiled_preferred_types: List[Tuple[Pattern[str], DataType]] = []
|
||||
# table hints
|
||||
self._hints: Mapping[HintType, Sequence[str]] = {}
|
||||
self._compiled_hints: Dict[HintType, Sequence[Pattern[str]]] = {}
|
||||
# excluded paths
|
||||
self._excludes: Sequence[str] = []
|
||||
self._compiled_excludes: Sequence[Pattern[str]] = []
|
||||
# included paths
|
||||
self._includes: Sequence[str] = []
|
||||
self._compiled_includes: Sequence[Pattern[str]] = []
|
||||
# add version table
|
||||
self._add_standard_tables()
|
||||
# add standard hints
|
||||
self._add_standard_hints()
|
||||
# compile hints
|
||||
self._compile_regexes()
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, stored_schema: StoredSchema) -> "Schema":
|
||||
# upgrade engine if needed
|
||||
cls._upgrade_engine_version(stored_schema, stored_schema["engine_version"], cls.ENGINE_VERSION)
|
||||
# create new instance from dict
|
||||
self: Schema = cls(stored_schema["name"])
|
||||
self._schema_tables = stored_schema["tables"]
|
||||
# TODO: generate difference if STANDARD SCHEMAS are different than those and increase schema version
|
||||
if Schema.VERSION_TABLE_NAME not in self._schema_tables:
|
||||
raise SchemaCorruptedException(f"Schema must contain table {Schema.VERSION_TABLE_NAME}")
|
||||
if Schema.LOADS_TABLE_NAME not in self._schema_tables:
|
||||
raise SchemaCorruptedException(f"Schema must contain table {Schema.LOADS_TABLE_NAME}")
|
||||
# verify table schemas
|
||||
for table_name, table in self._schema_tables.items():
|
||||
for column_name in table:
|
||||
# add default hints to tables
|
||||
column = self._add_missing_hints(table[column_name])
|
||||
# overwrite column name
|
||||
column["name"] = column_name
|
||||
# verify column
|
||||
self._verify_column(table_name, column_name, column)
|
||||
table[column_name] = column
|
||||
self._version = stored_schema["version"]
|
||||
self._preferred_types = stored_schema["preferred_types"]
|
||||
self._hints = stored_schema["hints"]
|
||||
self._excludes = stored_schema["excludes"]
|
||||
self._includes = stored_schema["includes"]
|
||||
# compile regexes
|
||||
self._compile_regexes()
|
||||
|
||||
return self
|
||||
|
||||
def filter_row(self, table_name: str, row: StrAny, path_separator: str) -> StrAny:
|
||||
# include and exclude paths follow the naming convention of the unpacker and correspond to json document nesting
|
||||
# current version of the unpacker separates json elements with __
|
||||
|
||||
def _exclude(path: str) -> bool:
|
||||
is_included = False
|
||||
is_excluded = any(exclude.search(path) for exclude in self._compiled_excludes)
|
||||
if is_excluded:
|
||||
# there may be an exception if the path is explicitly included
|
||||
is_included = any(include.search(path) for include in self._compiled_includes)
|
||||
return is_excluded and not is_included
|
||||
|
||||
# check if any of the fields in the row is excluded
|
||||
for field_name in list(row.keys()):
|
||||
path = f"{table_name}{path_separator}{field_name}"
|
||||
# excluded if any rule matches
|
||||
if _exclude(path):
|
||||
# TODO: copy to new instance
|
||||
del row[field_name] # type: ignore
|
||||
return row
|
||||
|
||||
def coerce_row(self, table_name: str, row: StrAny) -> Tuple[StrAny, List[Column]]:
|
||||
table_schema: Table = self._schema_tables.get(table_name, {})
|
||||
new_columns: List[Column] = []
|
||||
new_row: DictStrAny = {}
|
||||
for col_name, v in row.items():
|
||||
# skip None values, we should infer the types later
|
||||
if v is None:
|
||||
# just check that the column is nullable, if it exists
|
||||
self._coerce_null_value(table_schema, table_name, col_name)
|
||||
else:
|
||||
new_col_name, new_col_def, new_v = self._coerce_non_null_value(table_schema, table_name, col_name, v)
|
||||
new_row[new_col_name] = new_v
|
||||
if new_col_def:
|
||||
new_columns.append(new_col_def)
|
||||
|
||||
return new_row, new_columns
|
||||
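
An illustrative round trip through inference and coercion (the table and values are invented); the second call reuses the stored column type:

from dlt.common.schema import Schema

schema = Schema("demo")

row, new_columns = schema.coerce_row("demo_table", {"id": 1, "note": "first row"})
schema.update_schema("demo_table", new_columns)        # add the inferred columns, bump the version

row2, more_columns = schema.coerce_row("demo_table", {"id": "0xff"})
assert row2["id"] == 255 and more_columns == []        # text coerced into the existing bigint column
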
|
||||
def filter_hints_in_row(self, table_name: str, hint_type: HintType, row: StrAny) -> StrAny:
|
||||
rv_row: DictStrAny = {}
|
||||
column_prop: ColumnProp = self._hint_to_column_prop(hint_type)
|
||||
try:
|
||||
table = self.get_table(table_name)
|
||||
for column_name in table:
|
||||
if column_name in row:
|
||||
hint_value = table[column_name][column_prop]
|
||||
if (hint_value and column_prop != "nullable") or (column_prop == "nullable" and not hint_value):
|
||||
rv_row[column_name] = row[column_name]
|
||||
except KeyError:
|
||||
for k, v in row.items():
|
||||
if self._infer_hint(hint_type, v, k):
|
||||
rv_row[k] = v
|
||||
|
||||
# dicts are ordered and we will return the rows with hints in the same order as they appear in the columns
|
||||
return rv_row
|
||||
|
||||
def update_schema(self, table_name: str, updated_columns: List[Column]) -> None:
|
||||
# all tables in the schema must start with the schema name
|
||||
# if not table_name.startswith(f"{self._schema_name}"):
|
||||
# raise InvalidTableNameException(self._schema_name, table_name)
|
||||
|
||||
if table_name not in self._schema_tables:
|
||||
# add the whole new table to SchemaTables
|
||||
self._schema_tables[table_name] = {c["name"]: c for c in updated_columns}
|
||||
else:
|
||||
# add several columns to existing table
|
||||
table_schema = self._schema_tables[table_name]
|
||||
for column in updated_columns:
|
||||
column_name = column["name"]
|
||||
if column_name in table_schema:
|
||||
# we do not support changing existing columns
|
||||
if not Schema._compare_columns(table_schema[column_name], column):
|
||||
# attempted to update with an incompatible column
|
||||
raise CannotCoerceColumnException(table_name, column_name, table_schema[column_name]["data_type"], column["data_type"], None)
|
||||
else:
|
||||
table_schema[column_name] = column
|
||||
# bump schema version
|
||||
self._version += 1
|
||||
|
||||
def get_schema_update_for(self, table_name: str, t: Table) -> List[Column]:
|
||||
# gets new columns to be added to "t" to bring it up to date with the stored schema
|
||||
diff_c: List[Column] = []
|
||||
s_t = self.get_table(table_name)
|
||||
for c in s_t.values():
|
||||
if c["name"] not in t:
|
||||
diff_c.append(c)
|
||||
return diff_c
|
||||
|
||||
def get_table(self, table_name: str) -> Table:
|
||||
return self._schema_tables[table_name]
|
||||
|
||||
def to_dict(self) -> StoredSchema:
|
||||
return {
|
||||
"tables": self._schema_tables,
|
||||
"name": self._schema_name,
|
||||
"version": self._version,
|
||||
"preferred_types": self._preferred_types,
|
||||
"hints": self._hints,
|
||||
"excludes": self._excludes,
|
||||
"includes": self._includes,
|
||||
"engine_version": Schema.ENGINE_VERSION
|
||||
}
|
||||
|
||||
@property
|
||||
def schema_version(self) -> int:
|
||||
return self._version
|
||||
|
||||
@property
|
||||
def schema_name(self) -> str:
|
||||
return self._schema_name
|
||||
|
||||
@property
|
||||
def schema_tables(self) -> SchemaTables:
|
||||
return self._schema_tables
|
||||
|
||||
def as_yaml(self, remove_default_hints: bool = False) -> str:
|
||||
d = self.to_dict()
|
||||
clean_tables = deepcopy(d["tables"])
|
||||
|
||||
for t in clean_tables.values():
|
||||
for c in t.values():
|
||||
# do not save names
|
||||
del c["name"] # type: ignore
|
||||
# remove hints with default values
|
||||
if remove_default_hints:
|
||||
for h in list(c.keys()):
|
||||
if type(c[h]) is bool and c[h] is False and h != "nullable": # type: ignore
|
||||
del c[h] # type: ignore
|
||||
|
||||
d["tables"] = clean_tables
|
||||
|
||||
return cast(str, yaml.dump(d, allow_unicode=True, default_flow_style=False, sort_keys=False))
|
||||
|
||||
def _infer_column(self, k: str, v: Any) -> Column:
|
||||
return Column(
|
||||
name=k,
|
||||
data_type=self._map_value_to_column_type(v, k),
|
||||
nullable=not self._infer_hint("not_null", v, k),
|
||||
partition=self._infer_hint("partition", v, k),
|
||||
cluster=self._infer_hint("cluster", v, k),
|
||||
sort=self._infer_hint("sort", v, k),
|
||||
unique=self._infer_hint("unique", v, k),
|
||||
primary_key=self._infer_hint("primary_key", v, k),
|
||||
foreign_key=self._infer_hint("foreign_key", v, k)
|
||||
)
|
||||
|
||||
def _coerce_null_value(self, table_schema: Table, table_name: str, col_name: str) -> None:
|
||||
if col_name in table_schema:
|
||||
existing_column = table_schema[col_name]
|
||||
if not existing_column["nullable"]:
|
||||
raise CannotCoerceNullException(table_name, col_name)
|
||||
|
||||
def _coerce_non_null_value(self, table_schema: Table, table_name: str, col_name: str, v: Any) -> Tuple[str, Column, Any]:
|
||||
new_column: Column = None
|
||||
rv = v
|
||||
variant_col_name = col_name
|
||||
|
||||
if col_name in table_schema:
|
||||
existing_column = table_schema[col_name]
|
||||
# existing columns cannot be changed so we must update row
|
||||
py_data_type = Schema._py_type_to_sc_type(type(v))
|
||||
if existing_column["data_type"] != py_data_type:
|
||||
# first try to coerce existing value into destination type
|
||||
try:
|
||||
rv = Schema._coerce_type(existing_column["data_type"], py_data_type, v)
|
||||
except (ValueError, SyntaxError):
|
||||
# for complex types we must coerce to text
|
||||
if py_data_type == "complex":
|
||||
py_data_type = "text"
|
||||
rv = Schema._coerce_type("text", "complex", v)
|
||||
# if that does not work we must create variant extension to the table
|
||||
variant_col_name = f"{col_name}_v_{py_data_type}"
|
||||
# if variant exists check type, coercions are not required
|
||||
if variant_col_name in table_schema:
|
||||
if table_schema[variant_col_name]["data_type"] != py_data_type:
|
||||
raise CannotCoerceColumnException(table_name, variant_col_name, table_schema[variant_col_name]["data_type"], py_data_type, v)
|
||||
else:
|
||||
# new column
|
||||
# add new column
|
||||
new_column = self._infer_column(variant_col_name, v)
|
||||
# must have variant type, not preferred or coerced type
|
||||
new_column["data_type"] = py_data_type
|
||||
else:
|
||||
# just copy row: types match
|
||||
pass
|
||||
else:
|
||||
# infer new column
|
||||
new_column = self._infer_column(col_name, v)
|
||||
# and coerce type if inference changed the python type
|
||||
py_type = Schema._py_type_to_sc_type(type(v))
|
||||
rv = Schema._coerce_type(new_column["data_type"], py_type, v)
|
||||
|
||||
return variant_col_name, new_column, rv
|
||||
|
||||
def _map_value_to_column_type(self, v: Any, k: str) -> DataType:
|
||||
mapped_type = Schema._py_type_to_sc_type(type(v))
|
||||
# if complex type was detected we must coerce to string
|
||||
if mapped_type == "complex":
|
||||
mapped_type = "text"
|
||||
# get preferred type based on column name
|
||||
preferred_type = self._get_preferred_type(k)
|
||||
# try to match python type to preferred
|
||||
if preferred_type:
|
||||
# try to coerce to destination type
|
||||
try:
|
||||
Schema._coerce_type(preferred_type, mapped_type, v)
|
||||
# coercion possible so preferred type may be used
|
||||
mapped_type = preferred_type
|
||||
except ValueError:
|
||||
# coercion not possible
|
||||
pass
|
||||
return mapped_type
|
||||
|
||||
def _get_preferred_type(self, col_name: str) -> Optional[DataType]:
|
||||
return next((m[1] for m in self._compiled_preferred_types if m[0].search(col_name)), None)
|
||||
|
||||
def _infer_hint(self, hint_type: HintType, _: Any, k: str) -> bool:
|
||||
if hint_type in self._compiled_hints:
|
||||
return any(h.search(k) for h in self._compiled_hints[hint_type])
|
||||
else:
|
||||
return False
|
||||
|
||||
def _add_standard_tables(self) -> None:
|
||||
version_table: Table = {
|
||||
"version": self._add_missing_hints({
|
||||
"name": "version",
|
||||
"data_type": "bigint",
|
||||
"nullable": False,
|
||||
}),
|
||||
"engine_version": self._add_missing_hints({
|
||||
"name": "engine_version",
|
||||
"data_type": "bigint",
|
||||
"nullable": False
|
||||
}),
|
||||
"inserted_at": self._add_missing_hints({
|
||||
"name": "inserted_at",
|
||||
"data_type": "timestamp",
|
||||
"nullable": False
|
||||
})
|
||||
|
||||
}
|
||||
self._schema_tables[Schema.VERSION_TABLE_NAME] = version_table
|
||||
load_table: Table = {
|
||||
"load_id": self._add_missing_hints({
|
||||
"name": "load_id",
|
||||
"data_type": "text",
|
||||
"nullable": False
|
||||
}),
|
||||
"status": self._add_missing_hints({
|
||||
"name": "status",
|
||||
"data_type": "bigint",
|
||||
"nullable": False
|
||||
}),
|
||||
"inserted_at": self._add_missing_hints({
|
||||
"name": "inserted_at",
|
||||
"data_type": "timestamp",
|
||||
"nullable": False
|
||||
})
|
||||
}
|
||||
self._schema_tables[Schema.LOADS_TABLE_NAME] = load_table
|
||||
|
||||
def _add_standard_hints(self) -> None:
|
||||
self._hints = {
|
||||
"not_null": ["^_record_hash$", "^_root_hash$", "^_parent_hash$", "^_pos$", "_load_id"],
|
||||
"foreign_key": ["^_parent_hash$"],
|
||||
"unique": ["^_record_hash$"]
|
||||
}
|
||||
|
||||
def _compile_regexes(self) -> None:
|
||||
for pattern, dt in self._preferred_types.items():
|
||||
# add tuples to be searched in coercions
|
||||
self._compiled_preferred_types.append((re.compile(pattern), dt))
|
||||
for hint_name, hint_list in self._hints.items():
|
||||
# compile hints which are column matching regexes
|
||||
self._compiled_hints[hint_name] = list(map(lambda hint: re.compile(hint), hint_list))
|
||||
self._compiled_excludes = list(map(lambda exclude: re.compile(exclude), self._excludes))
|
||||
self._compiled_includes = list(map(lambda include: re.compile(include), self._includes))
|
||||
|
||||
@staticmethod
|
||||
def _verify_column(table_name: str, column_name: str, column: Column) -> None:
|
||||
existing_props = set(column.keys())
|
||||
missing_props = COLUMN_PROPS.difference(existing_props)
|
||||
if len(missing_props) > 0:
|
||||
raise SchemaCorruptedException(f"In table {table_name} column {column_name}: Column definition is missing following properties {missing_props}")
|
||||
data_type = column["data_type"]
|
||||
if data_type not in DATA_TYPES:
|
||||
raise SchemaCorruptedException(f"In table {table_name} column {column_name}: {data_type} is not one of available types: {DATA_TYPES}")
|
||||
for p, v in column.items():
|
||||
if p in COLUMN_HINTS and type(v) is not bool:
|
||||
raise SchemaCorruptedException(f"In table {table_name} column {column_name}: hint {p} is not boolean.")
|
||||
|
||||
@staticmethod
|
||||
def _upgrade_engine_version(schema_dict: StoredSchema, from_engine: int, to_engine: int) -> None:
|
||||
if from_engine == 1:
|
||||
schema_dict["engine_version"] = 2
|
||||
schema_dict["includes"] = []
|
||||
schema_dict["excludes"] = []
|
||||
from_engine = 2
|
||||
if from_engine == 2:
|
||||
pass
|
||||
if from_engine != to_engine:
|
||||
raise SchemaEngineNoUpgradePathException(schema_dict["name"], schema_dict["engine_version"], from_engine, to_engine)
|
||||
|
||||
@staticmethod
|
||||
def _add_missing_hints(column: ColumnBase) -> Column:
|
||||
return {
|
||||
**{ # type:ignore
|
||||
"partition": False,
|
||||
"cluster": False,
|
||||
"unique": False,
|
||||
"sort": False,
|
||||
"primary_key": False,
|
||||
"foreign_key": False,
|
||||
},
|
||||
**column
|
||||
}
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _py_type_to_sc_type(t: Type[Any]) -> DataType:
|
||||
if t is float:
|
||||
return "double"
|
||||
elif t is int:
|
||||
return "bigint"
|
||||
elif t is bool:
|
||||
return "bool"
|
||||
elif t is bytes:
|
||||
return "binary"
|
||||
elif t in [dict, list]:
|
||||
return "complex"
|
||||
elif t is Decimal:
|
||||
return "decimal"
|
||||
else:
|
||||
return "text"
|
||||
|
||||
@staticmethod
|
||||
def _coerce_type(to_type: DataType, from_type: DataType, value: Any) -> Any:
|
||||
if to_type == from_type:
|
||||
return value
|
||||
|
||||
if to_type == "text":
|
||||
if from_type == "complex":
|
||||
return json.dumps(value)
|
||||
else:
|
||||
return str(value)
|
||||
|
||||
if to_type == "binary":
|
||||
if from_type == "text":
|
||||
if value.startswith("0x"):
|
||||
return bytes.fromhex(value[2:])
|
||||
try:
|
||||
return base64.b64decode(value, validate=True)
|
||||
except binascii.Error:
|
||||
raise ValueError(value)
|
||||
if from_type == "bigint":
|
||||
return value.to_bytes((value.bit_length() + 7) // 8, 'little')
|
||||
|
||||
if to_type in ["wei", "bigint"]:
|
||||
if from_type == "bigint":
|
||||
return value
|
||||
if from_type in ["decimal", "double"]:
|
||||
if value % 1 != 0:
|
||||
# only integer decimals and floats can be coerced
|
||||
raise ValueError(value)
|
||||
return int(value)
|
||||
if from_type == "text":
|
||||
trim_value = value.strip()
|
||||
if trim_value.startswith("0x"):
|
||||
return int(trim_value[2:], 16)
|
||||
else:
|
||||
return int(trim_value)
|
||||
|
||||
if to_type == "double":
|
||||
if from_type in ["bigint", "wei", "decimal"]:
|
||||
return float(value)
|
||||
if from_type == "text":
|
||||
trim_value = value.strip()
|
||||
if trim_value.startswith("0x"):
|
||||
return float(int(trim_value[2:], 16))
|
||||
else:
|
||||
return float(trim_value)
|
||||
|
||||
if to_type == "decimal":
|
||||
if from_type in ["bigint", "wei"]:
|
||||
return value
|
||||
if from_type == "double":
|
||||
return Decimal(value)
|
||||
if from_type == "text":
|
||||
trim_value = value.strip()
|
||||
if trim_value.startswith("0x"):
|
||||
return int(trim_value[2:], 16)
|
||||
elif "." not in trim_value and "e" not in trim_value:
|
||||
return int(trim_value)
|
||||
else:
|
||||
try:
|
||||
return Decimal(trim_value)
|
||||
except ConversionSyntax:
|
||||
raise ValueError(trim_value)
|
||||
|
||||
if to_type == "timestamp":
|
||||
if from_type in ["bigint", "double"]:
|
||||
# returns ISO datetime with timezone
|
||||
return str(pendulum.from_timestamp(value))
|
||||
|
||||
if from_type == "text":
|
||||
# if parses as ISO date then pass it
|
||||
try:
|
||||
isoparse(value)
|
||||
return value
|
||||
except ValueError:
|
||||
# try to convert string to integer, or float
|
||||
try:
|
||||
value = int(value)
|
||||
except ValueError:
|
||||
# raises ValueError if not parsing correctly
|
||||
value = float(value)
|
||||
return str(pendulum.from_timestamp(value))
|
||||
|
||||
raise ValueError(value)
|
||||
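
A few coercions the rules above produce, called through the private helper purely for illustration:

from dlt.common.schema import Schema

assert Schema._coerce_type("bigint", "text", " 0x2a ") == 42
assert Schema._coerce_type("double", "bigint", 7) == 7.0
assert Schema._coerce_type("timestamp", "bigint", 0) == "1970-01-01T00:00:00+00:00"
print(Schema._coerce_type("text", "complex", {"a": 1}))  # JSON-encoded string
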
|
||||
@staticmethod
|
||||
def _compare_columns(a: Column, b: Column) -> bool:
|
||||
return a["data_type"] == b["data_type"] and a["nullable"] == b["nullable"]
|
||||
|
||||
@staticmethod
|
||||
def _hint_to_column_prop(h: HintType) -> ColumnProp:
|
||||
if h == "not_null":
|
||||
return "nullable"
|
||||
return h
|
||||
|
||||
class SchemaException(DltException):
|
||||
pass
|
||||
|
||||
|
||||
class CannotCoerceColumnException(SchemaException):
|
||||
def __init__(self, table_name: str, column_name: str, from_type: DataType, to_type: DataType, value: Any) -> None:
|
||||
super().__init__(f"Cannot coerce type in table {table_name} column {column_name} existing type {from_type} coerced type {to_type} value: {value}")
|
||||
|
||||
|
||||
class CannotCoerceNullException(SchemaException):
|
||||
def __init__(self, table_name: str, column_name: str) -> None:
|
||||
super().__init__(f"Cannot coerce NULL in table {table_name} column {column_name} which is not nullable")
|
||||
|
||||
|
||||
class InvalidTableNameException(SchemaException):
|
||||
def __init__(self, schema_name: str, table_name: str) -> None:
|
||||
self.schema_name = schema_name
|
||||
self.table_name = table_name
|
||||
super().__init__(f"All table names must start with '{schema_name}' so {table_name} is invalid")
|
||||
|
||||
class SchemaCorruptedException(SchemaException):
|
||||
pass
|
||||
|
||||
|
||||
class SchemaEngineNoUpgradePathException(SchemaException):
|
||||
def __init__(self, schema_name: str, init_engine: int, from_engine: int, to_engine: int) -> None:
|
||||
self.schema_name = schema_name
|
||||
self.init_engine = init_engine
|
||||
self.from_engine = from_engine
|
||||
self.to_engine = to_engine
|
||||
super().__init__(f"No engine upgrade path in schema {schema_name} from {init_engine} to {to_engine}, stopped at {from_engine}")
|
||||
35
dlt/common/signals.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import signal
|
||||
from threading import Event
|
||||
from typing import Any
|
||||
|
||||
from dlt.common import logger
|
||||
from dlt.common.exceptions import SignalReceivedException
|
||||
|
||||
_received_signal: int = 0
|
||||
exit_event = Event()
|
||||
|
||||
|
||||
def signal_receiver(signal: int, frame: Any) -> None:
|
||||
global _received_signal
|
||||
|
||||
logger.info(f"Signal {signal} received")
|
||||
|
||||
if _received_signal > 0:
|
||||
logger.info(f"Another signal received after {_received_signal}")
|
||||
return
|
||||
|
||||
_received_signal = signal
|
||||
# awake all threads sleeping on event
|
||||
exit_event.set()
|
||||
|
||||
logger.info(f"Sleeping threads signalled")
|
||||
|
||||
|
||||
def raise_if_signalled() -> None:
|
||||
if _received_signal:
|
||||
raise SignalReceivedException(_received_signal)
|
||||
|
||||
|
||||
def register_signals() -> None:
|
||||
signal.signal(signal.SIGINT, signal_receiver)
|
||||
signal.signal(signal.SIGTERM, signal_receiver)
|
||||
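
A hedged sketch of how a long-running loop cooperates with this module (the 5 second wait is arbitrary):

from dlt.common import signals
from dlt.common.exceptions import SignalReceivedException

signals.register_signals()            # install the SIGINT/SIGTERM handlers
try:
    while True:
        signals.exit_event.wait(5.0)  # a sleep that wakes up early when a signal arrives
        signals.raise_if_signalled()  # convert a received signal into an exception
        # ... do one unit of work here ...
except SignalReceivedException as sig_ex:
    print(f"terminating on signal {sig_ex.signal_code}")
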
1
dlt/common/storages/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .schema_storage import SchemaStorage # noqa: F401
|
||||
23
dlt/common/storages/exceptions.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import semver
|
||||
from dlt.common.exceptions import DltException
|
||||
|
||||
class StorageException(DltException):
|
||||
def __init__(self, msg: str) -> None:
|
||||
super().__init__(msg)
|
||||
|
||||
|
||||
class NoMigrationPathException(StorageException):
|
||||
def __init__(self, storage_path: str, initial_version: semver.VersionInfo, migrated_version: semver.VersionInfo, target_version: semver.VersionInfo) -> None:
|
||||
self.storage_path = storage_path
|
||||
self.initial_version = initial_version
|
||||
self.migrated_version = migrated_version
|
||||
self.target_version = target_version
|
||||
super().__init__(f"Could not find migration path for {storage_path} from v {initial_version} to {target_version}, stopped at {migrated_version}")
|
||||
|
||||
|
||||
class WrongStorageVersionException(StorageException):
|
||||
def __init__(self, storage_path: str, initial_version: semver.VersionInfo, target_version: semver.VersionInfo) -> None:
|
||||
self.storage_path = storage_path
|
||||
self.initial_version = initial_version
|
||||
self.target_version = target_version
|
||||
super().__init__(f"Expected storage {storage_path} with v {target_version} but found {initial_version}")
|
||||
181
dlt/common/storages/loader_storage.py
Normal file
@@ -0,0 +1,181 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Literal, Optional, Sequence, Tuple, Type
|
||||
|
||||
from dlt.common import json, pendulum
|
||||
from dlt.common.file_storage import FileStorage
|
||||
from dlt.common.dataset_writers import TWriterType, write_jsonl, write_insert_values
|
||||
from dlt.common.configuration import LoadingVolumeConfiguration
|
||||
from dlt.common.exceptions import TerminalValueError
|
||||
from dlt.common.schema import SchemaUpdate, Table
|
||||
from dlt.common.storages.versioned_storage import VersionedStorage
|
||||
from dlt.common.typing import StrAny
|
||||
|
||||
from dlt.common.storages.exceptions import StorageException
|
||||
|
||||
|
||||
# folders to manage load jobs in a single load package
|
||||
TWorkingFolder = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"]
|
||||
|
||||
class LoaderStorage(VersionedStorage):
|
||||
|
||||
STORAGE_VERSION = "1.0.0"
|
||||
LOADING_FOLDER = "loading" # folder within the volume where load packages are stored
|
||||
LOADED_FOLDER = "loaded" # folder to keep the loads that were completely processed
|
||||
|
||||
NEW_JOBS_FOLDER: TWorkingFolder = "new_jobs"
|
||||
FAILED_JOBS_FOLDER: TWorkingFolder = "failed_jobs"
|
||||
STARTED_JOBS_FOLDER: TWorkingFolder = "started_jobs"
|
||||
COMPLETED_JOBS_FOLDER: TWorkingFolder = "completed_jobs"
|
||||
|
||||
LOAD_SCHEMA_UPDATE_FILE_NAME = "schema_updates.json"
|
||||
|
||||
SUPPORTED_WRITERS: List[TWriterType] = ["jsonl", "insert_values"]
|
||||
|
||||
def __init__(self, is_owner: bool, C: Type[LoadingVolumeConfiguration], writer_type: TWriterType) -> None:
|
||||
if writer_type not in LoaderStorage.SUPPORTED_WRITERS:
|
||||
raise TerminalValueError(writer_type)
|
||||
self.writer_type = writer_type
|
||||
self.delete_completed_jobs = C.DELETE_COMPLETED_JOBS
|
||||
super().__init__(LoaderStorage.STORAGE_VERSION, is_owner, FileStorage(C.LOADING_VOLUME_PATH, "t", makedirs=is_owner))
|
||||
|
||||
def initialize_storage(self) -> None:
|
||||
self.storage.create_folder(LoaderStorage.LOADED_FOLDER, exists_ok=True)
|
||||
self.storage.create_folder(LoaderStorage.LOADING_FOLDER, exists_ok=True)
|
||||
|
||||
def create_temp_load_folder(self, load_id: str) -> None:
|
||||
# delete previous version
|
||||
if self.storage.has_folder(load_id):
|
||||
self.storage.delete_folder(load_id, recursively=True)
|
||||
self.storage.create_folder(load_id)
|
||||
# create processing directories
|
||||
self.storage.create_folder(f"{load_id}/{LoaderStorage.NEW_JOBS_FOLDER}")
|
||||
self.storage.create_folder(f"{load_id}/{LoaderStorage.COMPLETED_JOBS_FOLDER}")
|
||||
self.storage.create_folder(f"{load_id}/{LoaderStorage.FAILED_JOBS_FOLDER}")
|
||||
self.storage.create_folder(f"{load_id}/{LoaderStorage.STARTED_JOBS_FOLDER}")
|
||||
|
||||
def write_temp_loading_file(self, load_id: str, table_name: str, table: Table, file_id: str, rows: Sequence[StrAny]) -> str:
|
||||
file_name = self.build_loading_file_name(load_id, table_name, file_id)
|
||||
with self.storage.open(file_name, mode = "w") as f:
|
||||
if self.writer_type == "jsonl":
|
||||
write_jsonl(f, rows)
|
||||
elif self.writer_type == "insert_values":
|
||||
write_insert_values(f, rows, table.keys())
|
||||
return Path(file_name).name
|
||||
|
||||
def save_schema_updates(self, load_id: str, schema_updates: Sequence[SchemaUpdate]) -> None:
|
||||
with self.storage.open(f"{load_id}/{LoaderStorage.LOAD_SCHEMA_UPDATE_FILE_NAME}", mode="w") as f:
|
||||
json.dump(schema_updates, f)
|
||||
|
||||
def commit_temp_load_folder(self, load_id: str) -> None:
|
||||
self.storage.atomic_rename(load_id, self.get_load_path(load_id))
|
||||
|
||||
def list_loads(self) -> Sequence[str]:
|
||||
loads = self.storage.list_folder_dirs(LoaderStorage.LOADING_FOLDER, to_root=False)
|
||||
# start from the oldest packages
|
||||
return sorted(loads)
|
||||
|
||||
def list_completed_loads(self) -> Sequence[str]:
|
||||
loads = self.storage.list_folder_dirs(LoaderStorage.LOADED_FOLDER, to_root=False)
|
||||
# start from the oldest packages
|
||||
return sorted(loads)
|
||||
|
||||
def list_new_jobs(self, load_id: str) -> Sequence[str]:
|
||||
new_jobs = self.storage.list_folder_files(f"{self.get_load_path(load_id)}/{LoaderStorage.NEW_JOBS_FOLDER}")
|
||||
# make sure all jobs have supported writers
|
||||
wrong_job = next((j for j in new_jobs if LoaderStorage.parse_load_file_name(j)[1] != self.writer_type), None)
|
||||
if wrong_job is not None:
|
||||
raise JobWithUnsupportedWriterException(load_id, self.writer_type, wrong_job)
|
||||
return new_jobs
|
||||
|
||||
def list_started_jobs(self, load_id: str) -> Sequence[str]:
|
||||
return self.storage.list_folder_files(f"{self.get_load_path(load_id)}/{LoaderStorage.STARTED_JOBS_FOLDER}")
|
||||
|
||||
def list_failed_jobs(self, load_id: str) -> Sequence[str]:
|
||||
return self.storage.list_folder_files(f"{self.get_load_path(load_id)}/{LoaderStorage.FAILED_JOBS_FOLDER}")
|
||||
|
||||
def list_archived_failed_jobs(self, load_id: str) -> Sequence[str]:
|
||||
return self.storage.list_folder_files(f"{self.get_archived_path(load_id)}/{LoaderStorage.FAILED_JOBS_FOLDER}")
|
||||
|
||||
def begin_schema_update(self, load_id: str) -> Optional[SchemaUpdate]:
|
||||
schema_update_file = f"{self.get_load_path(load_id)}/{LoaderStorage.LOAD_SCHEMA_UPDATE_FILE_NAME}"
|
||||
if self.storage.has_file(schema_update_file):
|
||||
schema_update: SchemaUpdate = json.loads(self.storage.load(schema_update_file))
|
||||
return schema_update
|
||||
else:
|
||||
return None
|
||||
|
||||
def commit_schema_update(self, load_id: str) -> None:
|
||||
load_path = self.get_load_path(load_id)
|
||||
schema_update_file = f"{load_path}/{LoaderStorage.LOAD_SCHEMA_UPDATE_FILE_NAME}"
|
||||
self.storage.atomic_rename(schema_update_file, f"{load_path}/{LoaderStorage.COMPLETED_JOBS_FOLDER}/{LoaderStorage.LOAD_SCHEMA_UPDATE_FILE_NAME}")
|
||||
|
||||
def start_job(self, load_id: str, file_name: str) -> str:
|
||||
return self._move_file(load_id, LoaderStorage.NEW_JOBS_FOLDER, LoaderStorage.STARTED_JOBS_FOLDER, file_name)
|
||||
|
||||
def fail_job(self, load_id: str, file_name: str, failed_message: Optional[str]) -> str:
|
||||
load_path = self.get_load_path(load_id)
|
||||
if failed_message:
|
||||
self.storage.save(f"{load_path}/{LoaderStorage.FAILED_JOBS_FOLDER}/{file_name}.exception", failed_message)
|
||||
# move to failed jobs
|
||||
return self._move_file(load_id, LoaderStorage.STARTED_JOBS_FOLDER, LoaderStorage.FAILED_JOBS_FOLDER, file_name)
|
||||
|
||||
def retry_job(self, load_id: str, file_name: str) -> str:
|
||||
return self._move_file(load_id, LoaderStorage.STARTED_JOBS_FOLDER, LoaderStorage.NEW_JOBS_FOLDER, file_name)
|
||||
|
||||
def complete_job(self, load_id: str, file_name: str) -> str:
|
||||
return self._move_file(load_id, LoaderStorage.STARTED_JOBS_FOLDER, LoaderStorage.COMPLETED_JOBS_FOLDER, file_name)
|
||||
|
||||
def archive_load(self, load_id: str) -> None:
|
||||
load_path = self.get_load_path(load_id)
|
||||
has_failed_jobs = len(self.list_failed_jobs(load_id)) > 0
|
||||
# delete load that does not contain failed jobs
|
||||
if self.delete_completed_jobs and not has_failed_jobs:
|
||||
self.storage.delete_folder(load_path, recursively=True)
|
||||
else:
|
||||
archive_path = self.get_archived_path(load_id)
|
||||
self.storage.atomic_rename(load_path, archive_path)
|
||||
|
||||
def get_load_path(self, load_id: str) -> str:
|
||||
return f"{LoaderStorage.LOADING_FOLDER}/{load_id}"
|
||||
|
||||
def get_archived_path(self, load_id: str) -> str:
|
||||
return f"{LoaderStorage.LOADED_FOLDER}/{load_id}"
|
||||
|
||||
def build_loading_file_name(self, load_id: str, table_name: str, file_id: str) -> str:
|
||||
file_name = f"{table_name}.{file_id}.{self.writer_type}"
|
||||
return f"{load_id}/{LoaderStorage.NEW_JOBS_FOLDER}/{file_name}"
|
||||
|
||||
def _move_file(self, load_id: str, source_folder: TWorkingFolder, dest_folder: TWorkingFolder, file_name: str) -> str:
|
||||
load_path = self.get_load_path(load_id)
|
||||
dest_path = f"{load_path}/{dest_folder}/{file_name}"
|
||||
self.storage.atomic_rename(f"{load_path}/{source_folder}/{file_name}", dest_path)
|
||||
return self.storage._make_path(dest_path)
|
||||
|
||||
def job_elapsed_time_seconds(self, file_path: str) -> float:
|
||||
return pendulum.now().timestamp() - os.path.getmtime(file_path) # type: ignore
|
||||
|
||||
def _get_file_path(self, load_id: str, folder: TWorkingFolder, file_name: str) -> str:
|
||||
load_path = self.get_load_path(load_id)
|
||||
return f"{load_path}/{folder}/{file_name}"
|
||||
|
||||
@staticmethod
|
||||
def parse_load_file_name(file_name: str) -> Tuple[str, TWriterType]:
|
||||
p = Path(file_name)
|
||||
ext: TWriterType = p.suffix[1:] # type: ignore
|
||||
if ext not in LoaderStorage.SUPPORTED_WRITERS:
|
||||
raise TerminalValueError(ext)
|
||||
|
||||
parts = p.stem.split(".")
|
||||
return (parts[0], ext)
|
||||
|
||||
|
||||
class LoaderStorageException(StorageException):
|
||||
pass
|
||||
|
||||
|
||||
class JobWithUnsupportedWriterException(LoaderStorageException):
|
||||
def __init__(self, load_id: str, expected_writer_type: TWriterType, wrong_job: str) -> None:
|
||||
self.load_id = load_id
|
||||
self.expected_writer_type = expected_writer_type
|
||||
self.wrong_job = wrong_job
|
||||
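
A hedged sketch of the load package lifecycle; it assumes `LoadingVolumeConfiguration` provides usable defaults for `LOADING_VOLUME_PATH` and `DELETE_COMPLETED_JOBS`, and the load id and rows are invented:

from dlt.common.configuration import LoadingVolumeConfiguration
from dlt.common.storages.loader_storage import LoaderStorage

storage = LoaderStorage(is_owner=True, C=LoadingVolumeConfiguration, writer_type="jsonl")
storage.initialize_storage()

load_id = "1650000000.123456"
storage.create_temp_load_folder(load_id)
job_file = storage.write_temp_loading_file(load_id, "events", {}, "0001", [{"id": 1}])
storage.save_schema_updates(load_id, [])
storage.commit_temp_load_folder(load_id)      # makes the package visible under "loading"

print(storage.list_loads())                   # the committed load id shows up here
storage.start_job(load_id, job_file)
storage.complete_job(load_id, job_file)
storage.archive_load(load_id)                 # deleted or moved to "loaded", per configuration
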
49
dlt/common/storages/schema_storage.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from dlt.common import json
|
||||
from dlt.common.file_storage import FileStorage
|
||||
from dlt.common.schema import Schema, StoredSchema
|
||||
|
||||
|
||||
class SchemaStorage:
|
||||
|
||||
STORE_SCHEMA_FILE_PATTERN = "%s_schema.json"
|
||||
FOLDER_SCHEMA_FILE = "schema.json"
|
||||
|
||||
def __init__(self, schema_storage_root: str, makedirs: bool = False) -> None:
|
||||
self.storage = FileStorage(schema_storage_root, makedirs=makedirs)
|
||||
|
||||
def load_store_schema(self, name: str) -> Schema:
|
||||
# loads a schema from a store holding many schemas
|
||||
schema_file = self._get_file_by_name(name)
|
||||
stored_schema: StoredSchema = json.loads(self.storage.load(schema_file))
|
||||
return Schema.from_dict(stored_schema)
|
||||
|
||||
def load_folder_schema(self, from_folder: str) -> Schema:
|
||||
# loads schema from a folder containing one default schema
|
||||
schema_path = self._get_file_in_folder(from_folder)
|
||||
stored_schema: StoredSchema = json.loads(self.storage.load(schema_path))
|
||||
return Schema.from_dict(stored_schema)
|
||||
|
||||
def save_store_schema(self, schema: Schema) -> str:
|
||||
# save a schema to schema store
|
||||
dump = json.dumps(schema.to_dict(), indent=2)
|
||||
schema_file = self._get_file_by_name(schema.schema_name)
|
||||
return self.storage.save(schema_file, dump)
|
||||
|
||||
def save_folder_schema(self, schema: Schema, in_folder: str) -> str:
|
||||
# save a schema to a folder holding one schema
|
||||
dump = json.dumps(schema.to_dict())
|
||||
schema_file = self._get_file_in_folder(in_folder)
|
||||
return self.storage.save(schema_file, dump)
|
||||
|
||||
def has_store_schema(self, name: str) -> bool:
|
||||
schema_file = self._get_file_by_name(name)
|
||||
return self.storage.has_file(schema_file)
|
||||
|
||||
def _get_file_by_name(self, name: str) -> str:
|
||||
return SchemaStorage.STORE_SCHEMA_FILE_PATTERN % name
|
||||
|
||||
def _get_file_in_folder(self, folder: str) -> str:
|
||||
return os.path.join(folder, SchemaStorage.FOLDER_SCHEMA_FILE) # if folder is None else os.path.join(folder, SchemaStorage.SCHEMA_FILE)
|
||||
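A minimal usage sketch of the two modes above (a store holding many schemas vs. a folder holding a single schema.json), assuming Schema can be constructed from a schema name; the storage root is hypothetical:

    from dlt.common.schema import Schema
    from dlt.common.storages.schema_storage import SchemaStorage

    storage = SchemaStorage("_storage/schemas", makedirs=True)  # hypothetical root
    schema = Schema("event")  # assumption: Schema accepts a name in its constructor

    # stored as "event_schema.json" via STORE_SCHEMA_FILE_PATTERN
    storage.save_store_schema(schema)
    assert storage.has_store_schema("event")
    loaded = storage.load_store_schema("event")

    # the folder variant always writes a fixed "schema.json" inside the given folder
    storage.save_folder_schema(schema, in_folder=".")
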
73
dlt/common/storages/unpacker_storage.py
Normal file
73
dlt/common/storages/unpacker_storage.py
Normal file
@@ -0,0 +1,73 @@
|
||||
from typing import List, Sequence, Tuple, Type
|
||||
from itertools import groupby
|
||||
from pathlib import Path
|
||||
|
||||
from dlt.common.utils import chunks
|
||||
from dlt.common.file_storage import FileStorage
|
||||
from dlt.common.configuration import UnpackingVolumeConfiguration
|
||||
from dlt.common.storages.versioned_storage import VersionedStorage
|
||||
|
||||
|
||||
class UnpackerStorage(VersionedStorage):
|
||||
|
||||
STORAGE_VERSION = "1.0.0"
|
||||
UNPACKING_FOLDER: str = "unpacking" # folder within the volume where files to be unpacked are stored
|
||||
UNPACK_FILE_EXTENSION = ".unpack.json"
|
||||
UNPACK_FILE_EXTENSION_LEN = len(UNPACK_FILE_EXTENSION)
|
||||
|
||||
def __init__(self, is_owner: bool, C: Type[UnpackingVolumeConfiguration]) -> None:
|
||||
super().__init__(UnpackerStorage.STORAGE_VERSION, is_owner, FileStorage(C.UNPACKING_VOLUME_PATH, "t", makedirs=is_owner))
|
||||
|
||||
def initialize_storage(self) -> None:
|
||||
self.storage.create_folder(UnpackerStorage.UNPACKING_FOLDER, exists_ok=True)
|
||||
|
||||
def list_files_to_unpack_sorted(self) -> Sequence[str]:
|
||||
return sorted(self.storage.list_folder_files(UnpackerStorage.UNPACKING_FOLDER))
|
||||
|
||||
def get_grouped_iterator(self, files: Sequence[str]) -> "groupby[str, str]":
|
||||
return groupby(files, lambda f: UnpackerStorage.get_schema_name(f))
|
||||
|
||||
@staticmethod
|
||||
def chunk_by_events(files: Sequence[str], max_events: int, processing_cores: int) -> List[Sequence[str]]:
|
||||
# should distribute ~ N events evenly among m cores with fallback for small amounts of events
|
||||
|
||||
        def count_events(file_name: str) -> int:
|
||||
# return event count from file name
|
||||
return UnpackerStorage.get_events_count(file_name)
|
||||
|
||||
counts = list(map(count_events, files))
|
||||
# make a list of files containing ~max_events
|
||||
events_count = 0
|
||||
m = 0
|
||||
while events_count < max_events and m < len(files):
|
||||
events_count += counts[m]
|
||||
m += 1
|
||||
processing_chunks = round(m / processing_cores)
|
||||
if processing_chunks == 0:
|
||||
# return one small chunk
|
||||
return [files]
|
||||
else:
|
||||
# should return ~ amount of chunks to fill all the cores
|
||||
return list(chunks(files[:m], processing_chunks))
|
||||
|
||||
@staticmethod
|
||||
def get_events_count(file_name: str) -> int:
|
||||
return UnpackerStorage._parse_unpack_file_name(file_name)[0]
|
||||
|
||||
@staticmethod
|
||||
def get_schema_name(file_name: str) -> str:
|
||||
return UnpackerStorage._parse_unpack_file_name(file_name)[2]
|
||||
|
||||
@staticmethod
|
||||
def build_unpack_file_name(schema_name: str, stem: str, event_count: int, load_id: str) -> str:
|
||||
# builds file name of the unpack file for the tracker
|
||||
return f"{schema_name}_{stem}_{load_id}_{event_count}{UnpackerStorage.UNPACK_FILE_EXTENSION}"
|
||||
|
||||
@staticmethod
|
||||
def _parse_unpack_file_name(file_name: str) -> Tuple[int, str, str]:
|
||||
        # parses the unpack tracker file name and returns (events found, load id, schema name)
|
||||
if not file_name.endswith(UnpackerStorage.UNPACK_FILE_EXTENSION):
|
||||
raise ValueError(file_name)
|
||||
|
||||
parts = Path(file_name[:-UnpackerStorage.UNPACK_FILE_EXTENSION_LEN]).stem.split("_")
|
||||
return (int(parts[-1]), parts[-2], parts[0])
|
||||
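The tracker file name format above encodes schema name, stem, load id and event count; a short sketch of the round trip and of the small-input fallback in chunk_by_events (values hypothetical):

    from dlt.common.storages.unpacker_storage import UnpackerStorage

    name = UnpackerStorage.build_unpack_file_name("event", "queue", event_count=120, load_id="1659110680")
    # -> "event_queue_1659110680_120.unpack.json"
    assert UnpackerStorage.get_events_count(name) == 120
    assert UnpackerStorage.get_schema_name(name) == "event"

    # with few events, chunk_by_events falls back to a single chunk containing all files
    assert UnpackerStorage.chunk_by_events([name], max_events=1000, processing_cores=4) == [[name]]
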
54
dlt/common/storages/versioned_storage.py
Normal file
54
dlt/common/storages/versioned_storage.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import semver
|
||||
|
||||
from dlt.common.file_storage import FileStorage
|
||||
from dlt.common.storages.exceptions import NoMigrationPathException, WrongStorageVersionException
|
||||
|
||||
|
||||
class VersionedStorage:
|
||||
|
||||
VERSION_FILE = ".version"
|
||||
|
||||
def __init__(self, version: semver.VersionInfo, is_owner: bool, storage: FileStorage) -> None:
|
||||
self.storage = storage
|
||||
# read current version
|
||||
if self.storage.has_file(VersionedStorage.VERSION_FILE):
|
||||
existing_version = self._load_version()
|
||||
if existing_version != version:
|
||||
if existing_version > version:
|
||||
# version cannot be downgraded
|
||||
raise NoMigrationPathException(storage.storage_path, existing_version, existing_version, version)
|
||||
if is_owner:
|
||||
# only owner can migrate storage
|
||||
self.migrate_storage(existing_version, version)
|
||||
# storage should be migrated to desired version
|
||||
migrated_version = self._load_version()
|
||||
if version != migrated_version:
|
||||
raise NoMigrationPathException(storage.storage_path, existing_version, migrated_version, version)
|
||||
else:
|
||||
# we cannot use storage and we must wait for owner to upgrade it
|
||||
raise WrongStorageVersionException(storage.storage_path, existing_version, version)
|
||||
else:
|
||||
if is_owner:
|
||||
self._save_version(version)
|
||||
else:
|
||||
raise WrongStorageVersionException(storage.storage_path, semver.VersionInfo.parse("0.0.0"), version)
|
||||
|
||||
def migrate_storage(self, from_version: semver.VersionInfo, to_version: semver.VersionInfo) -> None:
|
||||
# migration example:
|
||||
# # semver lib supports comparing both to string and other semvers
|
||||
# if from_version == "1.0.0" and from_version < to_version:
|
||||
# # do migration
|
||||
# # save migrated version
|
||||
# from_version = semver.VersionInfo.parse("1.1.0")
|
||||
# self._save_version(from_version)
|
||||
pass
|
||||
|
||||
@property
|
||||
def version(self) -> semver.VersionInfo:
|
||||
return self._load_version()
|
||||
|
||||
def _load_version(self) -> semver.VersionInfo:
|
||||
        return semver.VersionInfo.parse(self.storage.load(VersionedStorage.VERSION_FILE))
|
||||
|
||||
def _save_version(self, version: semver.VersionInfo) -> None:
|
||||
self.storage.save(VersionedStorage.VERSION_FILE, str(version))
|
||||
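The commented example in migrate_storage above can be made concrete; a hedged sketch of a subclass that knows how to migrate from 1.0.0 to 1.1.0 (the storage path and migration body are hypothetical):

    import semver

    from dlt.common.file_storage import FileStorage
    from dlt.common.storages.versioned_storage import VersionedStorage


    class MyStorage(VersionedStorage):
        STORAGE_VERSION = semver.VersionInfo.parse("1.1.0")

        def __init__(self, is_owner: bool) -> None:
            super().__init__(MyStorage.STORAGE_VERSION, is_owner, FileStorage("_storage/my", makedirs=is_owner))

        def migrate_storage(self, from_version: semver.VersionInfo, to_version: semver.VersionInfo) -> None:
            # semver supports comparing to plain strings
            if from_version == "1.0.0" and from_version < to_version:
                # ... move files / rewrite layout here ...
                from_version = semver.VersionInfo.parse("1.1.0")
                self._save_version(from_version)
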
65
dlt/common/telemetry.py
Normal file
65
dlt/common/telemetry.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from typing import Iterable, Sequence, TypedDict, NamedTuple
|
||||
from prometheus_client import Gauge
|
||||
from prometheus_client.metrics import MetricWrapperBase
|
||||
|
||||
from dlt.common.typing import DictStrAny, StrAny
|
||||
|
||||
|
||||
class TRunHealth(TypedDict):
|
||||
# count runs
|
||||
runs_count: int
|
||||
# count not idle runs
|
||||
runs_not_idle_count: int
|
||||
    # count successful runs
|
||||
runs_healthy_count: int
|
||||
# count consecutive successful runs
|
||||
runs_cs_healthy_gauge: int
|
||||
# count failed runs
|
||||
runs_failed_count: int
|
||||
# count consecutive failed runs
|
||||
runs_cs_failed_gauge: int
|
||||
# number of items pending at the end of the run
|
||||
runs_pending_items_gauge: int
|
||||
|
||||
|
||||
class TRunMetrics(NamedTuple):
|
||||
was_idle: bool
|
||||
has_failed: bool
|
||||
pending_items: int
|
||||
|
||||
|
||||
def get_metrics_from_prometheus(gauges: Iterable[MetricWrapperBase]) -> StrAny:
|
||||
metrics: DictStrAny = {}
|
||||
for g in gauges:
|
||||
name = g._name
|
||||
if g._is_parent():
|
||||
# for gauges containing many label values, enumerate all
|
||||
metrics.update(get_metrics_from_prometheus([g.labels(*l) for l in g._metrics.keys()]))
|
||||
continue
|
||||
# for gauges with labels: add the label to the name and enumerate samples
|
||||
if g._labelvalues:
|
||||
name += "_" + "_".join(g._labelvalues)
|
||||
for m in g._child_samples():
|
||||
k = name
|
||||
if m[0] == "_created":
|
||||
continue
|
||||
if m[0] != "_total":
|
||||
k += m[0]
|
||||
if g._type == "info":
|
||||
# actual descriptive value is held in [1], [2] is a placeholder in info
|
||||
metrics[k] = m[1]
|
||||
else:
|
||||
metrics[k] = m[2]
|
||||
return metrics
|
||||
|
||||
|
||||
def set_gauge_all_labels(gauge: Gauge, value: float) -> None:
|
||||
if gauge._is_parent():
|
||||
for l in gauge._metrics.keys():
|
||||
set_gauge_all_labels(gauge.labels(*l), value)
|
||||
else:
|
||||
gauge.set(value)
|
||||
|
||||
|
||||
def get_logging_extras(gauges: Iterable[MetricWrapperBase]) -> StrAny:
|
||||
return {"metrics": get_metrics_from_prometheus(gauges)}
|
||||
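A short sketch of how the helpers above combine with prometheus_client gauges to attach metrics to a log line; the gauge and label names are hypothetical:

    from prometheus_client import CollectorRegistry, Gauge

    from dlt.common.telemetry import get_logging_extras, get_metrics_from_prometheus, set_gauge_all_labels

    registry = CollectorRegistry()
    health_gauge = Gauge("runs_health", "Health per pipeline", ["pipeline"], registry=registry)

    health_gauge.labels("events").set(1)
    health_gauge.labels("metadata").set(0)

    # parent gauge is flattened per label value: {"runs_health_events": 1.0, "runs_health_metadata": 0.0}
    metrics = get_metrics_from_prometheus([health_gauge])

    # reset every label combination at once
    set_gauge_all_labels(health_gauge, 0)

    # ready to pass to the logger: {"metrics": {...}}
    extras = get_logging_extras([health_gauge])
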
30
dlt/common/time.py
Normal file
30
dlt/common/time.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from typing import Optional # noqa
|
||||
|
||||
from dlt.common import signals
|
||||
|
||||
PAST_TIMESTAMP: float = 0.0
|
||||
FUTURE_TIMESTAMP: float = 9999999999.0
|
||||
DAY_DURATION_SEC: float = 24 * 60 * 60.0
|
||||
|
||||
|
||||
def timestamp_within(timestamp: float, min_exclusive: Optional[float], max_inclusive: Optional[float]) -> bool:
|
||||
"""
|
||||
    check if timestamp is within the range, treating None bounds uniformly; min is exclusive, max is inclusive
|
||||
"""
|
||||
return timestamp > (min_exclusive or PAST_TIMESTAMP) and timestamp <= (max_inclusive or FUTURE_TIMESTAMP)
|
||||
|
||||
|
||||
def timestamp_before(timestamp: float, max_inclusive: Optional[float]) -> bool:
|
||||
"""
|
||||
check if timestamp is before max timestamp, inclusive
|
||||
"""
|
||||
return timestamp <= (max_inclusive or FUTURE_TIMESTAMP)
|
||||
|
||||
|
||||
def sleep(sleep_seconds: float) -> None:
|
||||
# do not allow sleeping if signal was received
|
||||
signals.raise_if_signalled()
|
||||
# sleep or wait for signal
|
||||
signals.exit_event.wait(sleep_seconds)
|
||||
# if signal then raise
|
||||
signals.raise_if_signalled()
|
||||
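The range helpers above treat None bounds as open-ended, the minimum as exclusive and the maximum as inclusive; for example:

    from dlt.common.time import timestamp_before, timestamp_within

    assert timestamp_within(1658300000.0, None, None)   # no bounds: always within
    assert not timestamp_within(10.0, 10.0, 20.0)       # min is exclusive
    assert timestamp_within(20.0, 10.0, 20.0)           # max is inclusive
    assert timestamp_before(15.0, None)                 # no upper bound
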
32
dlt/common/typing.py
Normal file
32
dlt/common/typing.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from typing import Dict, Any, List, Literal, Mapping, Sequence, TypedDict, Optional, Union
|
||||
|
||||
DictStrAny = Dict[str, Any]
|
||||
DictStrStr = Dict[str, str]
|
||||
StrAny = Mapping[str, Any] # immutable, covariant entity
|
||||
StrStr = Mapping[str, str] # immutable, covariant entity
|
||||
StrStrStr = Mapping[str, Mapping[str, str]] # immutable, covariant entity
|
||||
|
||||
class TEventRow(TypedDict, total=False):
|
||||
_timestamp: float # used for partitioning
|
||||
_dist_key: str # distribution key used for clustering
|
||||
_record_hash: str # unique id of current row
|
||||
_root_hash: str # unique id of top level parent
|
||||
|
||||
class TEventRowRoot(TEventRow, total=False):
|
||||
    _load_id: str # load id to identify records loaded together that e.g. need to be processed together
|
||||
_event_json: str # dump of the original event
|
||||
_event_type: str # sets event type which will be translated to table
|
||||
|
||||
|
||||
class TEventRowChild(TEventRow, total=False):
|
||||
_parent_hash: str # unique id of parent row
|
||||
_pos: int # position in the list of rows
|
||||
value: Any # for lists of simple types
|
||||
|
||||
|
||||
class TEvent(TypedDict, total=False):
|
||||
pass
|
||||
|
||||
|
||||
class TTimestampEvent(TEvent, total=False):
|
||||
timestamp: float # timestamp of event
|
||||
117
dlt/common/utils.py
Normal file
117
dlt/common/utils.py
Normal file
@@ -0,0 +1,117 @@
|
||||
import hashlib
|
||||
from os import environ
|
||||
from uuid import uuid4
|
||||
from typing import Any, Iterator, Sequence, TypeVar, Mapping, List, Union
|
||||
|
||||
from dlt.common.typing import StrAny, DictStrAny, StrStr
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
def chunks(list: Sequence[T], n: int) -> Iterator[Sequence[T]]:
|
||||
for i in range(0, len(list), n):
|
||||
yield list[i:i + n]
|
||||
|
||||
|
||||
def uniq_id() -> str:
|
||||
return uuid4().hex
|
||||
|
||||
|
||||
def digest128(v: str) -> str:
|
||||
return hashlib.shake_128(v.encode("utf-8")).hexdigest(16)
|
||||
|
||||
|
||||
def str2bool(v: str) -> bool:
|
||||
if isinstance(v, bool):
|
||||
return v
|
||||
if v.lower() in ('yes', 'true', 't', 'y', '1'):
|
||||
return True
|
||||
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
|
||||
return False
|
||||
else:
|
||||
raise ValueError('Boolean value expected.')
|
||||
|
||||
|
||||
def flatten_list_of_dicts(dicts: Sequence[StrAny]) -> StrAny:
|
||||
"""
|
||||
Transforms a list of objects [{K: {...}}, {L: {....}}, ...] -> {K: {...}, L: {...}...}
|
||||
"""
|
||||
o: DictStrAny = {}
|
||||
for d in dicts:
|
||||
for k,v in d.items():
|
||||
if k in o:
|
||||
raise KeyError(f"Cannot flatten with duplicate key {k}")
|
||||
o[k] = v
|
||||
return o
|
||||
|
||||
|
||||
def flatten_list_of_str_or_dicts(seq: Sequence[Union[StrAny, str]]) -> StrAny:
|
||||
"""
|
||||
Transforms a list of objects or strings [{K: {...}}, L, ...] -> {K: {...}, L: None, ...}
|
||||
"""
|
||||
o: DictStrAny = {}
|
||||
for e in seq:
|
||||
if type(e) is dict:
|
||||
for k,v in e.items():
|
||||
if k in o:
|
||||
raise KeyError(f"Cannot flatten with duplicate key {k}")
|
||||
o[k] = v
|
||||
else:
|
||||
key = str(e)
|
||||
if key in o:
|
||||
raise KeyError(f"Cannot flatten with duplicate key {k}")
|
||||
o[key] = None
|
||||
return o
|
||||
|
||||
|
||||
def flatten_dicts_of_dicts(dicts: Mapping[str, Any]) -> Sequence[Any]:
|
||||
"""
|
||||
    Transforms an object {K: {...}, L: {...}...} -> [{key:K, ....}, {key: L, ...}, ...]
|
||||
"""
|
||||
o: List[Any] = []
|
||||
for k, v in dicts.items():
|
||||
if type(v) is list:
|
||||
# if v is a list then add "key" to each list element
|
||||
for lv in v:
|
||||
lv["key"] = k
|
||||
else:
|
||||
# add as "key" to dict
|
||||
v["key"] = k
|
||||
|
||||
o.append(v)
|
||||
return o
|
||||
|
||||
|
||||
def tuplify_list_of_dicts(dicts: Sequence[DictStrAny]) -> Sequence[DictStrAny]:
|
||||
"""
|
||||
Transform dicts with single key into {"key": orig_key, "value": orig_value}
|
||||
"""
|
||||
for d in dicts:
|
||||
if len(d) > 1:
|
||||
raise ValueError(f"Tuplify requires one key dicts {d}")
|
||||
if len(d) == 1:
|
||||
key = next(iter(d))
|
||||
# delete key first to avoid name clashes
|
||||
value = d[key]
|
||||
del d[key]
|
||||
d["key"] = key
|
||||
d["value"] = value
|
||||
|
||||
return dicts
|
||||
|
||||
|
||||
def filter_env_vars(vars: List[str]) -> StrStr:
|
||||
return {k.lower(): environ[k] for k in vars if k in environ}
|
||||
|
||||
|
||||
def update_dict_with_prune(dest: DictStrAny, update: StrAny) -> None:
|
||||
for k, v in update.items():
|
||||
if v is not None:
|
||||
dest[k] = v
|
||||
elif k in dest:
|
||||
del dest[k]
|
||||
|
||||
|
||||
def is_interactive() -> bool:
|
||||
import __main__ as main
|
||||
return not hasattr(main, '__file__')
|
||||
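The flattening helpers above invert each other's shapes; a few worked examples with hypothetical inputs:

    from dlt.common.utils import chunks, flatten_dicts_of_dicts, flatten_list_of_dicts, tuplify_list_of_dicts

    assert flatten_list_of_dicts([{"a": {"x": 1}}, {"b": {"y": 2}}]) == {"a": {"x": 1}, "b": {"y": 2}}

    assert flatten_dicts_of_dicts({"a": {"x": 1}, "b": {"y": 2}}) == [{"x": 1, "key": "a"}, {"y": 2, "key": "b"}]

    assert tuplify_list_of_dicts([{"a": 1}]) == [{"key": "a", "value": 1}]

    assert list(chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]
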
17
dlt/dbt_runner/README.md
Normal file
17
dlt/dbt_runner/README.md
Normal file
@@ -0,0 +1,17 @@
|
||||
https://github.com/davidgasquez/kubedbt
|
||||
https://discourse.getdbt.com/t/running-dbt-in-kubernetes/92
|
||||
https://github.com/godatadriven/pytest-dbt-core
|
||||
https://github.com/great-expectations/great_expectations
|
||||
|
||||
https://github.com/fal-ai/fal (attach python scripts to models)
|
||||
|
||||
https://blog.getdbt.com/how-great-data-teams-test-their-data-models/
|
||||
|
||||
PG_DATABASE_NAME=chat_analytics_rasa PG_PASSWORD=8P5gyDPNo9zo582rQG6a PG_USER=loader PG_HOST=3.66.204.141 PG_PORT=5439 dbt list --profiles-dir . --vars '{source_schema_prefix: "unk"}' --resource-type test -s source:*
|
||||
|
||||
https://docs.getdbt.com/reference/node-selection/test-selection-examples
|
||||
|
||||
|
||||
# list tests with selectors
|
||||
|
||||
PG_DATABASE_NAME=chat_analytics_rasa PG_PASSWORD=8P5gyDPNo9zo582rQG6a PG_USER=loader PG_HOST=3.66.204.141 PG_PORT=5439 dbt list --profiles-dir . --vars '{source_schema_prefix: "unk"}' --resource-type test -s views
|
||||
1
dlt/dbt_runner/__init__.py
Normal file
1
dlt/dbt_runner/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from ._version import __version__
|
||||
1
dlt/dbt_runner/_version.py
Normal file
1
dlt/dbt_runner/_version.py
Normal file
@@ -0,0 +1 @@
|
||||
__version__ = "1.0.0"
|
||||
69
dlt/dbt_runner/configuration.py
Normal file
69
dlt/dbt_runner/configuration.py
Normal file
@@ -0,0 +1,69 @@
|
||||
from typing import List, Optional, Type
|
||||
|
||||
from dlt.common.typing import StrAny
|
||||
from dlt.common.configuration.utils import TConfigSecret, make_configuration, _get_key_value
|
||||
from dlt.common.configuration import PoolRunnerConfiguration, TPoolType, PostgresConfiguration, PostgresProductionConfiguration, GcpClientConfiguration, GcpClientProductionConfiguration
|
||||
|
||||
from . import __version__
|
||||
|
||||
|
||||
class DBTRunnerConfiguration(PoolRunnerConfiguration):
|
||||
POOL_TYPE: TPoolType = "none"
|
||||
STOP_AFTER_RUNS: int = 1
|
||||
PACKAGE_VOLUME_PATH: str = "_storage/dbt_runner"
|
||||
PACKAGE_REPOSITORY_URL: str = "https://github.com/scale-vector/rasa_semantic_schema_customization.git"
|
||||
PACKAGE_REPOSITORY_BRANCH: Optional[str] = None
|
||||
PACKAGE_REPOSITORY_SSH_KEY: TConfigSecret = TConfigSecret("") # the default is empty value which will disable custom SSH KEY
|
||||
PACKAGE_PROFILES_DIR: str = "."
|
||||
PACKAGE_PROFILE_PREFIX: str = "rasa_semantic_schema"
|
||||
PACKAGE_SOURCE_TESTS_SELECTOR: str = "tag:prerequisites"
|
||||
PACKAGE_ADDITIONAL_VARS: Optional[StrAny] = None
|
||||
PACKAGE_RUN_PARAMS: List[str] = ["--fail-fast"]
|
||||
AUTO_FULL_REFRESH_WHEN_OUT_OF_SYNC: bool = True
|
||||
|
||||
SOURCE_SCHEMA_PREFIX: str = None
|
||||
DEST_SCHEMA_PREFIX: Optional[str] = None
|
||||
|
||||
@classmethod
|
||||
def check_integrity(cls) -> None:
|
||||
if cls.PACKAGE_REPOSITORY_SSH_KEY and cls.PACKAGE_REPOSITORY_SSH_KEY[-1] != "\n":
|
||||
# must end with new line, otherwise won't be parsed by Crypto
|
||||
cls.PACKAGE_REPOSITORY_SSH_KEY = TConfigSecret(cls.PACKAGE_REPOSITORY_SSH_KEY + "\n")
|
||||
if cls.STOP_AFTER_RUNS != 1:
|
||||
# always stop after one run
|
||||
cls.STOP_AFTER_RUNS = 1
|
||||
|
||||
|
||||
class DBTRunnerProductionConfiguration(DBTRunnerConfiguration):
|
||||
PACKAGE_VOLUME_PATH: str = "/var/local/app" # this is actually not exposed as volume
|
||||
PACKAGE_REPOSITORY_URL: str = None
|
||||
|
||||
|
||||
def gen_configuration_variant(initial_values: StrAny = None) -> Type[DBTRunnerConfiguration]:
|
||||
# derive concrete config depending on env vars present
|
||||
DBTRunnerConfigurationImpl: Type[DBTRunnerConfiguration]
|
||||
DBTRunnerProductionConfigurationImpl: Type[DBTRunnerProductionConfiguration]
|
||||
|
||||
    if _get_key_value("PG_SCHEMA_PREFIX", str):
|
||||
source_schema_prefix = _get_key_value("PG_SCHEMA_PREFIX", type(str))
|
||||
class DBTRunnerConfigurationPostgress(PostgresConfiguration, DBTRunnerConfiguration):
|
||||
SOURCE_SCHEMA_PREFIX: str = source_schema_prefix
|
||||
DBTRunnerConfigurationImpl = DBTRunnerConfigurationPostgress
|
||||
|
||||
class DBTRunnerProductionConfigurationPostgress(DBTRunnerProductionConfiguration, PostgresProductionConfiguration, DBTRunnerConfigurationPostgress):
|
||||
pass
|
||||
# SOURCE_SCHEMA_PREFIX: str = source_schema_prefix
|
||||
DBTRunnerProductionConfigurationImpl = DBTRunnerProductionConfigurationPostgress
|
||||
|
||||
else:
|
||||
source_schema_prefix = _get_key_value("DATASET", type(str))
|
||||
class DBTRunnerConfigurationGcp(GcpClientConfiguration, DBTRunnerConfiguration):
|
||||
SOURCE_SCHEMA_PREFIX: str = source_schema_prefix
|
||||
DBTRunnerConfigurationImpl = DBTRunnerConfigurationGcp
|
||||
|
||||
class DBTRunnerProductionConfigurationGcp(DBTRunnerProductionConfiguration, GcpClientProductionConfiguration, DBTRunnerConfigurationGcp):
|
||||
pass
|
||||
# SOURCE_SCHEMA_PREFIX: str = source_schema_prefix
|
||||
DBTRunnerProductionConfigurationImpl = DBTRunnerProductionConfigurationGcp
|
||||
|
||||
return make_configuration(DBTRunnerConfigurationImpl, DBTRunnerProductionConfigurationImpl, initial_values=initial_values)
|
||||
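gen_configuration_variant above picks the Postgres or BigQuery mixin depending on which environment variable is present; a simplified, standalone sketch of that selection pattern using plain os.environ instead of the internal _get_key_value helper (all class names here are illustrative, not the real configuration types):

    import os


    class RunnerDefaultsSketch:
        SOURCE_SCHEMA_PREFIX: str = None


    class PostgresSketch:
        PG_SCHEMA_PREFIX: str = None


    class GcpSketch:
        DATASET: str = None


    def pick_variant() -> type:
        # presence of PG_SCHEMA_PREFIX selects the Postgres flavour, otherwise the BigQuery one
        if os.environ.get("PG_SCHEMA_PREFIX"):
            class Variant(PostgresSketch, RunnerDefaultsSketch):
                SOURCE_SCHEMA_PREFIX = os.environ["PG_SCHEMA_PREFIX"]
        else:
            class Variant(GcpSketch, RunnerDefaultsSketch):
                SOURCE_SCHEMA_PREFIX = os.environ.get("DATASET")
        return Variant
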
9
dlt/dbt_runner/exceptions.py
Normal file
9
dlt/dbt_runner/exceptions.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from dlt.common.exceptions import DltException
|
||||
|
||||
|
||||
class DBTRunnerException(DltException):
|
||||
pass
|
||||
|
||||
|
||||
class PrerequisitesException(DBTRunnerException):
|
||||
pass
|
||||
187
dlt/dbt_runner/runner.py
Normal file
187
dlt/dbt_runner/runner.py
Normal file
@@ -0,0 +1,187 @@
|
||||
from typing import Optional, Sequence, Tuple, Type
|
||||
from git import GitError
|
||||
from prometheus_client import REGISTRY, Gauge, CollectorRegistry, Info
|
||||
from prometheus_client.metrics import MetricWrapperBase
|
||||
from dlt.common.configuration import GcpClientConfiguration
|
||||
|
||||
from dlt.common import logger
|
||||
from dlt.common.typing import DictStrAny, DictStrStr, StrAny
|
||||
from dlt.common.logger import process_internal_exception, is_json_logging
|
||||
from dlt.common.telemetry import get_logging_extras
|
||||
from dlt.common.file_storage import FileStorage
|
||||
from dlt.common.runners import TRunArgs, create_default_args, initialize_runner, pool_runner
|
||||
from dlt.common.telemetry import TRunMetrics
|
||||
|
||||
from dlt.dbt_runner.configuration import DBTRunnerConfiguration, gen_configuration_variant
|
||||
from dlt.dbt_runner.utils import DBTProcessingError, clone_repo, dbt_results, ensure_remote_head, git_custom_key_command, initialize_dbt_logging, is_incremental_schema_out_of_sync_error, run_dbt_command
|
||||
from dlt.dbt_runner.exceptions import PrerequisitesException
|
||||
|
||||
|
||||
CLONED_PACKAGE_NAME = "dbt_package"
|
||||
|
||||
CONFIG: Type[DBTRunnerConfiguration] = None
|
||||
storage: FileStorage = None
|
||||
dbt_package_vars: StrAny = None
|
||||
global_args: Sequence[str] = None
|
||||
repo_path: str = None
|
||||
profile_name: str = None
|
||||
|
||||
model_elapsed_gauge: Gauge = None
|
||||
model_exec_info: Info = None
|
||||
|
||||
|
||||
def create_folders() -> Tuple[FileStorage, StrAny, Sequence[str], str, str]:
|
||||
storage = FileStorage(CONFIG.PACKAGE_VOLUME_PATH, makedirs=True)
|
||||
dbt_package_vars: DictStrAny = {
|
||||
"source_schema_prefix": CONFIG.SOURCE_SCHEMA_PREFIX
|
||||
}
|
||||
if CONFIG.DEST_SCHEMA_PREFIX:
|
||||
dbt_package_vars["dest_schema_prefix"] = CONFIG.DEST_SCHEMA_PREFIX
|
||||
if CONFIG.PACKAGE_ADDITIONAL_VARS:
|
||||
dbt_package_vars.update(CONFIG.PACKAGE_ADDITIONAL_VARS)
|
||||
|
||||
# initialize dbt logging, returns global parameters to dbt command
|
||||
global_args = initialize_dbt_logging(CONFIG.LOG_LEVEL, is_json_logging(CONFIG.LOG_FORMAT))
|
||||
|
||||
# generate path for the dbt package repo
|
||||
repo_path = storage._make_path(CLONED_PACKAGE_NAME)
|
||||
|
||||
# generate profile name
|
||||
profile_name: str = None
|
||||
if CONFIG.PACKAGE_PROFILE_PREFIX:
|
||||
if issubclass(CONFIG, GcpClientConfiguration):
|
||||
profile_name = "%s_bigquery" % (CONFIG.PACKAGE_PROFILE_PREFIX)
|
||||
else:
|
||||
profile_name = "%s_redshift" % (CONFIG.PACKAGE_PROFILE_PREFIX)
|
||||
|
||||
return storage, dbt_package_vars, global_args, repo_path, profile_name
|
||||
|
||||
|
||||
def create_gauges(registry: CollectorRegistry) -> Tuple[MetricWrapperBase, MetricWrapperBase]:
|
||||
return (
|
||||
Gauge("dbtrunner_model_elapsed_seconds", "Last model processing time", ["model"], registry=registry),
|
||||
Info("dbtrunner_model_status", "Last execution status of the model", registry=registry)
|
||||
)
|
||||
|
||||
|
||||
def run_dbt(command: str, command_args: Sequence[str] = None) -> Sequence[dbt_results.BaseResult]:
|
||||
logger.info(f"Exec dbt command: {global_args} {command} {command_args} {dbt_package_vars} on profile {profile_name or '<project_default>'}")
|
||||
return run_dbt_command(
|
||||
repo_path, command,
|
||||
CONFIG.PACKAGE_PROFILES_DIR,
|
||||
profile_name=profile_name,
|
||||
command_args=command_args,
|
||||
global_args=global_args,
|
||||
vars=dbt_package_vars
|
||||
)
|
||||
|
||||
|
||||
def log_dbt_run_results(results: dbt_results.RunExecutionResult) -> None:
|
||||
# run may return RunResult of something different depending on error
|
||||
if issubclass(type(results), dbt_results.BaseResult):
|
||||
results = [results] # make it iterable
|
||||
elif issubclass(type(results), dbt_results.ExecutionResult):
|
||||
pass
|
||||
else:
|
||||
logger.warning(f"{type(results)} is unknown and cannot be logged")
|
||||
return
|
||||
|
||||
info: DictStrStr = {}
|
||||
for res in results:
|
||||
name = res.node.name
|
||||
message = res.message
|
||||
time = res.execution_time
|
||||
if res.status == dbt_results.RunStatus.Error:
|
||||
logger.error(f"Model {name} errored! Error: {message}")
|
||||
else:
|
||||
logger.info(f"Model {name} {res.status} in {time} seconds with {message}")
|
||||
model_elapsed_gauge.labels(name).set(time)
|
||||
info[name] = message
|
||||
|
||||
# log execution
|
||||
model_exec_info.info(info)
|
||||
logger.metrics("Executed models", extra=get_logging_extras([model_elapsed_gauge, model_exec_info]))
|
||||
|
||||
|
||||
def initialize_package(with_git_command: Optional[str]) -> None:
|
||||
try:
|
||||
# cleanup package folder
|
||||
if storage.has_folder(CLONED_PACKAGE_NAME):
|
||||
storage.delete_folder(CLONED_PACKAGE_NAME, recursively=True)
|
||||
logger.info(f"Will clone {CONFIG.PACKAGE_REPOSITORY_URL} head {CONFIG.PACKAGE_REPOSITORY_BRANCH} into {repo_path}")
|
||||
clone_repo(CONFIG.PACKAGE_REPOSITORY_URL, repo_path, branch=CONFIG.PACKAGE_REPOSITORY_BRANCH, with_git_command=with_git_command)
|
||||
run_dbt("deps")
|
||||
except Exception as e:
|
||||
# delete folder so we start clean next time
|
||||
if storage.has_folder(CLONED_PACKAGE_NAME):
|
||||
storage.delete_folder(CLONED_PACKAGE_NAME, recursively=True)
|
||||
raise
|
||||
|
||||
|
||||
def ensure_newest_package() -> None:
|
||||
with git_custom_key_command(CONFIG.PACKAGE_REPOSITORY_SSH_KEY) as ssh_command:
|
||||
try:
|
||||
ensure_remote_head(repo_path, with_git_command=ssh_command)
|
||||
except GitError as err:
|
||||
# cleanup package folder
|
||||
logger.info(f"Package will be cloned due to {type(err).__name__}:{str(err)}")
|
||||
initialize_package(with_git_command=ssh_command)
|
||||
|
||||
|
||||
def run_db_steps() -> Sequence[dbt_results.BaseResult]:
|
||||
# make sure we use package from the remote head
|
||||
ensure_newest_package()
|
||||
# check if raw schema exists
|
||||
try:
|
||||
if CONFIG.PACKAGE_SOURCE_TESTS_SELECTOR:
|
||||
run_dbt("test", ["-s", CONFIG.PACKAGE_SOURCE_TESTS_SELECTOR])
|
||||
except DBTProcessingError as err:
|
||||
raise PrerequisitesException() from err
|
||||
|
||||
# always run seeds
|
||||
run_dbt("seed")
|
||||
# throws DBTProcessingError
|
||||
try:
|
||||
return run_dbt("run", CONFIG.PACKAGE_RUN_PARAMS)
|
||||
except DBTProcessingError as e:
|
||||
# detect incremental model out of sync
|
||||
if is_incremental_schema_out_of_sync_error(e.results) and CONFIG.AUTO_FULL_REFRESH_WHEN_OUT_OF_SYNC:
|
||||
logger.warning(f"Attempting full refresh due to incremental model out of sync on {e.results.message}")
|
||||
return run_dbt("run", CONFIG.PACKAGE_RUN_PARAMS + ["--full-refresh"])
|
||||
else:
|
||||
raise
|
||||
|
||||
|
||||
def run(_: None) -> TRunMetrics:
|
||||
try:
|
||||
# there were many issues with running the method below with pool.apply
|
||||
# 1 - some exceptions are not serialized well on process boundary and queue hangs
|
||||
        # 2 - random hangs even if there's no exception, probably issues with DBT spawning its own workers
|
||||
# instead the runner host was configured to recycle each run
|
||||
results = run_db_steps()
|
||||
log_dbt_run_results(results)
|
||||
return TRunMetrics(False, False, 0)
|
||||
except PrerequisitesException:
|
||||
logger.warning(f"Raw schema test failed, it may yet not be created")
|
||||
# run failed and loads possibly still pending
|
||||
return TRunMetrics(False, True, 1)
|
||||
except DBTProcessingError as runerr:
|
||||
log_dbt_run_results(runerr.results)
|
||||
# pass exception to the runner
|
||||
raise
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
CONFIG = gen_configuration_variant()
|
||||
parser = create_default_args(CONFIG)
|
||||
args = parser.parse_args()
|
||||
# we should force single run
|
||||
initialize_runner(CONFIG, TRunArgs(args.single_run, args.wait_runs))
|
||||
try:
|
||||
storage, dbt_package_vars, global_args, repo_path, profile_name = create_folders()
|
||||
model_elapsed_gauge, model_exec_info = create_gauges(REGISTRY)
|
||||
except Exception:
|
||||
process_internal_exception("init module")
|
||||
exit(-1)
|
||||
|
||||
exit(pool_runner(CONFIG, run))
|
||||
130
dlt/dbt_runner/utils.py
Normal file
130
dlt/dbt_runner/utils.py
Normal file
@@ -0,0 +1,130 @@
|
||||
import os
|
||||
import logging
|
||||
import tempfile
|
||||
from typing import Any, Iterator, List, Sequence
|
||||
from git import Repo, Git, RepositoryDirtyError
|
||||
from contextlib import contextmanager
|
||||
|
||||
from dlt.common import json
|
||||
from dlt.common.utils import uniq_id
|
||||
from dlt.common.typing import StrAny, Optional
|
||||
from dlt.dbt_runner.exceptions import DBTRunnerException
|
||||
|
||||
# block logbook from redirecting/disabling the root logger
|
||||
import logbook.compat
|
||||
logbook.compat.redirect_logging = lambda : None
|
||||
|
||||
# can only import DBT after redirect is disabled
|
||||
import dbt.main
|
||||
import dbt.logger
|
||||
from dbt.events import functions
|
||||
from dbt.contracts import results as dbt_results
|
||||
from dbt.exceptions import FailFastException
|
||||
|
||||
|
||||
# keep this exception definition here due to mock of logbook
|
||||
class DBTProcessingError(DBTRunnerException):
|
||||
def __init__(self, command: str, results: Any) -> None:
|
||||
self.command = command
|
||||
# the results from DBT may be anything
|
||||
self.results = results
|
||||
super().__init__(f"DBT command {command} could not be executed")
|
||||
|
||||
|
||||
@contextmanager
|
||||
def git_custom_key_command(private_key: Optional[str]) -> Iterator[str]:
|
||||
if private_key:
|
||||
key_file = tempfile.mktemp(prefix=uniq_id())
|
||||
with open(key_file, "w") as f:
|
||||
f.write(private_key)
|
||||
try:
|
||||
# permissions so SSH does not complain
|
||||
os.chmod(key_file, 0o600)
|
||||
yield 'ssh -o "StrictHostKeyChecking accept-new" -i %s' % key_file
|
||||
finally:
|
||||
os.remove(key_file)
|
||||
else:
|
||||
yield 'ssh -o "StrictHostKeyChecking accept-new"'
|
||||
|
||||
|
||||
def ensure_remote_head(repo_path: str, with_git_command: Optional[str] = None) -> None:
|
||||
# update remotes and check if heads are same. ignores locally modified files
|
||||
repo = Repo(repo_path)
|
||||
    # use custom environment if specified
|
||||
with repo.git.custom_environment(GIT_SSH_COMMAND=with_git_command):
|
||||
# update origin
|
||||
repo.remote().update()
|
||||
# get branch status
|
||||
status: str = repo.git.status("--short", "--branch", "-uno")
|
||||
# we expect first status line ## main...origin/main
|
||||
        status_line = status.split("\n")[0]
|
||||
if not (status_line.startswith("##") and not status_line.endswith("]")):
|
||||
raise RepositoryDirtyError(repo, status)
|
||||
|
||||
|
||||
def clone_repo(repository_url: str, clone_path: str, branch: Optional[str] = None, with_git_command: Optional[str] = None) -> None:
|
||||
repo = Repo.clone_from(repository_url, clone_path, env=dict(GIT_SSH_COMMAND=with_git_command))
|
||||
if branch:
|
||||
repo.git.checkout(branch)
|
||||
|
||||
|
||||
def initialize_dbt_logging(level: str, is_json_logging: bool) -> Sequence[str]:
|
||||
int_level = logging._nameToLevel[level]
|
||||
|
||||
# wrap log setup to force out log level
|
||||
|
||||
    def setup_event_logger_wrapper(log_path: str, level_override: str = None) -> None:
|
||||
functions.setup_event_logger(log_path, level)
|
||||
# force log level as file is debug only
|
||||
functions.this.FILE_LOG.setLevel(level)
|
||||
functions.this.FILE_LOG.handlers[0].setLevel(level)
|
||||
|
||||
dbt.main.setup_event_logger = setup_event_logger_wrapper
|
||||
|
||||
globs = []
|
||||
if int_level <= logging.DEBUG:
|
||||
globs = ["--debug"]
|
||||
|
||||
# return global parameters to be passed to setup logging
|
||||
|
||||
if is_json_logging:
|
||||
return ["--log-format", "json"] + globs
|
||||
else:
|
||||
return globs
|
||||
|
||||
|
||||
def is_incremental_schema_out_of_sync_error(error: dbt_results.RunResult) -> bool:
|
||||
return issubclass(type(error), dbt_results.RunResult) and error.status == dbt_results.RunStatus.Error and\
|
||||
"The source and target schemas on this incremental model are out of sync" in error.message
|
||||
|
||||
|
||||
def run_dbt_command(package_path: str, command: str, profiles_dir: str, profile_name: Optional[str] = None,
|
||||
global_args: Sequence[str] = None, command_args: Sequence[str] = None, vars: StrAny = None) -> Sequence[dbt_results.BaseResult]:
|
||||
args = ["--profiles-dir", profiles_dir]
|
||||
# add profile name if provided
|
||||
if profile_name:
|
||||
args += ["--profile", profile_name]
|
||||
# serialize dbt variables to pass to package
|
||||
if vars:
|
||||
args += ["--vars", json.dumps(vars)]
|
||||
if command_args:
|
||||
args += command_args
|
||||
|
||||
# cwd to package dir
|
||||
working_dir = os.getcwd()
|
||||
os.chdir(package_path)
|
||||
try:
|
||||
results: List[dbt_results.BaseResult] = None
|
||||
success: bool = None
|
||||
results, success = dbt.main.handle_and_check((global_args or []) + [command] + args) # type: ignore
|
||||
assert type(success) is bool
|
||||
if not success:
|
||||
            raise DBTProcessingError(command, results)
|
||||
return results
|
||||
except FailFastException as ff:
|
||||
raise DBTProcessingError(command, ff.result) from ff
|
||||
finally:
|
||||
# unblock logger manager to run next command
|
||||
dbt.logger.log_manager.reset_handlers()
|
||||
# go back to working dir
|
||||
os.chdir(working_dir)
|
||||
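A hedged usage sketch of run_dbt_command above; the package path, profile name and variables are hypothetical, a working dbt installation and cloned package are assumed, and failures surface as DBTProcessingError:

    from dlt.dbt_runner.utils import DBTProcessingError, initialize_dbt_logging, run_dbt_command

    global_args = initialize_dbt_logging("INFO", is_json_logging=False)

    try:
        results = run_dbt_command(
            "_storage/dbt_runner/dbt_package",   # hypothetical cloned package path
            "run",
            profiles_dir=".",
            profile_name="rasa_semantic_schema_redshift",
            command_args=["--fail-fast"],
            global_args=global_args,
            vars={"source_schema_prefix": "unk"}
        )
    except DBTProcessingError as e:
        # e.results holds whatever dbt returned for the failed command
        raise
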
0
dlt/extractors/__init__.py
Normal file
0
dlt/extractors/__init__.py
Normal file
5
dlt/extractors/exceptions.py
Normal file
5
dlt/extractors/exceptions.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from dlt.common.exceptions import DltException
|
||||
|
||||
|
||||
class ExtractorException(DltException):
|
||||
pass
|
||||
34
dlt/extractors/extractor_storage.py
Normal file
34
dlt/extractors/extractor_storage.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import semver
|
||||
|
||||
from dlt.common.utils import uniq_id
|
||||
from dlt.common.file_storage import FileStorage
|
||||
from dlt.common.storages.versioned_storage import VersionedStorage
|
||||
from dlt.common.storages.unpacker_storage import UnpackerStorage
|
||||
|
||||
|
||||
class ExtractorStorageBase(VersionedStorage):
|
||||
def __init__(self, version: semver.VersionInfo, is_owner: bool, storage: FileStorage, unpacker_storage: UnpackerStorage) -> None:
|
||||
self.unpacker_storage = unpacker_storage
|
||||
super().__init__(version, is_owner, storage)
|
||||
|
||||
def create_temp_folder(self) -> str:
|
||||
tf_name = uniq_id()
|
||||
self.storage.create_folder(tf_name)
|
||||
return tf_name
|
||||
|
||||
def commit_events(self, schema_name: str, processed_file_path: str, dest_file_stem: str, no_processed_events: int, load_id: str, with_delete: bool = True) -> str:
|
||||
# schema name cannot contain underscores
|
||||
if "_" in schema_name:
|
||||
raise ValueError(schema_name)
|
||||
|
||||
dest_name = UnpackerStorage.build_unpack_file_name(schema_name, dest_file_stem, no_processed_events, load_id)
|
||||
# if no events extracted from tracker, file is not saved
|
||||
if no_processed_events > 0:
|
||||
            # moves the file to possibly external storage and places it in the dest folder atomically
|
||||
self.storage.copy_cross_storage_atomically(
|
||||
self.unpacker_storage.storage.storage_path, UnpackerStorage.UNPACKING_FOLDER, processed_file_path, dest_name)
|
||||
|
||||
if with_delete:
|
||||
self.storage.delete(processed_file_path)
|
||||
|
||||
return dest_name
|
||||
0
dlt/extractors/generator/__init__.py
Normal file
0
dlt/extractors/generator/__init__.py
Normal file
0
dlt/extractors/generator/extractor.py
Normal file
0
dlt/extractors/generator/extractor.py
Normal file
1
dlt/loaders/__init__.py
Normal file
1
dlt/loaders/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from dlt._version import loader_version as __version__
|
||||
150
dlt/loaders/client_base.py
Normal file
150
dlt/loaders/client_base.py
Normal file
@@ -0,0 +1,150 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from types import TracebackType
|
||||
from typing import Any, Literal, Sequence, Type, TypeVar, AnyStr
|
||||
from pathlib import Path
|
||||
|
||||
from dlt.common import pendulum, logger
|
||||
from dlt.common.schema import Column, Schema, Table
|
||||
# from dlt.common.file_storage import FileStorage
|
||||
|
||||
from dlt.loaders.local_types import LoadJobStatus
|
||||
from dlt.loaders.exceptions import LoadClientSchemaVersionCorrupted, LoadUnknownTableException
|
||||
|
||||
# typing for context manager
|
||||
TClient = TypeVar("TClient", bound="ClientBase")
|
||||
|
||||
|
||||
class LoadJob:
|
||||
def __init__(self, file_name: str) -> None:
|
||||
"""
|
||||
File name is also a job id (or job id is deterministically derived) so it must be globally unique
|
||||
"""
|
||||
self._file_name = file_name
|
||||
|
||||
@abstractmethod
|
||||
def status(self) -> LoadJobStatus:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def file_name(self) -> str:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def exception(self) -> str:
|
||||
pass
|
||||
|
||||
|
||||
class LoadEmptyJob(LoadJob):
|
||||
def __init__(self, file_name: str, status: LoadJobStatus, exception: str = None) -> None:
|
||||
self._status = status
|
||||
self._exception = exception
|
||||
super().__init__(file_name)
|
||||
|
||||
def status(self) -> LoadJobStatus:
|
||||
return self._status
|
||||
|
||||
def file_name(self) -> str:
|
||||
return self._file_name
|
||||
|
||||
def exception(self) -> str:
|
||||
return self._exception
|
||||
|
||||
|
||||
class ClientBase(ABC):
|
||||
def __init__(self, schema: Schema) -> None:
|
||||
self.schema = schema
|
||||
|
||||
@abstractmethod
|
||||
def initialize_storage(self) -> None:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def update_storage_schema(self) -> None:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def start_file_load(self, table_name: str, file_path: str) -> LoadJob:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_file_load(self, file_path: str) -> LoadJob:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def complete_load(self, load_id: str) -> None:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def _open_connection(self) -> None:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def _close_connection(self) -> None:
|
||||
pass
|
||||
|
||||
|
||||
def __enter__(self: TClient) -> TClient:
|
||||
self._open_connection()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType) -> None:
|
||||
self._close_connection()
|
||||
|
||||
def _get_table_by_name(self, table_name: str, file_name: str) -> Table:
|
||||
try:
|
||||
return self.schema.get_table(table_name)
|
||||
except KeyError:
|
||||
raise LoadUnknownTableException(table_name, file_name)
|
||||
|
||||
@staticmethod
|
||||
def get_file_name_from_file_path(file_path: str) -> str:
|
||||
return Path(file_path).name
|
||||
|
||||
@staticmethod
|
||||
def make_job_with_status(file_path: str, status: LoadJobStatus, message: str = None) -> LoadJob:
|
||||
return LoadEmptyJob(ClientBase.get_file_name_from_file_path(file_path), status, exception=message)
|
||||
|
||||
@staticmethod
|
||||
def make_absolute_path(file_path: str) -> str:
|
||||
return str(Path(file_path).absolute())
|
||||
|
||||
|
||||
class SqlClientBase(ClientBase):
|
||||
def __init__(self, schema: Schema) -> None:
|
||||
super().__init__(schema)
|
||||
|
||||
def complete_load(self, load_id: str) -> None:
|
||||
name = self._to_canonical_table_name(Schema.LOADS_TABLE_NAME)
|
||||
now_ts = str(pendulum.now())
|
||||
self._execute_sql(f"INSERT INTO {name}(load_id, status, inserted_at) VALUES('{load_id}', 0, '{now_ts}');")
|
||||
|
||||
@abstractmethod
|
||||
def _execute_sql(self, query: AnyStr) -> Any:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def _to_canonical_schema_name(self) -> str:
|
||||
pass
|
||||
|
||||
def _create_table_update(self, table_name: str, storage_table: Table) -> Sequence[Column]:
|
||||
# compare table with stored schema and produce delta
|
||||
l = self.schema.get_schema_update_for(table_name, storage_table)
|
||||
logger.info(f"Found {len(l)} updates for {table_name} in {self.schema.schema_name}")
|
||||
return l
|
||||
|
||||
def _to_canonical_table_name(self, table_name: str) -> str:
|
||||
return f"{self._to_canonical_schema_name()}.{table_name}"
|
||||
|
||||
def _get_schema_version_from_storage(self) -> int:
|
||||
name = self._to_canonical_table_name(Schema.VERSION_TABLE_NAME)
|
||||
rows = list(self._execute_sql(f"SELECT {Schema.VERSION_COLUMN_NAME} FROM {name} ORDER BY inserted_at DESC LIMIT 1;"))
|
||||
if len(rows) > 1:
|
||||
raise LoadClientSchemaVersionCorrupted(self._to_canonical_schema_name())
|
||||
if len(rows) == 0:
|
||||
return 0
|
||||
return int(rows[0][0])
|
||||
|
||||
def _update_schema_version(self, new_version: int) -> None:
|
||||
now_ts = str(pendulum.now())
|
||||
name = self._to_canonical_table_name(Schema.VERSION_TABLE_NAME)
|
||||
self._execute_sql(f"INSERT INTO {name}({Schema.VERSION_COLUMN_NAME}, engine_version, inserted_at) VALUES ({new_version}, {Schema.ENGINE_VERSION}, '{now_ts}');")
|
||||
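Clients are meant to be used as context managers so the connection is opened on entry and closed on exit; a minimal sketch with the dummy client defined further below (Schema construction from a name is an assumption, the load id is hypothetical):

    from dlt.common.schema import Schema
    from dlt.loaders.dummy import client as dummy_client
    from dlt.loaders.dummy.configuration import DummyClientConfiguration

    schema = Schema("event")  # assumption: Schema accepts a name in its constructor

    with dummy_client.make_client(schema, DummyClientConfiguration) as client:
        # __enter__ opens the connection, __exit__ closes it even on error
        client.initialize_storage()
        client.update_storage_schema()
        client.complete_load("1659110680.123")  # hypothetical load id
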
51
dlt/loaders/configuration.py
Normal file
51
dlt/loaders/configuration.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from typing import Any, Type
|
||||
|
||||
from dlt.common.utils import uniq_id
|
||||
from dlt.common.typing import StrAny
|
||||
from dlt.common.configuration import (PoolRunnerConfiguration,
|
||||
LoadingVolumeConfiguration,
|
||||
ProductionLoadingVolumeConfiguration,
|
||||
PostgresConfiguration, PostgresProductionConfiguration,
|
||||
GcpClientConfiguration, GcpClientProductionConfiguration,
|
||||
TPoolType, make_configuration)
|
||||
|
||||
from dlt.loaders.dummy.configuration import DummyClientConfiguration
|
||||
|
||||
from . import __version__
|
||||
|
||||
class LoaderConfiguration(PoolRunnerConfiguration, LoadingVolumeConfiguration):
|
||||
CLIENT_TYPE: str = "dummy" # which analytical storage to use
|
||||
MAX_PARALLEL_LOADS: int = 20 # how many parallel loads can be executed
|
||||
MAX_PARALLELISM: int = 20 # in 20 separate threads
|
||||
POOL_TYPE: TPoolType = "thread" # mostly i/o (upload) so may be thread pool
|
||||
|
||||
|
||||
class ProductionLoaderConfiguration(ProductionLoadingVolumeConfiguration, LoaderConfiguration):
|
||||
pass
|
||||
|
||||
|
||||
def configuration(initial_values: StrAny = None) -> Type[LoaderConfiguration]:
|
||||
# synthesize right configuration
|
||||
C = make_configuration(LoaderConfiguration, ProductionLoaderConfiguration, initial_values=initial_values)
|
||||
T: Type[Any] = None
|
||||
T_P: Type[Any] = None
|
||||
if C.CLIENT_TYPE == "dummy":
|
||||
T = DummyClientConfiguration
|
||||
T_P = DummyClientConfiguration
|
||||
elif C.CLIENT_TYPE == "gcp":
|
||||
T = GcpClientConfiguration
|
||||
T_P = GcpClientProductionConfiguration
|
||||
elif C.CLIENT_TYPE == "redshift":
|
||||
T = PostgresConfiguration
|
||||
T_P = PostgresProductionConfiguration
|
||||
else:
|
||||
raise ValueError(C.CLIENT_TYPE)
|
||||
|
||||
ST = type(LoaderConfiguration.__name__ + "_" + T.__name__ + "_" + uniq_id(), (T, LoaderConfiguration), {})
|
||||
ST_P = type(ProductionLoaderConfiguration.__name__ + "_" + T_P.__name__ + "_" + uniq_id(), (T_P, ProductionLoaderConfiguration), {})
|
||||
return make_configuration(
|
||||
ST,
|
||||
ST_P,
|
||||
initial_values=initial_values,
|
||||
skip_subclass_check=True
|
||||
)
|
||||
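The configuration() function above synthesizes the final config class at runtime by mixing the client configuration into LoaderConfiguration with type(); the same trick in isolation, with illustrative class names:

    class LoaderDefaults:
        MAX_PARALLEL_LOADS: int = 20


    class GcpDefaults:
        PROJECT_ID: str = None


    # equivalent of: type(name, (T, LoaderConfiguration), {})
    Synthesized = type("LoaderDefaults_GcpDefaults_1a2b3c", (GcpDefaults, LoaderDefaults), {})

    assert issubclass(Synthesized, LoaderDefaults) and issubclass(Synthesized, GcpDefaults)
    assert Synthesized.MAX_PARALLEL_LOADS == 20
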
0
dlt/loaders/dummy/__init__.py
Normal file
0
dlt/loaders/dummy/__init__.py
Normal file
134
dlt/loaders/dummy/client.py
Normal file
134
dlt/loaders/dummy/client.py
Normal file
@@ -0,0 +1,134 @@
|
||||
from datetime import time
|
||||
import random
|
||||
from typing import Dict, Literal, Type
|
||||
from dlt.common.dataset_writers import TWriterType
|
||||
|
||||
from dlt.common import pendulum
|
||||
from dlt.common.schema import Schema
|
||||
from dlt.common.typing import StrAny
|
||||
|
||||
from dlt.loaders.client_base import ClientBase, LoadJob
|
||||
from dlt.loaders.local_types import LoadJobStatus
|
||||
from dlt.loaders.exceptions import (LoadJobNotExistsException, LoadJobInvalidStateTransitionException,
|
||||
LoadClientTerminalException, LoadClientTransientException)
|
||||
|
||||
from dlt.loaders.dummy.configuration import DummyClientConfiguration
|
||||
|
||||
|
||||
class LoadDummyJob(LoadJob):
|
||||
def __init__(self, file_name: str, fail_prob: float = 0.0, retry_prob: float = 0.0, completed_prob: float = 1.0, timeout: float = 10.0) -> None:
|
||||
self.fail_prob = fail_prob
|
||||
self.retry_prob = retry_prob
|
||||
self.completed_prob = completed_prob
|
||||
self.timeout = timeout
|
||||
self._status: LoadJobStatus = "running"
|
||||
self._exception: str = None
|
||||
self.start_time: float = pendulum.now().timestamp()
|
||||
super().__init__(file_name)
|
||||
s = self.status()
|
||||
if s == "failed":
|
||||
raise LoadClientTerminalException(self._exception)
|
||||
if s == "retry":
|
||||
raise LoadClientTransientException(self._exception)
|
||||
|
||||
|
||||
def status(self) -> LoadJobStatus:
|
||||
# this should poll the server for a job status, here we simulate various outcomes
|
||||
if self._status == "running":
|
||||
n = pendulum.now().timestamp()
|
||||
if n - self.start_time > self.timeout:
|
||||
self._status = "failed"
|
||||
self._exception = "failed due to timeout"
|
||||
else:
|
||||
c_r = random.random()
|
||||
if self.completed_prob >= c_r:
|
||||
self._status = "completed"
|
||||
else:
|
||||
c_r = random.random()
|
||||
if self.retry_prob >= c_r:
|
||||
self._status = "retry"
|
||||
self._exception = "a random retry occured"
|
||||
else:
|
||||
c_r = random.random()
|
||||
if self.fail_prob >= c_r:
|
||||
self._status = "failed"
|
||||
self._exception = "a random fail occured"
|
||||
|
||||
return self._status
|
||||
|
||||
def file_name(self) -> str:
|
||||
return self._file_name
|
||||
|
||||
def exception(self) -> str:
|
||||
# this will typically call server for error messages
|
||||
return self._exception
|
||||
|
||||
def retry(self) -> None:
|
||||
if self._status != "retry":
|
||||
raise LoadJobInvalidStateTransitionException(self._status, "retry")
|
||||
self._status = "retry"
|
||||
|
||||
|
||||
JOBS: Dict[str, LoadDummyJob] = {}
|
||||
|
||||
|
||||
class DummyClient(ClientBase):
|
||||
"""
|
||||
dummy client storing jobs in memory
|
||||
"""
|
||||
def __init__(self, schema: Schema, CONFIG: Type[DummyClientConfiguration]) -> None:
|
||||
self.C = CONFIG
|
||||
super().__init__(schema)
|
||||
|
||||
def initialize_storage(self) -> None:
|
||||
pass
|
||||
|
||||
def update_storage_schema(self) -> None:
|
||||
pass
|
||||
|
||||
def start_file_load(self, table_name: str, file_path: str) -> LoadJob:
|
||||
self._get_table_by_name(table_name, file_path)
|
||||
job_id = ClientBase.get_file_name_from_file_path(file_path)
|
||||
file_name = ClientBase.get_file_name_from_file_path(file_path)
|
||||
# return existing job if already there
|
||||
if job_id not in JOBS:
|
||||
JOBS[job_id] = self._create_job(file_name)
|
||||
else:
|
||||
job = JOBS[job_id]
|
||||
            if job.status() == "retry":
|
||||
job.retry()
|
||||
|
||||
return JOBS[job_id]
|
||||
|
||||
def get_file_load(self, file_path: str) -> LoadJob:
|
||||
job_id = ClientBase.get_file_name_from_file_path(file_path)
|
||||
if job_id not in JOBS:
|
||||
raise LoadJobNotExistsException(job_id)
|
||||
return JOBS[job_id]
|
||||
|
||||
def complete_load(self, load_id: str) -> None:
|
||||
pass
|
||||
|
||||
def _open_connection(self) -> None:
|
||||
pass
|
||||
|
||||
def _close_connection(self) -> None:
|
||||
pass
|
||||
|
||||
def _create_job(self, job_id: str) -> LoadDummyJob:
|
||||
return LoadDummyJob(
|
||||
job_id,
|
||||
fail_prob=self.C.FAIL_PROB,
|
||||
retry_prob=self.C.RETRY_PROB,
|
||||
completed_prob=self.C.COMPLETED_PROB,
|
||||
timeout=self.C.TIMEOUT
|
||||
)
|
||||
|
||||
|
||||
|
||||
def make_client(schema: Schema, C: Type[DummyClientConfiguration]) -> ClientBase:
|
||||
return DummyClient(schema, C)
|
||||
|
||||
|
||||
def supported_writer(C: Type[DummyClientConfiguration]) -> TWriterType:
|
||||
return C.WRITER_TYPE
|
||||
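The dummy job above settles into completed, retry or failed according to the configured probabilities (retry and failed are raised already in the constructor); a small simulation sketch with hypothetical probabilities and file names:

    from collections import Counter

    from dlt.loaders.dummy.client import LoadDummyJob
    from dlt.loaders.exceptions import LoadClientTerminalException, LoadClientTransientException

    outcomes: Counter = Counter()
    for i in range(100):
        try:
            job = LoadDummyJob(f"event_user.{i}.jsonl", fail_prob=0.1, retry_prob=0.1, completed_prob=0.7)
            outcomes[job.status()] += 1
        except LoadClientTransientException:
            outcomes["retry"] += 1
        except LoadClientTerminalException:
            outcomes["failed"] += 1
    # roughly 70% completed, the rest split between retry, failed and still running
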
8
dlt/loaders/dummy/configuration.py
Normal file
8
dlt/loaders/dummy/configuration.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from dlt.common.dataset_writers import TWriterType
|
||||
|
||||
class DummyClientConfiguration:
|
||||
WRITER_TYPE: TWriterType = "jsonl"
|
||||
FAIL_PROB: float = 0.0
|
||||
RETRY_PROB: float = 0.0
|
||||
COMPLETED_PROB: float = 0.0
|
||||
TIMEOUT: float = 10.0
|
||||
72
dlt/loaders/exceptions.py
Normal file
72
dlt/loaders/exceptions.py
Normal file
@@ -0,0 +1,72 @@
|
||||
from typing import Sequence
|
||||
from dlt.common.exceptions import DltException, TerminalException, TransientException
|
||||
|
||||
from dlt.loaders.local_types import LoadJobStatus
|
||||
|
||||
|
||||
class LoadException(DltException):
|
||||
def __init__(self, msg: str) -> None:
|
||||
super().__init__(msg)
|
||||
|
||||
|
||||
class LoadClientTerminalException(LoadException, TerminalException):
|
||||
def __init__(self, msg: str) -> None:
|
||||
super().__init__(msg)
|
||||
|
||||
|
||||
class LoadClientTransientException(LoadException, TransientException):
|
||||
def __init__(self, msg: str) -> None:
|
||||
super().__init__(msg)
|
||||
|
||||
|
||||
class LoadClientTerminalInnerException(LoadClientTerminalException):
|
||||
def __init__(self, msg: str, inner_exc: Exception) -> None:
|
||||
self.inner_exc = inner_exc
|
||||
super().__init__(msg)
|
||||
|
||||
|
||||
class LoadClientTransientInnerException(LoadClientTransientException):
|
||||
def __init__(self, msg: str, inner_exc: Exception) -> None:
|
||||
self.inner_exc = inner_exc
|
||||
super().__init__(msg)
|
||||
|
||||
|
||||
|
||||
class LoadJobNotExistsException(LoadClientTerminalException):
|
||||
def __init__(self, job_id: str) -> None:
|
||||
super().__init__(f"Job with id/file name {job_id} not found")
|
||||
|
||||
|
||||
class LoadUnknownTableException(LoadClientTerminalException):
|
||||
def __init__(self, table_name: str, file_name: str) -> None:
|
||||
self.table_name = table_name
|
||||
super().__init__(f"Client does not know table {table_name} for load file {file_name}")
|
||||
|
||||
|
||||
class LoadJobInvalidStateTransitionException(LoadClientTerminalException):
|
||||
def __init__(self, from_state: LoadJobStatus, to_state: LoadJobStatus) -> None:
|
||||
self.from_state = from_state
|
||||
self.to_state = to_state
|
||||
super().__init__(f"Load job cannot transition form {from_state} to {to_state}")
|
||||
|
||||
class LoadJobServerTerminalException(LoadClientTerminalException):
|
||||
def __init__(self, file_path: str) -> None:
|
||||
super().__init__(f"Job with id/file name {file_path} encountered unrecoverable problem")
|
||||
|
||||
|
||||
class LoadClientSchemaVersionCorrupted(LoadClientTerminalException):
|
||||
def __init__(self, dataset_name: str) -> None:
|
||||
self.dataset_name = dataset_name
|
||||
super().__init__(f"Schema _version table contains too many rows in {dataset_name}")
|
||||
|
||||
|
||||
class LoadClientSchemaWillNotUpdate(LoadClientTerminalException):
|
||||
def __init__(self, table_name: str, columns: Sequence[str], msg: str) -> None:
|
||||
self.table_name = table_name
|
||||
self.columns = columns
|
||||
super().__init__(f"Schema for table {table_name} column(s) {columns} will not update: {msg}")
|
||||
|
||||
|
||||
class LoadFileTooBig(LoadClientTerminalException):
|
||||
def __init__(self, file_name: str, max_size: int) -> None:
|
||||
super().__init__(f"File {file_name} exceedes {max_size} and cannot be loaded. Split the file and try again.")
|
||||
6
dlt/loaders/gcp/README.md
Normal file
6
dlt/loaders/gcp/README.md
Normal file
@@ -0,0 +1,6 @@
|
||||
# Loader account setup
|
||||
|
||||
1. Create a new service account, add a private key to it and download the `services.json` file
|
||||
2. Make sure that this newly created account has access to the BigQuery API
|
||||
3. You must add the following roles to the account above: `BigQuery Data Editor` and `BigQuery Job User`
|
||||
4. IAM to add roles is here https://console.cloud.google.com/iam-admin/iam?project=chat-analytics-rasa-ci
|
||||
0
dlt/loaders/gcp/__init__.py
Normal file
0
dlt/loaders/gcp/__init__.py
Normal file
324
dlt/loaders/gcp/client.py
Normal file
324
dlt/loaders/gcp/client.py
Normal file
@@ -0,0 +1,324 @@
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, AnyStr, Dict, List, Literal, Optional, Tuple, Type
|
||||
import google.cloud.bigquery as bigquery
|
||||
from google.cloud import exceptions as gcp_exceptions
|
||||
from google.oauth2 import service_account
|
||||
from google.api_core import exceptions as api_core_exceptions
|
||||
|
||||
|
||||
from dlt.common import json, logger
|
||||
from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE
|
||||
from dlt.common.configuration import GcpClientConfiguration
|
||||
from dlt.common.dataset_writers import TWriterType, escape_bigquery_identifier
|
||||
from dlt.loaders.local_types import LoadJobStatus
|
||||
from dlt.common.schema import Column, DataType, Schema, Table
|
||||
|
||||
from dlt.loaders.client_base import SqlClientBase, LoadJob
|
||||
from dlt.loaders.exceptions import LoadClientSchemaWillNotUpdate, LoadJobNotExistsException, LoadJobServerTerminalException, LoadUnknownTableException
|
||||
|
||||
SCT_TO_BQT: Dict[DataType, str] = {
|
||||
"text": "STRING",
|
||||
"double": "FLOAT64",
|
||||
"bool": "BOOLEAN",
|
||||
"timestamp": "TIMESTAMP",
|
||||
"bigint": "INTEGER",
|
||||
"binary": "BYTES",
|
||||
"decimal": f"NUMERIC({DEFAULT_NUMERIC_PRECISION},{DEFAULT_NUMERIC_SCALE})",
|
||||
"wei": "BIGNUMERIC" # non parametrized should hold wei values
|
||||
}
|
||||
|
||||
BQT_TO_SCT: Dict[str, DataType] = {
|
||||
"STRING": "text",
|
||||
"FLOAT": "double",
|
||||
"BOOLEAN": "bool",
|
||||
"TIMESTAMP": "timestamp",
|
||||
"INTEGER": "bigint",
|
||||
"BYTES": "binary",
|
||||
"NUMERIC": "decimal",
|
||||
"BIGNUMERIC": "decimal"
|
||||
}
|
||||
|
||||
class BigQueryLoadJob(LoadJob):
|
||||
def __init__(self, file_name: str, bq_load_job: bigquery.LoadJob, CONFIG: Type[GcpClientConfiguration]) -> None:
|
||||
self.bq_load_job = bq_load_job
|
||||
self.C = CONFIG
|
||||
self.default_retry = bigquery.DEFAULT_RETRY.with_deadline(CONFIG.TIMEOUT)
|
||||
super().__init__(file_name)
|
||||
|
||||
def status(self) -> LoadJobStatus:
|
||||
# check server if done
|
||||
done = self.bq_load_job.done(retry=self.default_retry, timeout=self.C.TIMEOUT)
|
||||
if done:
|
||||
# rows processed
|
||||
if self.bq_load_job.output_rows is not None and self.bq_load_job.error_result is None:
|
||||
return "completed"
|
||||
else:
|
||||
return "failed"
|
||||
else:
|
||||
return "running"
|
||||
|
||||
def file_name(self) -> str:
|
||||
return self._file_name
|
||||
|
||||
def exception(self) -> str:
|
||||
exception: str = json.dumps({
|
||||
"error_result": self.bq_load_job.error_result,
|
||||
"errors": self.bq_load_job.errors,
|
||||
"job_start": self.bq_load_job.started,
|
||||
"job_end": self.bq_load_job.ended,
|
||||
"job_id": self.bq_load_job.job_id
|
||||
})
|
||||
return exception
|
||||
|
||||
|
||||
class BigQueryClient(SqlClientBase):
|
||||
def __init__(self, schema: Schema, CONFIG: Type[GcpClientConfiguration]) -> None:
|
||||
self._client: bigquery.Client = None
|
||||
self.C = CONFIG
|
||||
self.default_retry = bigquery.DEFAULT_RETRY.with_deadline(CONFIG.TIMEOUT)
|
||||
super().__init__(schema)
|
||||
|
||||
|
||||
def initialize_storage(self) -> None:
|
||||
dataset_name = self._to_canonical_schema_name()
|
||||
try:
|
||||
self._client.get_dataset(dataset_name, retry=self.default_retry, timeout=self.C.TIMEOUT)
|
||||
except gcp_exceptions.NotFound:
|
||||
self._client.create_dataset(dataset_name, exists_ok=False, retry=self.default_retry, timeout=self.C.TIMEOUT)
|
||||
|
||||
def get_file_load(self, file_path: str) -> LoadJob:
|
||||
try:
|
||||
return BigQueryLoadJob(
|
||||
SqlClientBase.get_file_name_from_file_path(file_path),
|
||||
self._retrieve_load_job(file_path),
|
||||
self.C
|
||||
)
|
||||
except api_core_exceptions.NotFound:
|
||||
raise LoadJobNotExistsException(file_path)
|
||||
except (api_core_exceptions.BadRequest, api_core_exceptions.NotFound):
|
||||
raise LoadJobServerTerminalException(file_path)
|
||||
|
||||
def start_file_load(self, table_name: str, file_path: str) -> LoadJob:
|
||||
# verify that table exists in the schema
|
||||
self._get_table_by_name(table_name, file_path)
|
||||
try:
|
||||
return BigQueryLoadJob(
|
||||
SqlClientBase.get_file_name_from_file_path(file_path),
|
||||
self._create_load_job(table_name, file_path),
|
||||
self.C
|
||||
)
|
||||
except api_core_exceptions.NotFound:
|
||||
# google.api_core.exceptions.NotFound: 404 - table not found
|
||||
raise LoadUnknownTableException(table_name, file_path)
|
||||
except api_core_exceptions.BadRequest:
|
||||
# google.api_core.exceptions.BadRequest - the job will not be processed, e.g. a bad job name
|
||||
raise LoadJobServerTerminalException(file_path)
|
||||
except api_core_exceptions.Conflict:
|
||||
# google.api_core.exceptions.Conflict: 409 PUT - already exists
|
||||
return self.get_file_load(file_path)
|
||||
|
||||
def update_storage_schema(self) -> None:
|
||||
storage_version = self._get_schema_version_from_storage()
|
||||
if storage_version < self.schema.schema_version:
|
||||
for sql in self._build_schema_update_sql():
|
||||
self._execute_sql(sql)
|
||||
self._update_schema_version(self.schema.schema_version)
|
||||
|
||||
def _open_connection(self) -> None:
|
||||
credentials = service_account.Credentials.from_service_account_info(self.C.to_service_credentials())
|
||||
self._client = bigquery.Client(self.C.PROJECT_ID, credentials=credentials)
|
||||
|
||||
def _close_connection(self) -> None:
|
||||
if self._client:
|
||||
self._client.close()
|
||||
self._client = None
|
||||
|
||||
def _get_schema_version_from_storage(self) -> int:
|
||||
try:
|
||||
return super()._get_schema_version_from_storage()
|
||||
except api_core_exceptions.NotFound:
|
||||
# there's no table so there's no schema
|
||||
return 0
|
||||
|
||||
def _build_schema_update_sql(self) -> List[str]:
|
||||
sql_updates = []
|
||||
for table_name in self.schema.schema_tables:
|
||||
exists, storage_table = self._get_storage_table(table_name)
|
||||
sql = self._get_table_update_sql(table_name, storage_table, exists)
|
||||
if sql:
|
||||
sql_updates.append(sql)
|
||||
return sql_updates
|
||||
|
||||
def _get_table_update_sql(self, table_name: str, storage_table: Table, exists: bool) -> str:
|
||||
new_columns = self._create_table_update(table_name, storage_table)
|
||||
if len(new_columns) == 0:
|
||||
# no changes
|
||||
return None
|
||||
# build sql
|
||||
canonical_name = self._to_canonical_table_name(table_name)
|
||||
if not exists:
|
||||
# build CREATE
|
||||
sql = f"CREATE TABLE {canonical_name} (\n"
|
||||
sql += ",\n".join([self._get_column_def_sql(c) for c in new_columns])
|
||||
sql += ")"
|
||||
else:
|
||||
# build ALTER
|
||||
sql = f"ALTER TABLE {canonical_name}\n"
|
||||
sql += ",\n".join(["ADD COLUMN " + self._get_column_def_sql(c) for c in new_columns])
|
||||
# scan columns to get hints
|
||||
cluster_list = [escape_bigquery_identifier(c["name"]) for c in new_columns if c.get("cluster", False)]
|
||||
partition_list = [escape_bigquery_identifier(c["name"]) for c in new_columns if c.get("partition", False)]
|
||||
# partition by must be added first
|
||||
if len(partition_list) > 0:
|
||||
if exists:
|
||||
raise LoadClientSchemaWillNotUpdate(canonical_name, partition_list, "Partition requested after table was created")
|
||||
elif len(partition_list) > 1:
|
||||
raise LoadClientSchemaWillNotUpdate(canonical_name, partition_list, "Partition requested for more than one column")
|
||||
else:
|
||||
sql += f"\nPARTITION BY DATE({partition_list[0]})"
|
||||
if len(cluster_list) > 0:
|
||||
if exists:
|
||||
raise LoadClientSchemaWillNotUpdate(canonical_name, cluster_list, "Clustering requested after table was created")
|
||||
else:
|
||||
sql += "\nCLUSTER BY " + ",".join(cluster_list)
|
||||
|
||||
return sql
|
||||
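_get_table_update_sql emits a single CREATE TABLE for a new table or a multi-column ALTER TABLE for an existing one, and refuses partition or cluster hints once the table already exists. For a hypothetical new table with a partitioned timestamp column and a clustered text column, the generated DDL would look roughly like the string below; the canonical table name format and the backtick escaping are assumptions of this sketch, not output of the code:

# illustrative expectation only, not produced by running the client
expected_ddl = (
    "CREATE TABLE my-project.mydataset_event.blocks (\n"
    "`block_timestamp` TIMESTAMP NOT NULL,\n"
    "`block_hash` STRING NOT NULL)\n"
    "PARTITION BY DATE(`block_timestamp`)\n"
    "CLUSTER BY `block_hash`"
)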
|
||||
def _get_column_def_sql(self, c: Column) -> str:
|
||||
name = escape_bigquery_identifier(c["name"])
|
||||
return f"{name} {self._sc_t_to_bq_t(c['data_type'])} {self._gen_not_null(c['nullable'])}"
|
||||
|
||||
def _get_storage_table(self, table_name: str) -> Tuple[bool, Table]:
|
||||
schema_table: Table = {}
|
||||
try:
|
||||
table = self._client.get_table(self._to_canonical_table_name(table_name), retry=self.default_retry, timeout=self.C.TIMEOUT)
|
||||
partition_field = table.time_partitioning.field if table.time_partitioning else None
|
||||
for c in table.schema:
|
||||
schema_c: Column = {
|
||||
"name": c.name,
|
||||
"nullable": c.is_nullable,
|
||||
"data_type": self._bq_t_to_sc_t(c.field_type, c.precision, c.scale),
|
||||
"unique": False,
|
||||
"sort": False,
|
||||
"primary_key": False,
|
||||
"foreign_key": False,
|
||||
"cluster": c.name in (table.clustering_fields or []),
|
||||
"partition": c.name == partition_field
|
||||
}
|
||||
schema_table[c.name] = schema_c
|
||||
return True, schema_table
|
||||
except gcp_exceptions.NotFound:
|
||||
return False, schema_table
|
||||
|
||||
def _execute_sql(self, query: AnyStr) -> Any:
|
||||
logger.debug(f"Will execute query {query}") # type: ignore
|
||||
return self._client.query(query, job_retry=self.default_retry, timeout=self.C.TIMEOUT).result()
|
||||
|
||||
def _to_canonical_schema_name(self) -> str:
|
||||
return f"{self.C.PROJECT_ID}.{self.C.DATASET}_{self.schema.schema_name}"
|
||||
|
||||
def _create_load_job(self, table_name: str, file_path: str) -> bigquery.LoadJob:
|
||||
job_id = BigQueryClient._get_job_id_from_file_path(file_path)
|
||||
job_config = bigquery.LoadJobConfig(
|
||||
autodetect=False,
|
||||
write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
|
||||
create_disposition=bigquery.CreateDisposition.CREATE_NEVER,
|
||||
source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
|
||||
ignore_unknown_values=False,
|
||||
max_bad_records=0,
|
||||
|
||||
)
|
||||
with open(file_path, "rb") as f:
|
||||
return self._client.load_table_from_file(f,
|
||||
self._to_canonical_table_name(table_name),
|
||||
job_id=job_id,
|
||||
job_config=job_config,
|
||||
timeout=self.C.TIMEOUT
|
||||
)
|
||||
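The load job configuration is deliberately strict: append-only newline-delimited JSON, no table creation, no bad records tolerated. Because the job id is derived from the load file name, re-submitting the same file raises a 409 Conflict, which start_file_load above turns into get_file_load, so retries are idempotent. A small sketch with a hypothetical file path:

# the path below is hypothetical; shows only how the job id is derived
job_id = BigQueryClient._get_job_id_from_file_path(
    "_storage/loading/1630949263/new_jobs/tracker.1c31ff1b.jsonl"
)
assert job_id == "tracker_1c31ff1b_jsonl"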
|
||||
def _retrieve_load_job(self, file_path: str) -> bigquery.LoadJob:
|
||||
job_id = BigQueryClient._get_job_id_from_file_path(file_path)
|
||||
return self._client.get_job(job_id)
|
||||
|
||||
@staticmethod
|
||||
def _get_job_id_from_file_path(file_path: str) -> str:
|
||||
return Path(file_path).name.replace(".", "_")
|
||||
|
||||
@staticmethod
|
||||
def _gen_not_null(v: bool) -> str:
|
||||
return "NOT NULL" if not v else ""
|
||||
|
||||
@staticmethod
|
||||
def _sc_t_to_bq_t(sc_t: DataType) -> str:
|
||||
return SCT_TO_BQT[sc_t]
|
||||
|
||||
@staticmethod
|
||||
def _bq_t_to_sc_t(bq_t: str, precision: Optional[int], scale: Optional[int]) -> DataType:
|
||||
if bq_t == "BIGNUMERIC":
|
||||
if precision is None: # biggest numeric possible
|
||||
return "wei"
|
||||
return BQT_TO_SCT.get(bq_t, "text")
|
||||
|
||||
|
||||
def make_client(schema: Schema, C: Type[GcpClientConfiguration]) -> BigQueryClient:
|
||||
return BigQueryClient(schema, C)
|
||||
|
||||
|
||||
def supported_writer(C: Type[GcpClientConfiguration]) -> TWriterType:
|
||||
return "jsonl"
|
||||
|
||||
# cred = service_account.Credentials.from_service_account_info(_credentials)
|
||||
# project_id = cred.get('project_id')
|
||||
# client = bigquery.Client(project_id, credentials=cred)
|
||||
# print(client.get_dataset("carbon_bot_extract_7"))
|
||||
# exit(0)
|
||||
# from dlt.common.configuration import SchemaStoreConfiguration
|
||||
# from dlt.common.logger import init_logging_from_config
|
||||
|
||||
# init_logging_from_config(CLIENT_CONFIG)
|
||||
|
||||
# schema = Schema(SchemaStoreConfiguration.TRACKER_SCHEMA_FILE_PATH)
|
||||
# schema.load_schema()
|
||||
# import pprint
|
||||
# # pprint.pprint(schema.as_yaml())
|
||||
# with make_client(schema) as client:
|
||||
# client.initialize_storage()
|
||||
# # job = client._create_load_job("tracker", "_storage/loaded/1630949263.574516/completed_jobs/tracker.1c31ff1b-c250-4690-8973-14f0ee9ae355.jsonl")
|
||||
# # unk table
|
||||
# # job = client._create_load_job("trackerZ", "_storage/loaded/1630949263.574516/completed_jobs/tracker.4876f905-aefe-4262-a440-d29ed2643c3a.jsonl")
|
||||
# # job = client._create_load_job("tracker", "_storage/loaded/1630949263.574516/completed_jobs/event_bot.c9105079-2d1d-4ad3-8613-a5dff790889d.jsonl")
|
||||
# # failed
|
||||
# # job = client._retrieve_load_job("_storage/loaded/1630949263.574516/completed_jobs/event_bot.c9105079-2d1d-4ad3-8613-a5dff790889d.jsonl")
|
||||
# # OK
|
||||
# job = client._retrieve_load_job("_storage/loaded/1630949263.574516/completed_jobs/tracker.1c31ff1b-c250-4690-8973-14f0ee9ae355.jsonl")
|
||||
# while True:
|
||||
# try:
|
||||
# # this does not throw
|
||||
# done = job.done()
|
||||
# print(f"DONE: {job.done(reload=False)}")
|
||||
# except Exception as e:
|
||||
# logger.exception("DONE")
|
||||
# done = True
|
||||
# if done:
|
||||
# break;
|
||||
# # done is not self running
|
||||
|
||||
# # print(job.running())
|
||||
# sleep(1)
|
||||
# try:
|
||||
# print(f"status: {job.state}")
|
||||
# print(f"error: {job.error_result}")
|
||||
# print(f"errors: {job.errors}")
|
||||
# print(f"line count: {job.output_rows}")
|
||||
# print(job.exception())
|
||||
# except:
|
||||
# logger.exception("EXCEPTION")
|
||||
# try:
|
||||
# print(job.result())
|
||||
# except:
|
||||
# logger.exception("RESULT")
|
||||
|
||||
# non existing table
|
||||
# wrong data - unknown column
|
||||
|
||||
238
dlt/loaders/loader.py
Normal file
@@ -0,0 +1,238 @@
|
||||
from types import ModuleType
|
||||
from typing import Any, Iterator, List, Dict, Literal, Optional, Tuple, Type
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from importlib import import_module
|
||||
from prometheus_client import REGISTRY, Counter, Gauge, CollectorRegistry, Summary
|
||||
from prometheus_client.metrics import MetricWrapperBase
|
||||
|
||||
from dlt.common import sleep, logger
|
||||
from dlt.common.runners import TRunArgs, TRunMetrics, create_default_args, initialize_runner, pool_runner
|
||||
from dlt.common.logger import process_internal_exception, pretty_format_exception
|
||||
from dlt.common.exceptions import TerminalValueError
|
||||
from dlt.common.dataset_writers import TWriterType
|
||||
from dlt.common.schema import Schema
|
||||
from dlt.common.storages import SchemaStorage
|
||||
from dlt.common.storages.loader_storage import LoaderStorage
|
||||
from dlt.common.telemetry import get_logging_extras, set_gauge_all_labels
|
||||
|
||||
from dlt.loaders.exceptions import LoadClientTerminalException, LoadClientTransientException, LoadJobNotExistsException
|
||||
from dlt.loaders.client_base import ClientBase, LoadJob
|
||||
from dlt.loaders.local_types import LoadJobStatus
|
||||
from dlt.loaders.configuration import configuration, LoaderConfiguration
|
||||
|
||||
|
||||
CONFIG: Type[LoaderConfiguration] = None
|
||||
load_storage: LoaderStorage = None
|
||||
client_module: ModuleType = None
|
||||
load_counter: Counter = None
|
||||
job_gauge: Gauge = None
|
||||
job_counter: Counter = None
|
||||
job_wait_summary: Summary = None
|
||||
|
||||
|
||||
def client_impl(client_type: str) -> ModuleType:
|
||||
return import_module(f".{client_type}.client", "dlt.loaders")
|
||||
|
||||
|
||||
def create_client(schema: Schema) -> ClientBase:
|
||||
return client_module.make_client(schema, CONFIG) # type: ignore
|
||||
|
||||
|
||||
def supported_writer() -> TWriterType:
|
||||
return client_module.supported_writer(CONFIG) # type: ignore
|
||||
|
||||
|
||||
def create_folders() -> LoaderStorage:
|
||||
load_storage = LoaderStorage(False, CONFIG, supported_writer())
|
||||
load_storage.initialize_storage()
|
||||
return load_storage
|
||||
|
||||
|
||||
def create_gauges(registry: CollectorRegistry) -> Tuple[MetricWrapperBase, MetricWrapperBase, MetricWrapperBase, MetricWrapperBase]:
|
||||
return (
|
||||
Counter("loader_load_package_counter", "Counts load package processed", registry=registry),
|
||||
Gauge("loader_last_package_jobs_counter", "Counts jobs in last package per status", ["status"], registry=registry),
|
||||
Counter("loader_jobs_counter", "Counts jobs per job status", ["status"], registry=registry),
|
||||
Summary("loader_jobs_wait_seconds", "Counts jobs total wait until completion", registry=registry)
|
||||
)
|
||||
|
||||
|
||||
def spool_job(file_path: str, load_id: str, schema: Schema) -> Optional[LoadJob]:
|
||||
# open new connection for each upload
|
||||
job: LoadJob = None
|
||||
try:
|
||||
with create_client(schema) as client:
|
||||
table_name, _ = load_storage.parse_load_file_name(file_path)
|
||||
logger.info(f"Will load file {file_path} with table name {table_name}")
|
||||
job = client.start_file_load(table_name, load_storage.storage._make_path(file_path))
|
||||
except (LoadClientTerminalException, TerminalValueError):
|
||||
# if job irreversible cannot be started, mark it as failed
|
||||
process_internal_exception(f"Terminal problem with spooling job {file_path}")
|
||||
job = ClientBase.make_job_with_status(file_path, "failed", pretty_format_exception())
|
||||
except (LoadClientTransientException, Exception):
|
||||
# return no job so file stays in new jobs (root) folder
|
||||
process_internal_exception(f"Temporary problem with spooling job {file_path}")
|
||||
return None
|
||||
load_storage.start_job(load_id, job.file_name())
|
||||
return job
|
||||
|
||||
|
||||
def spool_new_jobs(pool: ThreadPool, load_id: str, schema: Schema) -> Tuple[int, List[LoadJob]]:
|
||||
# TODO: validate file type, combine files, finalize etc., this is client specific, jsonl for single table
|
||||
# can just be combined, insert_values must be finalized and then combined
|
||||
# use thread based pool as jobs processing is mostly I/O and we do not want to pickle jobs
|
||||
# TODO: combine files by providing a list of files pertaining to same table into job, so job must be
|
||||
# extended to accept a list
|
||||
load_files = load_storage.list_new_jobs(load_id)[:CONFIG.MAX_PARALLEL_LOADS]
|
||||
file_count = len(load_files)
|
||||
if file_count == 0:
|
||||
logger.info(f"No new jobs found in {load_id}")
|
||||
return 0, []
|
||||
logger.info(f"Will load {file_count}, creating jobs")
|
||||
param_chunk = [(file, load_id, schema) for file in load_files]
|
||||
# exceptions should not be raised, None as job is a temporary failure
|
||||
# other jobs should not be affected
|
||||
jobs: List[LoadJob] = pool.starmap(spool_job, param_chunk)
|
||||
# remove None jobs and check the rest
|
||||
return file_count, [job for job in jobs if job is not None]
|
||||
|
||||
|
||||
def retrieve_jobs(client: ClientBase, load_id: str) -> Tuple[int, List[LoadJob]]:
|
||||
jobs: List[LoadJob] = []
|
||||
|
||||
# list all files that were started but not yet completed
|
||||
started_jobs = load_storage.list_started_jobs(load_id)
|
||||
logger.info(f"Found {len(started_jobs)} that are already started and should be continued")
|
||||
if len(started_jobs) == 0:
|
||||
return 0, jobs
|
||||
|
||||
for file_path in started_jobs:
|
||||
try:
|
||||
logger.info(f"Will retrieve {file_path}")
|
||||
job = client.get_file_load(file_path)
|
||||
except LoadClientTerminalException:
|
||||
process_internal_exception(f"Job retrieval for {file_path} failed, job will be terminated")
|
||||
job = ClientBase.make_job_with_status(file_path, "failed", pretty_format_exception())
|
||||
# proceed to appending job, do not reraise
|
||||
except (LoadClientTransientException, Exception):
|
||||
# raise on all temporary exceptions, typically network / server problems
|
||||
raise
|
||||
jobs.append(job)
|
||||
|
||||
job_gauge.labels("retrieved").inc()
|
||||
job_counter.labels("retrieved").inc()
|
||||
logger.metrics("Retrieve jobs metrics",
|
||||
extra=get_logging_extras([job_gauge.labels("retrieved"), job_counter.labels("retrieved")])
|
||||
)
|
||||
return len(jobs), jobs
|
||||
|
||||
|
||||
def complete_jobs(load_id: str, jobs: List[LoadJob]) -> List[LoadJob]:
|
||||
remaining_jobs: List[LoadJob] = []
|
||||
logger.info(f"Will complete {len(jobs)} for {load_id}")
|
||||
for ii in range(len(jobs)):
|
||||
job = jobs[ii]
|
||||
logger.debug(f"Checking status for job {job.file_name()}")
|
||||
status: LoadJobStatus = job.status()
|
||||
final_location: str = None
|
||||
if status == "running":
|
||||
# ask again
|
||||
logger.debug(f"job {job.file_name()} still running")
|
||||
remaining_jobs.append(job)
|
||||
elif status == "failed":
|
||||
# try to get exception message from job
|
||||
failed_message = job.exception()
|
||||
final_location = load_storage.fail_job(load_id, job.file_name(), failed_message)
|
||||
logger.error(f"Job for {job.file_name()} failed terminally in load {load_id} with message {failed_message}")
|
||||
elif status == "retry":
|
||||
# try to get exception message from job
|
||||
retry_message = job.exception()
|
||||
# move back to new folder to try again
|
||||
final_location = load_storage.retry_job(load_id, job.file_name())
|
||||
logger.error(f"Job for {job.file_name()} retried in load {load_id} with message {retry_message}")
|
||||
elif status == "completed":
|
||||
# move to completed folder
|
||||
final_location = load_storage.complete_job(load_id, job.file_name())
|
||||
logger.info(f"Job for {job.file_name()} completed in load {load_id}")
|
||||
|
||||
if status != "running":
|
||||
job_gauge.labels(status).inc()
|
||||
job_counter.labels(status).inc()
|
||||
job_wait_summary.observe(load_storage.job_elapsed_time_seconds(final_location))
|
||||
|
||||
logger.metrics("Completing jobs metrics", extra=get_logging_extras([job_counter, job_gauge, job_wait_summary]))
|
||||
return remaining_jobs
|
||||
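complete_jobs is a small state machine: every non-running status is routed to a storage action and recorded in the metrics. A hedged summary of the routing above, for reference only:

# illustrative mapping, not used by the loader itself
STATUS_TO_ACTION = {
    "running": "keep the job in remaining_jobs and poll again on the next pass",
    "failed": "load_storage.fail_job(...) with the message from job.exception()",
    "retry": "load_storage.retry_job(...) moves the file back to the new jobs folder",
    "completed": "load_storage.complete_job(...) moves the file to the completed folder",
}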
|
||||
|
||||
|
||||
def run(pool: ThreadPool) -> TRunMetrics:
|
||||
logger.info(f"Running file loading")
|
||||
# get list of loads and order by name ASC to execute schema updates
|
||||
loads = load_storage.list_loads()
|
||||
logger.info(f"Found {len(loads)} load packages")
|
||||
if len(loads) == 0:
|
||||
return TRunMetrics(True, False, 0)
|
||||
|
||||
load_id = loads[0]
|
||||
logger.info(f"Loading schema from load package in {load_id}")
|
||||
# one load package contains table from one schema
|
||||
schema_storage = SchemaStorage(load_storage.storage.storage_path)
|
||||
# get relative path to load schema from load package
|
||||
schema = schema_storage.load_folder_schema(load_storage.get_load_path(load_id))
|
||||
logger.info(f"Loaded schema name {schema.schema_name} and version {schema.schema_version}")
|
||||
# initialize analytical storage ie. create dataset required by passed schema
|
||||
with create_client(schema) as client:
|
||||
logger.info(f"Client {CONFIG.CLIENT_TYPE} will start load")
|
||||
client.initialize_storage()
|
||||
schema_update = load_storage.begin_schema_update(load_id)
|
||||
if schema_update:
|
||||
logger.info(f"Client {CONFIG.CLIENT_TYPE} will update schema to package schema")
|
||||
client.update_storage_schema()
|
||||
load_storage.commit_schema_update(load_id)
|
||||
# spool or retrieve unfinished jobs
|
||||
jobs_count, jobs = retrieve_jobs(client, load_id)
|
||||
if not jobs:
|
||||
# jobs count is a total number of jobs including those that could not be initialized
|
||||
jobs_count, jobs = spool_new_jobs(pool, load_id, schema)
|
||||
if jobs_count > 0:
|
||||
# this is a new load package
|
||||
set_gauge_all_labels(job_gauge, 0)
|
||||
job_gauge.labels("running").inc(len(jobs))
|
||||
job_counter.labels("running").inc(len(jobs))
|
||||
logger.metrics("New jobs metrics",
|
||||
extra=get_logging_extras([job_counter.labels("running"), job_gauge.labels("running")])
|
||||
)
|
||||
# if there are no existing or new jobs we archive the package
|
||||
if jobs_count == 0:
|
||||
with create_client(schema) as client:
|
||||
remaining_jobs = client.complete_load(load_id)
|
||||
load_storage.archive_load(load_id)
|
||||
logger.info(f"All jobs completed, archiving package {load_id}")
|
||||
load_counter.inc()
|
||||
logger.metrics("Load package metrics", extra=get_logging_extras([load_counter]))
|
||||
else:
|
||||
while True:
|
||||
remaining_jobs = complete_jobs(load_id, jobs)
|
||||
if len(remaining_jobs) == 0:
|
||||
break
|
||||
# process remaining jobs again
|
||||
jobs = remaining_jobs
|
||||
# this will raise on signal
|
||||
sleep(1)
|
||||
|
||||
return TRunMetrics(False, False, len(load_storage.list_loads()))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
CONFIG = configuration()
|
||||
parser = create_default_args(CONFIG)
|
||||
args = parser.parse_args()
|
||||
initialize_runner(CONFIG, TRunArgs(args.single_run, args.wait_runs))
|
||||
try:
|
||||
client_module = client_impl(CONFIG.CLIENT_TYPE)
|
||||
load_counter, job_gauge, job_counter, job_wait_summary = create_gauges(REGISTRY)
|
||||
load_storage = create_folders()
|
||||
except Exception:
|
||||
process_internal_exception("run")
|
||||
exit(-1)
|
||||
exit(pool_runner(CONFIG, run))
|
||||
4
dlt/loaders/local_types.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from typing import Literal
|
||||
|
||||
|
||||
LoadJobStatus = Literal["running", "failed", "retry", "completed"]
|
||||
23
dlt/loaders/redshift/README.md
Normal file
@@ -0,0 +1,23 @@
|
||||
# Public Access setup
|
||||
There is a *Modify publicly accessible settings* option in the Actions menu of each Redshift cluster. Assign your IP address there.
|
||||
|
||||
# Runtime optimization
|
||||
|
||||
https://www.intermix.io/blog/top-14-performance-tuning-techniques-for-amazon-redshift/
|
||||
|
||||
1. We should use a separate work queue for the loader user
|
||||
2. They suggest not using DIST keys
|
||||
3. Data must be inserted in the order of the SORTKEY
|
||||
|
||||
# loader account setup
|
||||
|
||||
1. Create a new database: `CREATE DATABASE chat_analytics_rasa_ci`
|
||||
2. Create a new user and set a password
|
||||
3. Make the new user the database owner (a lower permission level would also work): `ALTER DATABASE chat_analytics_rasa_ci OWNER TO loader`
|
||||
|
||||
# Public access setup for Serverless
|
||||
Follow https://docs.aws.amazon.com/redshift/latest/mgmt/serverless-connecting.html `Connecting from the public subnet to the Amazon Redshift Serverless endpoint using Network Load Balancer`
|
||||
|
||||
That will use a Terraform template to create a load balancer endpoint and assign a public IP. The cost of the load balancer is about $16/month plus the cost of the IP.
|
||||
|
||||
It seems that port 5439 is closed within the VPC that Redshift Serverless created for itself. In the cluster panel, under Data Access : VPC security group, add an inbound rule allowing port 5439 from any subnet (0.0.0.0/0).
|
||||
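The account setup above can be scripted. A minimal sketch using psycopg2 (the client used by the Redshift loader in this diff); the database and user names are the examples from this README, while the endpoint and passwords are placeholders:

import psycopg2

# connect as an admin user to the default database; fill in the placeholders
conn = psycopg2.connect(dbname="dev", user="admin", password="<admin password>",
                        host="<cluster endpoint>", port=5439)
conn.set_session(autocommit=True)  # CREATE DATABASE cannot run inside a transaction
with conn.cursor() as cur:
    cur.execute("CREATE DATABASE chat_analytics_rasa_ci")
    cur.execute("CREATE USER loader PASSWORD '<loader password>'")
    cur.execute("ALTER DATABASE chat_analytics_rasa_ci OWNER TO loader")
conn.close()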
0
dlt/loaders/redshift/__init__.py
Normal file
282
dlt/loaders/redshift/client.py
Normal file
@@ -0,0 +1,282 @@
|
||||
import os
|
||||
import psycopg2
|
||||
from psycopg2.sql import SQL, Identifier, Composed, Literal as SQLLiteral
|
||||
from typing import Any, AnyStr, Dict, List, Literal, Optional, Tuple, Type
|
||||
|
||||
from dlt.common.typing import StrAny
|
||||
from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE
|
||||
from dlt.common.configuration import PostgresConfiguration
|
||||
from dlt.common.dataset_writers import TWriterType, escape_redshift_identifier
|
||||
from dlt.common.schema import COLUMN_HINTS, Column, ColumnBase, DataType, HintType, Schema, SchemaUpdate, Table
|
||||
|
||||
from dlt.loaders.exceptions import (LoadClientSchemaWillNotUpdate, LoadClientTerminalInnerException,
|
||||
LoadClientTransientInnerException, LoadFileTooBig)
|
||||
from dlt.loaders.local_types import LoadJobStatus
|
||||
from dlt.loaders.client_base import ClientBase, SqlClientBase, LoadJob
|
||||
|
||||
SCT_TO_PGT: Dict[DataType, str] = {
|
||||
"text": "varchar(max)",
|
||||
"double": "double precision",
|
||||
"bool": "boolean",
|
||||
"timestamp": "timestamp with time zone",
|
||||
"bigint": "bigint",
|
||||
"binary": "varbinary",
|
||||
"decimal": f"numeric({DEFAULT_NUMERIC_PRECISION},{DEFAULT_NUMERIC_SCALE})"
|
||||
}
|
||||
|
||||
PGT_TO_SCT: Dict[str, DataType] = {
|
||||
"varchar(max)": "text",
|
||||
"double precision": "double",
|
||||
"boolean": "bool",
|
||||
"timestamp with time zone": "timestamp",
|
||||
"bigint": "bigint",
|
||||
"binary varying": "binary",
|
||||
"numeric": "decimal"
|
||||
}
|
||||
|
||||
HINT_TO_REDSHIFT_ATTR: Dict[HintType, str] = {
|
||||
"cluster": "DISTKEY",
|
||||
# it is better to not enforce constraints in redshift
|
||||
# "primary_key": "PRIMARY KEY",
|
||||
"sort": "SORTKEY"
|
||||
}
|
||||
|
||||
|
||||
class SqlClientMixin:
|
||||
|
||||
MAX_STATEMENT_SIZE = 16 * 1024 * 1024  # 16 MB, the Redshift statement size limit checked in _insert
|
||||
|
||||
def __init__(self, CONFIG: Type[PostgresConfiguration], *args: Any, **kwargs: Any) -> None:
|
||||
super().__init__(*args, **kwargs)
|
||||
self._conn: psycopg2.connection = None
|
||||
self.C = CONFIG
|
||||
|
||||
def _open_connection(self) -> None:
|
||||
self._conn = psycopg2.connect(dbname=self.C.PG_DATABASE_NAME,
|
||||
user=self.C.PG_USER,
|
||||
host=self.C.PG_HOST,
|
||||
port=self.C.PG_PORT,
|
||||
password=self.C.PG_PASSWORD,
|
||||
connect_timeout=self.C.PG_CONNECTION_TIMEOUT
|
||||
)
|
||||
# we'll provide explicit transactions
|
||||
self._conn.set_session(autocommit=True)
|
||||
|
||||
def _close_connection(self) -> None:
|
||||
if self._conn:
|
||||
self._conn.close()
|
||||
self._conn = None
|
||||
|
||||
def _execute_sql(self, query: AnyStr) -> Any:
|
||||
curr: psycopg2.cursor
|
||||
with self._conn.cursor() as curr:
|
||||
try:
|
||||
curr.execute(query)
|
||||
except psycopg2.Error as outer:
|
||||
try:
|
||||
self._conn.rollback()
|
||||
self._conn.reset()
|
||||
except psycopg2.Error:
|
||||
self._close_connection()
|
||||
self._open_connection()
|
||||
raise outer
|
||||
if curr.description is None:
|
||||
return None
|
||||
else:
|
||||
f = curr.fetchall()
|
||||
return f
|
||||
|
||||
|
||||
class RedshiftInsertLoadJob(SqlClientMixin, LoadJob):
|
||||
def __init__(self, canonical_table_name: str, file_path: str, conn: Any, CONFIG: Type[PostgresConfiguration]) -> None:
|
||||
super().__init__(CONFIG, ClientBase.get_file_name_from_file_path(file_path))
|
||||
self._conn = conn
|
||||
# insert file content immediately
|
||||
self._insert(canonical_table_name, file_path)
|
||||
|
||||
def status(self) -> LoadJobStatus:
|
||||
# this job is always done
|
||||
return "completed"
|
||||
|
||||
def file_name(self) -> str:
|
||||
return self._file_name
|
||||
|
||||
def exception(self) -> str:
|
||||
# this part of code should be never reached
|
||||
raise NotImplementedError()
|
||||
|
||||
def _insert(self, canonical_table_name: str, file_path: str) -> None:
|
||||
# TODO: implement tracking of jobs in storage, both completed and failed
|
||||
# WARNING: maximum redshift statement is 16MB https://docs.aws.amazon.com/redshift/latest/dg/c_redshift-sql.html
|
||||
# in case of postgres: 2GiB
|
||||
if os.stat(file_path).st_size >= SqlClientMixin.MAX_STATEMENT_SIZE:
|
||||
# terminal exception
|
||||
raise LoadFileTooBig(file_path, SqlClientMixin.MAX_STATEMENT_SIZE)
|
||||
with open(file_path, "r") as f:
|
||||
header = f.readline()
|
||||
content = f.read()
|
||||
sql = Composed(
|
||||
[SQL("BEGIN TRANSACTION;"),
|
||||
SQL(header).format(SQL(canonical_table_name)),
|
||||
SQL(content),
|
||||
SQL("COMMIT TRANSACTION;")]
|
||||
)
|
||||
self._execute_sql(sql)
|
||||
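_insert assumes the insert_values writer produced a file whose first line is an INSERT header carrying a {} placeholder for the canonical table name, with the VALUES body as the rest of the file; the whole file is then executed inside one transaction, subject to the statement-size check above. A hedged illustration of that assumed layout (the column names and values are invented):

# purely illustrative file content; the real layout comes from the insert_values writer
header = "INSERT INTO {}(sender, value) VALUES\n"  # single line read by readline()
content = "('0xabc', 1),\n('0xdef', 2);"           # remainder read by f.read()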
|
||||
|
||||
class RedshiftClient(SqlClientMixin, SqlClientBase):
|
||||
def __init__(self, schema: Schema, CONFIG: Type[PostgresConfiguration]) -> None:
|
||||
super().__init__(CONFIG, schema)
|
||||
|
||||
def initialize_storage(self) -> None:
|
||||
schema_name = self._to_canonical_schema_name()
|
||||
query = """
|
||||
SELECT 1
|
||||
FROM INFORMATION_SCHEMA.SCHEMATA
|
||||
WHERE schema_name = {};
|
||||
"""
|
||||
rows = self._execute_sql(SQL(query).format(SQLLiteral(schema_name)))
|
||||
if len(rows) == 0:
|
||||
self._execute_sql(SQL("CREATE SCHEMA {};").format(Identifier(schema_name)))
|
||||
|
||||
def get_file_load(self, file_path: str) -> LoadJob:
|
||||
# always returns completed jobs as RedshiftInsertLoadJob is executed
|
||||
# atomically in start_file_load so any jobs that should be recreated are already completed
|
||||
# in case of bugs in loader (asking for jobs that were never created) we are not able to detect that
|
||||
return ClientBase.make_job_with_status(file_path, "completed")
|
||||
|
||||
def start_file_load(self, table_name: str, file_path: str) -> LoadJob:
|
||||
# verify that table exists in the schema
|
||||
self._get_table_by_name(table_name, file_path)
|
||||
try:
|
||||
return RedshiftInsertLoadJob(self._to_canonical_table_name(table_name), file_path, self._conn, self.C)
|
||||
except (psycopg2.OperationalError, psycopg2.InternalError) as tr_ex:
|
||||
if tr_ex.pgerror is not None:
|
||||
if "Cannot insert a NULL value into column" in tr_ex.pgerror:
|
||||
# NULL violations is internal error, probably a redshift thing
|
||||
raise LoadClientTerminalInnerException("Terminal error, file will not load", tr_ex)
|
||||
if "Numeric data overflow" in tr_ex.pgerror:
|
||||
raise LoadClientTerminalInnerException("Terminal error, file will not load", tr_ex)
|
||||
if "Precision exceeds maximum":
|
||||
raise LoadClientTerminalInnerException("Terminal error, file will not load", tr_ex)
|
||||
raise LoadClientTransientInnerException("Error may go away, will retry", tr_ex)
|
||||
except (psycopg2.DataError, psycopg2.ProgrammingError, psycopg2.IntegrityError) as ter_ex:
|
||||
raise LoadClientTerminalInnerException("Terminal error, file will not load", ter_ex)
|
||||
|
||||
def update_storage_schema(self) -> None:
|
||||
storage_version = self._get_schema_version_from_storage()
|
||||
if storage_version < self.schema.schema_version:
|
||||
for sql in self._build_schema_update_sql():
|
||||
self._execute_sql(sql)
|
||||
self._update_schema_version(self.schema.schema_version)
|
||||
|
||||
def _get_schema_version_from_storage(self) -> int:
|
||||
try:
|
||||
return super()._get_schema_version_from_storage()
|
||||
except psycopg2.ProgrammingError:
|
||||
# there's no table so there's no schema
|
||||
return 0
|
||||
|
||||
def _build_schema_update_sql(self) -> List[str]:
|
||||
sql_updates = []
|
||||
for table_name in self.schema.schema_tables:
|
||||
exists, storage_table = self._get_storage_table(table_name)
|
||||
sql = self._get_table_update_sql(table_name, storage_table, exists)
|
||||
if sql:
|
||||
sql_updates.append(sql)
|
||||
return sql_updates
|
||||
|
||||
def _get_table_update_sql(self, table_name: str, storage_table: Table, exists: bool) -> str:
|
||||
new_columns = self._create_table_update(table_name, storage_table)
|
||||
if len(new_columns) == 0:
|
||||
# no changes
|
||||
return None
|
||||
# build sql
|
||||
canonical_name = self._to_canonical_table_name(table_name)
|
||||
sql = "BEGIN TRANSACTION;\n"
|
||||
if not exists:
|
||||
# build CREATE
|
||||
sql += f"CREATE TABLE {canonical_name} (\n"
|
||||
sql += ",\n".join([self._get_column_def_sql(c) for c in new_columns])
|
||||
sql += ");"
|
||||
else:
|
||||
# build ALTER as separate statement for each column (redshift limitation)
|
||||
sql += "\n".join([f"ALTER TABLE {canonical_name}\nADD COLUMN {self._get_column_def_sql(c)};" for c in new_columns])
|
||||
# scan columns to get hints
|
||||
if exists:
|
||||
# no hints may be specified on added columns
|
||||
for hint in COLUMN_HINTS:
|
||||
if any(c.get(hint, False) is True for c in new_columns):
|
||||
hint_columns = [c["name"] for c in new_columns if c.get(hint, False)]
|
||||
raise LoadClientSchemaWillNotUpdate(canonical_name, hint_columns, f"{hint} requested after table was created")
|
||||
# TODO: add FK relations
|
||||
sql += "\nCOMMIT TRANSACTION;"
|
||||
return sql
|
||||
|
||||
def _get_column_def_sql(self, c: Column) -> str:
|
||||
hints_str = " ".join(HINT_TO_REDSHIFT_ATTR.get(h, "") for h in HINT_TO_REDSHIFT_ATTR.keys() if c.get(h, False) is True)
|
||||
column_name = escape_redshift_identifier(c["name"])
|
||||
return f"{column_name} {self._sc_t_to_pq_t(c['data_type'])} {hints_str} {self._gen_not_null(c['nullable'])}"
|
||||
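_get_column_def_sql renders one column definition with any DISTKEY/SORTKEY attributes taken from the hint map above. For a hypothetical sorted bigint column it would produce roughly the following; the exact identifier quoting depends on escape_redshift_identifier:

# illustrative only; the column dict is hypothetical
c = {"name": "block_number", "data_type": "bigint", "nullable": False, "sort": True, "cluster": False}
# expected rendering, up to identifier quoting:
# block_number bigint SORTKEY NOT NULL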
|
||||
def _get_storage_table(self, table_name: str) -> Tuple[bool, Table]:
|
||||
schema_table: Table = {}
|
||||
query = f"""
|
||||
SELECT column_name, data_type, is_nullable, numeric_precision, numeric_scale
|
||||
FROM INFORMATION_SCHEMA.COLUMNS
|
||||
WHERE table_schema = '{self._to_canonical_schema_name()}' AND table_name = '{table_name}'
|
||||
ORDER BY ordinal_position;
|
||||
"""
|
||||
rows = self._execute_sql(query)
|
||||
# if no rows we assume that table does not exist
|
||||
if len(rows) == 0:
|
||||
# TODO: additionally check if table exists
|
||||
return False, schema_table
|
||||
# TODO: pull more data to infer DISTKEY, PK and SORTKEY attributes/constraints
|
||||
for c in rows:
|
||||
schema_c: ColumnBase = {
|
||||
"name": c[0],
|
||||
"nullable": self._null_to_bool(c[2]),
|
||||
"data_type": self._pq_t_to_sc_t(c[1], c[3], c[4]),
|
||||
}
|
||||
schema_table[c[0]] = Schema._add_missing_hints(schema_c)
|
||||
return True, schema_table
|
||||
|
||||
|
||||
def _to_canonical_schema_name(self) -> str:
|
||||
return f"{self.C.PG_SCHEMA_PREFIX}_{self.schema.schema_name}"
|
||||
|
||||
def _to_canonical_table_name(self, table_name: str) -> str:
|
||||
return f"{self._to_canonical_schema_name()}.{table_name}"
|
||||
|
||||
@staticmethod
|
||||
def _null_to_bool(v: str) -> bool:
|
||||
if v == "NO":
|
||||
return False
|
||||
elif v == "YES":
|
||||
return True
|
||||
raise ValueError(v)
|
||||
|
||||
@staticmethod
|
||||
def _gen_not_null(v: bool) -> str:
|
||||
return "NOT NULL" if not v else ""
|
||||
|
||||
@staticmethod
|
||||
def _sc_t_to_pq_t(sc_t: DataType) -> str:
|
||||
if sc_t == "wei":
|
||||
return f"numeric({DEFAULT_NUMERIC_PRECISION},0)"
|
||||
return SCT_TO_PGT[sc_t]
|
||||
|
||||
@staticmethod
|
||||
def _pq_t_to_sc_t(pq_t: str, precision: Optional[int], scale: Optional[int]) -> DataType:
|
||||
if pq_t == "numeric":
|
||||
if precision == DEFAULT_NUMERIC_PRECISION and scale == 0:
|
||||
return "wei"
|
||||
return PGT_TO_SCT.get(pq_t, "text")
|
||||
|
||||
|
||||
def make_client(schema: Schema, C: Type[PostgresConfiguration]) -> RedshiftClient:
|
||||
return RedshiftClient(schema, C)
|
||||
|
||||
|
||||
def supported_writer(C: Type[PostgresConfiguration]) -> TWriterType:
|
||||
return "insert_values"
|
||||
@@ -5,21 +5,21 @@ import os.path
|
||||
from typing import Callable, Dict, Iterator, List, Literal, Sequence, Tuple
|
||||
from prometheus_client import REGISTRY
|
||||
|
||||
from autopoiesis.common import json, runners
|
||||
from autopoiesis.common.configuration import BasicConfiguration, make_configuration
|
||||
from autopoiesis.common.configuration.utils import TConfigSecret
|
||||
from autopoiesis.common.file_storage import FileStorage
|
||||
from autopoiesis.common.logger import process_internal_exception
|
||||
from autopoiesis.common.runners import TRunArgs, TRunMetrics
|
||||
from autopoiesis.common.schema import Schema, StoredSchema
|
||||
from autopoiesis.common.typing import DictStrAny, StrAny
|
||||
from autopoiesis.common.utils import uniq_id, is_interactive
|
||||
from dlt.common import json, runners
|
||||
from dlt.common.configuration import BasicConfiguration, make_configuration
|
||||
from dlt.common.configuration.utils import TConfigSecret
|
||||
from dlt.common.file_storage import FileStorage
|
||||
from dlt.common.logger import process_internal_exception
|
||||
from dlt.common.runners import TRunArgs, TRunMetrics
|
||||
from dlt.common.schema import Schema, StoredSchema
|
||||
from dlt.common.typing import DictStrAny, StrAny
|
||||
from dlt.common.utils import uniq_id, is_interactive
|
||||
|
||||
from autopoiesis.extractors.extractor_storage import ExtractorStorageBase
|
||||
from autopoiesis.unpacker.configuration import configuration as unpacker_configuration
|
||||
from autopoiesis.loaders.configuration import configuration as loader_configuration
|
||||
from autopoiesis.unpacker import unpacker
|
||||
from autopoiesis.loaders import loader
|
||||
from dlt.extractors.extractor_storage import ExtractorStorageBase
|
||||
from dlt.unpacker.configuration import configuration as unpacker_configuration
|
||||
from dlt.loaders.configuration import configuration as loader_configuration
|
||||
from dlt.unpacker import unpacker
|
||||
from dlt.loaders import loader
|
||||
|
||||
TClientType = Literal["gcp", "redshift"]
|
||||
|
||||
|
||||
0
dlt/py.typed
Normal file
1
dlt/unpacker/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from dlt._version import unpacker_version as __version__
|
||||
29
dlt/unpacker/configuration.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from typing import Type
|
||||
|
||||
from dlt.common.typing import StrAny
|
||||
from dlt.common.configuration.pool_runner_configuration import TPoolType
|
||||
from dlt.common.dataset_writers import TWriterType
|
||||
|
||||
from dlt.common.configuration import (PoolRunnerConfiguration, UnpackingVolumeConfiguration,
|
||||
LoadingVolumeConfiguration, SchemaVolumeConfiguration,
|
||||
ProductionLoadingVolumeConfiguration, ProductionUnpackingVolumeConfiguration,
|
||||
ProductionSchemaVolumeConfiguration,
|
||||
TPoolType, make_configuration)
|
||||
|
||||
from . import __version__
|
||||
|
||||
|
||||
class UnpackerConfiguration(PoolRunnerConfiguration, UnpackingVolumeConfiguration, LoadingVolumeConfiguration, SchemaVolumeConfiguration):
|
||||
MAX_EVENTS_IN_CHUNK: int = 40000 # maximum events to be processed in single chunk
|
||||
WRITER_TYPE: TWriterType = "jsonl" # jsonp or insert commands will be generated
|
||||
ADD_EVENT_JSON: bool = True # add event json to "event" table, useful for debugging or recreating tracker
|
||||
POOL_TYPE: TPoolType = "process"
|
||||
|
||||
|
||||
class ProductionUnpackerConfiguration(ProductionUnpackingVolumeConfiguration, ProductionLoadingVolumeConfiguration,
|
||||
ProductionSchemaVolumeConfiguration, UnpackerConfiguration):
|
||||
pass
|
||||
|
||||
|
||||
def configuration(initial_values: StrAny = None) -> Type[UnpackerConfiguration]:
|
||||
return make_configuration(UnpackerConfiguration, ProductionUnpackerConfiguration, initial_values=initial_values)
|
||||
0
dlt/unpacker/exceptions.py
Normal file
249
dlt/unpacker/unpacker.py
Normal file
@@ -0,0 +1,249 @@
|
||||
from typing import Any, Callable, Type, List, Dict, Optional, Sequence, Tuple
|
||||
from multiprocessing.pool import Pool as ProcessPool
|
||||
from itertools import chain
|
||||
from prometheus_client import Counter, CollectorRegistry, REGISTRY, Gauge
|
||||
from prometheus_client.metrics import MetricWrapperBase
|
||||
|
||||
from dlt.common import pendulum, signals, json, logger
|
||||
from dlt.common.runners import TRunArgs, TRunMetrics, create_default_args, pool_runner, initialize_runner
|
||||
from dlt.common.storages.unpacker_storage import UnpackerStorage
|
||||
from dlt.common.telemetry import get_logging_extras
|
||||
from dlt.common.utils import uniq_id
|
||||
from dlt.common.typing import TEvent
|
||||
from dlt.common.logger import process_internal_exception
|
||||
from dlt.common.exceptions import PoolException
|
||||
from dlt.common.storages import SchemaStorage
|
||||
from dlt.common.schema import CannotCoerceColumnException, SchemaUpdate, Schema
|
||||
from dlt.common.parser import PATH_SEPARATOR
|
||||
from dlt.common.storages.loader_storage import LoaderStorage
|
||||
|
||||
from dlt.common.parser import extract, TExtractFunc
|
||||
from dlt.unpacker.configuration import configuration, UnpackerConfiguration
|
||||
|
||||
extract_func: TExtractFunc = extract
|
||||
CONFIG: Type[UnpackerConfiguration] = None
|
||||
unpack_storage: UnpackerStorage = None
|
||||
load_storage: LoaderStorage = None
|
||||
schema_storage: SchemaStorage = None
|
||||
load_schema_storage: SchemaStorage = None
|
||||
event_counter: Counter = None
|
||||
event_gauge: Gauge = None
|
||||
schema_version_gauge: Gauge = None
|
||||
load_package_counter: Counter = None
|
||||
|
||||
|
||||
def create_gauges(registry: CollectorRegistry) -> Tuple[MetricWrapperBase, MetricWrapperBase, MetricWrapperBase, MetricWrapperBase]:
|
||||
return (
|
||||
Counter("unpacker_event_count", "Events processed in unpacker", ["schema"], registry=registry),
|
||||
Gauge("unpacker_last_events", "Number of events processed in last run", ["schema"], registry=registry),
|
||||
Gauge("unpacker_schema_version", "Current schema version", ["schema"], registry=registry),
|
||||
Gauge("unpacker_load_packages_created_count", "Count of load package created", ["schema"], registry=registry)
|
||||
)
|
||||
|
||||
|
||||
def create_folders() -> Tuple[UnpackerStorage, LoaderStorage, SchemaStorage, SchemaStorage]:
|
||||
unpack_storage = UnpackerStorage(True, CONFIG)
|
||||
schema_storage = SchemaStorage(CONFIG.SCHEMA_VOLUME_PATH, makedirs=True)
|
||||
load_schema_storage = SchemaStorage(CONFIG.LOADING_VOLUME_PATH, makedirs=False)
|
||||
load_storage = LoaderStorage(True, CONFIG, CONFIG.WRITER_TYPE)
|
||||
|
||||
unpack_storage.initialize_storage()
|
||||
load_storage.initialize_storage()
|
||||
|
||||
return unpack_storage, load_storage, schema_storage, load_schema_storage
|
||||
|
||||
|
||||
def install_schemas(default_schemas_path: str, schema_names: List[str]) -> None:
|
||||
# copy default schemas if not present
|
||||
default_schemas = SchemaStorage(default_schemas_path)
|
||||
logger.info(f"Checking default schemas in {schema_storage.storage.storage_path}")
|
||||
for name in schema_names:
|
||||
if not schema_storage.has_store_schema(name):
|
||||
logger.info(f"Schema, {name} not present in {schema_storage.storage.storage_path}, installing...")
|
||||
schema = default_schemas.load_store_schema(name)
|
||||
schema_storage.save_store_schema(schema)
|
||||
|
||||
|
||||
def load_or_create_schema(schema_name: str) -> Schema:
|
||||
try:
|
||||
schema = schema_storage.load_store_schema(schema_name)
|
||||
logger.info(f"Loaded schema with name {schema_name} with version {schema.schema_version}")
|
||||
except FileNotFoundError:
|
||||
schema = Schema(schema_name)
|
||||
logger.info(f"Created new schema with name {schema_name}")
|
||||
return schema
|
||||
|
||||
|
||||
# this is a worker process
|
||||
def w_unpack_files(schema_name: str, load_id: str, events_files: Sequence[str]) -> SchemaUpdate:
|
||||
unpacked_data: Dict[str, List[Any]] = {}
|
||||
|
||||
schema_update: SchemaUpdate = {}
|
||||
schema = load_or_create_schema(schema_name)
|
||||
file_id = uniq_id()
|
||||
|
||||
# process all event files and store rows in memory
|
||||
for events_file in events_files:
|
||||
try:
|
||||
logger.debug(f"Processing events file {events_file}")
|
||||
with unpack_storage.storage.open(events_file) as f:
|
||||
events: Sequence[TEvent] = json.load(f)
|
||||
for event in events:
|
||||
for table_name, row in extract_func(schema, event, load_id, CONFIG.ADD_EVENT_JSON):
|
||||
# filter row, may eliminate some or all fields
|
||||
row = schema.filter_row(table_name, row, PATH_SEPARATOR)
|
||||
# do not process empty rows
|
||||
if row:
|
||||
# check if schema can be updated
|
||||
row, table_update = schema.coerce_row(table_name, row)
|
||||
if len(table_update) > 0:
|
||||
# update schema and save the change
|
||||
schema.update_schema(table_name, table_update)
|
||||
table_updates = schema_update.setdefault(table_name, [])
|
||||
table_updates.extend(table_update)
|
||||
# store row
|
||||
rows = unpacked_data.setdefault(table_name, [])
|
||||
rows.append(row)
|
||||
except Exception:
|
||||
process_internal_exception(f"Exception when processing file {events_file}")
|
||||
raise PoolException("unpack_files", events_file)
|
||||
|
||||
# save rows and return schema changes to be gathered in parent process
|
||||
for table_name, rows in unpacked_data.items():
|
||||
# save into new jobs to processed as load
|
||||
table = schema.get_table(table_name)
|
||||
load_storage.write_temp_loading_file(load_id, table_name, table, file_id, rows)
|
||||
|
||||
return schema_update
|
||||
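The worker returns only the schema deltas it produced; the unpacked rows themselves go straight to the loader storage, and the parent process merges the deltas per table in update_schema below. A hedged illustration of the returned shape (table and column contents are hypothetical):

# illustrative SchemaUpdate: table name -> list of new or updated column definitions
schema_update = {
    "blocks": [
        {"name": "gas_used", "data_type": "bigint", "nullable": True},
    ],
}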
|
||||
|
||||
TMapFuncRV = Tuple[List[SchemaUpdate], List[Sequence[str]]]
|
||||
TMapFuncType = Callable[[ProcessPool, str, str, Sequence[str]], TMapFuncRV]
|
||||
|
||||
def map_parallel(pool: ProcessPool, schema_name: str, load_id: str, files: Sequence[str]) -> TMapFuncRV:
|
||||
# we chunk files in a way to not exceed MAX_EVENTS_IN_CHUNK and split them equally
|
||||
# between processors
|
||||
configured_processes = pool._processes # type: ignore
|
||||
chunk_files = UnpackerStorage.chunk_by_events(files, CONFIG.MAX_EVENTS_IN_CHUNK, configured_processes)
|
||||
logger.info(f"Obtained {len(chunk_files)} processing chunks")
|
||||
param_chunk = [(schema_name, load_id, files) for files in chunk_files]
|
||||
return pool.starmap(w_unpack_files, param_chunk), chunk_files
|
||||
|
||||
|
||||
def map_single(_: ProcessPool, schema_name: str, load_id: str, files: Sequence[str]) -> TMapFuncRV:
|
||||
chunk_files = UnpackerStorage.chunk_by_events(files, CONFIG.MAX_EVENTS_IN_CHUNK, 1)
|
||||
# get in one chunk
|
||||
assert len(chunk_files) == 1
|
||||
logger.info(f"Obtained {len(chunk_files)} processing chunks")
|
||||
return [w_unpack_files(schema_name, load_id, chunk_files[0])], chunk_files
|
||||
|
||||
|
||||
def update_schema(schema_name: str, schema_updates: List[SchemaUpdate]) -> Schema:
|
||||
schema = load_or_create_schema(schema_name)
|
||||
# gather schema from all manifests, validate consistency and combine
|
||||
for schema_update in schema_updates:
|
||||
for table_name, table_updates in schema_update.items():
|
||||
logger.debug(f"Updating schema for table {table_name} with {len(table_updates)} deltas")
|
||||
schema.update_schema(table_name, table_updates)
|
||||
return schema
|
||||
|
||||
|
||||
def spool_files(pool: ProcessPool, schema_name: str, load_id: str, map_f: TMapFuncType, files: Sequence[str]) -> None:
|
||||
# process files in parallel or in single thread, depending on map_f
|
||||
schema_updates, chunk_files = map_f(pool, schema_name, load_id, files)
|
||||
|
||||
schema = update_schema(schema_name, schema_updates)
|
||||
schema_version_gauge.labels(schema_name).set(schema._version)
|
||||
logger.metrics("Unpacker metrics", extra=get_logging_extras([schema_version_gauge.labels(schema_name)]))
|
||||
logger.info(f"Saving schema {schema_name} with version {schema._version}, writing manifest files")
|
||||
# schema is updated, save it to schema volume
|
||||
schema_storage.save_store_schema(schema)
|
||||
# save schema and schema updates to temp load folder
|
||||
load_schema_storage.save_folder_schema(schema, load_id)
|
||||
load_storage.save_schema_updates(load_id, schema_updates)
|
||||
# files must be renamed and deleted together so do not attempt that when process is about to be terminated
|
||||
signals.raise_if_signalled()
|
||||
logger.info(f"Committing storage, do not kill this process")
|
||||
# rename temp folder to processing
|
||||
load_storage.commit_temp_load_folder(load_id)
|
||||
# delete event files and count events to provide metrics
|
||||
total_events = 0
|
||||
for event_file in chain.from_iterable(chunk_files): # flatten chunks
|
||||
unpack_storage.storage.delete(event_file)
|
||||
total_events += UnpackerStorage.get_events_count(event_file)
|
||||
# log and update metrics
|
||||
logger.info(f"Chunk {load_id} processed")
|
||||
load_package_counter.labels(schema_name).inc()
|
||||
event_counter.labels(schema_name).inc(total_events)
|
||||
event_gauge.labels(schema_name).set(total_events)
|
||||
logger.metrics("Unpacker metrics", extra=get_logging_extras(
|
||||
[load_package_counter.labels(schema_name), event_counter.labels(schema_name), event_gauge.labels(schema_name)]))
|
||||
|
||||
|
||||
def spool_schema_files(pool: ProcessPool, schema_name: str, files: Sequence[str]) -> str:
|
||||
# unpacked files will go here before being atomically renamed
|
||||
load_id = str(pendulum.now().timestamp())
|
||||
load_storage.create_temp_load_folder(load_id)
|
||||
logger.info(f"Created temp load folder {load_id} on loading volume")
|
||||
|
||||
try:
|
||||
# process parallel
|
||||
spool_files(pool, schema_name, load_id, map_parallel, files)
|
||||
except CannotCoerceColumnException as exc:
|
||||
# schema conflicts resulting from parallel executing
|
||||
logger.warning(f"Parallel schema update conflict, switching to single thread ({str(exc)}")
|
||||
# start from scratch
|
||||
load_storage.create_temp_load_folder(load_id)
|
||||
spool_files(pool, schema_name, load_id, map_single, files)
|
||||
|
||||
return load_id
|
||||
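The fallback exists because parallel workers infer schema independently: two workers can coerce the same new column to incompatible types from different event files, and merging their deltas then raises CannotCoerceColumnException. Re-running the chunk single-threaded makes the inference sequential and deterministic. A hedged illustration of such a conflict, with invented column data:

# hypothetical conflicting deltas produced by two workers for the same new column
worker_a_update = {"event": [{"name": "value", "data_type": "bigint", "nullable": True}]}
worker_b_update = {"event": [{"name": "value", "data_type": "text", "nullable": True}]}
# update_schema cannot reconcile bigint vs text, so spool_schema_files retries with map_single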
|
||||
|
||||
def run(pool: ProcessPool) -> TRunMetrics:
|
||||
logger.info(f"Running file unpacking")
|
||||
# list files and group by schema name, list must be sorted for group by to actually work
|
||||
files = unpack_storage.list_files_to_unpack_sorted()
|
||||
logger.info(f"Found {len(files)} files, will process in chunks of {CONFIG.MAX_EVENTS_IN_CHUNK} of events")
|
||||
if len(files) == 0:
|
||||
return TRunMetrics(True, False, 0)
|
||||
# group files by schema
|
||||
for schema_name, files_in_schema in unpack_storage.get_grouped_iterator(files):
|
||||
logger.info(f"Found files in schema {schema_name}")
|
||||
spool_schema_files(pool, schema_name, list(files_in_schema))
|
||||
# return info on still pending files (if extractor saved something in the meantime)
|
||||
return TRunMetrics(False, False, len(unpack_storage.list_files_to_unpack_sorted()))
|
||||
|
||||
|
||||
def configure(C: Type[UnpackerConfiguration], collector: CollectorRegistry, extract_f: TExtractFunc, default_schemas_path: str = None, schema_names: List[str] = None) -> bool:
|
||||
global CONFIG
|
||||
global unpack_storage, load_storage, schema_storage, load_schema_storage
|
||||
global event_counter, event_gauge, schema_version_gauge, load_package_counter
|
||||
global extract_func
|
||||
|
||||
CONFIG = C
|
||||
# set extracting parser function
|
||||
extract_func = extract_f
|
||||
try:
|
||||
unpack_storage, load_storage, schema_storage, load_schema_storage = create_folders()
|
||||
event_counter, event_gauge, schema_version_gauge, load_package_counter = create_gauges(collector)
|
||||
if default_schemas_path and schema_names:
|
||||
install_schemas(default_schemas_path, schema_names)
|
||||
return True
|
||||
except Exception:
|
||||
process_internal_exception("init module")
|
||||
return False
|
||||
|
||||
|
||||
def main(extract_f: TExtractFunc, default_schemas_path: str = None, schema_names: List[str] = None) -> None:
|
||||
# initialize runner
|
||||
C = configuration()
|
||||
parser = create_default_args(C)
|
||||
args = parser.parse_args()
|
||||
initialize_runner(C, TRunArgs(args.single_run, args.wait_runs))
|
||||
if not configure(C, REGISTRY, extract_f, default_schemas_path, schema_names):
|
||||
exit(-1)
|
||||
# run
|
||||
exit(pool_runner(C, run))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(extract)
|
||||
@@ -7,9 +7,9 @@
|
||||
|
||||
from typing import Sequence
|
||||
|
||||
from autopoiesis.common.typing import StrAny
|
||||
from autopoiesis.common import json
|
||||
from autopoiesis.common.schema import Schema
|
||||
from dlt.common.typing import StrAny
|
||||
from dlt.common import json
|
||||
from dlt.common.schema import Schema
|
||||
from dlt.pipeline import Pipeline
|
||||
|
||||
# the load schema will be named {pipeline_name}_{source_name}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from autopoiesis.common import json
|
||||
from autopoiesis.common.schema import Schema
|
||||
from autopoiesis.common.typing import DictStrAny, StrAny
|
||||
from dlt.common import json
|
||||
from dlt.common.schema import Schema
|
||||
from dlt.common.typing import DictStrAny, StrAny
|
||||
|
||||
from dlt.pipeline import Pipeline, PostgresPipelineCredentials
|
||||
|
||||
@@ -17,7 +17,6 @@ from dlt.pipeline import Pipeline, PostgresPipelineCredentials
|
||||
# credentials = Pipeline.load_gcp_credentials("_secrets/project1234_service.json", "gamma_guild")
|
||||
|
||||
import multiprocessing
|
||||
multiprocessing.set_start_method("spawn", force=True)
|
||||
|
||||
if __name__ == '__main__':
|
||||
# working redshift creds, you can pass password as last parameter or via PG_PASSWORD env variable ie.
|
||||
@@ -70,7 +69,7 @@ if __name__ == '__main__':
|
||||
# from now on each pipeline does more or less the same thing: unpack and load data
|
||||
|
||||
# now create loading packages and infer the schema
|
||||
m = pipeline.unpack(workers=2)
|
||||
m = pipeline.unpack()
|
||||
if m.has_failed:
|
||||
print("Unpacking failed")
|
||||
print(pipeline.last_run_exception)
|
||||
|
||||
@@ -2,10 +2,11 @@ import requests
|
||||
from typing import Iterator, Sequence, cast
|
||||
from web3 import Web3, HTTPProvider
|
||||
|
||||
from autopoiesis.common import Decimal
|
||||
from autopoiesis.common.arithmetics import numeric_default_context, numeric_default_quantize
|
||||
from autopoiesis.common.schema import Schema
|
||||
from autopoiesis.common.typing import DictStrAny, StrAny
|
||||
from dlt.common import json
|
||||
from dlt.common import Decimal
|
||||
from dlt.common.arithmetics import numeric_default_context, numeric_default_quantize
|
||||
from dlt.common.schema import Schema
|
||||
from dlt.common.typing import DictStrAny, StrAny
|
||||
|
||||
from dlt.pipeline import Pipeline, TExtractorItemWithTable, TExtractorItem
|
||||
|
||||
@@ -117,6 +118,10 @@ schema: Schema = None
|
||||
# in case of ethereum data the fundamental problem is 2^256 integer size which does not fit in any BIGINT
|
||||
# type. that is fixed in schema loaded below
|
||||
schema = Pipeline.load_schema_from_file("examples/schemas/ethereum_schema.yml")
|
||||
# jschema = schema.to_dict()
|
||||
# with open("examples/schemas/ethereum_schema.json", "w") as f:
|
||||
# json.dump(jschema, f)
|
||||
# exit(-1)
|
||||
pipeline.create_pipeline(credentials, schema=schema)
|
||||
print(pipeline.root_path)
|
||||
|
||||
@@ -124,13 +129,12 @@ m = pipeline.extract_generator(block_generator)
|
||||
if m.has_failed:
|
||||
print("Extracting failed")
|
||||
print(pipeline.last_run_exception)
|
||||
exit(0)
|
||||
exit(0)
|
||||
|
||||
m = pipeline.unpack()
|
||||
if m.has_failed:
|
||||
print("Unpacking failed")
|
||||
print(pipeline.last_run_exception)
|
||||
exit(0)
|
||||
|
||||
# get inferred schema
|
||||
schema = pipeline.get_current_schema()
|
||||
|
||||
@@ -5,8 +5,8 @@ import io
from typing import Any, Iterator
import csv

from autopoiesis.common.typing import StrAny
from autopoiesis.common.schema import Schema
from dlt.common.typing import StrAny
from dlt.common.schema import Schema
from dlt.pipeline import Pipeline

SCOPES = ['https://www.googleapis.com/auth/drive']
@@ -15,19 +15,19 @@ SCOPES = ['https://www.googleapis.com/auth/drive']
KEY_FILE_LOCATION = '_secrets/project1234_service.json'


def _initialize_drive() -> Any:
    """Initializes an drive service object.
# def _initialize_drive() -> Any:
#     """Initializes an drive service object.

    Returns:
        An authorized drive service object.
    """
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        KEY_FILE_LOCATION, SCOPES)
#     Returns:
#         An authorized drive service object.
#     """
#     credentials = ServiceAccountCredentials.from_json_keyfile_name(
#         KEY_FILE_LOCATION, SCOPES)

    # Build the service object.
    service = build('drive', 'v3', credentials=credentials)
#     # Build the service object.
#     service = build('drive', 'v3', credentials=credentials)

    return service
#     return service


def _initialize_sheets() -> Any:
@@ -41,20 +41,20 @@ def _initialize_sheets() -> Any:
    return service


def download_csv_as_json(file_id: str, csv_options: StrAny = None) -> Iterator[StrAny]:
    if csv_options is None:
        csv_options = {}
# def download_csv_as_json(file_id: str, csv_options: StrAny = None) -> Iterator[StrAny]:
#     if csv_options is None:
#         csv_options = {}

    drive_service = _initialize_drive()
    request = drive_service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while done is False:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))
    rows = fh.getvalue().decode("utf-8")
    return csv.DictReader(io.StringIO(rows), **csv_options)
#     drive_service = _initialize_drive()
#     request = drive_service.files().get_media(fileId=file_id)
#     fh = io.BytesIO()
#     downloader = MediaIoBaseDownload(fh, request)
#     done = False
#     while done is False:
#         status, done = downloader.next_chunk()
#         print("Download %d%%." % int(status.progress() * 100))
#     rows = fh.getvalue().decode("utf-8")
#     return csv.DictReader(io.StringIO(rows), **csv_options)


def download_sheet_to_csv(spreadsheet_id: str, sheet_name: str) -> Iterator[StrAny]:

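The commented-out `download_csv_as_json` helper streams a Drive file into memory and hands the decoded bytes to `csv.DictReader`. For reference, a self-contained sketch of the same pattern, assuming the standard google-api-python-client objects the example builds in `_initialize_drive` and a valid `file_id` (this mirrors the removed code, it is not part of the commit):

import csv
import io
from typing import Any, Dict, Iterator

from googleapiclient.http import MediaIoBaseDownload  # same dependency the example uses


def drive_csv_rows(drive_service: Any, file_id: str) -> Iterator[Dict[str, str]]:
    # download the whole file into an in-memory buffer, chunk by chunk
    request = drive_service.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    # decode and parse the CSV header and rows into dicts
    return csv.DictReader(io.StringIO(buffer.getvalue().decode("utf-8")))
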
911
examples/schemas/ethereum_schema.json
Normal file
@@ -0,0 +1,911 @@
|
||||
{
|
||||
"tables": {
|
||||
"_loads": {
|
||||
"inserted_at": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "timestamp",
|
||||
"name": "inserted_at",
|
||||
"nullable": false
|
||||
},
|
||||
"load_id": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "load_id",
|
||||
"nullable": false
|
||||
},
|
||||
"status": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "status",
|
||||
"nullable": false
|
||||
}
|
||||
},
|
||||
"_version": {
|
||||
"engine_version": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "engine_version",
|
||||
"nullable": false
|
||||
},
|
||||
"inserted_at": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "timestamp",
|
||||
"name": "inserted_at",
|
||||
"nullable": false
|
||||
},
|
||||
"version": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "version",
|
||||
"nullable": false
|
||||
}
|
||||
},
|
||||
"blocks": {
|
||||
"_load_id": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "_load_id",
|
||||
"nullable": false
|
||||
},
|
||||
"_record_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": true,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "_record_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"number": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": true,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "number",
|
||||
"nullable": false
|
||||
},
|
||||
"parent_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "parent_hash",
|
||||
"nullable": true
|
||||
},
|
||||
"hash": {
|
||||
"partition": false,
|
||||
"cluster": true,
|
||||
"unique": true,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "hash",
|
||||
"nullable": false
|
||||
},
|
||||
"base_fee_per_gas": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "wei",
|
||||
"name": "base_fee_per_gas",
|
||||
"nullable": false
|
||||
},
|
||||
"difficulty": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "wei",
|
||||
"name": "difficulty",
|
||||
"nullable": false
|
||||
},
|
||||
"extra_data": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "extra_data",
|
||||
"nullable": true
|
||||
},
|
||||
"gas_limit": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "gas_limit",
|
||||
"nullable": false
|
||||
},
|
||||
"gas_used": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "gas_used",
|
||||
"nullable": false
|
||||
},
|
||||
"logs_bloom": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "logs_bloom",
|
||||
"nullable": true
|
||||
},
|
||||
"miner": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "miner",
|
||||
"nullable": true
|
||||
},
|
||||
"mix_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "mix_hash",
|
||||
"nullable": true
|
||||
},
|
||||
"nonce": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "nonce",
|
||||
"nullable": true
|
||||
},
|
||||
"receipts_root": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "receipts_root",
|
||||
"nullable": true
|
||||
},
|
||||
"sha3_uncles": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "sha3_uncles",
|
||||
"nullable": true
|
||||
},
|
||||
"size": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "size",
|
||||
"nullable": true
|
||||
},
|
||||
"state_root": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "state_root",
|
||||
"nullable": false
|
||||
},
|
||||
"timestamp": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": true,
|
||||
"sort": true,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "timestamp",
|
||||
"name": "timestamp",
|
||||
"nullable": false
|
||||
},
|
||||
"total_difficulty": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "wei",
|
||||
"name": "total_difficulty",
|
||||
"nullable": true
|
||||
},
|
||||
"transactions_root": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "transactions_root",
|
||||
"nullable": false
|
||||
}
|
||||
},
|
||||
"blocks__transactions": {
|
||||
"_record_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": true,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "_record_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"block_number": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": true,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "block_number",
|
||||
"nullable": false
|
||||
},
|
||||
"transaction_index": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": true,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "transaction_index",
|
||||
"nullable": false
|
||||
},
|
||||
"hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": true,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "hash",
|
||||
"nullable": false
|
||||
},
|
||||
"block_hash": {
|
||||
"partition": false,
|
||||
"cluster": true,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "block_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"block_timestamp": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": true,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "timestamp",
|
||||
"name": "block_timestamp",
|
||||
"nullable": false
|
||||
},
|
||||
"chain_id": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "chain_id",
|
||||
"nullable": true
|
||||
},
|
||||
"from": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "from",
|
||||
"nullable": true
|
||||
},
|
||||
"gas": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "gas",
|
||||
"nullable": true
|
||||
},
|
||||
"gas_price": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "gas_price",
|
||||
"nullable": true
|
||||
},
|
||||
"input": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "input",
|
||||
"nullable": true
|
||||
},
|
||||
"max_fee_per_gas": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "wei",
|
||||
"name": "max_fee_per_gas",
|
||||
"nullable": true
|
||||
},
|
||||
"max_priority_fee_per_gas": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "wei",
|
||||
"name": "max_priority_fee_per_gas",
|
||||
"nullable": true
|
||||
},
|
||||
"nonce": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "nonce",
|
||||
"nullable": true
|
||||
},
|
||||
"r": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "r",
|
||||
"nullable": true
|
||||
},
|
||||
"s": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "s",
|
||||
"nullable": true
|
||||
},
|
||||
"status": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "status",
|
||||
"nullable": true
|
||||
},
|
||||
"to": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "to",
|
||||
"nullable": true
|
||||
},
|
||||
"type": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "type",
|
||||
"nullable": true
|
||||
},
|
||||
"v": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "v",
|
||||
"nullable": true
|
||||
},
|
||||
"value": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "wei",
|
||||
"name": "value",
|
||||
"nullable": false
|
||||
},
|
||||
"eth_value": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "decimal",
|
||||
"name": "eth_value",
|
||||
"nullable": true
|
||||
}
|
||||
},
|
||||
"blocks__transactions__logs": {
|
||||
"_record_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": true,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "_record_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"address": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "address",
|
||||
"nullable": false
|
||||
},
|
||||
"block_timestamp": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": true,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "timestamp",
|
||||
"name": "block_timestamp",
|
||||
"nullable": false
|
||||
},
|
||||
"block_hash": {
|
||||
"partition": false,
|
||||
"cluster": true,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "block_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"block_number": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": true,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "block_number",
|
||||
"nullable": false
|
||||
},
|
||||
"transaction_index": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": true,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "transaction_index",
|
||||
"nullable": false
|
||||
},
|
||||
"log_index": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": true,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "log_index",
|
||||
"nullable": false
|
||||
},
|
||||
"data": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "data",
|
||||
"nullable": true
|
||||
},
|
||||
"removed": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bool",
|
||||
"name": "removed",
|
||||
"nullable": true
|
||||
},
|
||||
"transaction_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "transaction_hash",
|
||||
"nullable": false
|
||||
}
|
||||
},
|
||||
"blocks__transactions__logs__topics": {
|
||||
"_parent_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": true,
|
||||
"data_type": "text",
|
||||
"name": "_parent_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"_pos": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "_pos",
|
||||
"nullable": false
|
||||
},
|
||||
"_record_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": true,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "_record_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"_root_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "_root_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"value": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "value",
|
||||
"nullable": true
|
||||
}
|
||||
},
|
||||
"blocks__transactions__access_list": {
|
||||
"_parent_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": true,
|
||||
"data_type": "text",
|
||||
"name": "_parent_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"_pos": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "_pos",
|
||||
"nullable": false
|
||||
},
|
||||
"_record_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": true,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "_record_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"_root_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "_root_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"address": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "address",
|
||||
"nullable": true
|
||||
}
|
||||
},
|
||||
"blocks__transactions__access_list__storage_keys": {
|
||||
"_parent_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": true,
|
||||
"data_type": "text",
|
||||
"name": "_parent_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"_pos": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "_pos",
|
||||
"nullable": false
|
||||
},
|
||||
"_record_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": true,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "_record_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"_root_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "_root_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"value": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "value",
|
||||
"nullable": true
|
||||
}
|
||||
},
|
||||
"blocks__uncles": {
|
||||
"_parent_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": true,
|
||||
"data_type": "text",
|
||||
"name": "_parent_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"_pos": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "bigint",
|
||||
"name": "_pos",
|
||||
"nullable": false
|
||||
},
|
||||
"_record_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": true,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "_record_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"_root_hash": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "_root_hash",
|
||||
"nullable": false
|
||||
},
|
||||
"value": {
|
||||
"partition": false,
|
||||
"cluster": false,
|
||||
"unique": false,
|
||||
"sort": false,
|
||||
"primary_key": false,
|
||||
"foreign_key": false,
|
||||
"data_type": "text",
|
||||
"name": "value",
|
||||
"nullable": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"name": "ethereum",
|
||||
"version": 8,
|
||||
"preferred_types": {},
|
||||
"hints": {
|
||||
"foreign_key": [
|
||||
"^_parent_hash$"
|
||||
],
|
||||
"not_null": [
|
||||
"^_record_hash$",
|
||||
"^_root_hash$",
|
||||
"^_parent_hash$",
|
||||
"^_pos$"
|
||||
],
|
||||
"unique": [
|
||||
"^_record_hash$"
|
||||
]
|
||||
},
|
||||
"excludes": [],
|
||||
"includes": [],
|
||||
"engine_version": 2
|
||||
}
|
||||
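Every column entry in the schema above carries the same set of attributes (partition, cluster, unique, sort, primary_key, foreign_key, data_type, nullable); the `wei` data type is what lets fields such as `base_fee_per_gas` hold 2^256-sized values that do not fit a BIGINT. A small, hypothetical sketch of inspecting the file with nothing but the standard library (not dlt's own schema loader):

import json

with open("examples/schemas/ethereum_schema.json") as f:
    schema = json.load(f)

blocks = schema["tables"]["blocks"]
print(blocks["base_fee_per_gas"]["data_type"])                   # -> "wei"
print([name for name, col in blocks.items() if col["unique"]])   # columns flagged unique: _record_hash, hash, timestamp
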
@@ -0,0 +1,936 @@
|
||||
tables:
|
||||
_version:
|
||||
version:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
engine_version:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
inserted_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: timestamp
|
||||
nullable: false
|
||||
_loads:
|
||||
load_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
status:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
inserted_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: timestamp
|
||||
nullable: false
|
||||
model_annotations:
|
||||
sender_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
message_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: true
|
||||
annotation:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
confidence:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: double
|
||||
nullable: true
|
||||
count:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: true
|
||||
added_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
reviewed:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bool
|
||||
nullable: true
|
||||
_load_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
_record_hash:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: true
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
name: csv
|
||||
version: 2
|
||||
preferred_types: {}
|
||||
hints:
|
||||
not_null:
|
||||
- ^_record_hash$
|
||||
- ^_root_hash$
|
||||
- ^_parent_hash$
|
||||
- ^_pos$
|
||||
- _load_id
|
||||
foreign_key:
|
||||
- ^_parent_hash$
|
||||
unique:
|
||||
- ^_record_hash$
|
||||
excludes: []
|
||||
includes: []
|
||||
engine_version: 2
|
||||
tables:
|
||||
_version:
|
||||
version:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
engine_version:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
inserted_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: timestamp
|
||||
nullable: false
|
||||
_loads:
|
||||
load_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
status:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
inserted_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: timestamp
|
||||
nullable: false
|
||||
model_annotations:
|
||||
sender_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
message_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: true
|
||||
annotation:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
confidence:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: double
|
||||
nullable: true
|
||||
count:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: true
|
||||
added_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
reviewed:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bool
|
||||
nullable: true
|
||||
_load_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
_record_hash:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: true
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
name: csv
|
||||
version: 2
|
||||
preferred_types: {}
|
||||
hints:
|
||||
not_null:
|
||||
- ^_record_hash$
|
||||
- ^_root_hash$
|
||||
- ^_parent_hash$
|
||||
- ^_pos$
|
||||
- _load_id
|
||||
foreign_key:
|
||||
- ^_parent_hash$
|
||||
unique:
|
||||
- ^_record_hash$
|
||||
excludes: []
|
||||
includes: []
|
||||
engine_version: 2
|
||||
tables:
|
||||
_version:
|
||||
version:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
engine_version:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
inserted_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: timestamp
|
||||
nullable: false
|
||||
_loads:
|
||||
load_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
status:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
inserted_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: timestamp
|
||||
nullable: false
|
||||
model_annotations:
|
||||
sender_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
message_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: true
|
||||
annotation:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
confidence:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: double
|
||||
nullable: true
|
||||
count:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: true
|
||||
added_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
reviewed:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bool
|
||||
nullable: true
|
||||
_load_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
_record_hash:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: true
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
name: csv
|
||||
version: 2
|
||||
preferred_types: {}
|
||||
hints:
|
||||
not_null:
|
||||
- ^_record_hash$
|
||||
- ^_root_hash$
|
||||
- ^_parent_hash$
|
||||
- ^_pos$
|
||||
- _load_id
|
||||
foreign_key:
|
||||
- ^_parent_hash$
|
||||
unique:
|
||||
- ^_record_hash$
|
||||
excludes: []
|
||||
includes: []
|
||||
engine_version: 2
|
||||
tables:
|
||||
_version:
|
||||
version:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
engine_version:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
inserted_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: timestamp
|
||||
nullable: false
|
||||
_loads:
|
||||
load_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
status:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
inserted_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: timestamp
|
||||
nullable: false
|
||||
model_annotations:
|
||||
sender_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
message_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: true
|
||||
annotation:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
confidence:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: double
|
||||
nullable: true
|
||||
count:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: true
|
||||
added_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
reviewed:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bool
|
||||
nullable: true
|
||||
_load_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
_record_hash:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: true
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
name: csv
|
||||
version: 2
|
||||
preferred_types: {}
|
||||
hints:
|
||||
not_null:
|
||||
- ^_record_hash$
|
||||
- ^_root_hash$
|
||||
- ^_parent_hash$
|
||||
- ^_pos$
|
||||
- _load_id
|
||||
foreign_key:
|
||||
- ^_parent_hash$
|
||||
unique:
|
||||
- ^_record_hash$
|
||||
excludes: []
|
||||
includes: []
|
||||
engine_version: 2
|
||||
tables:
|
||||
_version:
|
||||
version:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
engine_version:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
inserted_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: timestamp
|
||||
nullable: false
|
||||
_loads:
|
||||
load_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
status:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
inserted_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: timestamp
|
||||
nullable: false
|
||||
model_annotations:
|
||||
sender_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
message_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: true
|
||||
annotation:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
confidence:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: double
|
||||
nullable: true
|
||||
count:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: true
|
||||
added_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
reviewed:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bool
|
||||
nullable: true
|
||||
_load_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
_record_hash:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: true
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
name: csv
|
||||
version: 2
|
||||
preferred_types: {}
|
||||
hints:
|
||||
not_null:
|
||||
- ^_record_hash$
|
||||
- ^_root_hash$
|
||||
- ^_parent_hash$
|
||||
- ^_pos$
|
||||
- _load_id
|
||||
foreign_key:
|
||||
- ^_parent_hash$
|
||||
unique:
|
||||
- ^_record_hash$
|
||||
excludes: []
|
||||
includes: []
|
||||
engine_version: 2
|
||||
tables:
|
||||
_version:
|
||||
version:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
engine_version:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
inserted_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: timestamp
|
||||
nullable: false
|
||||
_loads:
|
||||
load_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
status:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: false
|
||||
inserted_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: timestamp
|
||||
nullable: false
|
||||
model_annotations:
|
||||
sender_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
message_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: true
|
||||
annotation:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
confidence:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: double
|
||||
nullable: true
|
||||
count:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bigint
|
||||
nullable: true
|
||||
added_at:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: true
|
||||
reviewed:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: bool
|
||||
nullable: true
|
||||
_load_id:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: false
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
_record_hash:
|
||||
partition: false
|
||||
cluster: false
|
||||
unique: true
|
||||
sort: false
|
||||
primary_key: false
|
||||
foreign_key: false
|
||||
data_type: text
|
||||
nullable: false
|
||||
name: csv
|
||||
version: 2
|
||||
preferred_types: {}
|
||||
hints:
|
||||
not_null:
|
||||
- ^_record_hash$
|
||||
- ^_root_hash$
|
||||
- ^_parent_hash$
|
||||
- ^_pos$
|
||||
- _load_id
|
||||
foreign_key:
|
||||
- ^_parent_hash$
|
||||
unique:
|
||||
- ^_record_hash$
|
||||
excludes: []
|
||||
includes: []
|
||||
engine_version: 2
|
||||
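The YAML above repeats the same per-column attribute layout for a schema named `csv`. For inspection outside of dlt, such a file can be read with PyYAML (already listed in the dependencies below); a minimal sketch, assuming a single-document schema file of the shape shown, with the file path left as an assumption since this hunk's file header is not visible here:

import yaml  # PyYAML

with open("schema.yml") as f:                              # path is an assumption
    schema = yaml.safe_load(f)

print(schema["name"], schema["engine_version"])            # -> csv 2
print(schema["tables"]["_loads"]["status"]["data_type"])   # -> bigint
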
1110
poetry.lock
generated
File diff suppressed because it is too large
@@ -1,27 +1,61 @@
[tool.poetry]
name = "python-dlt"
version = "0.0.1"
version = "0.1.0.dev0"
description = "DLT is an open-source python-native scalable data loading framework that does not require any devops efforts to run."
authors = ["Marcin Rudolf <rudolfix@rudolfix.org>"]
license = "MIT"
authors = ["ScaleVector <services@scalevector.ai>"]
maintainers = [ "Marcin Rudolf <marcin@scalevector.ai>", "Adrian Brudaru <adrian@scalevector.ai>",]
readme = "README.md"
license = "Apache-2.0"
homepage = "https://github.com/scale-vector"
repository = "https://github.com/scale-vector/dlt"
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: Apache Software License",
    "Topic :: Software Development :: Libraries",
    "Operating System :: MacOS :: MacOS X",
    "Operating System :: POSIX :: Linux",]
keywords = [ "etl" ]
include = [ "LICENSE.txt", "README.md"]
packages = [
    { include = "dlt" },
]

[tool.poetry.dependencies]
python = "^3.8,<3.11"
# autopoiesis = {path = "../rasa_data_ingestion"}
requests = "^2.26.0"
pendulum = "^2.1.2"
simplejson = "^3.17.5"
jsonlines = "^2.0.0"
PyYAML = "^5.4.1"
json-logging = "1.4.1rc0"
prometheus-client = "^0.11.0"
semver = "^2.13.0"
sentry-sdk = "^1.4.3"
hexbytes = "^0.2.2"
cachetools = "^5.2.0"

psycopg2-binary = {version = "^2.9.1", optional = true, extras = ["redshift", "postgres"]}

grpcio = {version = "1.43.0", optional = true, extras = ["gcp"]}
google-cloud-bigquery = {version = "^2.26.0", optional = true, extras = ["gcp"]}

GitPython = {version = "^3.1.26", optional = true, extras = ["dbt"]}
dbt-core = {version = "1.0.6", optional = true, extras = ["dbt"]}
dbt-redshift = {version = "1.0.1", optional = true, extras = ["dbt"]}
dbt-bigquery = {version = "1.0.0", optional = true, extras = ["dbt"]}


[tool.poetry.dev-dependencies]
pytest = "6.2.4"
pytest = "^6.2.4"
mypy = "0.931"
flake8 = "3.9.2"
bandit = "1.7.0"
flake8-bugbear = "21.4.3"
pytest-pythonpath = "0.7.3"
bandit = "^1.7.0"
flake8-bugbear = "^21.4.3"
pytest-pythonpath = "^0.7.3"
pytest-order = "^1.0.0"
pytest-cases = "^3.6.9"
pytest-forked = "^1.3.0"
types-PyYAML = "^6.0.7"
types-cachetools = "^4.2.9"
types-protobuf = "^3.19.8"
@@ -29,6 +63,12 @@ types-simplejson = "^3.17.0"
types-requests = "^2.25.6"
types-python-dateutil = "^2.8.15"

[tool.poetry.extras]
dbt = ["dbt-core", "GitPython", "dbt-redshift", "dbt-bigquery"]
gcp = ["grpcio", "google-cloud-bigquery"]
postgres = ["psycopg2-binary"]
redshift = ["psycopg2-binary"]

[build-system]
requires = ["poetry-core>=1.0.8"]
build-backend = "poetry.core.masonry.api"

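With the extras declared above, the optional backends are pulled in on demand using the usual Poetry and pip extras syntax, for example `poetry install -E gcp -E redshift` inside the repository or, once the package is published, `pip install python-dlt[gcp]`; these commands follow standard tooling conventions rather than anything defined by this diff.
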
7
pytest.ini
Normal file
@@ -0,0 +1,7 @@
[pytest]
python_paths= autopoiesis
norecursedirs= .direnv .eggs build dist
addopts= -v --showlocals --durations 10
xfail_strict= true
log_cli= 1
log_cli_level= INFO
18
tests/.example.env
Normal file
@@ -0,0 +1,18 @@

# copy to .env and run with (set -a && . tests/.env && pytest tests)
# for tests that do not involve any secrets you may run (set -a && . tests/.example.env && pytest tests)


PROJECT_ID=chat-analytics-317513
DATASET=carbon_bot_3
BQ_CRED_PRIVATE_KEY="-----BEGIN PRIVATE KEY-----
paste key here
-----END PRIVATE KEY-----
"
BQ_CRED_CLIENT_EMAIL=loader@chat-analytics-317513.iam.gserviceaccount.com

PG_DATABASE_NAME=chat_analytics_rasa
PG_SCHEMA_PREFIX=carbon_bot_3
PG_USER=loader
PG_HOST=3.73.90.3
PG_PASSWORD=set-me-up
0
tests/__init__.py
Normal file
0
tests/common/__init__.py
Normal file
32
tests/common/cases/mod_bot_case.json
Normal file
@@ -0,0 +1,32 @@
{
    "event": "bot",
    "timestamp": 1624001210.7276764,
    "metadata": {
        "rasa_x_flagged": false,
        "rasa_x_id": 60304
    },
    "text": "Hello! Just a heads up - this bot is part of a research project and we intend to make the conversations publicly available to researchers. So please don't share any personal information! [Privacy Policy](https://rasa.com/carbon-bot-privacy-policy/)",
    "data": {
        "elements": null,
        "quick_replies": null,
        "buttons": null,
        "attachment": null,
        "image": null,
        "custom": null
    },
    "data__custom": "remains",
    "data__custom__goes": "goes",
    "custom_data": {
        "excluded_path": {
            "prop1": "str1"
        },
        "included_object": {
            "included_exception": "exception",
            "eliminated": true
        }
    },
    "is_flagged": false,
    "sender_id": "411b44bdfcc545f282fb4aa15282b73f",
    "model_id": "__unknown",
    "environment": "__unknown"
}
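The `data__custom`-style keys in this test case mirror the double-underscore path convention used throughout the schemas above (for example `blocks__transactions__logs`): nested objects are flattened into `parent__child` names or split out into child tables. Purely as an illustration of that naming convention (this is not dlt's implementation), a hypothetical flattening helper:

from typing import Any, Dict


def flatten(row: Dict[str, Any], prefix: str = "") -> Dict[str, Any]:
    # illustrative only: joins nested keys with "__", the separator seen in the schemas above
    flat: Dict[str, Any] = {}
    for key, value in row.items():
        name = f"{prefix}__{key}" if prefix else key
        if isinstance(value, dict):
            flat.update(flatten(value, name))
        else:
            flat[name] = value
    return flat


# flatten({"data": {"custom": None}})  ->  {"data__custom": None}
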
BIN
tests/common/cases/schemas/ev1/event_schema.7z
Normal file
Binary file not shown.
3265
tests/common/cases/schemas/ev1/event_schema.json
Normal file
File diff suppressed because it is too large
63
tests/common/cases/schemas/ev1/model_schema.json
Normal file
@@ -0,0 +1,63 @@
|
||||
{
|
||||
"tables": {
|
||||
"_version": {
|
||||
"version": {
|
||||
"name": "version",
|
||||
"data_type": "bigint",
|
||||
"nullable": false
|
||||
},
|
||||
"engine_version": {
|
||||
"name": "engine_version",
|
||||
"data_type": "bigint",
|
||||
"nullable": false
|
||||
},
|
||||
"inserted_at": {
|
||||
"name": "inserted_at",
|
||||
"data_type": "timestamp",
|
||||
"nullable": false
|
||||
}
|
||||
},
|
||||
"_loads": {
|
||||
"load_id": {
|
||||
"name": "load_id",
|
||||
"data_type": "text",
|
||||
"nullable": false
|
||||
},
|
||||
"status": {
|
||||
"name": "status",
|
||||
"data_type": "bigint",
|
||||
"nullable": false
|
||||
},
|
||||
"inserted_at": {
|
||||
"name": "inserted_at",
|
||||
"data_type": "timestamp",
|
||||
"nullable": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"name": "model",
|
||||
"version": 1,
|
||||
"preferred_types": {
|
||||
"^timestamp$": "timestamp",
|
||||
"trained_at$": "timestamp",
|
||||
"^inserted_at$": "timestamp",
|
||||
"^_pos$": "bigint"
|
||||
},
|
||||
"hints": {
|
||||
"not_null": [
|
||||
"^timestamp$",
|
||||
"^_record_hash$",
|
||||
"^_root_hash$",
|
||||
"^_load_id$",
|
||||
"^_parent_hash$",
|
||||
"^_pos$"
|
||||
],
|
||||
"primary_key": [
|
||||
"^_record_hash$"
|
||||
],
|
||||
"foreign_key": [
|
||||
"^_parent_hash$"
|
||||
]
|
||||
},
|
||||
"engine_version": 1
|
||||
}
|
||||
59
tests/common/cases/schemas/rasa/event_schema.json
Normal file
@@ -0,0 +1,59 @@
|
||||
{
|
||||
"tables": {
|
||||
"_version": {
|
||||
"version": {
|
||||
"name": "version",
|
||||
"data_type": "bigint",
|
||||
"nullable": false
|
||||
},
|
||||
"engine_version": {
|
||||
"name": "engine_version",
|
||||
"data_type": "bigint",
|
||||
"nullable": false
|
||||
},
|
||||
"inserted_at": {
|
||||
"name": "inserted_at",
|
||||
"data_type": "timestamp",
|
||||
"nullable": false
|
||||
}
|
||||
},
|
||||
"_loads": {
|
||||
"load_id": {
|
||||
"name": "load_id",
|
||||
"data_type": "text",
|
||||
"nullable": false
|
||||
},
|
||||
"status": {
|
||||
"name": "status",
|
||||
"data_type": "bigint",
|
||||
"nullable": false
|
||||
},
|
||||
"inserted_at": {
|
||||
"name": "inserted_at",
|
||||
"data_type": "timestamp",
|
||||
"nullable": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"version": 1,
|
||||
"engine_version": 2,
|
||||
"name": "event",
|
||||
"preferred_types": {
|
||||
"^timestamp$": "timestamp",
|
||||
"^_timestamp$": "timestamp",
|
||||
"^inserted_at$": "timestamp",
|
||||
"confidence": "double",
|
||||
"^_pos$": "bigint"
|
||||
},
|
||||
"hints": {
|
||||
"not_null": ["^timestamp$", "^_timestamp$", "^_dist_key$", "^_record_hash$", "^_root_hash$", "^_load_id$", "^_parent_hash$", "^_pos$", "^sender_id$"],
|
||||
"partition": ["^_timestamp$", "^timestamp$"],
|
||||
"cluster": ["^_dist_key$", "^sender_id$"],
|
||||
"primary_key": [],
|
||||
"foreign_key": ["^_parent_hash$"],
|
||||
"sort": ["^timestamp$", "^_timestamp$"],
|
||||
"unique": ["^_record_hash$"]
|
||||
},
|
||||
"excludes": ["^event_user__parse_data", "^event_bot__data", "^event_bot__metadata"],
|
||||
"includes": ["^event_user__parse_data__(intent|entities|message_id$|text$)", "^event_bot__metadata__(utter_action|template_name|rasa_x_[a-z]+)$"]
|
||||
}
|
||||
54
tests/common/cases/schemas/rasa/model_schema.json
Normal file
@@ -0,0 +1,54 @@
|
||||
{
|
||||
"tables": {
|
||||
"_version": {
|
||||
"version": {
|
||||
"name": "version",
|
||||
"data_type": "bigint",
|
||||
"nullable": false
|
||||
},
|
||||
"engine_version": {
|
||||
"name": "engine_version",
|
||||
"data_type": "bigint",
|
||||
"nullable": false
|
||||
},
|
||||
"inserted_at": {
|
||||
"name": "inserted_at",
|
||||
"data_type": "timestamp",
|
||||
"nullable": false
|
||||
}
|
||||
},
|
||||
"_loads": {
|
||||
"load_id": {
|
||||
"name": "load_id",
|
||||
"data_type": "text",
|
||||
"nullable": false
|
||||
},
|
||||
"status": {
|
||||
"name": "status",
|
||||
"data_type": "bigint",
|
||||
"nullable": false
|
||||
},
|
||||
"inserted_at": {
|
||||
"name": "inserted_at",
|
||||
"data_type": "timestamp",
|
||||
"nullable": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"version": 1,
|
||||
"engine_version": 2,
|
||||
"name": "model",
|
||||
"preferred_types": {
|
||||
"^timestamp$": "timestamp",
|
||||
"trained_at$": "timestamp",
|
||||
"^inserted_at$": "timestamp",
|
||||
"^_pos$": "bigint"
|
||||
},
|
||||
"hints": {
|
||||
"not_null": ["^timestamp$", "^_record_hash$", "^_root_hash$", "^_load_id$", "^_parent_hash$", "^_pos$"],
|
||||
"unique": ["^_record_hash$"],
|
||||
"foreign_key": ["^_parent_hash$"]
|
||||
},
|
||||
"excludes": [],
|
||||
"includes": []
|
||||
}
|
||||
1
tests/common/cases/secret-kube/secret-kube
Normal file
@@ -0,0 +1 @@
kube
1
tests/common/cases/secret-value
Normal file
@@ -0,0 +1 @@
BANANA
16
tests/common/cases/simple_row.json
Normal file
@@ -0,0 +1,16 @@
[
    {
        "f_int": 7817289712,
        "f_float": 92898e37,
        "f_timestamp": "2021-10-13T13:49:32.901899+00:00",
        "f_bool": true,
        "f_bool_2": false,
        "f_str": "some string"
    },
    {
        "f_int": 7817289713,
        "f_float": 878172.8292,
        "f_timestamp": "2021-10-13T13:49:32.901899+00:00",
        "f_bool_2": false
    }
]
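simple_row.json mixes integers, a float in scientific notation, ISO-8601 timestamps, booleans and text, plus a second row with missing fields, which is exactly what a type-inference pass has to cope with. A hypothetical sketch of such inference using only the standard library (not the project's actual coercion logic), mapping onto the data_type names used in the schemas above:

import json
from datetime import datetime


def infer_type(value):
    # illustrative mapping only; bool must be checked before int since bool is a subclass of int
    if isinstance(value, bool):
        return "bool"
    if isinstance(value, int):
        return "bigint"
    if isinstance(value, float):
        return "double"
    if isinstance(value, str):
        try:
            datetime.fromisoformat(value)
            return "timestamp"
        except ValueError:
            return "text"
    return "text"


with open("tests/common/cases/simple_row.json") as f:
    rows = json.load(f)
print({k: infer_type(v) for k, v in rows[0].items()})
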
14
tests/common/cases/weird_rows.json
Normal file
@@ -0,0 +1,14 @@
[
    {
        "idx": 1,
        "str": ", NULL'); DROP SCHEMA Public --"
    },
    {
        "idx": 2,
        "str": "イロハニホヘト チリヌルヲ 'ワカヨタレソ ツネナラム"
    },
    {
        "idx": 3,
        "str": "ऄअआइ'ईउऊऋऌऍऎए"
    }
]
0
tests/common/storages/__init__.py
Normal file
84
tests/common/storages/test_loader_storage.py
Normal file
84
tests/common/storages/test_loader_storage.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import pytest
from typing import Sequence, Tuple

from dlt.common.file_storage import FileStorage
from dlt.common.storages.loader_storage import LoaderStorage
from dlt.common.configuration import LoadingVolumeConfiguration, make_configuration
from dlt.common.storages.exceptions import NoMigrationPathException
from dlt.common.typing import StrAny
from dlt.common.utils import uniq_id

from tests.utils import write_version, autouse_root_storage


@pytest.fixture
def storage() -> LoaderStorage:
    C = make_configuration(LoadingVolumeConfiguration, LoadingVolumeConfiguration)
    s = LoaderStorage(True, C, "jsonl")
    s.initialize_storage()
    return s


def test_archive_completed(storage: LoaderStorage) -> None:
    # should delete archive in full
    storage.delete_completed_jobs = True
    load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}])
    assert storage.storage.has_folder(storage.get_load_path(load_id))
    storage.complete_job(load_id, file_name)
    storage.archive_load(load_id)
    # deleted from loading
    assert not storage.storage.has_folder(storage.get_load_path(load_id))
    # deleted from archive
    assert not storage.storage.has_folder(storage.get_archived_path(load_id))

    # do not delete completed jobs
    storage.delete_completed_jobs = False
    load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}])
    storage.complete_job(load_id, file_name)
    storage.archive_load(load_id)
    # deleted from loading
    assert not storage.storage.has_folder(storage.get_load_path(load_id))
    # has load archived
    assert storage.storage.has_folder(storage.get_archived_path(load_id))


def test_archive_failed(storage: LoaderStorage) -> None:
    # loads with failed jobs are always archived
    storage.delete_completed_jobs = True
    load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}])
    assert storage.storage.has_folder(storage.get_load_path(load_id))
    storage.fail_job(load_id, file_name, "EXCEPTION")
    storage.archive_load(load_id)
    # deleted from loading
    assert not storage.storage.has_folder(storage.get_load_path(load_id))
    # present in archive
    assert storage.storage.has_folder(storage.get_archived_path(load_id))


def test_full_migration_path() -> None:
    # create directory structure
    s = LoaderStorage(True, LoadingVolumeConfiguration, "jsonl")
    # overwrite known initial version
    write_version(s.storage, "1.0.0")
    # must be able to migrate to current version
    s = LoaderStorage(False, LoadingVolumeConfiguration, "jsonl")
    assert s.version == LoaderStorage.STORAGE_VERSION


def test_unknown_migration_path() -> None:
    # create directory structure
    s = LoaderStorage(True, LoadingVolumeConfiguration, "jsonl")
    # overwrite unknown future version
    write_version(s.storage, "10.0.0")
    # there is no migration path from 10.0.0, so construction must fail
    with pytest.raises(NoMigrationPathException):
        LoaderStorage(False, LoadingVolumeConfiguration, "jsonl")


def start_loading_file(s: LoaderStorage, content: Sequence[StrAny]) -> Tuple[str, str]:
    load_id = uniq_id()
    s.create_temp_load_folder(load_id)
    file_name = s.write_temp_loading_file(load_id, "mock_table", None, uniq_id(), content)
    s.commit_temp_load_folder(load_id)
    s.start_job(load_id, file_name)
    return load_id, file_name
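The fixture and the start_loading_file helper above spell out the whole job lifecycle LoaderStorage supports in this commit: stage files in a temp load folder, commit the folder, start each job, complete or fail it, and finally archive the load. Condensed into one straight-line sketch, reusing only calls that appear in the test above and assuming the same module layout as this commit:

from dlt.common.configuration import LoadingVolumeConfiguration, make_configuration
from dlt.common.storages.loader_storage import LoaderStorage
from dlt.common.utils import uniq_id

C = make_configuration(LoadingVolumeConfiguration, LoadingVolumeConfiguration)
storage = LoaderStorage(True, C, "jsonl")  # the owner instance creates (or migrates) the layout
storage.initialize_storage()

# stage rows for one load and promote them to the loading folder
load_id = uniq_id()
storage.create_temp_load_folder(load_id)
file_name = storage.write_temp_loading_file(load_id, "mock_table", None, uniq_id(), [{"content": "a"}])
storage.commit_temp_load_folder(load_id)

# a job goes from started to completed (or failed), then the whole load is archived
storage.start_job(load_id, file_name)
storage.complete_job(load_id, file_name)
storage.archive_load(load_id)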
40
tests/common/storages/test_unpacker_storage.py
Normal file
@@ -0,0 +1,40 @@
import pytest

from dlt.common.file_storage import FileStorage
from dlt.common.storages.exceptions import NoMigrationPathException
from dlt.common.storages.unpacker_storage import UnpackerStorage
from dlt.common.configuration import UnpackingVolumeConfiguration

from tests.utils import TEST_STORAGE, write_version, autouse_root_storage

@pytest.mark.skip()
def test_load_events_and_group_by_sender() -> None:
    # TODO: create fixture with two sender ids and 3 files and check the result
    pass


@pytest.mark.skip()
def test_chunk_by_events() -> None:
    # TODO: should distribute ~ N events evenly among m cores with fallback for small amounts of events
    pass



def test_full_migration_path() -> None:
    # create directory structure
    s = UnpackerStorage(True, UnpackingVolumeConfiguration)
    # overwrite known initial version
    write_version(s.storage, "1.0.0")
    # must be able to migrate to current version
    s = UnpackerStorage(True, UnpackingVolumeConfiguration)
    assert s.version == UnpackerStorage.STORAGE_VERSION


def test_unknown_migration_path() -> None:
    # create directory structure
    s = UnpackerStorage(True, UnpackingVolumeConfiguration)
    # overwrite unknown future version
    write_version(s.storage, "10.0.0")
    # there is no migration path from 10.0.0, so construction must fail
    with pytest.raises(NoMigrationPathException):
        UnpackerStorage(False, UnpackingVolumeConfiguration)
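The skipped test_chunk_by_events above only records an intent in its TODO: distribute roughly N events evenly over m cores, with a fallback when there are only a few events. One possible reading of that requirement, written against the standard library and not taken from dlt itself:

from typing import List, Sequence, TypeVar

T = TypeVar("T")


def chunk_by_events(events: Sequence[T], max_workers: int) -> List[List[T]]:
    # fallback for small inputs: never create more chunks than there are events
    n_chunks = min(max_workers, len(events))
    if n_chunks == 0:
        return []
    # spread the remainder so chunk sizes differ by at most one event
    size, extra = divmod(len(events), n_chunks)
    chunks, start = [], 0
    for i in range(n_chunks):
        end = start + size + (1 if i < extra else 0)
        chunks.append(list(events[start:end]))
        start = end
    return chunks


assert [len(c) for c in chunk_by_events(list(range(10)), 4)] == [3, 3, 2, 2]
assert [len(c) for c in chunk_by_events(list(range(2)), 8)] == [1, 1]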
59
tests/common/storages/test_versioned_storage.py
Normal file
@@ -0,0 +1,59 @@
import pytest
import semver

from dlt.common.file_storage import FileStorage
from dlt.common.storages.exceptions import NoMigrationPathException, WrongStorageVersionException
from dlt.common.storages.versioned_storage import VersionedStorage

from tests.utils import write_version, root_storage


class MigratedStorage(VersionedStorage):
    def migrate_storage(self, from_version: semver.VersionInfo, to_version: semver.VersionInfo) -> None:
        # migration example:
        if from_version == "1.0.0" and from_version < to_version:
            from_version = semver.VersionInfo.parse("1.1.0")
            self._save_version(from_version)
        if from_version == "1.1.0" and from_version < to_version:
            from_version = semver.VersionInfo.parse("1.2.0")
            self._save_version(from_version)


def test_new_versioned_storage(root_storage: FileStorage) -> None:
    v = VersionedStorage("1.0.1", True, root_storage)
    assert v.version == "1.0.1"


def test_new_versioned_storage_non_owner(root_storage: FileStorage) -> None:
    with pytest.raises(WrongStorageVersionException) as wsve:
        VersionedStorage("1.0.1", False, root_storage)
    assert wsve.value.storage_path == root_storage.storage_path
    assert wsve.value.target_version == "1.0.1"
    assert wsve.value.initial_version == "0.0.0"


def test_migration(root_storage: FileStorage) -> None:
    write_version(root_storage, "1.0.0")
    v = MigratedStorage("1.2.0", True, root_storage)
    assert v.version == "1.2.0"


def test_unknown_migration_path(root_storage: FileStorage) -> None:
    write_version(root_storage, "1.0.0")
    with pytest.raises(NoMigrationPathException) as wmpe:
        MigratedStorage("1.3.0", True, root_storage)
    assert wmpe.value.migrated_version == "1.2.0"


def test_only_owner_migrates(root_storage: FileStorage) -> None:
    write_version(root_storage, "1.0.0")
    with pytest.raises(WrongStorageVersionException) as wmpe:
        MigratedStorage("1.2.0", False, root_storage)
    assert wmpe.value.initial_version == "1.0.0"


def test_downgrade_not_possible(root_storage: FileStorage) -> None:
    write_version(root_storage, "1.2.0")
    with pytest.raises(NoMigrationPathException) as wmpe:
        MigratedStorage("1.1.0", True, root_storage)
    assert wmpe.value.migrated_version == "1.2.0"
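Taken together, the tests above pin down the VersionedStorage contract in this commit: only the owner migrates, migrate_storage moves one version at a time and persists every step it reaches, an unknown upgrade path raises NoMigrationPathException, and downgrades are refused. A subclass following that contract might look like the sketch below; the class, its version numbers and the on-disk change it alludes to are hypothetical, and the constructor and helper signatures are assumed to behave exactly as exercised by the tests above.

import semver

from dlt.common.file_storage import FileStorage
from dlt.common.storages.versioned_storage import VersionedStorage


class MyStorage(VersionedStorage):
    # hypothetical storage layout, currently at version 1.1.0
    STORAGE_VERSION = "1.1.0"

    def __init__(self, is_owner: bool, storage: FileStorage) -> None:
        super().__init__(MyStorage.STORAGE_VERSION, is_owner, storage)

    def migrate_storage(self, from_version: semver.VersionInfo, to_version: semver.VersionInfo) -> None:
        # one explicit step per known version; each step persists the version it reached
        if from_version == "1.0.0" and from_version < to_version:
            # ... rewrite files on disk for the 1.0.0 -> 1.1.0 layout change ...
            self._save_version(semver.VersionInfo.parse("1.1.0"))

Constructing MyStorage(False, storage) against a folder still at 1.0.0 would then raise WrongStorageVersionException, mirroring test_only_owner_migrates above.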
Some files were not shown because too many files have changed in this diff