Merge branch 'devel' into chore/checks-hub-extra

This commit is contained in:
Marcin Rudolf
2025-12-16 21:58:49 +01:00
58 changed files with 15545 additions and 1028 deletions

View File

@@ -106,3 +106,9 @@ jobs:
- name: run docs preprocessor
run: cd docs && make preprocess-docs
- name: test preprocess_to_molab
run: cd docs && make test-preprocess-molabs
- name: Ensure marimo notebooks are up-to-date
run: cd docs && make validate-molabs

View File

@@ -29,7 +29,7 @@ class Container:
"""
_INSTANCE: ClassVar["Container"] = None
_INSTANCE: ClassVar[Optional["Container"]] = None
_LOCK: ClassVar[threading.Lock] = threading.Lock()
_MAIN_THREAD_ID: ClassVar[int] = threading.get_ident()
"""A main thread id to which get item will fallback for contexts without default"""

View File

@@ -152,6 +152,23 @@ def escape_snowflake_identifier(v: str) -> str:
return escape_postgres_identifier(v)
def escape_snowflake_literal(v: Any) -> Any:
"""Escape string literals for Snowflake using standard SQL escaping.
Snowflake uses '' to escape single quotes (not backslash escaping).
"""
if isinstance(v, str):
# Snowflake uses standard SQL escaping: ' -> ''
return "'" + v.replace("'", "''") + "'"
if isinstance(v, (datetime, date, time)):
return f"'{v.isoformat()}'"
if isinstance(v, (list, dict)):
return "'" + json.dumps(v).replace("'", "''") + "'"
if isinstance(v, bytes):
return f"X'{v.hex()}'"
return "NULL" if v is None else str(v)
escape_databricks_identifier = escape_hive_identifier
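
For context, a quick sketch (not part of the diff) of what the new literal escaping produces for a few representative Python values; the import path is the one used by the Snowflake client change further below:

```python
from datetime import date
from dlt.common.data_writers.escape import escape_snowflake_literal

print(escape_snowflake_literal("O'Reilly"))        # 'O''Reilly'   (single quote doubled, no backslash escaping)
print(escape_snowflake_literal(date(2024, 1, 1)))  # '2024-01-01'  (ISO format wrapped in quotes)
print(escape_snowflake_literal(b"\x01\xff"))       # X'01ff'       (hex literal for bytes)
print(escape_snowflake_literal(None))              # NULL
```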

View File

@@ -40,7 +40,8 @@ except ModuleNotFoundError:
raise MissingDependencyException(
"dlt pyarrow helpers",
[f"{version.DLT_PKG_NAME}[parquet]"],
"Install pyarrow to be allow to load arrow tables, panda frames and to use parquet files.",
"Install pyarrow to be allowed to load arrow tables, panda frames and to use parquet"
" files.",
)
import ctypes

View File

@@ -1,8 +1,8 @@
import dataclasses
from typing import ClassVar, List, Final, Optional
from typing import ClassVar, List, Final, Optional, Union
from dlt.common.configuration import configspec
from dlt.common.configuration.specs import GcpServiceAccountCredentials
from dlt.common.configuration.specs import GcpServiceAccountCredentials, GcpOAuthCredentials
from dlt.common.utils import digest128
from dlt.common.destination.client import DestinationClientDwhWithStagingConfiguration
@@ -11,7 +11,7 @@ from dlt.common.destination.client import DestinationClientDwhWithStagingConfigu
@configspec
class BigQueryClientConfiguration(DestinationClientDwhWithStagingConfiguration):
destination_type: Final[str] = dataclasses.field(default="bigquery", init=False, repr=False, compare=False) # type: ignore
credentials: GcpServiceAccountCredentials = None
credentials: Union[GcpServiceAccountCredentials, GcpOAuthCredentials] = None
location: str = "US"
project_id: Optional[str] = None
"""Note, that this is BigQuery project_id which could be different from credentials.project_id"""

View File

@@ -1,5 +1,5 @@
from contextlib import contextmanager
from typing import Any, AnyStr, ClassVar, Iterator, List, Optional, Sequence, Generator
from typing import Any, AnyStr, ClassVar, Iterator, List, Optional, Sequence, Generator, Union
import google.cloud.bigquery as bigquery # noqa: I250
from google.api_core import exceptions as api_core_exceptions
@@ -9,7 +9,10 @@ from google.cloud.bigquery.dbapi import Connection as DbApiConnection, Cursor as
from google.cloud.bigquery.dbapi import exceptions as dbapi_exceptions
from dlt.common import logger
from dlt.common.configuration.specs import GcpServiceAccountCredentialsWithoutDefaults
from dlt.common.configuration.specs import (
GcpServiceAccountCredentialsWithoutDefaults,
GcpOAuthCredentials,
)
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.typing import StrAny
from dlt.destinations.exceptions import (
@@ -63,7 +66,7 @@ class BigQuerySqlClient(SqlClientBase[bigquery.Client], DBTransaction):
self,
dataset_name: str,
staging_dataset_name: str,
credentials: GcpServiceAccountCredentialsWithoutDefaults,
credentials: Union[GcpServiceAccountCredentialsWithoutDefaults, GcpOAuthCredentials],
capabilities: DestinationCapabilitiesContext,
location: str = "US",
project_id: Optional[str] = None,
@@ -71,7 +74,9 @@ class BigQuerySqlClient(SqlClientBase[bigquery.Client], DBTransaction):
retry_deadline: float = 60.0,
) -> None:
self._client: bigquery.Client = None
self.credentials: GcpServiceAccountCredentialsWithoutDefaults = credentials
self.credentials: Union[
GcpServiceAccountCredentialsWithoutDefaults, GcpOAuthCredentials
] = credentials
self.location = location
self.project_id = project_id or self.credentials.project_id
self.http_timeout = http_timeout

View File

@@ -1,6 +1,7 @@
from typing import Optional, Sequence, List, Dict
from typing import Optional, Sequence, List, Dict, Literal
from dlt.common import logger
from dlt.common.data_writers.escape import escape_snowflake_literal
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.destination.client import (
FollowupJobRequest,
@@ -27,6 +28,7 @@ from dlt.destinations.sql_jobs import SqlMergeFollowupJob
from dlt.destinations.path_utils import get_file_format_and_compression
SUPPORTED_HINTS: Dict[TColumnHint, str] = {"unique": "UNIQUE"}
COLUMN_COMMENT_HINT: Literal["x-snowflake-column-comment"] = "x-snowflake-column-comment"
class SnowflakeMergeJob(SqlMergeFollowupJob):
@@ -169,6 +171,15 @@ class SnowflakeClient(SqlJobClientWithStagingDataset, SupportsStagingDestination
"ADD COLUMN\n" + ",\n".join(self._get_column_def_sql(c, table) for c in new_columns)
]
def _get_column_def_sql(self, column: TColumnSchema, table: PreparedTableSchema = None) -> str:
column_def_sql = super()._get_column_def_sql(column, table)
if column.get(COLUMN_COMMENT_HINT) or column.get("description"):
comment = column.get(COLUMN_COMMENT_HINT) or column.get("description")
escaped_comment = escape_snowflake_literal(comment)
column_def_sql = f"{column_def_sql} COMMENT {escaped_comment}"
return column_def_sql
def _get_constraints_sql(
self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool
) -> str:
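
To illustrate the new hint (a sketch, not part of the diff): a column-level `description`, or the `x-snowflake-column-comment` hint declared above, should now surface as a `COMMENT` clause in the generated column DDL, roughly as follows:

```python
import dlt

# hypothetical resource; assumes the standard `columns` argument accepts a
# column-level "description" hint, which _get_column_def_sql above turns into a COMMENT clause
@dlt.resource(columns={"amount": {"data_type": "decimal", "description": "order total in USD"}})
def orders():
    yield {"amount": 42}

# expected column definition emitted for Snowflake (approximate):
#   "AMOUNT" NUMBER(38,9) COMMENT 'order total in USD'
```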

View File

@@ -94,7 +94,7 @@ class PipelineTasksGroup(TaskGroup):
buffer_max_items (int, optional): Maximum number of buffered items. Use 0 to keep dlt built-in limit. Defaults to 1000.
retry_policy (_type_, optional): Tenacity retry policy. Defaults to no retry.
retry_pipeline_steps (Sequence[TPipelineStep], optional): Which pipeline steps are eligible for retry. Defaults to ("load", ).
wipe_local_data (bool, optional): Will wipe all the data created by pipeline, also in case of exception. Defaults to False.
wipe_local_data (bool, optional): Will wipe all the data created by pipeline, also in case of exception. Defaults to True.
save_load_info (bool, optional): Will save extensive load info to the destination. Defaults to False.
save_trace_info (bool, optional): Will save trace info to the destination. Defaults to False.
"""

View File

@@ -27,10 +27,9 @@ test-examples: ## Tests the examples in the examples folder
test-snippets: ## Tests the snippets in the snippets folder
cd website/docs && uv run pytest --ignore=node_modules
format: ## Formats the docs tooling, notebooks, and examples
format: ## Formats the docs tooling, website, examples, and notebooks
uv run black docs_tools website examples
uv run black education --ipynb
uv run black education/*/*.ipynb --ipynb
generate-api-ref: ## Generates the API reference documentation from dlt codebase for website
cd docs_tools/api_docs && uv run pydoc-markdown
@@ -43,3 +42,14 @@ preprocess-docs: ## Preprocesses the docs pages, copies docs to docs_processed
preprocess-docs-watch: ## Preprocesses the docs pages, copies docs to docs_processed folder and inserts snippets and tuba links and watches for changes
uv run preprocess-docs --watch
test-preprocess-molabs: ## Tests functions used to build Molabs
uv run pytest docs_tools/education/tests
build-molabs: ## Format the notebook files first and build Molabs
uv run black education/*/*.ipynb --ipynb
uv run python docs_tools/education/preprocess_to_molab.py
uv run black education/*/*.py
uv run marimo check education/*/*.py --fix --quiet
validate-molabs: build-molabs ## Validate marimo notebooks are up-to-date
git diff --quiet --exit-code -- education/

View File

View File

@@ -0,0 +1,290 @@
import json
import re
import shlex
import subprocess
from pathlib import Path
from typing import Dict, Any
EDUCATION_NOTEBOOKS_DIR = Path(__file__).parent.parent.parent / "education"
TEMP_IPYNB_FILE_PREFIX = "tmp"
MUST_INSTALL_PACKAGES = {"numpy", "pandas", "sqlalchemy"}
def replace_colab_imports_in_notebook(notebook_dict: Dict[str, Any]) -> Dict[str, Any]:
"""
Remove Google Colab-specific imports and replace Colab API calls with standard Python.
Google Colab provides special APIs like `google.colab.userdata` for accessing secrets
that don't exist outside the Colab environment. This function:
- Removes: `from google.colab import userdata` (and similar imports)
- Replaces: `userdata.get(...)` → `os.getenv(...)`
Args:
notebook_dict: Notebook as a Python dictionary
Returns:
Modified notebook dictionary
"""
for cell in notebook_dict.get("cells", []):
if cell.get("cell_type") == "code":
source = cell.get("source", [])
if isinstance(source, list):
# Remove lines with Google Colab imports
source = [
line
for line in source
if not re.match(r"^\s*from google\.colab import", line)
]
# Replace userdata.get with os.getenv
source = [
line.replace("userdata.get(", "os.getenv(") for line in source
]
cell["source"] = source
return notebook_dict
def process_shell_commands_in_notebook(
notebook_dict: Dict[str, Any]
) -> tuple[Dict[str, Any], set[str]]:
"""
Convert Jupyter shell commands to Python subprocess calls and extract dependencies.
Jupyter/Colab notebooks support shell commands with `!` syntax (e.g., `!pip install dlt`),
but this is IPython-specific magic syntax that doesn't work in standard Python or Marimo.
This function:
- Extracts package names from `!pip install` commands for dependency tracking
- Converts other `!command` shell commands to `subprocess.run()` calls
- Removes notebook-specific magic commands (e.g., `%%capture`)
Args:
notebook_dict: Notebook as a Python dictionary
Returns:
Tuple of (modified notebook dict, set of package names extracted from pip install commands)
"""
packages: set[str] = set()
subprocess_imported: bool = False
for cell in notebook_dict.get("cells", []):
if cell.get("cell_type") == "code":
cell_code = cell.get("source", [])
new_cell_code = []
for line in cell_code:
stripped = line.strip()
# skip magic commands
if stripped.startswith("%%capture"):
continue
# extract packages from pip install
if stripped.startswith("!pip install"):
match = re.search(r"!pip install\s+(.+?)(?:\n|$)", stripped)
if match:
cleaned = (
match.group(1).strip().replace('"', "").replace("'", "")
)
# Remove spaces around commas in brackets
cleaned = re.sub(r"\[\s*", "[", cleaned) # Remove space after [
cleaned = re.sub(
r"\s*\]", "]", cleaned
) # Remove space before ]
cleaned = re.sub(
r",\s+", ",", cleaned
) # Remove space after commas
pkgs = [
p.strip()
for p in cleaned.split()
if p.strip() and not p.startswith("-")
] # Filter flags
packages.update(pkgs)
continue
# convert other shell commands
elif stripped.startswith("!"):
if not subprocess_imported:
new_cell_code.append("import subprocess\n")
subprocess_imported = True
cmd = stripped[1:]
new_line = _build_subprocess_line(cmd) + "\n"
new_cell_code.append(new_line)
else:
new_cell_code.append(line)
cell["source"] = new_cell_code
return notebook_dict, packages
def add_inline_dependencies_to_content(packages: set[str], py_content: str) -> str:
"""
Add PEP 723 inline script metadata block with dependencies.
Marimo/Molab can automatically install packages when they're declared using PEP 723
inline script metadata. The dependency list includes:
- Packages extracted from !pip install commands in the original notebook
- MUST_INSTALL_PACKAGES (core dependencies required for all notebooks)
Args:
packages: Set of package names to include (will be merged with MUST_INSTALL_PACKAGES)
py_content: The Python file content as a string
Returns:
Python content with PEP 723 metadata block prepended
NOTE: Without this, users would need to manually install packages before running the notebook
(Marimo will try to install missing imports, which does not make for a smooth experience; also,
some libraries used under the hood are not imported directly and are not caught by Marimo).
Format:
# /// script
# dependencies = [
# "package1",
# "package2",
# ]
# ///
"""
packages = packages.copy() # Don't mutate the input set
packages.update(MUST_INSTALL_PACKAGES)
if not packages:
return py_content
pkg_lines = "\n".join(f'# "{pkg}",' for pkg in sorted(packages))
deps_block = f"""# /// script
# dependencies = [
{pkg_lines}
# ]
# ///
"""
return deps_block + py_content
def read_notebook(ipynb_path: Path) -> Dict[str, Any]:
"""
Read a Jupyter notebook file and return as a dictionary.
Args:
ipynb_path: Path to the .ipynb file
Returns:
Notebook data as a Python dictionary
"""
data: Dict[str, Any] = json.loads(ipynb_path.read_text(encoding="utf-8"))
return data
def write_notebook(notebook_dict: Dict[str, Any], output_path: Path) -> None:
"""
Write a notebook dictionary to a file.
Args:
notebook_dict: Notebook data as a Python dictionary
output_path: Path where the notebook should be written
"""
output_path.write_text(
json.dumps(notebook_dict, indent=1, ensure_ascii=False), encoding="utf-8"
)
def convert_notebook_to_marimo(temp_ipynb_path: Path) -> str:
"""
Convert a Jupyter notebook to Marimo Python format using marimo CLI.
Args:
temp_ipynb_path: Path to the temporary preprocessed notebook
Returns:
Marimo Python file content as a string
"""
result = subprocess.run(
["marimo", "convert", str(temp_ipynb_path)],
capture_output=True,
text=True,
check=True,
)
return result.stdout
def write_python_file(content: str, output_path: Path) -> None:
"""
Write Python content to a file.
Args:
content: Python file content as a string
output_path: Path where the file should be written
"""
output_path.write_text(content, encoding="utf-8")
def _build_subprocess_line(cmd: str) -> str:
"""
Generate a subprocess.run() call string from a shell command.
This helper converts various shell command patterns to their Python subprocess
equivalents, handling special cases like piped input.
Conversion rules:
- Simple commands: `command arg` → `subprocess.run(['command', 'arg'], check=True)`
- Yes piping: `yes | command` → `subprocess.run(['command'], input='y\\n', ...)`
- No piping: `no | command` → `subprocess.run(['command'], input='n\\n', ...)`
- Complex pipes: `cmd1 | cmd2` → `subprocess.run('cmd1 | cmd2', shell=True, ...)`
Args:
cmd: The shell command string (without the leading `!`)
Returns:
A string containing Python code for subprocess.run()
"""
cmd = cmd.strip()
# No pipe → simple list argv
if "|" not in cmd:
argv = shlex.split(cmd)
return f"subprocess.run({argv!r}, check=True)"
# Split pipe
left, right = map(str.strip, cmd.split("|", 1))
left_lower = left.lower()
# yes | command → feed "y\n"
if left_lower == "yes":
argv = shlex.split(right)
return f"subprocess.run({argv!r}, input='y\\n', text=True, check=True)"
# no | command → feed "n\n"
if left_lower == "no":
argv = shlex.split(right)
return f"subprocess.run({argv!r}, input='n\\n', text=True, check=True)"
# generic pipe: shell=True fallback
return f"subprocess.run({cmd!r}, shell=True, check=True)"
if __name__ == "__main__":
for ipynb_file in EDUCATION_NOTEBOOKS_DIR.glob("*/*.ipynb"):
# 1. Read notebook file
notebook_dict = read_notebook(ipynb_file)
# 2. Replace Colab imports
notebook_dict = replace_colab_imports_in_notebook(notebook_dict)
# 3. Process shell commands
notebook_dict, packages = process_shell_commands_in_notebook(notebook_dict)
# 4. Write temporary notebook
temp_ipynb_file = ipynb_file.with_name(
f"{TEMP_IPYNB_FILE_PREIFX}_{ipynb_file.name}"
)
write_notebook(notebook_dict, temp_ipynb_file)
# 5. Convert to Marimo format
py_content = convert_notebook_to_marimo(temp_ipynb_file)
# 6. Add inline dependencies
py_content_with_deps = add_inline_dependencies_to_content(packages, py_content)
# 7. Write final Python file
output_path = ipynb_file.with_suffix(".py")
write_python_file(py_content_with_deps, output_path)
# 8. Clean up temporary files
temp_ipynb_file.unlink()

View File

@@ -0,0 +1,109 @@
import pytest
from docs_tools.education.preprocess_to_molab import (
replace_colab_imports_in_notebook,
process_shell_commands_in_notebook,
add_inline_dependencies_to_content,
)
def test_replace_colab_imports() -> None:
"""Ensure that collab specific imports are removed and converted where necessary."""
notebook = {
"cells": [
{
"cell_type": "code",
"source": [
"from google.colab import userdata\n",
"api_key = userdata.get('API_KEY')\n",
"print(api_key)\n",
],
},
]
}
result = replace_colab_imports_in_notebook(notebook)
assert result == {
"cells": [
{
"cell_type": "code",
"source": [
"api_key = os.getenv('API_KEY')\n",
"print(api_key)\n",
],
},
]
}
def test_process_shell_commands_in_notebook() -> None:
"""Ensure that pip install commands are removed, shell commands converted."""
notebook = {
"cells": [
{
"cell_type": "code",
"source": [
"!pip install dlt\n",
"!pip install dlt[bigquery,postgres]\n",
"!pip install requests==2.28.0\n",
"!pip install -q scikit-learn\n",
],
},
{
"cell_type": "code",
"source": [
"!ls -la\n",
"!pwd\n",
"!yes | dlt init source destination\n",
"!no | some_command --flag\n",
"!cat file.txt | grep pattern\n",
"%%capture\n",
"print('hello')\n",
],
},
]
}
result, packages = process_shell_commands_in_notebook(notebook)
assert packages == {
"dlt",
"dlt[bigquery,postgres]",
"requests==2.28.0",
"scikit-learn",
}
assert result == {
"cells": [
{"cell_type": "code", "source": []},
{
"cell_type": "code",
"source": [
"import subprocess\n",
"subprocess.run(['ls', '-la'], check=True)\n",
"subprocess.run(['pwd'], check=True)\n",
"subprocess.run(['dlt', 'init', 'source', 'destination'], input='y\\n', text=True, check=True)\n",
"subprocess.run(['some_command', '--flag'], input='n\\n', text=True, check=True)\n",
"subprocess.run('cat file.txt | grep pattern', shell=True, check=True)\n",
"print('hello')\n",
],
},
]
}
def test_add_inline_dependencies_to_content() -> None:
"""Ensure that PEP 723 metadata block is correctly added and includes MUST_INSTALL_PACKAGES."""
packages = {"requests", "dlt[bigquery,postgres]"}
py_content = "import marimo\n"
result = add_inline_dependencies_to_content(packages, py_content)
expected = """# /// script
# dependencies = [
# "dlt[bigquery,postgres]",
# "numpy",
# "pandas",
# "requests",
# "sqlalchemy",
# ]
# ///
import marimo
"""
print(result)
assert result == expected

docs/education/README.md Normal file
View File

@@ -0,0 +1,31 @@
# Adding New Notebooks
## Overview
The `.py` files in this directory are **auto-generated** from `.ipynb` files. Only edit the `.ipynb` files.
To regenerate `.py` files:
```bash
make build-molabs
```
Preprocessing logic: [`docs/docs_tools/education/`](../docs_tools/education/)
## Things to consider
To ensure compatibility with both **Google Colab** and **Marimo/Molab**:
### 1. **No inline comments**
Bad: `x = 5 # comment`
Good: Separate line comments
**Why:** `marimo convert` scatters inline comments
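
For example (illustrative only):

```python
# Bad - the inline comment gets detached from its line by `marimo convert`
x = 5  # number of retries

# Good - a comment on its own line stays attached
# number of retries
x = 5
```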
## Workflow
1. Create/edit `.ipynb` in the course folder
2. Follow guidelines above
3. Run `make build-molabs` to generate `.py` files
4. Test both versions (Colab and Molab)
5. Commit both `.ipynb` and `.py` files
6. Make changes to the processing logic in `docs/docs_tools/education/` if necessary.

View File

@@ -6,7 +6,7 @@
"id": "TKD-8-XUjqU4"
},
"source": [
"# **Building custom sources with [dlt REST API source](https://dlthub.com/docs/devel/dlt-ecosystem/verified-sources/rest_api/basic) and [RESTClient](https://dlthub.com/docs/devel/general-usage/http/rest-client)** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb)"
"# **Building custom sources with [dlt REST API source](https://dlthub.com/docs/devel/dlt-ecosystem/verified-sources/rest_api/basic) and [RESTClient](https://dlthub.com/docs/devel/general-usage/http/rest-client)** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb)"
]
},
{
@@ -46,7 +46,9 @@
"We constructed a custom source for the **GitHub API** using the `RESTClient` class, decorators like `@dlt.resource` and `@dlt.source`, and manual pagination handling.\n",
"\n",
"\n",
"#### **Example**"
"#### **Example**\n",
"\n",
"> Don't forget to use your [GitHub API token](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28) below! "
]
},
{
@@ -81,7 +83,7 @@
"from google.colab import userdata\n",
"\n",
"\n",
"os.environ[\"ACCESS_TOKEN\"] = userdata.get(\"ACCESS_TOKEN\")\n",
"dlt.secrets[\"ACCESS_TOKEN\"] = userdata.get(\"ACCESS_TOKEN\")\n",
"\n",
"\n",
"@dlt.source\n",
@@ -148,7 +150,7 @@
" \"client\": {\n",
" \"base_url\": \"https://api.github.com\",\n",
" \"auth\": {\n",
" \"token\": dlt.secrets[\"access_token\"], # Access token configured above\n",
" \"token\": dlt.secrets[\"access_token\"],\n",
" },\n",
" \"paginator\": \"header_link\",\n",
" },\n",
@@ -182,14 +184,14 @@
"\n",
"git_source = rest_api_source(config)\n",
"\n",
"pipeline = dlt.pipeline(\n",
"rest_api_pipeline = dlt.pipeline(\n",
" pipeline_name=\"rest_api_github\",\n",
" destination=\"duckdb\",\n",
" dataset_name=\"rest_api_data\",\n",
" dev_mode=True,\n",
")\n",
"\n",
"load_info = pipeline.run(git_source)\n",
"load_info = rest_api_pipeline.run(git_source)\n",
"print(load_info)"
]
},
@@ -212,7 +214,7 @@
"source": [
"If you don't like black boxes and prefer lower-level building blocks, then our `RESTClient` is perfect for you!\n",
"\n",
"The `RESTClient` class offers an Pythonic interface for interacting with RESTful APIs, including features like:\n",
"The `RESTClient` class offers a Pythonic interface for interacting with RESTful APIs, including features like:\n",
"\n",
"- automatic pagination,\n",
"- various authentication mechanisms,\n",
@@ -225,7 +227,7 @@
"- How to build a custom `@dlt.source`\n",
"- How to run the pipeline and inspect the data\n",
"\n",
"For more information, read `dlt` [REST API Client](https://dlthub.com/devel/general-usage/http/rest-client) official documentation."
"For more information, read `dlt`'s official documentation for the [REST API Client](https://dlthub.com/devel/general-usage/http/rest-client)."
]
},
{
@@ -248,11 +250,10 @@
"source": [
"from dlt.sources.helpers.rest_client import RESTClient\n",
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
"from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator\n",
"from google.colab import userdata\n",
"\n",
"\n",
"os.environ[\"ACCESS_TOKEN\"] = userdata.get(\"ACCESS_TOKEN\")\n",
"dlt.secrets[\"ACCESS_TOKEN\"] = userdata.get(\"ACCESS_TOKEN\")\n",
"\n",
"\n",
"client = RESTClient(\n",
@@ -335,7 +336,7 @@
"\n",
"#### **Authentication Details:**\n",
"\n",
"To use NewsAPI, you must register for a **free account** and obtain an API key. This key is required for all endpoints and must be included as a query parameter in your request:\n",
"To use the NewsAPI, you must register for a **free account** and obtain an API key. This key is required for all endpoints and must be included as a query parameter in your request:\n",
"\n",
"```http\n",
"GET /v2/everything?q=python&page=1&apiKey=YOUR_API_KEY\n",
@@ -357,7 +358,7 @@
"\n",
"1. **Sign up** at [https://newsapi.org/register](https://newsapi.org/register)\n",
"2. Copy your **API key** from your dashboard\n",
"3. Save your **API key** in Colab Secrets (side-bar on the right) as NEWS_API_KEY\n",
"3. Save your **API key** in Colab (or Molab) Secrets (side-bar on the right) as NEWS_API_KEY\n",
"\n",
"\n",
"### **How we chose the right authenticator for NewsAPI**\n",
@@ -423,12 +424,12 @@
"\n",
"api_key = userdata.get(\"NEWS_API_KEY\")\n",
"\n",
"client = RESTClient(\n",
"news_api_client = RESTClient(\n",
" base_url=\"https://newsapi.org/v2/\",\n",
" auth=APIKeyAuth(name=\"apiKey\", api_key=api_key, location=\"query\"),\n",
")\n",
"\n",
"response = client.get(\"everything\", params={\"q\": \"python\", \"page\": 1})\n",
"response = news_api_client.get(\"everything\", params={\"q\": \"python\", \"page\": 1})\n",
"print(response.json())"
]
},
@@ -503,16 +504,24 @@
},
"outputs": [],
"source": [
"page_iterator = client.paginate(\"everything\", params={\"q\": \"python\", \"page\": 1})\n",
"page_iterator = news_api_client.paginate(\n",
" \"everything\", params={\"q\": \"python\", \"page\": 1}\n",
")\n",
"# prints the original request object\n",
"print(next(page_iterator).request)\n",
"page_iterator = client.paginate(\"everything\", params={\"q\": \"python\", \"page\": 1})\n",
"page_iterator = news_api_client.paginate(\n",
" \"everything\", params={\"q\": \"python\", \"page\": 1}\n",
")\n",
"# prints the raw HTTP response\n",
"print(next(page_iterator).response)\n",
"page_iterator = client.paginate(\"everything\", params={\"q\": \"python\", \"page\": 1})\n",
"page_iterator = news_api_client.paginate(\n",
" \"everything\", params={\"q\": \"python\", \"page\": 1}\n",
")\n",
"# prints the paginator that was used\n",
"print(next(page_iterator).paginator)\n",
"page_iterator = client.paginate(\"everything\", params={\"q\": \"python\", \"page\": 1})\n",
"page_iterator = news_api_client.paginate(\n",
" \"everything\", params={\"q\": \"python\", \"page\": 1}\n",
")\n",
"# prints the authentication class used\n",
"print(next(page_iterator).auth)"
]
@@ -545,7 +554,7 @@
"### **Question 1:**\n",
"\n",
"\n",
"Which paginator is used by `client.paginate()` by default in the example above?\n",
"Which paginator is used by `news_api_client.paginate()` by default in the example above?\n",
"\n",
"\n",
">Answer this question and select the correct option in the homework Google Form.\n"
@@ -627,19 +636,19 @@
"api_key = userdata.get(\"NEWS_API_KEY\")\n",
"\n",
"\n",
"client = RESTClient(\n",
"another_client = RESTClient(\n",
" base_url=\"https://newsapi.org/v2/\",\n",
" auth=APIKeyAuth(name=\"apiKey\", api_key=api_key, location=\"query\"),\n",
" paginator=PageNumberPaginator(\n",
" base_page=1, # NewsAPI starts paging from 1\n",
" page_param=\"page\", # Matches the API spec\n",
" total_path=None, # Set it to None explicitly\n",
" stop_after_empty_page=True, # Stop if no articles returned\n",
" maximum_page=4, # Optional limit for dev/testing\n",
" base_page=1,\n",
" page_param=\"page\",\n",
" total_path=None,\n",
" stop_after_empty_page=True,\n",
" maximum_page=4,\n",
" ),\n",
")\n",
"\n",
"for page in client.paginate(\n",
"for page in another_client.paginate(\n",
" \"everything\", params={\"q\": \"python\", \"pageSize\": 5, \"language\": \"en\"}\n",
"):\n",
" for article in page:\n",
@@ -670,14 +679,14 @@
"from dlt.sources.helpers.rest_client import RESTClient\n",
"from dlt.sources.helpers.rest_client.auth import APIKeyAuth\n",
"\n",
"os.environ[\"API_KEY\"] = userdata.get(\"NEWS_API_KEY\")\n",
"dlt.secrets[\"NEWS_API_KEY\"] = userdata.get(\"NEWS_API_KEY\")\n",
"\n",
"\n",
"@dlt.resource(write_disposition=\"replace\", name=\"python_articles\")\n",
"def get_articles(api_key: str = dlt.secrets.value) -> Iterator[TDataItems]:\n",
"def get_articles(news_api_key: str = dlt.secrets.value) -> Iterator[TDataItems]:\n",
" client = RESTClient(\n",
" base_url=\"https://newsapi.org/v2/\",\n",
" auth=APIKeyAuth(name=\"apiKey\", api_key=api_key, location=\"query\"),\n",
" auth=APIKeyAuth(name=\"apiKey\", api_key=news_api_key, location=\"query\"),\n",
" paginator=PageNumberPaginator(\n",
" base_page=1,\n",
" page_param=\"page\",\n",
@@ -715,11 +724,11 @@
"from dlt.sources.helpers.rest_client import RESTClient\n",
"from dlt.sources.helpers.rest_client.auth import APIKeyAuth\n",
"\n",
"os.environ[\"API_KEY\"] = userdata.get(\"NEWS_API_KEY\")\n",
"dlt.secrets[\"NEWS_API_KEY\"] = userdata.get(\"NEWS_API_KEY\")\n",
"\n",
"\n",
"@dlt.resource(write_disposition=\"replace\", name=\"top_articles\")\n",
"def get_top_articles(api_key: str = dlt.secrets.value) -> Iterator[TDataItems]:\n",
"def get_top_articles(news_api_key: str = dlt.secrets.value) -> Iterator[TDataItems]:\n",
" client = RESTClient(\n",
" base_url=\"https://newsapi.org/v2/\",\n",
" auth=APIKeyAuth(name=\"apiKey\", api_key=api_key, location=\"query\"),\n",
@@ -759,8 +768,8 @@
"outputs": [],
"source": [
"@dlt.source\n",
"def newsapi_source(api_key: str = dlt.secrets.value) -> Iterable[DltResource]:\n",
" return [get_articles(api_key=api_key), get_top_articles(api_key=api_key)]"
"def newsapi_source(news_api_key: str = dlt.secrets.value) -> Iterable[DltResource]:\n",
" return [get_articles(news_api_key), get_top_articles(news_api_key)]"
]
},
{
@@ -843,7 +852,7 @@
"\n",
"dlt will take care of the rest: **unnesting the data, inferring the schema**, etc., and **writing to the destination**\n",
"\n",
"In previous section you've already met Rest API Client. `dlt`s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low level abstraction** that powers the REST API Source.\n",
"In the previous section, you've already learned about the Rest API Client. `dlt`s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low level abstraction** that powers the REST API Source.\n",
"\n",
"\n"
]
@@ -909,7 +918,7 @@
"source": [
"### **RESTAPIConfig**\n",
"\n",
"The central object when working with `rest_api_source` is the `RESTAPIConfig`. This is a declarative Python dictionary that tells dlt everything it needs to know about the API you are connecting to.\n",
"The central object when working with the `rest_api_source` is the `RESTAPIConfig`. This is a declarative Python dictionary that tells dlt everything it needs to know about the API you are connecting to.\n",
"\n",
"It defines:\n",
"- how to connect to the API (base URL, authentication)\n",
@@ -1045,7 +1054,7 @@
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
")\n",
"\n",
"load_info = pipeline.run(news_source)\n",
"pipeline.run(news_source)\n",
"print(pipeline.last_trace)"
]
},
@@ -1081,7 +1090,7 @@
"}\n",
"```\n",
"\n",
"This ensures every request has `?apiKey=...` added. It's simple and secure, especially when storing the key in ENVs or Colab's secret manager.\n",
"This ensures every request has `?apiKey=...` added. It's simple and secure, especially when storing the key in ENVs or Colab or Molab's secret manager.\n",
"\n",
"\n",
"The available authentication methods you can find in [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#authentication)."
@@ -1122,12 +1131,12 @@
"\n",
"news_source = rest_api_source(news_config)\n",
"\n",
"pipeline = dlt.pipeline(\n",
"another_pipeline = dlt.pipeline(\n",
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
")\n",
"\n",
"load_info = pipeline.run(news_source)\n",
"print(pipeline.last_trace)"
"another_pipeline.run(news_source)\n",
"print(another_pipeline.last_trace)"
]
},
{
@@ -1202,7 +1211,7 @@
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
")\n",
"\n",
"load_info = pipeline.run(news_source)\n",
"pipeline.run(news_source)\n",
"print(pipeline.last_trace)"
]
},
@@ -1292,7 +1301,7 @@
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
")\n",
"\n",
"load_info = pipeline.run(news_source)\n",
"pipeline.run(news_source)\n",
"print(pipeline.last_trace)"
]
},
@@ -1318,7 +1327,7 @@
"- dlt will remember the last `publishedAt` seen\n",
"- On the next run, it will only request articles newer than that\n",
"\n",
"This is optional and depends on your usage pattern.\n"
"This is optional and depends on your usage pattern."
]
},
{
@@ -1331,8 +1340,14 @@
"source": [
"import dlt\n",
"from dlt.sources.rest_api import rest_api_source\n",
"from datetime import datetime, timedelta, timezone\n",
"from google.colab import userdata\n",
"\n",
"# the free plan of newsapi.org only allows you to fetch news from a maximum of 1 month ago\n",
"one_month_ago = datetime.now(timezone.utc) - timedelta(days=30)\n",
"initial_from = one_month_ago.replace(microsecond=0).strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n",
"\n",
"\n",
"api_key = userdata.get(\"NEWS_API_KEY\")\n",
"\n",
"\n",
@@ -1365,7 +1380,7 @@
" \"from\": {\n",
" \"type\": \"incremental\",\n",
" \"cursor_path\": \"publishedAt\",\n",
" \"initial_value\": \"2025-04-15T00:00:00Z\",\n",
" \"initial_value\": initial_from,\n",
" },\n",
" },\n",
" },\n",
@@ -1379,11 +1394,11 @@
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
")\n",
"\n",
"load_info = pipeline.run(news_source)\n",
"pipeline.run(news_source)\n",
"print(pipeline.last_trace)\n",
"\n",
"# Run the pipeline one more time\n",
"load_info = pipeline.run(news_source)\n",
"pipeline.run(news_source)\n",
"print(pipeline.last_trace)"
]
},
@@ -1471,7 +1486,7 @@
" \"from\": {\n",
" \"type\": \"incremental\",\n",
" \"cursor_path\": \"publishedAt\",\n",
" \"initial_value\": \"2025-04-15T00:00:00Z\",\n",
" \"initial_value\": initial_from,\n",
" },\n",
" },\n",
" },\n",
@@ -1485,11 +1500,11 @@
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
")\n",
"\n",
"load_info = pipeline.run(news_source)\n",
"pipeline.run(news_source)\n",
"print(pipeline.last_trace)\n",
"\n",
"# Run the pipeline one more time\n",
"load_info = pipeline.run(news_source)\n",
"pipeline.run(news_source)\n",
"print(pipeline.last_trace)"
]
},
@@ -1580,7 +1595,7 @@
" \"from\": {\n",
" \"type\": \"incremental\",\n",
" \"cursor_path\": \"publishedAt\",\n",
" \"initial_value\": \"2025-04-15T00:00:00Z\",\n",
" \"initial_value\": initial_from,\n",
" },\n",
" },\n",
" },\n",
@@ -1601,7 +1616,7 @@
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
")\n",
"\n",
"load_info = pipeline.run(news_source)\n",
"pipeline.run(news_source)\n",
"print(pipeline.last_trace)\n",
"\n",
"pipeline.dataset().top_headlines.df().head()"
@@ -1672,9 +1687,10 @@
},
"outputs": [],
"source": [
"def debug_response(\n",
" response: requests.Response, *args: Any, **kwargs: Any\n",
") -> requests.Response:\n",
"from dlt.sources.helpers.requests import Response\n",
"\n",
"\n",
"def debug_response(response: Response, *args: Any, **kwargs: Any) -> Response:\n",
" print(\"Intercepted:\", response.status_code)\n",
" return response"
]
@@ -1728,7 +1744,7 @@
" \"response_actions\": [\n",
" {\n",
" \"status_code\": 200,\n",
" \"action\": debug_response, # <--- add some action\n",
" \"action\": debug_response,\n",
" },\n",
" ],\n",
" \"params\": {\n",
@@ -1736,7 +1752,7 @@
" \"from\": {\n",
" \"type\": \"incremental\",\n",
" \"cursor_path\": \"publishedAt\",\n",
" \"initial_value\": \"2025-04-15T00:00:00Z\",\n",
" \"initial_value\": initial_from,\n",
" },\n",
" },\n",
" },\n",
@@ -1757,7 +1773,7 @@
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
")\n",
"\n",
"load_info = pipeline.run(news_source)\n",
"pipeline.run(news_source)\n",
"print(pipeline.last_trace)\n",
"\n",
"pipeline.dataset().news_articles.df().head()"
@@ -1807,8 +1823,8 @@
},
"outputs": [],
"source": [
"def lower_title(record: TDataItem) -> TDataItem:\n",
" record[\"title\"] = record[\"title\"].lower()\n",
"def lower_title(record: dict[str, Any]) -> dict[str, Any]:\n",
" record[\"title\"] = str(record[\"title\"]).lower()\n",
" return record"
]
},
@@ -1857,8 +1873,8 @@
" {\n",
" \"name\": \"news_articles\",\n",
" \"processing_steps\": [\n",
" {\"filter\": lambda x: len(x[\"author\"]) > 0}, # <--- add filter\n",
" {\"map\": lower_title}, # <--- add some transformation\n",
" {\"filter\": lambda x: len(x[\"author\"]) > 0},\n",
" {\"map\": lower_title},\n",
" ],\n",
" \"endpoint\": {\n",
" \"path\": \"everything\",\n",
@@ -1873,7 +1889,7 @@
" \"from\": {\n",
" \"type\": \"incremental\",\n",
" \"cursor_path\": \"publishedAt\",\n",
" \"initial_value\": \"2025-04-15T00:00:00Z\",\n",
" \"initial_value\": initial_from,\n",
" },\n",
" },\n",
" },\n",
@@ -1894,7 +1910,7 @@
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
")\n",
"\n",
"load_info = pipeline.run(news_source)\n",
"pipeline.run(news_source)\n",
"print(pipeline.last_trace)\n",
"\n",
"pipeline.dataset().news_articles.df().head()"
@@ -1944,15 +1960,15 @@
"\n",
"### Requirements:\n",
"1. Use `rest_api_source` to define your source config.\n",
"2. This API uses **pagination**. Figure out what type is it.\n",
"2. This API uses **pagination**. Figure out what type it is.\n",
"3. Add incremental loading to `orders`, starting from `2017-08-01` and using `ordered_at` as the cursor.\n",
"4. Add `processing_steps` to `orders`:\n",
" - Remove records from orders which `order_total` > 500.\n",
" - Remove records from orders for which it is true that `order_total` > 500.\n",
"\n",
"\n",
"\n",
"### Question:\n",
"How many rows does resulted table `orders` contain?\n"
"How many rows does the resulting table `orders` contain?\n"
]
},
{
@@ -1972,7 +1988,7 @@
"id": "70D6czgeId7F"
},
"source": [
"✅ ▶ Well done! Go to [the next lesson.](https://colab.research.google.com/drive/1lQ8VkrGJwZMsVtbkuYympcvbv0_CCgYo#forceEdit=true&sandboxMode=true)"
"✅ ▶ Well done! Go to [the next lesson.](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)"
]
},
{

View File

@@ -6,7 +6,7 @@
"id": "NvaKFdYx-kbG"
},
"source": [
"# Building custom sources using SQL Databases [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)\n",
"# Building custom sources using SQL Databases [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)\n",
"\n",
"This lesson covers building flexible and powerful custom sources using the `sql_database` verified source.\n"
]
@@ -32,15 +32,6 @@
"- How to load only new data with incremental loading\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4PRqLBIQA7rj"
},
"source": [
"Setup & install dlt:"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -198,7 +189,7 @@
"id": "YjPZMS6DWVNN"
},
"source": [
"Let's save this filtered data:"
"Let's load this filtered data:"
]
},
{
@@ -209,7 +200,7 @@
},
"outputs": [],
"source": [
"info = pipeline.run(filtered_resource, table_name=\"bacterias\")\n",
"info = pipeline.run(filtered_resource, table_name=\"bacteria\")\n",
"print(info)"
]
},
@@ -230,7 +221,7 @@
},
"outputs": [],
"source": [
"pipeline.dataset().bacterias.df().head()"
"pipeline.dataset().bacteria.df().head()"
]
},
{
@@ -241,7 +232,7 @@
"source": [
"### **Question 1**:\n",
"\n",
"How many rows are present in the `bacterias` table?\n",
"How many rows are present in the `bacteria` table?\n",
"\n",
">Answer this question and select the correct option in the homework Quiz.\n"
]
@@ -278,8 +269,10 @@
"\n",
"\n",
"def add_max_timestamp(table: Table) -> Any:\n",
" max_ts = sa.func.greatest(table.c.created, table.c.updated).label(\"max_timestamp\")\n",
" subq = sa.select(*table.c, max_ts).subquery()\n",
" max_ts = sa.func.greatest(table.columns.created, table.columns.updated).label(\n",
" \"max_timestamp\"\n",
" )\n",
" subq = sa.select(*table.columns, max_ts).subquery()\n",
" return subq"
]
},
@@ -476,7 +469,7 @@
"\n",
"We'll also be looking at where these incremental values are stored.\n",
"\n",
"Hint: they are stored in [dlt state](https://dlthub.com/docs/general-usage/state)."
"Hint: they are stored in the [dlt state](https://dlthub.com/docs/general-usage/state)."
]
},
{
@@ -583,17 +576,8 @@
"id": "IkvUgaRhI6iY"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1P8pOw9C6J9555o2jhZydESVuVb-3z__y#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Iz0lz3QhJEvv"
},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,435 @@
# /// script
# dependencies = [
# "dlt",
# "duckdb",
# "numpy",
# "pandas",
# "pymysql",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Building custom sources using SQL Databases [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)
This lesson covers building flexible and powerful custom sources using the `sql_database` verified source.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_2_Custom_sources_SQL_Databases_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_2_Custom_sources_SQL_Databases_img1.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## What you will learn
- How to build a custom pipeline using SQL sources
- How to use `query_adapter_callback`, `table_adapter_callback`, and `type_adapter_callback`
- How to load only new data with incremental loading
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## Step 1: Load data from SQL Databases""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Well use the [Rfam MySQL public DB](https://docs.rfam.org/en/latest/database.html) and load it into DuckDB:"""
)
return
@app.cell
def _():
from typing import Any
from dlt.sources.sql_database import sql_database
import dlt
_source = sql_database(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
table_names=["family"],
)
pipeline = dlt.pipeline(
pipeline_name="sql_database_example",
destination="duckdb",
dataset_name="sql_data",
dev_mode=True,
)
load_info = pipeline.run(_source)
print(load_info)
return Any, dlt, pipeline, sql_database
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Explore the `family` table:""")
return
@app.cell
def _(pipeline):
pipeline.dataset().family.df().head()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Step 2: Customize SQL queries with `query_adapter_callback`
You can fully rewrite or modify the SQL SELECT statement per table.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### Filter rows using a WHERE clause""")
return
@app.cell
def _():
from sqlalchemy import text
from dlt.sources.sql_database.helpers import SelectClause, Table
def query_adapter_callback(query: SelectClause, table: Table) -> SelectClause:
return text(f"SELECT * FROM {table.fullname} WHERE rfam_id like '%bacteria%'")
return Table, query_adapter_callback
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""To be able to use `sql_database` and not have to declare the connection string each time, we save it as an environment variable. This can also (should preferably) be done in `secrets.toml`"""
)
return
@app.cell
def _():
import os
os.environ[
"SOURCES__SQL_DATABASE__CREDENTIALS"
] = "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
return
@app.cell
def _(query_adapter_callback, sql_database):
filtered_resource = sql_database(
query_adapter_callback=query_adapter_callback, table_names=["family"]
)
return (filtered_resource,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Let's load this filtered data:""")
return
@app.cell
def _(filtered_resource, pipeline):
_info = pipeline.run(filtered_resource, table_name="bacteria")
print(_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Explore the data:""")
return
@app.cell
def _(pipeline):
pipeline.dataset().bacteria.df().head()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Question 1**:
How many rows are present in the `bacteria` table?
>Answer this question and select the correct option in the homework Quiz.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Step 3: Modify table schema with `table_adapter_callback`
Add columns, change types, or transform schema using this hook.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### Example: Add computed column `max_timestamp`""")
return
@app.cell
def _(Any, Table):
import sqlalchemy as sa
def add_max_timestamp(table: Table) -> Any:
max_ts = sa.func.greatest(table.columns.created, table.columns.updated).label(
"max_timestamp"
)
subq = sa.select(*table.columns, max_ts).subquery()
return subq
return add_max_timestamp, sa
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Use it with `sql_table`:""")
return
@app.cell
def _(add_max_timestamp, dlt, pipeline):
from dlt.sources.sql_database import sql_table
table = sql_table(
table="family",
table_adapter_callback=add_max_timestamp,
incremental=dlt.sources.incremental("max_timestamp"),
)
_info = pipeline.run(table, table_name="family_with_max_timestamp")
print(_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Let's check out if this column exists!""")
return
@app.cell
def _(pipeline):
pipeline.dataset().family_with_max_timestamp.df().head()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Step 4: Adapt column data types with `type_adapter_callback`
When the default types don't match what you want in the destination, you can remap them.
Let's look at the schema that has already been loaded:
""")
return
@app.cell
def _(pipeline):
schema = pipeline.default_schema.to_dict()["tables"]["family"]["columns"]
for _column in schema:
print(schema[_column]["name"], ":", schema[_column]["data_type"])
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Let's change `hmm_lambda` from decimal to float.
💡 Quick fyi: The `float` data type is:
- Fast and uses less space
- But it's approximate — you may get 0.30000000000000004 instead of 0.3
- Bad for money, great for probabilities, large numeric ranges, scientific values
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### Example: Change data types""")
return
@app.cell
def _(Any, sa):
from sqlalchemy.types import Float
def type_adapter_callback(sql_type: Any) -> Any:
if isinstance(sql_type, sa.Numeric):
return Float
return sql_type
return (type_adapter_callback,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Use it with `sql_database`:""")
return
@app.cell
def _(pipeline, sql_database, type_adapter_callback):
new_source = sql_database(
type_adapter_callback=type_adapter_callback, table_names=["family"]
)
_info = pipeline.run(new_source, table_name="type_changed_family")
print(_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""👀 Can you see how the column data types have changed?""")
return
@app.cell
def _(pipeline):
schema1 = pipeline.default_schema.to_dict()["tables"]["family"]["columns"]
schema2 = pipeline.default_schema.to_dict()["tables"]["type_changed_family"][
"columns"
]
_column = "trusted_cutoff"
print(
"For table 'family':",
schema1[_column]["name"],
":",
schema1[_column]["data_type"],
)
print(
"For table 'type_changed_family':",
schema2[_column]["name"],
":",
schema2[_column]["data_type"],
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Question 2**:
How many columns had their type changed in the `type_changed_family` table?
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Step 5: Incremental loads with `sql_database`
Track only new rows using a timestamp or ID column.
We'll also be looking at where these incremental values are stored.
Hint: they are stored in the [dlt state](https://dlthub.com/docs/general-usage/state).
""")
return
@app.cell
def _():
import json
with open(
"/var/dlt/pipelines/sql_database_example/state.json", "r", encoding="utf-8"
) as _f:
_data = json.load(_f)
_data["sources"]["sql_database"]["resources"]["family"]["incremental"].keys()
return (json,)
@app.cell
def _(dlt, pipeline, sql_database):
import pendulum
_source = sql_database(table_names=["family"])
_source.family.apply_hints(
incremental=dlt.sources.incremental(
"updated", initial_value=pendulum.datetime(2024, 1, 1)
)
)
_info = pipeline.run(_source)
print(_info)
return
@app.cell
def _(json):
with open(
"/var/dlt/pipelines/sql_database_example/state.json", "r", encoding="utf-8"
) as _f:
_data = json.load(_f)
_data["sources"]["sql_database"]["resources"]["family"]["incremental"].keys()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## **Rename tables for `sql_database` source**""")
return
@app.cell
def _(dlt, sql_database):
_source = sql_database(table_names=["family"])
for _resource_name, resource in _source.resources.items():
resource.apply_hints(table_name=f"xxxx__{resource.name}")
pipeline_1 = dlt.pipeline(
pipeline_name="sql_db_prefixed_tables",
destination="duckdb",
dataset_name="renamed_tables",
)
print(pipeline_1.run(_source))
pipeline_1.dataset().row_counts().df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)!"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,7 +6,7 @@
"id": "8ucJBHffzqYB"
},
"source": [
"# Building Custom Sources with the Filesystem in `dlt` [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)"
"# Building Custom Sources with the Filesystem in `dlt` [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)"
]
},
{
@@ -24,8 +24,6 @@
"id": "F5ayDx9Nz1ts"
},
"source": [
"You will learn how to:\n",
"\n",
"- Use the `filesystem` resource to build real custom sources\n",
"- Apply filters to file metadata (name, size, date)\n",
"- Implement and register custom transformers\n",
@@ -42,15 +40,6 @@
"## Setup: Download real data"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "siTnHHjg1fSK"
},
"source": [
"Install dlt"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -80,7 +69,14 @@
},
"outputs": [],
"source": [
"!mkdir -p local_data && wget -O local_data/userdata.parquet https://www.timestored.com/data/sample/userdata.parquet"
"import urllib.request\n",
"import os\n",
"\n",
"os.makedirs(\"local_data\", exist_ok=True)\n",
"\n",
"url = \"https://www.timestored.com/data/sample/userdata.parquet\"\n",
"dest = \"local_data/userdata.parquet\"\n",
"urllib.request.urlretrieve(url, dest)"
]
},
{
@@ -277,7 +273,9 @@
"\n",
"\n",
"# Download a JSON file\n",
"!wget -O local_data/sample.json https://jsonplaceholder.typicode.com/users\n",
"url = \"https://jsonplaceholder.typicode.com/users\"\n",
"dest = \"local_data/sample.json\"\n",
"urllib.request.urlretrieve(url, dest)\n",
"\n",
"fs = filesystem(bucket_url=\"./local_data\", file_glob=\"sample.json\")\n",
"pipeline = dlt.pipeline(\"json_pipeline\", destination=\"duckdb\")\n",
@@ -366,7 +364,7 @@
"id": "XoWLhw7DLg7i"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/14br3TZTRFwTSwpDyom7fxlZCeRF4efMk#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)!"
]
},
{
@@ -375,15 +373,6 @@
"source": [
"![Lesson_3_Custom_sources_Filesystem_and_cloud_storage_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_3_Custom_sources_Filesystem_and_cloud_storage_img1.webp)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rBJ9K3XwMhZW"
},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,301 @@
# /// script
# dependencies = [
# "dlt[duckdb]",
# "numpy",
# "pandas",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""# Building Custom Sources with the Filesystem in `dlt` [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## What you will learn""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
- Use the `filesystem` resource to build real custom sources
- Apply filters to file metadata (name, size, date)
- Implement and register custom transformers
- Enrich records with file metadata
- Use incremental loading both for files and content
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## Setup: Download real data""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Well use a real `.parquet` file from [TimeStored.com](https://www.timestored.com/data/sample/userdata.parquet)"""
)
return
@app.cell
def _():
import urllib.request
import os
os.makedirs("local_data", exist_ok=True)
_url = "https://www.timestored.com/data/sample/userdata.parquet"
_dest = "local_data/userdata.parquet"
urllib.request.urlretrieve(_url, _dest)
return os, urllib
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Step 1: Load Parquet file from Local Filesystem
**What the script below does**: Lists and reads all `.parquet` files in `./local_data` and loads them into a table named `userdata`.
""")
return
@app.cell
def _():
import dlt
from dlt.sources.filesystem import filesystem, read_parquet
# Point to the local file directory
_fs = filesystem(bucket_url="./local_data", file_glob="**/*.parquet")
# Add a transformer
parquet_data = _fs | read_parquet()
# Create and run the pipeline
pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb")
_load_info = pipeline.run(parquet_data.with_name("userdata"))
print(_load_info)
# Inspect data
pipeline.dataset().userdata.df().head()
return dlt, filesystem, pipeline, read_parquet
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Question 1**:
In the `my_pipeline` pipeline and the `userdata` table, what is the men:women ratio as a decimal?
""")
return
@app.cell
def _(pipeline):
# check out the numbers below and answer 👀
df = pipeline.dataset().userdata.df()
df.groupby("gender").describe()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Step 2: Enrich records with file metadata
Lets add the file name to every record to track the data origin.
""")
return
@app.cell
def _(dlt, filesystem):
from dlt.common.typing import TDataItems
@dlt.transformer()
def read_parquet_with_filename(files: TDataItems) -> TDataItems:
import pyarrow.parquet as pq
for file_item in files:
with file_item.open() as f:
table = pq.read_table(f).to_pandas()
table["source_file"] = file_item["file_name"]
yield table.to_dict(orient="records")
_fs = filesystem(bucket_url="./local_data", file_glob="*.parquet")
pipeline_1 = dlt.pipeline("meta_pipeline", destination="duckdb")
_load_info = pipeline_1.run(
(_fs | read_parquet_with_filename()).with_name("userdata")
)
print(_load_info)
return (TDataItems,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## Step 3: Filter files by metadata""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Only load files matching custom logic:""")
return
@app.cell
def _(dlt, filesystem, read_parquet):
_fs = filesystem(bucket_url="./local_data", file_glob="**/*.parquet")
_fs.add_filter(lambda f: "user" in f["file_name"] and f["size_in_bytes"] < 1000000)
pipeline_2 = dlt.pipeline("filtered_pipeline", destination="duckdb")
_load_info = pipeline_2.run((_fs | read_parquet()).with_name("userdata_filtered"))
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Step 4: Load files incrementally
Avoid reprocessing the same file twice.
""")
return
@app.cell
def _(dlt, filesystem, read_parquet):
_fs = filesystem(bucket_url="./local_data", file_glob="**/*.parquet")
_fs.apply_hints(incremental=dlt.sources.incremental("modification_date"))
data = (_fs | read_parquet()).with_name("userdata")
pipeline_3 = dlt.pipeline("incremental_pipeline", destination="duckdb")
_load_info = pipeline_3.run(data)
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Step 5: Create a custom transformer
Lets read structured data from `.json` files.
""")
return
@app.cell
def _(TDataItems, dlt, filesystem, urllib):
@dlt.transformer(standalone=True)
def read_json(items: TDataItems) -> TDataItems:
from dlt.common import json
for file_obj in items:
with file_obj.open() as f:
yield json.load(f)
_url = "https://jsonplaceholder.typicode.com/users"
_dest = "local_data/sample.json"
urllib.request.urlretrieve(_url, _dest)
_fs = filesystem(bucket_url="./local_data", file_glob="sample.json")
pipeline_4 = dlt.pipeline("json_pipeline", destination="duckdb")
_load_info = pipeline_4.run((_fs | read_json()).with_name("users"))
print(_load_info)
return (pipeline_4,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
📁 You will see that this file now also exists in your `local_data` directory.
> A **standalone** resource is defined on a top-level module function (not an inner function) that accepts config and secrets values. Additionally, when the standalone flag is specified, the decorated function's signature and docstring are preserved: `dlt.resource` just wraps the decorated function, and you must call the wrapper to get the actual resource. A minimal sketch of a standalone transformer follows after the table below.
Let's inspect the `users` table in your DuckDB dataset:
""")
return
@app.cell
def _(pipeline_4):
pipeline_4.dataset().users.df().head()
return
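@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    As a minimal sketch (not part of this lesson's pipeline), a standalone transformer is just a top-level function into which dlt can inject config values. Here `encoding` is a hypothetical config value used only for illustration:

    ```python
    import dlt
    from dlt.common.typing import TDataItems


    @dlt.transformer(standalone=True)
    def read_text(items: TDataItems, encoding: str = dlt.config.value) -> TDataItems:
        # file items are opened in binary mode, so decode with the configured encoding
        for file_obj in items:
            with file_obj.open() as f:
                yield {"file_name": file_obj["file_name"], "text": f.read().decode(encoding)}


    # usage sketch: pipe filesystem items into the wrapper returned by the decorator
    # text_files = filesystem(bucket_url="./local_data", file_glob="*.txt") | read_text()
    ```
    """)
    return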
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Step 6: Copy files before loading
Copy files locally as part of the pipeline. This is useful for backups or post-processing.
""")
return
@app.cell
def _(dlt, filesystem, os):
from dlt.common.storages.fsspec_filesystem import FileItemDict
def copy_local(item: FileItemDict) -> FileItemDict:
local_path = os.path.join("copied", item["file_name"])
os.makedirs(os.path.dirname(local_path), exist_ok=True)
item.fsspec.download(item["file_url"], local_path)
return item
_fs = filesystem(bucket_url="./local_data", file_glob="**/*.parquet").add_map(
copy_local
)
pipeline_5 = dlt.pipeline("copy_pipeline", destination="duckdb")
_load_info = pipeline_5.run(_fs.with_name("copied_files"))
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Next steps
- Try building a transformer for `.xml` using `xmltodict` (a minimal sketch follows below)
- Combine multiple directories or buckets in a single pipeline
- Explore [more examples](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/advanced)
""")
return
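@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    A minimal sketch for the first item above, assuming the `xmltodict` package is installed (it is not among this notebook's dependencies):

    ```python
    import xmltodict

    import dlt
    from dlt.common.typing import TDataItems


    @dlt.transformer()
    def read_xml(items: TDataItems) -> TDataItems:
        # parse each XML file into a nested dict that dlt can normalize into tables
        for file_obj in items:
            with file_obj.open() as f:
                yield xmltodict.parse(f)


    # usage sketch:
    # xml_files = filesystem(bucket_url="./local_data", file_glob="*.xml") | read_xml()
    # pipeline.run(xml_files.with_name("xml_data"))
    ```
    """)
    return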
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)!"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_3_Custom_sources_Filesystem_and_cloud_storage_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_3_Custom_sources_Filesystem_and_cloud_storage_img1.webp)"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,7 +6,7 @@
"id": "eZpIGo3Fg8hR"
},
"source": [
"# Custom destinations & Reverse ETL [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)\n",
"# Custom destinations & Reverse ETL [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)\n",
"\n",
"---\n",
"\n",
@@ -15,7 +15,7 @@
"- What reverse ETL means in practice \n",
"- How to build custom destinations with `@dlt.destination` \n",
"- How batching works \n",
"- How to push real data from Rfam database to Notion \n",
"- How to push real data from the Rfam database to Notion \n",
"\n",
"---\n"
]
@@ -237,8 +237,8 @@
"\n",
"### 4.1. Step 1: Create a database in Notion\n",
"\n",
"1. Create empty database. [Notion documentation.](https://super.so/blog/6-steps-to-creating-databases-in-notion)\n",
"2. [Create integration](https://www.notion.so/profile/integrations) in your Notion Workspace.\n",
"1. Create empty an database. [Notion documentation.](https://super.so/blog/6-steps-to-creating-databases-in-notion)\n",
"2. [Create an integration](https://www.notion.so/profile/integrations) in your Notion Workspace.\n",
"3. Connect your database to the integration.\n",
"4. Create 3 columns: Accession (title), ID (text), Description (text)"
]
@@ -263,7 +263,7 @@
"id": "0AdDovQklsE9"
},
"source": [
"### 4.2. Step 2: Install and configure"
"### 4.2. Step 2: Configure"
]
},
{
@@ -289,7 +289,7 @@
"2. Set your credentials either in:\n",
" - `~/.dlt/secrets.toml` \n",
" - or environment variables\n",
" - or (**in our case**) in Colab Secrets\n",
" - or (**in our case**) in Colab or Molab Secrets\n",
"\n",
" ```toml\n",
" [destination.notion]\n",
@@ -344,7 +344,7 @@
"id": "C0r_R3M_6ePP"
},
"source": [
"You can also check if your integration works via `curl`:\n",
"You can also check if your integration works via the requests library:\n",
"1. Modify Bearer token\n",
"2. Modify \"query\" if you database have another name"
]
@@ -357,7 +357,24 @@
},
"outputs": [],
"source": [
"! curl -X POST 'https://api.notion.com/v1/search' -H 'Authorization: Bearer '\"ntn_q5_your_token_o5xQLn1sewnep6\"'' -H 'Content-Type: application/json' -H 'Notion-Version: 2022-06-28' --data '{\"query\": \"Advanced\", \"filter\": {\"value\": \"database\", \"property\": \"object\"}, \"sort\": {\"direction\":\"ascending\", \"timestamp\":\"last_edited_time\"}}'"
"import requests\n",
"\n",
"url = \"https://api.notion.com/v1/search\"\n",
"\n",
"headers = {\n",
" \"Authorization\": \"Bearer ntn_q5_your_token_o5xQLn1sewnep6\",\n",
" \"Content-Type\": \"application/json\",\n",
" \"Notion-Version\": \"2022-06-28\",\n",
"}\n",
"\n",
"data = {\n",
" \"query\": \"Advanced\",\n",
" \"filter\": {\"value\": \"database\", \"property\": \"object\"},\n",
" \"sort\": {\"direction\": \"ascending\", \"timestamp\": \"last_edited_time\"},\n",
"}\n",
"\n",
"response = requests.post(url, headers=headers, json=data)\n",
"print(response.json())"
]
},
{
@@ -424,8 +441,8 @@
"from notion_client import Client\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"DESTINATION__NOTION__NOTION_AUTH\"] = userdata.get(\"NOTION_AUTHENTICATION\")\n",
"os.environ[\"DESTINATION__NOTION__NOTION_PAGE_ID\"] = userdata.get(\"NOTION_PAGE_ID\")\n",
"dlt.secrets[\"DESTINATION__NOTION__NOTION_AUTH\"] = userdata.get(\"NOTION_AUTHENTICATION\")\n",
"dlt.secrets[\"DESTINATION__NOTION__NOTION_PAGE_ID\"] = userdata.get(\"NOTION_PAGE_ID\")\n",
"\n",
"\n",
"@dlt.destination(name=\"notion\")\n",
@@ -522,17 +539,8 @@
"id": "nJach4xBFfva"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1--wNVd26TqNolnnECnUYZqeE2CXOeVZE#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "vmz0tMhcmwPh"
},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,464 @@
# /// script
# dependencies = [
# "dlt",
# "dlt[duckdb]",
# "notion-client",
# "numpy",
# "pandas",
# "pymysql",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Custom destinations & Reverse ETL [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)
---
## What youll learn
- What reverse ETL means in practice
- How to build custom destinations with `@dlt.destination`
- How batching works
- How to push real data from the Rfam database to Notion
---
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_4_Destinations_Reverse_ETL_img](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **1. Concept: What is a custom destination?**
Normally, dlt sends your data to databases like BigQuery or Postgres.
But with `@dlt.destination`, you can **intercept the normalized data** and send it wherever you want:
- APIs (Notion, Slack, Airtable)
- Message queues (Kafka, SQS)
- Logging systems
- Custom data sinks
All you have to do is define a function like:
```python
@dlt.destination
def my_destination(items, table):
...
```
And dlt will call this for every batch of data extracted and normalized.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **2. Simple example: print data rows**
### Code example:
""")
return
@app.cell
def _():
import dlt
from dlt.common.typing import TDataItems
from dlt.common.schema import TTableSchema
@dlt.destination(batch_size=5)
def print_sink(items: TDataItems, table: TTableSchema) -> None:
print(f"\nTable: {table['name']}")
for item in items:
print(item)
@dlt.resource
def simple_data() -> TDataItems:
yield [{"id": i, "value": f"row-{i}"} for i in range(12)]
_pipeline = dlt.pipeline("print_example", destination=print_sink)
_pipeline.run(simple_data())
print(_pipeline.last_trace)
return TDataItems, TTableSchema, dlt, simple_data
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**Whats happening?**
- `simple_data()` yields 12 small records.
- The data goes through **normalization** (converted to rows + types).
- `@dlt.destination(batch_size=5)` groups these rows into batches of 5.
- For each batch, `print_sink()` is called.
- The `table` parameter tells you which table the batch belongs to.
**Why is this important?**
- This is the **simplest possible custom destination.**
- Youre in control: log, debug, or route data per table.
- It introduces how dlt structures the data and calls your function.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Question 1:
In the following example, how many times will the function be called?
""")
return
@app.cell
def _(TDataItems, TTableSchema, dlt):
@dlt.destination(batch_size=2)
def new_print_sink(items: TDataItems, table: TTableSchema) -> None:
print(items)
@dlt.resource
def new_simple_data() -> TDataItems:
yield [{"id": i} for i in range(6)]
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **3. How batching works**
By default `batch_size` is 10.
Lets tweak just one thing:
""")
return
@app.cell
def _(TDataItems, TTableSchema, dlt, simple_data):
@dlt.destination(batch_size=1)
def print_each_row(items: TDataItems, table: TTableSchema) -> None:
print(f"Got one row from table {table['name']}:")
print(items)
_pipeline = dlt.pipeline("print_example", destination=print_each_row)
_pipeline.run(simple_data())
print(_pipeline.last_trace)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Now, dlt calls your function **once per row** instead of per 5 rows.
Useful if:
- Your API doesnt support bulk inserts.
- You want fine-grained control or retries.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **4. Real-world project: Rfam database → Notion**
Lets build a real pipeline that fetches data from the Rfam database and **sends it to Notion**.
### Why Notion?
- Notion is a great tool for product/dev teams.
- But dlt doesnt support Notion as a *destination*.
- So, well build that ourselves.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### 4.1. Step 1: Create a database in Notion
1. Create an empty database. [Notion documentation.](https://super.so/blog/6-steps-to-creating-databases-in-notion)
2. [Create an integration](https://www.notion.so/profile/integrations) in your Notion Workspace.
3. Connect your database to the integration.
4. Create 3 columns: Accession (title), ID (text), Description (text)
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_4_Destinations_Reverse_ETL_img2](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img2.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_4_Destinations_Reverse_ETL_img3](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img3.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### 4.2. Step 2: Configure""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
2. Set your credentials either in:
- `~/.dlt/secrets.toml`
- or environment variables
- or (**in our case**) in Colab or Molab Secrets
```toml
[destination.notion]
notion_auth = "<your_integration_token>"
notion_page_id = "<your_database_id>"
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""- Save your [Notion authentication token](https://developers.notion.com/docs/authorization#internal-integration-auth-flow-set-up) and the [ID of the page](https://developers.notion.com/docs/working-with-page-content#creating-a-page-with-content) where you want to create a database in your Colab secrets:"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_4_Destinations_Reverse_ETL_img4](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img4.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_4_Destinations_Reverse_ETL_img5](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img5.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_4_Destinations_Reverse_ETL_img6](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img6.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""> Make sure to [connect the page](https://www.notion.so/help/add-and-manage-connections-with-the-api#add-connections-to-pages) to the integration associated with the token!"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
You can also check if your integration works via the requests library:
1. Modify the Bearer token
2. Modify "query" if your database has another name
""")
return
@app.cell
def _():
import requests
url = "https://api.notion.com/v1/search"
headers = {
"Authorization": "Bearer ntn_q5_your_token_o5xQLn1sewnep6",
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
}
data = {
"query": "Advanced",
"filter": {"value": "database", "property": "object"},
"sort": {"direction": "ascending", "timestamp": "last_edited_time"},
}
response = requests.post(url, headers=headers, json=data)
print(response.json())
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### 4.3. Step 3: Get data from Rfam database
Let's use `query_adapter_callback` to limit the number of data rows:
""")
return
@app.cell
def _():
import os
import sqlalchemy as sa
from sqlalchemy import text
from dlt.sources.sql_database import sql_database
from dlt.sources.sql_database.helpers import SelectClause, Table
def limit_rows(query: SelectClause, table: Table) -> SelectClause:
return text(f"SELECT * FROM {table.fullname} LIMIT 20")
source = sql_database(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
table_names=["family"],
query_adapter_callback=limit_rows,
)
return os, source
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### 4.4. Step 4: Define Notion destination""")
return
@app.cell
def _(TDataItems, TTableSchema, dlt, os):
from notion_client import Client
dlt.secrets["DESTINATION__NOTION__NOTION_AUTH"] = os.getenv("NOTION_AUTHENTICATION")
dlt.secrets["DESTINATION__NOTION__NOTION_PAGE_ID"] = os.getenv("NOTION_PAGE_ID")
@dlt.destination(name="notion")
def push_to_notion(
items: TDataItems,
table: TTableSchema,
notion_auth: str = dlt.secrets.value,
notion_page_id: str = dlt.secrets.value,
) -> None:
client = Client(auth=notion_auth)
print(len(items))
for item in items:
client.pages.create(
parent={"database_id": notion_page_id},
properties={
"Accession": {"title": [{"text": {"content": item["rfam_acc"]}}]},
"ID": {"rich_text": [{"text": {"content": item["rfam_id"]}}]},
"Description": {
"rich_text": [{"text": {"content": item["description"]}}]
},
},
)
return (push_to_notion,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**Whats happening?**
- dlt will call `push_to_notion()` with one batch of records at a time.
- For each record, we create a page in Notion.
- Credentials and database ID come from `secrets.toml` or env vars.
**Why is this useful?**
- You just turned your pipeline into a full **reverse ETL** job.
- No need for Airbyte or writing custom orchestration scripts.
- Its reusable and works with dlts retry logic, state management, and transformations.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### 4.5. Step 5: Run the pipeline""")
return
@app.cell
def _(dlt, push_to_notion, source):
_pipeline = dlt.pipeline(
"notion_pipeline", destination=push_to_notion, progress="log"
)
_pipeline.run(source, table_name="rfam_family")
print(_pipeline.last_trace)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_4_Destinations_Reverse_ETL_img7](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img7.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## 5. Reliability and state
### What if Notion fails mid-run?
- dlt **retries batches** up to 5 times.
- You can restart the pipeline and it will continue from the failed batch.
- But you must make your destination **idempotent** (i.e., safe to re-run on the same input); a minimal sketch follows below.
""")
return
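@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    One way to make the Notion destination above idempotent is to skip rows that already exist. The sketch below reuses the imports from Step 4.4 and assumes the same `ID` text property created in Step 1; it is an illustration of the pattern, not the only possible approach:

    ```python
    @dlt.destination(name="notion")
    def push_to_notion_idempotent(
        items: TDataItems,
        table: TTableSchema,
        notion_auth: str = dlt.secrets.value,
        notion_page_id: str = dlt.secrets.value,
    ) -> None:
        client = Client(auth=notion_auth)
        for item in items:
            # look up pages whose "ID" property already holds this rfam_id
            existing = client.databases.query(
                database_id=notion_page_id,
                filter={"property": "ID", "rich_text": {"equals": item["rfam_id"]}},
            )
            if existing["results"]:
                continue  # already loaded in a previous (possibly failed) run
            client.pages.create(
                parent={"database_id": notion_page_id},
                properties={
                    "Accession": {"title": [{"text": {"content": item["rfam_acc"]}}]},
                    "ID": {"rich_text": [{"text": {"content": item["rfam_id"]}}]},
                    "Description": {"rich_text": [{"text": {"content": item["description"]}}]},
                },
            )
    ```
    """)
    return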
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)!"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,17 +6,17 @@
"id": "CbFVutT06Cqq"
},
"source": [
"# Transforming and filtering the data [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)\n",
"# Transforming and filtering the data [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)\n",
"\n",
"In this lesson, we will take a look at various ways of doing data transformations and filtering of the data during and after the ingestion.\n",
"\n",
"dlt provides several ways of doing it during the ingestion:\n",
"1. With custom query (applicable for `sql_database` source).\n",
"2. With dlt special functions (`add_map` and `add_filter`).\n",
"1. With a custom query (applicable for `sql_database` source).\n",
"2. With special dlt functions (`add_map` and `add_filter`).\n",
"3. Via `@dlt.transformers`.\n",
"4. With `pipeline.dataset()`.\n",
"\n",
"Let's review and compare those methods."
"Let's review and compare these methods."
]
},
{
@@ -116,8 +116,8 @@
"outputs": [],
"source": [
"with pipeline.sql_client() as client:\n",
" with client.execute_query(\"SELECT * FROM genome\") as table:\n",
" genome = table.df()\n",
" with client.execute_query(\"SELECT * FROM genome\") as my_table:\n",
" genome = my_table.df()\n",
"genome"
]
},
@@ -139,8 +139,8 @@
"outputs": [],
"source": [
"with pipeline.sql_client() as client:\n",
" with client.execute_query(\"SELECT COUNT(*) AS total_rows FROM genome\") as table:\n",
" print(table.df())"
" with client.execute_query(\"SELECT COUNT(*) AS total_rows FROM genome\") as my_table:\n",
" print(my_table.df())"
]
},
{
@@ -158,7 +158,7 @@
"id": "edAUbOHXuwlL"
},
"source": [
"Imagine a use-case where we're only interested in getting the genome data for bacterias. In this case, ingesting the whole `genome` table would be an unnecessary use of time and compute resources."
"Imagine a use-case where we're only interested in getting the genome data for bacteria. In this case, ingesting the whole `genome` table would be an unnecessary use of time and compute resources."
]
},
{
@@ -172,8 +172,8 @@
"with pipeline.sql_client() as client:\n",
" with client.execute_query(\n",
" \"SELECT COUNT(*) AS total_rows FROM genome WHERE kingdom='bacteria'\"\n",
" ) as table:\n",
" print(table.df())"
" ) as my_table:\n",
" print(my_table.df())"
]
},
{
@@ -190,20 +190,14 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "F8A675ZXTCn9"
},
"metadata": {},
"outputs": [],
"source": [
"from dlt.sources.sql_database.helpers import Table, SelectAny, SelectClause\n",
"\n",
"\n",
"def query_adapter_callback(query: SelectAny, table: Table) -> SelectAny:\n",
" if table.name == \"genome\":\n",
" # Only select rows where the column kingdom has value \"bacteria\"\n",
" return query.where(table.c.kingdom == \"bacteria\")\n",
" # Use the original query for other tables\n",
" return query"
" return query.where(table.c.kingdom == \"bacteria\") if table.name else query"
]
},
{
@@ -240,8 +234,7 @@
" dataset_name=\"sql_data\",\n",
")\n",
"\n",
"load_info = pipeline.run(source, write_disposition=\"replace\")\n",
"\n",
"pipeline.run(source, write_disposition=\"replace\")\n",
"print(pipeline.last_trace)"
]
},
@@ -305,16 +298,16 @@
"with pipeline.sql_client() as client:\n",
" with client.execute_query(\n",
" \"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM clan\"\n",
" ) as table:\n",
" ) as my_table:\n",
" print(\"Table clan:\")\n",
" print(table.df())\n",
" print(my_table.df())\n",
" print(\"\\n\")\n",
"\n",
" with client.execute_query(\n",
" \"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM genome\"\n",
" ) as table:\n",
" ) as my_table:\n",
" print(\"Table genome:\")\n",
" print(table.df())"
" print(my_table.df())"
]
},
{
@@ -373,9 +366,9 @@
"outputs": [],
"source": [
"with pipeline.sql_client() as client:\n",
" with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as table:\n",
" with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as my_table:\n",
" print(\"Table clan:\")\n",
" print(table.df())"
" print(my_table.df())"
]
},
{
@@ -465,9 +458,9 @@
"outputs": [],
"source": [
"with pipeline.sql_client() as client:\n",
" with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as table:\n",
" with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as my_table:\n",
" print(\"Table clan:\")\n",
" clan = table.df()\n",
" clan = my_table.df()\n",
"\n",
"clan"
]
@@ -546,9 +539,9 @@
"outputs": [],
"source": [
"with pipeline.sql_client() as client:\n",
" with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as table:\n",
" with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as my_table:\n",
" print(\"Table clan:\")\n",
" print(table.df())"
" print(my_table.df())"
]
},
{
@@ -596,8 +589,8 @@
"\n",
"resource.add_map(add_greeting)\n",
"\n",
"for row in resource():\n",
" print(row)"
"for _row in resource():\n",
" print(_row)"
]
},
{
@@ -680,7 +673,7 @@
")\n",
"source.genome.add_filter(lambda item: item[\"kingdom\"] == \"bacteria\")\n",
"\n",
"load_info = pipeline.run(source, write_disposition=\"replace\")\n",
"pipeline.run(source, write_disposition=\"replace\")\n",
"\n",
"print(pipeline.last_trace)"
]
@@ -696,9 +689,9 @@
"with pipeline.sql_client() as client:\n",
" with client.execute_query(\n",
" \"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM genome\"\n",
" ) as table:\n",
" ) as my_table:\n",
" print(\"Table genome:\")\n",
" genome_count = table.df()\n",
" genome_count = my_table.df()\n",
"genome_count"
]
},
@@ -753,8 +746,7 @@
")\n",
"source.genome.add_limit(1)\n",
"\n",
"load_info = pipeline.run(source, write_disposition=\"replace\")\n",
"\n",
"pipeline.run(source, write_disposition=\"replace\")\n",
"print(pipeline.last_trace)"
]
},
@@ -767,8 +759,8 @@
"outputs": [],
"source": [
"with pipeline.sql_client() as client:\n",
" with client.execute_query(\"SELECT * FROM genome\") as table:\n",
" genome_limited = table.df()\n",
" with client.execute_query(\"SELECT * FROM genome\") as my_table:\n",
" genome_limited = my_table.df()\n",
"genome_limited"
]
},
@@ -824,7 +816,7 @@
" dev_mode=True,\n",
")\n",
"\n",
"info = pipeline.run([genome_resource, genome_resource | batch_stats])\n",
"pipeline.run([genome_resource, genome_resource | batch_stats])\n",
"print(pipeline.last_trace)"
]
},
@@ -837,8 +829,8 @@
"outputs": [],
"source": [
"with pipeline.sql_client() as client:\n",
" with client.execute_query(\"SELECT * FROM batch_stats\") as table:\n",
" res = table.df()\n",
" with client.execute_query(\"SELECT * FROM batch_stats\") as my_table:\n",
" res = my_table.df()\n",
"res"
]
},
@@ -879,16 +871,16 @@
"# NOTE: this is the duckdb sql dialect, other destinations may use different expressions\n",
"with pipeline.sql_client() as client:\n",
" client.execute_sql(\n",
" \"\"\" CREATE OR REPLACE TABLE genome_length AS\n",
" SELECT\n",
" SUM(total_length) AS total_total_length,\n",
" AVG(total_length) AS average_total_length\n",
" FROM\n",
" genome\n",
" \"\"\"\n",
" (\n",
" \"CREATE OR REPLACE TABLE genome_length AS \"\n",
" \"SELECT \"\n",
" \" SUM(total_length) AS total_total_length, \"\n",
" \" AVG(total_length) AS average_total_length \"\n",
" \"FROM genome\"\n",
" )\n",
" )\n",
" with client.execute_query(\"SELECT * FROM genome_length\") as table:\n",
" genome_length = table.df()\n",
" with client.execute_query(\"SELECT * FROM genome_length\") as my_table:\n",
" genome_length = my_table.df()\n",
"\n",
"genome_length"
]
@@ -1068,7 +1060,7 @@
"id": "AH3F46PaJZe4"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1XT1xUIQIWj0nPWOmTixThgdXzi4vudce#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)!"
]
}
],

View File

@@ -0,0 +1,765 @@
# /// script
# dependencies = [
# "dlt[sql_database,duckdb]",
# "ibis-framework[duckdb]",
# "numpy",
# "pandas",
# "pymysql",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Transforming and filtering the data [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)
In this lesson, we will take a look at various ways of doing data transformations and filtering of the data during and after the ingestion.
dlt provides several ways of doing it during the ingestion:
1. With a custom query (applicable for `sql_database` source).
2. With special dlt functions (`add_map` and `add_filter`).
3. Via `@dlt.transformers`.
4. With `pipeline.dataset()`.
Let's review and compare these methods.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## What youll learn:
- How to limit rows at the source with SQL queries.
- How to apply custom Python logic per record.
- How to write transformations using functional and declarative APIs.
- How to access and query your loaded data using `.dataset()`.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## Setup and initial Load""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
We will be using the `sql_database` source as an example and will connect to the public [MySQL RFam](https://docs.rfam.org/en/latest/database.html) database. The RFam database contains publicly accessible scientific data on RNA structures.
Let's perform an initial load:
""")
return
@app.cell
def _():
import dlt
from dlt.sources.sql_database import sql_database
_source = sql_database(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
table_names=["family", "genome"],
)
pipeline = dlt.pipeline(
pipeline_name="sql_database_pipeline",
destination="duckdb",
dataset_name="sql_data",
)
_load_info = pipeline.run(_source)
print(_load_info)
return dlt, pipeline, sql_database
@app.cell
def _(pipeline):
with pipeline.sql_client() as _client:
with _client.execute_query("SELECT * FROM genome") as _my_table:
genome = _my_table.df()
genome
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""You can check your data count using `sql_client`:""")
return
@app.cell
def _(pipeline):
with pipeline.sql_client() as _client:
with _client.execute_query(
"SELECT COUNT(*) AS total_rows FROM genome"
) as _my_table:
print(_my_table.df())
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""## **1. Filtering the data during the ingestion with `query_adapter_callback`**"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Imagine a use-case where we're only interested in getting the genome data for bacteria. In this case, ingesting the whole `genome` table would be an unnecessary use of time and compute resources."""
)
return
@app.cell
def _(pipeline):
with pipeline.sql_client() as _client:
with _client.execute_query(
"SELECT COUNT(*) AS total_rows FROM genome WHERE kingdom='bacteria'"
) as _my_table:
print(_my_table.df())
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
When ingesting data with the `sql_database` source, dlt runs a `SELECT` statement behind the scenes, and the `query_adapter_callback` parameter makes it possible to add a `WHERE` clause to that underlying `SELECT` statement.
In this example, only the table `genome` is filtered on the column `kingdom`.
""")
return
@app.cell
def _():
from dlt.sources.sql_database.helpers import Table, SelectAny, SelectClause
def query_adapter_callback(query: SelectAny, table: Table) -> SelectAny:
return query.where(table.c.kingdom == "bacteria") if table.name == "genome" else query
return SelectAny, SelectClause, Table, query_adapter_callback
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Attach it:""")
return
@app.cell
def _(dlt, query_adapter_callback, sql_database):
_source = sql_database(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
table_names=["genome"],
query_adapter_callback=query_adapter_callback,
)
pipeline_1 = dlt.pipeline(
pipeline_name="sql_database_pipeline_filtered",
destination="duckdb",
dataset_name="sql_data",
)
pipeline_1.run(_source, write_disposition="replace")
print(pipeline_1.last_trace)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Another option is to create an SQL VIEW in your source database and extract data from it. In that case, dlt will infer all column types and read the data in the shape you define in the view without any further customization.
If creating a view is not feasible, you can fully rewrite the automatically generated query with an extended version of `query_adapter_callback`:
""")
return
@app.cell
def _(SelectAny, SelectClause, Table, dlt, sql_database):
import sqlalchemy as sa
def query_adapter_callback_1(query: SelectAny, table: Table) -> SelectClause:
if table.name == "genome":
return sa.text(f"SELECT * FROM {table.fullname} WHERE kingdom='bacteria'")
return query
_source = sql_database(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
table_names=["genome", "clan"],
query_adapter_callback=query_adapter_callback_1,
)
pipeline_2 = dlt.pipeline(
pipeline_name="sql_database_pipeline_filtered",
destination="duckdb",
dataset_name="sql_data",
)
_load_info = pipeline_2.run(_source, write_disposition="replace")
print(_load_info)
return (pipeline_2,)
@app.cell
def _(pipeline_2):
with pipeline_2.sql_client() as _client:
with _client.execute_query(
"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM clan"
) as _my_table:
print("Table clan:")
print(_my_table.df())
print("\n")
with _client.execute_query(
"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM genome"
) as _my_table:
print("Table genome:")
print(_my_table.df())
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## **2. Transforming the data after extract and before load**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Since dlt is a Python library, it gives you a lot of control over the extracted data.
You can attach any number of transformations that are evaluated on an item-per-item basis to your resource. The available transformation types:
* `map` - transform the data item (resource.add_map).
* `filter` - filter the data item (resource.add_filter).
* `yield map` - a map that returns an iterator (so a single row may generate many rows - resource.add_yield_map).
* `limit` - limits the number of records processed by a resource. Useful for testing or reducing data volume during development.
For example, if we wanted to anonymize sensitive data before it is loaded into the destination, we can write a Python function for it and apply it to a source or resource using the `.add_map()` method.
[dlt documentation.](https://dlthub.com/docs/general-usage/resource#filter-transform-and-pivot-data)
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### Using `add_map`""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""In the table `clan`, we notice that there is a column `author` that we would like to anonymize."""
)
return
@app.cell
def _(pipeline_2):
with pipeline_2.sql_client() as _client:
with _client.execute_query(
"SELECT DISTINCT author FROM clan LIMIT 5"
) as _my_table:
print("Table clan:")
print(_my_table.df())
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""We write a function in python that anonymizes a string""")
return
@app.cell
def _():
import hashlib
from dlt.common.typing import TDataItem
def pseudonymize_name(row: TDataItem) -> TDataItem:
"""
Pseudonymization is a deterministic type of PII-obscuring.
Its role is to allow identifying users by their hash,
without revealing the underlying info.
"""
# add a constant salt to generate a deterministic hash
salt = "WI@N57%zZrmk#88c"
salted_string = row["author"] + salt
sh = hashlib.sha256()
sh.update(salted_string.encode())
hashed_string = sh.digest().hex()
row["author"] = hashed_string
return row
return TDataItem, hashlib, pseudonymize_name
@app.cell
def _(dlt, pseudonymize_name, sql_database):
pipeline_3 = dlt.pipeline(
pipeline_name="sql_database_pipeline_anonymized",
destination="duckdb",
dataset_name="sql_data",
)
_source = sql_database(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
table_names=["clan"],
)
_source.clan.add_map(pseudonymize_name)
_info = pipeline_3.run(_source)
print(_info)
return (pipeline_3,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""After the pipeline has run, we can observe that the author column has been anonymized."""
)
return
@app.cell
def _(pipeline_3):
with pipeline_3.sql_client() as _client:
with _client.execute_query(
"SELECT DISTINCT author FROM clan LIMIT 5"
) as _my_table:
print("Table clan:")
clan = _my_table.df()
clan
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""**Note:** If you're using the `pyarrow` or `connectorx` backend, the data is not processed item-by-item. Instead they're processed in batches, therefore your function should be adjusted. For example, for PyArrow chunks the function could be changed as follows:"""
)
return
@app.cell
def _(dlt, hashlib, sql_database):
import pyarrow as pa
import pyarrow.compute as pc
def pseudonymize_name_pyarrow(table: pa.Table) -> pa.Table:
"""
Pseudonymizes the 'author' column in a PyArrow Table.
"""
salt = "WI@N57%zZrmk#88c"
_df = table.to_pandas()
_df["author"] = (
_df["author"]
.astype(str)
.apply(lambda x: hashlib.sha256((x + salt).encode()).hexdigest())
)
new_table = pa.Table.from_pandas(_df)
return new_table
pipeline_4 = dlt.pipeline(
pipeline_name="sql_database_pipeline_anonymized1",
destination="duckdb",
dataset_name="sql_data",
)
_source = sql_database(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
table_names=["clan"],
backend="pyarrow",
)
_source.clan.add_map(pseudonymize_name_pyarrow)
_info = pipeline_4.run(_source)
print(_info)
return (pipeline_4,)
@app.cell
def _(pipeline_4):
with pipeline_4.sql_client() as _client:
with _client.execute_query(
"SELECT DISTINCT author FROM clan LIMIT 5"
) as _my_table:
print("Table clan:")
print(_my_table.df())
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### `add_map` vs `add_yield_map`
The difference between `add_map` and `add_yield_map` matters when a transformation returns multiple records from a single input.
#### **`add_map`**
- Use `add_map` when you want to transform each item into exactly one item.
- Think of it like modifying or enriching a row.
- You use a regular function that returns one modified item.
- Great for adding fields or changing structure.
#### Example
""")
return
@app.cell
def _(TDataItem, dlt):
from dlt.common.typing import TDataItems
@dlt.resource
def _resource() -> TDataItems:
yield [{"name": "Alice"}, {"name": "Bob"}]
def add_greeting(item: TDataItem) -> TDataItem:
item["greeting"] = f"Hello, {item['name']}!"
return item
_resource.add_map(add_greeting)
for _row in _resource():
print(_row)
return (TDataItems,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
#### **`add_yield_map`**
- Use `add_yield_map` when you want to turn one item into multiple items, or possibly no items.
- Your function is a generator that uses yield.
- Great for pivoting nested data, flattening lists, or filtering rows.
#### Example
""")
return
@app.cell
def _(TDataItem, TDataItems, dlt):
@dlt.resource
def _resource() -> TDataItems:
yield [
{"name": "Alice", "hobbies": ["reading", "chess"]},
{"name": "Bob", "hobbies": ["cycling"]},
]
def expand_hobbies(item: TDataItem) -> TDataItem:
for hobby in item["hobbies"]:
yield {"name": item["name"], "hobby": hobby}
_resource.add_yield_map(expand_hobbies)
for row in _resource():
print(row)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Using `add_filter`
The `add_filter` function can be used similarly. The difference is that `add_filter` expects a function that returns a boolean value for each item. For example, to implement the same filtering we did with the query adapter callback, we can use:
""")
return
@app.cell
def _(dlt, sql_database):
import time
_source = sql_database(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
table_names=["genome"],
)
pipeline_5 = dlt.pipeline(
pipeline_name="sql_database_pipeline_filtered",
destination="duckdb",
dataset_name="sql_data",
)
_source.genome.add_filter(lambda item: item["kingdom"] == "bacteria")
pipeline_5.run(_source, write_disposition="replace")
print(pipeline_5.last_trace)
return (pipeline_5,)
@app.cell
def _(pipeline_5):
with pipeline_5.sql_client() as _client:
with _client.execute_query(
"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM genome"
) as _my_table:
print("Table genome:")
genome_count = _my_table.df()
genome_count
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Question 1:
What is the `total_rows` value in the example above?
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Using `add_limit`
If your resource loads thousands of pages of data from a REST API or millions of rows from a database table, you may want to sample just a fragment of it in order to quickly see the dataset with example data and test your transformations, etc.
To do this, you limit how many items will be yielded by a resource (or source) by calling the `add_limit` method. This method will close the generator that produces the data after the limit is reached.
""")
return
@app.cell
def _(dlt, sql_database):
_source = sql_database(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
table_names=["genome"],
chunk_size=10,
)
pipeline_6 = dlt.pipeline(
pipeline_name="sql_database_pipeline_filtered",
destination="duckdb",
dataset_name="sql_data",
)
_source.genome.add_limit(1)
pipeline_6.run(_source, write_disposition="replace")
print(pipeline_6.last_trace)
return (pipeline_6,)
@app.cell
def _(pipeline_6):
with pipeline_6.sql_client() as _client:
with _client.execute_query("SELECT * FROM genome") as _my_table:
genome_limited = _my_table.df()
genome_limited
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **3. Transforming data with `@dlt.transformer`**
The main purpose of transformers is to create child tables with additional data requests, but they can also be used for data transformations, especially if you want to keep the original data as well.
""")
return
@app.cell
def _(TDataItem, TDataItems, dlt, sql_database):
@dlt.transformer()
def batch_stats(items: TDataItems) -> TDataItem:
"""
Compute simple statistics for each batch of genome rows:
the number of rows in the batch and the maximum total_length value.
"""
yield {
"batch_length": len(items),
"max_length": max([item["total_length"] for item in items]),
}
genome_resource = sql_database(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", chunk_size=10000
).genome
pipeline_7 = dlt.pipeline(
pipeline_name="sql_database_pipeline_with_transformers1",
destination="duckdb",
dataset_name="sql_data",
dev_mode=True,
)
pipeline_7.run([genome_resource, genome_resource | batch_stats])
print(pipeline_7.last_trace)
return (pipeline_7,)
@app.cell
def _(pipeline_7):
with pipeline_7.sql_client() as _client:
with _client.execute_query("SELECT * FROM batch_stats") as _my_table:
res = _my_table.df()
res
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **4. Transforming data after the load**
Another possibility is transforming the data after the load. dlt provides several ways of doing it:
* using `sql_client`,
* via `.dataset()` and the ibis integration,
* via the [dbt integration](https://dlthub.com/docs/dlt-ecosystem/transformations/dbt/) (a short sketch is included at the end of this section).
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### SQL client
You already saw examples of using dlt's SQL client. It lets you connect to your destination and execute any SQL query.
""")
return
@app.cell
def _(pipeline_7):
with pipeline_7.sql_client() as _client:
_client.execute_sql(
"CREATE OR REPLACE TABLE genome_length AS SELECT SUM(total_length) AS total_total_length, AVG(total_length) AS average_total_length FROM genome"
)
with _client.execute_query("SELECT * FROM genome_length") as _my_table:
genome_length = _my_table.df()
genome_length
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Accessing loaded data with `pipeline.dataset()`
Use `pipeline.dataset()` to inspect and work with your data in Python after loading.
""")
return
@app.cell
def _(pipeline_7):
dataset = pipeline_7.dataset()
# List tables
dataset.row_counts().df()
return (dataset,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Note that `row_counts` didn't return the new table `genome_length`,"""
)
return
@app.cell
def _(dataset):
# Access as pandas
_df = dataset["genome"].df()
_df
return
@app.cell
def _(dataset):
# Access as Arrow
arrow_table = dataset["genome_length"].arrow()
arrow_table
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""You can also filter, limit, and select columns:""")
return
@app.cell
def _(dataset):
_df = dataset["genome"].select("kingdom", "ncbi_id").limit(10).df()
_df
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""To iterate over large data:""")
return
@app.cell
def _(dataset):
for chunk in dataset["genome"].iter_df(chunk_size=500):
print(chunk.head())
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""For more advanced users, this interface supports **Ibis expressions**, joins, and subqueries."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Ibis integration
Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/).
[dlt provides a way to use Ibis expressions natively](https://dlthub.com/docs/general-usage/dataset-access/ibis-backend) with a lot of destinations. Supported ones are:
* Snowflake
* DuckDB
* MotherDuck
* Postgres
* Redshift
* Clickhouse
* MSSQL (including Synapse)
* BigQuery
""")
return
@app.cell
def _(pipeline_7):
# get the dataset from the pipeline
dataset_1 = pipeline_7.dataset()
dataset_name = pipeline_7.dataset_name
# get the native ibis connection from the dataset
ibis_connection = dataset_1.ibis()
# list all tables in the dataset
# NOTE: you need to provide the dataset name to ibis; in ibis, datasets are named databases
print(ibis_connection.list_tables(database=dataset_name))
# get the batch_stats table
table = ibis_connection.table("batch_stats", database=dataset_name)
# print the first 2 rows
print(table.limit(2).execute())  # type: ignore[attr-defined]
return
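@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ### dbt integration
    For completeness, here is a minimal sketch of the dbt route listed at the start of this section. It assumes `dlt[dbt]` is installed and that `path/to/dbt_project` points to an existing dbt project; neither is part of this notebook:

    ```python
    import dlt

    # reuse (or re-create) a pipeline that has already loaded data into the destination
    pipeline = dlt.pipeline(
        pipeline_name="sql_database_pipeline_with_transformers1",
        destination="duckdb",
        dataset_name="sql_data",
    )

    # wrap the dbt project so its models run against the pipeline's destination and dataset
    dbt = dlt.dbt.package(pipeline, "path/to/dbt_project")

    # run all models and print the outcome of each run
    for model in dbt.run_all():
        print(model)
    ```
    """)
    return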
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)!"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,7 +6,7 @@
"id": "_dbt9Ilnmktb"
},
"source": [
"# Merge and replace strategies & Advanced tricks [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)\n"
"# Merge and replace strategies & Advanced tricks [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)\n"
]
},
{
@@ -46,7 +46,7 @@
"\n",
"\n",
"\n",
"A `write_disposition` in `dlt` can specified in the resource decorator:\n",
"A `write_disposition` in `dlt` can be specified in the resource decorator:\n",
"\n",
"```python\n",
"@dlt.resource(write_disposition=\"append\")\n",
@@ -153,17 +153,17 @@
" - Append\n",
" - Replace\n",
" - Merge\n",
"- What incremental loading is.\n",
"- What incremental loading is\n",
"\n",
"**Now, we will cover** the different strategies for `merge` write disposition:\n",
"- `delete-insert` strategy.\n",
"- `upsert` strategy.\n",
"- `SCD2` strategy.\n",
"- `delete-insert` strategy\n",
"- `upsert` strategy\n",
"- `SCD2` strategy\n",
"\n",
"We also will take a look at\n",
"* Hard deletes.\n",
"* Falling back for incremental cursors.\n",
"* Backfills."
"We will also take a look at:\n",
"* Hard deletes\n",
"* Falling back for incremental cursors\n",
"* Backfills"
]
},
{
@@ -258,9 +258,7 @@
"]\n",
"\n",
"\n",
"dlt.secrets[\n",
" \"destination.replace_strategy\"\n",
"] = \"truncate-and-insert\" # <--- set the replace strategy using TOML, ENVs or Python\n",
"dlt.secrets[\"destination.replace_strategy\"] = \"truncate-and-insert\"\n",
"\n",
"pipeline = dlt.pipeline(\n",
" pipeline_name=\"pokemon_load_1\",\n",
@@ -268,7 +266,7 @@
" dataset_name=\"pokemon_data_1\",\n",
")\n",
"\n",
"load_info = pipeline.run(data, table_name=\"pokemon\", write_disposition=\"replace\")\n",
"pipeline.run(data, table_name=\"pokemon\", write_disposition=\"replace\")\n",
"print(pipeline.last_trace)"
]
},
@@ -350,9 +348,7 @@
"]\n",
"\n",
"\n",
"dlt.secrets[\n",
" \"destination.replace_strategy\"\n",
"] = \"insert-from-staging\" # <--- set the replace strategy using TOML, ENVs or Python\n",
"dlt.secrets[\"destination.replace_strategy\"] = \"insert-from-staging\"\n",
"\n",
"pipeline = dlt.pipeline(\n",
" pipeline_name=\"pokemon_load_2\",\n",
@@ -360,8 +356,7 @@
" dataset_name=\"pokemon_data_2\",\n",
")\n",
"\n",
"load_info = pipeline.run(data, table_name=\"pokemon\", write_disposition=\"replace\")\n",
"\n",
"pipeline.run(data, table_name=\"pokemon\", write_disposition=\"replace\")\n",
"print(pipeline.last_trace)"
]
},
@@ -391,7 +386,7 @@
"\n",
"In this example, the `insert-from-staging` strategy will load the pokemon data **into a staging table** in the `pokemon_data_2_staging` schema in DuckDB (or any other destination you choose). \n",
"\n",
"Let's check the content of this table:"
"Let's check the contents of this table:"
]
},
{
@@ -558,7 +553,7 @@
" write_disposition=\"merge\",\n",
" primary_key=\"id\",\n",
")\n",
"def pokemon() -> TDataItems:\n",
"def pokemon(data: TDataItems) -> TDataItems:\n",
" yield data\n",
"\n",
"\n",
@@ -568,7 +563,7 @@
" dataset_name=\"pokemon_data\",\n",
")\n",
"\n",
"load_info = pipeline.run(pokemon)\n",
"load_info = pipeline.run(pokemon(data))\n",
"print(load_info)\n",
"\n",
"# explore loaded data\n",
@@ -645,7 +640,7 @@
" \"id\": \"25\",\n",
" \"name\": \"pikachu\",\n",
" \"size\": {\"weight\": 7.5, \"height\": 0.4},\n",
" }, # <--- Pikachu's weight has increased\n",
" },\n",
"]"
]
},
@@ -666,7 +661,7 @@
},
"outputs": [],
"source": [
"load_info = pipeline.run(pokemon)\n",
"load_info = pipeline.run(pokemon(data))\n",
"print(load_info)\n",
"\n",
"# explore loaded data\n",
@@ -729,7 +724,7 @@
"id": "S06hBVpXgmqF"
},
"source": [
"We see that only new row is in the staging table. Since we used primary key, `dlt` deleted the previous entry of Pikachu and then inserted the new one."
"We see that only the new row is in the staging table. Since we used primary key, `dlt` deleted the previous entry of Pikachu and then inserted the new one."
]
},
{
@@ -892,12 +887,12 @@
"@dlt.resource(\n",
" name=\"pokemon\",\n",
" write_disposition={\n",
" \"disposition\": \"merge\", # <--- specifies that existing data should be merged\n",
" \"strategy\": \"scd2\", # <--- enables SCD2 tracking, which keeps historical records of changes\n",
" \"disposition\": \"merge\",\n",
" \"strategy\": \"scd2\",\n",
" },\n",
" primary_key=\"id\",\n",
")\n",
"def pokemon() -> TDataItems:\n",
"def pokemon(data: TDataItems) -> TDataItems:\n",
" yield data\n",
"\n",
"\n",
@@ -908,7 +903,7 @@
")\n",
"\n",
"\n",
"load_info = pipeline.run(pokemon)\n",
"load_info = pipeline.run(pokemon(data))\n",
"print(load_info)"
]
},
@@ -972,7 +967,7 @@
" \"id\": \"25\",\n",
" \"name\": \"pikachu\",\n",
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
" }, # <--- weight has changed back\n",
" },\n",
"]"
]
},
@@ -993,7 +988,7 @@
},
"outputs": [],
"source": [
"load_info = pipeline.run(pokemon)\n",
"load_info = pipeline.run(pokemon(data))\n",
"print(load_info)"
]
},
@@ -1075,19 +1070,19 @@
" \"name\": \"bulbasaur\",\n",
" \"size\": {\"weight\": 6.9, \"height\": 0.7},\n",
" \"deleted_flag\": True,\n",
" }, # <--- should be deleted\n",
" },\n",
" {\n",
" \"id\": \"4\",\n",
" \"name\": \"charmander\",\n",
" \"size\": {\"weight\": 8.5, \"height\": 0.6},\n",
" \"deleted_flag\": None,\n",
" }, # <--- should be kept\n",
" },\n",
" {\n",
" \"id\": \"25\",\n",
" \"name\": \"pikachu\",\n",
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
" \"deleted_flag\": False,\n",
" }, # <--- should be kept\n",
" },\n",
"]"
]
},
@@ -1106,9 +1101,9 @@
" name=\"pokemon\",\n",
" write_disposition=\"merge\",\n",
" primary_key=\"id\",\n",
" columns={\"deleted_flag\": {\"hard_delete\": True}}, # <--- set columns argument\n",
" columns={\"deleted_flag\": {\"hard_delete\": True}},\n",
")\n",
"def pokemon() -> TDataItems:\n",
"def pokemon(data: TDataItems) -> TDataItems:\n",
" yield data\n",
"\n",
"\n",
@@ -1119,7 +1114,7 @@
")\n",
"\n",
"\n",
"load_info = pipeline.run(pokemon)\n",
"load_info = pipeline.run(pokemon(data))\n",
"print(load_info)"
]
},
@@ -1160,7 +1155,7 @@
" \"name\": \"pikachu\",\n",
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
" \"deleted_flag\": True,\n",
" }, # <--- set to True\n",
" },\n",
"]"
]
},
@@ -1172,7 +1167,7 @@
},
"outputs": [],
"source": [
"load_info = pipeline.run(pokemon)\n",
"load_info = pipeline.run(pokemon(data))\n",
"print(load_info)"
]
},
@@ -1236,19 +1231,19 @@
" \"name\": \"pikachu\",\n",
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
" \"deleted_flag\": None,\n",
" }, # <--- will be filtered out\n",
" },\n",
" {\n",
" \"id\": \"25\",\n",
" \"name\": \"pikachu\",\n",
" \"size\": {\"weight\": 7, \"height\": 0.4},\n",
" \"deleted_flag\": True,\n",
" }, # <--- will be removed\n",
" },\n",
" {\n",
" \"id\": \"25\",\n",
" \"name\": \"pikachu\",\n",
" \"size\": {\"weight\": 8, \"height\": 0.4},\n",
" \"deleted_flag\": None,\n",
" }, # <--- will be loaded\n",
" },\n",
"]"
]
},
@@ -1279,9 +1274,9 @@
" columns={\n",
" \"deleted_flag\": {\"hard_delete\": True},\n",
" \"size__weight\": {\"dedup_sort\": \"desc\"},\n",
" }, # <-- desc means that the record with the highest value remains.\n",
" },\n",
")\n",
"def pokemon() -> TDataItems:\n",
"def pokemon(data: TDataItems) -> TDataItems:\n",
" yield data\n",
"\n",
"\n",
@@ -1292,7 +1287,7 @@
")\n",
"\n",
"\n",
"load_info = pipeline.run(pokemon)\n",
"load_info = pipeline.run(pokemon(data))\n",
"print(load_info)\n",
"\n",
"pipeline.dataset().pokemon.df()"
@@ -1381,7 +1376,7 @@
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
" \"created_at\": 3,\n",
" \"updated_at\": None,\n",
" }, # <--- Incremental cursor is None\n",
" },\n",
"]"
]
},
@@ -1396,12 +1391,13 @@
"import dlt\n",
"\n",
"\n",
"@dlt.resource\n",
"@dlt.resource(name=\"pokemon\")\n",
"def pokemon(\n",
" data: TDataItems,\n",
" updated_at: dlt.sources.incremental[int] = dlt.sources.incremental(\n",
" \"updated_at\", on_cursor_value_missing=\"include\"\n",
" )\n",
") -> TDataItems: # <--- we want to include all data rows even if cursor is missing\n",
" ),\n",
") -> TDataItems:\n",
" yield data\n",
"\n",
"\n",
@@ -1412,7 +1408,7 @@
")\n",
"\n",
"\n",
"load_info = pipeline.run(pokemon)\n",
"load_info = pipeline.run(pokemon(data))\n",
"print(load_info)\n",
"\n",
"pipeline.dataset().pokemon.df()"
@@ -1474,7 +1470,7 @@
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
" \"created_at\": 3,\n",
" \"updated_at\": None,\n",
" }, # <--- Incremental cursor is None\n",
" },\n",
"]"
]
},
@@ -1488,6 +1484,7 @@
"source": [
"@dlt.resource\n",
"def some_data(\n",
" data: TDataItems,\n",
" updated_at: dlt.sources.incremental[int] = dlt.sources.incremental(\"updated_at\"),\n",
") -> TDataItems:\n",
" yield data\n",
@@ -1495,9 +1492,7 @@
"\n",
"def set_default_updated_at(record: TDataItem) -> TDataItems:\n",
" if record.get(\"updated_at\") is None:\n",
" record[\"updated_at\"] = record.get(\n",
" \"created_at\"\n",
" ) # <--- use 'created_at' instead of missing 'updated_at'\n",
" record[\"updated_at\"] = record.get(\"created_at\")\n",
" return record"
]
},
@@ -1510,7 +1505,7 @@
"outputs": [],
"source": [
"# Modifies records before the incremental processing\n",
"with_default_values = some_data().add_map(set_default_updated_at, insert_at=1)"
"with_default_values = some_data(data).add_map(set_default_updated_at, insert_at=1)"
]
},
{
@@ -1542,7 +1537,7 @@
"outputs": [],
"source": [
"# Removes records before the incremental processing\n",
"without_none = some_data().add_filter(\n",
"without_none = some_data(data).add_filter(\n",
" lambda r: r.get(\"updated_at\") is not None, insert_at=1\n",
")"
]
@@ -1641,9 +1636,10 @@
"\n",
"@dlt.resource\n",
"def some_data(\n",
" data: TDataItems,\n",
" updated_at: dlt.sources.incremental[int] = dlt.sources.incremental(\n",
" \"created_at\", initial_value=0, end_value=2\n",
" )\n",
" ),\n",
") -> TDataItems:\n",
" yield data"
]
@@ -1662,7 +1658,7 @@
" dataset_name=\"pokemon_inc_wd\",\n",
")\n",
"\n",
"load_info = pipeline.run(some_data, table_name=\"pokemon\")\n",
"load_info = pipeline.run(some_data(data), table_name=\"pokemon\")\n",
"print(load_info)\n",
"\n",
"pipeline.dataset().pokemon.df()"
@@ -1752,7 +1748,7 @@
"continue_load_flag = True\n",
"\n",
"while continue_load_flag:\n",
" load_info = pipeline.run(source.genome.add_limit(10))\n",
" pipeline.run(source.genome.add_limit(10))\n",
" continue_load_flag = (\n",
" my_table_name in pipeline.last_trace.last_normalize_info.row_counts.keys()\n",
" )\n",
@@ -1772,17 +1768,8 @@
"id": "AH3F46PaJZe4"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1mC09rjkheo92-ycjjq0AlIzgwJC8-ZMX#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "K4smMmlfMysW"
},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -6,7 +6,7 @@
"id": "Wat0fkM3BHwn"
},
"source": [
"# **Introduction** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)\n",
"# **Introduction** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)\n",
"\n",
"`dlt` offers powerful tools for schema configuration, giving you control over your data processing. You can export and import schemas for easy adjustments and apply specific settings directly to resources for precise data normalization. Plus, you can set data contracts to ensure your data meets your expectations... 👀\n"
]
@@ -35,7 +35,7 @@
"source": [
"When you run a pipeline, `dlt` internally generates a `<>.schema.json` file. You can export this file to a specific location in YAML format by specifying `export_schema_path=\"schemas/export\"` in your pipeline.\n",
"\n",
"See [dlt Fundamentals: Lesson 7](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r#forceEdit=true&sandboxMode=true)\n"
"See [dlt Fundamentals: Lesson 7](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)\n"
]
},
{
@@ -167,18 +167,9 @@
"\n",
"Data contracts are rules that help control how your data schema changes over time. They are particularly useful for maintaining the integrity and consistency of your data as it evolves.\n",
"\n",
"`dlt` allows you to implement these data contracts at various levels, including the [table level](#scrollTo=zzVNMHgqNEYr), [column level](#scrollTo=Bq_9SNOMQGk_), and [data type level](#scrollTo=H9eMPvlOQHrJ). This provides granular control over how different parts of your schema evolve.\n",
"`dlt` allows you to implement these data contracts at various levels, including the `table level`, `column level`, and `data type level`. This provides granular control over how different parts of your schema evolve.\n",
"\n",
"> **Note**: This Colab is based on `dlt`'s [schema contracts doc page](https://dlthub.com/docs/general-usage/schema-contracts) and includes additional code examples. It's still a good idea to check out the doc page for all the details."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "g2XDHclpusOU"
},
"source": [
"To get started with data contracts, first install `dlt`:"
"> **Note**: This Colab (or Molab) is based on `dlt`'s [schema contracts doc page](https://dlthub.com/docs/general-usage/schema-contracts) and includes additional code examples. It's still a good idea to check out the doc page for all the details."
]
},
{
@@ -190,8 +181,6 @@
"outputs": [],
"source": [
"%%capture\n",
"\n",
"# Install dlt\n",
"!pip install dlt[duckdb]"
]
},
@@ -468,13 +457,13 @@
"load_info = column_pipeline.run(\n",
" discard_row(\n",
" [\n",
" {\"id\": 3, \"name\": \"Sam\", \"age\": 30}, # This row will be loaded\n",
" {\"id\": 3, \"name\": \"Sam\", \"age\": 30},\n",
" {\n",
" \"id\": 4,\n",
" \"name\": \"Kate\",\n",
" \"age\": 79,\n",
" \"phone\": \"123-456-7890\",\n",
" }, # This row will not be loaded\n",
" },\n",
" ]\n",
" ),\n",
" table_name=\"users\",\n",
@@ -711,8 +700,8 @@
"load_info = data_type_pipeline.run(\n",
" discard_row(\n",
" [\n",
" {\"id\": 3, \"name\": \"Sam\", \"age\": \"35\"}, # This row will be loaded\n",
" {\"id\": 4, \"name\": \"Kate\", \"age\": \"seventy\"}, # This row will not be loaded\n",
" {\"id\": 3, \"name\": \"Sam\", \"age\": \"35\"},\n",
" {\"id\": 4, \"name\": \"Kate\", \"age\": \"seventy\"},\n",
" ]\n",
" ),\n",
" table_name=\"users\",\n",
@@ -940,17 +929,8 @@
"id": "AH3F46PaJZe4"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1YCjHWMyOO9QGC66t1a5bIxL-ZUeVKViR#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6_6WprxWXhXi"
},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,780 @@
# /// script
# dependencies = [
# "dlt[duckdb]",
# "numpy",
# "pandas",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **Introduction** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)
`dlt` offers powerful tools for schema configuration, giving you control over your data processing. You can export and import schemas for easy adjustments and apply specific settings directly to resources for precise data normalization. Plus, you can set data contracts to ensure your data meets your expectations... 👀
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_7_Data_Contracts_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_7_Data_Contracts_img1.webp)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""# [Refresher] **Understanding schema**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
When you run a pipeline, `dlt` internally generates a `<>.schema.json` file. You can export this file to a specific location in YAML format by specifying `export_schema_path="schemas/export"` in your pipeline.
See [dlt Fundamentals: Lesson 7](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)
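For example, a minimal sketch (the `schemas/export` and `schemas/import` folder names are placeholders you can change):
```python
import dlt

pipeline = dlt.pipeline(
    pipeline_name="quick_start",
    destination="duckdb",
    dataset_name="mydata",
    export_schema_path="schemas/export",  # the schema is written here as YAML after each run
    import_schema_path="schemas/import",  # optional: an edited schema is picked up from here
)
```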
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
This YAML file will look something like:
```yaml
version: 2 # version of the schema
version_hash: xmTG0tOmE40LvzY2DbPBOnRaNNK8YlLpVP1PMO0YgyE= # hash of the actual schema content
engine_version: 9 # schema engine version of dlt
name: quick_start
tables:
_dlt_version:
...
_dlt_loads:
...
_dlt_pipeline_state:
...
issues:
columns:
url:
data_type: text
nullable: true
repository_url:
data_type: text
nullable: true
labels_url:
data_type: text
nullable: true
...
write_disposition: append
resource: get_issues
x-normalizer:
seen-data: true
issues__assignees:
columns:
...
parent: issues
settings:
detections:
- iso_timestamp
default_hints:
not_null:
- _dlt_id
- _dlt_root_id
- _dlt_parent_id
- _dlt_list_idx
- _dlt_load_id
foreign_key:
- _dlt_parent_id
root_key:
- _dlt_root_id
unique:
- _dlt_id
normalizers:
names: snake_case # naming convention
json:
module: dlt.common.normalizers.json.relational
previous_hashes:
- O4M6U4KA32Xz4Vrdcqo4XPBPFVcK1FZbgRu5qcMfjn4=
- 0DQRnVWANYV21yD0T5nsoUtdTeq0/jIOYMUxpPE6Fcc=
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## **Tables and columns**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
A `table schema` may have the following properties:
- `name`
- `description`
- `parent`: The name of the parent table if this is a child table.
- `columns`: A list of column schemas defining the table's structure.
- `write_disposition`: A hint telling `dlt` how new data coming into the table should be loaded.
A `column schema` may have the following properties:
- `name`
- `description`
- `data_type`
- `precision`: Defines the precision for text, timestamp, time, bigint, binary, and decimal types.
- `scale`: Defines the scale for the decimal type.
- `is_variant`: Indicates that the column was generated as a variant of another column.
A `column schema` may have the following basic hints:
- `nullable`
- `primary_key`
- `merge_key`: Marks the column as part of the merge key used for incremental loads.
- `foreign_key`
- `root_key`: Marks the column as part of a root key, a type of foreign key that always refers to the root table.
- `unique`
A `column schema` may have the following performance hints:
- `partition`: Marks the column to be used for partitioning data.
- `cluster`: Marks the column to be used for clustering data.
- `sort`: Marks the column as sortable or ordered; on some destinations, this may generate an index, even if the column is not unique.
> Each destination can interpret these performance hints in its own way. For example, the `cluster` hint is used by Redshift to define table distribution, by BigQuery to specify a cluster column, and is ignored by DuckDB and Postgres when creating tables.
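As a minimal sketch (the `events` resource and its columns below are made-up examples), these properties can be attached as hints directly on a resource:
```python
import dlt


@dlt.resource(
    table_name="events",
    write_disposition="append",
    columns={
        "id": {"data_type": "bigint", "nullable": False, "primary_key": True},
        "created_at": {"data_type": "timestamp", "partition": True},
    },
)
def events():
    yield {"id": 1, "created_at": "2024-01-01T00:00:00Z"}
```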
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **Data contracts**
Data contracts are rules that help control how your data schema changes over time. They are particularly useful for maintaining the integrity and consistency of your data as it evolves.
`dlt` allows you to implement these data contracts at various levels, including the `table level`, `column level`, and `data type level`. This provides granular control over how different parts of your schema evolve.
> **Note**: This Colab (or Molab) is based on `dlt`'s [schema contracts doc page](https://dlthub.com/docs/general-usage/schema-contracts) and includes additional code examples. It's still a good idea to check out the doc page for all the details.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
###**Table level**
On the table level, you can specify `evolve` or `freeze` as part of the schema contract.
- `evolve`: Allows the creation of new tables within the schema.
- `freeze`: Prevents any changes to the schema, ensuring no new tables can be added.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Before diving into the modes above, let's load some sample data into a DuckDB database.
> You'll find the database stored in the `Files` section on the left sidebar.
""")
return
@app.cell
def _():
import dlt
# Sample data to be loaded
data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
# Create a dlt pipeline
table_pipeline = dlt.pipeline(
pipeline_name="data_contracts_table_level",
destination="duckdb",
dataset_name="mydata",
)
# Load the data to the "users" table
_load_info = table_pipeline.run(data, table_name="users")
print(_load_info)
# Print the row counts for each table that was loaded in the last run of the pipeline
print(
"\nNumber of new rows loaded into each table: ",
table_pipeline.last_trace.last_normalize_info.row_counts,
)
return data, dlt, table_pipeline
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Now, try out the `evolve` mode at the table level by loading the same sample data into the same database, but this time into a new table called `new_users`."""
)
return
@app.cell
def _(data, dlt, table_pipeline):
from dlt.common.typing import TDataItems
# Define a dlt resource that allows the creation of new tables
@dlt.resource(schema_contract={"tables": "evolve"})
def allow_new_tables(input_data: TDataItems) -> TDataItems:
yield input_data
# Run the pipeline again with the above dlt resource to load the same data into a new table "new_users"
_load_info = table_pipeline.run(allow_new_tables(data), table_name="new_users")
print(_load_info)
# Print the row counts for each table that was loaded in the last run of the pipeline
print(
"\nNumber of new rows loaded into each table: ",
table_pipeline.last_trace.last_normalize_info.row_counts,
)
return (TDataItems,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""The `freeze` mode at the table level, as mentioned earlier, won't allow any changes to the schema, so the pipeline run below that tries to create another table with the name `newest_users` will fail 👇"""
)
return
@app.cell
def _(TDataItems, data, dlt, table_pipeline):
# Define a dlt resource that prevents any changes to the schema at the table level (no new tables can be added)
@dlt.resource(schema_contract={"tables": "freeze"})
def no_new_tables(input_data: TDataItems) -> TDataItems:
yield input_data
_load_info = table_pipeline.run(no_new_tables(data), table_name="newest_users")
# Now, run the pipeline with the resource above, attempting to load the same data into "newest_users".
# This will fail, as new tables can't be added.
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
###**Column level**
At the column level, you can specify:
- `evolve`: Allows for the addition of new columns or changes in the existing ones.
- `freeze`: Prevents any changes to the existing columns.
- `discard_row`: Skips rows that have new columns but loads those that follow the existing schema.
- `discard_value`: Doesn't skip entire rows. Instead, it only skips the values of new columns, loading the rest of the row data.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Just like we did in the previous section, let's first load some sample data into a new database using a new pipeline.
> After you run the following code snippet, a new `data_contracts_column_level.duckdb` file should appear in `Files`.
""")
return
@app.cell
def _(dlt):
column_pipeline = dlt.pipeline(
pipeline_name="data_contracts_column_level",
destination="duckdb",
dataset_name="mydata",
)
_load_info = column_pipeline.run([{"id": 1, "name": "Alice"}], table_name="users")
print(_load_info)
return (column_pipeline,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""View the loaded data using `dlt`'s `sql_client()`.""")
return
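@app.cell
def _(column_pipeline):
    # A minimal sketch (assuming the dataset and table created above): read the rows
    # back through the pipeline's sql_client() and print them as a DataFrame.
    with column_pipeline.sql_client() as _client:
        with _client.execute_query("SELECT * FROM mydata.users") as _cursor:
            print(_cursor.df())
    return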
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Alternatively, you can simply use the DuckDB client.""")
return
@app.cell
def _(column_pipeline):
import duckdb
_conn = duckdb.connect(f"{column_pipeline.pipeline_name}.duckdb")
_conn.sql("SELECT * FROM mydata.users").df()
return (duckdb,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Assume that Alice ☝️ is the first user at your imaginary company, and you have now decided to collect users' ages as well.
When you load the information for your second user, Bob, who also provided his age 👇, the schema contract at the column level set to `evolve` will allow `dlt` to automatically adjust the schema in the destination database by adding a new column for "age".
""")
return
@app.cell
def _(TDataItems, column_pipeline, dlt, duckdb):
# Define dlt resource that allows new columns in the data
@dlt.resource(schema_contract={"columns": "evolve"})
def allow_new_columns(input_data: TDataItems) -> TDataItems:
yield input_data
# Now, load a new row into the same table, "users", which includes an additional column "age"
_load_info = column_pipeline.run(
allow_new_columns([{"id": 2, "name": "Bob", "age": 35}]), table_name="users"
)
print(_load_info)
print("\n")
_conn = duckdb.connect(f"{column_pipeline.pipeline_name}.duckdb")
# View the data that has been loaded
_conn.sql("SELECT * FROM mydata.users").df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Now, imagine your business partner, with whom you started the company, began requiring phone numbers from users. However, you weren't informed of this requirement and want to first load the data of users who provided their info before this change, i.e., users who did NOT provide their phone numbers.
In this case, you would use the `discard_row` mode - which will only load Sam's data 👇 because he didn't provide a phone number, and therefore his data complies with the schema.
""")
return
@app.cell
def _(TDataItems, column_pipeline, dlt, duckdb):
# Define a dlt resource that skips rows that have new columns but loads those that follow the existing schema
@dlt.resource(schema_contract={"columns": "discard_row"})
def _discard_row(input_data: TDataItems) -> TDataItems:
yield input_data
# Attempt to load two additional rows. Only the row that follows the existing schema will be loaded
_load_info = column_pipeline.run(
_discard_row(
[
{"id": 3, "name": "Sam", "age": 30},
{"id": 4, "name": "Kate", "age": 79, "phone": "123-456-7890"},
]
),
table_name="users",
)
print(_load_info)
print("\n")
_conn = duckdb.connect(f"{column_pipeline.pipeline_name}.duckdb")
# View the data that has been loaded
_conn.sql("SELECT * FROM mydata.users").df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
For some unknown reason, you've suddenly decided that phone numbers are irrelevant altogether. From now on, you want to load all new data, but without the "phone" column.
To achieve this, you can use the `discard_value` mode - which will load both Sarah's and Violetta's data 👇, regardless of whether either of them provided a phone number. However, the phone number column itself will be discarded.
""")
return
@app.cell
def _(TDataItems, column_pipeline, dlt, duckdb):
# Define a dlt resource that only skips the values of new columns, loading the rest of the row data
@dlt.resource(schema_contract={"columns": "discard_value"})
def _discard_value(input_data: TDataItems) -> TDataItems:
yield input_data
# Load two additional rows. Since we're using the "discard_value" resource, both rows will be added
# However, the "phone" column in the second row will be ignored and not loaded
_load_info = column_pipeline.run(
_discard_value(
[
{"id": 5, "name": "Sarah", "age": "23"},
{"id": 6, "name": "Violetta", "age": "22", "phone": "666-513-4510"},
]
),
table_name="users",
)
print(_load_info)
print("\n")
_conn = duckdb.connect(f"{column_pipeline.pipeline_name}.duckdb")
# View the data that has been loaded
_conn.sql("SELECT * FROM mydata.users").df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Eventually you decide that users' id, name and age are the only things you need for your obscure business...
So, you set the mode to `freeze`, forbidding any changes to the table schema. The attempt to violate the schema contract, as shown below 👇, will fail.
""")
return
@app.cell
def _(TDataItems, column_pipeline, dlt):
# Define a dlt resource that does not allow new columns in the data
@dlt.resource(schema_contract={"columns": "freeze"})
def no_new_columns(input_data: TDataItems) -> TDataItems:
yield input_data
_load_info = column_pipeline.run(
no_new_columns([{"id": 7, "name": "Lisa", "age": 40, "phone": "098-765-4321"}]),
table_name="users",
)
# Attempt to load a row with additional columns when the column contract is set to freeze
# This will fail as no new columns are allowed.
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Data type level**
At this level, you can choose:
- `evolve`: Allows any data type. This may result in variant columns upstream.
- `freeze`: Prevents any changes to the existing data types.
- `discard_row`: Omits rows with unverifiable data types.
- `discard_value`: Replaces unverifiable values with None, but retains the rest of the row data.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
(*No imaginary situations in this section for the sake of variety and ease* ... 👀)
Load a sample row entry into a new database using a new pipeline.
""")
return
@app.cell
def _(dlt, duckdb):
data_type_pipeline = dlt.pipeline(
pipeline_name="data_contracts_data_type",
destination="duckdb",
dataset_name="mydata",
)
_load_info = data_type_pipeline.run(
[{"id": 1, "name": "Alice", "age": 24}], table_name="users"
)
print(_load_info)
print("\n")
_conn = duckdb.connect(f"{data_type_pipeline.pipeline_name}.duckdb")
_conn.sql("SELECT * FROM mydata.users").df()
return (data_type_pipeline,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Before trying out the `evolve` mode at the data type level 👇, take a moment to understand how variant columns mentioned earlier are created:
- **TLDR:** `dlt` creates a new column when the data type of a field in the incoming data can't be validated against the existing data type in the destination table.
- These variant columns will be named following the pattern `<original name>__v_<type>`, where `original_name` is the existing column name (with the data type clash) and `type` is the name of the new data type stored in the variant column.
In the example below, even though Bob's age is passed as a string, it can be validated as an integer, so it won't cause any problems.
""")
return
@app.cell
def _(TDataItems, data_type_pipeline, dlt, duckdb):
# Define dlt resource that accepts all data types
@dlt.resource(schema_contract={"data_type": "evolve"})
def allow_any_data_type(input_data: TDataItems) -> TDataItems:
yield input_data
# Now, load a new row where the "age" column is passed as a string but will be validated and stored as an integer
_load_info = data_type_pipeline.run(
allow_any_data_type([{"id": 2, "name": "Bob", "age": "35"}]), table_name="users"
)
print(_load_info)
print("\n")
_conn = duckdb.connect(f"{data_type_pipeline.pipeline_name}.duckdb")
# If you pass the age as "thirty-five", a new variant column will be added
# Note: Running the uncommented code below may affect subsequent steps, so proceed with caution
# load_info = data_type_pipeline.run(allow_any_data_type([{"id": 2, "name": "Bob", "age": "thirty-five"}]), table_name="users")
# print(load_info)
# View the data that has been loaded
_conn.sql("SELECT * FROM mydata.users").df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""But if we ran the commented-out pipeline, this would be the outcome with an additional variant column:"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_7_Data_Contracts_img2](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_7_Data_Contracts_img2.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""The `discard_row` mode at the data type level functions similarly to how it does at the column level. The only difference is that it discards rows with diverging data types instead of columns. As a result, you will see that Kate's data will not be loaded 👇."""
)
return
@app.cell
def _(TDataItems, data_type_pipeline, dlt, duckdb):
# Define dlt resource that omits rows with unverifiable data types
@dlt.resource(schema_contract={"data_type": "discard_row"})
def _discard_row(input_data: TDataItems) -> TDataItems:
yield input_data
# Attempt to load two additional rows. Only the row where all column types can be validated will be loaded
_load_info = data_type_pipeline.run(
_discard_row(
[
{"id": 3, "name": "Sam", "age": "35"},
{"id": 4, "name": "Kate", "age": "seventy"},
]
),
table_name="users",
)
print(_load_info)
print("\n")
_conn = duckdb.connect(f"{data_type_pipeline.pipeline_name}.duckdb")
# View the data that has been loaded
_conn.sql("SELECT * FROM mydata.users").df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""The same goes for the `discard_value` mode. However, note that when applied at the data type level, it will replace non-validating row items with `None`. So, in this example, Violetta's age will be set to `None` 👇."""
)
return
@app.cell
def _(TDataItems, data_type_pipeline, dlt, duckdb):
# Define a dlt resource that replaces unverifiable values with None, but retains the rest of the row data
@dlt.resource(schema_contract={"data_type": "discard_value"})
def _discard_value(input_data: TDataItems) -> TDataItems:
yield input_data
# Load two additional rows. Since we're using the "discard_value" resource, both rows will be added
# However, the "age" value "twenty-eight" in the second row will be ignored and not loaded
_load_info = data_type_pipeline.run(
_discard_value(
[
{"id": 5, "name": "Sarah", "age": 23},
{"id": 6, "name": "Violetta", "age": "twenty-eight"},
]
),
table_name="users",
)
print(_load_info)
print("\n")
_conn = duckdb.connect(f"{data_type_pipeline.pipeline_name}.duckdb")
# View the data that has been loaded
_conn.sql("SELECT * FROM mydata.users").df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""The `freeze` mode prohibits any changes to the data types of existing columns and will result in an error if there is a "breach in contract". The example below will fail."""
)
return
@app.cell
def _(TDataItems, data_type_pipeline, dlt):
# Define dlt resource that prevents any changes to the existing data types
@dlt.resource(schema_contract={"data_type": "freeze"})
def no_data_type_changes(input_data: TDataItems) -> TDataItems:
yield input_data
_load_info = data_type_pipeline.run(
no_data_type_changes([{"id": 7, "name": "Lisa", "age": "forty"}]),
table_name="users",
)
# Attempt to load a row with a column value that can't be validated, in this case "forty"
# This will fail as no data type changes are allowed with the "no_data_type_changes" resource
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""# **Pydantic Models**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Pydantic models can also be used to [define table schemas and validate incoming data](https://dlthub.com/docs/general-usage/resource#define-a-schema-with-pydantic).
They can be passed directly to the "columns" argument of a `dlt` resource:
```python
from typing import List, Optional, Union
from pydantic import BaseModel
import dlt

class Address(BaseModel):  # a nested model referenced by User (illustrative)
    street: str
    city: str

class User(BaseModel):
    id: int
    name: str
    tags: List[str]
    email: Optional[str]
    address: Address
    status: Union[int, str]

@dlt.resource(name="user", columns=User)
def get_users():
    ...
```
This will set the schema contract to align with the default Pydantic behavior:
```python
{
"tables": "evolve",
"columns": "discard_value",
"data_type": "freeze"
}
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
If you happen to pass a `schema_contract` explicitly along with the `columns` argument to a `dlt` resource, the following happens:
- `tables`: The contract will not impact the Pydantic model and will be applied when a new table is created.
- `columns`: The modes for columns are mapped into the `extra` modes of Pydantic. If your models contain other models, `dlt` will apply this setting recursively. The contract for columns is applied when a new column is created on an existing table.
<center>
| Column Mode | Pydantic Extra |
|-----------------|----------------|
| evolve | allow |
| freeze | forbid |
| discard_value | ignore |
| discard_row | forbid |
</center>
- `data_type`: This supports the following modes for Pydantic:
1. `evolve` will synthesize a lenient model that allows for any data type. It may result in variant columns upstream.
2. `freeze` will re-raise a ValidationException.
3. `discard_row` will remove the non-validating data items.
4. `discard_value` is not currently supported.
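For instance, a minimal sketch that reuses the `User` model defined above together with an explicit contract:
```python
@dlt.resource(
    name="user",
    columns=User,
    schema_contract={"tables": "evolve", "columns": "freeze", "data_type": "discard_row"},
)
def get_users():
    ...
```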
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""# **Good to Know**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
- Unless you specify a schema contract, settings will default to `evolve` on all levels.
- The `schema_contract` argument accepts two forms:
1. Full form: A detailed mapping of schema entities to their respective contract modes.
```python
schema_contract={"tables": "freeze", "columns": "freeze", "data_type": "freeze"}
```
2. Shorthand form: A single contract mode that will be uniformly applied to all schema entities.
```python
schema_contract="freeze"
```
- Schema contracts can be defined for:
1. `dlt` resources: The contract applies to the corresponding table and any child tables.
```python
@dlt.resource(schema_contract={"columns": "evolve"})
def items():
...
```
2. `dlt` sources: The contract serves as a default for all resources within that source.
```python
@dlt.source(schema_contract="freeze")
def source():
...
```
3. The `pipeline.run()`: This contract overrides any existing schema contracts.
```python
pipeline.run(source(), schema_contract="freeze")
```
- You can change the contract on a `dlt` source via its `schema_contract` property.
```python
source = dlt.source(...)
source.schema_contract = {"tables": "evolve", "columns": "freeze", "data_type": "discard_row"}
```
- To update the contract for `dlt` resources, use `apply_hints`.
```python
resource.apply_hints(schema_contract={"tables": "evolve", "columns": "freeze"})
```
- For the `discard_row` method at the table level, if there are two tables in a parent-child relationship, such as `users` and `users__addresses`, and the contract is violated in the child table, the row in the child table (`users__addresses`) will be discarded, while the corresponding parent row in the `users` table will still be loaded.
- If a table is a `new table` that hasn't been created on the destination yet, `dlt` will allow the creation of new columns. During the first pipeline run, the column mode is temporarily changed to `evolve` and then reverted back to the original mode. The following tables are considered new:
1. Child tables inferred from the nested data.
2. Dynamic tables created from the data during extraction.
3. Tables containing incomplete columns - columns without a data type bound to them.
> Note that tables with columns defined with Pydantic models are not considered new.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)!"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,7 +6,7 @@
"id": "y0sqFhxJnH5r"
},
"source": [
"# **Introduction** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)"
"# **Introduction** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)"
]
},
{
@@ -49,6 +49,7 @@
},
"outputs": [],
"source": [
"import os\n",
"from typing import Iterable, Union\n",
"import dlt\n",
"from dlt.sources.helpers import requests\n",
@@ -58,10 +59,9 @@
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
"\n",
"import os\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"ACCESS_TOKEN\")\n",
"dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"ACCESS_TOKEN\")\n",
"\n",
"\n",
"@dlt.source\n",
@@ -162,10 +162,7 @@
"\n",
"## What is `Sentry` 🤔\n",
"\n",
"`Sentry` is an open-source error tracking and performance monitoring tool that helps developers **identify**, **monitor**, and **fix issues** in real-time in their applications.\n",
"\n",
"\n",
"Remember, `dlt` does not have the `Sentry` client as a dependency. You need to install it."
"`Sentry` is an open-source error tracking and performance monitoring tool that helps developers **identify**, **monitor**, and **fix issues** in real-time in their applications."
]
},
{
@@ -297,10 +294,9 @@
},
"outputs": [],
"source": [
"import os\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"RUNTIME__SENTRY_DSN\"] = userdata.get(\"SENTRY_TOKEN\")"
"dlt.config[\"RUNTIME__SENTRY_DSN\"] = userdata.get(\"SENTRY_TOKEN\")"
]
},
{
@@ -416,9 +412,9 @@
},
"outputs": [],
"source": [
"import os\n",
"import dlt\n",
"\n",
"os.environ[\"RUNTIME__LOG_LEVEL\"] = \"INFO\""
"dlt.config[\"RUNTIME__LOG_LEVEL\"] = \"INFO\""
]
},
{
@@ -470,7 +466,7 @@
" dataset_name=\"github_data_merge\",\n",
")\n",
"load_info = pipeline.run(github_source())\n",
"\n",
"print(load_info)\n",
"# result gets showed despite no print statement ? check dlt.log"
]
},
@@ -512,9 +508,9 @@
},
"outputs": [],
"source": [
"import os\n",
"import dlt\n",
"\n",
"os.environ[\"RUNTIME__LOG_LEVEL\"] = \"INFO\""
"dlt.config[\"RUNTIME__LOG_LEVEL\"] = \"INFO\""
]
},
{
@@ -576,7 +572,8 @@
" destination=\"duckdb\",\n",
" dataset_name=\"github_data_merge\",\n",
")\n",
"load_info = pipeline.run(github_source())"
"load_info = pipeline.run(github_source())\n",
"print(load_info)"
]
},
{
@@ -596,9 +593,9 @@
},
"outputs": [],
"source": [
"import os\n",
"import dlt\n",
"\n",
"os.environ[\"RUNTIME__LOG_LEVEL\"] = \"WARNING\"\n",
"dlt.config[\"RUNTIME__LOG_LEVEL\"] = \"WARNING\"\n",
"\n",
"\n",
"pipeline = dlt.pipeline(\n",
@@ -607,7 +604,8 @@
" dataset_name=\"github_data_merge\",\n",
" progress=\"log\",\n",
")\n",
"load_info = pipeline.run(github_source())"
"load_info = pipeline.run(github_source())\n",
"print(load_info)"
]
},
{
@@ -616,17 +614,8 @@
"id": "AH3F46PaJZe4"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/11P5O2R40ExtFtPfX4o1O5mF7nFbibtuZ#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "maZdAnM0bjiv"
},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,470 @@
# /// script
# dependencies = [
# "dlt",
# "loguru",
# "numpy",
# "pandas",
# "sentry-sdk",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""# **Introduction** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
In this notebook, we focus more on pipeline metadata, and how to use that to be able to trace and debug our pipelines.
First, we create the pipeline we'll inspect throughout this notebook.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## Create the pipeline we will inspect""")
return
@app.cell
def _():
import os
from typing import Iterable, Union
import dlt
from dlt.sources.helpers import requests
from dlt.extract import DltResource
from dlt.common.typing import TDataItems
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator
dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("ACCESS_TOKEN")
@dlt.source
def github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]:
client = RESTClient(
base_url="https://api.github.com",
auth=BearerTokenAuth(token=secret_key),
paginator=HeaderLinkPaginator(),
)
@dlt.resource
def github_pulls(
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
"updated_at", initial_value="2024-12-01"
)
) -> TDataItems:
params = {"since": cursor_date.last_value, "status": "open"}
for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params):
yield page
return github_pulls
# define new dlt pipeline
pipeline = dlt.pipeline(
pipeline_name="github_pipeline",
destination="duckdb",
dataset_name="github_data",
)
# run the pipeline with the new resource
_load_info = pipeline.run(github_source())
print(_load_info)
return Union, dlt, github_source, os, pipeline
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## Look at the data""")
return
@app.cell
def _(pipeline):
import duckdb
conn = duckdb.connect(f"{pipeline.pipeline_name}.duckdb")
conn.sql("SHOW ALL TABLES").df()
return (conn,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""More importantly, let's look at the saved load info""")
return
@app.cell
def _(conn):
conn.sql("select * from github_data._dlt_loads").df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""# **Tracing with Sentry**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
You can enable tracing through Sentry.
## What is `Sentry` 🤔
`Sentry` is an open-source error tracking and performance monitoring tool that helps developers **identify**, **monitor**, and **fix issues** in real-time in their applications.
""")
return
@app.cell
def _():
import sentry_sdk
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Sentry needs to be initialized in normal scripts
```
import sentry_sdk
import os
sentry_sdk.init(
dsn=os.getenv("RUNTIME__SENTRY_DSN"),
traces_sample_rate=1.0 # Adjust this for performance monitoring if needed
)
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Say, you make an error and it is caught with Sentry:
```
try:
1 / 0
except ZeroDivisionError as e:
sentry_sdk.capture_exception(e)
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""It will then show up on your Sentry dashboard:""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_8_Logging_%26_Tracing_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_8_Logging_%26_Tracing_img1.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Even when a normal error arises after Sentry has been initiated, your program executes normally, but sends that error to your dashboard, so it can be tracked!"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### In dlt, you can enable Sentry quite easily
You can configure the `DSN` in the `config.toml`:
```
[runtime]
sentry_dsn="https:///<...>"
```
Alternatively, you can use environment variables or set the value programmatically via `dlt.config`. **This is what we'll be doing**:
```
RUNTIME__SENTRY_DSN="https:///<...>"
```
The Sentry client is configured after the first pipeline is created with `dlt.pipeline()`. Feel free to call `sentry_sdk.init()` again to cover your specific needs.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Let's try introducing the same error again""")
return
@app.cell
def _(dlt, os):
dlt.config["RUNTIME__SENTRY_DSN"] = os.getenv("SENTRY_TOKEN")
return
@app.cell
def _(pipeline):
data = {12: 34}
info = pipeline.run([data], table_name="issues")
info
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""And that comes up in Sentry as well""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_8_Logging_%26_Tracing_img2](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_8_Logging_%26_Tracing_img2.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
The message sent to Sentry is:
```
Job for issues.a3f927c556.insert_values failed terminally in load 1723645286.6510239 with message Constraint Error: NOT NULL constraint failed: issues.id
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""# **Logging**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
There are various environments where we would be completely lost without logs.
Debugging any system would be incredibly hard if we didn't know what was going on, or at what point the program ran into an error.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### Setting log levels in `dlt`""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
You can set log levels in your `config.toml` file:
```
[runtime]
log_level="INFO"
```
`log_level` accepts the Python standard logging level names.
The default log level is `WARNING`.
**`INFO` log level is useful when diagnosing problems in production.**
**`CRITICAL` will disable logging.**
**`DEBUG` should not be used in production.**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""We'll be setting the log level programmatically via `dlt.config`:""")
return
@app.cell
def _(dlt):
dlt.config["RUNTIME__LOG_LEVEL"] = "INFO"
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
dlt logs to a logger named `dlt`.
The `dlt` logger uses the regular Python logging module, so you can configure its handlers as needed.
""")
return
@app.cell
def _():
import logging
# Create a logger
logger = logging.getLogger("dlt")
# Set the log level
logger.setLevel(logging.INFO)
# Create a file handler
handler = logging.FileHandler("dlt.log")
# Add the handler to the logger
logger.addHandler(handler)
return (logging,)
@app.cell
def _(dlt, github_source):
pipeline_1 = dlt.pipeline(
pipeline_name="github_issues_merge_logger",
destination="duckdb",
dataset_name="github_data_merge",
)
_load_info = pipeline_1.run(github_source())
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### Logging via `Loguru` in our GitHub example""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Let's change the logging level.""")
return
@app.cell
def _(dlt):
dlt.config["RUNTIME__LOG_LEVEL"] = "INFO"
return
@app.cell
def _(Union, logging):
import sys
from loguru import logger as loguru_logger
# logging.Handler subclass: the parent class receives standard log records, emit() forwards them to loguru
class InterceptHandler(logging.Handler):
# @loguru_logger.catch is a decorator provided by loguru that catches any exceptions raised in the decorated function and logs them
@loguru_logger.catch(default=True, onerror=lambda _: sys.exit(1))
def emit(self, record: logging.LogRecord) -> None:
# Get the corresponding Loguru level if it exists
try:
level: Union[str, int] = loguru_logger.level(record.levelname).name
except ValueError:
level = record.levelno
# Find the caller (call frame) from which the logged message originated
(frame, depth) = (
sys._getframe(6),
6,
)
while frame and frame.f_code.co_filename == logging.__file__:
frame = frame.f_back
depth = depth + 1
# Log the message via loguru with the original level and exception information
loguru_logger.opt(depth=depth, exception=record.exc_info).log(
level, record.getMessage()
)
# Attach the handler so messages from the "dlt" logger are routed through loguru
logger_dlt = logging.getLogger("dlt")
logger_dlt.addHandler(InterceptHandler())
# All logs will be written to dlt_loguru.log
loguru_logger.add("dlt_loguru.log")
return
@app.cell
def _(dlt, github_source):
pipeline_2 = dlt.pipeline(
pipeline_name="github_issues_merge_loguru",
destination="duckdb",
dataset_name="github_data_merge",
)
_load_info = pipeline_2.run(github_source())
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## **Logs for monitoring the progress**""")
return
@app.cell
def _(dlt, github_source):
dlt.config["RUNTIME__LOG_LEVEL"] = "WARNING"
pipeline_3 = dlt.pipeline(
pipeline_name="github_issues_progress",
destination="duckdb",
dataset_name="github_data_merge",
progress="log",
)
_load_info = pipeline_3.run(github_source())
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)!"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,7 +6,7 @@
"id": "GNU4s2jjWTOV"
},
"source": [
"# **Performance Optimization in dlt pipelines** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)"
"# **Performance Optimization in dlt pipelines** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)"
]
},
{
@@ -94,7 +94,7 @@
"\n",
"We'll now look at how to optimize each of these stages individually.\n",
"\n",
"> If you're unfamiliar with how `pipeline.run()` works under the hood, including the **extract/normalize/load** stages and intermediary files, please complete the [Fundamentals module \"How dlt works\"](https://colab.research.google.com/drive/1geSMNRkSwAelQJKd3e8vdoHCKiHMdmIo#forceEdit=true&sandboxMode=true) first."
"> If you're unfamiliar with how `pipeline.run()` works under the hood, including the **extract/normalize/load** stages and intermediary files, please complete the [Fundamentals module \"How dlt works\"](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) first."
]
},
{
@@ -232,23 +232,27 @@
},
"outputs": [],
"source": [
"import multiprocessing\n",
"import time\n",
"import multiprocessing\n",
"from concurrent.futures import ProcessPoolExecutor\n",
"\n",
"\n",
"def compute_heavy_task() -> None:\n",
"def compute_heavy_task() -> str:\n",
" lines = []\n",
" for number in range(3):\n",
" print(\n",
" f\"Computing in {multiprocessing.current_process().name}. {number=} => {number**2}\\n\"\n",
" lines.append(\n",
" f\"Computing in {multiprocessing.current_process().name}. {number=} => {number**2}\"\n",
" )\n",
" time.sleep(0.1)\n",
" return \"\\n\".join(lines)\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" with ProcessPoolExecutor(max_workers=4) as process_executor:\n",
" for _ in range(4):\n",
" process_executor.submit(compute_heavy_task)"
" futures = [process_executor.submit(compute_heavy_task) for _ in range(4)]\n",
" for fut in futures:\n",
" print(fut.result())\n",
" print()"
]
},
{
@@ -450,12 +454,12 @@
"id": "rvId84tCaH7u"
},
"source": [
"- Control the [in-memory buffer size](#scrollTo=ffVpDFHfnqO-) for the extract stage\n",
"- Control the `in-memory buffer size` for the extract stage\n",
"- Group `dlt` resources into `dlt` sources\n",
"- Specify the number of thread workers or..\n",
"- When using async generators, control the number of async functions/awaitables being evaluated in parallel\n",
"- Yield pages instead of rows\n",
"- Customize the [size of intermediary files](#scrollTo=g9AGWfLkoAMb) created in the extract stage to control file rotation"
"- Customize the `size of intermediary files` created in the extract stage to control file rotation"
]
},
{
@@ -559,7 +563,7 @@
" dataset_name=\"mydata\",\n",
" dev_mode=True,\n",
")\n",
"load_info = pipeline.extract(buffered_resource)\n",
"pipeline.extract(buffered_resource)\n",
"print(pipeline.last_trace)"
]
},
@@ -604,7 +608,8 @@
" dataset_name=\"mydata\",\n",
" dev_mode=True,\n",
")\n",
"load_info = pipeline.extract(buffered_resource)\n",
"\n",
"pipeline.extract(buffered_resource)\n",
"print(pipeline.last_trace)"
]
},
@@ -779,9 +784,7 @@
" dev_mode=True,\n",
")\n",
"\n",
"load_info = pipeline.extract(\n",
" [buffered_resource1, buffered_resource2, buffered_resource3]\n",
")\n",
"pipeline.extract([buffered_resource1, buffered_resource2, buffered_resource3])\n",
"print(pipeline.last_trace)"
]
},
@@ -825,7 +828,7 @@
" dev_mode=True,\n",
")\n",
"\n",
"load_info = pipeline.extract(source())\n",
"pipeline.extract(source())\n",
"print(pipeline.last_trace)"
]
},
@@ -954,7 +957,7 @@
")\n",
"\n",
"\n",
"load_info = pipeline.extract(source())\n",
"pipeline.extract(source())\n",
"print(pipeline.last_trace)"
]
},
@@ -1038,7 +1041,7 @@
")\n",
"\n",
"\n",
"load_info = pipeline.extract(source())\n",
"pipeline.extract(source())\n",
"print(pipeline.last_trace)"
]
},
@@ -1089,13 +1092,13 @@
"@dlt.resource\n",
"def sync_items() -> TDataItems:\n",
" for i in range(10):\n",
" time.sleep(0.5) # Blocking call\n",
" time.sleep(0.5)\n",
" yield i\n",
"\n",
"\n",
"@dlt.transformer\n",
"def sync_transform(item: TDataItem) -> TDataItems:\n",
" time.sleep(0.5) # Also blocking\n",
" time.sleep(0.5)\n",
" return {\"row\": item}\n",
"\n",
"\n",
@@ -1130,13 +1133,13 @@
"@dlt.resource\n",
"async def async_items() -> TDataItems:\n",
" for i in range(10):\n",
" await asyncio.sleep(0.5) # Blocking\n",
" await asyncio.sleep(0.5)\n",
" yield i\n",
"\n",
"\n",
"@dlt.transformer\n",
"async def async_transform(item) -> TDataItems:\n",
" await asyncio.sleep(0.5) # Non-blocking\n",
" await asyncio.sleep(0.5)\n",
" # just return the results, if you yield, generator will be evaluated in main thread\n",
" return {\"row\": item}\n",
"\n",
@@ -1276,7 +1279,7 @@
"@dlt.resource\n",
"def get_users() -> TDataItems:\n",
" for user in fetch_users():\n",
" yield user # yields one row at a time"
" yield user"
]
},
{
@@ -1354,8 +1357,8 @@
"\n",
"def yield_chunks(iterator: Iterator[Dict[str, int]], chunk_size=10):\n",
" iterator = iter(iterator)\n",
" while chunk := list(islice(iterator, chunk_size)): # <--- we slice data into chunks\n",
" time.sleep(0.01) # Simulate slow API call\n",
" while chunk := list(islice(iterator, chunk_size)):\n",
" time.sleep(0.01)\n",
" yield chunk\n",
"\n",
"\n",
@@ -1387,7 +1390,7 @@
" dev_mode=True,\n",
")\n",
"\n",
"load_info = pipeline.extract(source())\n",
"pipeline.extract(source())\n",
"print(pipeline.last_trace)"
]
},
@@ -1463,7 +1466,7 @@
"4. These files are then used in the **load** stage.\n",
"\n",
"\n",
">If youre not yet familiar with how the **normalization stage** works in `dlt`, we recommend reviewing the [**Normalization section in the dlt Fundamentals course**](https://colab.research.google.com/drive/1geSMNRkSwAelQJKd3e8vdoHCKiHMdmIo#forceEdit=true&sandboxMode=true&scrollTo=bCeUqaW_cRSh) before diving into performance tuning. "
">If youre not yet familiar with how the **normalization stage** works in `dlt`, we recommend reviewing the [**Normalization section in the dlt Fundamentals course**](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) before diving into performance tuning. "
]
},
{
@@ -1583,8 +1586,8 @@
"\n",
"def yield_chunks(iterable, chunk_size=10):\n",
" iterator = iter(iterable)\n",
" while chunk := list(islice(iterator, chunk_size)): # <--- we slice data into chunks\n",
" time.sleep(0.01) # Simulate slow API call\n",
" while chunk := list(islice(iterator, chunk_size)):\n",
" time.sleep(0.01)\n",
" yield chunk\n",
"\n",
"\n",
@@ -1611,7 +1614,7 @@
" dev_mode=True,\n",
")\n",
"\n",
"load_info = pipeline.extract(source())\n",
"pipeline.extract(source())\n",
"print(pipeline.last_trace)"
]
},
@@ -1639,7 +1642,7 @@
"\n",
"os.environ[\"NORMALIZE__WORKERS\"] = \"1\"\n",
"\n",
"load_info = pipeline.normalize()\n",
"pipeline.normalize()\n",
"print(pipeline.last_trace)"
]
},
@@ -1710,8 +1713,8 @@
")\n",
"\n",
"\n",
"load_info = pipeline.extract(source())\n",
"load_info = pipeline.normalize()\n",
"pipeline.extract(source())\n",
"pipeline.normalize()\n",
"\n",
"print(pipeline.last_trace)"
]
@@ -1881,8 +1884,8 @@
"\n",
"def yield_chunks(iterable, chunk_size=10):\n",
" iterator = iter(iterable)\n",
" while chunk := list(islice(iterator, chunk_size)): # <--- we slice data into chunks\n",
" time.sleep(0.01) # Simulate slow API call\n",
" while chunk := list(islice(iterator, chunk_size)):\n",
" time.sleep(0.01)\n",
" yield chunk\n",
"\n",
"\n",
@@ -2060,7 +2063,6 @@
},
"outputs": [],
"source": [
"# Install dlt if not already installed\n",
"%%capture\n",
"!pip install \"dlt[duckdb]\""
]
@@ -2082,7 +2084,9 @@
},
"outputs": [],
"source": [
"exit()"
"import os\n",
"\n",
"os.environ.clear()"
]
},
{
@@ -2117,9 +2121,9 @@
"def pagination(url):\n",
" while True:\n",
" response = requests.get(url, headers=headers)\n",
" time.sleep(0.1) # Simulate delay\n",
" time.sleep(0.1)\n",
" response.raise_for_status()\n",
" yield response.json() # Here we're yielding pages\n",
" yield response.json()\n",
"\n",
" # Get next page\n",
" if \"next\" not in response.links:\n",
@@ -2201,7 +2205,7 @@
" dev_mode=True,\n",
")\n",
"\n",
"load_info = pipeline.run(\n",
"pipeline.run(\n",
" [\n",
" get_issues,\n",
" get_stargazers,\n",
@@ -2355,9 +2359,6 @@
" )\n",
"\n",
"\n",
"improved_p = dlt.pipeline(\"test_pipeline_2\", destination=\"duckdb\")\n",
"\n",
"\n",
"pipeline = dlt.pipeline(\n",
" pipeline_name=\"extract_pipeline_example2\",\n",
" destination=\"duckdb\",\n",
@@ -2365,7 +2366,7 @@
" dev_mode=True,\n",
")\n",
"\n",
"load_info = pipeline.run(github_data())\n",
"pipeline.run(github_data())\n",
"print(pipeline.last_trace)"
]
},

File diff suppressed because it is too large

View File

@@ -6,7 +6,7 @@
"id": "pTAeTdoKJHZV"
},
"source": [
"# **Quick Start** 👩‍💻🚀 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb)\n",
"# **Quick Start** 👩‍💻🚀 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb)\n",
"\n",
"**Here, you will learn:**\n",
"- What is dlt?\n",
@@ -55,15 +55,6 @@
"> **Note**: We recommend working within a virtual environment when creating Python projects. This way, all the dependencies for your current project will be isolated from packages in other projects."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Su4oUJelKaZY"
},
"source": [
"[Install](https://dlthub.com/docs/reference/installation) `dlt` with DuckDB as destination:"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -180,7 +171,7 @@
"> **What just happened?** \n",
"> The first run of a pipeline will scan the data that goes through it and generate a schema. To convert nested data into a relational format, dlt flattens dictionaries and unpacks nested lists into sub-tables.\n",
">\n",
"> For this example `dlt` created a schema called 'mydata' with the table 'pokemon' in it and stored it in DuckDB.\n",
"> For this example, `dlt` created a schema called 'mydata' with the table 'pokemon' in it and stored it in DuckDB.\n",
">\n",
">For detailed instructions on running a pipeline, see the documentation [here](https://dlthub.com/docs/walkthroughs/run-a-pipeline)."
]
@@ -191,7 +182,7 @@
"id": "Z9ll-Ax1BxGu"
},
"source": [
"Quick start was really quick, hah? It seems like some kind of magic happened.\n",
"Quick start was really quick, huh? It seems like some kind of magic happened.\n",
"\n",
"We don't believe in magic! Let's start from the beginning, what is a `dlt` Pipeline?"
]
@@ -217,7 +208,7 @@
},
"outputs": [],
"source": [
"pipeline = dlt.pipeline(\n",
"another_pipeline = dlt.pipeline(\n",
" pipeline_name=\"resource_source\",\n",
" destination=\"duckdb\",\n",
" dataset_name=\"mydata\",\n",
@@ -237,7 +228,7 @@
"* **`dataset_name`**: This is the name of the group of tables (or dataset) where your data will be sent. You can think of a dataset like a folder that holds many files, or a schema in a relational database. You can also specify this later when you run or load the pipeline. If you don't provide a name, it will default to the name of your pipeline.\n",
"* **`dev_mode`**: If you set this to True, dlt will add a timestamp to your dataset name every time you create a pipeline. This means a new dataset will be created each time you create a pipeline.\n",
"\n",
"There are more arguments, but they are for advanced use, we skip it for now."
"There are additional arguments for advanced use, but well skip them for now."
]
},
{
@@ -262,7 +253,7 @@
"outputs": [],
"source": [
"# Run the pipeline and print load info\n",
"load_info = pipeline.run(data, table_name=\"pokemon\")\n",
"load_info = another_pipeline.run(data, table_name=\"pokemon\")\n",
"print(load_info)"
]
},
@@ -309,7 +300,7 @@
"id": "xQcYIbDbQevC"
},
"source": [
"Start a connection to your database using native `duckdb` connection and look what tables were generated:"
"Start a connection to your database using a native `duckdb` connection and see which tables were generated:"
]
},
{
@@ -321,17 +312,14 @@
"outputs": [],
"source": [
"import duckdb\n",
"from google.colab import data_table\n",
"\n",
"data_table.enable_dataframe_formatter()\n",
"\n",
"# A database '<pipeline_name>.duckdb' was created in working directory so just connect to it\n",
"\n",
"# Connect to the DuckDB database\n",
"conn = duckdb.connect(f\"{pipeline.pipeline_name}.duckdb\")\n",
"conn = duckdb.connect(f\"{another_pipeline.pipeline_name}.duckdb\")\n",
"\n",
"# Set search path to the dataset\n",
"conn.sql(f\"SET search_path = '{pipeline.dataset_name}'\")\n",
"conn.sql(f\"SET search_path = '{another_pipeline.dataset_name}'\")\n",
"\n",
"# Describe the dataset\n",
"conn.sql(\"DESCRIBE\").df()"
@@ -399,7 +387,7 @@
"outputs": [],
"source": [
"# Query data from 'pokemon' using the SQL client\n",
"with pipeline.sql_client() as client:\n",
"with another_pipeline.sql_client() as client:\n",
" with client.execute_query(\"SELECT * FROM pokemon\") as cursor:\n",
" data = cursor.df()\n",
"\n",
@@ -427,7 +415,7 @@
},
"outputs": [],
"source": [
"dataset = pipeline.dataset()\n",
"dataset = another_pipeline.dataset()\n",
"dataset.pokemon.df()"
]
},
@@ -467,17 +455,8 @@
"id": "NYbccmLie1zm"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1tc94GvIoYXmYrjUibDhY_9iPR5zA0Eyw#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lN6cXVfhVPmq"
},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,394 @@
# /// script
# dependencies = [
# "dlt[duckdb]",
# "numpy",
# "pandas",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **Quick Start** 👩‍💻🚀 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb)
**Here, you will learn:**
- What is dlt?
- How to run a simple pipeline with toy data.
- How to explore the loaded data using:
- DuckDB connection
- dlt's sql_client
- dlt datasets
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **What is dlt?**
In today's data-driven world, organizations often grapple with the challenge of efficiently **extracting, transforming,** and **loading** (ETL) data from various, often messy, data sources into well-structured, live datasets. This process can be complex, time-consuming, and prone to errors, especially when dealing with large volumes of data or nested data structures.
Enter **dlt**, an **open-source Python library** designed to simplify and streamline this process. **dlt can load data from** a wide range of **sources** including REST APIs, SQL databases, cloud storage, and Python data structures, among others. It offers a lightweight interface that **infers schemas** and **data types**, **normalizes** the data, and handles **nested data** structures, making it easy to use, flexible, and scalable.
Moreover, dlt supports a variety of **popular destinations** and allows for the addition of custom destinations to create **reverse ETL** pipelines. It can be deployed **anywhere Python runs**, be it on Airflow, serverless functions, or any other cloud deployment of your choice. With features like **schema evolution**, **data contracts** and **incremental loading**, dlt also automates pipeline maintenance, saving valuable time and resources.
In essence, dlt is a powerful tool that simplifies the ETL process, making it more efficient and less error-prone. It allows data teams to **focus** on leveraging the data and driving value, while ensuring effective **governance** through timely notifications of any changes.
[Learn more about dlt here](https://dlthub.com/docs/intro) and in this course!
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_1_Quick_start_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_1_Quick_start_img1.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Installation**
> **Note**: We recommend working within a virtual environment when creating Python projects. This way, all the dependencies for your current project will be isolated from packages in other projects.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Read more about DuckDB as a destination [here](https://dlthub.com/docs/dlt-ecosystem/destinations/duckdb)."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Run a simple pipeline with toy data**
For educational purposes, let's start with a simple pipeline using a small dataset — Pokémon data represented as a list of Python dictionaries.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""1. Define a list of Python dictionaries, which will be your toy data:"""
)
return
@app.cell
def _():
# Sample data containing pokemon details
data = [
{"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}},
{"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}},
{"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}},
]
return (data,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""2. Import `dlt` and create a simple pipeline:""")
return
@app.cell
def _():
import dlt
# Set pipeline name, destination, and dataset name
pipeline = dlt.pipeline(
pipeline_name="quick_start",
destination="duckdb",
dataset_name="mydata",
)
return dlt, pipeline
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""3. Run your pipeline and print the load info:""")
return
@app.cell
def _(data, pipeline):
# Run the pipeline with data and table name
_load_info = pipeline.run(data, table_name="pokemon")
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
> **What just happened?**
> The first run of a pipeline will scan the data that goes through it and generate a schema. To convert nested data into a relational format, dlt flattens dictionaries and unpacks nested lists into sub-tables.
>
> For this example, `dlt` created a schema called 'mydata' with the table 'pokemon' in it and stored it in DuckDB.
>
>For detailed instructions on running a pipeline, see the documentation [here](https://dlthub.com/docs/walkthroughs/run-a-pipeline).
""")
return
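@app.cell
def _(pipeline):
    # Editor's sketch (not part of the original lesson): the nested "size" dict in
    # the toy data was flattened into size__weight and size__height columns. You can
    # see this in the schema that dlt inferred for the "pokemon" table.
    print(pipeline.default_schema.to_pretty_yaml())
    return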
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Quick start was really quick, huh? It seems like some kind of magic happened.
We don't believe in magic! Let's start from the beginning, what is a `dlt` Pipeline?
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **What is a `dlt` Pipeline?**
A [pipeline](https://dlthub.com/docs/general-usage/pipeline) is a connection that moves data from your Python code to a destination. The pipeline accepts dlt sources or resources, as well as generators, async generators, lists, and any iterables. Once the pipeline runs, all resources are evaluated and the data is loaded at the destination.
""")
return
@app.cell
def _(dlt):
another_pipeline = dlt.pipeline(
pipeline_name="resource_source",
destination="duckdb",
dataset_name="mydata",
dev_mode=True,
)
return (another_pipeline,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
You instantiate a pipeline by calling the `dlt.pipeline` function with the following arguments:
* **`pipeline_name`**: This is the name you give to your pipeline. It helps you track and monitor your pipeline, and also helps to bring back its state and data structures for future runs. If you don't provide a name, dlt will use the name of the Python file you're running as the pipeline name.
* **`destination`**: the name of the destination to which dlt will load the data. It may also be provided to the run method of the pipeline.
* **`dataset_name`**: This is the name of the group of tables (or dataset) where your data will be sent. You can think of a dataset like a folder that holds many files, or a schema in a relational database. You can also specify this later when you run or load the pipeline. If you don't provide a name, it will default to the name of your pipeline.
* **`dev_mode`**: If you set this to True, dlt will add a timestamp to your dataset name every time you create a pipeline. This means a new dataset will be created each time you create a pipeline.
There are additional arguments for advanced use, but we'll skip them for now.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Run method**
To load the data, you call the `run()` method and pass your data in the data argument.
""")
return
@app.cell
def _(another_pipeline, data):
# Run the pipeline and print load info
_load_info = another_pipeline.run(data, table_name="pokemon")
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Commonly used arguments:
* **`data`** (the first argument) may be a dlt source, resource, generator function, or any Iterator or Iterable (e.g., a list or the result of the map function).
* **`write_disposition`** controls how to write data to a table. Defaults to the value "append".
* `append` will always add new data at the end of the table.
* `replace` will replace existing data with new data.
* `skip` will prevent data from loading.
* `merge` will deduplicate and merge data based on `primary_key` and `merge_key` hints.
* **`table_name`**: specified in cases when the table name cannot be inferred, i.e., from the resources or name of the generator function.
""")
return
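@app.cell
def _(data, pipeline):
    # Editor's sketch (not part of the original lesson): the write_disposition values
    # listed above can be passed straight to pipeline.run(). Re-running the same toy
    # data with "replace" overwrites the "pokemon" table instead of appending another
    # copy of the three rows.
    _load_info = pipeline.run(data, table_name="pokemon", write_disposition="replace")
    print(_load_info)
    return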
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Explore the loaded data**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(1) DuckDB Connection**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Start a connection to your database using a native `duckdb` connection and see which tables were generated:"""
)
return
@app.cell
def _(another_pipeline):
import duckdb
    # A database '<pipeline_name>.duckdb' was created in the working directory, so just connect to it
# Connect to the DuckDB database
conn = duckdb.connect(f"{another_pipeline.pipeline_name}.duckdb")
# Set search path to the dataset
conn.sql(f"SET search_path = '{another_pipeline.dataset_name}'")
# Describe the dataset
conn.sql("DESCRIBE").df()
return (conn,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
You can see:
- `pokemon` table,
and 3 special `dlt` tables (we will discuss them later):
- `_dlt_loads`,
- `_dlt_pipeline_state`,
- `_dlt_version`.
Let's execute a query to get all data from the `pokemon` table:
""")
return
@app.cell
def _(conn):
# Fetch all data from 'pokemon' as a DataFrame
table = conn.sql("SELECT * FROM pokemon").df()
# Display the DataFrame
table
return
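@app.cell
def _(conn):
    # Editor's sketch (not part of the original lesson): the special dlt tables listed
    # above are ordinary tables, so you can already peek at one of them. _dlt_loads
    # keeps one row per load; these tables are discussed later in the course.
    conn.sql("SELECT * FROM _dlt_loads").df()
    return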
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(2) `dlt`'s [sql_client](https://dlthub.com/docs/general-usage/dataset-access/sql-client)**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Most dlt destinations (even filesystem) use an implementation of the `SqlClientBase` class to connect to the physical destination to which your data is loaded. You can access the SQL client of your destination via the `sql_client` method on your pipeline.
Start a connection to your database with `pipeline.sql_client()` and execute a query to get all data from the `pokemon` table:
""")
return
@app.cell
def _(another_pipeline):
# Query data from 'pokemon' using the SQL client
with another_pipeline.sql_client() as client:
with client.execute_query("SELECT * FROM pokemon") as cursor:
data_1 = cursor.df()
# Display the data
data_1
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(3) dlt [datasets](https://dlthub.com/docs/general-usage/dataset-access/dataset)**
Here's an example of how to retrieve data from a pipeline and load it into a Pandas DataFrame or a PyArrow Table.
""")
return
@app.cell
def _(another_pipeline):
dataset = another_pipeline.dataset()
dataset.pokemon.df()
return
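@app.cell
def _(another_pipeline):
    # Editor's sketch (not part of the original lesson): the same relation can also be
    # fetched as a PyArrow Table instead of a pandas DataFrame, as mentioned above.
    another_pipeline.dataset().pokemon.arrow()
    return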
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
# **Exercise 1**
Using the code from the previous cell, fetch the data from the `pokemon` table into a dataframe and count the number of columns in the table `pokemon`.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""**Use this number to answer the question in the Quiz LearnWorlds Form.**"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_1_Quick_start_img2.jpeg](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_1_Quick_start_img2.jpeg)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)!"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,7 +6,7 @@
"id": "qvMyiV0uMY-7"
},
"source": [
"# **dlt sources and resources**: Create first dlt pipeline. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)\n"
"# **dlt sources and resources**: Create your first dlt pipeline [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)\n"
]
},
{
@@ -24,12 +24,12 @@
"id": "pZCRBANQftVQ"
},
"source": [
"## Recap of [Lesson 1](https://colab.research.google.com/drive/1QwlDWxX5hvwbHMkCgiF0UCzGFRMRoSPY#forceEdit=true&sandboxMode=true) 👩‍💻🚀\n",
"1. Created a pipeline, loaded toy data into DuckDB, and viewed load info.\n",
"2. Used `dlt.pipeline` and `pipeline.run` methods.\n",
"3. Used DuckDB, `sql_client` and dlt `dataset` to view tables and query data.\n",
"## Recap of [Lesson 1](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) 👩‍💻🚀\n",
"1. Created a pipeline, loaded toy data into DuckDB, and viewed the load info.\n",
"2. Used the `dlt.pipeline` and `pipeline.run` methods.\n",
"3. Queried data and viewed tables with DuckDB, the `sql_client`, and the dlt `dataset`.\n",
"\n",
"Now we move to the next lesson to learn more details about dlt! 🚀"
"Now, let's move on to the next lesson to learn more! 🚀"
]
},
{
@@ -39,18 +39,9 @@
},
"source": [
"**Here, you will learn how to:**\n",
"- Run a simple pipeline with different types of data, such as dataframes, databases and RestAPI.\n",
"- Run a simple pipeline with different types of data, such as dataframes, databases and REST APIs.\n",
"- Use `dlt.resource`, `dlt.source` and `dlt.transformer`.\n",
"- Build your first dlt pipeline for RestAPI."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "oaLSnDr9hSxE"
},
"source": [
"## **Install dlt**"
"- Build your first dlt pipeline for a REST API."
]
},
{
@@ -142,7 +133,7 @@
"\n",
"\n",
"# Create a dlt resource from the data\n",
"@dlt.resource(table_name=\"pokemon_new\") # <--- we set new table name\n",
"@dlt.resource(table_name=\"pokemon_new\")\n",
"def my_dict_list() -> TDataItems:\n",
" yield data"
]
@@ -156,8 +147,8 @@
"Commonly used arguments:\n",
"\n",
"* **`name`**: The resource name and the name of the table generated by this resource. Defaults to the decorated function name.\n",
"* **`table_name`**: the name of the table, if different from the resource name.\n",
"* **`write_disposition`**: controls how to write data to a table. Defaults to the value \"append\"."
"* **`table_name`**: The name of the table, if different from the resource name.\n",
"* **`write_disposition`**: Controls how to write data to a table. Defaults to the value \"append\"."
]
},
{
@@ -232,7 +223,7 @@
"source": [
"---\n",
"### Dataframes\n",
"For creating a pipeline using dataframes, you would do:"
"To create a pipeline using dataframes, you would do:"
]
},
{
@@ -268,11 +259,9 @@
},
"source": [
"---\n",
"### Database\n",
"### Databases\n",
"\n",
"For creating a pipeline from an SQL database query you would:\n",
"\n",
"1. Install the PyMySQL package:"
"To create a pipeline from an SQL database query you would:"
]
},
{
@@ -293,7 +282,7 @@
"id": "ktAAuuJqW792"
},
"source": [
"2. Create and run a pipeline to fetch data from an SQL resource and query the loaded data as follows:"
"1. Create and run a pipeline to fetch data from an SQL resource and query the loaded data as follows:"
]
},
{
@@ -458,7 +447,7 @@
"* The source Python module typically contains optional customizations and data transformations.\n",
"* The source Python module typically contains the authentication and pagination code for a particular API.\n",
"\n",
"Read more about [sources](https://dlthub.com/docs/general-usage/source) and [resources](https://dlthub.com/docs/general-usage/resource) here."
"Read more about [sources](https://dlthub.com/docs/general-usage/source) and [resources](https://dlthub.com/docs/general-usage/resource)."
]
},
{
@@ -508,12 +497,12 @@
"outputs": [],
"source": [
"# Create a pipeline\n",
"pipeline = dlt.pipeline(\n",
"new_pipeline = dlt.pipeline(\n",
" pipeline_name=\"resource_source_new\", destination=\"duckdb\", dataset_name=\"all_data\"\n",
")\n",
"\n",
"# Run the pipeline\n",
"load_info = pipeline.run(all_data())\n",
"load_info = new_pipeline.run(all_data())\n",
"\n",
"# Print load info\n",
"print(load_info)"
@@ -602,8 +591,13 @@
"outputs": [],
"source": [
"@dlt.resource(table_name=\"pokemon\")\n",
"def my_dict_list() -> TDataItems:\n",
" yield data"
"def my_pokemons() -> TDataItems:\n",
" pokemons = [\n",
" {\"id\": \"1\", \"name\": \"bulbasaur\", \"size\": {\"weight\": 6.9, \"height\": 0.7}},\n",
" {\"id\": \"4\", \"name\": \"charmander\", \"size\": {\"weight\": 8.5, \"height\": 0.6}},\n",
" {\"id\": \"25\", \"name\": \"pikachu\", \"size\": {\"weight\": 6, \"height\": 0.4}},\n",
" ]\n",
" yield pokemons"
]
},
{
@@ -623,45 +617,27 @@
},
"outputs": [],
"source": [
"import requests\n",
"\n",
"data = [\n",
" {\"id\": \"1\", \"name\": \"bulbasaur\", \"size\": {\"weight\": 6.9, \"height\": 0.7}},\n",
" {\"id\": \"4\", \"name\": \"charmander\", \"size\": {\"weight\": 8.5, \"height\": 0.6}},\n",
" {\"id\": \"25\", \"name\": \"pikachu\", \"size\": {\"weight\": 6, \"height\": 0.4}},\n",
"]\n",
"\n",
"\n",
"# Define a resource to read and write data to pokemon table\n",
"@dlt.resource(table_name=\"pokemon\")\n",
"def my_dict_list() -> TDataItems:\n",
" yield data\n",
"\n",
"\n",
"# Define a transformer to enrich pokemon data with additional details\n",
"@dlt.transformer(data_from=my_dict_list, table_name=\"detailed_info\")\n",
"# NOTE: the `items` argument contains data from the `my_dict_list` resource\n",
"@dlt.transformer(data_from=my_pokemons, table_name=\"detailed_info\")\n",
"def poke_details(\n",
" items: TDataItems,\n",
") -> (\n",
" TDataItems\n",
"): # <--- `items` is a variable and contains data from `my_dict_list` resource\n",
") -> TDataItems:\n",
" for item in items:\n",
" print(\n",
" f\"Item: {item}\\n\"\n",
" ) # <-- print what data we get from `my_dict_list` source\n",
" print(f\"Item: {item}\\n\")\n",
"\n",
" item_id = item[\"id\"]\n",
" url = f\"https://pokeapi.co/api/v2/pokemon/{item_id}\"\n",
" response = requests.get(url)\n",
" details = response.json()\n",
"\n",
" print(f\"Details: {details}\\n\") # <--- print what data we get from API\n",
" print(f\"Details: {details}\\n\")\n",
"\n",
" yield details\n",
"\n",
"\n",
"# Set pipeline name, destination, and dataset name\n",
"pipeline = dlt.pipeline(\n",
"another_pipeline = dlt.pipeline(\n",
" pipeline_name=\"quick_start\",\n",
" destination=\"duckdb\",\n",
" dataset_name=\"pokedata\",\n",
@@ -687,7 +663,7 @@
},
"outputs": [],
"source": [
"load_info = pipeline.run(poke_details())\n",
"load_info = another_pipeline.run(poke_details())\n",
"print(load_info)"
]
},
@@ -709,14 +685,20 @@
"outputs": [],
"source": [
"@dlt.resource(table_name=\"pokemon\")\n",
"def my_dict_list() -> TDataItems:\n",
" yield from data # <--- This would yield one item at a time\n",
"def my_other_pokemons() -> TDataItems:\n",
" pokemons = [\n",
" {\"id\": \"1\", \"name\": \"bulbasaur\", \"size\": {\"weight\": 6.9, \"height\": 0.7}},\n",
" {\"id\": \"4\", \"name\": \"charmander\", \"size\": {\"weight\": 8.5, \"height\": 0.6}},\n",
" {\"id\": \"25\", \"name\": \"pikachu\", \"size\": {\"weight\": 6, \"height\": 0.4}},\n",
" ]\n",
" yield from pokemons\n",
"\n",
"\n",
"@dlt.transformer(data_from=my_dict_list, table_name=\"detailed_info\")\n",
"def details(\n",
"# NOTE: Transformer receives one item at a time\n",
"@dlt.transformer(data_from=my_other_pokemons, table_name=\"detailed_info\")\n",
"def other_poke_details(\n",
" data_item: TDataItem,\n",
") -> TDataItems: # <--- Transformer receives one item at a time\n",
") -> TDataItems:\n",
" item_id = data_item[\"id\"]\n",
" url = f\"https://pokeapi.co/api/v2/pokemon/{item_id}\"\n",
" response = requests.get(url)\n",
@@ -725,7 +707,7 @@
" yield details\n",
"\n",
"\n",
"load_info = pipeline.run(details())\n",
"load_info = another_pipeline.run(other_poke_details())\n",
"print(load_info)"
]
},
@@ -746,7 +728,8 @@
},
"outputs": [],
"source": [
"load_info = pipeline.run(my_dict_list | details)"
"load_info = another_pipeline.run(my_pokemons | poke_details)\n",
"print(load_info)"
]
},
{
@@ -767,7 +750,7 @@
"outputs": [],
"source": [
"# Query the 'detailed_info' table and convert the result to a DataFrame\n",
"pipeline.dataset().detailed_info.df()"
"another_pipeline.dataset().detailed_info.df()"
]
},
{
@@ -809,7 +792,7 @@
},
"source": [
"---\n",
"## **Exercise 1: Create a pipeline for GitHub API - repos endpoint**\n",
"## **Exercise 1: Create a pipeline for GitHub API repos endpoint**\n",
"\n",
"In this exercise, you'll build a dlt pipeline to fetch data from the GitHub REST API. The goal is to learn how to use `dlt.pipeline`, `dlt.resource`, and `dlt.source` to extract and load data into a destination.\n",
"\n",
@@ -817,24 +800,24 @@
"\n",
"1. **Explore the GitHub API**\n",
"\n",
" Visit the [GitHub REST API Docs](https://docs.github.com/en/rest) to understand the endpoint to [list public repositories](https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28) for an organization:\n",
" Visit the [GitHub REST API Docs](https://docs.github.com/en/rest) to understand the endpoint to [list public repositories](https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28) for an organization:\n",
"\n",
" GET https://api.github.com/orgs/{org}/repos\n",
" `GET https://api.github.com/orgs/{org}/repos`\n",
"\n",
"2. **Build the Pipeline**\n",
"2. **Build the pipeline**\n",
"\n",
" Write a script to:\n",
" Write a script to:\n",
"\n",
" * Fetch repositories for a **dlt-hub** organization.\n",
" * Use `dlt.resource` to define the data extraction logic.\n",
" * Combine all resources to a single `@dlt.source`.\n",
" * Load the data into a DuckDB database.\n",
" - Fetch repositories for the **dlt-hub** organization.\n",
" - Use `dlt.resource` to define the data extraction logic.\n",
" - Combine all resources into a single `@dlt.source`.\n",
" - Load the data into a DuckDB database.\n",
"\n",
"3. **Look at the data**\n",
"3. **Inspect the data**\n",
"\n",
" Use `duckdb` connection, `sql_client` or `pipeline.dataset()`.\n",
" Use a `duckdb` connection, `sql_client`, or `pipeline.dataset()`.\n",
"\n",
"> **Note**: For this exercise you don't need to use Auth and Pagination."
"> **Note**: For this exercise you don't need to use authentication or pagination.\n"
]
},
{
@@ -843,7 +826,7 @@
"id": "lcBEFsCUuylN"
},
"source": [
"Play with API using requests library:\n"
"Play with the API using the requests library:\n"
]
},
{
@@ -853,9 +836,20 @@
"collapsed": true,
"id": "Ws7JhfPJvRTa"
},
"outputs": [],
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mRunning cells with 'dlt (Python 3.10.0)' requires the ipykernel package.\n",
"\u001b[1;31mInstall 'ipykernel' into the Python environment. \n",
"\u001b[1;31mCommand: '/Users/anuunchinbat/Documents/GitHub/dlt/.venv/bin/python -m pip install ipykernel -U --force-reinstall'"
]
}
],
"source": [
"import requests\n",
"from dlt.sources.helpers import requests\n",
"\n",
"response = requests.get(\"https://api.github.com/orgs/dlt-hub/repos\")\n",
"response.json()[0]"
@@ -867,7 +861,7 @@
"id": "7PUyt5LAXEMY"
},
"source": [
"In the code snippet below you will find an **example** for the **`events`** endpoint:"
"In the code snippet below, you will find an **example** for the **`events`** endpoint:"
]
},
{
@@ -889,20 +883,23 @@
" yield response.json()\n",
"\n",
"\n",
"# here is your code\n",
"print(\"build the `github_repos` resource here\")\n",
"\n",
"\n",
"@dlt.source\n",
"def github_data() -> Iterable[DltResource]:\n",
" return (github_events,) # github_repos\n",
" return (github_events,)\n",
"\n",
"\n",
"print(\"return your new resource as part of the source above\")\n",
"\n",
"\n",
"# Set pipeline name, destination, and dataset name\n",
"pipeline = dlt.pipeline(\n",
"github_pipeline = dlt.pipeline(\n",
" pipeline_name=\"github_pipeline\", destination=\"duckdb\", dataset_name=\"github_data\"\n",
")\n",
"\n",
"load_info = pipeline.run(github_data())\n",
"load_info = github_pipeline.run(github_data())\n",
"print(load_info)"
]
},
@@ -913,7 +910,7 @@
},
"source": [
"### Question\n",
"How many columns has the `github_repos` table? Use `duckdb` connection, `sql_client` or `pipeline.dataset()`."
"How many columns has the `github_repos` table? Use a `duckdb` connection, `sql_client` or `pipeline.dataset()`."
]
},
{
@@ -922,14 +919,15 @@
"id": "mYfeMBI82Tg0"
},
"source": [
"## **Exercise 2: Create a pipeline for GitHub API - stargazers endpoint**\n",
"## **Exercise 2: Create a pipeline for the GitHub API stargazers endpoint**\n",
"\n",
"Create a `dlt.transformer` for the \"stargazers\" endpoint\n",
"https://api.github.com/repos/OWNER/REPO/stargazers for `dlt-hub` organization.\n",
"Create a `dlt.transformer` for the **\"stargazers\"** endpoint \n",
"`https://api.github.com/repos/OWNER/REPO/stargazers` for the `dlt-hub` organization.\n",
"\n",
"Use `github_repos` resource as a main resource for the transformer:\n",
"1. Get all `dlt-hub` repositories.\n",
"2. Feed these repository names to dlt transformer and get all stargazers for all `dlt-hub` repositories."
"Use the `github_repos` resource as the main resource for the transformer:\n",
"\n",
"1. Get all repositories in the `dlt-hub` organization. \n",
"2. Feed these repository names into the `dlt` transformer and retrieve all stargazers for all `dlt-hub` repositories.\n"
]
},
{
@@ -940,7 +938,7 @@
},
"outputs": [],
"source": [
"# here is your code"
"print(\"YOUR CODE GOES HERE\")"
]
},
{
@@ -950,7 +948,7 @@
},
"source": [
"### Question\n",
"How many columns has the `github_stargazer` table? Use `duckdb` connection, `sql_client` or `pipeline.dataset()`."
"How many columns has the `github_stargazer` table? Use a `duckdb` connection, `sql_client` or `pipeline.dataset()`."
]
},
{
@@ -959,7 +957,7 @@
"id": "NYbccmLie1zm"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1-jVNzMJTRYHhbRlXgGFlhMwdML1L9zMx#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)!"
]
}
],
@@ -970,11 +968,13 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"display_name": "dlt",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"name": "python",
"version": "3.10.0"
}
},
"nbformat": 4,

View File

@@ -0,0 +1,745 @@
# /// script
# dependencies = [
# "dlt",
# "numpy",
# "pandas",
# "pymysql",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""# **dlt sources and resources**: Create your first dlt pipeline [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_2_dlt_sources_and_resources_Create_first_dlt_pipeline_img1.png](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_2_dlt_sources_and_resources_Create_first_dlt_pipeline_img1.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Recap of [Lesson 1](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) 👩‍💻🚀
1. Created a pipeline, loaded toy data into DuckDB, and viewed the load info.
2. Used the `dlt.pipeline` and `pipeline.run` methods.
3. Queried data and viewed tables with DuckDB, the `sql_client`, and the dlt `dataset`.
Now, let's move on to the next lesson to learn more! 🚀
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**Here, you will learn how to:**
- Run a simple pipeline with different types of data, such as dataframes, databases and REST APIs.
- Use `dlt.resource`, `dlt.source` and `dlt.transformer`.
- Build your first dlt pipeline for a REST API.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **`dlt` resources**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### List of dicts
In the previous lesson, we simply used a list of dictionaries that essentially represents the `pokemon` table.
""")
return
@app.cell
def _():
import dlt
    # Sample data containing pokemon details
    data = [
        {"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}},
        {"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}},
        {"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}},
    ]
    # Set pipeline name, destination, and dataset name
    pipeline = dlt.pipeline(
        pipeline_name="quick_start", destination="duckdb", dataset_name="mydata"
    )
    # Run the pipeline with data and table name
    _load_info = pipeline.run(data, table_name="pokemon")
    print(_load_info)
return data, dlt, pipeline
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""A better way is to wrap it in the `@dlt.resource` decorator which denotes a logical grouping of data within a data source, typically holding data of similar structure and origin:"""
)
return
@app.cell
def _(data, dlt):
from dlt.common.typing import TDataItems, TDataItem
@dlt.resource(table_name="pokemon_new")
def my_dict_list() -> TDataItems:
# Create a dlt resource from the data
yield data
return TDataItem, TDataItems, my_dict_list
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Commonly used arguments:
* **`name`**: The resource name and the name of the table generated by this resource. Defaults to the decorated function name.
* **`table_name`**: The name of the table, if different from the resource name.
* **`write_disposition`**: Controls how to write data to a table. Defaults to the value "append".
""")
return
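@app.cell
def _(data, dlt, pipeline):
    # Editor's sketch (not part of the original lesson): the same toy data wrapped in
    # a resource that sets the arguments listed above explicitly. The resource name
    # below is only illustrative; with write_disposition="replace", re-running it
    # overwrites the table instead of appending duplicate rows.
    @dlt.resource(name="pokemon_replace_demo", write_disposition="replace")
    def pokemon_replace_demo():
        yield data
    _load_info = pipeline.run(pokemon_replace_demo)
    print(_load_info)
    return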
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""> **Why is it a better way?** This allows you to use `dlt` functionalities to the fullest that follow Data Engineering best practices, including incremental loading and data contracts."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Try running the pipeline with the `my_dict_list` resource:""")
return
@app.cell
def _(my_dict_list, pipeline):
# Run the pipeline and print load info
_load_info = pipeline.run(my_dict_list)
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Check what was loaded to the `pokemon_new` table:""")
return
@app.cell
def _(pipeline):
pipeline.dataset().pokemon_new.df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Instead of a dict list, the data could also be a/an:
- dataframe
- database query response
- API request response
- Anything you can transform into JSON/dict format
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### Dataframes
To create a pipeline using dataframes, you would do:
""")
return
@app.cell
def _(TDataItems, dlt, pipeline):
import pandas as pd
@dlt.resource(table_name="df_data")
# Define a resource to load data from a CSV
def my_df() -> TDataItems:
sample_df = pd.read_csv(
"https://people.sc.fsu.edu/~jburkardt/data/csv/hw_200.csv"
)
yield sample_df
_load_info = pipeline.run(my_df)
print(_load_info)
# Run the pipeline with the defined resource
# Query the loaded data from 'df_data'
pipeline.dataset().df_data.df()
return (my_df,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### Databases
To create a pipeline from an SQL database query you would:
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""1. Create and run a pipeline to fetch data from an SQL resource and query the loaded data as follows:"""
)
return
@app.cell
def _(TDataItems, dlt, pipeline):
from sqlalchemy import create_engine
@dlt.resource(table_name="genome_data")
def get_genome_data() -> TDataItems:
engine = create_engine(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
)
with engine.connect() as conn:
query = "SELECT * FROM genome LIMIT 1000"
rows = conn.execution_options(yield_per=100).exec_driver_sql(query)
yield from map(lambda row: dict(row._mapping), rows)
_load_info = pipeline.run(get_genome_data)
print(_load_info)
pipeline.dataset().genome_data.df()
return (get_genome_data,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### REST API
For REST API endpoints, create a pipeline as follows:
""")
return
@app.cell
def _(TDataItems, dlt, pipeline):
from dlt.sources.helpers import requests
@dlt.resource(table_name="pokemon_api")
# Define a resource to fetch pokemons from PokeAPI
def get_pokemon() -> TDataItems:
url = "https://pokeapi.co/api/v2/pokemon"
response = requests.get(url)
yield response.json()["results"]
_load_info = pipeline.run(get_pokemon)
print(_load_info)
# Run the pipeline using the defined resource
# Query the loaded data from 'pokemon_api' table
pipeline.dataset().pokemon_api.df()
return get_pokemon, requests
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Try loading everything above, in a single pipeline:""")
return
@app.cell
def _(get_genome_data, get_pokemon, my_df, pipeline):
_load_info = pipeline.run([my_df, get_genome_data, get_pokemon])
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Check which new tables were created:""")
return
@app.cell
def _(pipeline):
# List all table names from the database
with pipeline.sql_client() as client:
with client.execute_query(
"SELECT table_name FROM information_schema.tables"
) as table:
print(table.df())
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **`dlt` sources**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Now that there are multiple `dlt` resources, each corresponding to a separate table, we can group them into a `dlt` source."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_2_dlt_sources_and_resources_Create_first_dlt_pipeline_img2](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_2_dlt_sources_and_resources_Create_first_dlt_pipeline_img2.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
A source is a logical grouping of resources, e.g., endpoints of a single API. The most common approach is to define it in a separate Python module.
* A source is a function decorated with `@dlt.source` that returns one or more resources.
* A source can optionally define a schema with tables, columns, performance hints, and more.
* The source Python module typically contains optional customizations and data transformations.
* The source Python module typically contains the authentication and pagination code for a particular API.
Read more about [sources](https://dlthub.com/docs/general-usage/source) and [resources](https://dlthub.com/docs/general-usage/resource).
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
You declare a source by decorating a function that returns or yields one or more resources with `@dlt.source`.
Here's how it's done:
""")
return
@app.cell
def _(dlt, get_genome_data, get_pokemon, my_df):
from typing import Iterable
from dlt.extract import DltResource
@dlt.source
def all_data() -> Iterable[DltResource]:
return my_df, get_genome_data, get_pokemon
return DltResource, Iterable, all_data
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Only using the source above, load everything into a separate database using a new pipeline:"""
)
return
@app.cell
def _(all_data, dlt):
# Create a pipeline
new_pipeline = dlt.pipeline(
pipeline_name="resource_source_new",
destination="duckdb",
dataset_name="all_data",
)
    # Run the pipeline
    _load_info = new_pipeline.run(all_data())
    # Print load info
    print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
> **Why does this matter?**:
- It is more efficient than running your resources separately.
- It organizes both your schema and your code. 🙂
- It enables the option for parallelization.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **`dlt` transformers**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
We now know that `dlt` resources can be grouped into a `dlt` source, represented as:
```
Source
/ \
Resource 1 ... Resource N
```
However, imagine a scenario where you need an additional step in between:
```
Source
/ \
step \
/ \
Resource 1 ... Resource N
```
This step could arise, for example, in a situation where:
- Resource 1 returns a list of pokemon IDs, and you need to use each of those IDs to retrieve detailed information about the pokemons from a separate API endpoint.
In such cases, you would use `dlt` transformers — special `dlt` resources that can be fed data from another resource:
```
Source
/ \
Transformer \
/ \
Resource 1 ... Resource N
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Lets assume Resource 1 is:""")
return
@app.cell
def _(TDataItems, dlt):
@dlt.resource(table_name="pokemon")
def my_pokemons() -> TDataItems:
pokemons = [
{"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}},
{"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}},
{"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}},
]
yield pokemons
return (my_pokemons,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""We need to get detailed information about pokemons from [PokeAPI](https://pokeapi.co/) `"https://pokeapi.co/api/v2/pokemon/{id}"` based on their IDs. We would do:"""
)
return
@app.cell
def _(TDataItems, dlt, my_pokemons, requests):
# Define a transformer to enrich pokemon data with additional details
    # NOTE: the `items` argument contains data from the `my_pokemons` resource
@dlt.transformer(data_from=my_pokemons, table_name="detailed_info")
def poke_details(
items: TDataItems,
) -> TDataItems:
for item in items:
print(f"Item: {item}\n")
item_id = item["id"]
url = f"https://pokeapi.co/api/v2/pokemon/{item_id}"
response = requests.get(url)
details = response.json()
print(f"Details: {details}\n")
yield details
# Set pipeline name, destination, and dataset name
another_pipeline = dlt.pipeline(
pipeline_name="quick_start",
destination="duckdb",
dataset_name="pokedata",
dev_mode=True,
)
return another_pipeline, poke_details
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Run the pipeline:""")
return
@app.cell
def _(another_pipeline, poke_details):
_load_info = another_pipeline.run(poke_details())
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Alternatively, we could do:""")
return
@app.cell
def _(TDataItem, TDataItems, another_pipeline, dlt, requests):
@dlt.resource(table_name="pokemon")
def my_other_pokemons() -> TDataItems:
pokemons = [
{"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}},
{"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}},
{"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}},
]
yield from pokemons
    # NOTE: Transformer receives one item at a time
    @dlt.transformer(data_from=my_other_pokemons, table_name="detailed_info")
    def other_poke_details(data_item: TDataItem) -> TDataItems:
        item_id = data_item["id"]
        url = f"https://pokeapi.co/api/v2/pokemon/{item_id}"
        response = requests.get(url)
        details = response.json()
        yield details
_load_info = another_pipeline.run(other_poke_details())
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""You can also use pipe instead of `data_from`, this is useful when you want to apply `dlt.transformer` to multiple `dlt.resources`:"""
)
return
@app.cell
def _(another_pipeline, my_pokemons, poke_details):
_load_info = another_pipeline.run(my_pokemons | poke_details)
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Check the loaded data:""")
return
@app.cell
def _(another_pipeline):
# Query the 'detailed_info' table and convert the result to a DataFrame
another_pipeline.dataset().detailed_info.df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Reduce the nesting level of generated tables**
You can limit how deep dlt goes when generating nested tables and flattening dicts into columns. By default, the library will descend and generate nested tables for all nested lists, without limit.
You can set nesting level for all resources on the source level:
```python
@dlt.source(max_table_nesting=1)
def all_data():
return my_df, get_genome_data, get_pokemon
```
or for each resource separately:
```python
@dlt.resource(table_name='pokemon_new', max_table_nesting=1)
def my_dict_list():
yield data
```
In the example above, we want only 1 level of nested tables to be generated (so there are no nested tables of a nested table). Typical settings:
* `max_table_nesting=0` will not generate nested tables and will not flatten dicts into columns at all. All nested data will be represented as JSON.
* `max_table_nesting=1` will generate nested tables of root tables and nothing more. All nested data in nested tables will be represented as JSON.
""")
return
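@app.cell
def _(dlt, pipeline, requests):
    # Editor's sketch (not part of the original lesson): with max_table_nesting=0, the
    # nested lists and dicts in a PokeAPI response (abilities, moves, sprites, ...) are
    # kept as JSON in the root table instead of becoming nested tables or flattened
    # columns. The table name below is only illustrative.
    @dlt.resource(table_name="pokemon_flat", max_table_nesting=0)
    def pokemon_flat():
        response = requests.get("https://pokeapi.co/api/v2/pokemon/25")
        yield response.json()
    _load_info = pipeline.run(pokemon_flat)
    print(_load_info)
    return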
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Exercise 1: Create a pipeline for GitHub API repos endpoint**
In this exercise, you'll build a dlt pipeline to fetch data from the GitHub REST API. The goal is to learn how to use `dlt.pipeline`, `dlt.resource`, and `dlt.source` to extract and load data into a destination.
## Instructions
1. **Explore the GitHub API**
Visit the [GitHub REST API Docs](https://docs.github.com/en/rest) to understand the endpoint to [list public repositories](https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28) for an organization:
`GET https://api.github.com/orgs/{org}/repos`
2. **Build the pipeline**
Write a script to:
- Fetch repositories for the **dlt-hub** organization.
- Use `dlt.resource` to define the data extraction logic.
- Combine all resources into a single `@dlt.source`.
- Load the data into a DuckDB database.
3. **Inspect the data**
Use a `duckdb` connection, `sql_client`, or `pipeline.dataset()`.
> **Note**: For this exercise you don't need to use authentication or pagination.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Play with the API using the requests library:""")
return
@app.cell
def _(requests):
response = requests.get("https://api.github.com/orgs/dlt-hub/repos")
response.json()[0]
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""In the code snippet below, you will find an **example** for the **`events`** endpoint:"""
)
return
@app.cell
def _(DltResource, Iterable, TDataItems, dlt, requests):
@dlt.resource
def github_events() -> TDataItems:
url = "https://api.github.com/orgs/dlt-hub/events"
response = requests.get(url)
yield response.json()
print("build the `github_repos` resource here")
@dlt.source
def github_data() -> Iterable[DltResource]:
return (github_events,)
print("return your new resource as part of the source above")
github_pipeline = dlt.pipeline(
pipeline_name="github_pipeline",
destination="duckdb",
dataset_name="github_data",
)
_load_info = github_pipeline.run(github_data())
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Question
How many columns does the `github_repos` table have? Use a `duckdb` connection, `sql_client`, or `pipeline.dataset()`.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **Exercise 2: Create a pipeline for the GitHub API stargazers endpoint**
Create a `dlt.transformer` for the **"stargazers"** endpoint
`https://api.github.com/repos/OWNER/REPO/stargazers` for the `dlt-hub` organization.
Use the `github_repos` resource as the main resource for the transformer:
1. Get all repositories in the `dlt-hub` organization.
2. Feed these repository names into the `dlt` transformer and retrieve all stargazers for all `dlt-hub` repositories.
""")
return
@app.cell
def _():
print("YOUR CODE GOES HERE")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Question
How many columns does the `github_stargazer` table have? Use a `duckdb` connection, `sql_client`, or `pipeline.dataset()`.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)!"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,13 +6,13 @@
"id": "MfQUdpVg2Trs"
},
"source": [
"# **Recap of [Lesson 2](https://colab.research.google.com/drive/1tc94GvIoYXmYrjUibDhY_9iPR5zA0Eyw#forceEdit=true&sandboxMode=true) 👩‍💻🚀**\n",
"# **Recap of [Lesson 2](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) 👩‍💻🚀**\n",
"\n",
"1. Used `@dlt.resource` to load and query data like lists, dataframes, and REST API responses into DuckDB. \n",
"2. Grouped multiple resources into a single `@dlt.source` for better organization and efficiency. \n",
"3. Used `@dlt.transformer` to process and enrich data between resources. \n",
"1. Used `@dlt.resource` to load and query data such as lists, dataframes, and REST API responses into DuckDB. \n",
"2. Grouped multiple resources into a single `@dlt.source` for better organization and efficiency. \n",
"3. Used `@dlt.transformer` to process and enrich data between resources. \n",
"\n",
"Next: Dive deeper into building dlt pipelines using pagination, authentication and dlt configuration! 🚀"
"Next: We'll dive deeper into building dlt pipelines using pagination, authentication, and dlt configuration! 🚀"
]
},
{
@@ -23,16 +23,16 @@
"source": [
"---\n",
"\n",
"# **Pagination & Authentication & dlt Configuration** 🤫🔩 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)\n",
"# **Pagination & Authentication & dlt Configuration** 🤫🔩 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)\n",
"\n",
"\n",
"\n",
"**Here, you will learn how to:**\n",
"- Use pagination for RestAPIs.\n",
"- Use environment variables to handle both secrets & configs.\n",
"**In this lesson, you will learn how to:**\n",
"- Use pagination for REST APIs.\n",
"- Use environment variables to manage both secrets & configs.\n",
"- Add values to `secrets.toml` or `config.toml`.\n",
"\n",
"To read more about credentails refer to [dlt documentation](https://dlthub.com/docs/general-usage/credentials/) here."
"To learn more about credentials, refer to the [dlt documentation](https://dlthub.com/docs/general-usage/credentials/)."
]
},
{
@@ -41,7 +41,7 @@
"id": "aAN9q0Kz0tt_"
},
"source": [
"In previous lesson we loaded data from GitHub API to DuckDB,"
"In the previous lesson, we loaded data from the GitHub API to DuckDB,"
]
},
{
@@ -53,7 +53,7 @@
"outputs": [],
"source": [
"%%capture\n",
"!pip install dlt"
"!pip install \"dlt[duckdb]\""
]
},
{
@@ -78,14 +78,14 @@
"\n",
"\n",
"# define dlt pipeline\n",
"pipeline = dlt.pipeline(destination=\"duckdb\")\n",
"_pipeline = dlt.pipeline(destination=\"duckdb\")\n",
"\n",
"# run dlt pipeline\n",
"load_info = pipeline.run(github_events)\n",
"load_info = _pipeline.run(github_events)\n",
"print(load_info)\n",
"\n",
"# explore loaded data\n",
"pipeline.dataset().github_events.df()"
"_pipeline.dataset().github_events.df()"
]
},
{
@@ -94,9 +94,9 @@
"id": "GtyMwBig37uK"
},
"source": [
"You could notice that we received only 1 page, only 30 records. But this endpoint has muuuch more records in total. To get all the pages you should use a pagination.\n",
"You may notice we received only one page — just 30 records — even though this endpoint has many more.\n",
"\n",
"When working with APIs like GitHub, data is often returned in pages. Pagination allows you to retrieve all the data when an endpoint limits how much can be fetched at once."
"To fetch everything, enable pagination: many APIs (like GitHub) return results in pages and limit how much you can retrieve per request, so paginating lets you iterate through all pages to collect the full dataset."
]
},
{
@@ -114,14 +114,16 @@
"id": "BolhMQE10Zgk"
},
"source": [
"---\n",
"## **Pagination**\n",
"\n",
"GitHub has very good documentation, so it is not difficult to go through the documentation and find the relevant page: [Pagination.](https://docs.github.com/en/rest/using-the-rest-api/using-pagination-in-the-rest-api?apiVersion=2022-11-28)\n",
"GitHub provides excellent documentation, making it easy to find the relevant section on [Pagination.](https://docs.github.com/en/rest/using-the-rest-api/using-pagination-in-the-rest-api?apiVersion=2022-11-28)\n",
"\n",
"It says:\n",
">You can use the `link` header from the response to request additional pages of data.\n",
"It explains that:\n",
"\n",
">The link header contains URLs that you can use to fetch additional pages of results. For example, the previous, next, first, and last page of results."
">You can use the `Link` header from the response to request additional pages of data.\n",
"\n",
">The `Link` header contains URLs that let you fetch other pages of results — for example, the previous, next, first, and last pages."
]
},
{
@@ -130,7 +132,7 @@
"id": "iU-xQriAHJI2"
},
"source": [
"**GitHub API Pagination example**\n",
"**GitHub API Pagination Example**\n",
"\n",
"The GitHub API provides the `per_page` and `page` query parameters:\n",
"\n",
@@ -146,8 +148,6 @@
},
"outputs": [],
"source": [
"import requests\n",
"\n",
"response = requests.get(\"https://api.github.com/orgs/dlt-hub/events?per_page=10&page=1\")\n",
"response.headers"
]
@@ -158,7 +158,7 @@
"id": "ZdDGuAVJ4Qqo"
},
"source": [
"Gotcha! We can see 'Link' in the headers. To get this link we can alternatively use `response.links`:"
"Got it! We can see the `Link` field in the response headers. Alternatively, you can access it directly using `response.links`:"
]
},
{
@@ -169,8 +169,6 @@
},
"outputs": [],
"source": [
"import requests\n",
"\n",
"response = requests.get(\"https://api.github.com/orgs/dlt-hub/events?per_page=10&page=1\")\n",
"response.links"
]
@@ -183,18 +181,17 @@
"source": [
"### **dlt RESTClient**\n",
"\n",
"The response includes a 'Link' header for navigating to the next page.\n",
"So now we can implement a pagination!\n",
"Now that we know how pagination works conceptually, lets see how to implement it efficiently!\n",
"\n",
"When working with APIs, you could implement pagination using only Python and the requests library. While this approach works, it often requires writing boilerplate code for tasks like managing authentication, constructing URLs, and handling pagination logic.\n",
"When working with APIs, you could implement pagination using only Python and the `requests` library. While this approach works, it often requires writing boilerplate code for tasks like managing authentication, constructing URLs, and handling pagination logic.\n",
"\n",
"More about how to build pagination with Python and `requests`:\n",
"Learn more about building pagination with Python and `requests`:\n",
"\n",
"* [Link 1](https://farnamdata.com/api-pagination)\n",
"\n",
"* [Link 2](https://www.klamp.io/blog/python-requests-pagination-for-efficient-data-retrieval)\n",
"\n",
"**But!** In this lesson, were gonna use dlt's **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** to handle pagination seamlessly when working with REST APIs like GitHub.\n",
"**But!** In this lesson, were going to use dlt's **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** to handle pagination seamlessly when working with REST APIs like GitHub.\n",
"\n",
"\n",
"**Why use RESTClient?**\n",
@@ -208,9 +205,9 @@
"This reduces boilerplate code and lets you focus on your data pipeline logic.\n",
"\n",
"**Heres how to fetch paginated data:**\n",
"1. Import RESTClient\n",
"2. Create the RESTClient instance\n",
"3. Use the `paginate` method to iterate through all pages of data."
"1. Import `RESTClient`\n",
"2. Create a `RESTClient` instance\n",
"3. Use the `paginate` method to iterate through all pages of data"
]
},
{
@@ -238,7 +235,7 @@
"id": "yNB8jyz5Kmo1"
},
"source": [
"Pagination type was detected automatically, but you can explicitly provide it:"
"☝️ The pagination type was detected automatically, but you can also specify it explicitly:"
]
},
{
@@ -249,7 +246,6 @@
},
"outputs": [],
"source": [
"from dlt.sources.helpers.rest_client import RESTClient\n",
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
"\n",
"client = RESTClient(\n",
@@ -264,7 +260,7 @@
"id": "_jNBmv1qkUhk"
},
"source": [
"The full list of available paginators you can see in offcial [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#paginators).\n"
"The full list of available paginators is in the official [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#paginators).\n"
]
},
{
@@ -278,13 +274,11 @@
},
{
"cell_type": "markdown",
"metadata": {
"id": "Dqi7NQtqhfeb"
},
"metadata": {},
"source": [
"The events endpoint does not have as much data, specially if you compare it with the stargazers endpoint for the dlt repo.\n",
"The events endpoint doesnt contain as much data, especially compared to the stargazers endpoint of the dlt repository.\n",
"\n",
"If you run the pipeline for stargazers endpoint, there is a high chance that you face the **rate limit error**."
"If you run the pipeline for the stargazers endpoint, there's a high chance that you'll face a **rate limit error**."
]
},
{
@@ -295,13 +289,6 @@
},
"outputs": [],
"source": [
"from dlt.sources.helpers.rest_client import RESTClient\n",
"\n",
"\n",
"client = RESTClient(\n",
" base_url=\"https://api.github.com\",\n",
")\n",
"\n",
"for page in client.paginate(\"repos/dlt-hub/dlt/stargazers\"):\n",
" print(page)"
]
@@ -324,24 +311,22 @@
"id": "iKUgNTKuiP6w"
},
"source": [
"---\n",
"## **Authentication**\n",
"\n",
"To avoid this error you can use [GitHub API Authentication](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28):\n",
"To avoid the **rate limit error** you can use [GitHub API Authentication](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28):\n",
"\n",
"1. Login to your GitHub account.\n",
"2. Generate [API token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) (classic one!).\n",
"2. Use it as an access token for GitHub API."
"2. Generate an [API token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) (classic).\n",
"2. Use it as an access token for the GitHub API."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-7ZHBjYspQxt"
},
"metadata": {},
"source": [
"**! ATTENTION !**\n",
"\n",
"Never share your credentials in public and never hard-code them in your code. Use **environment variables** or **dlt secrets.toml**."
"> **! ATTENTION !**\n",
"> Never share your credentials publicly and never hard-code them in your code. Use **environment variables, files** or dlt's **secrets.toml**."
]
},
{
@@ -350,11 +335,18 @@
"id": "UB02kiI8ncYm"
},
"source": [
"Create an environment variable for your access token.\n",
"Create an environment variable for your access token in Colab.\n",
"\n",
"![Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img3](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img3.webp)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In Molab, simply click on the `Secrets` section in the left-side menu and add your access token."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -375,7 +367,7 @@
"id": "6bdNZJ0HqY4O"
},
"source": [
"So now you can use `access_token` variable in the code below:"
"Use the `access_token` variable in the code below:"
]
},
{
@@ -386,13 +378,12 @@
},
"outputs": [],
"source": [
"from dlt.sources.helpers.rest_client import RESTClient\n",
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
"\n",
"\n",
"client = RESTClient(\n",
" base_url=\"https://api.github.com\",\n",
" auth=BearerTokenAuth(token=access_token), # <--- put your token here\n",
" auth=BearerTokenAuth(token=access_token),\n",
")\n",
"\n",
"for page in client.paginate(\"repos/dlt-hub/dlt/stargazers\"):\n",
@@ -406,7 +397,7 @@
"id": "D7-rTvYvr05t"
},
"source": [
"So now we can rewrite our GitHub dlt pipeline using the RestAPI Client and `access_token`."
"Let's rewrite our GitHub dlt pipeline using the RestAPI Client and the `access_token`."
]
},
{
@@ -418,7 +409,6 @@
"outputs": [],
"source": [
"import dlt\n",
"from dlt.sources.helpers import requests\n",
"from dlt.sources.helpers.rest_client import RESTClient\n",
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
"\n",
@@ -435,16 +425,16 @@
"\n",
"\n",
"# define new dlt pipeline\n",
"pipeline = dlt.pipeline(destination=\"duckdb\")\n",
"_pipeline = dlt.pipeline(destination=\"duckdb\")\n",
"\n",
"\n",
"# run the pipeline with the new resource\n",
"load_info = pipeline.run(github_stargazers)\n",
"load_info = _pipeline.run(github_stargazers)\n",
"print(load_info)\n",
"\n",
"\n",
"# explore loaded data\n",
"pipeline.dataset().github_stargazers.df()"
"_pipeline.dataset().github_stargazers.df()"
]
},
{
@@ -462,6 +452,7 @@
"id": "SxpBIZZ_yE8R"
},
"source": [
"---\n",
"## **dlt configuration and secrets**\n",
"\n",
"In dlt, [configurations and secrets](https://dlthub.com/docs/general-usage/credentials/) are essential for setting up data pipelines.\n",
@@ -470,15 +461,13 @@
"\n",
"On the other hand, **secrets** are **sensitive** data like passwords, API keys, and private keys, which should never be hard-coded to avoid security risks.\n",
"\n",
"These can be set up in various ways:\n",
"Both can be set up in various ways:\n",
"\n",
"* Environment variables\n",
"* As environment variables\n",
"* Within code using `dlt.secrets` and `dlt.config`\n",
"* Configuration files (`secrets.toml` and `config.toml`)\n",
"* Via configuration files (`secrets.toml` and `config.toml`)\n",
"\n",
"We're gonna use `dlt.secrets.value` to define credentials in resources and sources. dlt automatically **extracts** configuration settings and secrets based on flexible naming conventions. It then **injects** these values where needed in code.\n",
"\n",
"**Note**: It's important to note that while you can put all configurations and credentials in the `dlt.secrets` (or `secrets.toml`) if it's more convenient, credentials cannot be placed in `dlt.config` (or `config.toml`) because dlt doesn't look for them there.\n"
"> **Note**: While you can store both configurations and credentials in `dlt.secrets` (or `secrets.toml`) if thats more convenient, credentials cannot be placed in `dlt.config` (or `config.toml`) because dlt does not read them from there."
]
},
{
@@ -487,9 +476,9 @@
"id": "64JM2Lnlxyoa"
},
"source": [
"Let's create dlt pipeline for both endpoints: `repos/dlt-hub/dlt/stargazers` and `orgs/dlt-hub/events`.\n",
"Let's create a dlt pipeline for both endpoints: `repos/dlt-hub/dlt/stargazers` and `orgs/dlt-hub/events`.\n",
"\n",
"We'll use `@dlt.source` to combine all resources in one place."
"We'll use `@dlt.source` to group both resources."
]
},
{
@@ -533,7 +522,7 @@
"id": "0h3ugsRiLhfv"
},
"source": [
"Now we'll use `dlt.secrets.value` in our source to enable dlt secrets configuration. Rename `access_token` variable to `secret_key` because it's already defined.\n",
"Now, we'll use `dlt.secrets.value` in our source, enabling dlt's automatic secrets resolution. Note that we first reset all environment variables to demonstrate what happens if dlt tries to resolve a non-existing variable:\n",
"\n"
]
},
@@ -545,7 +534,7 @@
},
"outputs": [],
"source": [
"exit() # we use exit() to reset all ENVs we set"
"os.environ.clear()"
]
},
{
@@ -559,7 +548,6 @@
"from typing import Iterable\n",
"import dlt\n",
"from dlt.extract import DltResource\n",
"from dlt.sources.helpers import requests\n",
"from dlt.sources.helpers.rest_client import RESTClient\n",
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
"from dlt.common.typing import TDataItems\n",
@@ -568,7 +556,7 @@
"@dlt.source\n",
"def github_source(\n",
" access_token=dlt.secrets.value,\n",
") -> Iterable[DltResource]: # <--- set the secret variable \"access_token\" here\n",
") -> Iterable[DltResource]:\n",
" client = RESTClient(\n",
" base_url=\"https://api.github.com\", auth=BearerTokenAuth(token=access_token)\n",
" )\n",
@@ -592,7 +580,7 @@
"id": "H-wNVUqfuD37"
},
"source": [
"Configs are defined in a similar way but are accessed using `dlt.config.value`. However, since configuration variables are internally managed by `dlt`, it is unlikely that you would need to explicitly use `dlt.config.value` in most cases."
"> Configs are defined in a similar way but are accessed using `dlt.config.value`. However, since configuration variables are internally managed by `dlt`, it is unlikely that you would need to explicitly use `dlt.config.value` in most cases."
]
},
{
@@ -601,7 +589,7 @@
"id": "shfeHo-vOcD1"
},
"source": [
"If you run the pipeline with `secret_key` as `dlt.secrets.value`, you will see the following error:"
"If you now run the pipeline, you will see the following error:"
]
},
{
@@ -613,11 +601,11 @@
"outputs": [],
"source": [
"# define new dlt pipeline\n",
"pipeline = dlt.pipeline(destination=\"duckdb\")\n",
"_pipeline = dlt.pipeline(destination=\"duckdb\")\n",
"\n",
"\n",
"# run the pipeline with the new resource\n",
"load_info = pipeline.run(github_source())\n",
"load_info = _pipeline.run(github_source())\n",
"print(load_info)"
]
},
@@ -627,9 +615,9 @@
"id": "GCmqzzo7OpgE"
},
"source": [
"^ That is what happens if you set `dlt.secrets.value` for any variable in your dlt pipeline, but don't set the secret value up.\n",
"Thats what happens when you use `dlt.secrets.value` for a variable in your pipeline but havent actually set the secret value.\n",
"\n",
"dlt is looking for secrets in following formats:\n",
"When this occurs, dlt searches for the missing secret across different possible locations and naming formats, as shown below:\n",
"\n",
"```python\n",
"ConfigFieldMissingException: Following fields are missing: ['access_token'] in configuration with spec GithubSourceConfiguration\n",
@@ -654,10 +642,10 @@
"id": "Ox08B2V5NCaH"
},
"source": [
"To define `access_token` secret value we can use:\n",
"To define the `access_token` secret value, we can use (as mentioned earlier):\n",
"\n",
"1. `dlt.secrets` in code (recommended for secret vaults or dynamic creds)\n",
"2. Environment variables (recomnended for prod)\n",
"2. Environment variables (recommended for prod)\n",
"3. `secrets.toml` file (recommended for local dev)"
]
},
@@ -669,7 +657,7 @@
"source": [
"### **Use `dlt.secrets` in code**\n",
"\n",
"You can easily rewrite your secret right in the Python code. It's especially convenient if you take credentials from third-party secret providers, or if you want to update credentials and configs dinamically."
"You can easily set or update your secrets directly in Python code. This is especially convenient when retrieving credentials from third-party secret managers or when you need to update secrets and configurations dynamically."
]
},
{
@@ -680,17 +668,15 @@
},
"outputs": [],
"source": [
"import os\n",
"from google.colab import userdata\n",
"\n",
"dlt.secrets[\"access_token\"] = userdata.get(\"SECRET_KEY\")\n",
"\n",
"# define new dlt pipeline\n",
"pipeline = dlt.pipeline(destination=\"duckdb\")\n",
"\n",
"github_pipeline = dlt.pipeline(destination=\"duckdb\")\n",
"\n",
"# run the pipeline with the new resource\n",
"load_info = pipeline.run(github_source())\n",
"load_info = github_pipeline.run(github_source())\n",
"print(load_info)"
]
},
@@ -700,7 +686,7 @@
"id": "GNghaiYwSBGm"
},
"source": [
"Alternatively you can set:\n",
"Alternatively, you can set:\n",
"\n",
"```python\n",
"dlt.secrets[\"sources.access_token\"] = userdata.get('SECRET_KEY')\n",
@@ -748,7 +734,7 @@
"id": "Adi1RZmOvVzj"
},
"source": [
"### **Exercise 2: Run pipeline with `dlt.secrets.value`**\n",
"### **Exercise 2: Run a pipeline with `dlt.secrets.value`**\n",
"\n",
"Explore the cells above and answer the question below using `sql_client`.\n",
"\n",
@@ -763,10 +749,9 @@
"id": "fQlOIe46ncYm"
},
"source": [
"---\n",
"### **Use environment variables**\n",
"\n",
"Let's set ENV in the one of the dlt formats: `ACCESS_TOKEN`.\n"
"Let's explicitly set the environment variable for our access token in one of the formats dlt accepts: `ACCESS_TOKEN`.\n"
]
},
{
@@ -777,17 +762,16 @@
},
"outputs": [],
"source": [
"import os\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")\n",
"\n",
"# define new dlt pipeline\n",
"pipeline = dlt.pipeline(destination=\"duckdb\")\n",
"_pipeline = dlt.pipeline(destination=\"duckdb\")\n",
"\n",
"\n",
"# run the pipeline with the new resource\n",
"load_info = pipeline.run(github_source())\n",
"load_info = _pipeline.run(github_source())\n",
"print(load_info)"
]
},
@@ -797,7 +781,9 @@
"id": "ppEFU1hJPU6c"
},
"source": [
"Alternatively you can set:\n",
"Alternatively, you can set:\n",
"\n",
"> `userdata.get()` is Colab-specific.\n",
"\n",
"```python\n",
"os.environ[\"SOURCES__ACCESS_TOKEN\"] = userdata.get('SECRET_KEY')\n",
@@ -831,7 +817,6 @@
"id": "l7Y1oCAvJ79I"
},
"source": [
"---\n",
"### **Use dlt `secrets.toml` or `config.toml`**\n"
]
},
@@ -841,7 +826,7 @@
"id": "mNzCp5BGpDSh"
},
"source": [
"> Please note that Colab is not well-suited for using `secrets.toml` or `config.toml` files. As a result, these sections will provide instructions rather than code cells, detailing how to use them in a local environment. You should test this functionality on your own machine. For Colab, it is recommended to use environment variables instead."
"> Note that Colab is not well-suited for using `secrets.toml` or `config.toml` files. As a result, these sections will provide instructions rather than code cells, detailing how to use them in a local environment. You should test this functionality on your own machine. For Colab, it is recommended to use environment variables instead."
]
},
{
@@ -862,7 +847,7 @@
"└── my_pipeline.py\n",
"```\n",
"\n",
"Read more about adding [credentials](https://dlthub.com/docs/walkthroughs/add_credentials) here."
"Read more about adding [credentials](https://dlthub.com/docs/walkthroughs/add_credentials)."
]
},
{
@@ -871,7 +856,7 @@
"id": "6bTyl229sadQ"
},
"source": [
"To set credentials via TOMLs you would first add your access token to `secrets.toml`:\n",
"To set credentials via the toml files, you would first add your access token to `secrets.toml`:\n",
"\n",
"```toml\n",
"# .dlt/secrets.toml\n",
@@ -889,13 +874,13 @@
},
"source": [
"\n",
"Alternatively you can set:\n",
"Alternatively, you can set:\n",
"\n",
"```\n",
"[sources]\n",
"secret_key = \"your_access_token\"\n",
"```\n",
"is equal to:\n",
"which is equal to:\n",
"\n",
"```\n",
"secret_key = \"your_access_token\"\n",
@@ -907,7 +892,7 @@
"[sources.____main____]\n",
"secret_key = \"your_access_token\"\n",
"```\n",
"and to:\n",
"as well as:\n",
"\n",
"```\n",
"[sources.____main____.github_source]\n",
@@ -922,11 +907,11 @@
},
"source": [
"\n",
"### **Configure Secrets in Colab**\n",
"### **Configure secrets in Colab**\n",
"\n",
"You can configure secrets using **Secrets** sidebar. Just create a variable with the name `secrets.toml` and paste the content of the toml file from your `.dlt` folder into it. We support `config.toml` variable as well.\n",
"You can configure secrets using the **Secrets** sidebar. Just create a variable with the name `secrets.toml` and paste the content of the toml file from your `.dlt` folder into it. We support `config.toml` variable as well.\n",
"\n",
"Open **Secrets** sidebar, press \"Add new secret\", create variable with name `secrets.toml` and copy-paste secrets in Value field and Enable it:\n",
"Open the **Secrets** sidebar, press `Add new secret`, create a variable with name `secrets.toml` and copy-paste secrets in the `Value` field and click `Enable`:\n",
"\n",
"```\n",
"[sources]\n",
@@ -934,7 +919,7 @@
"```\n",
"\n",
"\n",
">dlt will not reload the secrets automatically. **Please restart your interpreter** in Colab options when you add/change content of the variables above."
">dlt will not reload the secrets automatically. **Restart your interpreter** in Colab options when you add/change the variables above."
]
},
{
@@ -952,17 +937,8 @@
"id": "NYbccmLie1zm"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1mfqZulsuFDc7h27d6joe2_Dduvl1uM-2#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_7dLATtZkdQl"
},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,773 @@
# /// script
# dependencies = [
# "dlt[duckdb]",
# "numpy",
# "pandas",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **Recap of [Lesson 2](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) 👩‍💻🚀**
1. Used `@dlt.resource` to load and query data such as lists, dataframes, and REST API responses into DuckDB.
2. Grouped multiple resources into a single `@dlt.source` for better organization and efficiency.
3. Used `@dlt.transformer` to process and enrich data between resources.
Next: We'll dive deeper into building dlt pipelines using pagination, authentication, and dlt configuration! 🚀
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
# **Pagination & Authentication & dlt Configuration** 🤫🔩 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)
**In this lesson, you will learn how to:**
- Use pagination for REST APIs.
- Use environment variables to manage both secrets & configs.
- Add values to `secrets.toml` or `config.toml`.
To learn more about credentials, refer to the [dlt documentation](https://dlthub.com/docs/general-usage/credentials/).
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""In the previous lesson, we loaded data from the GitHub API to DuckDB,"""
)
return
@app.cell
def _():
import dlt
from dlt.sources.helpers import requests
from dlt.common.typing import TDataItems
@dlt.resource
# define dlt resources
def github_events() -> TDataItems:
url = "https://api.github.com/orgs/dlt-hub/events"
_response = requests.get(url)
yield _response.json()
_pipeline = dlt.pipeline(destination="duckdb")
_load_info = _pipeline.run(github_events)
print(_load_info)
# define dlt pipeline
# run dlt pipeline
# explore loaded data
_pipeline.dataset().github_events.df()
return TDataItems, dlt, requests
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
You may notice we received only one page — just 30 records — even though this endpoint has many more.
To fetch everything, enable pagination: many APIs (like GitHub) return results in pages and limit how much you can retrieve per request, so paginating lets you iterate through all pages to collect the full dataset.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img1.webp)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Pagination**
GitHub provides excellent documentation, making it easy to find the relevant section on [Pagination.](https://docs.github.com/en/rest/using-the-rest-api/using-pagination-in-the-rest-api?apiVersion=2022-11-28)
It explains that:
>You can use the `Link` header from the response to request additional pages of data.
>The `Link` header contains URLs that let you fetch other pages of results — for example, the previous, next, first, and last pages.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**GitHub API Pagination Example**
The GitHub API provides the `per_page` and `page` query parameters:
* `per_page`: The number of records per page (up to 100).
* `page`: The page number to retrieve.
""")
return
@app.cell
def _(requests):
_response = requests.get(
"https://api.github.com/orgs/dlt-hub/events?per_page=10&page=1"
)
_response.headers
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Got it! We can see the `Link` field in the response headers. Alternatively, you can access it directly using `response.links`:"""
)
return
@app.cell
def _(requests):
_response = requests.get(
"https://api.github.com/orgs/dlt-hub/events?per_page=10&page=1"
)
_response.links
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **dlt RESTClient**
Now that we know how pagination works conceptually, lets see how to implement it efficiently!
When working with APIs, you could implement pagination using only Python and the `requests` library. While this approach works, it often requires writing boilerplate code for tasks like managing authentication, constructing URLs, and handling pagination logic.
Learn more about building pagination with Python and `requests`:
* [Link 1](https://farnamdata.com/api-pagination)
* [Link 2](https://www.klamp.io/blog/python-requests-pagination-for-efficient-data-retrieval)
**But!** In this lesson, were going to use dlt's **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** to handle pagination seamlessly when working with REST APIs like GitHub.
**Why use RESTClient?**
RESTClient is part of dlt's helpers, making it easier to interact with REST APIs by managing repetitive tasks such as:
* Authentication
* Query parameter handling
* Pagination
This reduces boilerplate code and lets you focus on your data pipeline logic.
**Heres how to fetch paginated data:**
1. Import `RESTClient`
2. Create a `RESTClient` instance
3. Use the `paginate` method to iterate through all pages of data
""")
return
@app.cell
def _():
from dlt.sources.helpers.rest_client import RESTClient
client = RESTClient(base_url="https://api.github.com")
for _page in client.paginate("orgs/dlt-hub/events"):
print(_page)
return (RESTClient,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""☝️ The pagination type was detected automatically, but you can also specify it explicitly:"""
)
return
@app.cell
def _(RESTClient):
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator
client_1 = RESTClient(
base_url="https://api.github.com", paginator=HeaderLinkPaginator()
)
return (client_1,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""The full list of available paginators is in the official [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#paginators)."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img2](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img2.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
The events endpoint doesnt contain as much data, especially compared to the stargazers endpoint of the dlt repository.
If you run the pipeline for the stargazers endpoint, there's a high chance that you'll face a **rate limit error**.
""")
return
@app.cell
def _(client_1):
for _page in client_1.paginate("repos/dlt-hub/dlt/stargazers"):
print(_page)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Exercise 1: Pagination with RESTClient**
Explore the cells above and answer the question below.
#### Question
What type of pagination should we use for the GitHub API?
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Authentication**
To avoid the **rate limit error** you can use [GitHub API Authentication](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28):
1. Login to your GitHub account.
2. Generate an [API token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) (classic).
3. Use it as an access token for the GitHub API.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
> **! ATTENTION !**
> Never share your credentials publicly and never hard-code them in your code. Use **environment variables, files** or dlt's **secrets.toml**.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Create an environment variable for your access token in Colab.
![Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img3](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img3.webp)
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""In Molab, simply click on the `Secrets` section in the left-side menu and add your access token."""
)
return
@app.cell
def _():
import os
access_token = os.getenv("SECRET_KEY")
return access_token, os
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Use the `access_token` variable in the code below:""")
return
@app.cell
def _(RESTClient, access_token):
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
client_2 = RESTClient(
base_url="https://api.github.com", auth=BearerTokenAuth(token=access_token)
)
for _page in client_2.paginate("repos/dlt-hub/dlt/stargazers"):
print(_page)
break
return (BearerTokenAuth,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Let's rewrite our GitHub dlt pipeline using the RestAPI Client and the `access_token`."""
)
return
@app.cell
def _(BearerTokenAuth, RESTClient, TDataItems, access_token, dlt):
@dlt.resource
def github_stargazers() -> TDataItems:
client = RESTClient(
base_url="https://api.github.com", auth=BearerTokenAuth(token=access_token)
)
for _page in client.paginate("repos/dlt-hub/dlt/stargazers"):
yield _page
_pipeline = dlt.pipeline(destination="duckdb")
_load_info = _pipeline.run(github_stargazers)
print(_load_info)
_pipeline.dataset().github_stargazers.df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""You can see that all dlt [stargazers](https://github.com/dlt-hub/dlt/stargazers) were loaded into the DuckDB destination."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **dlt configuration and secrets**
In dlt, [configurations and secrets](https://dlthub.com/docs/general-usage/credentials/) are essential for setting up data pipelines.
**Configurations** are **non-sensitive** settings that define the behavior of a data pipeline, including file paths, database hosts, timeouts, API URLs, and performance settings.
On the other hand, **secrets** are **sensitive** data like passwords, API keys, and private keys, which should never be hard-coded to avoid security risks.
Both can be set up in various ways:
* As environment variables
* Within code using `dlt.secrets` and `dlt.config`
* Via configuration files (`secrets.toml` and `config.toml`)
> **Note**: While you can store both configurations and credentials in `dlt.secrets` (or `secrets.toml`) if thats more convenient, credentials cannot be placed in `dlt.config` (or `config.toml`) because dlt does not read them from there.
""")
return
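@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
The rest of this lesson focuses on secrets. As a minimal sketch of the config side (the key name below is made up purely for illustration), non-sensitive values can be set and read in code through `dlt.config` in the same item-style way:
""")
    return
@app.cell
def _(dlt):
    # Hypothetical, non-sensitive setting used only to illustrate dlt.config;
    # credentials must go through dlt.secrets instead.
    dlt.config["events_per_page"] = 100
    print(dlt.config["events_per_page"])
    return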
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Let's create a dlt pipeline for both endpoints: `repos/dlt-hub/dlt/stargazers` and `orgs/dlt-hub/events`.
We'll use `@dlt.source` to group both resources.
""")
return
@app.cell
def _(BearerTokenAuth, RESTClient, TDataItems, access_token, dlt):
from typing import Iterable
from dlt.extract import DltResource
@dlt.source
def github_source() -> Iterable[DltResource]:
client = RESTClient(
base_url="https://api.github.com", auth=BearerTokenAuth(token=access_token)
)
@dlt.resource
def github_events() -> TDataItems:
for _page in client.paginate("orgs/dlt-hub/events"):
yield _page
@dlt.resource
def github_stargazers() -> TDataItems:
for _page in client.paginate("repos/dlt-hub/dlt/stargazers"):
yield _page
return (github_events, github_stargazers)
return DltResource, Iterable
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Now, we'll use `dlt.secrets.value` in our source, enabling dlt's automatic secrets resolution. Note that we first reset all environment variables to demonstrate what happens if dlt tries to resolve a non-existing variable:"""
)
return
@app.cell
def _(os):
os.environ.clear()
return
@app.cell
def _(BearerTokenAuth, DltResource, Iterable, RESTClient, TDataItems, dlt):
@dlt.source
def github_source_1(access_token=dlt.secrets.value) -> Iterable[DltResource]:
client = RESTClient(
base_url="https://api.github.com", auth=BearerTokenAuth(token=access_token)
)
@dlt.resource
def github_events() -> TDataItems:
for _page in client.paginate("orgs/dlt-hub/events"):
yield _page
@dlt.resource
def github_stargazers() -> TDataItems:
for _page in client.paginate("repos/dlt-hub/dlt/stargazers"):
yield _page
return (github_events, github_stargazers)
return (github_source_1,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""> Configs are defined in a similar way but are accessed using `dlt.config.value`. However, since configuration variables are internally managed by `dlt`, it is unlikely that you would need to explicitly use `dlt.config.value` in most cases."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""If you now run the pipeline, you will see the following error:""")
return
@app.cell
def _(dlt, github_source_1):
_pipeline = dlt.pipeline(destination="duckdb")
_load_info = _pipeline.run(github_source_1())
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Thats what happens when you use `dlt.secrets.value` for a variable in your pipeline but havent actually set the secret value.
When this occurs, dlt searches for the missing secret across different possible locations and naming formats, as shown below:
```python
ConfigFieldMissingException: Following fields are missing: ['access_token'] in configuration with spec GithubSourceConfiguration
for field "access_token" config providers and keys were tried in following order:
In Environment Variables key DLT_COLAB_KERNEL_LAUNCHER__SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN was not found.
In Environment Variables key DLT_COLAB_KERNEL_LAUNCHER__SOURCES____MAIN____ACCESS_TOKEN was not found.
In Environment Variables key DLT_COLAB_KERNEL_LAUNCHER__SOURCES__ACCESS_TOKEN was not found.
In Environment Variables key DLT_COLAB_KERNEL_LAUNCHER__ACCESS_TOKEN was not found.
In Environment Variables key SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN was not found.
In Environment Variables key SOURCES____MAIN____ACCESS_TOKEN was not found.
In Environment Variables key SOURCES__ACCESS_TOKEN was not found.
In Environment Variables key ACCESS_TOKEN was not found.
WARNING: dlt looks for .dlt folder in your current working directory and your cwd (/content) is different from directory of your pipeline script (/usr/local/lib/python3.10/dist-packages).
If you keep your secret files in the same folder as your pipeline script but run your script from some other folder, secrets/configs will not be found
Please refer to https://dlthub.com/docs/general-usage/credentials for more information
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
To define the `access_token` secret value, we can use (as mentioned earlier):
1. `dlt.secrets` in code (recommended for secret vaults or dynamic creds)
2. Environment variables (recommended for prod)
3. `secrets.toml` file (recommended for local dev)
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Use `dlt.secrets` in code**
You can easily set or update your secrets directly in Python code. This is especially convenient when retrieving credentials from third-party secret managers or when you need to update secrets and configurations dynamically.
""")
return
@app.cell
def _(dlt, github_source_1, os):
dlt.secrets["access_token"] = os.getenv("SECRET_KEY")
github_pipeline = dlt.pipeline(destination="duckdb")
_load_info = github_pipeline.run(github_source_1())
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Alternatively, you can set:
```python
dlt.secrets["sources.access_token"] = userdata.get('SECRET_KEY')
dlt.secrets["sources.____main____.access_token"] = userdata.get('SECRET_KEY')
dlt.secrets["sources.____main____.github_source.access_token"] = userdata.get('SECRET_KEY')
...
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
* `sources` is a special word;
* `__main__` is a python module name;
* `github_source` is the resource name;
* `access_token` is the secret variable name.
So dlt looks for secrets according to this hierarchy:
```
pipeline_name
|
|-sources
|
|-<module name>
|
|-<source function 1 name>
|
|- secret variable 1
|- secret variable 2
```
To keep the **naming convention** flexible, dlt looks for a lot of **possible combinations** of key names, starting from the most specific possible path. Then, if the value is not found, it removes the right-most section and tries again.
""")
return
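@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
A conceptual sketch of that fallback (not dlt's actual resolver code, and `my_pipeline` is a hypothetical module name): start from the most specific path and drop the right-most section each round, keeping the key itself at the end.
""")
    return
@app.cell
def _():
    def candidate_keys(sections, key):
        # e.g. sources.my_pipeline.github_source.access_token,
        # then sources.my_pipeline.access_token, sources.access_token, access_token
        return [".".join(sections[:n] + [key]) for n in range(len(sections), -1, -1)]

    for candidate in candidate_keys(["sources", "my_pipeline", "github_source"], "access_token"):
        print(candidate)
    return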
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Exercise 2: Run a pipeline with `dlt.secrets.value`**
Explore the cells above and answer the question below using `sql_client`.
#### Question
Who has id=`17202864` in the `stargazers` table? Use `sql_client`.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Use environment variables**
Let's explicitly set the environment variable for our access token in one of the formats dlt accepts: `ACCESS_TOKEN`.
""")
return
@app.cell
def _(dlt, github_source_1, os):
os.environ["ACCESS_TOKEN"] = os.getenv("SECRET_KEY")
_pipeline = dlt.pipeline(destination="duckdb")
_load_info = _pipeline.run(github_source_1())
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Alternatively, you can set:
> `userdata.get()` is Colab-specific.
```python
os.environ["SOURCES__ACCESS_TOKEN"] = userdata.get('SECRET_KEY')
os.environ["SOURCES____MAIN____ACCESS_TOKEN"] = userdata.get('SECRET_KEY')
os.environ["SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN"] = userdata.get('SECRET_KEY')
...
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**How does it work?**
`dlt` **automatically extracts** configuration settings and secrets based on flexible naming conventions.
It then **injects** these values where needed in functions decorated with `@dlt.source`, `@dlt.resource`, or `@dlt.destination`.
>dlt uses a specific naming hierarchy to search for the secrets and config values. This makes configurations and secrets easy to manage.
>
> The naming convention for **environment variables** in dlt follows a specific pattern. All names are **capitalized** and sections are separated with **double underscores** __ , e.g. `SOURCES____MAIN____GITHUB_SOURCE__SECRET_KEY`.
""")
return
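@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
A rough illustration (not dlt's internal code) of how a config path maps to that environment variable form: upper-case every section and join the pieces with double underscores.
""")
    return
@app.cell
def _():
    def to_env_var_name(*parts):
        # ("sources", "__main__", "github_source", "access_token")
        # -> "SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN"
        return "__".join(part.upper() for part in parts)

    print(to_env_var_name("sources", "__main__", "github_source", "access_token"))
    return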
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### **Use dlt `secrets.toml` or `config.toml`**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""> Note that Colab is not well-suited for using `secrets.toml` or `config.toml` files. As a result, these sections will provide instructions rather than code cells, detailing how to use them in a local environment. You should test this functionality on your own machine. For Colab, it is recommended to use environment variables instead."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
The `secrets.toml` file - along with the `config.toml` file - should be stored in the `.dlt` directory where your pipeline code is located:
```
/your_project_directory
├── .dlt
│ ├── secrets.toml
│ └── config.toml
└── my_pipeline.py
```
Read more about adding [credentials](https://dlthub.com/docs/walkthroughs/add_credentials).
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
To set credentials via the toml files, you would first add your access token to `secrets.toml`:
```toml
# .dlt/secrets.toml
[sources]
secret_key = "your_access_token"
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Alternatively, you can set:
```
[sources]
secret_key = "your_access_token"
```
which is equal to:
```
secret_key = "your_access_token"
```
and to:
```
[sources.____main____]
secret_key = "your_access_token"
```
as well as:
```
[sources.____main____.github_source]
secret_key = "your_access_token"
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Configure secrets in Colab**
You can configure secrets using the **Secrets** sidebar. Just create a variable with the name `secrets.toml` and paste the content of the toml file from your `.dlt` folder into it. The `config.toml` variable is supported as well.
Open the **Secrets** sidebar, press `Add new secret`, create a variable with the name `secrets.toml`, paste your secrets into the `Value` field, and click `Enable`:
```
[sources]
secret_key = "your_access_token"
```
>dlt will not reload the secrets automatically. **Restart your interpreter** in Colab options when you add/change the variables above.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img4](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img4.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)!"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,14 +6,14 @@
"id": "yTmIgQKpV355"
},
"source": [
"# **Recap of [Lesson 3](https://colab.research.google.com/drive/1-jVNzMJTRYHhbRlXgGFlhMwdML1L9zMx#forceEdit=true&sandboxMode=true) 👩‍💻🚀**\n",
"# **Recap of [Lesson 3](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) 👩‍💻🚀**\n",
"\n",
"1. Used pagination for RestAPIs.\n",
"2. Used authentication for RestAPIs.\n",
"3. Tried dlt RESTClient.\n",
"4. Used environment variables to handle both secrets & configs.\n",
"5. Learned how to add values to `secrets.toml` or `config.toml`.\n",
"6. Used `secrets.toml` ENV variable special for Colab."
"1. Used pagination with REST APIs. \n",
"2. Applied authentication for REST APIs. \n",
"3. Tried the dlt `RESTClient`. \n",
"4. Used environment variables to manage secrets and configuration. \n",
"5. Learned how to add values to `secrets.toml` and `config.toml`. \n",
"6. Used the special `secrets.toml` environment variable setup for Colab."
]
},
{
@@ -23,22 +23,21 @@
},
"source": [
"---\n",
"# **`dlt`s pre-built Sources and Destinations** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)\n",
"\n",
"# **`dlt`s pre-built Sources and Destinations** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)\n",
"\n",
"\n",
"**Here, you will learn:**\n",
"- How to initialize verified sources;\n",
"- Built-in `rest_api` source.\n",
"- Built-in `sql_database` source.\n",
"- Built-in `filesystem` source.\n",
"- How to initialize verified sources.\n",
"- The built-in `rest_api` source.\n",
"- The built-in `sql_database` source.\n",
"- The built-in `filesystem` source.\n",
"- How to switch between destinations.\n",
"\n",
"---\n",
"\n",
"Our verified sources are the simplest way to get started with building your stack. Choose from any of our fully customizable 30+ pre-built sources, such as any SQL database, Google Sheets, Salesforce and others.\n",
"Our verified sources are the simplest way to start building your stack. Choose from any of our fully customizable 30+ pre-built sources, such as SQL databases, Google Sheets, Salesforce, and more.\n",
"\n",
"With our numerous destinations you can load data to a local database, warehouse or a data lake. Choose from Snowflake, Databricks and more."
"With our numerous destinations, you can load data into a local database, data warehouse, or data lake. Choose from Snowflake, Databricks, and many others."
]
},
{
@@ -76,15 +75,6 @@
"```"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cNs9mHKaEaTE"
},
"source": [
"### Step 0: Install dlt"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -125,7 +115,7 @@
"source": [
"This command shows all available verified sources and their short descriptions. For each source, it checks if your local `dlt` version requires an update and prints the relevant warning.\n",
"\n",
"Consider an example of a pipeline for the GitHub API:\n",
"Consider an example pipeline for the GitHub API:\n",
"\n",
"```\n",
"Available dlt single file templates:\n",
@@ -144,7 +134,7 @@
"\n",
"### Step 1. Initialize the source\n",
"\n",
"This command will initialize the pipeline example with GitHub API as the source and DuckBD as the destination:"
"This command will initialize the pipeline example with the GitHub API as the source and DuckBD as the destination:"
]
},
{
@@ -165,10 +155,11 @@
},
"source": [
"Now, check your files on the left side bar. It should contain all the necessary files to run your GitHub API -> DuckDB pipeline:\n",
"* `.dlt` folder for `secrets.toml` and `config.toml`;\n",
"* pipeline script `github_api_pipeline.py`;\n",
"* requirements.txt;\n",
"* `.gitignore`."
"\n",
"- The `.dlt` folder containing `secrets.toml` and `config.toml`\n",
"- The pipeline script `github_api_pipeline.py`\n",
"- `requirements.txt`\n",
"- `.gitignore`"
]
},
{
@@ -193,7 +184,7 @@
"- Adjust the pipeline script as needed\n",
"- Run the pipeline script\n",
"\n",
"> In certain cases, you can adjust the verified source code."
"> If needed, you can adjust the verified source code."
]
},
{
@@ -213,7 +204,8 @@
"id": "Rr3RWZSHcnSs"
},
"source": [
"From the code we can see that this pipeline loads **only \"issues\" endpoint**, you can adjust this code as you wish: add new endpoints, add additional logic, add transformations, etc."
"From the code, we can see that this pipeline loads **only the `\"issues\"` endpoint**. \n",
"You can adjust this code as needed: add new endpoints, include additional logic, apply transformations, and more."
]
},
{
@@ -224,9 +216,10 @@
"source": [
"### Step 2. Add credentials\n",
"\n",
"In Colab is more convenient to use ENVs. In the previous lesson you learned how to configure dlt resource via environment variable.\n",
"In Colab (or Molab), it is more convenient to use environment variables or `dlt.secrets`.\n",
"\n",
"In the pipeline above, the `access_token` parameter is set to `dlt.secrets.value`, which means you need to configure this variable:\n",
"\n",
"In the pipeline above we can see that `access_token` variable is `dlt.secrets.value`, it means we should configure this variable.\n",
"\n",
"```python\n",
"@dlt.resource(write_disposition=\"replace\")\n",
@@ -243,10 +236,10 @@
},
"outputs": [],
"source": [
"import os\n",
"import dlt\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"SOURCES__ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")"
"dlt.secrets[\"SOURCES__ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")"
]
},
{
@@ -284,13 +277,13 @@
"id": "imvWv_2Cbumt"
},
"source": [
"From the pipeline output we can take pipeline information like pipeline_name, dataset_name, destination path, etc.\n",
"From the pipeline output, we can get information such as the pipeline name, dataset name, destination path, and more.\n",
"\n",
"\n",
"> Pipeline **github_api_pipeline** load step completed in 1.23 seconds\n",
"1 load package(s) were loaded to destination duckdb and into dataset **github_api_data**\n",
"The duckdb destination used duckdb:////content/**github_api_pipeline.duckdb** location to store data\n",
"Load package 1733848559.8195539 is LOADED and contains no failed jobs\n"
"> Pipeline **github_api_pipeline** load step completed in 1.23 seconds \n",
"> 1 load package was loaded to the DuckDB destination and into the dataset **github_api_data**. \n",
"> The DuckDB destination used `duckdb:////content/**github_api_pipeline.duckdb**` as the storage location. \n",
"> Load package `1733848559.8195539` is **LOADED** and contains no failed jobs.\n",
"\n"
]
},
{
@@ -301,7 +294,7 @@
"source": [
"## Step 4: Explore your data\n",
"\n",
"Let's explore what tables were created in duckdb."
"Let's explore what tables were created in the destination."
]
},
{
@@ -348,18 +341,18 @@
"source": [
"## **[RestAPI source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic)**\n",
"\n",
"`rest_api` is a generic source that you can use to create a `dlt` source from a REST API using a declarative configuration. The majority of REST APIs behave in a similar way; this `dlt` source attempts to provide a declarative way to define a `dlt` source for those APIs.\n",
"`rest_api` is a generic source that lets you create a `dlt` source from any REST API using a declarative configuration. Since most REST APIs follow similar patterns, this source provides a convenient way to define your integration declaratively.\n",
"\n",
"Using a [declarative configuration](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic#source-configuration), you can define:\n",
"Using a [declarative configuration](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic#source-configuration), you can specify:\n",
"\n",
"- the API endpoints to pull data from,\n",
"- their relationships,\n",
"- how to handle pagination,\n",
"- authentication.\n",
"\n",
"dlt will take care of the rest: **unnesting the data, inferring the schema**, etc., and **writing to the destination**\n",
"`dlt` handles the rest for you: **unnesting the data, inferring the schema**, and **writing it to the destination**.\n",
"\n",
"In previous lesson you've already met Rest API Client. `dlt`s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low level abstraction** that powers the REST API Source."
"In the previous lesson, you already used the REST API Client. `dlt`s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low-level abstraction** that powers the RestAPI source.\n"
]
},
{
@@ -368,8 +361,9 @@
"id": "SqoKS0mNdFOd"
},
"source": [
"### Initialize `rest_api` template\n",
"You can initialize `rest_api` **template** using `init` command:"
"### Initialize the `rest_api` template\n",
"\n",
"You can initialize the `rest_api` **template** using the `init` command:\n"
]
},
{
@@ -389,15 +383,13 @@
"id": "MJ89LnH91GQh"
},
"source": [
"In the `rest_api_pipeline.py` script you will find sources for GitHub API and for PokeAPI, which were defined using `rest_api` source and `RESTAPIConfig`.\n",
"\n",
"Since the `rest_api` source is a **built-in source**, you don't have to initialize it. You can **import** it from `dlt.sources` and use it immediately.\n",
"In the `rest_api_pipeline.py` script, you will find sources for both the GitHub API and the PokeAPI, defined using the `rest_api` source and `RESTAPIConfig`.\n",
"\n",
"Since the `rest_api` source is a **built-in source**, you don't need to initialize it. You can simply **import** it from `dlt.sources` and start using it.\n",
"\n",
"### Example\n",
"\n",
"Here's a simplified example of how to configure the REST API source to load `issues` and issue `comments` from GitHub API:\n",
"\n"
"Here is a simplified example of how to configure the REST API source to load `issues` and issue `comments` from the GitHub API:\n"
]
},
{
@@ -416,13 +408,11 @@
" \"client\": {\n",
" \"base_url\": \"https://api.github.com\",\n",
" \"auth\": {\n",
" \"token\": dlt.secrets[\n",
" \"sources.access_token\"\n",
" ], # <--- we already configured access_token above\n",
" \"token\": dlt.secrets[\"sources.access_token\"],\n",
" },\n",
" \"paginator\": \"header_link\", # <---- set up paginator type\n",
" \"paginator\": \"header_link\",\n",
" },\n",
" \"resources\": [ # <--- list resources\n",
" \"resources\": [\n",
" {\n",
" \"name\": \"issues\",\n",
" \"endpoint\": {\n",
@@ -433,40 +423,32 @@
" },\n",
" },\n",
" {\n",
" \"name\": \"issue_comments\", # <-- here we declare dlt.transformer\n",
" \"name\": \"issue_comments\",\n",
" \"endpoint\": {\n",
" \"path\": \"repos/dlt-hub/dlt/issues/{issue_number}/comments\",\n",
" \"params\": {\n",
" \"issue_number\": {\n",
" \"type\": (\n",
" \"resolve\"\n",
" ), # <--- use type 'resolve' to resolve {issue_number} for transformer\n",
" \"type\": (\"resolve\"),\n",
" \"resource\": \"issues\",\n",
" \"field\": \"number\",\n",
" },\n",
" },\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"contributors\",\n",
" \"endpoint\": {\n",
" \"path\": \"repos/dlt-hub/dlt/contributors\",\n",
" },\n",
" },\n",
" ],\n",
"}\n",
"\n",
"github_source = rest_api_source(config)\n",
"\n",
"\n",
"pipeline = dlt.pipeline(\n",
"rest_api_pipeline = dlt.pipeline(\n",
" pipeline_name=\"rest_api_github\",\n",
" destination=\"duckdb\",\n",
" dataset_name=\"rest_api_data\",\n",
" dev_mode=True,\n",
")\n",
"\n",
"load_info = pipeline.run(github_source)\n",
"load_info = rest_api_pipeline.run(github_source)\n",
"print(load_info)"
]
},
@@ -478,7 +460,7 @@
},
"outputs": [],
"source": [
"pipeline.dataset().issues.df()"
"rest_api_pipeline.dataset().issues.df()"
]
},
{
@@ -487,12 +469,12 @@
"id": "mQuK4l23c8Of"
},
"source": [
"### **Exercise 1: Run rest_api source**\n",
"### **Exercise 1: Run `rest_api` source**\n",
"\n",
"Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.\n",
"\n",
"#### Question\n",
"How many columns has the `issues` table?"
"#### **Question**\n",
"How many columns does the `issues` table have?"
]
},
{
@@ -501,15 +483,16 @@
"id": "UTKIM2ntOIrh"
},
"source": [
"### **Exercise 2: Create dlt source with rest_api**\n",
"### **Exercise 2: Create a dlt source with `rest_api`**\n",
"\n",
"Add `contributors` endpoint for dlt repository to the `rest_api` configuration:\n",
"- resource name is \"contributors\"\n",
"- endpoint path : \"repos/dlt-hub/dlt/contributors\"\n",
"- no parameters\n",
"Add the `contributors` endpoint for the `dlt` repository to the `rest_api` configuration:\n",
"\n",
"#### Question\n",
"How many columns has the `contributors` table?"
"- Resource name: **\"contributors\"**\n",
"- Endpoint path: **\"repos/dlt-hub/dlt/contributors\"**\n",
"- No parameters\n",
"\n",
"#### **Question**\n",
"How many columns does the `contributors` table have?\n"
]
},
{
@@ -536,9 +519,9 @@
"id": "bHcBOhgVdmZH"
},
"source": [
"### Initialize `sql_database` template\n",
"### Initialize the `sql_database` template\n",
"\n",
"Initialize dlt template for `sql_database` using `init` command:"
"Initialize the `dlt` template for `sql_database` using the `init` command:\n"
]
},
{
@@ -569,9 +552,9 @@
"source": [
"### Example\n",
"\n",
"The example below will show you how you can use dlt to load data from a SQL Database (PostgreSQL, MySQL, SQLight, Oracle, IBM DB2, etc.) into destination.\n",
"The example below shows how you can use dlt to load data from a SQL database (PostgreSQL, MySQL, SQLite, Oracle, IBM DB2, etc.) into a destination.\n",
"\n",
"To make it easy to reproduce, we will be loading data from the [public MySQL RFam database](https://docs.rfam.org/en/latest/database.html) into a local DuckDB instance."
"To make it easy to reproduce, we will load data from the [public MySQL Rfam database](https://docs.rfam.org/en/latest/database.html) into a local DuckDB instance."
]
},
{
@@ -582,6 +565,7 @@
},
"outputs": [],
"source": [
"%%capture\n",
"!pip install pymysql"
]
},
@@ -595,21 +579,21 @@
"source": [
"from dlt.sources.sql_database import sql_database\n",
"\n",
"source = sql_database(\n",
"sql_source = sql_database(\n",
" \"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam\",\n",
" table_names=[\n",
" \"family\",\n",
" ],\n",
")\n",
"\n",
"pipeline = dlt.pipeline(\n",
"sql_db_pipeline = dlt.pipeline(\n",
" pipeline_name=\"sql_database_example\",\n",
" destination=\"duckdb\",\n",
" dataset_name=\"sql_data\",\n",
" dev_mode=True,\n",
")\n",
"\n",
"load_info = pipeline.run(source)\n",
"load_info = sql_db_pipeline.run(sql_source)\n",
"print(load_info)"
]
},
@@ -619,11 +603,11 @@
"id": "pjyJyF4Ofyuu"
},
"source": [
"### **Exercise 3: Run sql_database source**\n",
"### **Exercise 3: Run `sql_database` source**\n",
"\n",
"Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.\n",
"\n",
"#### Question\n",
"#### **Question**\n",
"How many columns does the `family` table have?"
]
},
@@ -671,9 +655,9 @@
"id": "HfLjS_raUH9G"
},
"source": [
"### Initialize `filesystem` template\n",
"### Initialize the `filesystem` template\n",
"\n",
"Initialize dlt template for `filesystem` using `init` command:"
"Initialize the dlt template for `filesystem` using the `init` command:\n"
]
},
{
@@ -715,7 +699,19 @@
},
"outputs": [],
"source": [
"!mkdir -p local_data && wget -O local_data/userdata.parquet https://www.timestored.com/data/sample/userdata.parquet"
"import os\n",
"import requests\n",
"\n",
"folder_name = \"local_data\"\n",
"os.makedirs(folder_name, exist_ok=True)\n",
"full_path = os.path.abspath(folder_name)\n",
"\n",
"url = \"https://www.timestored.com/data/sample/userdata.parquet\"\n",
"resp = requests.get(url)\n",
"resp.raise_for_status()\n",
"\n",
"with open(f\"{full_path}/userdata.parquet\", \"wb\") as f:\n",
" f.write(resp.content)"
]
},
{
@@ -729,14 +725,12 @@
"import dlt\n",
"from dlt.sources.filesystem import filesystem, read_parquet\n",
"\n",
"filesystem_resource = filesystem(\n",
" bucket_url=\"/content/local_data\", file_glob=\"**/*.parquet\"\n",
")\n",
"filesystem_resource = filesystem(bucket_url=full_path, file_glob=\"**/*.parquet\")\n",
"filesystem_pipe = filesystem_resource | read_parquet()\n",
"\n",
"# We load the data into the table_name table\n",
"pipeline = dlt.pipeline(pipeline_name=\"my_pipeline\", destination=\"duckdb\")\n",
"load_info = pipeline.run(filesystem_pipe.with_name(\"userdata\"))\n",
"fs_pipeline = dlt.pipeline(pipeline_name=\"my_pipeline\", destination=\"duckdb\")\n",
"load_info = fs_pipeline.run(filesystem_pipe.with_name(\"userdata\"))\n",
"print(load_info)"
]
},
@@ -746,12 +740,12 @@
"id": "0jzeZeINEzQb"
},
"source": [
"### **Exercise 4: Run filesystem source**\n",
"### **Exercise 4: Run `filesystem` source**\n",
"\n",
"Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.\n",
"\n",
"#### Question\n",
"How many columns does the `userdata` table have?"
"#### **Question**\n",
"How many columns does the `userdata` table have?\n"
]
},
{
@@ -760,7 +754,8 @@
"id": "o4SGNHSkF7_Y"
},
"source": [
"How to configure **Cloud Storage** you can read in the official [dlt documentation](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/basic#configuration)."
"You can read how to configure **Cloud Storage** in the official \n",
"[dlt documentation](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/basic#configuration).\n"
]
},
{
@@ -769,9 +764,7 @@
"id": "M03Zc9l7Y6Ue"
},
"source": [
"# **Built-in Destinations**\n",
"\n",
"https://dlthub.com/docs/dlt-ecosystem/destinations/"
"# [**Built-in Destinations**](https://dlthub.com/docs/dlt-ecosystem/destinations/)\n"
]
},
{
@@ -797,9 +790,12 @@
"id": "BWAnIbicE4XC"
},
"source": [
"TBH this is a matter of simply going through the [documentation](https://dlthub.com/docs/dlt-ecosystem/destinations/) 👀, but to sum it up:\n",
"- Most likely the destination where you want to load data is already a `dlt` integration that undergoes several hundred automated tests every day.\n",
"- If not, you can simply define a custom destination and still be able to benefit from most `dlt`-specific features. FYI, custom destinations will be covered in the next Advanced course, so we expect you to come back for the second part..."
"To be honest, this is simply a matter of going through the \n",
"[documentation](https://dlthub.com/docs/dlt-ecosystem/destinations/) 👀, but to sum it up:\n",
"\n",
"- Most likely, the destination where you want to load data is already a `dlt` integration that undergoes several hundred automated tests every day.\n",
"- If not, you can define a custom destination and still benefit from most `dlt`-specific features. \n",
" *FYI: custom destinations will be covered in the next Advanced course — so we expect you to come back for part two…*\n"
]
},
{
@@ -810,7 +806,7 @@
"source": [
"## **Choosing a destination**\n",
"\n",
"Switching between destinations in dlt is incredibly straightforward—simply modify the `destination` parameter in your pipeline configuration. For example:"
"Switching between destinations in `dlt` is incredibly straightforward. Simply modify the `destination` parameter in your pipeline configuration. For example:"
]
},
{
@@ -821,17 +817,19 @@
},
"outputs": [],
"source": [
"pipeline = dlt.pipeline(\n",
"data_pipeline = dlt.pipeline(\n",
" pipeline_name=\"data_pipeline\",\n",
" destination=\"duckdb\", # <--- to test pipeline locally\n",
" destination=\"duckdb\",\n",
" dataset_name=\"data\",\n",
")\n",
"print(data_pipeline.destination.destination_type)\n",
"\n",
"pipeline = dlt.pipeline(\n",
"data_pipeline = dlt.pipeline(\n",
" pipeline_name=\"data_pipeline\",\n",
" destination=\"bigquery\", # <--- to run pipeline in production\n",
" destination=\"bigquery\",\n",
" dataset_name=\"data\",\n",
")"
")\n",
"print(data_pipeline.destination.destination_type)"
]
},
{
@@ -869,7 +867,7 @@
"source": [
"import os\n",
"\n",
"os.environ[\"BUCKET_URL\"] = \"/content\""
"os.environ[\"BUCKET_URL\"] = \"./content\""
]
},
{
@@ -902,13 +900,11 @@
"\n",
"pipeline = dlt.pipeline(\n",
" pipeline_name=\"fs_pipeline\",\n",
" destination=\"filesystem\", # <--- change destination to 'filesystem'\n",
" destination=\"filesystem\",\n",
" dataset_name=\"fs_data\",\n",
")\n",
"\n",
"load_info = pipeline.run(\n",
" source, loader_file_format=\"parquet\"\n",
") # <--- choose a file format: parquet, csv or jsonl\n",
"load_info = pipeline.run(source, loader_file_format=\"parquet\")\n",
"print(load_info)"
]
},
@@ -929,7 +925,7 @@
},
"outputs": [],
"source": [
"! ls fs_data/family"
"! ls ./content/fs_data/family"
]
},
{
@@ -991,7 +987,7 @@
"load_info = pipeline.run(\n",
" source,\n",
" loader_file_format=\"parquet\",\n",
" table_format=\"iceberg\", # <--- choose a table format: delta or iceberg\n",
" table_format=\"iceberg\",\n",
")\n",
"print(load_info)"
]
@@ -1004,9 +1000,9 @@
"source": [
"**Note:**\n",
"\n",
"Open source version of dlt supports basic functionality for **iceberg**, but the dltHub team is currently working on an **extended** and **more powerful** integration with iceberg.\n",
"The open-source version of dlt supports basic functionality for **Iceberg**, but the dltHub team is currently working on an **extended** and **more powerful** Iceberg integration.\n",
"\n",
"[Join the waiting list to learn more about dlt+ and Iceberg.](https://info.dlthub.com/waiting-list)"
"[Join the waiting list to learn more about dltHub and Iceberg.](https://info.dlthub.com/waiting-list)\n"
]
},
{
@@ -1017,9 +1013,12 @@
"source": [
"# **Spoiler: Custom Sources & Destinations**\n",
"\n",
"`dlt` tried to simplify as much as possible both the process of creating sources ([RestAPI Client](https://dlthub.com/docs/general-usage/http/rest-client), [rest_api source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api)) and [custom destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/destination).\n",
"`dlt` aims to simplify the process of creating both custom sources \n",
"([REST API Client](https://dlthub.com/docs/general-usage/http/rest-client), \n",
"[`rest_api` source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api)) \n",
"and [custom destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/destination).\n",
"\n",
"We will look at this topic in more detail in the next Advanced course."
"We will explore this topic in more detail in the next Advanced course.\n"
]
},
{
@@ -1028,17 +1027,8 @@
"id": "NYbccmLie1zm"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1Zf24gIVMNNj9j-gtXFl8p0orI9ttySDn#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "wrVnW2UdVjV4"
},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,826 @@
# /// script
# dependencies = [
# "dlt[duckdb]",
# "dlt[pyiceberg]",
# "numpy",
# "pandas",
# "pymysql",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **Recap of [Lesson 3](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) 👩‍💻🚀**
1. Used pagination with REST APIs.
2. Applied authentication for REST APIs.
3. Tried the dlt `RESTClient`.
4. Used environment variables to manage secrets and configuration.
5. Learned how to add values to `secrets.toml` and `config.toml`.
6. Used the special `secrets.toml` environment variable setup for Colab.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
# **`dlt`'s pre-built Sources and Destinations** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)
**Here, you will learn:**
- How to initialize verified sources.
- The built-in `rest_api` source.
- The built-in `sql_database` source.
- The built-in `filesystem` source.
- How to switch between destinations.
---
Our verified sources are the simplest way to start building your stack. Choose from any of our fully customizable 30+ pre-built sources, such as SQL databases, Google Sheets, Salesforce, and more.
With our numerous destinations, you can load data into a local database, data warehouse, or data lake. Choose from Snowflake, Databricks, and many others.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_4_Using_pre_build_sources_and_destinations_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_4_Using_pre_build_sources_and_destinations_img1.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **Existing verified sources**
To use an [existing verified source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/), just run the `dlt init` command.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
There's a base project for each `dlt` verified source + destination combination, which you can adjust according to your needs.
These base projects can be initialized with a simple command:
```
dlt init <verified-source> <destination>
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""List all verified sources:""")
return
@app.cell
def _():
import subprocess
subprocess.run(["dlt", "init", "--list-sources"], check=True)
return (subprocess,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
This command shows all available verified sources and their short descriptions. For each source, it checks if your local `dlt` version requires an update and prints the relevant warning.
Consider an example pipeline for the GitHub API:
```
Available dlt single file templates:
---
arrow: The Arrow Pipeline Template will show how to load and transform arrow tables.
dataframe: The DataFrame Pipeline Template will show how to load and transform pandas dataframes.
debug: The Debug Pipeline Template will load a column with each datatype to your destination.
default: The Intro Pipeline Template contains the example from the docs intro page
fruitshop: The Default Pipeline Template provides a simple starting point for your dlt pipeline
---> github_api: The Github API templates provides a starting
point to read data from REST APIs with REST Client helper
requests: The Requests Pipeline Template provides a simple starting point for a dlt pipeline with the requests library
```
### Step 1. Initialize the source
This command will initialize the pipeline example with the GitHub API as the source and DuckDB as the destination:
""")
return
@app.cell
def _(subprocess):
subprocess.run(
["dlt", "--non-interactive", "init", "github_api", "duckdb"], check=True
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Now, check your files in the left sidebar. It should contain all the necessary files to run your GitHub API -> DuckDB pipeline:
- The `.dlt` folder containing `secrets.toml` and `config.toml`
- The pipeline script `github_api_pipeline.py`
- `requirements.txt`
- `.gitignore`
""")
return
@app.cell
def _(subprocess):
subprocess.run(["ls", "-a"], check=True)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
What you would normally do with the project:
- Add your credentials and define configurations
- Adjust the pipeline script as needed
- Run the pipeline script
> If needed, you can adjust the verified source code.
""")
return
@app.cell
def _(subprocess):
subprocess.run(["cat", "github_api_pipeline.py"], check=True)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
From the code, we can see that this pipeline loads **only the `"issues"` endpoint**.
You can adjust this code as needed: add new endpoints, include additional logic, apply transformations, and more.
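For example, a small transformation could be attached to the template's resource with `add_map`. This is a hypothetical sketch; `github_api_resource` is the resource defined in the generated script, and the added `repo` field is made up for illustration:
```python
# illustrative only: tag every yielded issue with the repository it came from
github_api_resource.add_map(lambda issue: {**issue, "repo": "dlt-hub/dlt"})
```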
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Step 2. Add credentials
In Colab (or Molab), it is more convenient to use environment variables or `dlt.secrets`.
In the pipeline above, the `access_token` parameter is set to `dlt.secrets.value`, which means you need to configure this variable:
```python
@dlt.resource(write_disposition="replace")
def github_api_resource(access_token: Optional[str] = dlt.secrets.value):
...
```
""")
return
@app.cell
def _(os):
import dlt
dlt.secrets["SOURCES__ACCESS_TOKEN"] = os.getenv("SECRET_KEY")
return (dlt,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### Step 3. Run the pipeline""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Let's run the pipeline!""")
return
@app.cell
def _(subprocess):
subprocess.run(["python", "github_api_pipeline.py"], check=True)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
From the pipeline output, we can get information such as the pipeline name, dataset name, destination path, and more.
> Pipeline **github_api_pipeline** load step completed in 1.23 seconds
> 1 load package was loaded to the DuckDB destination and into the dataset **github_api_data**.
> The DuckDB destination used `duckdb:////content/**github_api_pipeline.duckdb**` as the storage location.
> Load package `1733848559.8195539` is **LOADED** and contains no failed jobs.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Step 4. Explore your data
Let's explore what tables were created in the destination.
""")
return
@app.cell
def _():
import duckdb
conn = duckdb.connect("github_api_pipeline.duckdb")
conn.sql("SET search_path = 'github_api_data'")
conn.sql("DESCRIBE").df()
return (conn,)
@app.cell
def _(conn):
data_table = conn.sql("SELECT * FROM github_api_resource").df()
data_table
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""# **Built-in sources: RestAPI, SQL database & Filesystem**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **[RestAPI source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic)**
`rest_api` is a generic source that lets you create a `dlt` source from any REST API using a declarative configuration. Since most REST APIs follow similar patterns, this source provides a convenient way to define your integration declaratively.
Using a [declarative configuration](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic#source-configuration), you can specify:
- the API endpoints to pull data from,
- their relationships,
- how to handle pagination,
- authentication.
`dlt` handles the rest for you: **unnesting the data, inferring the schema**, and **writing it to the destination**.
In the previous lesson, you already used the REST API Client. `dlt`'s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low-level abstraction** that powers the RestAPI source.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Initialize the `rest_api` template
You can initialize the `rest_api` **template** using the `init` command:
""")
return
@app.cell
def _(subprocess):
subprocess.run(
["dlt", "init", "rest_api", "duckdb"], input="y\n", text=True, check=True
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
In the `rest_api_pipeline.py` script, you will find sources for both the GitHub API and the PokeAPI, defined using the `rest_api` source and `RESTAPIConfig`.
Since the `rest_api` source is a **built-in source**, you don't need to initialize it. You can simply **import** it from `dlt.sources` and start using it.
### Example
Here is a simplified example of how to configure the REST API source to load `issues` and issue `comments` from the GitHub API:
""")
return
@app.cell
def _(dlt):
from dlt.sources.rest_api import RESTAPIConfig, rest_api_source
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator
config: RESTAPIConfig = {
"client": {
"base_url": "https://api.github.com",
"auth": {"token": dlt.secrets["sources.access_token"]},
"paginator": "header_link",
},
"resources": [
{
"name": "issues",
"endpoint": {
"path": "repos/dlt-hub/dlt/issues",
"params": {"state": "open"},
},
},
{
"name": "issue_comments",
"endpoint": {
"path": "repos/dlt-hub/dlt/issues/{issue_number}/comments",
"params": {
"issue_number": {
"type": "resolve",
"resource": "issues",
"field": "number",
}
},
},
},
],
}
github_source = rest_api_source(config)
rest_api_pipeline = dlt.pipeline(
pipeline_name="rest_api_github",
destination="duckdb",
dataset_name="rest_api_data",
dev_mode=True,
)
_load_info = rest_api_pipeline.run(github_source)
print(_load_info)
return (rest_api_pipeline,)
@app.cell
def _(rest_api_pipeline):
rest_api_pipeline.dataset().issues.df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Exercise 1: Run `rest_api` source**
Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.
#### **Question**
How many columns does the `issues` table have?
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Exercise 2: Create a dlt source with `rest_api`**
Add the `contributors` endpoint for the `dlt` repository to the `rest_api` configuration:
- Resource name: **"contributors"**
- Endpoint path: **"repos/dlt-hub/dlt/contributors"**
- No parameters
#### **Question**
How many columns does the `contributors` table have?
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **[SQL Databases source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/sql_database/)**
SQL databases are management systems (DBMS) that store data in a structured format, commonly used for efficient and reliable data retrieval.
The `sql_database` verified source loads data to your specified destination using one of the following backends (see the sketch after the list):
* SQLAlchemy,
* PyArrow,
* pandas,
* ConnectorX.
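You can pick a backend explicitly via the `backend` argument. A minimal sketch, reusing the public Rfam connection string used later in this lesson (choosing PyArrow here is just an assumption; SQLAlchemy is the default):
```python
from dlt.sources.sql_database import sql_database
# use the PyArrow backend instead of the default SQLAlchemy backend
source = sql_database(
    "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
    table_names=["family"],
    backend="pyarrow",  # extract rows as Arrow tables
)
```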
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Initialize the `sql_database` template
Initialize the `dlt` template for `sql_database` using the `init` command:
""")
return
@app.cell
def _(subprocess):
subprocess.run(
["dlt", "init", "sql_database", "duckdb"], input="y\n", text=True, check=True
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""The `sql_database` source is also a **built-in source**, you don't have to initialize it, just **import** it from `dlt.sources`."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Example
The example below shows how you can use dlt to load data from a SQL database (PostgreSQL, MySQL, SQLite, Oracle, IBM DB2, etc.) into a destination.
To make it easy to reproduce, we will load data from the [public MySQL Rfam database](https://docs.rfam.org/en/latest/database.html) into a local DuckDB instance.
""")
return
@app.cell
def _(dlt):
from dlt.sources.sql_database import sql_database
sql_source = sql_database(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
table_names=["family"],
)
sql_db_pipeline = dlt.pipeline(
pipeline_name="sql_database_example",
destination="duckdb",
dataset_name="sql_data",
dev_mode=True,
)
_load_info = sql_db_pipeline.run(sql_source)
print(_load_info)
return (sql_database,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Exercise 3: Run `sql_database` source**
Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.
#### **Question**
How many columns does the `family` table have?
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **[Filesystem source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/)**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
The filesystem source allows seamless loading of files from the following locations:
* AWS S3
* Google Cloud Storage
* Google Drive
* Azure Blob Storage
* remote filesystem (via SFTP)
* local filesystem
The filesystem source natively supports CSV, Parquet, and JSONL files and allows customization for loading any type of structured file.
**How filesystem source works**
The Filesystem source doesn't just give you an easy way to load data from both remote and local files — it also comes with a powerful set of tools that let you customize the loading process to fit your specific needs.
Filesystem source loads data in two steps (sketched below):
1. It accesses the files in your remote or local file storage **without** actually **reading** the content yet. At this point, you can filter files by metadata or name. You can also set up incremental loading to load only new files.
2. The **transformer** **reads** the files' content and yields the records. At this step, you can filter out the actual data, enrich records with metadata from files, or perform incremental loading based on the file content.
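A rough sketch of these two steps; the bucket path, the `file_name` metadata field, and the CSV reader are illustrative assumptions:
```python
from dlt.sources.filesystem import filesystem, read_csv
# step 1: list matching files; no file content is read at this point
files = filesystem(bucket_url="./local_data", file_glob="**/*.csv")
files.add_filter(lambda item: "2024" in item["file_name"])  # filter on file metadata
# step 2: the transformer reads the files and yields records
records = files | read_csv()
```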
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Initialize the `filesystem` template
Initialize the dlt template for `filesystem` using the `init` command:
""")
return
@app.cell
def _(subprocess):
subprocess.run(
["dlt", "init", "filesystem", "duckdb"], input="y\n", text=True, check=True
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""The `filesystem` source is also a **built-in source**, you don't have to initialize it, just **import** it from `dlt.sources`."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Example
To illustrate how this **built-in source** works, we first download a sample file to the local (Colab) filesystem.
""")
return
@app.cell
def _():
import os
import requests
folder_name = "local_data"
os.makedirs(folder_name, exist_ok=True)
full_path = os.path.abspath(folder_name)
url = "https://www.timestored.com/data/sample/userdata.parquet"
resp = requests.get(url)
resp.raise_for_status()
with open(f"{full_path}/userdata.parquet", "wb") as f:
f.write(resp.content)
return full_path, os
@app.cell
def _(dlt, full_path):
from dlt.sources.filesystem import filesystem, read_parquet
filesystem_resource = filesystem(bucket_url=full_path, file_glob="**/*.parquet")
filesystem_pipe = filesystem_resource | read_parquet()
fs_pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb")
_load_info = fs_pipeline.run(filesystem_pipe.with_name("userdata"))
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Exercise 4: Run `filesystem` source**
Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.
#### **Question**
How many columns does the `userdata` table have?
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
You can read how to configure **Cloud Storage** in the official
[dlt documentation](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/basic#configuration).
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""# [**Built-in Destinations**](https://dlthub.com/docs/dlt-ecosystem/destinations/)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_4_Using_pre_build_sources_and_destinations_img2](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_4_Using_pre_build_sources_and_destinations_img2.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Exploring `dlt` destinations**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
To be honest, this is simply a matter of going through the
[documentation](https://dlthub.com/docs/dlt-ecosystem/destinations/) 👀, but to sum it up:
- Most likely, the destination where you want to load data is already a `dlt` integration that undergoes several hundred automated tests every day.
- If not, you can define a custom destination and still benefit from most `dlt`-specific features.
*FYI: custom destinations will be covered in the next Advanced course — so we expect you to come back for part two…*
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **Choosing a destination**
Switching between destinations in `dlt` is incredibly straightforward. Simply modify the `destination` parameter in your pipeline configuration. For example:
""")
return
@app.cell
def _(dlt):
data_pipeline = dlt.pipeline(
pipeline_name="data_pipeline",
destination="duckdb",
dataset_name="data",
)
print(data_pipeline.destination.destination_type)
data_pipeline = dlt.pipeline(
pipeline_name="data_pipeline",
destination="bigquery",
dataset_name="data",
)
print(data_pipeline.destination.destination_type)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""This flexibility allows you to easily transition from local development to production-grade environments."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **Filesystem destination**
The `filesystem` destination enables you to load data into **files stored locally** or in **cloud storage** solutions, making it an excellent choice for lightweight testing, prototyping, or file-based workflows.
Below is an **example** demonstrating how to use the `filesystem` destination to load data in **Parquet** format:
* Step 1: Set up a local bucket or cloud directory for storing files
""")
return
@app.cell
def _(os):
os.environ["BUCKET_URL"] = "./content"
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""* Step 2: Define the data source""")
return
@app.cell
def _(dlt, sql_database):
source = sql_database(
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
table_names=["family"],
)
pipeline = dlt.pipeline(
pipeline_name="fs_pipeline", destination="filesystem", dataset_name="fs_data"
)
_load_info = pipeline.run(source, loader_file_format="parquet")
print(_load_info)
return pipeline, source
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Look at the files:""")
return
@app.cell
def _(subprocess):
subprocess.run(["ls", "./content/fs_data/family"], check=True)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Look at the loaded data:""")
return
@app.cell
def _(pipeline):
# explore loaded data
pipeline.dataset().family.df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Table formats: [Delta tables & Iceberg](https://dlthub.com/docs/dlt-ecosystem/destinations/delta-iceberg)**
dlt supports writing **Delta** and **Iceberg** tables when using the `filesystem` destination.
**How it works:**
dlt uses the `deltalake` and `pyiceberg` libraries to write Delta and Iceberg tables, respectively. One or multiple Parquet files are prepared during the extract and normalize steps. In the load step, these Parquet files are exposed as an Arrow data structure and fed into `deltalake` or `pyiceberg`.
""")
return
@app.cell
def _(pipeline, source):
_load_info = pipeline.run(
source, loader_file_format="parquet", table_format="iceberg"
)
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**Note:**
The open-source version of dlt supports basic functionality for **Iceberg**, but the dltHub team is currently working on an **extended** and **more powerful** Iceberg integration.
[Join the waiting list to learn more about dltHub and Iceberg.](https://info.dlthub.com/waiting-list)
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **Spoiler: Custom Sources & Destinations**
`dlt` aims to simplify the process of creating both custom sources
([REST API Client](https://dlthub.com/docs/general-usage/http/rest-client),
[`rest_api` source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api))
and [custom destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/destination).
We will explore this topic in more detail in the next Advanced course.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)!"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,14 +6,14 @@
"id": "h93BcC8SX2fj"
},
"source": [
"# **Recap of [Lesson 4](https://colab.research.google.com/drive/1mfqZulsuFDc7h27d6joe2_Dduvl1uM-2#forceEdit=true&sandboxMode=true) 👩‍💻🚀**\n",
"# **Recap of [Lesson 4](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) 👩‍💻🚀**\n",
"\n",
"1. Listed all available verified sources;\n",
"2. Initialized `github_api` verified source;\n",
"3. Explored built-in `rest_api` source.\n",
"4. Explored built-in `sql_database` source.\n",
"5. Explored built-in `filesystem` source.\n",
"6. Learned how to switch between destinations."
"1. Listed all available verified sources.\n",
"2. Initialized the `github_api` verified source.\n",
"3. Explored the built-in `rest_api` source.\n",
"4. Explored the built-in `sql_database` source.\n",
"5. Explored the built-in `filesystem` source.\n",
"6. Learned how to switch between destinations.\n"
]
},
{
@@ -24,7 +24,7 @@
"source": [
"---\n",
"\n",
"# **Write Disposition and Incremental Loading** ⚙️🧠 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)\n",
"# **Write Disposition and Incremental Loading** ⚙️🧠 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)\n",
"\n",
"\n",
"**Here, you will learn:**\n",
@@ -52,19 +52,17 @@
"id": "5ThZzzAwqLnn"
},
"source": [
"Write disposition in the context of the dlt library defines how the data should be written to the destination. There are three types of write dispositions:\n",
"A **write disposition** in the context of the `dlt` library defines how data should be written to the destination. There are three types:\n",
"\n",
"* **Append**: This is the **default** disposition. It will append the data to the existing data in the destination.\n",
"- **Append**: The **default** disposition. It appends new data to the existing data in the destination.\n",
"\n",
"* **Replace**: This disposition replaces the data in the destination with the data from the resource. It **deletes** all the data and **recreates** the schema before loading the data.\n",
"- **Replace**: This disposition replaces all existing data at the destination with the new data from the resource. It **deletes** all previous data and **recreates** the schema before loading.\n",
"\n",
"* **Merge**: This write disposition merges the data from the resource with the data at the destination. For the merge disposition, you need to specify a `primary_key` for the resource.\n",
"- **Merge**: This disposition merges incoming data with existing data at the destination. For `merge`, you must specify a `primary_key` for the resource.\n",
"\n",
"The write disposition you choose depends on the dataset and how you can extract it. For more details, you can refer to the [Incremental loading page](https://dlthub.com/docs/general-usage/incremental-loading).\n",
"The choice of write disposition depends on your dataset and how you extract it. For more details, refer to the [Incremental loading page](https://dlthub.com/docs/general-usage/incremental-loading).\n",
"\n",
"\n",
"\n",
"A `write_disposition` in `dlt` can specified in the resource decorator:\n",
"You can specify a `write_disposition` in the resource decorator:\n",
"\n",
"```python\n",
"@dlt.resource(write_disposition=\"append\")\n",
@@ -79,25 +77,7 @@
"load_info = pipeline.run(my_resource, write_disposition=\"replace\")\n",
"```\n",
"\n",
"> In case you specify both, the write disposition specified at the pipeline run level will override the write disposition specified at the resource level."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SpEU7xzw9lZL"
},
"source": [
"### **0. Install dlt**"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Su4oUJelKaZY"
},
"source": [
"Install `dlt` with DuckDB as a destination as per usual:"
"> If both are specified, the write disposition at the pipeline run level overrides the one set at the resource level."
]
},
{
@@ -128,7 +108,7 @@
"id": "5IpPPDpVrU75"
},
"source": [
"As we already have said `append` is a default loading behavior. Now we will explore how this write disposition works."
"As we have already said, `append` is the default loading behavior. Now we will explore how this write disposition works."
]
},
{
@@ -162,7 +142,7 @@
"id": "CltUh8t6rGUP"
},
"source": [
"We create dlt pipeline as usual and load this data into DuckDB."
"We create a `dlt` pipeline as usual and load this data into DuckDB."
]
},
{
@@ -179,23 +159,23 @@
"\n",
"@dlt.resource(\n",
" name=\"pokemon\",\n",
" write_disposition=\"append\", # <--- add new argument into decorator\n",
" write_disposition=\"append\",\n",
")\n",
"def pokemon() -> TDataItems:\n",
"def append_pokemon() -> TDataItems:\n",
" yield data\n",
"\n",
"\n",
"pipeline = dlt.pipeline(\n",
" pipeline_name=\"poke_pipeline\",\n",
"append_pipeline = dlt.pipeline(\n",
" pipeline_name=\"append_poke_pipeline\",\n",
" destination=\"duckdb\",\n",
" dataset_name=\"pokemon_data\",\n",
")\n",
"\n",
"load_info = pipeline.run(pokemon)\n",
"load_info = append_pipeline.run(append_pokemon)\n",
"print(load_info)\n",
"\n",
"# explore loaded data\n",
"pipeline.dataset().pokemon.df()"
"append_pipeline.dataset().pokemon.df()"
]
},
{
@@ -204,9 +184,9 @@
"id": "Wtz2oUpCs7Ay"
},
"source": [
"Run this example **twice**, and you'll notice that each time a copy of the data is added to your tables. We call this load mode **append**. It is very useful.\n",
"Run this example **twice**, and you'll notice that each time a copy of the data is added to your tables. We call this load mode **append**, and it is very useful.\n",
"\n",
"Example use case: when you have a new folder created daily with json file logs, and you want to ingest them incrementally."
"Example use case: when you have a new folder created daily with JSON log files, and you want to ingest them incrementally.\n"
]
},
{
@@ -217,11 +197,11 @@
},
"outputs": [],
"source": [
"load_info = pipeline.run(pokemon)\n",
"load_info = append_pipeline.run(append_pokemon)\n",
"print(load_info)\n",
"\n",
"# explore loaded data\n",
"pipeline.dataset().pokemon.df()"
"append_pipeline.dataset().pokemon.df()"
]
},
{
@@ -240,7 +220,7 @@
"id": "Njz_qUcpDtTW"
},
"source": [
"Perhaps this duplicated data is not what you want to get in your work projects. For example, if your data was updated, how we can refresh it in the database? One method is to tell dlt to **replace** the data in existing tables by using **write_disposition**."
"Perhaps this duplicated data is not what you want in your work projects. For example, if your data was updated, how can we refresh it in the database? One way is to tell `dlt` to **replace** the data in the existing tables by using a **write_disposition**.\n"
]
},
{
@@ -256,23 +236,23 @@
"\n",
"@dlt.resource(\n",
" name=\"pokemon\",\n",
" write_disposition=\"replace\", # <--- change 'append' to 'replace'\n",
" write_disposition=\"replace\",\n",
")\n",
"def pokemon() -> TDataItems:\n",
"def replace_pokemon() -> TDataItems:\n",
" yield data\n",
"\n",
"\n",
"pipeline = dlt.pipeline(\n",
" pipeline_name=\"poke_pipeline\",\n",
"replace_pipeline = dlt.pipeline(\n",
" pipeline_name=\"replace_poke_pipeline\",\n",
" destination=\"duckdb\",\n",
" dataset_name=\"pokemon_data\",\n",
")\n",
"\n",
"load_info = pipeline.run(pokemon)\n",
"load_info = replace_pipeline.run(replace_pokemon)\n",
"print(load_info)\n",
"\n",
"# explore loaded data\n",
"pipeline.dataset().pokemon.df()"
"replace_pipeline.dataset().pokemon.df()"
]
},
{
@@ -292,11 +272,11 @@
},
"outputs": [],
"source": [
"load_info = pipeline.run(pokemon)\n",
"load_info = replace_pipeline.run(replace_pokemon)\n",
"print(load_info)\n",
"\n",
"# explore loaded data\n",
"pipeline.dataset().pokemon.df()"
"replace_pipeline.dataset().pokemon.df()"
]
},
{
@@ -305,7 +285,7 @@
"id": "aPjezxijt_mz"
},
"source": [
"TAADA! No duplicates, your data was [fully refreshed](https://dlthub.com/docs/general-usage/full-loading)."
"TADA! No duplicates, your data was [fully refreshed](https://dlthub.com/docs/general-usage/full-loading)."
]
},
{
@@ -364,24 +344,24 @@
"\n",
"@dlt.resource(\n",
" name=\"pokemon\",\n",
" write_disposition=\"merge\", # <--- change 'replace' to 'merge'\n",
" primary_key=\"id\", # <--- add primary_key\n",
" write_disposition=\"merge\",\n",
" primary_key=\"id\",\n",
")\n",
"def pokemon() -> TDataItems:\n",
"def merge_pokemon() -> TDataItems:\n",
" yield data\n",
"\n",
"\n",
"pipeline = dlt.pipeline(\n",
"merge_pipeline = dlt.pipeline(\n",
" pipeline_name=\"poke_pipeline_merge\",\n",
" destination=\"duckdb\",\n",
" dataset_name=\"pokemon_data\",\n",
")\n",
"\n",
"load_info = pipeline.run(pokemon)\n",
"load_info = merge_pipeline.run(merge_pokemon)\n",
"print(load_info)\n",
"\n",
"# explore loaded data\n",
"pipeline.dataset().pokemon.df()"
"merge_pipeline.dataset().pokemon.df()"
]
},
{
@@ -431,24 +411,24 @@
"outputs": [],
"source": [
"# We added `created_at` field to the data\n",
"data = [\n",
"created_data = [\n",
" {\n",
" \"id\": \"1\",\n",
" \"name\": \"bulbasaur\",\n",
" \"size\": {\"weight\": 6.9, \"height\": 0.7},\n",
" \"created_at\": \"2024-12-01\", # <------- new field\n",
" \"created_at\": \"2024-12-01\",\n",
" },\n",
" {\n",
" \"id\": \"4\",\n",
" \"name\": \"charmander\",\n",
" \"size\": {\"weight\": 8.5, \"height\": 0.6},\n",
" \"created_at\": \"2024-09-01\", # <------- new field\n",
" \"created_at\": \"2024-09-01\",\n",
" },\n",
" {\n",
" \"id\": \"25\",\n",
" \"name\": \"pikachu\",\n",
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
" \"created_at\": \"2023-06-01\", # <------- new field\n",
" \"created_at\": \"2023-06-01\",\n",
" },\n",
"]"
]
@@ -459,11 +439,11 @@
"id": "EO63mHgE_Oya"
},
"source": [
"**The goal**: Load only Pokémon caught after January 1, 2024, skipping the ones you already have.\n",
"**The goal**: Load only Pokémons caught after January 1, 2024, skipping the ones you already have.\n",
"\n",
"### **Step 2: Defining the incremental logic**\n",
"\n",
"Using `dlt`, we set up an [incremental filter](https://www.google.com/url?q=https://dlthub.com/docs/general-usage/incremental-loading%23incremental-loading-with-a-cursor-field&sa=D&source=editors&ust=1734717286675253&usg=AOvVaw3rAF3y3p86sGt49ImCTgon) to only fetch Pokémon caught after a certain date:\n",
"Using `dlt`, we set up an [incremental filter](https://www.google.com/url?q=https://dlthub.com/docs/general-usage/incremental-loading%23incremental-loading-with-a-cursor-field&sa=D&source=editors&ust=1734717286675253&usg=AOvVaw3rAF3y3p86sGt49ImCTgon) to only fetch Pokémons caught after a certain date:\n",
"```python\n",
"cursor_date = dlt.sources.incremental(\"created_at\", initial_value=\"2024-01-01\")\n",
"```\n",
@@ -489,12 +469,12 @@
" name=\"pokemon\",\n",
" write_disposition=\"append\",\n",
")\n",
"def pokemon(\n",
"def incremental_pokemon(\n",
" cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(\n",
" \"created_at\", initial_value=\"2024-01-01\"\n",
" )\n",
") -> TDataItems:\n",
" yield data"
" yield created_data"
]
},
{
@@ -524,17 +504,17 @@
},
"outputs": [],
"source": [
"pipeline = dlt.pipeline(\n",
"incremental_pipeline = dlt.pipeline(\n",
" pipeline_name=\"poke_pipeline_incremental\",\n",
" destination=\"duckdb\",\n",
" dataset_name=\"pokemon_data\",\n",
")\n",
"\n",
"load_info = pipeline.run(pokemon)\n",
"load_info = incremental_pipeline.run(incremental_pokemon)\n",
"print(load_info)\n",
"\n",
"# explore loaded data\n",
"pipeline.dataset().pokemon.df()"
"incremental_pipeline.dataset().pokemon.df()"
]
},
{
@@ -584,7 +564,7 @@
},
"outputs": [],
"source": [
"load_info = pipeline.run(pokemon)\n",
"load_info = incremental_pipeline.run(incremental_pokemon)\n",
"print(load_info)"
]
},
@@ -619,21 +599,21 @@
},
"outputs": [],
"source": [
"# We added `created_at` field to the data\n",
"data = [\n",
"# We added `updated_at` field to the data\n",
"updated_data = [\n",
" {\n",
" \"id\": \"1\",\n",
" \"name\": \"bulbasaur\",\n",
" \"size\": {\"weight\": 6.9, \"height\": 0.7},\n",
" \"created_at\": \"2024-12-01\",\n",
" \"updated_at\": \"2024-12-01\", # <------- new field\n",
" \"updated_at\": \"2024-12-01\",\n",
" },\n",
" {\n",
" \"id\": \"4\",\n",
" \"name\": \"charmander\",\n",
" \"size\": {\"weight\": 8.5, \"height\": 0.6},\n",
" \"created_at\": \"2024-09-01\",\n",
" \"updated_at\": \"2024-09-01\", # <------- new field\n",
" \"updated_at\": \"2024-09-01\",\n",
" },\n",
" {\n",
" \"id\": \"25\",\n",
@@ -641,9 +621,9 @@
" \"size\": {\n",
" \"weight\": 9,\n",
" \"height\": 0.4,\n",
" }, # <----- pikachu gained weight from 6 to 9\n",
" },\n",
" \"created_at\": \"2023-06-01\",\n",
" \"updated_at\": \"2024-12-16\", # <------- new field, information about pikachu has updated\n",
" \"updated_at\": \"2024-12-16\",\n",
" },\n",
"]"
]
@@ -670,14 +650,15 @@
"\n",
"@dlt.resource(\n",
" name=\"pokemon\",\n",
" write_disposition=\"merge\", # <--- change write disposition from 'append' to 'merge'\n",
" primary_key=\"id\", # <--- set a primary key\n",
" write_disposition=\"merge\",\n",
" primary_key=\"id\",\n",
")\n",
"def pokemon(\n",
"def dedup_pokemon(\n",
" data: TDataItems,\n",
" cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(\n",
" \"updated_at\", initial_value=\"2024-01-01\"\n",
" )\n",
") -> TDataItems: # <--- change the cursor name from 'created_at' to 'updated_at'\n",
" ),\n",
") -> TDataItems:\n",
" yield data"
]
},
@@ -698,17 +679,17 @@
},
"outputs": [],
"source": [
"pipeline = dlt.pipeline(\n",
"dedup_pipeline = dlt.pipeline(\n",
" pipeline_name=\"poke_pipeline_dedup\",\n",
" destination=\"duckdb\",\n",
" dataset_name=\"pokemon_data\",\n",
")\n",
"\n",
"load_info = pipeline.run(pokemon)\n",
"load_info = dedup_pipeline.run(dedup_pokemon(updated_data))\n",
"print(load_info)\n",
"\n",
"# explore loaded data\n",
"pipeline.dataset().pokemon.df()"
"dedup_pipeline.dataset().pokemon.df()"
]
},
{
@@ -717,7 +698,7 @@
"id": "omG1cgzcrqOs"
},
"source": [
"All Pokémon are processed because this is the pipelines first run.\n",
"All Pokémons are processed because this is the pipelines first run.\n",
"\n",
"Now, lets say Pikachu goes to gym and sheds some weight (down to 7.5), and the `updated_at` field is set to `2024-12-23`."
]
@@ -730,8 +711,7 @@
},
"outputs": [],
"source": [
"# We added `created_at` field to the data\n",
"data = [\n",
"reupdated_data = [\n",
" {\n",
" \"id\": \"1\",\n",
" \"name\": \"bulbasaur\",\n",
@@ -749,9 +729,9 @@
" {\n",
" \"id\": \"25\",\n",
" \"name\": \"pikachu\",\n",
" \"size\": {\"weight\": 7.5, \"height\": 0.4}, # <--- pikachu lost weight\n",
" \"size\": {\"weight\": 7.5, \"height\": 0.4},\n",
" \"created_at\": \"2023-06-01\",\n",
" \"updated_at\": \"2024-12-23\", # <--- data about his weight was updated a week later\n",
" \"updated_at\": \"2024-12-23\",\n",
" },\n",
"]"
]
@@ -773,11 +753,11 @@
},
"outputs": [],
"source": [
"load_info = pipeline.run(pokemon)\n",
"load_info = dedup_pipeline.run(dedup_pokemon(reupdated_data))\n",
"print(load_info)\n",
"\n",
"# explore loaded data\n",
"pipeline.dataset().pokemon.df()"
"dedup_pipeline.dataset().pokemon.df()"
]
},
{
@@ -786,10 +766,10 @@
"id": "u2hZHn_EowBd"
},
"source": [
"**What happens?**\n",
"**What happened?**\n",
"\n",
"* The pipeline detects that `updated_at` for Bulbasaur and Charmander hasnt changed—theyre skipped.\n",
"* Pikachus record is updated to reflect the latest weight.\n",
"* The pipeline detected that `updated_at` for Bulbasaur and Charmander hasnt changed—theyre skipped.\n",
"* Pikachus record was updated to reflect the latest weight.\n",
"\n",
"You can see that the **`_dlt_load_id`** for Bulbasaur and Charmander remained the same, but for Pikachu it was changed since only the updated Pikachu data was loaded into the destination."
]
@@ -800,28 +780,17 @@
"id": "pufZ_GWPxqEQ"
},
"source": [
"The **`dlt.sources.incremental`** instance above has the next attributes:\n",
"The **`dlt.sources.incremental`** instance above has the following attributes:\n",
"\n",
"* **`cursor_date.initial_value`** which is always equal to \"2024-01-01\" passed in the constructor;\n",
"* **`cursor_date.start_value`** a maximum `updated_at` value from the previous run or the `initial_value` on the first run;\n",
"* **`cursor_date.last_value`** a \"real-time\" `updated_at` value updated with each yielded item or page. Before the first yield, it equals `start_value`;\n",
"* **`cursor_date.end_value`** (here not used) marking the end of the backfill range.\n",
"* **`cursor_date.end_value`** (not used here) marking the end of the backfill range.\n",
"\n",
"## **Example**\n",
"You can use them in the resource code to make **more efficient requests**. Take look at the GitHub API example:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "l4C_IFK7G4m9"
},
"outputs": [],
"source": [
"exit() # we use exit() to reset all ENVs we set"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -838,10 +807,9 @@
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
"\n",
"import os\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"SOURCES__ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")\n",
"dlt.secrets[\"SOURCES__ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")\n",
"\n",
"\n",
"@dlt.source\n",
@@ -859,9 +827,7 @@
" )\n",
" ) -> TDataItems:\n",
" params = {\n",
" \"since\": (\n",
" cursor_date.last_value\n",
" ), # <--- use last_value to request only new data from API\n",
" \"since\": (cursor_date.last_value),\n",
" \"status\": \"open\",\n",
" }\n",
" for page in client.paginate(\"repos/dlt-hub/dlt/issues\", params=params):\n",
@@ -885,9 +851,9 @@
"id": "5d1J5DPX3Dn3"
},
"source": [
"Pay attention how we use **since** GitHub API parameter and `cursor_date.last_value` to tell GitHub which issues we are interested in. `cursor_date.last_value` holds the last `cursor_date` value from the previous run.\n",
"Pay attention to how we use the **since** GitHub API parameter and `cursor_date.last_value` to tell GitHub which issues we are interested in. `cursor_date.last_value` holds the last `cursor_date` value from the previous run.\n",
"\n",
"Run the pipeline again and make sure that **no data was loaded**."
"Run the pipeline again and make sure that **no data is loaded**."
]
},
{
@@ -934,12 +900,12 @@
"\n",
"Transform your GitHub API pipeline to use incremental loading. This means:\n",
"\n",
"* Implement new `dlt.resource` for `pulls/comments` (List comments for Pull Requests) endpoint.\n",
"* Implement a new `dlt.resource` for `pulls/comments` (List comments for Pull Requests) endpoint.\n",
"* Fetch only pulls comments updated after the last pipeline run.\n",
"* Use the `updated_at` field from the GitHub API as the incremental cursor.\n",
"* [Endpoint documentation](https://docs.github.com/en/rest/pulls/comments?apiVersion=2022-11-28#list-review-comments-in-a-repository)\n",
"* Endpoint URL: `https://api.github.com/repos/OWNER/REPO/pulls/comments`\n",
"* Use `since` parameter - only show results that were last updated after the given time - and `last_value`.\n",
"* Use the `since` parameter - only show results that were last updated after the given time - and `last_value`.\n",
"* `initial_value` is `2024-12-01`.\n",
"\n",
"\n",
@@ -954,17 +920,8 @@
"id": "NYbccmLie1zm"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1geSMNRkSwAelQJKd3e8vdoHCKiHMdmIo#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SVyiG5wRVo1B"
},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,743 @@
# /// script
# dependencies = [
# "dlt[duckdb]",
# "numpy",
# "pandas",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **Recap of [Lesson 4](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) 👩‍💻🚀**
1. Listed all available verified sources.
2. Initialized the `github_api` verified source.
3. Explored the built-in `rest_api` source.
4. Explored the built-in `sql_database` source.
5. Explored the built-in `filesystem` source.
6. Learned how to switch between destinations.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
# **Write Disposition and Incremental Loading** ⚙️🧠 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)
**Here, you will learn:**
- `dlt` write dispositions:
- Append
- Replace
- Merge
- What incremental loading is
- How to update and deduplicate your data
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **`dlt` write dispositions**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
A **write disposition** in the context of the `dlt` library defines how data should be written to the destination. There are three types:
- **Append**: The **default** disposition. It appends new data to the existing data in the destination.
- **Replace**: This disposition replaces all existing data at the destination with the new data from the resource. It **deletes** all previous data and **recreates** the schema before loading.
- **Merge**: This disposition merges incoming data with existing data at the destination. For `merge`, you must specify a `primary_key` for the resource.
The choice of write disposition depends on your dataset and how you extract it. For more details, refer to the [Incremental loading page](https://dlthub.com/docs/general-usage/incremental-loading).
You can specify a `write_disposition` in the resource decorator:
```python
@dlt.resource(write_disposition="append")
def my_resource():
...
yield data
```
Or directly in the pipeline run:
```python
load_info = pipeline.run(my_resource, write_disposition="replace")
```
> If both are specified, the write disposition at the pipeline run level overrides the one set at the resource level.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **1. Append**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""As we have already said, `append` is the default loading behavior. Now we will explore how this write disposition works."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Let's remember our Quick Start data sample with pokemons:""")
return
@app.cell
def _():
# Sample data containing pokemon details
data = [
{"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}},
{"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}},
{"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}},
]
return (data,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""We create a `dlt` pipeline as usual and load this data into DuckDB."""
)
return
@app.cell
def _(data):
import dlt
from dlt.common.typing import TDataItems
@dlt.resource(name="pokemon", write_disposition="append")
def append_pokemon() -> TDataItems:
yield data
append_pipeline = dlt.pipeline(
pipeline_name="append_poke_pipeline",
destination="duckdb",
dataset_name="pokemon_data",
)
_load_info = append_pipeline.run(append_pokemon)
print(_load_info)
# explore loaded data
append_pipeline.dataset().pokemon.df()
return TDataItems, append_pipeline, append_pokemon, dlt
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Run this example **twice**, and you'll notice that each time a copy of the data is added to your tables. We call this load mode **append**, and it is very useful.
Example use case: when you have a new folder created daily with JSON log files, and you want to ingest them incrementally.
""")
return
@app.cell
def _(append_pipeline, append_pokemon):
_load_info = append_pipeline.run(append_pokemon)
print(_load_info)
# explore loaded data
append_pipeline.dataset().pokemon.df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **2. Replace**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Perhaps this duplicated data is not what you want in your work projects. For example, if your data was updated, how can we refresh it in the database? One way is to tell `dlt` to **replace** the data in the existing tables by using a **write_disposition**."""
)
return
@app.cell
def _(TDataItems, data, dlt):
@dlt.resource(name="pokemon", write_disposition="replace")
def replace_pokemon() -> TDataItems:
yield data
replace_pipeline = dlt.pipeline(
pipeline_name="replace_poke_pipeline",
destination="duckdb",
dataset_name="pokemon_data",
)
_load_info = replace_pipeline.run(replace_pokemon)
print(_load_info)
replace_pipeline.dataset().pokemon.df()
return replace_pipeline, replace_pokemon
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Run it again:""")
return
@app.cell
def _(replace_pipeline, replace_pokemon):
_load_info = replace_pipeline.run(replace_pokemon)
print(_load_info)
# explore loaded data
replace_pipeline.dataset().pokemon.df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""TADA! No duplicates, your data was [fully refreshed](https://dlthub.com/docs/general-usage/full-loading)."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **3. [Merge](https://dlthub.com/docs/general-usage/incremental-loading#merge-incremental-loading)**
Consider a scenario where the data in the source has been updated, but you want to avoid reloading the entire dataset.
Merge write disposition is used to merge new data into the destination, using a `merge_key` and/or **deduplicating**/**upserting** new data using a `primary_key`.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_5_Write_disposition_and_incremental_loading_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_5_Write_disposition_and_incremental_loading_img1.jpeg)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
The **merge** write disposition can be useful in several situations:
1. If you have a dataset where records are frequently updated and you want to reflect these changes in your database, the `merge` write disposition can be used. It will **update the existing records** with the new data instead of creating duplicate entries.
2. If your data source occasionally sends **duplicate records**, the merge write disposition can help handle this. It uses a `primary_key` to identify unique records, so if a duplicate record (with the same `primary_key`) is encountered, it will be merged with the existing record instead of creating a new one.
3. If you are dealing with **Slowly Changing Dimensions** (SCD) where the attribute of a record changes over time and you want to maintain a history of these changes, you can use the `merge` write disposition with the scd2 strategy.
When using the merge disposition, you need to specify a `primary_key` or `merge_key` for the resource.
""")
return
@app.cell
def _(TDataItems, data, dlt):
@dlt.resource(name="pokemon", write_disposition="merge", primary_key="id")
def merge_pokemon() -> TDataItems:
yield data
merge_pipeline = dlt.pipeline(
pipeline_name="poke_pipeline_merge",
destination="duckdb",
dataset_name="pokemon_data",
)
_load_info = merge_pipeline.run(merge_pokemon)
print(_load_info)
merge_pipeline.dataset().pokemon.df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
The merge write disposition can be used with three different strategies:
* delete-insert (default strategy)
* scd2
* upsert
""")
return
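    @app.cell(hide_code=True)
    def _(mo):
        mo.md(r"""
        For illustration, a non-default strategy can be selected by passing a dictionary to `write_disposition`. A minimal sketch (assuming the dictionary form described in the dlt merge documentation, shown here with the `upsert` strategy):
        ```python
        @dlt.resource(
            name="pokemon",
            write_disposition={"disposition": "merge", "strategy": "upsert"},
            primary_key="id",
        )
        def upsert_pokemon():
            yield data
        ```
        With `upsert`, rows whose `primary_key` already exists are updated in place and new keys are inserted; `scd2` instead keeps a validity-tracked history of changed rows.
        """)
        return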
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Incremental Loading**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Incremental loading is the act of loading only new or changed data and not old records that we already loaded.
Imagine you're a Pokémon trainer trying to catch 'em all. You don't want to keep visiting the same old PokéStops, catching the same old Bulbasaurs; you only want to find new and exciting Pokémon that have appeared since your last trip. That's what incremental loading is all about: collecting only the new data that's been added or changed, without wasting your Poké Balls (or database resources) on what you already have.
In this example, we have a dataset of Pokémon, each with a **unique ID**, their **name**, **size** (height and weight), and **when** they were "caught" (`created_at` field).
### **Step 1: Adding the `created_at` Field**
""")
return
@app.cell
def _():
# We added `created_at` field to the data
created_data = [
{
"id": "1",
"name": "bulbasaur",
"size": {"weight": 6.9, "height": 0.7},
"created_at": "2024-12-01",
},
{
"id": "4",
"name": "charmander",
"size": {"weight": 8.5, "height": 0.6},
"created_at": "2024-09-01",
},
{
"id": "25",
"name": "pikachu",
"size": {"weight": 6, "height": 0.4},
"created_at": "2023-06-01",
},
]
return (created_data,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**The goal**: Load only Pokémons caught after January 1, 2024, skipping the ones you already have.
### **Step 2: Defining the incremental logic**
Using `dlt`, we set up an [incremental filter](https://dlthub.com/docs/general-usage/incremental-loading#incremental-loading-with-a-cursor-field) to only fetch Pokémons caught after a certain date:
```python
cursor_date = dlt.sources.incremental("created_at", initial_value="2024-01-01")
```
This tells `dlt`:
- **Start date**: January 1, 2024 (`initial_value`).
- **Field to track**: `created_at` (our timestamp).
As you run the pipeline repeatedly, `dlt` will keep track of the latest `created_at` value processed. It will skip records older than this date in future runs.
""")
return
@app.cell
def _(TDataItems, created_data, dlt):
@dlt.resource(name="pokemon", write_disposition="append")
def incremental_pokemon(
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
"created_at", initial_value="2024-01-01"
)
) -> TDataItems:
yield created_data
return (incremental_pokemon,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""We use the `@dlt.resource` decorator to declare table **name** to which data will be loaded and **write disposition**, which is **append** by default."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Step 3: Running the pipeline**
Finally, we run our pipeline and load the fresh Pokémon data:
""")
return
@app.cell
def _(dlt, incremental_pokemon):
incremental_pipeline = dlt.pipeline(
pipeline_name="poke_pipeline_incremental",
destination="duckdb",
dataset_name="pokemon_data",
)
_load_info = incremental_pipeline.run(incremental_pokemon)
print(_load_info)
# explore loaded data
incremental_pipeline.dataset().pokemon.df()
return (incremental_pipeline,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
This:
1. Loads **only Charmander and Bulbasaur** (caught after 2024-01-01).
2. Skips Pikachu because it's old news.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Only data for 2024 year was loaded.""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_5_Write_disposition_and_incremental_loading_img2](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_5_Write_disposition_and_incremental_loading_img2.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Run the same pipeline again. The pipeline will detect that there are **no new records** based on the `created_at` field and the incremental cursor. As a result, **no new data will be loaded** into the destination:
>0 load package(s) were loaded
""")
return
@app.cell
def _(incremental_pipeline, incremental_pokemon):
_load_info = incremental_pipeline.run(incremental_pokemon)
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### **Why incremental loading matters**
* **Efficiency**. Skip redundant data, saving time and resources.
* **Scalability**. Handle growing datasets without bottlenecks.
* **Automation**. Let the tool track changes for you—no manual effort.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **Update and deduplicate your data**
The script above finds new pokemons and adds them to the database, but it ignores any updates to records that were already loaded.
""")
return
@app.cell
def _():
# We added `updated_at` field to the data
updated_data = [
{
"id": "1",
"name": "bulbasaur",
"size": {"weight": 6.9, "height": 0.7},
"created_at": "2024-12-01",
"updated_at": "2024-12-01",
},
{
"id": "4",
"name": "charmander",
"size": {"weight": 8.5, "height": 0.6},
"created_at": "2024-09-01",
"updated_at": "2024-09-01",
},
{
"id": "25",
"name": "pikachu",
"size": {
"weight": 9,
"height": 0.4,
},
"created_at": "2023-06-01",
"updated_at": "2024-12-16",
},
]
return (updated_data,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Get always fresh content of all the pokemons: combine an **incremental load** with **merge** write disposition, like in the script below."""
)
return
@app.cell
def _(TDataItems, dlt):
@dlt.resource(name="pokemon", write_disposition="merge", primary_key="id")
def dedup_pokemon(
data: TDataItems,
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
"updated_at", initial_value="2024-01-01"
),
) -> TDataItems:
yield data
return (dedup_pokemon,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""The incremental cursor keeps an eye on the `updated_at` field. Every time the pipeline runs, it only processes records with `updated_at` values greater than the last run."""
)
return
@app.cell
def _(dedup_pokemon, dlt, updated_data):
dedup_pipeline = dlt.pipeline(
pipeline_name="poke_pipeline_dedup",
destination="duckdb",
dataset_name="pokemon_data",
)
_load_info = dedup_pipeline.run(dedup_pokemon(updated_data))
print(_load_info)
# explore loaded data
dedup_pipeline.dataset().pokemon.df()
return (dedup_pipeline,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
All Pokémons are processed because this is the pipeline's first run.
Now, let's say Pikachu goes to the gym and sheds some weight (down to 7.5), and the `updated_at` field is set to `2024-12-23`.
""")
return
@app.cell
def _():
reupdated_data = [
{
"id": "1",
"name": "bulbasaur",
"size": {"weight": 6.9, "height": 0.7},
"created_at": "2024-12-01",
"updated_at": "2024-12-01",
},
{
"id": "4",
"name": "charmander",
"size": {"weight": 8.5, "height": 0.6},
"created_at": "2024-09-01",
"updated_at": "2024-09-01",
},
{
"id": "25",
"name": "pikachu",
"size": {"weight": 7.5, "height": 0.4},
"created_at": "2023-06-01",
"updated_at": "2024-12-23",
},
]
return (reupdated_data,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Run the same pipeline:""")
return
@app.cell
def _(dedup_pipeline, dedup_pokemon, reupdated_data):
_load_info = dedup_pipeline.run(dedup_pokemon(reupdated_data))
print(_load_info)
# explore loaded data
dedup_pipeline.dataset().pokemon.df()
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**What happened?**
* The pipeline detected that `updated_at` for Bulbasaur and Charmander hasn't changed, so they're skipped.
* Pikachu's record was updated to reflect the latest weight.
You can see that the **`_dlt_load_id`** for Bulbasaur and Charmander remained the same, but for Pikachu it was changed since only the updated Pikachu data was loaded into the destination.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
The **`dlt.sources.incremental`** instance above has the following attributes:
* **`cursor_date.initial_value`** which is always equal to "2024-01-01" passed in the constructor;
* **`cursor_date.start_value`** a maximum `updated_at` value from the previous run or the `initial_value` on the first run;
* **`cursor_date.last_value`** a "real-time" `updated_at` value updated with each yielded item or page. Before the first yield, it equals `start_value`;
* **`cursor_date.end_value`** (not used here) marking the end of the backfill range.
## **Example**
You can use them in the resource code to make **more efficient requests**. Take a look at the GitHub API example:
""")
return
@app.cell
def _(TDataItems, dlt, os):
from typing import Iterable
from dlt.extract import DltResource
from dlt.sources.helpers import requests
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator
dlt.secrets["SOURCES__ACCESS_TOKEN"] = os.getenv("SECRET_KEY")
@dlt.source
def github_source(access_token: str = dlt.secrets.value) -> Iterable[DltResource]:
client = RESTClient(
base_url="https://api.github.com",
auth=BearerTokenAuth(token=access_token),
paginator=HeaderLinkPaginator(),
)
@dlt.resource(name="issues", write_disposition="merge", primary_key="id")
def github_issues(
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
"updated_at", initial_value="2024-12-01"
)
) -> TDataItems:
params = {"since": cursor_date.last_value, "status": "open"}
for page in client.paginate("repos/dlt-hub/dlt/issues", params=params):
yield page
return github_issues
pipeline = dlt.pipeline(pipeline_name="github_incr", destination="duckdb")
_load_info = pipeline.run(github_source())
print(_load_info)
return github_source, pipeline
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Pay attention to how we use the **since** GitHub API parameter and `cursor_date.last_value` to tell GitHub which issues we are interested in. `cursor_date.last_value` holds the last `cursor_date` value from the previous run.
Run the pipeline again and make sure that **no data is loaded**.
""")
return
@app.cell
def _(github_source, pipeline):
# run the pipeline with the new resource
_load_info = pipeline.run(github_source())
print(_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **Apply Hints**
Alternatively, you can use `apply_hints` on a resource to define an incremental field:
```python
resource = my_resource()  # instantiate your resource (placeholder name)
resource.apply_hints(incremental=dlt.sources.incremental("updated_at"))
```
When you apply an incremental hint using `apply_hints`, the source still performs a full extract. The incremental hint is used by `dlt` to filter the data after it has been extracted, before it is loaded into the destination.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **Exercise 1: Make the GitHub API pipeline incremental**
In the previous lessons, you built a pipeline to pull data from the GitHub API. Now, let's level it up by making it incremental, so it fetches only new or updated data.
Transform your GitHub API pipeline to use incremental loading. This means:
* Implement a new `dlt.resource` for `pulls/comments` (List comments for Pull Requests) endpoint.
* Fetch only pull request comments updated after the last pipeline run.
* Use the `updated_at` field from the GitHub API as the incremental cursor.
* [Endpoint documentation](https://docs.github.com/en/rest/pulls/comments?apiVersion=2022-11-28#list-review-comments-in-a-repository)
* Endpoint URL: `https://api.github.com/repos/OWNER/REPO/pulls/comments`
* Use the `since` parameter - only show results that were last updated after the given time - and `last_value`.
* `initial_value` is `2024-12-01`.
### Question
How many columns does the `comments` table have?
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)!"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,14 +6,14 @@
"id": "h93BcC8SX2fj"
},
"source": [
"# **Recap of [Lesson 5](https://colab.research.google.com/drive/1Zf24gIVMNNj9j-gtXFl8p0orI9ttySDn#forceEdit=true&sandboxMode=true) 👩‍💻🚀**\n",
"# **Recap of [Lesson 5](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) 👩‍💻🚀**\n",
"\n",
"1. Explored 3 dlt write dispositions:\n",
" * append;\n",
" * replace;\n",
" * merge.\n",
"2. Learned how to update and depuplicate data\n",
"3. Created incremental pipeline\n"
"1. Explored 3 `dlt` write dispositions: \n",
" - append \n",
" - replace \n",
" - merge \n",
"2. Learned how to update and deduplicate data \n",
"3. Created an incremental pipeline\n"
]
},
{
@@ -22,16 +22,16 @@
"id": "26boldDvOn0R"
},
"source": [
"# **How dlt works** 🧠🧠 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)\n",
"# **How dlt works** 🧠🧠 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)\n",
"\n",
"\n",
"**Here, you will learn:**\n",
"- Three main steps:\n",
" - Extract;\n",
" - Normalize;\n",
" - Load. \n",
"- Some default behaviour.\n",
"- About file formats."
"- The 3 main steps of a pipeline run: \n",
" - Extract \n",
" - Normalize \n",
" - Load \n",
"- Some default behaviors \n",
"- Supported file formats"
]
},
{
@@ -42,7 +42,7 @@
"source": [
"## **Introduction**\n",
"\n",
"The main building block of dlt is the **pipeline**, which orchestrates the loading of data from your source into your destination in three discrete steps when you call its **run** method."
"The main building block of `dlt` is the **pipeline**, which orchestrates the loading of data from your source into your destination in three discrete steps when you call its **run** method."
]
},
{
@@ -60,8 +60,8 @@
"id": "Xh6CKQATb63X"
},
"source": [
"# **Understing `pipeline.run()`**\n",
" The `pipeline.run()` method executes the entire pipeline, encompassing the [`extract`](#scrollTo=4C0U1dnwZxAB), [`normalize`](#scrollTo=bCeUqaW_cRSh), and [`load`](#scrollTo=Rn6cUc0OcWsk) stages."
"# **Understanding `pipeline.run()`**\n",
" The `pipeline.run()` method executes the entire pipeline, encompassing the `extract`, `normalize`, and `load` stages."
]
},
{
@@ -89,7 +89,7 @@
"outputs": [],
"source": [
"%%capture\n",
"!pip install -U dlt"
"!pip install dlt"
]
},
{
@@ -136,14 +136,14 @@
"The `progress=\"log\"` argument in the `dlt.pipeline` configuration enables detailed logging of the pipelines progress during execution. These logs provide visibility into the pipelines operations, showing how data flows through the **Extract**, **Normalize**, and **Load** phases. The logs include real-time metrics such as resource or file counts, time elapsed, processing rates, memory usage, and CPU utilization.\n",
"\n",
"\n",
"dlt supports 4 progress monitors out of the box:\n",
"`dlt` supports 4 progress monitors out of the box:\n",
"\n",
"* `enlighten` - a status bar with progress bars that also allows for logging.\n",
"* `tqdm` - the most popular Python progress bar lib, proven to work in Notebooks.\n",
"* `alive_progress` - with the most fancy animations.\n",
"* `log` - dumps the progress information to log, console, or text stream. the most useful on production optionally adds memory and CPU usage stats.\n",
"* `log` dumps progress information to a log, console, or text stream; most useful in production, and can optionally include memory and CPU usage stats.\n",
"\n",
"For more information read the [official documentation](https://dlthub.com/docs/general-usage/pipeline#display-the-loading-progress)."
"For more information read the [official documentation](https://dlthub.com/docs/general-usage/pipeline#monitor-the-loading-progress)."
]
},
{
@@ -154,7 +154,7 @@
"source": [
"## **Extract**\n",
"\n",
"Extract can be run individually with the `extract` command on the pipeline:\n",
"Extract can be run individually with the `extract` method on the pipeline:\n",
"\n",
"```python\n",
"pipeline.extract(data)\n",
@@ -712,17 +712,8 @@
"id": "NYbccmLie1zm"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rZpSep8SV1SZ"
},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,663 @@
# /// script
# dependencies = [
# "dlt",
# "numpy",
# "pandas",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **Recap of [Lesson 5](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) 👩‍💻🚀**
1. Explored 3 `dlt` write dispositions:
- append
- replace
- merge
2. Learned how to update and deduplicate data
3. Created an incremental pipeline
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **How dlt works** 🧠🧠 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)
**Here, you will learn:**
- The 3 main steps of a pipeline run:
- Extract
- Normalize
- Load
- Some default behaviors
- Supported file formats
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **Introduction**
The main building block of `dlt` is the **pipeline**, which orchestrates the loading of data from your source into your destination in three discrete steps when you call its **run** method.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""So, let's take a step back and walk through the internal steps of `pipeline.run()`, identifying methods to optimize each one."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **Understanding `pipeline.run()`**
The `pipeline.run()` method executes the entire pipeline, encompassing the `extract`, `normalize`, and `load` stages.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_6_How_dlt_works_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_6_How_dlt_works_img1.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Consider this intentionally short example:""")
return
@app.cell
def _():
import dlt
pipeline = dlt.pipeline(
pipeline_name="my_pipeline", destination="duckdb", progress="log"
)
load_info = pipeline.run(
[
{"id": 1},
{"id": 2},
{"id": 3, "nested": [{"id": 1}, {"id": 2}]},
],
table_name="items",
)
print(load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
This is what happens when the `run` method is executed:
1. **Extract** - Fully extracts the data from your source to your hard drive. In the example above, an implicit source with one resource with 3 items is created and extracted.
2. **Normalize** - Inspects and normalizes your data and computes a schema compatible with your destination. For the example above, the normalizer will detect one column `id` of type `int` in one table named `items`, it will furthermore detect a nested list in table items and unnest it into a child table named `items__nested`.
3. **Load** - Runs schema migrations if necessary on your destination and loads your data into the destination. For the example above, a new dataset on a local duckdb database is created that contains the two tables discovered in the previous steps.
## **Display the loading progress**
Notice how we use `progress="log"` here.
The `progress="log"` argument in the `dlt.pipeline` configuration enables detailed logging of the pipelines progress during execution. These logs provide visibility into the pipelines operations, showing how data flows through the **Extract**, **Normalize**, and **Load** phases. The logs include real-time metrics such as resource or file counts, time elapsed, processing rates, memory usage, and CPU utilization.
`dlt` supports 4 progress monitors out of the box:
* `enlighten` - a status bar with progress bars that also allows for logging.
* `tqdm` - the most popular Python progress bar lib, proven to work in Notebooks.
* `alive_progress` - with the fanciest animations.
* `log` — dumps progress information to a log, console, or text stream; most useful in production, and can optionally include memory and CPU usage stats.
For more information read the [official documentation](https://dlthub.com/docs/general-usage/pipeline#monitor-the-loading-progress).
""")
return
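    @app.cell(hide_code=True)
    def _(mo):
        mo.md(r"""
        Switching monitors is just a matter of changing the `progress` argument. A minimal sketch (note that `tqdm`, `enlighten`, or `alive_progress` must be installed separately if you pick them):
        ```python
        pipeline = dlt.pipeline(
            pipeline_name="my_pipeline", destination="duckdb", progress="tqdm"
        )
        ```
        """)
        return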
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **Extract**
Extract can be run individually with the `extract` method on the pipeline:
```python
pipeline.extract(data)
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### **What happens at the extraction stage?**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
When the `pipeline.run()` method is executed, it first performs the `extract` stage, during which the following occurs:
1. Data is fetched and stored in an in-memory buffer.
2. When the buffer reaches its capacity, the data inside it is written to an intermediary file, and the buffer is cleared for the next set of data items.
3. If a size is specified for intermediary files and the intermediary file in question reaches this size, a new intermediary file is opened for further data.
```
API Data
| (extract)
Buffer
(resources) / | ... | \
extracted data in local storage
```
The **number** of intermediate **files** depends on the number of **resources** and whether **file rotation** is enabled.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### **Default behaviour at the extraction stage**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
- The in-memory buffer is set to `5000` items.
- By default, **intermediary files are not rotated**. If you do not explicitly set a size for an intermediary file with `file_max_items=100000`, `dlt` will create a **single file** for a resource, regardless of the number of records it contains, even if it reaches millions.
- By default, intermediary files at the extract stage use a custom version of the JSONL format.
""")
return
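    @app.cell(hide_code=True)
    def _(mo):
        mo.md(r"""
        As a sketch, both the buffer size and file rotation can be adjusted through the data writer configuration (the section names below follow dlt's performance docs; double-check them for your version):
        ```py
        # config.toml
        [sources.data_writer]   # extract stage
        buffer_max_items=10000
        file_max_items=100000   # enables rotation of intermediary files
        ```
        The same keys under `[normalize.data_writer]` apply to the normalize stage, and `[data_writer]` applies to both stages at once.
        """)
        return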
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **Normalize**
Normalize can be run individually with the `normalize` command on the pipeline. Normalize is dependent on having a completed extract phase and will not do anything if there is no extracted data.
```py
pipeline.normalize()
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### **What happens at the normalization stage?**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
In the `normalize` stage, `dlt` first transforms the structure of the input data. This transformed data is then converted into a relational structure that can be easily loaded into the destination. In more detail, here's what happens during this stage:
1. Intermediary files are sent from the `extract` stage to the `normalize` stage.
2. The normalize step processes one intermediary file at a time within its own in-memory buffer.
3. When the buffer reaches its capacity, the normalized data inside it is written to an intermediary file, and the buffer is cleared for the next set of data items.
4. If a size is specified for intermediary files in the normalize stage and the intermediary file in question reaches this size, a new intermediary file is opened.
```
(extract)
API Data --> extracted files in local storage
/ | \ (normalize)
one file ... one file
/ | \ / | \
normalized files normalized files
```
The **number** of intermediate **files** depends on the number of **resources** and whether **file rotation** is enabled.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### **Default behaviour at the normalization stage**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
- The in-memory buffer is set to `5000`, just like at the extraction stage.
- Here too, **intermediary files are not rotated** by default. If you do not explicitly set a size for an intermediary file with `file_max_items=100000`, `dlt` will create a **single file** per resource, regardless of the number of records it contains, even if it reaches millions.
""")
return
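    @app.cell(hide_code=True)
    def _(mo):
        mo.md(r"""
        The equivalent settings for this stage live under the `normalize.data_writer` section and can also be set as environment variables (a sketch, assuming the standard section-to-variable mapping):
        ```py
        export NORMALIZE__DATA_WRITER__BUFFER_MAX_ITEMS="10000"
        export NORMALIZE__DATA_WRITER__FILE_MAX_ITEMS="100000"
        ```
        """)
        return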
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## **Load**
Load can be run individually with the `load` command on the pipeline. Load is dependent on having a completed normalize phase and will not do anything if there is no normalized data.
```py
pipeline.load()
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### **What happens at the loading stage?**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
The `load` stage is responsible for taking the normalized data and loading it into your chosen destination:
1. All intermediary files from a single source are combined into a single load package.
2. All load packages are then loaded into the destination.
```
(extract) (normalize)
API Data --> extracted files --> normalized files
/ | ... | \ (load)
one normalized file ... one file
\ | ... | /
destination
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### **Default behaviour at the loading stage**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""- Loading happens in `20` threads, each loading a single file.""")
return
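    @app.cell(hide_code=True)
    def _(mo):
        mo.md(r"""
        The number of load workers is configurable, for example (a sketch based on the `load` section documented in dlt's performance guide):
        ```py
        [load]
        workers=8
        ```
        or `export LOAD__WORKERS="8"` as an environment variable.
        """)
        return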
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""## **Intermediary file formats**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Intermediary files at the extract stage use a custom version of the JSONL format, while the loader files - files created at the normalize stage - can take 4 different formats."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### **JSONL**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**Definition**: JSONL (JSON Delimited, also known as newline-delimited JSON) is a file format that stores several JSON documents in one file, with each JSON document separated by a new line.
**Compression:** enabled by default.
**Data type handling:**
- `datetime` and `date` are stored as ISO strings;
- `decimal` is stored as a text representation of a decimal number;
- `binary` is stored as a base64 encoded string;
- `HexBytes` is stored as a hex encoded string;
- `complex` is serialized as a string.
**By default used by:**
- Bigquery
- Snowflake
- Filesystem
**Configuration**:
- Directly in the `pipeline.run()`:
```py
info = pipeline.run(some_source(), loader_file_format="jsonl")
```
- In `config.toml` or `secrets.toml`:
```py
[normalize]
loader_file_format="jsonl"
```
- Via environment variables:
```py
export NORMALIZE__LOADER_FILE_FORMAT="jsonl"
```
- Specify directly in the resource decorator:
```py
@dlt.resource(file_format="jsonl")
def generate_rows():
...
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### **Parquet**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**Definition**: Apache Parquet is a free and open-source column-oriented data storage format in the Apache Hadoop ecosystem.
**Prerequisite:** To use this format, you need a pyarrow package. You can get this package as a dlt extra as well:
```py
pip install "dlt[parquet]"
```
**Default version**: 2.4, which coerces timestamps to microseconds and silently truncates nanoseconds for better compatibility with databases and pandas.
**Supported by:**
- Bigquery
- DuckDB
- Snowflake
- Filesystem
- Athena
- Databricks
- Synapse
**Configuration**:
- Directly in the `pipeline.run()`:
```py
info = pipeline.run(some_source(), loader_file_format="parquet")
```
- In `config.toml` or `secrets.toml`:
```py
[normalize]
loader_file_format="parquet"
```
- Via environment variables:
```py
export NORMALIZE__LOADER_FILE_FORMAT="parquet"
```
- Specify directly in the resource decorator:
```py
@dlt.resource(file_format="parquet")
def generate_rows():
...
```
**Destination AutoConfig**:
`dlt` automatically configures the Parquet writer based on the destination's capabilities:
- Selects the appropriate decimal type and sets the correct precision and scale for accurate numeric data storage, including handling very small units like Wei.
- Adjusts the timestamp resolution (seconds, microseconds, or nanoseconds) to match what the destination supports
**Writer settings:**
`dlt` uses the pyarrow Parquet writer for file creation. You can adjust the writer's behavior with the following options:
- `flavor` adjusts schema and compatibility settings for different target systems. Defaults to None (pyarrow default).
- `version` selects Parquet logical types based on the Parquet format version. Defaults to "2.6".
- `data_page_size` sets the target size for data pages within a column chunk (in bytes). Defaults to None.
- `timestamp_timezone` specifies the timezone; defaults to UTC.
- `coerce_timestamps` sets the timestamp resolution (s, ms, us, ns).
- `allow_truncated_timestamps` raises an error if precision is lost on truncated timestamps.
**Example configurations:**
- In `configs.toml` or `secrets.toml`:
```py
[normalize.data_writer]
# example values (not the defaults)
flavor="spark"
version="2.4"
data_page_size=1048576
timestamp_timezone="Europe/Berlin"
```
- Via environment variables:
```py
export NORMALIZE__DATA_WRITER__FLAVOR="spark"
```
**Timestamps and timezones**
`dlt` adds UTC adjustments to all timestamps, creating timezone-aware timestamp columns in destinations (except DuckDB).
**Disable timezone/UTC adjustments:**
- Set `flavor` to spark to use the deprecated `int96` timestamp type without logical adjustments.
- Set `timestamp_timezone` to an empty string (`DATA_WRITER__TIMESTAMP_TIMEZONE=""`) to generate logical timestamps without UTC adjustment.
By default, pyarrow converts timezone-aware DateTime objects to UTC and stores them in Parquet without timezone information.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### **CSV**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**Supported by:**
- PostgreSQL
- Filesystem
- Snowflake
**Configuration**:
- Directly in the `pipeline.run()`:
```py
info = pipeline.run(some_source(), loader_file_format="csv")
```
- In `config.toml` or `secrets.toml`:
```py
[normalize]
loader_file_format="csv"
```
- Via environment variables:
```py
export NORMALIZE__LOADER_FILE_FORMAT="csv"
```
- Specify directly in the resource decorator:
```py
@dlt.resource(file_format="csv")
def generate_rows():
...
```
**Two implementations**:
1. `pyarrow` csv writer - very fast, multithreaded writer for the arrow tables
- binary columns are supported only if they contain valid UTF-8 characters
- complex (nested, struct) types are not supported
2. `python stdlib writer` - a csv writer included in the Python standard library for Python objects
- binary columns are supported only if they contain valid UTF-8 characters (easy to add more encodings)
- complex columns dumped with json.dumps
- None values are always quoted
**Default settings:**
- separators are commas
- quotes are " and are escaped as ""
- NULL values are written both as empty strings and as empty (unquoted) tokens
- UNIX new lines are used
- dates are represented as ISO 8601
quoting style is "when needed"
**Adjustable setting:**
- `delimiter`: change the delimiting character (default: ',')
- `include_header`: include the header row (default: True)
- `quoting`: `quote_all` - all values are quoted, `quote_needed` - quote only values that need quoting (default: `quote_needed`)
```py
[normalize.data_writer]
delimiter="|"
include_header=false
quoting="quote_all"
```
or
```py
NORMALIZE__DATA_WRITER__DELIMITER=|
NORMALIZE__DATA_WRITER__INCLUDE_HEADER=False
NORMALIZE__DATA_WRITER__QUOTING=quote_all
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""### **SQL INSERT File Format**""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
This file format contains an INSERT...VALUES statement to be executed on the destination during the `load` stage.
Additional data types are stored as follows:
- `datetime` and `date` are stored as ISO strings;
- `decimal` is stored as a text representation of a decimal number;
- `binary` storage depends on the format accepted by the destination;
- `complex` storage also depends on the format accepted by the destination.
This file format is compressed by default.
**Default for:**
1. DuckDB
2. PostgreSQL
3. Redshift
**Supported by:**
1. Filesystem
**Configuration**:
- Directly in the `pipeline.run()`:
```py
info = pipeline.run(some_source(), loader_file_format="insert_values")
```
- In `config.toml` or `secrets.toml`:
```py
[normalize]
loader_file_format="insert_values"
```
- Via environment variables:
```py
export NORMALIZE__LOADER_FILE_FORMAT="insert_values"
```
- Specify directly in the resource decorator:
```py
@dlt.resource(file_format="insert_values")
def generate_rows():
...
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)!"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,14 +6,14 @@
"id": "h93BcC8SX2fj"
},
"source": [
"# **Recap of [Lesson 6](https://colab.research.google.com/drive/1geSMNRkSwAelQJKd3e8vdoHCKiHMdmIo#forceEdit=true&sandboxMode=true) 👩‍💻🚀**\n",
"# **Recap of [Lesson 6](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) 👩‍💻🚀**\n",
"\n",
"1. Learned how dlt works under the hood;\n",
"2. Explored 3 main steps:\n",
" * Extract;\n",
" * Normalize;\n",
" * Load.\n",
"3. Learned which file formats dlt supports."
"1. Learned how `dlt` works under the hood. \n",
"2. Explored the 3 main steps of a pipeline run: \n",
" - Extract \n",
" - Normalize \n",
" - Load \n",
"3. Learned which file formats `dlt` supports."
]
},
{
@@ -24,7 +24,7 @@
"source": [
"---\n",
"\n",
"# **Inspecting & Adjusting Schema** 🧠🧠 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)\n",
"# **Inspecting & Adjusting Schema** 🧠🧠 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)\n",
"\n",
"\n",
"**Here, you will learn or refresh your knowledge on:**\n",
@@ -56,7 +56,7 @@
"id": "1vRudCVb9zII"
},
"source": [
"Let's load some GitHub data to DuckDB to inspect the schema in different ways. First we need to install dlt with DuckDB:"
"Let's load some GitHub data to DuckDB to inspect the schema in different ways."
]
},
{
@@ -68,7 +68,7 @@
"outputs": [],
"source": [
"%%capture\n",
"!pip install -U dlt"
"!pip install dlt"
]
},
{
@@ -77,7 +77,7 @@
"id": "DKvf4NWW-U9V"
},
"source": [
"Define a dlt resource that fetches pull requests and wrap it in a dlt source, create a pipeline and run it:"
"Define a `dlt` resource that fetches pull requests and wrap it in a `dlt` source, create a pipeline and run it:"
]
},
{
@@ -100,7 +100,7 @@
"import os\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
"dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
"\n",
"\n",
"@dlt.source\n",
@@ -259,7 +259,7 @@
" pipeline_name=\"github_pipeline2\",\n",
" destination=\"duckdb\",\n",
" dataset_name=\"github_data\",\n",
" export_schema_path=\"schemas/export\", # <--- dir path for a schema export\n",
" export_schema_path=\"schemas/export\",\n",
")"
]
},
@@ -308,7 +308,9 @@
},
"outputs": [],
"source": [
"!ls schemas/export && cat schemas/export/github_source.schema.yaml"
"print(os.listdir(\"schemas/export\"))\n",
"with open(\"schemas/export/github_source.schema.yaml\") as f:\n",
" print(f.read())"
]
},
{
@@ -957,7 +959,8 @@
},
"outputs": [],
"source": [
"!cat schemas/export/github_source.schema.yaml"
"with open(\"schemas/export/github_source.schema.yaml\") as f:\n",
" print(f.read())"
]
},
{
@@ -977,17 +980,8 @@
"id": "NYbccmLie1zm"
},
"source": [
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1jp5UtydA3x9cAq-fbW2tRmAOl4LMZqM1#forceEdit=true&sandboxMode=true)!"
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "gxU44wP9GvG6"
},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -0,0 +1,882 @@
# /// script
# dependencies = [
# "dlt",
# "numpy",
# "pandas",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **Recap of [Lesson 6](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) 👩‍💻🚀**
1. Learned how `dlt` works under the hood.
2. Explored the 3 main steps of a pipeline run:
- Extract
- Normalize
- Load
3. Learned which file formats `dlt` supports.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
# **Inspecting & Adjusting Schema** 🧠🧠 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)
**Here, you will learn or refresh your knowledge on:**
- Methods to inspect a schema
- The components of a schema
- How to modify a schema
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Methods to inspect a schema**
- **What's a schema?** The schema describes the structure of normalized data (e.g. tables, columns, data types, etc.). `dlt` generates schemas from the data during the normalization process.
- **How can you inspect a schema in `dlt`?** There are multiple ways:
- CLI
- Python
- Export schema directly
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Let's load some GitHub data to DuckDB to inspect the schema in different ways."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Define a `dlt` resource that fetches pull requests and wrap it in a `dlt` source, create a pipeline and run it:"""
)
return
@app.cell
def _():
from typing import Iterable
import dlt
from dlt.common.typing import TDataItems
from dlt.extract import DltResource
from dlt.sources.helpers import requests
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator
import os
dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY")
@dlt.source
def github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]:
client = RESTClient(
base_url="https://api.github.com",
auth=BearerTokenAuth(token=secret_key),
paginator=HeaderLinkPaginator(),
)
@dlt.resource
def github_pulls(
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
"updated_at", initial_value="2024-12-01"
)
) -> TDataItems:
params = {"since": cursor_date.last_value, "status": "open"}
for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params):
yield page
return github_pulls
# define new dlt pipeline
pipeline = dlt.pipeline(
pipeline_name="github_pipeline1",
destination="duckdb",
dataset_name="github_data",
)
# run the pipeline with the new resource
load_info = pipeline.run(github_source())
print(load_info)
return dlt, github_source, load_info, os
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(0) CLI**
Let's first try the CLI command `dlt pipeline -v <pipeline_name> load-package`, which is used to inspect a load package in verbose mode.
> In the context of the `dlt` library, a load package is a collection of jobs with data for particular tables. The -v flag stands for verbose, which means the command will provide more detailed output.
Specifically, this command will show the schema changes introduced in the load package for the given pipeline.
""")
return
@app.cell
def _():
import subprocess
subprocess.run(
["dlt", "pipeline", "-v", "github_pipeline1", "load-package"], check=True
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(1) Python**
Alternatively, we can inspect the schema object from load info with:
```python
print(load_info.load_packages[0].schema)
```
which has the following public methods and attributes:
""")
return
@app.cell
def _(load_info):
# This code snippet just prints out the public methods and attributes of the schema object in load info
all_attributes_methods = dir(load_info.load_packages[0].schema)
public_attributes_methods = [
attr for attr in all_attributes_methods if not attr.startswith("_")
]
print(f"{'Attribute/Method':<50} {'Type':<10}")
print("-" * 40)
for attr in public_attributes_methods:
attr_value = getattr(load_info.load_packages[0].schema, attr)
if callable(attr_value):
print(f"{attr:<50} {'method':<10}")
else:
print(f"{attr:<50} {'attribute':<10}")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Let's use the `to_pretty_json` method and print the schema:""")
return
@app.cell
def _(load_info):
print(load_info.load_packages[0].schema.to_pretty_json())
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(2) Exporting schema**
> Exporting the data schema directly into a file might be even more straightforward than the two previous approaches.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""The instruction to export a schema should be provided at the beginning when creating a pipeline:"""
)
return
@app.cell
def _(dlt):
pipeline_1 = dlt.pipeline(
pipeline_name="github_pipeline2",
destination="duckdb",
dataset_name="github_data",
export_schema_path="schemas/export",
)
return (pipeline_1,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Run the pipeline:""")
return
@app.cell
def _(github_source, pipeline_1):
load_info_1 = pipeline_1.run(github_source())
print(load_info_1)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_7_Inspecting_%26_Adjusting_Schema_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_7_Inspecting_%26_Adjusting_Schema_img1.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Check if the schema was exported.""")
return
@app.cell
def _(os):
print(os.listdir("schemas/export"))
with open("schemas/export/github_source.schema.yaml") as _f:
print(_f.read())
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **The components of a schema**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""> Since we learned the ways we can inspect the schema, it's important to actually understand what it contains to be able to meaningfully adjust it later."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
A schema (in YAML format) looks something like this:
```yaml
version: 2
version_hash: wdIt+pExjT8Mj1ygQEMhq3E3SXtNBuIbHg0fDz9xD9I=
engine_version: 11
name: github_source
tables:
_dlt_version:
...
_dlt_loads:
...
github_pulls:
...
settings:
detections:
- iso_timestamp
default_hints:
not_null:
- _dlt_id
- _dlt_root_id
- _dlt_parent_id
- _dlt_list_idx
- _dlt_load_id
parent_key:
- _dlt_parent_id
root_key:
- _dlt_root_id
unique:
- _dlt_id
row_key:
- _dlt_id
normalizers:
names: snake_case
json:
module: dlt.common.normalizers.json.relational
previous_hashes:
- 0WLnuf3Jh1J1XsbVrV2eB824Z6heOlf5o912i1v3tho=
- 0d1z0RFV2O0OvfEWkebtSjxrCjjiyv1lOeNiF0V8Lws=
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(0) Schema version hash**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
The schema hash, denoted by `version_hash`, is generated from the actual schema content, excluding the hash values and version of the schema.
Each time the schema is changed, a new hash is produced.
> Note that during the initial run (the first pipeline run), the version will be 2, and there will be two previous hashes because the schema is updated during both the extract and normalize stages. You can rely on the version number to determine how many times the schema has been changed, but keep in mind that it stops being reliable when parallelization is introduced.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Each version hash is then stored in the `_dlt_version` table.""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_7_Inspecting_%26_Adjusting_Schema_img2](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_7_Inspecting_%26_Adjusting_Schema_img2.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
On subsequent runs, `dlt` checks if the generated schema hash is stored in this table. If it is not, `dlt` concludes that the schema has changed and migrates the destination accordingly.
- If multiple pipelines are sending data to the same dataset and there is a clash in table names, a single table with the union of the columns will be created.
- If columns clash and have different types or other incompatible characteristics, the load may fail if the data cannot be coerced.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(1) Naming convention**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Each schema contains a naming convention that is denoted in the following way when the schema is exported:
```yaml
...
normalizers:
names: snake_case # naming convention
...
```
The naming convention is particularly useful if the identifiers of the data to be loaded (e.g., keys in JSON files) need to match the namespace of the destination (such as Redshift, which accepts case-insensitive alphanumeric identifiers with a maximum of 127 characters). This convention is used by `dlt` to translate between these identifiers and namespaces.
The standard behavior of `dlt` is to use the same naming convention for all destinations, ensuring that users always see the same tables and columns in their databases.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
The default naming convention is `snake_case`:
- Removes all ASCII characters except alphanumerics and underscores.
- Adds an underscore (`_`) if the name starts with a number.
- Multiple underscores (`_`) are reduced to a single underscore.
- The parent-child relationship is expressed as a double underscore (`__`) in names.
- The identifier is shortened if it exceeds the length allowed at the destination.
> If you provide any schema elements that contain identifiers via decorators or arguments (e.g., `table_name` or `columns`), all the names used will be converted according to the naming convention when added to the schema. For example, if you execute `dlt.run(..., table_name="CamelCaseTableName")`, the data will be loaded into `camel_case_table_name`.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
To retain the original naming convention, you can define the following in your `config.toml`:
```python
[schema]
naming="direct"
```
or use an environment variable as:
```
SCHEMA__NAMING=direct
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(2) Schema settings**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
The `settings` section of the schema file allows you to define various global rules that impact how tables and columns are inferred from data.
```yaml
settings:
detections:
...
default_hints:
...
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**1. Detections**
You can define a set of functions that will be used to infer the data type of the column from a value. These functions are executed sequentially from top to bottom on the list.
```yaml
settings:
detections:
- timestamp # detects int and float values that can be interpreted as timestamps within a 5-year range and converts them
- iso_timestamp # detects ISO 8601 strings and converts them to timestamp
- iso_date # detects strings representing an ISO-like date (excluding timestamps) and converts them to date
- large_integer # detects integers too large for 64-bit and classifies as "wei" or converts to text if extremely large
- hexbytes_to_text # detects HexBytes objects and converts them to text
- wei_to_double # detects Wei values and converts them to double for aggregate non-financial reporting
```
> `iso_timestamp` detector is enabled by default.
Detectors can be removed or added directly in code:
```python
source = source()
source.schema.remove_type_detection("iso_timestamp")
source.schema.add_type_detection("timestamp")
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**2. Column hint rules**
The `default_hints` section in the schema file is used to define global rules that apply to newly inferred columns.
> These rules are applied **after normalization**, meaning after the naming convention is applied!
By default, schema adopts column hint rules from the json(relational) normalizer to support correct hinting of columns added by the normalizer:
```yaml
settings:
default_hints:
foreign_key:
- _dlt_parent_id
not_null:
- _dlt_id
- _dlt_root_id
- _dlt_parent_id
- _dlt_list_idx
- _dlt_load_id
unique:
- _dlt_id
root_key:
- _dlt_root_id
```
You can define column names with regular expressions as well.
```yaml
settings:
default_hints:
partition:
- re:_timestamp$ # add partition hint to all columns ending with _timestamp
```
Column hints can be added directly in code:
```python
source = data_source()
# this will update existing hints with the hints passed
source.schema.merge_hints({"partition": ["re:_timestamp$"]})
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**3. Preferred data types**
In the `preferred_types` section, you can define rules that will set the data type for newly created columns. On the left side, you specify a rule for a column name, and on the right side, you define the corresponding data type. You can use column names directly or with regular expressions to match them.
```yaml
settings:
preferred_types:
re:timestamp: timestamp
inserted_at: timestamp
created_at: timestamp
updated_at: timestamp
```
Above, we prefer the `timestamp` data type for all columns containing the timestamp substring and define exact matches for certain columns.
Preferred data types can be added directly in code as well:
```python
source = data_source()
source.schema.update_preferred_types(
{
"re:timestamp": "timestamp",
"inserted_at": "timestamp",
"created_at": "timestamp",
"updated_at": "timestamp",
}
)
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **How to modify a schema**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Speaking of data types... you can directly apply data types and hints to your resources, bypassing the need for importing and adjusting schemas. This approach is ideal for rapid prototyping and handling data sources with dynamic schema requirements.
The two main approaches are:
- Using the `columns` argument in the `dlt.resource` decorator.
- Using the `apply_hints` method.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **`(0) @dlt.resource(columns=...)`**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
This code snippet sets up a nullable boolean column named `my_column` directly in the decorator.
```python
@dlt.resource(name='my_table', columns={"my_column": {"data_type": "bool", "nullable": True}})
def my_resource():
for i in range(10):
yield {'my_column': i % 2 == 0}
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(1) `apply_hints`**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
When dealing with dynamically generated resources or needing to programmatically set hints, `apply_hints` is your go-to tool.
The `apply_hints` method in dlt is used to programmatically **set** or **adjust** various aspects of your data resources or pipeline. It can be used in several ways:
* You can use `apply_hints` to **directly define data types** and their properties, such as nullability, within the `@dlt.resource` decorator. This eliminates the dependency on external schema files.
* When **dealing with dynamically generated resources** or needing to programmatically set hints, `apply_hints` is your tool. It's especially useful for applying hints across various collections or tables at once.
* `apply_hints` can be used to **load your data incrementally**. For example, you can load only files that have been updated since the last time dlt processed them, or load only the new or updated records by looking at a specific column.
* You can **set or update the table name, columns, and other schema elements** while your resource is executing and already yielding data. Such changes will be merged with the existing schema in the same way the `apply_hints` method works.
For example, to apply a complex data type across all collections from a MongoDB source:
```python
all_collections = ["collection1", "collection2", "collection3"] # replace with your actual collection names
source_data = mongodb().with_resources(*all_collections)
for col in all_collections:
source_data.resources[col].apply_hints(columns={"column_name": {"data_type": "complex"}})
pipeline = dlt.pipeline(
pipeline_name="mongodb_pipeline",
destination="duckdb",
dataset_name="mongodb_data"
)
load_info = pipeline.run(source_data)
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(2) Adjusting schema settings**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""> Maybe you've noticed, but there several ways to adjust your schema settings directly in code were already covered. This is just a recap. You can go back directly to the Schema Settings section."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Detectors can be removed or added directly in code:
```python
source = data_source()
source.schema.remove_type_detection("iso_timestamp")
source.schema.add_type_detection("timestamp")
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Column hints can be added directly in code:
```python
source = data_source()
# this will update existing hints with the hints passed
source.schema.merge_hints({"partition": ["re:_timestamp$"]})
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Preferred data types can be added directly in code as well:
```python
source = data_source()
source.schema.update_preferred_types(
{
"re:timestamp": "timestamp",
"inserted_at": "timestamp",
"created_at": "timestamp",
"updated_at": "timestamp",
}
)
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(3) Importing a schema**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""> We mentioned that you can export a schema. In a similar fashion you can import a schema. The usual approach to use this functionaility is to export the schema first, make the adjustments and put the adjusted schema into the corresponding import folder."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""The instruction to import a schema should be provided at the beginning when creating a pipeline:"""
)
return
@app.cell
def _(dlt):
pipeline_2 = dlt.pipeline(
pipeline_name="github_pipeline3",
destination="duckdb",
dataset_name="github_data",
export_schema_path="schemas/export",
import_schema_path="schemas/import",
)
return (pipeline_2,)
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Let's make an initial pipeline run to export schema into the file."""
)
return
@app.cell
def _(github_source, pipeline_2):
# run the pipeline with the new resource
load_info_2 = pipeline_2.run(github_source())
print(load_info_2)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Look at the "Files" in the left sidebar, see the `schema` folder, and `export` and `import` folders inside."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_7_Inspecting_%26_Adjusting_Schema_img3](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_7_Inspecting_%26_Adjusting_Schema_img3.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Now, both folders contain identical schema files.
### **Exercise 1: Adjust import schema**
**Adjust the import schema** by adding a description of the **`github_pulls`** table.
```yaml
github_pulls:
columns:
updated_at:
incremental: true
write_disposition: append
resource: github_pulls
description: Table contains all pull requests information from dlt repository
```
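If you prefer to make the change in code rather than editing the file by hand, here is a minimal sketch (assuming the import schema was written to `schemas/import/github_source.schema.yaml` and that `pyyaml` is installed):
```python
import yaml

path = "schemas/import/github_source.schema.yaml"

with open(path) as f:
    schema_doc = yaml.safe_load(f)

# add a human-readable description to the github_pulls table
schema_doc["tables"]["github_pulls"]["description"] = (
    "Table contains all pull requests information from dlt repository"
)

with open(path, "w") as f:
    yaml.safe_dump(schema_doc, f, sort_keys=False)
```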
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Run the pipeline:""")
return
@app.cell
def _(github_source, pipeline_2):
load_info_3 = pipeline_2.run(github_source())
print(load_info_3)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Check the exported schema file. It should now contain a description for the `github_pulls` table."""
)
return
@app.cell
def _():
with open("schemas/export/github_source.schema.yaml") as _f:
print(_f.read())
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Question
What **data type** does the column `version` in the `_dlt_version` table have?
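One way to check in code, a sketch using the pipeline schema (you can also simply search the schema file printed above):
```python
version_table = pipeline_2.default_schema.get_table("_dlt_version")
print(version_table["columns"]["version"]["data_type"])
```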
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)!"""
)
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -6,12 +6,12 @@
"id": "h93BcC8SX2fj"
},
"source": [
"# **Recap of [Lesson 7](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r#forceEdit=true&sandboxMode=true) 👩‍💻🚀**\n",
"# **Recap of [Lesson 7](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) 👩‍💻🚀**\n",
"\n",
"1. Learned what is a schema.\n",
"1. Learned what a schema is.\n",
"2. Explored schema settings and components.\n",
"3. Learned how to retrieve dlt pipeline schema.\n",
"4. Learned how to adjust schema."
"3. Learned how to retrieve a dlt pipeline schema.\n",
"4. Learned how to adjust the schema."
]
},
{
@@ -22,13 +22,13 @@
"source": [
"---\n",
"\n",
"# **Understanding Pipeline Metadata and State** 👻📄 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)\n",
"# **Understanding Pipeline Metadata and State** 👻📄 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)\n",
"\n",
"\n",
"**Here, you will learn or brush up on:**\n",
"- What's pipeline metadata\n",
"- What pipeline metadata is\n",
"- Exploring pipeline metadata from load info\n",
"- Exploring pipeline metadate from trace\n",
"- Exploring pipeline metadata from trace\n",
"- Exploring pipeline metadata from state"
]
},
@@ -48,16 +48,16 @@
"id": "nFZNlDb1Y7ZH"
},
"source": [
"Metadata is basically data about data.\n",
"**Metadata** is essentially *data about data*.\n",
"\n",
"Pipeline Metadata is data about your data pipeline. This can be useful if you want to know things like:\n",
"**Pipeline metadata** is data about your data pipeline. This is useful when you want to know things like:\n",
"\n",
"- When your pipeline first ran\n",
"- When your pipeline last ran\n",
"- Information about your source or destination\n",
"- Processing time\n",
"- Or information that you yourself may want to add to the metadata\n",
"- And much more!\n"
"- Custom metadata you add yourself\n",
"- And much more!"
]
},
{
@@ -73,9 +73,9 @@
"id": "wY2ySVotY-JU"
},
"source": [
" `dlt` allows you to be able to view all this metadata through various options!\n",
"`dlt` allows you to view all this metadata through various options!\n",
"\n",
"This notebook will walk you through those options. Namely:\n",
"This notebook will walk you through those options, namely:\n",
"\n",
"- Load info\n",
"- Trace\n",
@@ -88,7 +88,7 @@
"id": "JTR2acUYZbku"
},
"source": [
"Let's load some GitHub data to DuckDB to inspect the pipeline metadata in different ways. First we need to install dlt with DuckDB:"
"Let's load some GitHub data into DuckDB to inspect the pipeline metadata in different ways."
]
},
{
@@ -109,7 +109,7 @@
"id": "AhU2JVjTZn_j"
},
"source": [
"Define a dlt resource that fetches Pull Requests and wrap it in a dlt source:"
"Define a `dlt` resource that fetches Pull Requests and wrap it in a `dlt` source:"
]
},
{
@@ -129,10 +129,9 @@
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
"\n",
"import os\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
"dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
"\n",
"\n",
"@dlt.source\n",
@@ -206,13 +205,13 @@
"id": "NA2dPY3_a2Ue"
},
"source": [
"From the [`Inspecting & Adjusting Schema`](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r) Colab we've already learned that we can see which schema changes a load package has introduced with the command:\n",
"From the [`Inspecting & Adjusting Schema`](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) section, we've already learned that we can see which schema changes a load package introduced with the command:\n",
"\n",
"```\n",
"dlt pipeline -v <pipeline_name> load-package\n",
"```\n",
"\n",
"The verbose flag only accounts for the schema changes, so if we run it without the flag, we will still see the most recent load package info:"
"The verbose flag only shows schema changes, so if we run it **without** the flag, we will still see the most recent load package info:"
]
},
{
@@ -232,9 +231,9 @@
"id": "w9ztJjzWcB3q"
},
"source": [
"The `load_id` of a particular package is added to the top data tables (parent tables) and to the special `_dlt_loads` table with a status of 0 when the load process is fully completed. The `_dlt_loads` table tracks complete loads and allows chaining transformations on top of them.\n",
"The `load_id` of a particular package is added to the top data tables (parent tables) and to the special `_dlt_loads` table with a status of `0` when the load process is fully completed. The `_dlt_loads` table tracks completed loads and allows chaining transformations on top of them.\n",
"\n",
"We can also see load package info with a specific load id:"
"We can also view load package info for a specific `load_id` (replace the value with the one output above):\n"
]
},
{
@@ -264,12 +263,12 @@
"id": "Lg1lg6FVdKLl"
},
"source": [
"From the [`Inspecting & Adjusting Schema`](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r?usp=sharing) Colab we've also learned that a schema can be accessed with:\n",
"From the [`Inspecting & Adjusting Schema`](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) section, we've also learned that a schema can be accessed with:\n",
"\n",
"```python\n",
"print(load_info.load_packages[0].schema)\n",
"```\n",
"Similarly if we drop the schema part, we will just get the load package info:"
"Similarly, if we drop the schema part, we will get the load package info:"
]
},
{
@@ -351,7 +350,7 @@
"id": "P3_rFHz6elTy"
},
"source": [
"You can access pipeline trace using the command:\n",
"You can access the pipeline trace using the command:\n",
"\n",
"\n",
"```\n",
@@ -365,7 +364,7 @@
"id": "E2B3-30Yezbi"
},
"source": [
"Try on the github issues pipeline:"
"Try running it on the github issues pipeline:"
]
},
{
@@ -458,7 +457,7 @@
"id": "XMsVhKYHff20"
},
"source": [
"In particular how many rows of data were normalized:"
"How many rows of data were normalized:"
]
},
{
@@ -513,17 +512,19 @@
},
"source": [
"**When to use pipeline state**\n",
"- dlt uses the state internally to implement last value incremental loading. This use case should cover around 90% of your needs to use the pipeline state.\n",
"- `dlt` uses the state internally to implement last value incremental loading. This use case should cover around 90% of your needs to use the pipeline state.\n",
"- Store a list of already requested entities if the list is not much bigger than 100k elements.\n",
"- Store large dictionaries of last values if you are not able to implement it with the standard incremental construct.\n",
"- Store the custom fields dictionaries, dynamic configurations and other source-scoped state.\n",
"\n",
"**When not to use pipeline state**\n",
"\n",
"Do not use dlt state when it may grow to millions of elements. Do you plan to store modification timestamps of all of your millions of user records? This is probably a bad idea! In that case you could:\n",
"Do not use `dlt` state when it may grow to millions of elements. \n",
"For example, storing modification timestamps for millions of user records is a bad idea. \n",
"In that case, you could:\n",
"\n",
"- Store the state in dynamo-db, redis etc. taking into the account that if the extract stage fails you'll end with invalid state.\n",
"- Use your loaded data as the state. dlt exposes the current pipeline via dlt.current.pipeline() from which you can obtain sqlclient and load the data of interest. In that case try at least to process your user records in batches."
"- Store the state in DynamoDB, Redis, etc., keeping in mind that if the extract stage fails, you may end up with invalid state.\n",
"- Use your loaded data as the state. `dlt` exposes the current pipeline via `dlt.current.pipeline()`, from which you can obtain a `sql_client` and load the data you need. If you choose this approach, try to process your user records in batches."
]
},
{
@@ -634,10 +635,9 @@
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
"\n",
"import os\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
"dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
"\n",
"\n",
"@dlt.source\n",
@@ -696,7 +696,7 @@
"id": "UEBszW96bX1F"
},
"source": [
"In the state you will see the new items:"
"In the state, you will see the new items:"
]
},
{
@@ -748,10 +748,9 @@
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
"\n",
"import os\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
"dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
"\n",
"\n",
"@dlt.source\n",
@@ -826,11 +825,10 @@
"id": "im-o7K5IkoW5"
},
"source": [
"You can also access the source-scoped state with `dlt.current.source_state()` which can be shared across resources of a particular source and is also available **read-only** in the source-decorated functions. The most common use case for the source-scoped state is to store mapping of custom fields to their displayable names.\n",
"You can also access the source-scoped state with `dlt.current.source_state()` which can be shared across resources of a particular source and is also available **read-only** in the source-decorated functions. The most common use case for the source-scoped state is to store the mapping of custom fields to their displayable names.\n",
"\n",
"Let's read some custom keys from the state:\n",
"Let's read some custom keys from the state with:\n",
"```python\n",
"# Let's read some custom state information\n",
"source_new_keys = dlt.current.source_state().get(\"resources\", {}).get(\"github_pulls\", {}).get(\"new_key\")\n",
"```\n",
"Full example:"
@@ -850,10 +848,9 @@
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
"\n",
"import os\n",
"from google.colab import userdata\n",
"\n",
"os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
"dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
"\n",
"\n",
"@dlt.source\n",
@@ -915,17 +912,24 @@
"id": "WIhvQCY_lEaB"
},
"source": [
"What if you run your pipeline on, for example, Airflow where every task gets a clean filesystem and pipeline working directory is always deleted?\n",
"What if you run your pipeline on, for example, Airflow, where every task gets a clean filesystem and the pipeline working directory is always deleted?\n",
"\n",
"**dlt loads** your **state** into the destination **together** with all other **data** and when faced with a clean start, it will try to restore state from the destination.\n",
"**dlt loads** your **state** into the destination **together** with all other **data**, and when starting from a clean slate, it will try to restore the state from the destination.\n",
"\n",
"The remote state is identified by pipeline name, the destination location (as given by the credentials) and destination dataset. To re-use **the same state**, use **the same pipeline name** and destination.\n",
"The remote state is identified by the pipeline name, the destination location (as defined by the credentials), and the destination dataset. \n",
"To reuse **the same state**, use **the same pipeline name** and the same destination.\n",
"\n",
"The state is stored in the `_dlt_pipeline_state` table at the destination and contains information about the pipeline, pipeline run (that the state belongs to) and state blob.\n",
"The state is stored in the `_dlt_pipeline_state` table at the destination and contains information about the pipeline, the pipeline run (to which the state belongs), and the state blob.\n",
"\n",
"dlt has `dlt pipeline <pipeline name> sync` command where you can request the state back from that table.\n",
"`dlt` provides the command:\n",
"\n",
"💡 If you can keep the pipeline working directory across the runs, you can disable the state sync by setting `restore_from_destination=false` i.e. in your `config.toml`."
"```\n",
"dlt pipeline <pipeline name> sync\n",
"```\n",
"\n",
"which retrieves the state from that table.\n",
"\n",
"💡 If you can keep the pipeline working directory across runs, you can disable state sync by setting `restore_from_destination = false` in your `config.toml`."
]
},
{
@@ -937,11 +941,8 @@
"outputs": [],
"source": [
"import duckdb\n",
"from google.colab import data_table\n",
"from IPython.display import display\n",
"\n",
"data_table.enable_dataframe_formatter()\n",
"\n",
"# a database 'chess_pipeline.duckdb' was created in working directory so just connect to it\n",
"conn = duckdb.connect(f\"{pipeline.pipeline_name}.duckdb\")\n",
"conn.sql(f\"SET search_path = '{pipeline.dataset_name}'\")\n",
@@ -955,7 +956,7 @@
"id": "YIy5yLOAlJ9M"
},
"source": [
"Column \"state\" is compressed json dictionary."
"The \"state\" column is a compressed json dictionary."
]
},
{
@@ -998,14 +999,14 @@
"source": [
"**To fully reset the state:**\n",
"\n",
"Drop the destination dataset to fully reset the pipeline.\n",
"Set the `dev_mode` flag when creating pipeline.\n",
"Use the `dlt pipeline drop --drop-all` command to drop state and tables for a given schema name.\n",
"- Drop the destination dataset to fully reset the pipeline. \n",
"- Set the `dev_mode` flag when creating the pipeline. \n",
"- Use the `dlt pipeline drop --drop-all` command to drop state and tables for a given schema name.\n",
"\n",
"**To partially reset the state:**\n",
"\n",
"Use the `dlt pipeline drop <resource_name>` command to drop state and tables for a given resource.\n",
"Use the `dlt pipeline drop --state-paths` command to reset the state at given path without touching the tables and data."
"- Use the `dlt pipeline drop <resource_name>` command to drop state and tables for a given resource. \n",
"- Use the `dlt pipeline drop --state-paths` command to reset the state at a given path without touching the tables or data."
]
},
{
@@ -1014,9 +1015,9 @@
"id": "fUuRzapCl8pC"
},
"source": [
"**Example for partial reset:**\n",
"**Example for a partial reset:**\n",
"\n",
"> in an ipynb environment, when the duckdb connection we opened is not yet closed -> close the connection before attempting to edit the pipeline through the CLI"
"> In an ipynb environment, when the duckdb connection we opened is not yet closed -> close the connection before attempting to edit the pipeline through the CLI."
]
},
{
@@ -1058,7 +1059,7 @@
"id": "NYbccmLie1zm"
},
"source": [
"🎊🎊🎊 That is actually it! We hope you enjoyed this course and learned more about dlt! 🎊🎊🎊\n",
"🎊🎊🎊 That's it! We hope you enjoyed this course and learned more about `dlt`! 🎊🎊🎊\n",
"\n",
"Please share your feedback with us: [Feedback Google Form](https://forms.gle/1NYrGcRj5gLQ4WDt8) 🌼"
]

View File

@@ -0,0 +1,884 @@
# /// script
# dependencies = [
# "dlt[duckdb]",
# "numpy",
# "pandas",
# "sqlalchemy",
# ]
# ///
import marimo
__generated_with = "0.17.4"
app = marimo.App()
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# **Recap of [Lesson 7](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) 👩‍💻🚀**
1. Learned what a schema is.
2. Explored schema settings and components.
3. Learned how to retrieve a dlt pipeline schema.
4. Learned how to adjust the schema.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
# **Understanding Pipeline Metadata and State** 👻📄 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)
**Here, you will learn or brush up on:**
- What pipeline metadata is
- Exploring pipeline metadata from load info
- Exploring pipeline metadata from trace
- Exploring pipeline metadata from state
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Pipeline Metadata**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**Metadata** is essentially *data about data*.
**Pipeline metadata** is data about your data pipeline. This is useful when you want to know things like:
- When your pipeline first ran
- When your pipeline last ran
- Information about your source or destination
- Processing time
- Custom metadata you add yourself
- And much more!
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_8_Understanding_Pipeline_Metadata_and_State_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_8_Understanding_Pipeline_Metadata_and_State_img1.jpeg)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
`dlt` allows you to view all this metadata through various options!
This notebook will walk you through those options, namely:
- Load info
- Trace
- State
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Let's load some GitHub data into DuckDB to inspect the pipeline metadata in different ways."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""Define a `dlt` resource that fetches Pull Requests and wrap it in a `dlt` source:"""
)
return
@app.cell
def _(os):
from typing import Iterable
import dlt
from dlt.extract import DltResource
from dlt.common.typing import TDataItems
from dlt.sources.helpers import requests
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator
dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY")
@dlt.source
def _github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]:
client = RESTClient(
base_url="https://api.github.com",
auth=BearerTokenAuth(token=secret_key),
paginator=HeaderLinkPaginator(),
)
@dlt.resource
def github_pulls(
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
"updated_at", initial_value="2024-12-01"
)
) -> TDataItems:
params = {"since": cursor_date.last_value, "status": "open"}
for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params):
yield page
return github_pulls
# define a new dlt pipeline
pipeline = dlt.pipeline(
pipeline_name="github_pipeline",
destination="duckdb",
dataset_name="github_data",
)
# run the pipeline with the new resource
load_info = pipeline.run(_github_source())
print(load_info)
return (
BearerTokenAuth,
DltResource,
HeaderLinkPaginator,
Iterable,
RESTClient,
TDataItems,
dlt,
load_info,
pipeline,
)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Load info**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
`Load Info:` This is a collection of useful information about the recently loaded data. It includes details like the pipeline and dataset name, destination information, and a list of loaded packages with their statuses, file sizes, types, and error messages (if any).
`Load Package:` A load package is a collection of jobs with data for specific tables, generated during each execution of the pipeline. Each package is uniquely identified by a `load_id`.
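A few attributes you will typically reach for in code (a quick sketch; run it after the `pipeline.run(...)` call above):
```python
print(load_info.pipeline.pipeline_name)  # which pipeline produced this load
print(load_info.dataset_name)            # destination dataset
print(load_info.loads_ids)               # load_ids of the packages in this run
print(load_info.has_failed_jobs)         # True if any job failed
```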
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(0) CLI**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
From the [`Inspecting & Adjusting Schema`](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) section, we've already learned that we can see which schema changes a load package introduced with the command:
```
dlt pipeline -v <pipeline_name> load-package
```
The verbose flag only shows schema changes, so if we run it **without** the flag, we will still see the most recent load package info:
""")
return
@app.cell
def _():
import subprocess
subprocess.run(["dlt", "pipeline", "github_pipeline", "load-package"], check=True)
return (subprocess,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
The `load_id` of a particular package is added to the top data tables (parent tables) and to the special `_dlt_loads` table with a status of `0` when the load process is fully completed. The `_dlt_loads` table tracks completed loads and allows chaining transformations on top of them.
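You can also inspect `_dlt_loads` directly in the destination; a minimal sketch using the pipeline's SQL client:
```python
with pipeline.sql_client() as client:
    with client.execute_query(
        "SELECT load_id, status, inserted_at FROM _dlt_loads ORDER BY inserted_at DESC"
    ) as cursor:
        for row in cursor.fetchall():
            print(row)
```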
We can also view load package info for a specific `load_id` (replace the value with the one output above):
""")
return
@app.cell
def _(subprocess):
subprocess.run(
["dlt", "pipeline", "github_pipeline", "load-package", "1741348101.3398592"],
check=True,
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(0) Python**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
From the [`Inspecting & Adjusting Schema`](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) section, we've also learned that a schema can be accessed with:
```python
print(load_info.load_packages[0].schema)
```
Similarly, if we drop the schema part, we will get the load package info:
""")
return
@app.cell
def _(load_info):
print(load_info.load_packages[0])
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""which has the following public methods and attributes:""")
return
@app.cell
def _(load_info):
# This code snippet just prints out the public methods and attributes of the load package object in load_info
all_attributes_methods = dir(load_info.load_packages[0])
public_attributes_methods = [
attr for attr in all_attributes_methods if not attr.startswith("_")
]
print(f"{'Attribute/Method':<50} {'Type':<10}")
print("-" * 40)
for attr in public_attributes_methods:
attr_value = getattr(load_info.load_packages[0], attr)
if callable(attr_value):
print(f"{attr:<50} {'method':<10}")
else:
print(f"{attr:<50} {'attribute':<10}")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **Trace**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""`Trace`: A trace is a detailed record of the execution of a pipeline. It provides rich information on the pipeline processing steps: **extract**, **normalize**, and **load**. It also shows the last `load_info`."""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(0) CLI**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
You can access the pipeline trace using the command:
```
dlt pipeline <pipeline_name> trace
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Try running it on the github issues pipeline:""")
return
@app.cell
def _(subprocess):
subprocess.run(["dlt", "pipeline", "github_pipeline", "trace"], check=True)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(0) Python**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""We can also print out the trace in code:""")
return
@app.cell
def _(pipeline):
# print human friendly trace information
print(pipeline.last_trace)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Separately receive the extract stage info:""")
return
@app.cell
def _(pipeline):
# print human friendly trace information
print(pipeline.last_trace.last_extract_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""As well as the normalization stage info with:""")
return
@app.cell
def _(pipeline):
# print human friendly normalization information
print(pipeline.last_trace.last_normalize_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""How many rows of data were normalized:""")
return
@app.cell
def _(pipeline):
# access row counts dictionary of normalize info
print(pipeline.last_trace.last_normalize_info.row_counts)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""And finally the load stage info:""")
return
@app.cell
def _(pipeline):
# print human friendly load information
print(pipeline.last_trace.last_load_info)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
## **State**
[`The pipeline state`](https://dlthub.com/docs/general-usage/state) is a Python dictionary that lives alongside your data. You can store values in it during a pipeline run, and then retrieve them in the next pipeline run. It's used for tasks like preserving the "last value" or similar loading checkpoints, and it gets committed atomically with the data. The state is stored locally in the pipeline working directory and is also stored at the destination for future runs.
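To peek at the local state of an existing pipeline from code, one option is to attach to it (a sketch, assuming the `github_pipeline` created above exists in your pipelines directory):
```python
import dlt

attached = dlt.attach("github_pipeline")
state = attached.state                 # a plain Python dict
print(state.keys())
print(state.get("sources", {}))        # source- and resource-scoped state lives here
```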
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**When to use pipeline state**
- `dlt` uses the state internally to implement last value incremental loading. This use case should cover around 90% of your needs to use the pipeline state.
- Store a list of already requested entities if the list is not much bigger than 100k elements.
- Store large dictionaries of last values if you are not able to implement it with the standard incremental construct.
- Store the custom fields dictionaries, dynamic configurations and other source-scoped state.
**When not to use pipeline state**
Do not use `dlt` state when it may grow to millions of elements.
For example, storing modification timestamps for millions of user records is a bad idea.
In that case, you could:
- Store the state in DynamoDB, Redis, etc., keeping in mind that if the extract stage fails, you may end up with invalid state.
- Use your loaded data as the state. `dlt` exposes the current pipeline via `dlt.current.pipeline()`, from which you can obtain a `sql_client` and load the data you need. If you choose this approach, try to process your user records in batches.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(0) CLI**
""")
return
@app.cell
def _(subprocess):
subprocess.run(["dlt", "pipeline", "-v", "github_pipeline", "info"], check=True)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **(1) Python**
""")
return
@app.cell
def _():
import json
def read_state(filepath: str) -> str:
with open(filepath, "r", encoding="utf-8") as file:
data = json.load(file)
pretty_json = json.dumps(data, indent=4)
return pretty_json
return (read_state,)
@app.cell
def _(read_state):
# stored in your default pipelines folder
print(read_state("/var/dlt/pipelines/github_pipeline/state.json"))
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **Modify State**
The pipeline state is a Python dictionary that lives alongside your data; you can store values in it and, on the next pipeline run, request them back.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
#### **(0) Resource state**
You can **read** and **write** the state in your resources using:
```python
dlt.current.resource_state().get(key)
```
and
```python
dlt.current.resource_state().setdefault(key, value)
```
""")
return
@app.cell
def _(
BearerTokenAuth,
DltResource,
HeaderLinkPaginator,
Iterable,
RESTClient,
TDataItems,
dlt,
os,
):
dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY")
@dlt.source
def _github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]:
client = RESTClient(
base_url="https://api.github.com",
auth=BearerTokenAuth(token=secret_key),
paginator=HeaderLinkPaginator(),
)
@dlt.resource
def github_pulls(
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
"updated_at", initial_value="2024-12-01"
)
) -> TDataItems:
dlt.current.resource_state().setdefault(
"new_key", ["first_value", "second_value"]
)
params = {"since": cursor_date.last_value, "status": "open"}
for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params):
yield page
return github_pulls
pipeline_1 = dlt.pipeline(
pipeline_name="github_pipeline",
destination="duckdb",
dataset_name="github_data",
)
load_info_1 = pipeline_1.run(_github_source())
print(load_info_1)
return
@app.cell
def _(read_state):
print(read_state("/var/dlt/pipelines/github_pipeline/state.json"))
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""In the state, you will see the new items:""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""![Lesson_8_Understanding_Pipeline_Metadata_and_State_img2](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_8_Understanding_Pipeline_Metadata_and_State_img2.png)"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
You can modify any item in the state dict:
```python
new_keys = dlt.current.resource_state().setdefault("new_key", ["first_value", "second_value"])
if "something_happend":
new_keys.append("third_value")
incremental_dict = dlt.current.resource_state().get("incremental")
incremental_dict.update({"second_new_key": "fourth_value"})
```
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Full example:""")
return
@app.cell
def _(
BearerTokenAuth,
DltResource,
HeaderLinkPaginator,
Iterable,
RESTClient,
TDataItems,
dlt,
os,
):
dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY")
@dlt.source
def _github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]:
client = RESTClient(
base_url="https://api.github.com",
auth=BearerTokenAuth(token=secret_key),
paginator=HeaderLinkPaginator(),
)
@dlt.resource
def github_pulls(
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
"updated_at", initial_value="2024-12-01"
)
) -> TDataItems:
new_keys = dlt.current.resource_state().setdefault(
"new_key", ["first_value", "second_value"]
)
if "something_happened":
new_keys.append("third_value")
incremental_dict = dlt.current.resource_state().get("incremental")
incremental_dict.update({"second_new_key": "fourth_value"})
params = {"since": cursor_date.last_value, "status": "open"}
for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params):
yield page
return github_pulls
pipeline_2 = dlt.pipeline(
pipeline_name="github_pipeline",
destination="duckdb",
dataset_name="github_data",
)
load_info_2 = pipeline_2.run(_github_source())
print(load_info_2)
return
@app.cell
def _(read_state):
print(read_state("/var/dlt/pipelines/github_pipeline/state.json"))
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
#### **(1) Source state**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
You can also access the source-scoped state with `dlt.current.source_state()` which can be shared across resources of a particular source and is also available **read-only** in the source-decorated functions. The most common use case for the source-scoped state is to store the mapping of custom fields to their displayable names.
Let's read some custom keys from the state with:
```python
source_new_keys = dlt.current.source_state().get("resources", {}).get("github_pulls", {}).get("new_key")
```
Full example:
""")
return
@app.cell
def _(
BearerTokenAuth,
DltResource,
HeaderLinkPaginator,
Iterable,
RESTClient,
TDataItems,
dlt,
os,
):
dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY")
@dlt.source
def _github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]:
client = RESTClient(
base_url="https://api.github.com",
auth=BearerTokenAuth(token=secret_key),
paginator=HeaderLinkPaginator(),
)
@dlt.resource
def github_pulls(
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
"updated_at", initial_value="2024-12-01"
)
) -> TDataItems:
params = {"since": cursor_date.last_value, "status": "open"}
for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params):
yield page
source_new_keys = (
dlt.current.source_state()
.get("resources", {})
.get("github_pulls", {})
.get("new_key")
)
print("My custom values: ", source_new_keys)
return github_pulls
pipeline_3 = dlt.pipeline(
pipeline_name="github_pipeline",
destination="duckdb",
dataset_name="github_data",
)
load_info_3 = pipeline_3.run(_github_source())
print(load_info_3)
return (pipeline_3,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **Sync State**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
What if you run your pipeline on, for example, Airflow, where every task gets a clean filesystem and the pipeline working directory is always deleted?
**dlt loads** your **state** into the destination **together** with all other **data**, and when starting from a clean slate, it will try to restore the state from the destination.
The remote state is identified by the pipeline name, the destination location (as defined by the credentials), and the destination dataset.
To reuse **the same state**, use **the same pipeline name** and the same destination.
The state is stored in the `_dlt_pipeline_state` table at the destination and contains information about the pipeline, the pipeline run (to which the state belongs), and the state blob.
`dlt` provides the command:
```
dlt pipeline <pipeline name> sync
```
which retrieves the state from that table.
💡 If you can keep the pipeline working directory across runs, you can disable state sync by setting `restore_from_destination = false` in your `config.toml`.
""")
return
@app.cell
def _(pipeline_3):
import duckdb
from IPython.display import display
# a database 'github_pipeline.duckdb' was created in the working directory, so just connect to it
conn = duckdb.connect(f"{pipeline_3.pipeline_name}.duckdb")
conn.sql(f"SET search_path = '{pipeline_3.dataset_name}'")
stats_table = conn.sql("SELECT * FROM _dlt_pipeline_state").df()
display(stats_table)
return (conn,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""The "state" column is a compressed json dictionary.""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
|index|version|engine\_version|pipeline\_name|state|created\_at|version\_hash|\_dlt\_load\_id|\_dlt\_id|
|---|---|---|---|---|---|---|---|---|
|0|1|4|github\_pipeline|eNplkN....6+/m/QA7mbNc|2025-03-10 14:02:34\.340458+00:00|pnp+9AIA5jAGx5LKon6zWmPnfYVb10ROa5aIKjv9O0I=|1741615353\.5473728|FOzn5XuSZ/y/BQ|
""")
return
@app.cell
def _(subprocess):
subprocess.run(
["dlt", "--non-interactive", "pipeline", "github_pipeline", "sync"], check=True
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
---
### **Reset State**
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**To fully reset the state:**
- Drop the destination dataset to fully reset the pipeline.
- Set the `dev_mode` flag when creating the pipeline.
- Use the `dlt pipeline drop --drop-all` command to drop state and tables for a given schema name.
**To partially reset the state:**
- Use the `dlt pipeline drop <resource_name>` command to drop state and tables for a given resource.
- Use the `dlt pipeline drop --state-paths` command to reset the state at a given path without touching the tables or data.
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
**Example for a partial reset:**
> In a notebook environment, if the duckdb connection we opened is still open, close it before attempting to edit the pipeline through the CLI.
""")
return
@app.cell
def _(conn):
conn.close()
return
@app.cell
def _(subprocess):
subprocess.run(
["dlt", "pipeline", "github_pipeline", "drop", "github_pulls"],
input="y\n",
text=True,
check=True,
)
return
@app.cell
def _(subprocess):
subprocess.run(["dlt", "pipeline", "-v", "github_pipeline", "info"], check=True)
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
🎊🎊🎊 That's it! We hope you enjoyed this course and learned more about `dlt`! 🎊🎊🎊
Please share your feedback with us: [Feedback Google Form](https://forms.gle/1NYrGcRj5gLQ4WDt8) 🌼
""")
return
@app.cell
def _():
import marimo as mo
return (mo,)
if __name__ == "__main__":
app.run()

View File

@@ -43,6 +43,7 @@ dependencies = [
"regex>=2025.10.23",
"pytest-forked>=1.6.0",
"databind>=4.5.2",
"marimo>=0.17.4",
]

2
docs/uv.lock generated
View File

@@ -1121,6 +1121,7 @@ dependencies = [
{ name = "google-api-python-client" },
{ name = "google-auth-oauthlib" },
{ name = "lancedb" },
{ name = "marimo" },
{ name = "modal" },
{ name = "mypy" },
{ name = "nbqa" },
@@ -1160,6 +1161,7 @@ requires-dist = [
{ name = "google-api-python-client", specifier = ">=1.7.11" },
{ name = "google-auth-oauthlib", specifier = ">=1.0.0,<2" },
{ name = "lancedb", marker = "python_full_version < '3.13'", specifier = ">=0.8.2" },
{ name = "marimo", specifier = ">=0.17.4" },
{ name = "modal", specifier = ">=0.64.170" },
{ name = "modal", specifier = ">=1.2.1" },
{ name = "mypy", specifier = ">=1.11.0,<1.13.0" },

View File

@@ -10,34 +10,34 @@ In this course, you'll go far beyond the basics. Youll build production-grade
## Lessons
### **Lesson 1: Custom Sources REST APIs & RESTClient** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb)
### **Lesson 1: Custom Sources REST APIs & RESTClient** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb)
Learn how to build flexible REST API connectors from scratch using `@dlt.resource` and the powerful `RESTClient`.
### **Lesson 2: Custom Sources SQL Databases** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)
### **Lesson 2: Custom Sources SQL Databases** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)
Connect to any SQL-compatible database, reflect table schemas, write query adapters, and selectively ingest data using `sql_database`.
### **Lesson 3: Custom Sources Filesystems & Cloud Storage** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)
### **Lesson 3: Custom Sources Filesystems & Cloud Storage** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)
Build sources that read from local or remote files (S3, GCS, Azure).
### **Lesson 4: Custom Destinations Reverse ETL** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)
### **Lesson 4: Custom Destinations Reverse ETL** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)
Use `@dlt.destination` to send data back to APIs like Notion, Slack, or Airtable. Learn batching, retries, and idempotent patterns.
### **Lesson 5: Transforming Data Before & After Load** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)
### **Lesson 5: Transforming Data Before & After Load** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)
Learn when and how to apply `add_map`, `add_filter`, `@dlt.transformer`, or even post-load transformations via SQL or Ibis. Control exactly how your data looks.
### **Lesson 6: Write Disposition Strategies & Advanced Tricks** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)
### **Lesson 6: Write Disposition Strategies & Advanced Tricks** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)
Understand how to use `replace` and `merge`, and combine them with schema hints and incremental loading.
### **Lesson 7: Data Contracts** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)
### **Lesson 7: Data Contracts** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)
Define expectations on schema, enforce data types and behaviors, and lock down your schema evolution. Ensure reliable downstream use of your data.
### **Lesson 8: Logging & Tracing** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)
### **Lesson 8: Logging & Tracing** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)
Track every step of your pipeline: from extraction to load. Use logs, traces, and metadata to debug and analyze performance.
### **Lesson 9: Performance Optimization** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)
### **Lesson 9: Performance Optimization** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)
Handle large datasets, tune buffer sizes, parallelize resource extraction, optimize memory usage, and reduce pipeline runtime.
## Homework & Certification

View File

@@ -10,42 +10,41 @@ In this course you will learn the fundamentals of `dlt` alongside some of the mo
## Lessons
### Lesson 1: Quick Start [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb)
### Lesson 1: Quick Start [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb)
Discover what dlt is, run your first pipeline with toy data, and explore it like a pro using DuckDB, `sql_client`, and dlt datasets!
### Lesson 2: dlt Resources and Sources [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)
### Lesson 2: dlt Resources and Sources [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)
Learn to run pipelines with diverse data sources (dataframes, databases, and REST APIs),
master `dlt.resource`, `dlt.source`, and `dlt.transformer`, and create your first REST API pipeline!
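A compact sketch of how these three building blocks fit together, using toy data and a local `duckdb` destination:

```python
import dlt

@dlt.resource(write_disposition="replace")
def users():
    yield from [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]

@dlt.transformer(data_from=users)
def user_details(user):
    # enrich each user record coming from the parent resource
    yield {**user, "active": True}

@dlt.source
def demo_source():
    return users, user_details

pipeline = dlt.pipeline(pipeline_name="demo", destination="duckdb", dataset_name="demo_data")
print(pipeline.run(demo_source()))
```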
### Lesson 3: Pagination & Authentication & dlt Configuration [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)
### Lesson 3: Pagination & Authentication & dlt Configuration [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)
Since it is never a good idea to put your API keys directly into your code, different environments provide different methods to set and access these secrets, and `dlt` is no different.
Master pagination and authentication for REST APIs, explore dlt's RESTClient and manage secrets and configs.
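A small sketch of secret injection with the RESTClient, assuming the token lives in `secrets.toml` or an environment variable rather than in code; the environment variable name shown is an assumption based on dlt's naming convention:

```python
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth

@dlt.resource
def github_issues(api_token: str = dlt.secrets.value):
    # api_token is injected by dlt from secrets.toml or the environment,
    # e.g. SOURCES__GITHUB_ISSUES__API_TOKEN=...  (never hard-code it)
    client = RESTClient(
        base_url="https://api.github.com",
        auth=BearerTokenAuth(token=api_token),
    )
    yield from client.paginate("/repos/dlt-hub/dlt/issues")
```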
### Lesson 4: Using dlt's pre-built Sources and Destinations [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)
### Lesson 4: Using dlt's pre-built Sources and Destinations [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)
Now that you have taken a data source and loaded it into a `duckdb` destination, it is time to look at what other possibilities `dlt` offers.
In this notebook we will take a look at pre-built verified sources and destinations and how to use them.
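For example, the declarative `rest_api` source can replace hand-written pagination code; a sketch against the public PokeAPI:

```python
import dlt
from dlt.sources.rest_api import rest_api_source

# declaratively configure a REST API source instead of writing pagination code
pokemon = rest_api_source({
    "client": {"base_url": "https://pokeapi.co/api/v2/"},
    "resources": ["pokemon", "berry"],
})

pipeline = dlt.pipeline(
    pipeline_name="rest_api_pokemon", destination="duckdb", dataset_name="pokemon_data"
)
print(pipeline.run(pokemon))
```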
### Lesson 5: Write disposition and incremental loading [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)
### Lesson 5: Write disposition and incremental loading [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)
Learn to control data behavior with dlt write dispositions (Append, Replace, Merge), master incremental loading, and efficiently update and deduplicate your datasets.
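A minimal merge-plus-incremental sketch; `fetch_events` is a placeholder for your own API or database call:

```python
import dlt

def fetch_events(since: str):
    # placeholder for a real API or database query filtered by `since`
    return [{"id": 1, "updated_at": "2024-06-01T00:00:00Z", "status": "open"}]

@dlt.resource(write_disposition="merge", primary_key="id")
def events(
    updated_at=dlt.sources.incremental("updated_at", initial_value="2024-01-01T00:00:00Z")
):
    # dlt tracks updated_at.last_value between runs; merge deduplicates on the primary key
    yield from fetch_events(since=updated_at.last_value)

pipeline = dlt.pipeline(
    pipeline_name="incremental_demo", destination="duckdb", dataset_name="events_data"
)
print(pipeline.run(events()))
```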
### Lesson 6: How dlt works [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)
### Lesson 6: How dlt works [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)
Discover the magic behind `dlt`! Learn its three main steps — Extract, Normalize, Load — along with default behaviors and supported file formats.
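The same three steps that `pipeline.run()` performs can be called explicitly, which is a handy way to see what each one produces; a sketch with toy data:

```python
import dlt

pipeline = dlt.pipeline(pipeline_name="steps_demo", destination="duckdb", dataset_name="steps")

# the three steps pipeline.run() performs for you, called one by one
pipeline.extract([{"id": 1, "value": 10}], table_name="items")  # write raw extracted files locally
pipeline.normalize()                                            # infer schema, produce load files
print(pipeline.load())                                          # push load packages to the destination
```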
### Lesson 7: Inspecting & Adjusting Schema [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)
### Lesson 7: Inspecting & Adjusting Schema [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)
dlt creates and manages the schema automatically, but what if you want to control it yourself? Explore the schema and customize it to your needs easily with dlt!
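A small sketch of column hints, both declared on the resource and applied afterwards:

```python
import dlt

@dlt.resource(
    primary_key="id",
    columns={"amount": {"data_type": "decimal", "nullable": False}},
)
def payments():
    yield {"id": 1, "amount": "10.50"}

# hints can also be attached after the resource is defined
payments.apply_hints(columns={"id": {"description": "payment identifier"}})

pipeline = dlt.pipeline(
    pipeline_name="schema_demo", destination="duckdb", dataset_name="payments_data"
)
pipeline.run(payments())
print(pipeline.default_schema.to_pretty_yaml())
```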
### Lesson 8: Understanding Pipeline State & Metadata [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)
### Lesson 8: Understanding Pipeline State & Metadata [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)
Having learned how pipelines move data from one place to another, we now turn to information about the pipeline itself: the metadata that can be accessed and edited through dlt.
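A minimal sketch of the metadata a single run exposes:

```python
import dlt

pipeline = dlt.pipeline(pipeline_name="meta_demo", destination="duckdb", dataset_name="meta")
load_info = pipeline.run([{"id": 1}], table_name="items")

# metadata produced by the run
print(load_info)                     # load packages, destination, dataset name
print(pipeline.last_trace)           # timings and outcomes for extract / normalize / load
print(pipeline.default_schema_name)  # the schema dlt manages for this pipeline
```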

View File

@@ -248,6 +248,8 @@ dev = [
"pydoclint>=0.6.5,<0.7",
"types-paramiko>=3.5.0.20250708",
"graphviz>=0.21",
# limits sqlglot - remove when #3489 is fixed
"sqlglot<28.1",
]
# NOTE: those dependencies are used to test built in sources

View File

@@ -462,3 +462,45 @@ def prepare_service_json() -> Tuple[str, str]:
services_str = base64.b64decode(f.read().strip(), validate=True).decode()
dest_path = storage.save("level-dragon-333019-707809ee408a.json", services_str)
return services_str, dest_path
def test_bigquery_configuration_accepts_oauth_credentials() -> None:
# Create OAuth credentials
oauth_creds = GcpOAuthCredentials()
oauth_creds.project_id = "test-project"
oauth_creds.token = "test-token"
oauth_creds.client_id = ""
oauth_creds.refresh_token = ""
oauth_creds.resolve()
# Test that configuration accepts OAuth credentials
config = BigQueryClientConfiguration(credentials=oauth_creds)._bind_dataset_name(
dataset_name="test_dataset"
)
assert config.credentials == oauth_creds
assert config.credentials.project_id == "test-project"
def test_bigquery_configuration_accepts_base_gcp_credentials() -> None:
from google.oauth2.credentials import Credentials as GoogleOAuth2Credentials
# Create a wrapper that uses base GcpCredentials type
# This mimics what happens with Workload Identity Federation
native_credentials = GoogleOAuth2Credentials(token="test-token")
native_credentials.expiry = None # Non-refreshable
# Wrap in GcpServiceAccountCredentials (which extends GcpCredentials)
wrapper_creds = GcpServiceAccountCredentials()
wrapper_creds.project_id = "test-project"
wrapper_creds._set_default_credentials(native_credentials)
wrapper_creds.__is_resolved__ = True
# Test that configuration accepts wrapped credentials
config = BigQueryClientConfiguration(credentials=wrapper_creds)._bind_dataset_name(
dataset_name="test_dataset"
)
assert config.credentials == wrapper_creds
assert config.credentials.project_id == "test-project"
assert config.credentials.has_default_credentials()
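For context, the widened credentials union above is what makes a setup like the following possible; this is a sketch with placeholder values, not an excerpt from the docs:

```python
import dlt
from dlt.destinations import bigquery
from dlt.common.configuration.specs import GcpOAuthCredentials

# placeholders only: in practice supply the OAuth client values via secrets.toml or env vars
creds = GcpOAuthCredentials()
creds.project_id = "my-project"
creds.client_id = "my-client-id"
creds.client_secret = "my-client-secret"
creds.refresh_token = "my-refresh-token"

pipeline = dlt.pipeline(
    pipeline_name="bq_oauth_demo",
    destination=bigquery(credentials=creds),
    dataset_name="oauth_dataset",
)
```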

View File

@@ -11,7 +11,11 @@ from dlt.common.utils import uniq_id
from dlt.common.schema import Schema
from dlt.common.schema.utils import new_table
from dlt.destinations import snowflake
from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient, SUPPORTED_HINTS
from dlt.destinations.impl.snowflake.snowflake import (
SnowflakeClient,
SUPPORTED_HINTS,
COLUMN_COMMENT_HINT,
)
from dlt.destinations.impl.snowflake.configuration import (
SnowflakeClientConfiguration,
SnowflakeCredentials,
@@ -259,3 +263,54 @@ def test_create_table_with_partition_and_cluster(snowflake_client: SnowflakeClie
# clustering must be the last
assert sql.endswith('CLUSTER BY ("COL2","COL5")')
def test_create_table_with_column_comments(snowflake_client: SnowflakeClient) -> None:
"""Test that column comments are added to CREATE TABLE SQL."""
mod_update = deepcopy(TABLE_UPDATE[:3])
# Add description (generic field) to first column
mod_update[0]["description"] = "This is the first column"
# Add snowflake-specific column comment hint to second column
mod_update[1][COLUMN_COMMENT_HINT] = "Snowflake specific comment" # type: ignore[typeddict-unknown-key]
statements = snowflake_client._get_table_update_sql("event_test_table", mod_update, False)
assert len(statements) == 1
sql = statements[0]
# Verify column comments are in the SQL
assert "COMMENT 'This is the first column'" in sql
assert "COMMENT 'Snowflake specific comment'" in sql
# Third column should not have a comment
assert '"COL3" BOOLEAN NOT NULL' in sql
assert sql.count("COMMENT") == 2
def test_column_comment_escaping(snowflake_client: SnowflakeClient) -> None:
"""Test that special characters in column comments are properly escaped."""
mod_update = deepcopy(TABLE_UPDATE[:1])
# Add comment with special characters that need escaping
mod_update[0]["description"] = "User's \"data\" with 'quotes'"
statements = snowflake_client._get_table_update_sql("event_test_table", mod_update, False)
sql = statements[0]
# Snowflake escapes single quotes by doubling them
assert "COMMENT 'User''s \"data\" with ''quotes'''" in sql
def test_alter_table_with_column_comments(snowflake_client: SnowflakeClient) -> None:
"""Test that column comments work with ALTER TABLE."""
new_columns = deepcopy(TABLE_UPDATE[1:3])
new_columns[0]["description"] = "Added column with comment"
statements = snowflake_client._get_table_update_sql("event_test_table", new_columns, True)
# First statement should be ADD COLUMN
add_column_sql = statements[0]
assert add_column_sql.startswith("ALTER TABLE")
assert "ADD COLUMN" in add_column_sql
assert "COMMENT 'Added column with comment'" in add_column_sql

View File

@@ -22,6 +22,7 @@ def _make_pipeline(destination_name: str):
)
@pytest.mark.skip("Reenable after #3343 is resolved")
@pytest.mark.parametrize(
"destination_config",
destinations_configs(default_sql_configs=True, local_filesystem_configs=True),
@@ -58,6 +59,7 @@ def test_rest_api_source(destination_config: DestinationTestConfiguration) -> No
assert table_counts.items() >= POKEMON_EXPECTED_TABLE_COUNTS.items()
@pytest.mark.skip("Reenable after #3343 is resolved")
@pytest.mark.parametrize(
"destination_config",
destinations_configs(default_sql_configs=True, local_filesystem_configs=True),

View File

@@ -22,6 +22,7 @@ def _make_pipeline(destination_name: str):
)
@pytest.mark.skip("Reenable after #3343 is resolved")
def test_rest_api_config_provider(toml_providers: ConfigProvidersContainer) -> None:
# mock dicts in toml provider
dlt.config["client"] = {
@@ -45,6 +46,7 @@ def test_rest_api_config_provider(toml_providers: ConfigProvidersContainer) -> N
print(load_info)
@pytest.mark.skip("Reenable after #3343 is resolved")
@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS)
@pytest.mark.parametrize("invocation_type", ("deco", "factory"))
def test_rest_api_source(destination_name: str, invocation_type: str) -> None:
@@ -82,6 +84,7 @@ def test_rest_api_source(destination_name: str, invocation_type: str) -> None:
assert table_counts.items() >= POKEMON_EXPECTED_TABLE_COUNTS.items()
@pytest.mark.skip("Reenable after #3343 is resolved")
@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS)
@pytest.mark.parametrize("invocation_type", ("deco", "factory"))
def test_dependent_resource(destination_name: str, invocation_type: str) -> None:

uv.lock generated
View File

@@ -2282,6 +2282,7 @@ dev = [
{ name = "requests-mock" },
{ name = "ruff" },
{ name = "sqlfluff" },
{ name = "sqlglot" },
{ name = "types-cachetools" },
{ name = "types-click" },
{ name = "types-deprecated" },
@@ -2487,6 +2488,7 @@ dev = [
{ name = "requests-mock", specifier = ">=1.10.0,<2" },
{ name = "ruff", specifier = ">=0.3.2,<0.4" },
{ name = "sqlfluff", specifier = ">=2.3.2,<3" },
{ name = "sqlglot", specifier = "<28.1" },
{ name = "types-cachetools", specifier = ">=4.2.9" },
{ name = "types-click", specifier = ">=7.1.8,<8" },
{ name = "types-deprecated", specifier = ">=1.2.9.2,<2" },