Mirror of https://github.com/dlt-hub/dlt.git, synced 2025-12-17 19:31:30 +00:00
Docs: Converting Jupyter notebooks in education to marimo notebooks (#3068)
* Initial commit
* lesson_1_quick_start adjusted for marimo
* lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline marimo
* Fundamentals course 3 improved
* Marimo badges added
* Fundamentals: course 8
* Marimo badge link fix
* Fundamentals: course 7
* Fundamentals: course 6
* Fundamentals: course 5
* Fundamentals: course 4
* Fundamentals: course 3
* Fundamentals: course 2
* Fundamentals: course 1
* marimo links corrected
* Inline deps
* Fundamentals: fix lesson 2
* Fundamentals: fix lesson 3
* Fundamentals: fix lesson 4
* Formatting moved to build-molabs
* Fundamentals: fix lesson 5
* Removal of scrolls
* Fundamentals: fix lesson 6
* Fundamentals: fix lesson 7
* Fundamentals: fix lesson 8
* os.environ replaced with dlt.secrets where relevant
* Advanced: fix lesson 5
* Advanced: fix lesson 9
* os.environ fixes
* Advanced: fix lesson 1
* Comments cleanup
* Additional comment removal, fix lesson 6 advanced
* Clean main makefile
* Get rid of constants.py
* Nicer json.loads()
* Better functions in preprocess_to_molab
* Tests for doc tooling funcs
* Validate molab command
* Marimo check added
* docs pages adjustment
* limits sqlglot in dev group until fixed

---------

Co-authored-by: Marcin Rudolf <rudolfix@rudolfix.org>
6  .github/workflows/test_docs.yml  vendored
@@ -106,3 +106,9 @@ jobs:

      - name: run docs preprocessor
        run: cd docs && make preprocess-docs

      - name: test preprocess_to_molab
        run: cd docs && make test-preprocess-molabs

      - name: Ensure marimo notebooks are up-to-date
        run: cd docs && make validate-molabs

@@ -40,7 +40,8 @@ except ModuleNotFoundError:
    raise MissingDependencyException(
        "dlt pyarrow helpers",
        [f"{version.DLT_PKG_NAME}[parquet]"],
        "Install pyarrow to be allow to load arrow tables, panda frames and to use parquet files.",
        "Install pyarrow to be allowed to load arrow tables, panda frames and to use parquet"
        " files.",
    )

import ctypes

@@ -27,10 +27,9 @@ test-examples: ## Tests the examples in the examples folder
test-snippets: ## Tests the snippets in the snippets folder
	cd website/docs && uv run pytest --ignore=node_modules

format: ## Formats the docs tooling, notebooks, and examples
format: ## Formats the docs tooling, website, examples, and notebooks
	uv run black docs_tools website examples
	uv run black education --ipynb

	uv run black education/*/*.ipynb --ipynb

generate-api-ref: ## Generates the API reference documentation from dlt codebase for website
	cd docs_tools/api_docs && uv run pydoc-markdown
@@ -43,3 +42,14 @@ preprocess-docs: ## Preprocesses the docs pages, copies docs to docs_processed
preprocess-docs-watch: ## Preprocesses the docs pages, copies docs to docs_processed folder and inserts snippets and tuba links and watches for changes
	uv run preprocess-docs --watch

test-preprocess-molabs: ## Tests functions used to build Molabs
	uv run pytest docs_tools/education/tests

build-molabs: ## Format the notebooks files first and build Molabs
	uv run black education/*/*.ipynb --ipynb
	uv run python docs_tools/education/preprocess_to_molab.py
	uv run black education/*/*.py
	uv run marimo check education/*/*.py --fix --quiet

validate-molabs: build-molabs ## Validate marimo notebooks are up-to-date
	git diff --quiet --exit-code -- education/

0  docs/docs_tools/education/__init__.py  Normal file
290  docs/docs_tools/education/preprocess_to_molab.py  Normal file
@@ -0,0 +1,290 @@
import json
import re
import shlex
import subprocess
from pathlib import Path
from typing import Dict, Any

EDUCATION_NOTEBOOKS_DIR = Path(__file__).parent.parent.parent / "education"
TEMP_IPYNB_FILE_PREIFX = "tmp"

MUST_INSTALL_PACKAGES = {"numpy", "pandas", "sqlalchemy"}


def replace_colab_imports_in_notebook(notebook_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Remove Google Colab-specific imports and replace Colab API calls with standard Python.

    Google Colab provides special APIs like `google.colab.userdata` for accessing secrets
    that don't exist outside the Colab environment. This function:
    - Removes: `from google.colab import userdata` (and similar imports)
    - Replaces: `userdata.get(...)` → `os.getenv(...)`

    Args:
        notebook_dict: Notebook as a Python dictionary

    Returns:
        Modified notebook dictionary
    """
    for cell in notebook_dict.get("cells", []):
        if cell.get("cell_type") == "code":
            source = cell.get("source", [])
            if isinstance(source, list):
                # Remove lines with Google Colab imports
                source = [
                    line
                    for line in source
                    if not re.match(r"^\s*from google\.colab import", line)
                ]
                # Replace userdata.get with os.getenv
                source = [
                    line.replace("userdata.get(", "os.getenv(") for line in source
                ]
                cell["source"] = source

    return notebook_dict


def process_shell_commands_in_notebook(
    notebook_dict: Dict[str, Any]
) -> tuple[Dict[str, Any], set[str]]:
    """
    Convert Jupyter shell commands to Python subprocess calls and extract dependencies.

    Jupyter/Colab notebooks support shell commands with `!` syntax (e.g., `!pip install dlt`),
    but this is IPython-specific magic syntax that doesn't work in standard Python or Marimo.
    This function:
    - Extracts package names from `!pip install` commands for dependency tracking
    - Converts other `!command` shell commands to `subprocess.run()` calls
    - Removes notebook-specific magic commands (e.g., `%%capture`)

    Args:
        notebook_dict: Notebook as a Python dictionary

    Returns:
        Tuple of (modified notebook dict, set of package names extracted from pip install commands)
    """
    packages: set[str] = set()
    subprocess_imported: bool = False

    for cell in notebook_dict.get("cells", []):
        if cell.get("cell_type") == "code":
            cell_code = cell.get("source", [])
            new_cell_code = []

            for line in cell_code:
                stripped = line.strip()

                # skip magic commands
                if stripped.startswith("%%capture"):
                    continue

                # extract packages from pip install
                if stripped.startswith("!pip install"):
                    match = re.search(r"!pip install\s+(.+?)(?:\n|$)", stripped)
                    if match:
                        cleaned = (
                            match.group(1).strip().replace('"', "").replace("'", "")
                        )
                        # Remove spaces around commas in brackets
                        cleaned = re.sub(r"\[\s*", "[", cleaned)  # Remove space after [
                        cleaned = re.sub(
                            r"\s*\]", "]", cleaned
                        )  # Remove space before ]
                        cleaned = re.sub(
                            r",\s+", ",", cleaned
                        )  # Remove space after commas

                        pkgs = [
                            p.strip()
                            for p in cleaned.split()
                            if p.strip() and not p.startswith("-")
                        ]  # Filter flags
                        packages.update(pkgs)
                    continue

                # convert other shell commands
                elif stripped.startswith("!"):
                    if not subprocess_imported:
                        new_cell_code.append("import subprocess\n")
                        subprocess_imported = True
                    cmd = stripped[1:]
                    new_line = _build_subprocess_line(cmd) + "\n"
                    new_cell_code.append(new_line)

                else:
                    new_cell_code.append(line)

            cell["source"] = new_cell_code

    return notebook_dict, packages


def add_inline_dependencies_to_content(packages: set[str], py_content: str) -> str:
    """
    Add PEP 723 inline script metadata block with dependencies.

    Marimo/Molab can automatically install packages when they're declared using PEP 723
    inline script metadata. The dependency list includes:
    - Packages extracted from !pip install commands in the original notebook
    - MUST_INSTALL_PACKAGES (core dependencies required for all notebooks)

    Args:
        packages: Set of package names to include (will be merged with MUST_INSTALL_PACKAGES)
        py_content: The Python file content as a string

    Returns:
        Python content with PEP 723 metadata block prepended

    NOTE: Without this, users would need to manually install packages before running the
    notebook (Marimo will try to install missing imports, which is not exactly nice for a
    smooth experience. Also, some libraries used under the hood are not directly imported
    and are not caught by Marimo).

    Format:
        # /// script
        # dependencies = [
        # "package1",
        # "package2",
        # ]
        # ///
    """
    packages = packages.copy()  # Don't mutate the input set
    packages.update(MUST_INSTALL_PACKAGES)
    if not packages:
        return py_content

    pkg_lines = "\n".join(f'# "{pkg}",' for pkg in sorted(packages))
    deps_block = f"""# /// script
# dependencies = [
{pkg_lines}
# ]
# ///

"""

    return deps_block + py_content


def read_notebook(ipynb_path: Path) -> Dict[str, Any]:
    """
    Read a Jupyter notebook file and return as a dictionary.

    Args:
        ipynb_path: Path to the .ipynb file

    Returns:
        Notebook data as a Python dictionary
    """
    data: Dict[str, Any] = json.loads(ipynb_path.read_text(encoding="utf-8"))
    return data


def write_notebook(notebook_dict: Dict[str, Any], output_path: Path) -> None:
    """
    Write a notebook dictionary to a file.

    Args:
        notebook_dict: Notebook data as a Python dictionary
        output_path: Path where the notebook should be written
    """
    output_path.write_text(
        json.dumps(notebook_dict, indent=1, ensure_ascii=False), encoding="utf-8"
    )


def convert_notebook_to_marimo(temp_ipynb_path: Path) -> str:
    """
    Convert a Jupyter notebook to Marimo Python format using the marimo CLI.

    Args:
        temp_ipynb_path: Path to the temporary preprocessed notebook

    Returns:
        Marimo Python file content as a string
    """
    result = subprocess.run(
        ["marimo", "convert", str(temp_ipynb_path)],
        capture_output=True,
        text=True,
        check=True,
    )
    return result.stdout


def write_python_file(content: str, output_path: Path) -> None:
    """
    Write Python content to a file.

    Args:
        content: Python file content as a string
        output_path: Path where the file should be written
    """
    output_path.write_text(content, encoding="utf-8")


def _build_subprocess_line(cmd: str) -> str:
    """
    Generate a subprocess.run() call string from a shell command.

    This helper converts various shell command patterns to their Python subprocess
    equivalents, handling special cases like piped input.

    Conversion rules:
    - Simple commands: `command arg` → `subprocess.run(['command', 'arg'], check=True)`
    - Yes piping: `yes | command` → `subprocess.run(['command'], input='y\\n', ...)`
    - No piping: `no | command` → `subprocess.run(['command'], input='n\\n', ...)`
    - Complex pipes: `cmd1 | cmd2` → `subprocess.run('cmd1 | cmd2', shell=True, ...)`

    Args:
        cmd: The shell command string (without the leading `!`)

    Returns:
        A string containing Python code for subprocess.run()
    """
    cmd = cmd.strip()

    # No pipe → simple list argv
    if "|" not in cmd:
        argv = shlex.split(cmd)
        return f"subprocess.run({argv!r}, check=True)"

    # Split pipe
    left, right = map(str.strip, cmd.split("|", 1))
    left_lower = left.lower()

    # yes | command → feed "y\n"
    if left_lower == "yes":
        argv = shlex.split(right)
        return f"subprocess.run({argv!r}, input='y\\n', text=True, check=True)"

    # no | command → feed "n\n"
    if left_lower == "no":
        argv = shlex.split(right)
        return f"subprocess.run({argv!r}, input='n\\n', text=True, check=True)"

    # generic pipe: shell=True fallback
    return f"subprocess.run({cmd!r}, shell=True, check=True)"


if __name__ == "__main__":
    for ipynb_file in EDUCATION_NOTEBOOKS_DIR.glob("*/*.ipynb"):
        # 1. Read notebook file
        notebook_dict = read_notebook(ipynb_file)
        # 2. Replace Colab imports
        notebook_dict = replace_colab_imports_in_notebook(notebook_dict)
        # 3. Process shell commands
        notebook_dict, packages = process_shell_commands_in_notebook(notebook_dict)
        # 4. Write temporary notebook
        temp_ipynb_file = ipynb_file.with_name(
            f"{TEMP_IPYNB_FILE_PREIFX}_{ipynb_file.name}"
        )
        write_notebook(notebook_dict, temp_ipynb_file)
        # 5. Convert to Marimo format
        py_content = convert_notebook_to_marimo(temp_ipynb_file)
        # 6. Add inline dependencies
        py_content_with_deps = add_inline_dependencies_to_content(packages, py_content)
        # 7. Write final Python file
        output_path = ipynb_file.with_suffix(".py")
        write_python_file(py_content_with_deps, output_path)
        # 8. Clean up temporary files
        temp_ipynb_file.unlink()
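A minimal usage sketch of how these helpers compose on a toy in-memory notebook (illustrative only; the real entry point above iterates over `education/*/*.ipynb` and shells out to `marimo convert` for step 5, which the stub below skips):

```python
# Assumes this is run from the docs/ folder so that docs_tools is importable.
from docs_tools.education.preprocess_to_molab import (
    replace_colab_imports_in_notebook,
    process_shell_commands_in_notebook,
    add_inline_dependencies_to_content,
)

# A toy notebook with a Colab secret lookup and a pip install line
notebook = {
    "cells": [
        {
            "cell_type": "code",
            "source": [
                "!pip install dlt[duckdb]\n",
                "from google.colab import userdata\n",
                "key = userdata.get('API_KEY')\n",
            ],
        }
    ]
}

# Drops the Colab import and rewrites userdata.get(...) to os.getenv(...)
notebook = replace_colab_imports_in_notebook(notebook)
# Collects {"dlt[duckdb]"} and strips the !pip install line from the cell
notebook, packages = process_shell_commands_in_notebook(notebook)

# In the real flow the marimo CLI produces py_content; here we use a stub string
py_content = "import marimo\n"
print(add_inline_dependencies_to_content(packages, py_content))
# Prints a PEP 723 header listing dlt[duckdb] plus numpy, pandas, sqlalchemy, then the stub
```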
0  docs/docs_tools/education/tests/__init__.py  Normal file
109  docs/docs_tools/education/tests/test_preprocess_to_molab.py  Normal file
@@ -0,0 +1,109 @@
import pytest
from docs_tools.education.preprocess_to_molab import (
    replace_colab_imports_in_notebook,
    process_shell_commands_in_notebook,
    add_inline_dependencies_to_content,
)


def test_replace_colab_imports() -> None:
    """Ensure that Colab-specific imports are removed and converted where necessary."""
    notebook = {
        "cells": [
            {
                "cell_type": "code",
                "source": [
                    "from google.colab import userdata\n",
                    "api_key = userdata.get('API_KEY')\n",
                    "print(api_key)\n",
                ],
            },
        ]
    }
    result = replace_colab_imports_in_notebook(notebook)
    assert result == {
        "cells": [
            {
                "cell_type": "code",
                "source": [
                    "api_key = os.getenv('API_KEY')\n",
                    "print(api_key)\n",
                ],
            },
        ]
    }


def test_process_shell_commands_in_notebook() -> None:
    """Ensure that pip install commands are removed and shell commands converted."""
    notebook = {
        "cells": [
            {
                "cell_type": "code",
                "source": [
                    "!pip install dlt\n",
                    "!pip install dlt[bigquery,postgres]\n",
                    "!pip install requests==2.28.0\n",
                    "!pip install -q scikit-learn\n",
                ],
            },
            {
                "cell_type": "code",
                "source": [
                    "!ls -la\n",
                    "!pwd\n",
                    "!yes | dlt init source destination\n",
                    "!no | some_command --flag\n",
                    "!cat file.txt | grep pattern\n",
                    "%%capture\n",
                    "print('hello')\n",
                ],
            },
        ]
    }

    result, packages = process_shell_commands_in_notebook(notebook)
    assert packages == {
        "dlt",
        "dlt[bigquery,postgres]",
        "requests==2.28.0",
        "scikit-learn",
    }
    assert result == {
        "cells": [
            {"cell_type": "code", "source": []},
            {
                "cell_type": "code",
                "source": [
                    "import subprocess\n",
                    "subprocess.run(['ls', '-la'], check=True)\n",
                    "subprocess.run(['pwd'], check=True)\n",
                    "subprocess.run(['dlt', 'init', 'source', 'destination'], input='y\\n', text=True, check=True)\n",
                    "subprocess.run(['some_command', '--flag'], input='n\\n', text=True, check=True)\n",
                    "subprocess.run('cat file.txt | grep pattern', shell=True, check=True)\n",
                    "print('hello')\n",
                ],
            },
        ]
    }


def test_add_inline_dependencies_to_content() -> None:
    """Ensure that the PEP 723 metadata block is correctly added and includes MUST_INSTALL_PACKAGES."""
    packages = {"requests", "dlt[bigquery,postgres]"}
    py_content = "import marimo\n"
    result = add_inline_dependencies_to_content(packages, py_content)
    expected = """# /// script
# dependencies = [
# "dlt[bigquery,postgres]",
# "numpy",
# "pandas",
# "requests",
# "sqlalchemy",
# ]
# ///

import marimo
"""
    print(result)
    assert result == expected
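The committed tests cover the notebook-level helpers; the parametrized sketch below shows how the `_build_subprocess_line` conversion rules could be exercised directly. It is illustrative only and not part of this commit, though the expected strings match the outputs asserted above.

```python
# Illustrative extra test, not part of the commit; assumes the same import path as above.
import pytest

from docs_tools.education.preprocess_to_molab import _build_subprocess_line


@pytest.mark.parametrize(
    "cmd, expected",
    [
        # simple command -> list argv
        ("ls -la", "subprocess.run(['ls', '-la'], check=True)"),
        # yes-piped command -> feed "y\n" on stdin
        (
            "yes | dlt init source destination",
            "subprocess.run(['dlt', 'init', 'source', 'destination'], input='y\\n', text=True, check=True)",
        ),
        # generic pipe -> shell=True fallback
        (
            "cat file.txt | grep pattern",
            "subprocess.run('cat file.txt | grep pattern', shell=True, check=True)",
        ),
    ],
)
def test_build_subprocess_line_rules(cmd: str, expected: str) -> None:
    assert _build_subprocess_line(cmd) == expected
```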
31  docs/education/README.md  Normal file
@@ -0,0 +1,31 @@
# Adding New Notebooks

## Overview

The `.py` files in this directory are **auto-generated** from the `.ipynb` files. Only edit the `.ipynb` files.

To regenerate the `.py` files:
```bash
make build-molabs
```

Preprocessing logic: [`docs/docs_tools/education/`](../docs_tools/education/)

## Things to consider

To ensure compatibility with both **Google Colab** and **Marimo/Molab**:

### 1. **No inline comments**
Bad: `x = 5 # comment`
Good: comments on a separate line

**Why:** `marimo convert` scatters inline comments.

## Workflow

1. Create/edit the `.ipynb` in the course folder
2. Follow the guidelines above
3. Run `make build-molabs` to generate the `.py` files
4. Test both versions (Colab and Molab)
5. Commit both the `.ipynb` and `.py` files
6. Adjust the processing logic in `docs/docs_tools/education/` if necessary
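For reference, each generated `.py` file starts with the PEP 723 block that `preprocess_to_molab.py` prepends, followed by marimo boilerplate. The sketch below mirrors the structure of the generated lesson files in this commit; the dependency list, `__generated_with` version, and cell contents vary per notebook.

```python
# /// script
# dependencies = [
# "dlt",
# "numpy",
# "pandas",
# "sqlalchemy",
# ]
# ///

import marimo

__generated_with = "0.17.4"
app = marimo.App()


@app.cell(hide_code=True)
def _(mo):
    # Lesson title and the Molab/Colab/GitHub badges go here (placeholder text)
    mo.md(r"""# Lesson title and badges""")
    return


if __name__ == "__main__":
    app.run()
```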
@@ -6,7 +6,7 @@
|
||||
"id": "TKD-8-XUjqU4"
|
||||
},
|
||||
"source": [
|
||||
"# **Building custom sources with [dlt REST API source](https://dlthub.com/docs/devel/dlt-ecosystem/verified-sources/rest_api/basic) and [RESTClient](https://dlthub.com/docs/devel/general-usage/http/rest-client)** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb)"
|
||||
"# **Building custom sources with [dlt REST API source](https://dlthub.com/docs/devel/dlt-ecosystem/verified-sources/rest_api/basic) and [RESTClient](https://dlthub.com/docs/devel/general-usage/http/rest-client)** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -46,7 +46,9 @@
|
||||
"We constructed a custom source for the **GitHub API** using the `RESTClient` class, decorators like `@dlt.resource` and `@dlt.source`, and manual pagination handling.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### **Example**"
|
||||
"#### **Example**\n",
|
||||
"\n",
|
||||
"> Don't forget to use your [GitHub API token](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28) below! "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -81,7 +83,7 @@
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"os.environ[\"ACCESS_TOKEN\"] = userdata.get(\"ACCESS_TOKEN\")\n",
|
||||
"dlt.secrets[\"ACCESS_TOKEN\"] = userdata.get(\"ACCESS_TOKEN\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.source\n",
|
||||
@@ -148,7 +150,7 @@
|
||||
" \"client\": {\n",
|
||||
" \"base_url\": \"https://api.github.com\",\n",
|
||||
" \"auth\": {\n",
|
||||
" \"token\": dlt.secrets[\"access_token\"], # Access token configured above\n",
|
||||
" \"token\": dlt.secrets[\"access_token\"],\n",
|
||||
" },\n",
|
||||
" \"paginator\": \"header_link\",\n",
|
||||
" },\n",
|
||||
@@ -182,14 +184,14 @@
|
||||
"\n",
|
||||
"git_source = rest_api_source(config)\n",
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"rest_api_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"rest_api_github\",\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"rest_api_data\",\n",
|
||||
" dev_mode=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(git_source)\n",
|
||||
"load_info = rest_api_pipeline.run(git_source)\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -212,7 +214,7 @@
|
||||
"source": [
|
||||
"If you don't like black boxes and prefer lower-level building blocks, then our `RESTClient` is perfect for you!\n",
|
||||
"\n",
|
||||
"The `RESTClient` class offers an Pythonic interface for interacting with RESTful APIs, including features like:\n",
|
||||
"The `RESTClient` class offers a Pythonic interface for interacting with RESTful APIs, including features like:\n",
|
||||
"\n",
|
||||
"- automatic pagination,\n",
|
||||
"- various authentication mechanisms,\n",
|
||||
@@ -225,7 +227,7 @@
|
||||
"- How to build a custom `@dlt.source`\n",
|
||||
"- How to run the pipeline and inspect the data\n",
|
||||
"\n",
|
||||
"For more information, read `dlt` [REST API Client](https://dlthub.com/devel/general-usage/http/rest-client) official documentation."
|
||||
"For more information, read `dlt`'s official documentation for the [REST API Client](https://dlthub.com/devel/general-usage/http/rest-client)."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -248,11 +250,10 @@
|
||||
"source": [
|
||||
"from dlt.sources.helpers.rest_client import RESTClient\n",
|
||||
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
|
||||
"from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"os.environ[\"ACCESS_TOKEN\"] = userdata.get(\"ACCESS_TOKEN\")\n",
|
||||
"dlt.secrets[\"ACCESS_TOKEN\"] = userdata.get(\"ACCESS_TOKEN\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"client = RESTClient(\n",
|
||||
@@ -335,7 +336,7 @@
|
||||
"\n",
|
||||
"#### **Authentication Details:**\n",
|
||||
"\n",
|
||||
"To use NewsAPI, you must register for a **free account** and obtain an API key. This key is required for all endpoints and must be included as a query parameter in your request:\n",
|
||||
"To use the NewsAPI, you must register for a **free account** and obtain an API key. This key is required for all endpoints and must be included as a query parameter in your request:\n",
|
||||
"\n",
|
||||
"```http\n",
|
||||
"GET /v2/everything?q=python&page=1&apiKey=YOUR_API_KEY\n",
|
||||
@@ -357,7 +358,7 @@
|
||||
"\n",
|
||||
"1. **Sign up** at [https://newsapi.org/register](https://newsapi.org/register)\n",
|
||||
"2. Copy your **API key** from your dashboard\n",
|
||||
"3. Save your **API key** in Colab Secrets (side-bar on the right) as NEWS_API_KEY\n",
|
||||
"3. Save your **API key** in Colab (or Molab) Secrets (side-bar on the right) as NEWS_API_KEY\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"### **How we chose the right authenticator for NewsAPI**\n",
|
||||
@@ -423,12 +424,12 @@
|
||||
"\n",
|
||||
"api_key = userdata.get(\"NEWS_API_KEY\")\n",
|
||||
"\n",
|
||||
"client = RESTClient(\n",
|
||||
"news_api_client = RESTClient(\n",
|
||||
" base_url=\"https://newsapi.org/v2/\",\n",
|
||||
" auth=APIKeyAuth(name=\"apiKey\", api_key=api_key, location=\"query\"),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"response = client.get(\"everything\", params={\"q\": \"python\", \"page\": 1})\n",
|
||||
"response = news_api_client.get(\"everything\", params={\"q\": \"python\", \"page\": 1})\n",
|
||||
"print(response.json())"
|
||||
]
|
||||
},
|
||||
@@ -503,16 +504,24 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page_iterator = client.paginate(\"everything\", params={\"q\": \"python\", \"page\": 1})\n",
|
||||
"page_iterator = news_api_client.paginate(\n",
|
||||
" \"everything\", params={\"q\": \"python\", \"page\": 1}\n",
|
||||
")\n",
|
||||
"# prints the original request object\n",
|
||||
"print(next(page_iterator).request)\n",
|
||||
"page_iterator = client.paginate(\"everything\", params={\"q\": \"python\", \"page\": 1})\n",
|
||||
"page_iterator = news_api_client.paginate(\n",
|
||||
" \"everything\", params={\"q\": \"python\", \"page\": 1}\n",
|
||||
")\n",
|
||||
"# prints the raw HTTP response\n",
|
||||
"print(next(page_iterator).response)\n",
|
||||
"page_iterator = client.paginate(\"everything\", params={\"q\": \"python\", \"page\": 1})\n",
|
||||
"page_iterator = news_api_client.paginate(\n",
|
||||
" \"everything\", params={\"q\": \"python\", \"page\": 1}\n",
|
||||
")\n",
|
||||
"# prints the paginator that was used\n",
|
||||
"print(next(page_iterator).paginator)\n",
|
||||
"page_iterator = client.paginate(\"everything\", params={\"q\": \"python\", \"page\": 1})\n",
|
||||
"page_iterator = news_api_client.paginate(\n",
|
||||
" \"everything\", params={\"q\": \"python\", \"page\": 1}\n",
|
||||
")\n",
|
||||
"# prints the authentication class used\n",
|
||||
"print(next(page_iterator).auth)"
|
||||
]
|
||||
@@ -545,7 +554,7 @@
|
||||
"### **Question 1:**\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Which paginator is used by `client.paginate()` by default in the example above?\n",
|
||||
"Which paginator is used by `news_api_client.paginate()` by default in the example above?\n",
|
||||
"\n",
|
||||
"\n",
|
||||
">Answer this question and select the correct option in the homework Google Form.\n"
|
||||
@@ -627,19 +636,19 @@
|
||||
"api_key = userdata.get(\"NEWS_API_KEY\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"client = RESTClient(\n",
|
||||
"another_client = RESTClient(\n",
|
||||
" base_url=\"https://newsapi.org/v2/\",\n",
|
||||
" auth=APIKeyAuth(name=\"apiKey\", api_key=api_key, location=\"query\"),\n",
|
||||
" paginator=PageNumberPaginator(\n",
|
||||
" base_page=1, # NewsAPI starts paging from 1\n",
|
||||
" page_param=\"page\", # Matches the API spec\n",
|
||||
" total_path=None, # Set it to None explicitly\n",
|
||||
" stop_after_empty_page=True, # Stop if no articles returned\n",
|
||||
" maximum_page=4, # Optional limit for dev/testing\n",
|
||||
" base_page=1,\n",
|
||||
" page_param=\"page\",\n",
|
||||
" total_path=None,\n",
|
||||
" stop_after_empty_page=True,\n",
|
||||
" maximum_page=4,\n",
|
||||
" ),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for page in client.paginate(\n",
|
||||
"for page in another_client.paginate(\n",
|
||||
" \"everything\", params={\"q\": \"python\", \"pageSize\": 5, \"language\": \"en\"}\n",
|
||||
"):\n",
|
||||
" for article in page:\n",
|
||||
@@ -670,14 +679,14 @@
|
||||
"from dlt.sources.helpers.rest_client import RESTClient\n",
|
||||
"from dlt.sources.helpers.rest_client.auth import APIKeyAuth\n",
|
||||
"\n",
|
||||
"os.environ[\"API_KEY\"] = userdata.get(\"NEWS_API_KEY\")\n",
|
||||
"dlt.secrets[\"NEWS_API_KEY\"] = userdata.get(\"NEWS_API_KEY\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.resource(write_disposition=\"replace\", name=\"python_articles\")\n",
|
||||
"def get_articles(api_key: str = dlt.secrets.value) -> Iterator[TDataItems]:\n",
|
||||
"def get_articles(news_api_key: str = dlt.secrets.value) -> Iterator[TDataItems]:\n",
|
||||
" client = RESTClient(\n",
|
||||
" base_url=\"https://newsapi.org/v2/\",\n",
|
||||
" auth=APIKeyAuth(name=\"apiKey\", api_key=api_key, location=\"query\"),\n",
|
||||
" auth=APIKeyAuth(name=\"apiKey\", api_key=news_api_key, location=\"query\"),\n",
|
||||
" paginator=PageNumberPaginator(\n",
|
||||
" base_page=1,\n",
|
||||
" page_param=\"page\",\n",
|
||||
@@ -715,11 +724,11 @@
|
||||
"from dlt.sources.helpers.rest_client import RESTClient\n",
|
||||
"from dlt.sources.helpers.rest_client.auth import APIKeyAuth\n",
|
||||
"\n",
|
||||
"os.environ[\"API_KEY\"] = userdata.get(\"NEWS_API_KEY\")\n",
|
||||
"dlt.secrets[\"NEWS_API_KEY\"] = userdata.get(\"NEWS_API_KEY\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.resource(write_disposition=\"replace\", name=\"top_articles\")\n",
|
||||
"def get_top_articles(api_key: str = dlt.secrets.value) -> Iterator[TDataItems]:\n",
|
||||
"def get_top_articles(news_api_key: str = dlt.secrets.value) -> Iterator[TDataItems]:\n",
|
||||
" client = RESTClient(\n",
|
||||
" base_url=\"https://newsapi.org/v2/\",\n",
|
||||
" auth=APIKeyAuth(name=\"apiKey\", api_key=api_key, location=\"query\"),\n",
|
||||
@@ -759,8 +768,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@dlt.source\n",
|
||||
"def newsapi_source(api_key: str = dlt.secrets.value) -> Iterable[DltResource]:\n",
|
||||
" return [get_articles(api_key=api_key), get_top_articles(api_key=api_key)]"
|
||||
"def newsapi_source(news_api_key: str = dlt.secrets.value) -> Iterable[DltResource]:\n",
|
||||
" return [get_articles(news_api_key), get_top_articles(news_api_key)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -843,7 +852,7 @@
|
||||
"\n",
|
||||
"dlt will take care of the rest: **unnesting the data, inferring the schema**, etc., and **writing to the destination**\n",
|
||||
"\n",
|
||||
"In previous section you've already met Rest API Client. `dlt`’s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low level abstraction** that powers the REST API Source.\n",
|
||||
"In the previous section, you've already learned about the Rest API Client. `dlt`’s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low level abstraction** that powers the REST API Source.\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
@@ -909,7 +918,7 @@
|
||||
"source": [
|
||||
"### **RESTAPIConfig**\n",
|
||||
"\n",
|
||||
"The central object when working with `rest_api_source` is the `RESTAPIConfig`. This is a declarative Python dictionary that tells dlt everything it needs to know about the API you are connecting to.\n",
|
||||
"The central object when working with the `rest_api_source` is the `RESTAPIConfig`. This is a declarative Python dictionary that tells dlt everything it needs to know about the API you are connecting to.\n",
|
||||
"\n",
|
||||
"It defines:\n",
|
||||
"- how to connect to the API (base URL, authentication)\n",
|
||||
@@ -1045,7 +1054,7 @@
|
||||
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(news_source)\n",
|
||||
"pipeline.run(news_source)\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -1081,7 +1090,7 @@
|
||||
"}\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"This ensures every request has `?apiKey=...` added. It's simple and secure, especially when storing the key in ENVs or Colab's secret manager.\n",
|
||||
"This ensures every request has `?apiKey=...` added. It's simple and secure, especially when storing the key in ENVs or Colab or Molab's secret manager.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"The available authentication methods you can find in [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#authentication)."
|
||||
@@ -1122,12 +1131,12 @@
|
||||
"\n",
|
||||
"news_source = rest_api_source(news_config)\n",
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"another_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(news_source)\n",
|
||||
"print(pipeline.last_trace)"
|
||||
"another_pipeline.run(news_source)\n",
|
||||
"print(another_pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1202,7 +1211,7 @@
|
||||
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(news_source)\n",
|
||||
"pipeline.run(news_source)\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -1292,7 +1301,7 @@
|
||||
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(news_source)\n",
|
||||
"pipeline.run(news_source)\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -1318,7 +1327,7 @@
|
||||
"- dlt will remember the last `publishedAt` seen\n",
|
||||
"- On the next run, it will only request articles newer than that\n",
|
||||
"\n",
|
||||
"This is optional and depends on your usage pattern.\n"
|
||||
"This is optional and depends on your usage pattern."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1331,8 +1340,14 @@
|
||||
"source": [
|
||||
"import dlt\n",
|
||||
"from dlt.sources.rest_api import rest_api_source\n",
|
||||
"from datetime import datetime, timedelta, timezone\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"# the free plan of newsapi.org only allows you to fetch news from a maximum of 1 month ago\n",
|
||||
"one_month_ago = datetime.now(timezone.utc) - timedelta(days=30)\n",
|
||||
"initial_from = one_month_ago.replace(microsecond=0).strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"api_key = userdata.get(\"NEWS_API_KEY\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -1365,7 +1380,7 @@
|
||||
" \"from\": {\n",
|
||||
" \"type\": \"incremental\",\n",
|
||||
" \"cursor_path\": \"publishedAt\",\n",
|
||||
" \"initial_value\": \"2025-04-15T00:00:00Z\",\n",
|
||||
" \"initial_value\": initial_from,\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
@@ -1379,11 +1394,11 @@
|
||||
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(news_source)\n",
|
||||
"pipeline.run(news_source)\n",
|
||||
"print(pipeline.last_trace)\n",
|
||||
"\n",
|
||||
"# Run the pipeline one more time\n",
|
||||
"load_info = pipeline.run(news_source)\n",
|
||||
"pipeline.run(news_source)\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -1471,7 +1486,7 @@
|
||||
" \"from\": {\n",
|
||||
" \"type\": \"incremental\",\n",
|
||||
" \"cursor_path\": \"publishedAt\",\n",
|
||||
" \"initial_value\": \"2025-04-15T00:00:00Z\",\n",
|
||||
" \"initial_value\": initial_from,\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
@@ -1485,11 +1500,11 @@
|
||||
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(news_source)\n",
|
||||
"pipeline.run(news_source)\n",
|
||||
"print(pipeline.last_trace)\n",
|
||||
"\n",
|
||||
"# Run the pipeline one more time\n",
|
||||
"load_info = pipeline.run(news_source)\n",
|
||||
"pipeline.run(news_source)\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -1580,7 +1595,7 @@
|
||||
" \"from\": {\n",
|
||||
" \"type\": \"incremental\",\n",
|
||||
" \"cursor_path\": \"publishedAt\",\n",
|
||||
" \"initial_value\": \"2025-04-15T00:00:00Z\",\n",
|
||||
" \"initial_value\": initial_from,\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
@@ -1601,7 +1616,7 @@
|
||||
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(news_source)\n",
|
||||
"pipeline.run(news_source)\n",
|
||||
"print(pipeline.last_trace)\n",
|
||||
"\n",
|
||||
"pipeline.dataset().top_headlines.df().head()"
|
||||
@@ -1672,9 +1687,10 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def debug_response(\n",
|
||||
" response: requests.Response, *args: Any, **kwargs: Any\n",
|
||||
") -> requests.Response:\n",
|
||||
"from dlt.sources.helpers.requests import Response\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def debug_response(response: Response, *args: Any, **kwargs: Any) -> Response:\n",
|
||||
" print(\"Intercepted:\", response.status_code)\n",
|
||||
" return response"
|
||||
]
|
||||
@@ -1728,7 +1744,7 @@
|
||||
" \"response_actions\": [\n",
|
||||
" {\n",
|
||||
" \"status_code\": 200,\n",
|
||||
" \"action\": debug_response, # <--- add some action\n",
|
||||
" \"action\": debug_response,\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
" \"params\": {\n",
|
||||
@@ -1736,7 +1752,7 @@
|
||||
" \"from\": {\n",
|
||||
" \"type\": \"incremental\",\n",
|
||||
" \"cursor_path\": \"publishedAt\",\n",
|
||||
" \"initial_value\": \"2025-04-15T00:00:00Z\",\n",
|
||||
" \"initial_value\": initial_from,\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
@@ -1757,7 +1773,7 @@
|
||||
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(news_source)\n",
|
||||
"pipeline.run(news_source)\n",
|
||||
"print(pipeline.last_trace)\n",
|
||||
"\n",
|
||||
"pipeline.dataset().news_articles.df().head()"
|
||||
@@ -1807,8 +1823,8 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def lower_title(record: TDataItem) -> TDataItem:\n",
|
||||
" record[\"title\"] = record[\"title\"].lower()\n",
|
||||
"def lower_title(record: dict[str, Any]) -> dict[str, Any]:\n",
|
||||
" record[\"title\"] = str(record[\"title\"]).lower()\n",
|
||||
" return record"
|
||||
]
|
||||
},
|
||||
@@ -1857,8 +1873,8 @@
|
||||
" {\n",
|
||||
" \"name\": \"news_articles\",\n",
|
||||
" \"processing_steps\": [\n",
|
||||
" {\"filter\": lambda x: len(x[\"author\"]) > 0}, # <--- add filter\n",
|
||||
" {\"map\": lower_title}, # <--- add some transformation\n",
|
||||
" {\"filter\": lambda x: len(x[\"author\"]) > 0},\n",
|
||||
" {\"map\": lower_title},\n",
|
||||
" ],\n",
|
||||
" \"endpoint\": {\n",
|
||||
" \"path\": \"everything\",\n",
|
||||
@@ -1873,7 +1889,7 @@
|
||||
" \"from\": {\n",
|
||||
" \"type\": \"incremental\",\n",
|
||||
" \"cursor_path\": \"publishedAt\",\n",
|
||||
" \"initial_value\": \"2025-04-15T00:00:00Z\",\n",
|
||||
" \"initial_value\": initial_from,\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
@@ -1894,7 +1910,7 @@
|
||||
" pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(news_source)\n",
|
||||
"pipeline.run(news_source)\n",
|
||||
"print(pipeline.last_trace)\n",
|
||||
"\n",
|
||||
"pipeline.dataset().news_articles.df().head()"
|
||||
@@ -1944,15 +1960,15 @@
|
||||
"\n",
|
||||
"### Requirements:\n",
|
||||
"1. Use `rest_api_source` to define your source config.\n",
|
||||
"2. This API uses **pagination**. Figure out what type is it.\n",
|
||||
"2. This API uses **pagination**. Figure out what type it is.\n",
|
||||
"3. Add incremental loading to `orders`, starting from `2017-08-01` and using `ordered_at` as the cursor.\n",
|
||||
"4. Add `processing_steps` to `orders`:\n",
|
||||
" - Remove records from orders which `order_total` > 500.\n",
|
||||
" - Remove records from orders for which it is true that `order_total` > 500.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"### Question:\n",
|
||||
"How many rows does resulted table `orders` contain?\n"
|
||||
"How many rows does the resulting table `orders` contain?\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1972,7 +1988,7 @@
|
||||
"id": "70D6czgeId7F"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Well done! Go to [the next lesson.](https://colab.research.google.com/drive/1lQ8VkrGJwZMsVtbkuYympcvbv0_CCgYo#forceEdit=true&sandboxMode=true)"
|
||||
"✅ ▶ Well done! Go to [the next lesson.](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
File diff suppressed because it is too large
@@ -6,7 +6,7 @@
|
||||
"id": "NvaKFdYx-kbG"
|
||||
},
|
||||
"source": [
|
||||
"# Building custom sources using SQL Databases [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)\n",
|
||||
"# Building custom sources using SQL Databases [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)\n",
|
||||
"\n",
|
||||
"This lesson covers building flexible and powerful custom sources using the `sql_database` verified source.\n"
|
||||
]
|
||||
@@ -32,15 +32,6 @@
|
||||
"- How to load only new data with incremental loading\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "4PRqLBIQA7rj"
|
||||
},
|
||||
"source": [
|
||||
"Setup & install dlt:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -198,7 +189,7 @@
|
||||
"id": "YjPZMS6DWVNN"
|
||||
},
|
||||
"source": [
|
||||
"Let's save this filtered data:"
|
||||
"Let's load this filtered data:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -209,7 +200,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"info = pipeline.run(filtered_resource, table_name=\"bacterias\")\n",
|
||||
"info = pipeline.run(filtered_resource, table_name=\"bacteria\")\n",
|
||||
"print(info)"
|
||||
]
|
||||
},
|
||||
@@ -230,7 +221,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline.dataset().bacterias.df().head()"
|
||||
"pipeline.dataset().bacteria.df().head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -241,7 +232,7 @@
|
||||
"source": [
|
||||
"### **Question 1**:\n",
|
||||
"\n",
|
||||
"How many rows are present in the `bacterias` table?\n",
|
||||
"How many rows are present in the `bacteria` table?\n",
|
||||
"\n",
|
||||
">Answer this question and select the correct option in the homework Quiz.\n"
|
||||
]
|
||||
@@ -278,8 +269,10 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"def add_max_timestamp(table: Table) -> Any:\n",
|
||||
" max_ts = sa.func.greatest(table.c.created, table.c.updated).label(\"max_timestamp\")\n",
|
||||
" subq = sa.select(*table.c, max_ts).subquery()\n",
|
||||
" max_ts = sa.func.greatest(table.columns.created, table.columns.updated).label(\n",
|
||||
" \"max_timestamp\"\n",
|
||||
" )\n",
|
||||
" subq = sa.select(*table.columns, max_ts).subquery()\n",
|
||||
" return subq"
|
||||
]
|
||||
},
|
||||
@@ -476,7 +469,7 @@
|
||||
"\n",
|
||||
"We'll also be looking at where these incremental values are stored.\n",
|
||||
"\n",
|
||||
"Hint: they are stored in [dlt state](https://dlthub.com/docs/general-usage/state)."
|
||||
"Hint: they are stored in the [dlt state](https://dlthub.com/docs/general-usage/state)."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -583,17 +576,8 @@
|
||||
"id": "IkvUgaRhI6iY"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1P8pOw9C6J9555o2jhZydESVuVb-3z__y#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "Iz0lz3QhJEvv"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@@ -0,0 +1,435 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt",
|
||||
# "duckdb",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "pymysql",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Building custom sources using SQL Databases [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)
|
||||
|
||||
This lesson covers building flexible and powerful custom sources using the `sql_database` verified source.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## What you will learn
|
||||
|
||||
- How to build a custom pipeline using SQL sources
|
||||
- How to use `query_adapter_callback`, `table_adapter_callback`, and `type_adapter_callback`
|
||||
- How to load only new data with incremental loading
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""## Step 1: Load data from SQL Databases""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""We’ll use the [Rfam MySQL public DB](https://docs.rfam.org/en/latest/database.html) and load it into DuckDB:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
from typing import Any
|
||||
from dlt.sources.sql_database import sql_database
|
||||
import dlt
|
||||
|
||||
_source = sql_database(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
|
||||
table_names=["family"],
|
||||
)
|
||||
pipeline = dlt.pipeline(
|
||||
pipeline_name="sql_database_example",
|
||||
destination="duckdb",
|
||||
dataset_name="sql_data",
|
||||
dev_mode=True,
|
||||
)
|
||||
load_info = pipeline.run(_source)
|
||||
print(load_info)
|
||||
return Any, dlt, pipeline, sql_database
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Explore the `family` table:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
pipeline.dataset().family.df().head()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 2: Customize SQL queries with `query_adapter_callback`
|
||||
|
||||
You can fully rewrite or modify the SQL SELECT statement per table.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### Filter rows using a WHERE clause""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
from sqlalchemy import text
|
||||
from dlt.sources.sql_database.helpers import SelectClause, Table
|
||||
|
||||
def query_adapter_callback(query: SelectClause, table: Table) -> SelectClause:
|
||||
return text(f"SELECT * FROM {table.fullname} WHERE rfam_id like '%bacteria%'")
|
||||
return Table, query_adapter_callback
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""To be able to use `sql_database` and not have to declare the connection string each time, we save it as an environment variable. This can also (should preferably) be done in `secrets.toml`"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import os
|
||||
|
||||
os.environ[
|
||||
"SOURCES__SQL_DATABASE__CREDENTIALS"
|
||||
] = "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(query_adapter_callback, sql_database):
|
||||
filtered_resource = sql_database(
|
||||
query_adapter_callback=query_adapter_callback, table_names=["family"]
|
||||
)
|
||||
return (filtered_resource,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Let's load this filtered data:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(filtered_resource, pipeline):
|
||||
_info = pipeline.run(filtered_resource, table_name="bacteria")
|
||||
print(_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Explore the data:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
pipeline.dataset().bacteria.df().head()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Question 1**:
|
||||
|
||||
How many rows are present in the `bacteria` table?
|
||||
|
||||
>Answer this question and select the correct option in the homework Quiz.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 3: Modify table schema with `table_adapter_callback`
|
||||
|
||||
Add columns, change types, or transform schema using this hook.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### Example: Add computed column `max_timestamp`""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(Any, Table):
|
||||
import sqlalchemy as sa
|
||||
|
||||
def add_max_timestamp(table: Table) -> Any:
|
||||
max_ts = sa.func.greatest(table.columns.created, table.columns.updated).label(
|
||||
"max_timestamp"
|
||||
)
|
||||
subq = sa.select(*table.columns, max_ts).subquery()
|
||||
return subq
|
||||
return add_max_timestamp, sa
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Use it with `sql_table`:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(add_max_timestamp, dlt, pipeline):
|
||||
from dlt.sources.sql_database import sql_table
|
||||
|
||||
table = sql_table(
|
||||
table="family",
|
||||
table_adapter_callback=add_max_timestamp,
|
||||
incremental=dlt.sources.incremental("max_timestamp"),
|
||||
)
|
||||
_info = pipeline.run(table, table_name="family_with_max_timestamp")
|
||||
print(_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Let's check out if this column exists!""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
pipeline.dataset().family_with_max_timestamp.df().head()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 4: Adapt column data types with `type_adapter_callback`
|
||||
|
||||
When the default types don’t match what you want in the destination, you can remap them.
|
||||
|
||||
Let's look at the schema that has already been loaded:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
schema = pipeline.default_schema.to_dict()["tables"]["family"]["columns"]
|
||||
for _column in schema:
|
||||
print(schema[_column]["name"], ":", schema[_column]["data_type"])
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Lets change `hmm_lambda` from decimal to float.
|
||||
|
||||
💡 Quick fyi: The `float` data type is:
|
||||
- Fast and uses less space
|
||||
- But it's approximate — you may get 0.30000000000000004 instead of 0.3
|
||||
- Bad for money, great for probabilities, large numeric ranges, scientific values
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### Example: Change data types""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(Any, sa):
|
||||
from sqlalchemy.types import Float
|
||||
|
||||
def type_adapter_callback(sql_type: Any) -> Any:
|
||||
if isinstance(sql_type, sa.Numeric):
|
||||
return Float
|
||||
return sql_type
|
||||
return (type_adapter_callback,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Use it with `sql_database`:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline, sql_database, type_adapter_callback):
|
||||
new_source = sql_database(
|
||||
type_adapter_callback=type_adapter_callback, table_names=["family"]
|
||||
)
|
||||
_info = pipeline.run(new_source, table_name="type_changed_family")
|
||||
print(_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""👀 Can you see how the column data types have changed?""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
schema1 = pipeline.default_schema.to_dict()["tables"]["family"]["columns"]
|
||||
schema2 = pipeline.default_schema.to_dict()["tables"]["type_changed_family"][
|
||||
"columns"
|
||||
]
|
||||
_column = "trusted_cutoff"
|
||||
print(
|
||||
"For table 'family':",
|
||||
schema1[_column]["name"],
|
||||
":",
|
||||
schema1[_column]["data_type"],
|
||||
)
|
||||
print(
|
||||
"For table 'type_changed_family':",
|
||||
schema2[_column]["name"],
|
||||
":",
|
||||
schema2[_column]["data_type"],
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Question 2**:
|
||||
|
||||
How many columns had their type changed in the `type_changed_family` table?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 5: Incremental loads with `sql_database`
|
||||
Track only new rows using a timestamp or ID column.
|
||||
|
||||
We'll also be looking at where these incremental values are stored.
|
||||
|
||||
Hint: they are stored in the [dlt state](https://dlthub.com/docs/general-usage/state).
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import json
|
||||
|
||||
with open(
|
||||
"/var/dlt/pipelines/sql_database_example/state.json", "r", encoding="utf-8"
|
||||
) as _f:
|
||||
_data = json.load(_f)
|
||||
_data["sources"]["sql_database"]["resources"]["family"]["incremental"].keys()
|
||||
return (json,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, pipeline, sql_database):
|
||||
import pendulum
|
||||
|
||||
_source = sql_database(table_names=["family"])
|
||||
_source.family.apply_hints(
|
||||
incremental=dlt.sources.incremental(
|
||||
"updated", initial_value=pendulum.datetime(2024, 1, 1)
|
||||
)
|
||||
)
|
||||
_info = pipeline.run(_source)
|
||||
print(_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(json):
|
||||
with open(
|
||||
"/var/dlt/pipelines/sql_database_example/state.json", "r", encoding="utf-8"
|
||||
) as _f:
|
||||
_data = json.load(_f)
|
||||
_data["sources"]["sql_database"]["resources"]["family"]["incremental"].keys()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""## **Rename tables for `sql_database` source**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, sql_database):
|
||||
_source = sql_database(table_names=["family"])
|
||||
for _resource_name, resource in _source.resources.items():
|
||||
resource.apply_hints(table_name=f"xxxx__{resource.name}")
|
||||
pipeline_1 = dlt.pipeline(
|
||||
pipeline_name="sql_db_prefixed_tables",
|
||||
destination="duckdb",
|
||||
dataset_name="renamed_tables",
|
||||
)
|
||||
print(pipeline_1.run(_source))
|
||||
pipeline_1.dataset().row_counts().df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,7 +6,7 @@
|
||||
"id": "8ucJBHffzqYB"
|
||||
},
|
||||
"source": [
|
||||
"# Building Custom Sources with the Filesystem in `dlt` [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)"
|
||||
"# Building Custom Sources with the Filesystem in `dlt` [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -24,8 +24,6 @@
|
||||
"id": "F5ayDx9Nz1ts"
|
||||
},
|
||||
"source": [
|
||||
"You will learn how to:\n",
|
||||
"\n",
|
||||
"- Use the `filesystem` resource to build real custom sources\n",
|
||||
"- Apply filters to file metadata (name, size, date)\n",
|
||||
"- Implement and register custom transformers\n",
|
||||
@@ -42,15 +40,6 @@
|
||||
"## Setup: Download real data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "siTnHHjg1fSK"
|
||||
},
|
||||
"source": [
|
||||
"Install dlt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -80,7 +69,14 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!mkdir -p local_data && wget -O local_data/userdata.parquet https://www.timestored.com/data/sample/userdata.parquet"
|
||||
"import urllib.request\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.makedirs(\"local_data\", exist_ok=True)\n",
|
||||
"\n",
|
||||
"url = \"https://www.timestored.com/data/sample/userdata.parquet\"\n",
|
||||
"dest = \"local_data/userdata.parquet\"\n",
|
||||
"urllib.request.urlretrieve(url, dest)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -277,7 +273,9 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"# Download a JSON file\n",
|
||||
"!wget -O local_data/sample.json https://jsonplaceholder.typicode.com/users\n",
|
||||
"url = \"https://jsonplaceholder.typicode.com/users\"\n",
|
||||
"dest = \"local_data/sample.json\"\n",
|
||||
"urllib.request.urlretrieve(url, dest)\n",
|
||||
"\n",
|
||||
"fs = filesystem(bucket_url=\"./local_data\", file_glob=\"sample.json\")\n",
|
||||
"pipeline = dlt.pipeline(\"json_pipeline\", destination=\"duckdb\")\n",
|
||||
@@ -366,7 +364,7 @@
|
||||
"id": "XoWLhw7DLg7i"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/14br3TZTRFwTSwpDyom7fxlZCeRF4efMk#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)!"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -375,15 +373,6 @@
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "rBJ9K3XwMhZW"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@@ -0,0 +1,301 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt[duckdb]",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""# Building Custom Sources with the Filesystem in `dlt` [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""## What you will learn""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
- Use the `filesystem` resource to build real custom sources
|
||||
- Apply filters to file metadata (name, size, date)
|
||||
- Implement and register custom transformers
|
||||
- Enrich records with file metadata
|
||||
- Use incremental loading both for files and content
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""## Setup: Download real data""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""We’ll use a real `.parquet` file from [TimeStored.com](https://www.timestored.com/data/sample/userdata.parquet)"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import urllib.request
|
||||
import os
|
||||
|
||||
os.makedirs("local_data", exist_ok=True)
|
||||
_url = "https://www.timestored.com/data/sample/userdata.parquet"
|
||||
_dest = "local_data/userdata.parquet"
|
||||
urllib.request.urlretrieve(_url, _dest)
|
||||
return os, urllib
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 1: Load Parquet file from Local Filesystem
|
||||
|
||||
**What the script below does**: Lists and reads all `.parquet` files in `./local_data` and loads them into a table named `userdata`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import dlt
|
||||
from dlt.sources.filesystem import filesystem, read_parquet
|
||||
|
||||
    # Point to the local file directory
    _fs = filesystem(bucket_url="./local_data", file_glob="**/*.parquet")
    # Add a transformer
    parquet_data = _fs | read_parquet()
    # Create and run pipeline
    pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb")
    _load_info = pipeline.run(parquet_data.with_name("userdata"))
    print(_load_info)
    # Inspect data
    pipeline.dataset().userdata.df().head()
|
||||
return dlt, filesystem, pipeline, read_parquet
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Question 1**:
|
||||
|
||||
In the `my_pipeline` pipeline and its `userdata` table, what is the men:women ratio as a decimal?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
# check out the numbers below and answer 👀
|
||||
df = pipeline.dataset().userdata.df()
|
||||
df.groupby("gender").describe()
|
||||
return
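

# A hedged sketch for Question 1: compute the men:women ratio directly.
# Assumes the `gender` column uses the values "Male" and "Female", as in the sample file.
@app.cell
def _(pipeline):
    _counts = pipeline.dataset().userdata.df()["gender"].value_counts()
    print(round(_counts.get("Male", 0) / _counts.get("Female", 1), 2))
    return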
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 2: Enrich records with file metadata
|
||||
|
||||
Let’s add the file name to every record to track the data origin.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, filesystem):
|
||||
from dlt.common.typing import TDataItems
|
||||
|
||||
@dlt.transformer()
|
||||
def read_parquet_with_filename(files: TDataItems) -> TDataItems:
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
for file_item in files:
|
||||
with file_item.open() as f:
|
||||
table = pq.read_table(f).to_pandas()
|
||||
table["source_file"] = file_item["file_name"]
|
||||
yield table.to_dict(orient="records")
|
||||
|
||||
_fs = filesystem(bucket_url="./local_data", file_glob="*.parquet")
|
||||
pipeline_1 = dlt.pipeline("meta_pipeline", destination="duckdb")
|
||||
_load_info = pipeline_1.run(
|
||||
(_fs | read_parquet_with_filename()).with_name("userdata")
|
||||
)
|
||||
print(_load_info)
|
||||
return (TDataItems,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""## Step 3: Filter files by metadata""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Only load files matching custom logic:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, filesystem, read_parquet):
|
||||
_fs = filesystem(bucket_url="./local_data", file_glob="**/*.parquet")
|
||||
_fs.add_filter(lambda f: "user" in f["file_name"] and f["size_in_bytes"] < 1000000)
|
||||
pipeline_2 = dlt.pipeline("filtered_pipeline", destination="duckdb")
|
||||
_load_info = pipeline_2.run((_fs | read_parquet()).with_name("userdata_filtered"))
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 4: Load files incrementally
|
||||
Avoid reprocessing the same file twice.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, filesystem, read_parquet):
|
||||
_fs = filesystem(bucket_url="./local_data", file_glob="**/*.parquet")
|
||||
_fs.apply_hints(incremental=dlt.sources.incremental("modification_date"))
|
||||
data = (_fs | read_parquet()).with_name("userdata")
|
||||
pipeline_3 = dlt.pipeline("incremental_pipeline", destination="duckdb")
|
||||
_load_info = pipeline_3.run(data)
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 5: Create a custom transformer
|
||||
|
||||
Let’s read structured data from `.json` files.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, dlt, filesystem, urllib):
|
||||
@dlt.transformer(standalone=True)
|
||||
def read_json(items: TDataItems) -> TDataItems:
|
||||
from dlt.common import json
|
||||
|
||||
for file_obj in items:
|
||||
with file_obj.open() as f:
|
||||
yield json.load(f)
|
||||
|
||||
_url = "https://jsonplaceholder.typicode.com/users"
|
||||
_dest = "local_data/sample.json"
|
||||
urllib.request.urlretrieve(_url, _dest)
|
||||
_fs = filesystem(bucket_url="./local_data", file_glob="sample.json")
|
||||
pipeline_4 = dlt.pipeline("json_pipeline", destination="duckdb")
|
||||
_load_info = pipeline_4.run((_fs | read_json()).with_name("users"))
|
||||
print(_load_info)
|
||||
return (pipeline_4,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
📁 You will see that this file also exists in your local_data directory.
|
||||
|
||||
> A **standalone** resource is defined on a top-level module function (not an inner function) that accepts config and secrets values. Additionally, if the standalone flag is specified, the decorated function's signature and docstring are preserved: `dlt.resource` just wraps the decorated function, and the user must call the wrapper to get the actual resource.
|
||||
|
||||
Let's inspect the `users` table in your DuckDB dataset:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline_4):
|
||||
pipeline_4.dataset().users.df().head()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Step 6: Copy files before loading
|
||||
|
||||
Copy files locally as part of the pipeline. This is useful for backups or post-processing.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, filesystem, os):
|
||||
from dlt.common.storages.fsspec_filesystem import FileItemDict
|
||||
|
||||
def copy_local(item: FileItemDict) -> FileItemDict:
|
||||
local_path = os.path.join("copied", item["file_name"])
|
||||
os.makedirs(os.path.dirname(local_path), exist_ok=True)
|
||||
item.fsspec.download(item["file_url"], local_path)
|
||||
return item
|
||||
|
||||
_fs = filesystem(bucket_url="./local_data", file_glob="**/*.parquet").add_map(
|
||||
copy_local
|
||||
)
|
||||
pipeline_5 = dlt.pipeline("copy_pipeline", destination="duckdb")
|
||||
_load_info = pipeline_5.run(_fs.with_name("copied_files"))
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Next steps
|
||||
|
||||
- Try building a transformer for `.xml` using `xmltodict`
|
||||
- Combine multiple directories or buckets in a single pipeline
|
||||
- Explore [more examples](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/advanced)
|
||||
""")
|
||||
return
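

# A hedged sketch for the first "next step": a transformer that reads `.xml` files with
# xmltodict. `xmltodict` is an extra dependency that is not installed by this notebook.
@app.cell
def _(TDataItems, dlt):
    @dlt.transformer()
    def read_xml(items: TDataItems) -> TDataItems:
        import xmltodict

        for _file_obj in items:
            with _file_obj.open() as _f:
                yield xmltodict.parse(_f.read())

    return (read_xml,)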
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,7 +6,7 @@
|
||||
"id": "eZpIGo3Fg8hR"
|
||||
},
|
||||
"source": [
|
||||
"# Custom destinations & Reverse ETL [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)\n",
|
||||
"# Custom destinations & Reverse ETL [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
@@ -15,7 +15,7 @@
|
||||
"- What reverse ETL means in practice \n",
|
||||
"- How to build custom destinations with `@dlt.destination` \n",
|
||||
"- How batching works \n",
|
||||
"- How to push real data from Rfam database to Notion \n",
|
||||
"- How to push real data from the Rfam database to Notion \n",
|
||||
"\n",
|
||||
"---\n"
|
||||
]
|
||||
@@ -237,8 +237,8 @@
|
||||
"\n",
|
||||
"### 4.1. Step 1: Create a database in Notion\n",
|
||||
"\n",
|
||||
"1. Create empty database. [Notion documentation.](https://super.so/blog/6-steps-to-creating-databases-in-notion)\n",
|
||||
"2. [Create integration](https://www.notion.so/profile/integrations) in your Notion Workspace.\n",
|
||||
"1. Create empty an database. [Notion documentation.](https://super.so/blog/6-steps-to-creating-databases-in-notion)\n",
|
||||
"2. [Create an integration](https://www.notion.so/profile/integrations) in your Notion Workspace.\n",
|
||||
"3. Connect your database to the integration.\n",
|
||||
"4. Create 3 columns: Accession (title), ID (text), Description (text)"
|
||||
]
|
||||
@@ -263,7 +263,7 @@
|
||||
"id": "0AdDovQklsE9"
|
||||
},
|
||||
"source": [
|
||||
"### 4.2. Step 2: Install and configure"
|
||||
"### 4.2. Step 2: Configure"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -289,7 +289,7 @@
|
||||
"2. Set your credentials either in:\n",
|
||||
" - `~/.dlt/secrets.toml` \n",
|
||||
" - or environment variables\n",
|
||||
" - or (**in our case**) in Colab Secrets\n",
|
||||
" - or (**in our case**) in Colab or Molab Secrets\n",
|
||||
"\n",
|
||||
" ```toml\n",
|
||||
" [destination.notion]\n",
|
||||
@@ -344,7 +344,7 @@
|
||||
"id": "C0r_R3M_6ePP"
|
||||
},
|
||||
"source": [
|
||||
"You can also check if your integration works via `curl`:\n",
|
||||
"You can also check if your integration works via the requests library:\n",
|
||||
"1. Modify Bearer token\n",
|
||||
"2. Modify \"query\" if you database have another name"
|
||||
]
|
||||
@@ -357,7 +357,24 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! curl -X POST 'https://api.notion.com/v1/search' -H 'Authorization: Bearer '\"ntn_q5_your_token_o5xQLn1sewnep6\"'' -H 'Content-Type: application/json' -H 'Notion-Version: 2022-06-28' --data '{\"query\": \"Advanced\", \"filter\": {\"value\": \"database\", \"property\": \"object\"}, \"sort\": {\"direction\":\"ascending\", \"timestamp\":\"last_edited_time\"}}'"
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"url = \"https://api.notion.com/v1/search\"\n",
|
||||
"\n",
|
||||
"headers = {\n",
|
||||
" \"Authorization\": \"Bearer ntn_q5_your_token_o5xQLn1sewnep6\",\n",
|
||||
" \"Content-Type\": \"application/json\",\n",
|
||||
" \"Notion-Version\": \"2022-06-28\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"data = {\n",
|
||||
" \"query\": \"Advanced\",\n",
|
||||
" \"filter\": {\"value\": \"database\", \"property\": \"object\"},\n",
|
||||
" \"sort\": {\"direction\": \"ascending\", \"timestamp\": \"last_edited_time\"},\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"response = requests.post(url, headers=headers, json=data)\n",
|
||||
"print(response.json())"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -424,8 +441,8 @@
|
||||
"from notion_client import Client\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"os.environ[\"DESTINATION__NOTION__NOTION_AUTH\"] = userdata.get(\"NOTION_AUTHENTICATION\")\n",
|
||||
"os.environ[\"DESTINATION__NOTION__NOTION_PAGE_ID\"] = userdata.get(\"NOTION_PAGE_ID\")\n",
|
||||
"dlt.secrets[\"DESTINATION__NOTION__NOTION_AUTH\"] = userdata.get(\"NOTION_AUTHENTICATION\")\n",
|
||||
"dlt.secrets[\"DESTINATION__NOTION__NOTION_PAGE_ID\"] = userdata.get(\"NOTION_PAGE_ID\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.destination(name=\"notion\")\n",
|
||||
@@ -522,17 +539,8 @@
|
||||
"id": "nJach4xBFfva"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1--wNVd26TqNolnnECnUYZqeE2CXOeVZE#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "vmz0tMhcmwPh"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@@ -0,0 +1,464 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt",
|
||||
# "dlt[duckdb]",
|
||||
# "notion-client",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "pymysql",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Custom destinations & Reverse ETL [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)
|
||||
|
||||
---
|
||||
|
||||
## What you’ll learn
|
||||
|
||||
- What reverse ETL means in practice
|
||||
- How to build custom destinations with `@dlt.destination`
|
||||
- How batching works
|
||||
- How to push real data from the Rfam database to Notion
|
||||
|
||||
---
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **1. Concept: What is a custom destination?**
|
||||
|
||||
Normally, dlt sends your data to databases like BigQuery or Postgres.
|
||||
|
||||
But with `@dlt.destination`, you can **intercept the normalized data** and send it wherever you want:
|
||||
- APIs (Notion, Slack, Airtable)
|
||||
- Message queues (Kafka, SQS)
|
||||
- Logging systems
|
||||
- Custom data sinks
|
||||
|
||||
All you have to do is define a function like:
|
||||
|
||||
```python
|
||||
@dlt.destination
|
||||
def my_destination(items, table):
|
||||
...
|
||||
```
|
||||
|
||||
And dlt will call this for every batch of data extracted and normalized.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
## **2. Simple example: print data rows**
|
||||
|
||||
### Code example:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import dlt
|
||||
from dlt.common.typing import TDataItems
|
||||
from dlt.common.schema import TTableSchema
|
||||
|
||||
@dlt.destination(batch_size=5)
|
||||
def print_sink(items: TDataItems, table: TTableSchema) -> None:
|
||||
print(f"\nTable: {table['name']}")
|
||||
for item in items:
|
||||
print(item)
|
||||
|
||||
@dlt.resource
|
||||
def simple_data() -> TDataItems:
|
||||
yield [{"id": i, "value": f"row-{i}"} for i in range(12)]
|
||||
|
||||
_pipeline = dlt.pipeline("print_example", destination=print_sink)
|
||||
_pipeline.run(simple_data())
|
||||
print(_pipeline.last_trace)
|
||||
return TDataItems, TTableSchema, dlt, simple_data
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**What’s happening?**
|
||||
|
||||
- `simple_data()` yields 12 small records.
|
||||
- The data goes through **normalization** (converted to rows + types).
|
||||
- `@dlt.destination(batch_size=5)` groups these rows into batches of 5.
|
||||
- For each batch, `print_sink()` is called.
|
||||
- The `table` parameter tells you which table the batch belongs to.
|
||||
|
||||
|
||||
**Why is this important?**
|
||||
|
||||
- This is the **simplest possible custom destination.**
|
||||
- You’re in control: log, debug, or route data per table.
|
||||
- It shows how dlt structures the data and when it calls your function.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Question 1:
|
||||
|
||||
In the following example, how many times will the function be called?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, TTableSchema, dlt):
|
||||
@dlt.destination(batch_size=2)
|
||||
def new_print_sink(items: TDataItems, table: TTableSchema) -> None:
|
||||
print(items)
|
||||
|
||||
@dlt.resource
|
||||
def new_simple_data() -> TDataItems:
|
||||
yield [{"id": i} for i in range(6)]
|
||||
return
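

# A hedged sketch to check your answer to Question 1: run six rows through a sink with
# batch_size=2 and count how many times the sink prints. The pipeline and resource
# names here are made up for the check.
@app.cell
def _(TDataItems, TTableSchema, dlt):
    @dlt.destination(batch_size=2)
    def check_sink(items: TDataItems, table: TTableSchema) -> None:
        print(f"sink called with {len(items)} items")

    @dlt.resource
    def check_data() -> TDataItems:
        yield [{"id": i} for i in range(6)]

    _pipeline = dlt.pipeline("question_1_check", destination=check_sink)
    _pipeline.run(check_data())
    return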
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **3. How batching works**
|
||||
|
||||
By default `batch_size` is 10.
|
||||
|
||||
|
||||
Let’s tweak just one thing:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, TTableSchema, dlt, simple_data):
|
||||
@dlt.destination(batch_size=1)
|
||||
def print_each_row(items: TDataItems, table: TTableSchema) -> None:
|
||||
print(f"Got one row from table {table['name']}:")
|
||||
print(items)
|
||||
|
||||
_pipeline = dlt.pipeline("print_example", destination=print_each_row)
|
||||
_pipeline.run(simple_data())
|
||||
print(_pipeline.last_trace)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Now, dlt calls your function **once per row** instead of per 5 rows.
|
||||
|
||||
Useful if:
|
||||
- Your API doesn’t support bulk inserts.
|
||||
- You want fine-grained control or retries.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **4. Real-world project: Rfam database → Notion**
|
||||
|
||||
Let’s build a real pipeline that fetches data from a database and **sends it to Notion**.
|
||||
|
||||
### Why Notion?
|
||||
|
||||
- Notion is a great tool for product/dev teams.
|
||||
- But dlt doesn’t support Notion as a *destination*.
|
||||
- So, we’ll build that ourselves.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### 4.1. Step 1: Create a database in Notion
|
||||
|
||||
1. Create an empty database. [Notion documentation.](https://super.so/blog/6-steps-to-creating-databases-in-notion)
|
||||
2. [Create an integration](https://www.notion.so/profile/integrations) in your Notion Workspace.
|
||||
3. Connect your database to the integration.
|
||||
4. Create 3 columns: Accession (title), ID (text), Description (text)
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### 4.2. Step 2: Configure""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
2. Set your credentials either in:
|
||||
- `~/.dlt/secrets.toml`
|
||||
- or environment variables
|
||||
- or (**in our case**) in Colab or Molab Secrets
|
||||
|
||||
```toml
|
||||
[destination.notion]
|
||||
notion_auth = "<your_integration_token>"
|
||||
notion_page_id = "<your_database_id>"
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""- Save your [Notion authentication token](https://developers.notion.com/docs/authorization#internal-integration-auth-flow-set-up) and the [ID of the page](https://developers.notion.com/docs/working-with-page-content#creating-a-page-with-content) where you want to create a database in your Colab secrets:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""> Make sure to [connect the page](https://www.notion.so/help/add-and-manage-connections-with-the-api#add-connections-to-pages) to the integration associated with the token!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
You can also check if your integration works via the `requests` library:
1. Modify the Bearer token
2. Modify "query" if your database has a different name
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import requests
|
||||
|
||||
url = "https://api.notion.com/v1/search"
|
||||
|
||||
headers = {
|
||||
"Authorization": "Bearer ntn_q5_your_token_o5xQLn1sewnep6",
|
||||
"Content-Type": "application/json",
|
||||
"Notion-Version": "2022-06-28",
|
||||
}
|
||||
|
||||
data = {
|
||||
"query": "Advanced",
|
||||
"filter": {"value": "database", "property": "object"},
|
||||
"sort": {"direction": "ascending", "timestamp": "last_edited_time"},
|
||||
}
|
||||
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
print(response.json())
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### 4.3. Step 3: Get data from Rfam database
|
||||
|
||||
Let's use `query_adapter_callback` and limit the number of rows:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import os
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy import text
|
||||
from dlt.sources.sql_database import sql_database
|
||||
from dlt.sources.sql_database.helpers import SelectClause, Table
|
||||
|
||||
def limit_rows(query: SelectClause, table: Table) -> SelectClause:
|
||||
return text(f"SELECT * FROM {table.fullname} LIMIT 20")
|
||||
|
||||
source = sql_database(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
|
||||
table_names=["family"],
|
||||
query_adapter_callback=limit_rows,
|
||||
)
|
||||
return os, source
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### 4.4. Step 4: Define Notion destination""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, TTableSchema, dlt, os):
|
||||
from notion_client import Client
|
||||
|
||||
dlt.secrets["DESTINATION__NOTION__NOTION_AUTH"] = os.getenv("NOTION_AUTHENTICATION")
|
||||
dlt.secrets["DESTINATION__NOTION__NOTION_PAGE_ID"] = os.getenv("NOTION_PAGE_ID")
|
||||
|
||||
@dlt.destination(name="notion")
|
||||
def push_to_notion(
|
||||
items: TDataItems,
|
||||
table: TTableSchema,
|
||||
notion_auth: str = dlt.secrets.value,
|
||||
notion_page_id: str = dlt.secrets.value,
|
||||
) -> None:
|
||||
client = Client(auth=notion_auth)
|
||||
print(len(items))
|
||||
for item in items:
|
||||
client.pages.create(
|
||||
parent={"database_id": notion_page_id},
|
||||
properties={
|
||||
"Accession": {"title": [{"text": {"content": item["rfam_acc"]}}]},
|
||||
"ID": {"rich_text": [{"text": {"content": item["rfam_id"]}}]},
|
||||
"Description": {
|
||||
"rich_text": [{"text": {"content": item["description"]}}]
|
||||
},
|
||||
},
|
||||
)
|
||||
return (push_to_notion,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**What’s happening?**
|
||||
|
||||
- dlt will call `push_to_notion()` with one batch of records at a time.
|
||||
- For each record, we create a page in Notion.
|
||||
- Credentials and database ID come from `secrets.toml` or env vars.
|
||||
|
||||
**Why is this useful?**
|
||||
|
||||
- You just turned your pipeline into a full **reverse ETL** job.
|
||||
- No need for Airbyte or writing custom orchestration scripts.
|
||||
- It’s reusable and works with dlt’s retry logic, state management, and transformations.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### 4.5. Step 5: Run the pipeline""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, push_to_notion, source):
|
||||
_pipeline = dlt.pipeline(
|
||||
"notion_pipeline", destination=push_to_notion, progress="log"
|
||||
)
|
||||
_pipeline.run(source, table_name="rfam_family")
|
||||
print(_pipeline.last_trace)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## 5. Reliability and state
|
||||
|
||||
### What if Notion fails mid-run?
|
||||
|
||||
- dlt **retries batches** up to 5 times.
|
||||
- You can restart the pipeline and it will continue from the failed batch.
|
||||
- But you must make your destination **idempotent** (i.e., safe to re-run the same input).
|
||||
""")
|
||||
return
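

# A hedged sketch of one way to make the Notion sink idempotent: skip records whose
# Accession already exists in the target database, so re-running after a failure does
# not create duplicate pages. The destination name and the Notion query filter used
# here are assumptions, not part of the original lesson.
@app.cell
def _(TDataItems, TTableSchema, dlt):
    from notion_client import Client as _Client

    @dlt.destination(name="notion_idempotent")
    def push_to_notion_idempotent(
        items: TDataItems,
        table: TTableSchema,
        notion_auth: str = dlt.secrets.value,
        notion_page_id: str = dlt.secrets.value,
    ) -> None:
        _client = _Client(auth=notion_auth)
        for _item in items:
            # Look up the Accession before inserting; skip if it is already there
            _existing = _client.databases.query(
                database_id=notion_page_id,
                filter={
                    "property": "Accession",
                    "title": {"equals": _item["rfam_acc"]},
                },
            )
            if _existing.get("results"):
                continue
            _client.pages.create(
                parent={"database_id": notion_page_id},
                properties={
                    "Accession": {"title": [{"text": {"content": _item["rfam_acc"]}}]},
                    "ID": {"rich_text": [{"text": {"content": _item["rfam_id"]}}]},
                    "Description": {
                        "rich_text": [{"text": {"content": _item["description"]}}]
                    },
                },
            )

    return (push_to_notion_idempotent,)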
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,17 +6,17 @@
|
||||
"id": "CbFVutT06Cqq"
|
||||
},
|
||||
"source": [
|
||||
"# Transforming and filtering the data [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)\n",
|
||||
"# Transforming and filtering the data [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)\n",
|
||||
"\n",
|
||||
"In this lesson, we will take a look at various ways of doing data transformations and filtering of the data during and after the ingestion.\n",
|
||||
"\n",
|
||||
"dlt provides several ways of doing it during the ingestion:\n",
|
||||
"1. With custom query (applicable for `sql_database` source).\n",
|
||||
"2. With dlt special functions (`add_map` and `add_filter`).\n",
|
||||
"1. With a custom query (applicable for `sql_database` source).\n",
|
||||
"2. With special dlt functions (`add_map` and `add_filter`).\n",
|
||||
"3. Via `@dlt.transformers`.\n",
|
||||
"4. With `pipeline.dataset()`.\n",
|
||||
"\n",
|
||||
"Let's review and compare those methods."
|
||||
"Let's review and compare these methods."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -116,8 +116,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with pipeline.sql_client() as client:\n",
|
||||
" with client.execute_query(\"SELECT * FROM genome\") as table:\n",
|
||||
" genome = table.df()\n",
|
||||
" with client.execute_query(\"SELECT * FROM genome\") as my_table:\n",
|
||||
" genome = my_table.df()\n",
|
||||
"genome"
|
||||
]
|
||||
},
|
||||
@@ -139,8 +139,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with pipeline.sql_client() as client:\n",
|
||||
" with client.execute_query(\"SELECT COUNT(*) AS total_rows FROM genome\") as table:\n",
|
||||
" print(table.df())"
|
||||
" with client.execute_query(\"SELECT COUNT(*) AS total_rows FROM genome\") as my_table:\n",
|
||||
" print(my_table.df())"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -158,7 +158,7 @@
|
||||
"id": "edAUbOHXuwlL"
|
||||
},
|
||||
"source": [
|
||||
"Imagine a use-case where we're only interested in getting the genome data for bacterias. In this case, ingesting the whole `genome` table would be an unnecessary use of time and compute resources."
|
||||
"Imagine a use-case where we're only interested in getting the genome data for bacteria. In this case, ingesting the whole `genome` table would be an unnecessary use of time and compute resources."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -172,8 +172,8 @@
|
||||
"with pipeline.sql_client() as client:\n",
|
||||
" with client.execute_query(\n",
|
||||
" \"SELECT COUNT(*) AS total_rows FROM genome WHERE kingdom='bacteria'\"\n",
|
||||
" ) as table:\n",
|
||||
" print(table.df())"
|
||||
" ) as my_table:\n",
|
||||
" print(my_table.df())"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -190,20 +190,14 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "F8A675ZXTCn9"
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from dlt.sources.sql_database.helpers import Table, SelectAny, SelectClause\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def query_adapter_callback(query: SelectAny, table: Table) -> SelectAny:\n",
|
||||
" if table.name == \"genome\":\n",
|
||||
" # Only select rows where the column kingdom has value \"bacteria\"\n",
|
||||
" return query.where(table.c.kingdom == \"bacteria\")\n",
|
||||
" # Use the original query for other tables\n",
|
||||
" return query"
|
||||
" return query.where(table.c.kingdom == \"bacteria\") if table.name else query"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -240,8 +234,7 @@
|
||||
" dataset_name=\"sql_data\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(source, write_disposition=\"replace\")\n",
|
||||
"\n",
|
||||
"pipeline.run(source, write_disposition=\"replace\")\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -305,16 +298,16 @@
|
||||
"with pipeline.sql_client() as client:\n",
|
||||
" with client.execute_query(\n",
|
||||
" \"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM clan\"\n",
|
||||
" ) as table:\n",
|
||||
" ) as my_table:\n",
|
||||
" print(\"Table clan:\")\n",
|
||||
" print(table.df())\n",
|
||||
" print(my_table.df())\n",
|
||||
" print(\"\\n\")\n",
|
||||
"\n",
|
||||
" with client.execute_query(\n",
|
||||
" \"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM genome\"\n",
|
||||
" ) as table:\n",
|
||||
" ) as my_table:\n",
|
||||
" print(\"Table genome:\")\n",
|
||||
" print(table.df())"
|
||||
" print(my_table.df())"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -373,9 +366,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with pipeline.sql_client() as client:\n",
|
||||
" with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as table:\n",
|
||||
" with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as my_table:\n",
|
||||
" print(\"Table clan:\")\n",
|
||||
" print(table.df())"
|
||||
" print(my_table.df())"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -465,9 +458,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with pipeline.sql_client() as client:\n",
|
||||
" with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as table:\n",
|
||||
" with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as my_table:\n",
|
||||
" print(\"Table clan:\")\n",
|
||||
" clan = table.df()\n",
|
||||
" clan = my_table.df()\n",
|
||||
"\n",
|
||||
"clan"
|
||||
]
|
||||
@@ -546,9 +539,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with pipeline.sql_client() as client:\n",
|
||||
" with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as table:\n",
|
||||
" with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as my_table:\n",
|
||||
" print(\"Table clan:\")\n",
|
||||
" print(table.df())"
|
||||
" print(my_table.df())"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -596,8 +589,8 @@
|
||||
"\n",
|
||||
"resource.add_map(add_greeting)\n",
|
||||
"\n",
|
||||
"for row in resource():\n",
|
||||
" print(row)"
|
||||
"for _row in resource():\n",
|
||||
" print(_row)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -680,7 +673,7 @@
|
||||
")\n",
|
||||
"source.genome.add_filter(lambda item: item[\"kingdom\"] == \"bacteria\")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(source, write_disposition=\"replace\")\n",
|
||||
"pipeline.run(source, write_disposition=\"replace\")\n",
|
||||
"\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
@@ -696,9 +689,9 @@
|
||||
"with pipeline.sql_client() as client:\n",
|
||||
" with client.execute_query(\n",
|
||||
" \"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM genome\"\n",
|
||||
" ) as table:\n",
|
||||
" ) as my_table:\n",
|
||||
" print(\"Table genome:\")\n",
|
||||
" genome_count = table.df()\n",
|
||||
" genome_count = my_table.df()\n",
|
||||
"genome_count"
|
||||
]
|
||||
},
|
||||
@@ -753,8 +746,7 @@
|
||||
")\n",
|
||||
"source.genome.add_limit(1)\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(source, write_disposition=\"replace\")\n",
|
||||
"\n",
|
||||
"pipeline.run(source, write_disposition=\"replace\")\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -767,8 +759,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with pipeline.sql_client() as client:\n",
|
||||
" with client.execute_query(\"SELECT * FROM genome\") as table:\n",
|
||||
" genome_limited = table.df()\n",
|
||||
" with client.execute_query(\"SELECT * FROM genome\") as my_table:\n",
|
||||
" genome_limited = my_table.df()\n",
|
||||
"genome_limited"
|
||||
]
|
||||
},
|
||||
@@ -824,7 +816,7 @@
|
||||
" dev_mode=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"info = pipeline.run([genome_resource, genome_resource | batch_stats])\n",
|
||||
"pipeline.run([genome_resource, genome_resource | batch_stats])\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -837,8 +829,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with pipeline.sql_client() as client:\n",
|
||||
" with client.execute_query(\"SELECT * FROM batch_stats\") as table:\n",
|
||||
" res = table.df()\n",
|
||||
" with client.execute_query(\"SELECT * FROM batch_stats\") as my_table:\n",
|
||||
" res = my_table.df()\n",
|
||||
"res"
|
||||
]
|
||||
},
|
||||
@@ -879,16 +871,16 @@
|
||||
"# NOTE: this is the duckdb sql dialect, other destinations may use different expressions\n",
|
||||
"with pipeline.sql_client() as client:\n",
|
||||
" client.execute_sql(\n",
|
||||
" \"\"\" CREATE OR REPLACE TABLE genome_length AS\n",
|
||||
" SELECT\n",
|
||||
" SUM(total_length) AS total_total_length,\n",
|
||||
" AVG(total_length) AS average_total_length\n",
|
||||
" FROM\n",
|
||||
" genome\n",
|
||||
" \"\"\"\n",
|
||||
" (\n",
|
||||
" \"CREATE OR REPLACE TABLE genome_length AS \"\n",
|
||||
" \"SELECT \"\n",
|
||||
" \" SUM(total_length) AS total_total_length, \"\n",
|
||||
" \" AVG(total_length) AS average_total_length \"\n",
|
||||
" \"FROM genome\"\n",
|
||||
" )\n",
|
||||
" with client.execute_query(\"SELECT * FROM genome_length\") as table:\n",
|
||||
" genome_length = table.df()\n",
|
||||
" )\n",
|
||||
" with client.execute_query(\"SELECT * FROM genome_length\") as my_table:\n",
|
||||
" genome_length = my_table.df()\n",
|
||||
"\n",
|
||||
"genome_length"
|
||||
]
|
||||
@@ -1068,7 +1060,7 @@
|
||||
"id": "AH3F46PaJZe4"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1XT1xUIQIWj0nPWOmTixThgdXzi4vudce#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)!"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
@@ -0,0 +1,765 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt[sql_database,duckdb]",
|
||||
# "ibis-framework[duckdb]",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "pymysql",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Transforming and filtering the data [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)
|
||||
|
||||
In this lesson, we will take a look at various ways of doing data transformations and filtering of the data during and after the ingestion.
|
||||
|
||||
dlt provides several ways of doing it during the ingestion:
|
||||
1. With a custom query (applicable for `sql_database` source).
|
||||
2. With special dlt functions (`add_map` and `add_filter`).
|
||||
3. Via `@dlt.transformers`.
|
||||
4. With `pipeline.dataset()`.
|
||||
|
||||
Let's review and compare these methods.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## What you’ll learn:
|
||||
|
||||
- How to limit rows at the source with SQL queries.
|
||||
- How to apply custom Python logic per record.
|
||||
- How to write transformations using functional and declarative APIs.
|
||||
- How to access and query your loaded data using `.dataset()`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""## Setup and initial Load""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
We will be using the `sql_database` source as an example and will connect to the public [MySQL RFam](https://docs.rfam.org/en/latest/database.html) database. The RFam database contains publicly accessible scientific data on RNA structures.
|
||||
|
||||
Let's perform an initial load:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import dlt
|
||||
from dlt.sources.sql_database import sql_database
|
||||
|
||||
_source = sql_database(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
|
||||
table_names=["family", "genome"],
|
||||
)
|
||||
pipeline = dlt.pipeline(
|
||||
pipeline_name="sql_database_pipeline",
|
||||
destination="duckdb",
|
||||
dataset_name="sql_data",
|
||||
)
|
||||
_load_info = pipeline.run(_source)
|
||||
print(_load_info)
|
||||
return dlt, pipeline, sql_database
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
with pipeline.sql_client() as _client:
|
||||
with _client.execute_query("SELECT * FROM genome") as _my_table:
|
||||
genome = _my_table.df()
|
||||
genome
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""You can check your data count using `sql_client`:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
with pipeline.sql_client() as _client:
|
||||
with _client.execute_query(
|
||||
"SELECT COUNT(*) AS total_rows FROM genome"
|
||||
) as _my_table:
|
||||
print(_my_table.df())
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""## **1. Filtering the data during the ingestion with `query_adapter_callback`**"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Imagine a use-case where we're only interested in getting the genome data for bacteria. In this case, ingesting the whole `genome` table would be an unnecessary use of time and compute resources."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
with pipeline.sql_client() as _client:
|
||||
with _client.execute_query(
|
||||
"SELECT COUNT(*) AS total_rows FROM genome WHERE kingdom='bacteria'"
|
||||
) as _my_table:
|
||||
print(_my_table.df())
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
When ingesting data using the `sql_database` source, dlt runs a `SELECT` statement behind the scenes, and the `query_adapter_callback` parameter makes it possible to add a `WHERE` clause to that underlying `SELECT` statement.

In this example, only the `genome` table is filtered, on the `kingdom` column:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
from dlt.sources.sql_database.helpers import Table, SelectAny, SelectClause
|
||||
|
||||
def query_adapter_callback(query: SelectAny, table: Table) -> SelectAny:
|
||||
        # Only filter the genome table; leave the query for other tables unchanged
        return query.where(table.c.kingdom == "bacteria") if table.name == "genome" else query
|
||||
return SelectAny, SelectClause, Table, query_adapter_callback
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Attach it:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, query_adapter_callback, sql_database):
|
||||
_source = sql_database(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
|
||||
table_names=["genome"],
|
||||
query_adapter_callback=query_adapter_callback,
|
||||
)
|
||||
pipeline_1 = dlt.pipeline(
|
||||
pipeline_name="sql_database_pipeline_filtered",
|
||||
destination="duckdb",
|
||||
dataset_name="sql_data",
|
||||
)
|
||||
pipeline_1.run(_source, write_disposition="replace")
|
||||
print(pipeline_1.last_trace)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
In the snippet above, we created a SQL VIEW in the source database and extracted data from it. In that case, dlt infers all column types and reads the data in the shape you define in the view, without any further customization.
|
||||
|
||||
If creating a view is not feasible, you can fully rewrite the automatically generated query with an extended version of `query_adapter_callback`:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(SelectAny, SelectClause, Table, dlt, sql_database):
|
||||
import sqlalchemy as sa
|
||||
|
||||
def query_adapter_callback_1(query: SelectAny, table: Table) -> SelectClause:
|
||||
if table.name == "genome":
|
||||
return sa.text(f"SELECT * FROM {table.fullname} WHERE kingdom='bacteria'")
|
||||
return query
|
||||
|
||||
_source = sql_database(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
|
||||
table_names=["genome", "clan"],
|
||||
query_adapter_callback=query_adapter_callback_1,
|
||||
)
|
||||
pipeline_2 = dlt.pipeline(
|
||||
pipeline_name="sql_database_pipeline_filtered",
|
||||
destination="duckdb",
|
||||
dataset_name="sql_data",
|
||||
)
|
||||
_load_info = pipeline_2.run(_source, write_disposition="replace")
|
||||
print(_load_info)
|
||||
return (pipeline_2,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline_2):
|
||||
with pipeline_2.sql_client() as _client:
|
||||
with _client.execute_query(
|
||||
"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM clan"
|
||||
) as _my_table:
|
||||
print("Table clan:")
|
||||
print(_my_table.df())
|
||||
print("\n")
|
||||
with _client.execute_query(
|
||||
"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM genome"
|
||||
) as _my_table:
|
||||
print("Table genome:")
|
||||
print(_my_table.df())
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""## **2. Transforming the data after extract and before load**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Since dlt is a Python library, it gives you a lot of control over the extracted data.
|
||||
|
||||
You can attach any number of transformations that are evaluated on an item-per-item basis to your resource. The available transformation types:
|
||||
|
||||
* `map` - transform the data item (resource.add_map).
|
||||
* `filter` - filter the data item (resource.add_filter).
|
||||
* `yield map` - a map that returns an iterator (so a single row may generate many rows - resource.add_yield_map).
|
||||
* `limit` - limits the number of records processed by a resource. Useful for testing or reducing data volume during development.
|
||||
|
||||
For example, if we want to anonymize sensitive data before it is loaded into the destination, we can write a Python function for it and apply it to a source or resource using the `.add_map()` method.
|
||||
|
||||
[dlt documentation.](https://dlthub.com/docs/general-usage/resource#filter-transform-and-pivot-data)
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### Using `add_map`""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""In the table `clan`, we notice that there is a column `author` that we would like to anonymize."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline_2):
|
||||
with pipeline_2.sql_client() as _client:
|
||||
with _client.execute_query(
|
||||
"SELECT DISTINCT author FROM clan LIMIT 5"
|
||||
) as _my_table:
|
||||
print("Table clan:")
|
||||
print(_my_table.df())
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""We write a function in python that anonymizes a string""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import hashlib
|
||||
from dlt.common.typing import TDataItem
|
||||
|
||||
def pseudonymize_name(row: TDataItem) -> TDataItem:
|
||||
"""
|
||||
Pseudonymization is a deterministic type of PII-obscuring.
|
||||
Its role is to allow identifying users by their hash,
|
||||
without revealing the underlying info.
|
||||
"""
|
||||
        # add a constant salt to generate a deterministic hash
|
||||
salt = "WI@N57%zZrmk#88c"
|
||||
salted_string = row["author"] + salt
|
||||
sh = hashlib.sha256()
|
||||
sh.update(salted_string.encode())
|
||||
hashed_string = sh.digest().hex()
|
||||
row["author"] = hashed_string
|
||||
return row
|
||||
return TDataItem, hashlib, pseudonymize_name
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, pseudonymize_name, sql_database):
|
||||
pipeline_3 = dlt.pipeline(
|
||||
pipeline_name="sql_database_pipeline_anonymized",
|
||||
destination="duckdb",
|
||||
dataset_name="sql_data",
|
||||
)
|
||||
_source = sql_database(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
|
||||
table_names=["clan"],
|
||||
)
|
||||
_source.clan.add_map(pseudonymize_name)
|
||||
_info = pipeline_3.run(_source)
|
||||
print(_info)
|
||||
return (pipeline_3,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""After the pipeline has run, we can observe that the author column has been anonymized."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline_3):
|
||||
with pipeline_3.sql_client() as _client:
|
||||
with _client.execute_query(
|
||||
"SELECT DISTINCT author FROM clan LIMIT 5"
|
||||
) as _my_table:
|
||||
print("Table clan:")
|
||||
clan = _my_table.df()
|
||||
clan
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""**Note:** If you're using the `pyarrow` or `connectorx` backend, the data is not processed item-by-item. Instead they're processed in batches, therefore your function should be adjusted. For example, for PyArrow chunks the function could be changed as follows:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, hashlib, sql_database):
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
|
||||
def pseudonymize_name_pyarrow(table: pa.Table) -> pa.Table:
|
||||
"""
|
||||
Pseudonymizes the 'author' column in a PyArrow Table.
|
||||
"""
|
||||
salt = "WI@N57%zZrmk#88c"
|
||||
_df = table.to_pandas()
|
||||
_df["author"] = (
|
||||
_df["author"]
|
||||
.astype(str)
|
||||
.apply(lambda x: hashlib.sha256((x + salt).encode()).hexdigest())
|
||||
)
|
||||
new_table = pa.Table.from_pandas(_df)
|
||||
return new_table
|
||||
|
||||
pipeline_4 = dlt.pipeline(
|
||||
pipeline_name="sql_database_pipeline_anonymized1",
|
||||
destination="duckdb",
|
||||
dataset_name="sql_data",
|
||||
)
|
||||
_source = sql_database(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
|
||||
table_names=["clan"],
|
||||
backend="pyarrow",
|
||||
)
|
||||
_source.clan.add_map(pseudonymize_name_pyarrow)
|
||||
_info = pipeline_4.run(_source)
|
||||
print(_info)
|
||||
return (pipeline_4,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline_4):
|
||||
with pipeline_4.sql_client() as _client:
|
||||
with _client.execute_query(
|
||||
"SELECT DISTINCT author FROM clan LIMIT 5"
|
||||
) as _my_table:
|
||||
print("Table clan:")
|
||||
print(_my_table.df())
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### `add_map` vs `add_yield_map`
|
||||
|
||||
The difference between `add_map` and `add_yield_map` matters when a transformation returns multiple records from a single input.
|
||||
|
||||
#### **`add_map`**
|
||||
- Use `add_map` when you want to transform each item into exactly one item.
|
||||
- Think of it like modifying or enriching a row.
|
||||
- You use a regular function that returns one modified item.
|
||||
- Great for adding fields or changing structure.
|
||||
|
||||
#### Example
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItem, dlt):
|
||||
from dlt.common.typing import TDataItems
|
||||
|
||||
@dlt.resource
|
||||
def _resource() -> TDataItems:
|
||||
yield [{"name": "Alice"}, {"name": "Bob"}]
|
||||
|
||||
def add_greeting(item: TDataItem) -> TDataItem:
|
||||
item["greeting"] = f"Hello, {item['name']}!"
|
||||
return item
|
||||
|
||||
_resource.add_map(add_greeting)
|
||||
for _row in _resource():
|
||||
print(_row)
|
||||
return (TDataItems,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
#### **`add_yield_map`**
|
||||
- Use `add_yield_map` when you want to turn one item into multiple items, or possibly no items.
|
||||
- Your function is a generator that uses yield.
|
||||
- Great for pivoting nested data, flattening lists, or filtering rows.
|
||||
|
||||
#### Example
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItem, TDataItems, dlt):
|
||||
@dlt.resource
|
||||
def _resource() -> TDataItems:
|
||||
yield [
|
||||
{"name": "Alice", "hobbies": ["reading", "chess"]},
|
||||
{"name": "Bob", "hobbies": ["cycling"]},
|
||||
]
|
||||
|
||||
def expand_hobbies(item: TDataItem) -> TDataItem:
|
||||
for hobby in item["hobbies"]:
|
||||
yield {"name": item["name"], "hobby": hobby}
|
||||
|
||||
_resource.add_yield_map(expand_hobbies)
|
||||
for row in _resource():
|
||||
print(row)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Using `add_filter`
|
||||
The `add_filter` function can be used similarly. The difference is that `add_filter` expects a function that returns a boolean value for each item. For example, to implement the same filtering we did with the query adapter callback, we can use:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, sql_database):
|
||||
import time
|
||||
|
||||
_source = sql_database(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
|
||||
table_names=["genome"],
|
||||
)
|
||||
pipeline_5 = dlt.pipeline(
|
||||
pipeline_name="sql_database_pipeline_filtered",
|
||||
destination="duckdb",
|
||||
dataset_name="sql_data",
|
||||
)
|
||||
_source.genome.add_filter(lambda item: item["kingdom"] == "bacteria")
|
||||
pipeline_5.run(_source, write_disposition="replace")
|
||||
print(pipeline_5.last_trace)
|
||||
return (pipeline_5,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline_5):
|
||||
with pipeline_5.sql_client() as _client:
|
||||
with _client.execute_query(
|
||||
"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM genome"
|
||||
) as _my_table:
|
||||
print("Table genome:")
|
||||
genome_count = _my_table.df()
|
||||
genome_count
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Question 1:
|
||||
|
||||
What is `total_rows` in the example above?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Using `add_limit`
|
||||
|
||||
If your resource loads thousands of pages of data from a REST API or millions of rows from a database table, you may want to sample just a fragment of it in order to quickly see the dataset with example data and test your transformations, etc.
|
||||
|
||||
To do this, you limit how many items will be yielded by a resource (or source) by calling the `add_limit` method. This method will close the generator that produces the data after the limit is reached.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, sql_database):
|
||||
_source = sql_database(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
|
||||
table_names=["genome"],
|
||||
chunk_size=10,
|
||||
)
|
||||
pipeline_6 = dlt.pipeline(
|
||||
pipeline_name="sql_database_pipeline_filtered",
|
||||
destination="duckdb",
|
||||
dataset_name="sql_data",
|
||||
)
|
||||
_source.genome.add_limit(1)
|
||||
pipeline_6.run(_source, write_disposition="replace")
|
||||
print(pipeline_6.last_trace)
|
||||
return (pipeline_6,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline_6):
|
||||
with pipeline_6.sql_client() as _client:
|
||||
with _client.execute_query("SELECT * FROM genome") as _my_table:
|
||||
genome_limited = _my_table.df()
|
||||
genome_limited
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **3. Transforming data with `@dlt.transformer`**
|
||||
|
||||
The main purpose of transformers is to create child tables with additional data requests, but they can also be used for data transformations, especially if you want to keep the original data as well.
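
For instance, a transformer is usually piped from a parent resource so it can make one follow-up request per parent item and load the results as a separate table. A minimal sketch (the PokeAPI endpoint and the resource names below are illustrative and not part of this lesson's pipelines):

```python
import dlt
from dlt.sources.helpers import requests


@dlt.resource
def pokemon_list():
    # parent resource: yields lightweight list items
    yield [{"name": "bulbasaur", "url": "https://pokeapi.co/api/v2/pokemon/1/"}]


@dlt.transformer()
def pokemon_details(items):
    # one additional request per parent item -> loaded into its own table
    for item in items:
        yield requests.get(item["url"]).json()


pipeline = dlt.pipeline(pipeline_name="pokemon_transformer_sketch", destination="duckdb")
# keep the original list table and load the enriched details alongside it
pipeline.run([pokemon_list, pokemon_list | pokemon_details])
```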
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItem, TDataItems, dlt, sql_database):
|
||||
@dlt.transformer()
|
||||
def batch_stats(items: TDataItems) -> TDataItem:
|
||||
"""
|
||||
Pseudonymization is a deterministic type of PII-obscuring.
|
||||
Its role is to allow identifying users by their hash,
|
||||
without revealing the underlying info.
|
||||
"""
|
||||
yield {
|
||||
"batch_length": len(items),
|
||||
"max_length": max([item["total_length"] for item in items]),
|
||||
}
|
||||
|
||||
genome_resource = sql_database(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", chunk_size=10000
|
||||
).genome
|
||||
pipeline_7 = dlt.pipeline(
|
||||
pipeline_name="sql_database_pipeline_with_transformers1",
|
||||
destination="duckdb",
|
||||
dataset_name="sql_data",
|
||||
dev_mode=True,
|
||||
)
|
||||
pipeline_7.run([genome_resource, genome_resource | batch_stats])
|
||||
print(pipeline_7.last_trace)
|
||||
return (pipeline_7,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline_7):
|
||||
with pipeline_7.sql_client() as _client:
|
||||
with _client.execute_query("SELECT * FROM batch_stats") as _my_table:
|
||||
res = _my_table.df()
|
||||
res
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **4. Transforming data after the load**
|
||||
|
||||
Another possibility for data transformation is transforming data after the load. dlt provides several ways of doing it:
|
||||
|
||||
* using `sql_client`,
|
||||
* via `.dataset()` and ibis integration,
|
||||
* via [dbt integration](https://dlthub.com/docs/dlt-ecosystem/transformations/dbt/).
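
For the dbt route, `dlt` ships a dbt runner that executes a dbt package against the pipeline's destination and credentials. A minimal sketch (assumes the `dbt` extra is installed and that a dbt project exists at the hypothetical path `./dbt_project`):

```python
import dlt

pipeline = dlt.pipeline(
    pipeline_name="sql_database_pipeline_dbt",
    destination="duckdb",
    dataset_name="sql_data",
)

# point the runner at a local (or git-hosted) dbt package and run it
dbt = dlt.dbt.package(pipeline, "./dbt_project")
models = dbt.run_all()
for m in models:
    print(f"{m.model_name}: {m.status} ({m.message})")
```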
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### SQL client
|
||||
|
||||
You already saw examples of using dlt's SQL client. It lets you connect to your destination and execute any SQL query.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline_7):
|
||||
with pipeline_7.sql_client() as _client:
|
||||
_client.execute_sql(
|
||||
"CREATE OR REPLACE TABLE genome_length AS SELECT SUM(total_length) AS total_total_length, AVG(total_length) AS average_total_length FROM genome"
|
||||
)
|
||||
with _client.execute_query("SELECT * FROM genome_length") as _my_table:
|
||||
genome_length = _my_table.df()
|
||||
genome_length
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Accessing loaded data with `pipeline.dataset()`
|
||||
|
||||
Use `pipeline.dataset()` to inspect and work with your data in Python after loading.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline_7):
|
||||
dataset = pipeline_7.dataset()
|
||||
# List tables
|
||||
dataset.row_counts().df()
|
||||
return (dataset,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Note that `row_counts` didn't return the new table `genome_length`,"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dataset):
|
||||
# Access as pandas
|
||||
_df = dataset["genome"].df()
|
||||
_df
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dataset):
|
||||
# Access as Arrow
|
||||
arrow_table = dataset["genome_length"].arrow()
|
||||
arrow_table
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""You can also filter, limit, and select columns:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dataset):
|
||||
_df = dataset["genome"].select("kingdom", "ncbi_id").limit(10).df()
|
||||
_df
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""To iterate over large data:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dataset):
|
||||
for chunk in dataset["genome"].iter_df(chunk_size=500):
|
||||
print(chunk.head())
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""For more advanced users, this interface supports **Ibis expressions**, joins, and subqueries."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Ibis integration
|
||||
|
||||
Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/).
|
||||
|
||||
[dlt provides a way to use Ibis expressions natively](https://dlthub.com/docs/general-usage/dataset-access/ibis-backend) with a lot of destinations. Supported ones are:
|
||||
* Snowflake
|
||||
* DuckDB
|
||||
* MotherDuck
|
||||
* Postgres
|
||||
* Redshift
|
||||
* Clickhouse
|
||||
* MSSQL (including Synapse)
|
||||
* BigQuery
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline_7):
|
||||
# get the dataset from the pipeline
dataset_1 = pipeline_7.dataset()
dataset_name = pipeline_7.dataset_name
# get the native ibis connection from the dataset
ibis_connection = dataset_1.ibis()
# list all tables in the dataset
# NOTE: You need to provide the dataset name to ibis; in ibis, datasets are named databases
print(ibis_connection.list_tables(database=dataset_name))
# get the batch_stats table
table = ibis_connection.table("batch_stats", database=dataset_name)
# print the first 2 rows
print(table.limit(2).execute())  # type: ignore[attr-defined]
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,7 +6,7 @@
|
||||
"id": "_dbt9Ilnmktb"
|
||||
},
|
||||
"source": [
|
||||
"# Merge and replace strategies & Advanced tricks [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)\n"
|
||||
"# Merge and replace strategies & Advanced tricks [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -46,7 +46,7 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"A `write_disposition` in `dlt` can specified in the resource decorator:\n",
|
||||
"A `write_disposition` in `dlt` can be specified in the resource decorator:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"@dlt.resource(write_disposition=\"append\")\n",
|
||||
@@ -153,17 +153,17 @@
|
||||
" - Append\n",
|
||||
" - Replace\n",
|
||||
" - Merge\n",
|
||||
"- What incremental loading is.\n",
|
||||
"- What incremental loading is\n",
|
||||
"\n",
|
||||
"**Now, we will cover** the different strategies for `merge` write disposition:\n",
|
||||
"- `delete-insert` strategy.\n",
|
||||
"- `upsert` strategy.\n",
|
||||
"- `SCD2` strategy.\n",
|
||||
"- `delete-insert` strategy\n",
|
||||
"- `upsert` strategy\n",
|
||||
"- `SCD2` strategy\n",
|
||||
"\n",
|
||||
"We also will take a look at\n",
|
||||
"* Hard deletes.\n",
|
||||
"* Falling back for incremental cursors.\n",
|
||||
"* Backfills."
|
||||
"We will also take a look at:\n",
|
||||
"* Hard deletes\n",
|
||||
"* Falling back for incremental cursors\n",
|
||||
"* Backfills"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -258,9 +258,7 @@
|
||||
"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"dlt.secrets[\n",
|
||||
" \"destination.replace_strategy\"\n",
|
||||
"] = \"truncate-and-insert\" # <--- set the replace strategy using TOML, ENVs or Python\n",
|
||||
"dlt.secrets[\"destination.replace_strategy\"] = \"truncate-and-insert\"\n",
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"pokemon_load_1\",\n",
|
||||
@@ -268,7 +266,7 @@
|
||||
" dataset_name=\"pokemon_data_1\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(data, table_name=\"pokemon\", write_disposition=\"replace\")\n",
|
||||
"pipeline.run(data, table_name=\"pokemon\", write_disposition=\"replace\")\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -350,9 +348,7 @@
|
||||
"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"dlt.secrets[\n",
|
||||
" \"destination.replace_strategy\"\n",
|
||||
"] = \"insert-from-staging\" # <--- set the replace strategy using TOML, ENVs or Python\n",
|
||||
"dlt.secrets[\"destination.replace_strategy\"] = \"insert-from-staging\"\n",
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"pokemon_load_2\",\n",
|
||||
@@ -360,8 +356,7 @@
|
||||
" dataset_name=\"pokemon_data_2\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(data, table_name=\"pokemon\", write_disposition=\"replace\")\n",
|
||||
"\n",
|
||||
"pipeline.run(data, table_name=\"pokemon\", write_disposition=\"replace\")\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -391,7 +386,7 @@
|
||||
"\n",
|
||||
"In this example, the `insert-from-staging` strategy will load the pokemon data **into a staging table** in the `pokemon_data_2_staging` schema in DuckDB (or any other destination you choose). \n",
|
||||
"\n",
|
||||
"Let's check the content of this table:"
|
||||
"Let's check the contents of this table:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -558,7 +553,7 @@
|
||||
" write_disposition=\"merge\",\n",
|
||||
" primary_key=\"id\",\n",
|
||||
")\n",
|
||||
"def pokemon() -> TDataItems:\n",
|
||||
"def pokemon(data: TDataItems) -> TDataItems:\n",
|
||||
" yield data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -568,7 +563,7 @@
|
||||
" dataset_name=\"pokemon_data\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = pipeline.run(pokemon(data))\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"# explore loaded data\n",
|
||||
@@ -645,7 +640,7 @@
|
||||
" \"id\": \"25\",\n",
|
||||
" \"name\": \"pikachu\",\n",
|
||||
" \"size\": {\"weight\": 7.5, \"height\": 0.4},\n",
|
||||
" }, # <--- Pikachu's weight has increased\n",
|
||||
" },\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
@@ -666,7 +661,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = pipeline.run(pokemon(data))\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"# explore loaded data\n",
|
||||
@@ -729,7 +724,7 @@
|
||||
"id": "S06hBVpXgmqF"
|
||||
},
|
||||
"source": [
|
||||
"We see that only new row is in the staging table. Since we used primary key, `dlt` deleted the previous entry of Pikachu and then inserted the new one."
|
||||
"We see that only the new row is in the staging table. Since we used primary key, `dlt` deleted the previous entry of Pikachu and then inserted the new one."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -892,12 +887,12 @@
|
||||
"@dlt.resource(\n",
|
||||
" name=\"pokemon\",\n",
|
||||
" write_disposition={\n",
|
||||
" \"disposition\": \"merge\", # <--- specifies that existing data should be merged\n",
|
||||
" \"strategy\": \"scd2\", # <--- enables SCD2 tracking, which keeps historical records of changes\n",
|
||||
" \"disposition\": \"merge\",\n",
|
||||
" \"strategy\": \"scd2\",\n",
|
||||
" },\n",
|
||||
" primary_key=\"id\",\n",
|
||||
")\n",
|
||||
"def pokemon() -> TDataItems:\n",
|
||||
"def pokemon(data: TDataItems) -> TDataItems:\n",
|
||||
" yield data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -908,7 +903,7 @@
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = pipeline.run(pokemon(data))\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -972,7 +967,7 @@
|
||||
" \"id\": \"25\",\n",
|
||||
" \"name\": \"pikachu\",\n",
|
||||
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
|
||||
" }, # <--- weight has changed back\n",
|
||||
" },\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
@@ -993,7 +988,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = pipeline.run(pokemon(data))\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -1075,19 +1070,19 @@
|
||||
" \"name\": \"bulbasaur\",\n",
|
||||
" \"size\": {\"weight\": 6.9, \"height\": 0.7},\n",
|
||||
" \"deleted_flag\": True,\n",
|
||||
" }, # <--- should be deleted\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"id\": \"4\",\n",
|
||||
" \"name\": \"charmander\",\n",
|
||||
" \"size\": {\"weight\": 8.5, \"height\": 0.6},\n",
|
||||
" \"deleted_flag\": None,\n",
|
||||
" }, # <--- should be kept\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"id\": \"25\",\n",
|
||||
" \"name\": \"pikachu\",\n",
|
||||
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
|
||||
" \"deleted_flag\": False,\n",
|
||||
" }, # <--- should be kept\n",
|
||||
" },\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
@@ -1106,9 +1101,9 @@
|
||||
" name=\"pokemon\",\n",
|
||||
" write_disposition=\"merge\",\n",
|
||||
" primary_key=\"id\",\n",
|
||||
" columns={\"deleted_flag\": {\"hard_delete\": True}}, # <--- set columns argument\n",
|
||||
" columns={\"deleted_flag\": {\"hard_delete\": True}},\n",
|
||||
")\n",
|
||||
"def pokemon() -> TDataItems:\n",
|
||||
"def pokemon(data: TDataItems) -> TDataItems:\n",
|
||||
" yield data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -1119,7 +1114,7 @@
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = pipeline.run(pokemon(data))\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -1160,7 +1155,7 @@
|
||||
" \"name\": \"pikachu\",\n",
|
||||
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
|
||||
" \"deleted_flag\": True,\n",
|
||||
" }, # <--- set to True\n",
|
||||
" },\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
@@ -1172,7 +1167,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = pipeline.run(pokemon(data))\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -1236,19 +1231,19 @@
|
||||
" \"name\": \"pikachu\",\n",
|
||||
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
|
||||
" \"deleted_flag\": None,\n",
|
||||
" }, # <--- will be filtered out\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"id\": \"25\",\n",
|
||||
" \"name\": \"pikachu\",\n",
|
||||
" \"size\": {\"weight\": 7, \"height\": 0.4},\n",
|
||||
" \"deleted_flag\": True,\n",
|
||||
" }, # <--- will be removed\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"id\": \"25\",\n",
|
||||
" \"name\": \"pikachu\",\n",
|
||||
" \"size\": {\"weight\": 8, \"height\": 0.4},\n",
|
||||
" \"deleted_flag\": None,\n",
|
||||
" }, # <--- will be loaded\n",
|
||||
" },\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
@@ -1279,9 +1274,9 @@
|
||||
" columns={\n",
|
||||
" \"deleted_flag\": {\"hard_delete\": True},\n",
|
||||
" \"size__weight\": {\"dedup_sort\": \"desc\"},\n",
|
||||
" }, # <-- desc means that the record with the highest value remains.\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"def pokemon() -> TDataItems:\n",
|
||||
"def pokemon(data: TDataItems) -> TDataItems:\n",
|
||||
" yield data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -1292,7 +1287,7 @@
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = pipeline.run(pokemon(data))\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"pipeline.dataset().pokemon.df()"
|
||||
@@ -1381,7 +1376,7 @@
|
||||
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
|
||||
" \"created_at\": 3,\n",
|
||||
" \"updated_at\": None,\n",
|
||||
" }, # <--- Incremental cursor is None\n",
|
||||
" },\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
@@ -1396,12 +1391,13 @@
|
||||
"import dlt\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.resource\n",
|
||||
"@dlt.resource(name=\"pokemon\")\n",
|
||||
"def pokemon(\n",
|
||||
" data: TDataItems,\n",
|
||||
" updated_at: dlt.sources.incremental[int] = dlt.sources.incremental(\n",
|
||||
" \"updated_at\", on_cursor_value_missing=\"include\"\n",
|
||||
" )\n",
|
||||
") -> TDataItems: # <--- we want to include all data rows even if cursor is missing\n",
|
||||
" ),\n",
|
||||
") -> TDataItems:\n",
|
||||
" yield data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -1412,7 +1408,7 @@
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = pipeline.run(pokemon(data))\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"pipeline.dataset().pokemon.df()"
|
||||
@@ -1474,7 +1470,7 @@
|
||||
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
|
||||
" \"created_at\": 3,\n",
|
||||
" \"updated_at\": None,\n",
|
||||
" }, # <--- Incremental cursor is None\n",
|
||||
" },\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
@@ -1488,6 +1484,7 @@
|
||||
"source": [
|
||||
"@dlt.resource\n",
|
||||
"def some_data(\n",
|
||||
" data: TDataItems,\n",
|
||||
" updated_at: dlt.sources.incremental[int] = dlt.sources.incremental(\"updated_at\"),\n",
|
||||
") -> TDataItems:\n",
|
||||
" yield data\n",
|
||||
@@ -1495,9 +1492,7 @@
|
||||
"\n",
|
||||
"def set_default_updated_at(record: TDataItem) -> TDataItems:\n",
|
||||
" if record.get(\"updated_at\") is None:\n",
|
||||
" record[\"updated_at\"] = record.get(\n",
|
||||
" \"created_at\"\n",
|
||||
" ) # <--- use 'created_at' instead of missing 'updated_at'\n",
|
||||
" record[\"updated_at\"] = record.get(\"created_at\")\n",
|
||||
" return record"
|
||||
]
|
||||
},
|
||||
@@ -1510,7 +1505,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Modifies records before the incremental processing\n",
|
||||
"with_default_values = some_data().add_map(set_default_updated_at, insert_at=1)"
|
||||
"with_default_values = some_data(data).add_map(set_default_updated_at, insert_at=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1542,7 +1537,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Removes records before the incremental processing\n",
|
||||
"without_none = some_data().add_filter(\n",
|
||||
"without_none = some_data(data).add_filter(\n",
|
||||
" lambda r: r.get(\"updated_at\") is not None, insert_at=1\n",
|
||||
")"
|
||||
]
|
||||
@@ -1641,9 +1636,10 @@
|
||||
"\n",
|
||||
"@dlt.resource\n",
|
||||
"def some_data(\n",
|
||||
" data: TDataItems,\n",
|
||||
" updated_at: dlt.sources.incremental[int] = dlt.sources.incremental(\n",
|
||||
" \"created_at\", initial_value=0, end_value=2\n",
|
||||
" )\n",
|
||||
" ),\n",
|
||||
") -> TDataItems:\n",
|
||||
" yield data"
|
||||
]
|
||||
@@ -1662,7 +1658,7 @@
|
||||
" dataset_name=\"pokemon_inc_wd\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(some_data, table_name=\"pokemon\")\n",
|
||||
"load_info = pipeline.run(some_data(data), table_name=\"pokemon\")\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"pipeline.dataset().pokemon.df()"
|
||||
@@ -1752,7 +1748,7 @@
|
||||
"continue_load_flag = True\n",
|
||||
"\n",
|
||||
"while continue_load_flag:\n",
|
||||
" load_info = pipeline.run(source.genome.add_limit(10))\n",
|
||||
" pipeline.run(source.genome.add_limit(10))\n",
|
||||
" continue_load_flag = (\n",
|
||||
" my_table_name in pipeline.last_trace.last_normalize_info.row_counts.keys()\n",
|
||||
" )\n",
|
||||
@@ -1772,17 +1768,8 @@
|
||||
"id": "AH3F46PaJZe4"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1mC09rjkheo92-ycjjq0AlIzgwJC8-ZMX#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "K4smMmlfMysW"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
File diff suppressed because it is too large
@@ -6,7 +6,7 @@
|
||||
"id": "Wat0fkM3BHwn"
|
||||
},
|
||||
"source": [
|
||||
"# **Introduction** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)\n",
|
||||
"# **Introduction** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)\n",
|
||||
"\n",
|
||||
"`dlt` offers powerful tools for schema configuration, giving you control over your data processing. You can export and import schemas for easy adjustments and apply specific settings directly to resources for precise data normalization. Plus, you can set data contracts to ensure your data meets your expectations... 👀\n"
|
||||
]
|
||||
@@ -35,7 +35,7 @@
|
||||
"source": [
|
||||
"When you run a pipeline, `dlt` internally generates a `<>.schema.json` file. You can export this file to a specific location in YAML format by specifying `export_schema_path=\"schemas/export\"` in your pipeline.\n",
|
||||
"\n",
|
||||
"See [dlt Fundamentals: Lesson 7](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r#forceEdit=true&sandboxMode=true)\n"
|
||||
"See [dlt Fundamentals: Lesson 7](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -167,18 +167,9 @@
|
||||
"\n",
|
||||
"Data contracts are rules that help control how your data schema changes over time. They are particularly useful for maintaining the integrity and consistency of your data as it evolves.\n",
|
||||
"\n",
|
||||
"`dlt` allows you to implement these data contracts at various levels, including the [table level](#scrollTo=zzVNMHgqNEYr), [column level](#scrollTo=Bq_9SNOMQGk_), and [data type level](#scrollTo=H9eMPvlOQHrJ). This provides granular control over how different parts of your schema evolve.\n",
|
||||
"`dlt` allows you to implement these data contracts at various levels, including the `table level`, `column level`, and `data type level`. This provides granular control over how different parts of your schema evolve.\n",
|
||||
"\n",
|
||||
"> **Note**: This Colab is based on `dlt`'s [schema contracts doc page](https://dlthub.com/docs/general-usage/schema-contracts) and includes additional code examples. It's still a good idea to check out the doc page for all the details."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "g2XDHclpusOU"
|
||||
},
|
||||
"source": [
|
||||
"To get started with data contracts, first install `dlt`:"
|
||||
"> **Note**: This Colab (or Molab) is based on `dlt`'s [schema contracts doc page](https://dlthub.com/docs/general-usage/schema-contracts) and includes additional code examples. It's still a good idea to check out the doc page for all the details."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -190,8 +181,6 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture\n",
|
||||
"\n",
|
||||
"# Install dlt\n",
|
||||
"!pip install dlt[duckdb]"
|
||||
]
|
||||
},
|
||||
@@ -468,13 +457,13 @@
|
||||
"load_info = column_pipeline.run(\n",
|
||||
" discard_row(\n",
|
||||
" [\n",
|
||||
" {\"id\": 3, \"name\": \"Sam\", \"age\": 30}, # This row will be loaded\n",
|
||||
" {\"id\": 3, \"name\": \"Sam\", \"age\": 30},\n",
|
||||
" {\n",
|
||||
" \"id\": 4,\n",
|
||||
" \"name\": \"Kate\",\n",
|
||||
" \"age\": 79,\n",
|
||||
" \"phone\": \"123-456-7890\",\n",
|
||||
" }, # This row will not be loaded\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
" ),\n",
|
||||
" table_name=\"users\",\n",
|
||||
@@ -711,8 +700,8 @@
|
||||
"load_info = data_type_pipeline.run(\n",
|
||||
" discard_row(\n",
|
||||
" [\n",
|
||||
" {\"id\": 3, \"name\": \"Sam\", \"age\": \"35\"}, # This row will be loaded\n",
|
||||
" {\"id\": 4, \"name\": \"Kate\", \"age\": \"seventy\"}, # This row will not be loaded\n",
|
||||
" {\"id\": 3, \"name\": \"Sam\", \"age\": \"35\"},\n",
|
||||
" {\"id\": 4, \"name\": \"Kate\", \"age\": \"seventy\"},\n",
|
||||
" ]\n",
|
||||
" ),\n",
|
||||
" table_name=\"users\",\n",
|
||||
@@ -940,17 +929,8 @@
|
||||
"id": "AH3F46PaJZe4"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1YCjHWMyOO9QGC66t1a5bIxL-ZUeVKViR#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "6_6WprxWXhXi"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
780
docs/education/dlt-advanced-course/lesson_7_data_contracts.py
Normal file
@@ -0,0 +1,780 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt[duckdb]",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **Introduction** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)
|
||||
|
||||
`dlt` offers powerful tools for schema configuration, giving you control over your data processing. You can export and import schemas for easy adjustments and apply specific settings directly to resources for precise data normalization. Plus, you can set data contracts to ensure your data meets your expectations... 👀
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""# [Refresher] **Understanding schema**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
When you run a pipeline, `dlt` internally generates a `<>.schema.json` file. You can export this file to a specific location in YAML format by specifying `export_schema_path="schemas/export"` in your pipeline.
|
||||
|
||||
See [dlt Fundamentals: Lesson 7](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)
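
A minimal sketch of wiring this up (the paths are illustrative):

```python
import dlt

pipeline = dlt.pipeline(
    pipeline_name="quick_start",
    destination="duckdb",
    dataset_name="mydata",
    export_schema_path="schemas/export",  # exported YAML schemas are written here after each run
    import_schema_path="schemas/import",  # edited schemas placed here are picked up on the next run
)
```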
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
This YAML file will look something like:
|
||||
|
||||
```yaml
|
||||
version: 2 # version of the schema
|
||||
version_hash: xmTG0tOmE40LvzY2DbPBOnRaNNK8YlLpVP1PMO0YgyE= # hash of the actual schema content
|
||||
engine_version: 9 # schema engine version of dlt
|
||||
name: quick_start
|
||||
tables:
|
||||
_dlt_version:
|
||||
...
|
||||
_dlt_loads:
|
||||
...
|
||||
_dlt_pipeline_state:
|
||||
...
|
||||
issues:
|
||||
columns:
|
||||
url:
|
||||
data_type: text
|
||||
nullable: true
|
||||
repository_url:
|
||||
data_type: text
|
||||
nullable: true
|
||||
labels_url:
|
||||
data_type: text
|
||||
nullable: true
|
||||
...
|
||||
write_disposition: append
|
||||
resource: get_issues
|
||||
x-normalizer:
|
||||
seen-data: true
|
||||
issues__assignees:
|
||||
columns:
|
||||
...
|
||||
parent: issues
|
||||
|
||||
settings:
|
||||
detections:
|
||||
- iso_timestamp
|
||||
default_hints:
|
||||
not_null:
|
||||
- _dlt_id
|
||||
- _dlt_root_id
|
||||
- _dlt_parent_id
|
||||
- _dlt_list_idx
|
||||
- _dlt_load_id
|
||||
foreign_key:
|
||||
- _dlt_parent_id
|
||||
root_key:
|
||||
- _dlt_root_id
|
||||
unique:
|
||||
- _dlt_id
|
||||
normalizers:
|
||||
names: snake_case # naming convention
|
||||
json:
|
||||
module: dlt.common.normalizers.json.relational
|
||||
previous_hashes:
|
||||
- O4M6U4KA32Xz4Vrdcqo4XPBPFVcK1FZbgRu5qcMfjn4=
|
||||
- 0DQRnVWANYV21yD0T5nsoUtdTeq0/jIOYMUxpPE6Fcc=
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""## **Tables and columns**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
A `table schema` may have the following properties:
|
||||
|
||||
- `name`
|
||||
- `description`
|
||||
- `parent`: The name of the parent table if this is a child table.
|
||||
- `columns`: A list of column schemas defining the table's structure.
|
||||
- `write_disposition`: A hint telling `dlt` how new data coming into the table should be loaded.
|
||||
|
||||
|
||||
A `column schema` may have the following properties:
|
||||
|
||||
- `name`
|
||||
- `description`
|
||||
- `data_type`
|
||||
- `precision`: Defines the precision for text, timestamp, time, bigint, binary, and decimal types.
|
||||
- `scale`: Defines the scale for the decimal type.
|
||||
- `is_variant`: Indicates that the column was generated as a variant of another column.
|
||||
|
||||
A `column schema` may have the following basic hints:
|
||||
|
||||
- `nullable`
|
||||
- `primary_key`
|
||||
- `merge_key`: Marks the column as part of the merge key used for incremental loads.
|
||||
- `foreign_key`
|
||||
- `root_key`: Marks the column as part of a root key, a type of foreign key that always refers to the root table.
|
||||
- `unique`
|
||||
|
||||
|
||||
A `column schema` may have the following performance hints:
|
||||
|
||||
- `partition`: Marks the column to be used for partitioning data.
|
||||
- `cluster`: Marks the column to be used for clustering data.
|
||||
- `sort`: Marks the column as sortable or ordered; on some destinations, this may generate an index, even if the column is not unique.
|
||||
|
||||
> Each destination can interpret these performance hints in its own way. For example, the `cluster` hint is used by Redshift to define table distribution, by BigQuery to specify a cluster column, and is ignored by DuckDB and Postgres when creating tables.
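
For example, many of these properties and hints can be declared directly on a resource via the `columns` argument (a sketch; the `events` resource and its columns are illustrative):

```python
import dlt


@dlt.resource(
    name="events",
    write_disposition="merge",
    primary_key="event_id",
    columns={
        "event_id": {"data_type": "bigint", "nullable": False},
        "created_at": {"data_type": "timestamp", "partition": True},
        "country": {"data_type": "text", "cluster": True},
    },
)
def events():
    yield [{"event_id": 1, "created_at": "2024-01-01T00:00:00Z", "country": "DE"}]
```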
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **Data contracts**
|
||||
|
||||
Data contracts are rules that help control how your data schema changes over time. They are particularly useful for maintaining the integrity and consistency of your data as it evolves.
|
||||
|
||||
`dlt` allows you to implement these data contracts at various levels, including the `table level`, `column level`, and `data type level`. This provides granular control over how different parts of your schema evolve.
|
||||
|
||||
> **Note**: This Colab (or Molab) is based on `dlt`'s [schema contracts doc page](https://dlthub.com/docs/general-usage/schema-contracts) and includes additional code examples. It's still a good idea to check out the doc page for all the details.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
###**Table level**
|
||||
|
||||
On the table level, you can specify `evolve` or `freeze` as part of the schema contract.
|
||||
|
||||
- `evolve`: Allows the creation of new tables within the schema.
|
||||
- `freeze`: Prevents any changes to the schema, ensuring no new tables can be added.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Before diving into the modes above, let's load some sample data into a DuckDB database.
|
||||
> You'll find the database stored in the `Files` section on the left sidebar.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import dlt
|
||||
|
||||
data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
|
||||
# Sample data to be loaded
|
||||
table_pipeline = dlt.pipeline(
|
||||
pipeline_name="data_contracts_table_level",
|
||||
destination="duckdb",
|
||||
dataset_name="mydata",
|
||||
)
|
||||
_load_info = table_pipeline.run(data, table_name="users")
|
||||
# Create a dlt pipeline
|
||||
print(_load_info)
|
||||
# Load the data to the "users" table
|
||||
# Print the row counts for each table that was loaded in the last run of the pipeline
|
||||
print(
|
||||
"\nNumber of new rows loaded into each table: ",
|
||||
table_pipeline.last_trace.last_normalize_info.row_counts,
|
||||
)
|
||||
return data, dlt, table_pipeline
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Now, try out the `evolve` mode at the table level by loading the same sample data into the same database, but this time into a new table called `new_users`."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data, dlt, table_pipeline):
|
||||
from dlt.common.typing import TDataItems
|
||||
|
||||
@dlt.resource(schema_contract={"tables": "evolve"})
|
||||
# Define a dlt resource that allows the creation of new tables
|
||||
def allow_new_tables(input_data: TDataItems) -> TDataItems:
|
||||
yield input_data
|
||||
|
||||
_load_info = table_pipeline.run(allow_new_tables(data), table_name="new_users")
|
||||
print(_load_info)
|
||||
# Run the pipeline again with the above dlt resource to load the same data into a new table "new_users"
|
||||
# Print the row counts for each table that was loaded in the last run of the pipeline
|
||||
print(
|
||||
"\nNumber of new rows loaded into each table: ",
|
||||
table_pipeline.last_trace.last_normalize_info.row_counts,
|
||||
)
|
||||
return (TDataItems,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""The `freeze` mode at the table level, as mentioned earlier, won't allow any changes to the schema, so the pipeline run below that tries to create another table with the name `newest_users` will fail 👇"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, data, dlt, table_pipeline):
|
||||
# Define a dlt resource that prevents any changes to the schema at the table level (no new tables can be added)
|
||||
@dlt.resource(schema_contract={"tables": "freeze"})
|
||||
def no_new_tables(input_data: TDataItems) -> TDataItems:
|
||||
yield input_data
|
||||
|
||||
_load_info = table_pipeline.run(no_new_tables(data), table_name="newest_users")
|
||||
# Now, run the pipeline with the resource above, attempting to load the same data into "newest_users".
|
||||
# This will fail, as new tables can't be added.
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
###**Column level**
|
||||
At the column level, you can specify:
|
||||
- `evolve`: Allows for the addition of new columns or changes in the existing ones.
|
||||
- `freeze`: Prevents any changes to the existing columns.
|
||||
- `discard_row`: Skips rows that have new columns but loads those that follow the existing schema.
|
||||
- `discard_value`: Doesn't skip entire rows. Instead, it only skips the values of new columns, loading the rest of the row data.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Just like we did in the previous section, let's first load some sample data into a new database using a new pipeline.
|
||||
|
||||
> After you run the following code snippet, a new `data_contracts_column_level.duckdb` file should appear in `Files`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt):
|
||||
column_pipeline = dlt.pipeline(
|
||||
pipeline_name="data_contracts_column_level",
|
||||
destination="duckdb",
|
||||
dataset_name="mydata",
|
||||
)
|
||||
_load_info = column_pipeline.run([{"id": 1, "name": "Alice"}], table_name="users")
|
||||
print(_load_info)
|
||||
return (column_pipeline,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""View the loaded data using `dlt`'s `sql_client()`.""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Alternatively, you can simply use the DuckDB client.""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(column_pipeline):
|
||||
import duckdb
|
||||
|
||||
_conn = duckdb.connect(f"{column_pipeline.pipeline_name}.duckdb")
|
||||
_conn.sql("SELECT * FROM mydata.users").df()
|
||||
return (duckdb,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Assume that Alice ☝️ is the first user at your imaginary company, and you have now decided to collect users' ages as well.
|
||||
|
||||
When you load the information for your second user, Bob, who also provided his age 👇, the schema contract at the column level set to `evolve` will allow `dlt` to automatically adjust the schema in the destination database by adding a new column for "age".
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, column_pipeline, dlt, duckdb):
|
||||
# Define dlt resource that allows new columns in the data
|
||||
@dlt.resource(schema_contract={"columns": "evolve"})
|
||||
def allow_new_columns(input_data: TDataItems) -> TDataItems:
|
||||
yield input_data
|
||||
|
||||
_load_info = column_pipeline.run(
|
||||
allow_new_columns([{"id": 2, "name": "Bob", "age": 35}]), table_name="users"
|
||||
)
|
||||
print(_load_info)
|
||||
# Now, load a new row into the same table, "users", which includes an additional column "age"
|
||||
print("\n")
|
||||
_conn = duckdb.connect(f"{column_pipeline.pipeline_name}.duckdb")
|
||||
# View the data that has been loaded
|
||||
_conn.sql("SELECT * FROM mydata.users").df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Now, imagine your business partner, with whom you started the company, began requiring phone numbers from users. However, you weren't informed of this requirement and want to first load the data of users who provided their info before this change, i.e., users who did NOT provide their phone numbers.
|
||||
|
||||
In this case, you would use the `discard_row` mode - which will only load Sam's data 👇 because he didn't provide a phone number, and therefore his data complies with the schema.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, column_pipeline, dlt, duckdb):
|
||||
# Define a dlt resource that skips rows that have new columns but loads those that follow the existing schema
|
||||
@dlt.resource(schema_contract={"columns": "discard_row"})
|
||||
def _discard_row(input_data: TDataItems) -> TDataItems:
|
||||
yield input_data
|
||||
|
||||
_load_info = column_pipeline.run(
|
||||
_discard_row(
|
||||
[
|
||||
{"id": 3, "name": "Sam", "age": 30},
|
||||
{"id": 4, "name": "Kate", "age": 79, "phone": "123-456-7890"},
|
||||
]
|
||||
),
|
||||
table_name="users",
|
||||
)
|
||||
print(_load_info)
|
||||
# Attempt to load two additional rows. Only the row that follows the existing schema will be loaded
|
||||
print("\n")
|
||||
_conn = duckdb.connect(f"{column_pipeline.pipeline_name}.duckdb")
|
||||
# View the data that has been loaded
|
||||
_conn.sql("SELECT * FROM mydata.users").df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Due to some unknown reasons, you've suddenly decided that phone numbers are irrelevant altogether. From now on, you want to load all new data but without the "phone" column.
|
||||
|
||||
To achieve this, you can use the `discard_value` mode - which will load both Sarah's and Violetta's data 👇, regardless of whether either of them provided a phone number. However, the phone number column itself will be discarded.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, column_pipeline, dlt, duckdb):
|
||||
# Define a dlt resource that only skips the values of new columns, loading the rest of the row data
|
||||
@dlt.resource(schema_contract={"columns": "discard_value"})
|
||||
def _discard_value(input_data: TDataItems) -> TDataItems:
|
||||
yield input_data
|
||||
|
||||
_load_info = column_pipeline.run(
|
||||
_discard_value(
|
||||
[
|
||||
{"id": 5, "name": "Sarah", "age": "23"},
|
||||
{"id": 6, "name": "Violetta", "age": "22", "phone": "666-513-4510"},
|
||||
]
|
||||
),
|
||||
table_name="users",
|
||||
)
|
||||
print(_load_info)
|
||||
# Load two additional rows. Since we're using the "discard_value" resource, both rows will be added
|
||||
# However, the "phone" column in the second row will be ignored and not loaded
|
||||
print("\n")
|
||||
_conn = duckdb.connect(f"{column_pipeline.pipeline_name}.duckdb")
|
||||
# View the data that has been loaded
|
||||
_conn.sql("SELECT * FROM mydata.users").df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Eventually you decide that users' id, name and age are the only things you need for your obscure business...
|
||||
|
||||
So, you set the mode to `freeze`, forbidding any changes to the table schema. The attempt to violate the schema contract, as shown below 👇, will fail.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, column_pipeline, dlt):
|
||||
# Define a dlt resource that does not allow new columns in the data
|
||||
@dlt.resource(schema_contract={"columns": "freeze"})
|
||||
def no_new_columns(input_data: TDataItems) -> TDataItems:
|
||||
yield input_data
|
||||
|
||||
_load_info = column_pipeline.run(
|
||||
no_new_columns([{"id": 7, "name": "Lisa", "age": 40, "phone": "098-765-4321"}]),
|
||||
table_name="users",
|
||||
)
|
||||
# Attempt to load a row with additional columns when the column contract is set to freeze
|
||||
# This will fail as no new columns are allowed.
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Data type level**
|
||||
At this level, you can choose:
|
||||
- `evolve`: Allows any data type. This may result in variant columns upstream.
|
||||
- `freeze`: Prevents any changes to the existing data types.
|
||||
- `discard_row`: Omits rows with unverifiable data types.
|
||||
- `discard_value`: Replaces unverifiable values with None, but retains the rest of the row data.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
(*No imaginary situations in this section for the sake of variety and ease* ... 👀)
|
||||
|
||||
Load a sample row entry into a new database using a new pipeline.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, duckdb):
|
||||
data_type_pipeline = dlt.pipeline(
|
||||
pipeline_name="data_contracts_data_type",
|
||||
destination="duckdb",
|
||||
dataset_name="mydata",
|
||||
)
|
||||
_load_info = data_type_pipeline.run(
|
||||
[{"id": 1, "name": "Alice", "age": 24}], table_name="users"
|
||||
)
|
||||
print(_load_info)
|
||||
print("\n")
|
||||
_conn = duckdb.connect(f"{data_type_pipeline.pipeline_name}.duckdb")
|
||||
_conn.sql("SELECT * FROM mydata.users").df()
|
||||
return (data_type_pipeline,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Before trying out the `evolve` mode at the data type level 👇, take a moment to understand how variant columns mentioned earlier are created:
|
||||
- **TLDR:** `dlt` creates a new column when the data type of a field in the incoming data can't be validated against the existing data type in the destination table.
|
||||
- These variant columns will be named following the pattern `<original name>__v_<type>`, where `original_name` is the existing column name (with the data type clash) and `type` is the name of the new data type stored in the variant column.
|
||||
|
||||
In the example below, even though Bob's age is passed as a string, it can be validated as an integer, so it won't cause any problems.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, data_type_pipeline, dlt, duckdb):
|
||||
# Define dlt resource that accepts all data types
|
||||
@dlt.resource(schema_contract={"data_type": "evolve"})
|
||||
def allow_any_data_type(input_data: TDataItems) -> TDataItems:
|
||||
yield input_data
|
||||
|
||||
_load_info = data_type_pipeline.run(
|
||||
allow_any_data_type([{"id": 2, "name": "Bob", "age": "35"}]), table_name="users"
|
||||
)
|
||||
print(_load_info)
|
||||
# Now, load a new row where the "age" column is passed as a string but will be validated and stored as an integer
|
||||
print("\n")
|
||||
_conn = duckdb.connect(f"{data_type_pipeline.pipeline_name}.duckdb")
|
||||
# If you pass the age as "thirty-five", a new variant column will be added
|
||||
# Note: Uncommenting and running the code below may affect subsequent steps, so proceed with caution
|
||||
# load_info = data_type_pipeline.run(allow_any_data_type([{"id": 2, "name": "Bob", "age": "thirty-five"}]), table_name="users")
|
||||
# print(load_info)
|
||||
# View the data that has been loaded
|
||||
_conn.sql("SELECT * FROM mydata.users").df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""But if we ran the commented-out pipeline, this would be the outcome with an additional variant column:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""The `discard_row` mode at the data type level functions similarly to how it does at the column level. The only difference is that it discards rows with diverging data types instead of columns. As a result, you will see that Kate's data will not be loaded 👇."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, data_type_pipeline, dlt, duckdb):
|
||||
# Define dlt resource that omits rows with unverifiable data types
|
||||
@dlt.resource(schema_contract={"data_type": "discard_row"})
|
||||
def _discard_row(input_data: TDataItems) -> TDataItems:
|
||||
yield input_data
|
||||
|
||||
_load_info = data_type_pipeline.run(
|
||||
_discard_row(
|
||||
[
|
||||
{"id": 3, "name": "Sam", "age": "35"},
|
||||
{"id": 4, "name": "Kate", "age": "seventy"},
|
||||
]
|
||||
),
|
||||
table_name="users",
|
||||
)
|
||||
print(_load_info)
|
||||
# Attempt to load two additional rows. Only the row where all column types can be validated will be loaded
|
||||
print("\n")
|
||||
_conn = duckdb.connect(f"{data_type_pipeline.pipeline_name}.duckdb")
|
||||
# View the data that has been loaded
|
||||
_conn.sql("SELECT * FROM mydata.users").df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""The same goes for the `discard_value` mode. However, note that when applied at the data type level, it will replace non-validating row items with `None`. So, in this example, Violetta's age will be set to `None` 👇."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, data_type_pipeline, dlt, duckdb):
|
||||
# Define a dlt resource that replaces unverifiable values with None, but retains the rest of the row data
|
||||
@dlt.resource(schema_contract={"data_type": "discard_value"})
|
||||
def _discard_value(input_data: TDataItems) -> TDataItems:
|
||||
yield input_data
|
||||
|
||||
_load_info = data_type_pipeline.run(
|
||||
_discard_value(
|
||||
[
|
||||
{"id": 5, "name": "Sarah", "age": 23},
|
||||
{"id": 6, "name": "Violetta", "age": "twenty-eight"},
|
||||
]
|
||||
),
|
||||
table_name="users",
|
||||
)
|
||||
print(_load_info)
|
||||
# Load two additional rows. Since we're using the "discard_value" resource, both rows will be added
|
||||
# However, the "age" value "twenty-eight" in the second row will be ignored and not loaded
|
||||
print("\n")
|
||||
_conn = duckdb.connect(f"{data_type_pipeline.pipeline_name}.duckdb")
|
||||
# View the data that has been loaded
|
||||
_conn.sql("SELECT * FROM mydata.users").df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""The `freeze` mode prohibits any changes to the data types of existing columns and will result in an error if there is a "breach in contract". The example below will fail."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, data_type_pipeline, dlt):
|
||||
# Define dlt resource that prevents any changes to the existing data types
|
||||
@dlt.resource(schema_contract={"data_type": "freeze"})
|
||||
def no_data_type_changes(input_data: TDataItems) -> TDataItems:
|
||||
yield input_data
|
||||
|
||||
_load_info = data_type_pipeline.run(
|
||||
no_data_type_changes([{"id": 7, "name": "Lisa", "age": "forty"}]),
|
||||
table_name="users",
|
||||
)
|
||||
# Attempt to load a row with a column value that can't be validated, in this case "forty"
|
||||
# This will fail as no data type changes are allowed with the "no_data_type_changes" resource
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""# **Pydantic Models**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Pydantic models can also be used to [define table schemas and validate incoming data](https://dlthub.com/docs/general-usage/resource#define-a-schema-with-pydantic).
|
||||
They can be passed directly to the "columns" argument of a `dlt` resource:
|
||||
```python
|
||||
import dlt
from typing import List, Optional, Union

from pydantic import BaseModel


# `Address` is assumed to be another pydantic model defined elsewhere
class User(BaseModel):
|
||||
id: int
|
||||
name: str
|
||||
tags: List[str]
|
||||
email: Optional[str]
|
||||
address: Address
|
||||
status: Union[int, str]
|
||||
|
||||
@dlt.resource(name="user", columns=User)
|
||||
def get_users():
|
||||
...
|
||||
```
|
||||
This will set the schema contract to align with the default Pydantic behavior:
|
||||
```python
|
||||
{
|
||||
"tables": "evolve",
|
||||
"columns": "discard_value",
|
||||
"data_type": "freeze"
|
||||
}
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
If you happen to pass a `schema_contract` explicitly along with the `columns` argument to a `dlt` resource, the following happens:
|
||||
|
||||
- `tables`: The contract will not impact the Pydantic model and will be applied when a new table is created.
|
||||
- `columns`: The modes for columns are mapped into the `extra` modes of Pydantic. If your models contain other models, `dlt` will apply this setting recursively. The contract for columns is applied when a new column is created on an existing table.
|
||||
|
||||
<center>
|
||||
|
||||
| Column Mode | Pydantic Extra |
|
||||
|-----------------|----------------|
|
||||
| evolve | allow |
|
||||
| freeze | forbid |
|
||||
| discard_value | ignore |
|
||||
| discard_row | forbid |
|
||||
|
||||
</center>
|
||||
|
||||
- `data_type`: This supports the following modes for Pydantic:
|
||||
1. `evolve` will synthesize a lenient model that allows for any data type. It may result in variant columns upstream.
|
||||
2. `freeze` will re-raise a ValidationException.
|
||||
3. `discard_row` will remove the non-validating data items.
|
||||
4. `discard_value` is not currently supported.
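
For instance, a resource can combine a Pydantic model with an explicit contract (a sketch reusing the `User` model from the snippet above):

```python
@dlt.resource(
    name="user",
    columns=User,
    schema_contract={"tables": "evolve", "columns": "freeze", "data_type": "discard_row"},
)
def get_users():
    ...
```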
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""# **Good to Know**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
- Unless you specify a schema contract, settings will default to `evolve` on all levels.
|
||||
|
||||
- The `schema_contract` argument accepts two forms:
|
||||
1. Full form: A detailed mapping of schema entities to their respective contract modes.
|
||||
```python
|
||||
schema_contract={"tables": "freeze", "columns": "freeze", "data_type": "freeze"}
|
||||
```
|
||||
2. Shorthand form: A single contract mode that will be uniformly applied to all schema entities.
|
||||
```python
|
||||
schema_contract="freeze"
|
||||
```
|
||||
|
||||
- Schema contracts can be defined for:
|
||||
1. `dlt` resources: The contract applies to the corresponding table and any child tables.
|
||||
```python
|
||||
@dlt.resource(schema_contract={"columns": "evolve"})
|
||||
def items():
|
||||
...
|
||||
```
|
||||
2. `dlt` sources: The contract serves as a default for all resources within that source.
|
||||
```python
|
||||
@dlt.source(schema_contract="freeze")
|
||||
def source():
|
||||
...
|
||||
```
|
||||
3. The `pipeline.run()`: This contract overrides any existing schema contracts.
|
||||
```python
|
||||
pipeline.run(source(), schema_contract="freeze")
|
||||
```
|
||||
|
||||
- You can change the contract on a `dlt` source via its `schema_contract` property.
|
||||
```python
|
||||
source = dlt.source(...)
|
||||
source.schema_contract = {"tables": "evolve", "columns": "freeze", "data_type": "discard_row"}
|
||||
```
|
||||
|
||||
- To update the contract for `dlt` resources, use `apply_hints`.
|
||||
```python
|
||||
resource.apply_hints(schema_contract={"tables": "evolve", "columns": "freeze"})
|
||||
```
|
||||
|
||||
- For the `discard_row` mode at the table level, if there are two tables in a parent-child relationship, such as `users` and `users__addresses`, and the contract is violated in the child table, the row in the child table (`users__addresses`) will be discarded, while the corresponding parent row in the `users` table will still be loaded.
|
||||
|
||||
- If a table is a `new table` that hasn't been created on the destination yet, `dlt` will allow the creation of new columns. During the first pipeline run, the column mode is temporarily changed to `evolve` and then reverted to the original mode. The following tables are considered new:
|
||||
1. Child tables inferred from the nested data.
|
||||
2. Dynamic tables created from the data during extraction.
|
||||
3. Tables containing incomplete columns - columns without a data type bound to them.
|
||||
|
||||
> Note that tables with columns defined with Pydantic models are not considered new.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,7 +6,7 @@
|
||||
"id": "y0sqFhxJnH5r"
|
||||
},
|
||||
"source": [
|
||||
"# **Introduction** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)"
|
||||
"# **Introduction** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -49,6 +49,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from typing import Iterable, Union\n",
|
||||
"import dlt\n",
|
||||
"from dlt.sources.helpers import requests\n",
|
||||
@@ -58,10 +59,9 @@
|
||||
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
|
||||
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"ACCESS_TOKEN\")\n",
|
||||
"dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"ACCESS_TOKEN\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.source\n",
|
||||
@@ -162,10 +162,7 @@
|
||||
"\n",
|
||||
"## What is `Sentry` 🤔\n",
|
||||
"\n",
|
||||
"`Sentry` is an open-source error tracking and performance monitoring tool that helps developers **identify**, **monitor**, and **fix issues** in real-time in their applications.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Remember, `dlt` does not have the `Sentry` client as a dependency. You need to install it."
|
||||
"`Sentry` is an open-source error tracking and performance monitoring tool that helps developers **identify**, **monitor**, and **fix issues** in real-time in their applications."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -297,10 +294,9 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"os.environ[\"RUNTIME__SENTRY_DSN\"] = userdata.get(\"SENTRY_TOKEN\")"
|
||||
"dlt.config[\"RUNTIME__SENTRY_DSN\"] = userdata.get(\"SENTRY_TOKEN\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -416,9 +412,9 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import dlt\n",
|
||||
"\n",
|
||||
"os.environ[\"RUNTIME__LOG_LEVEL\"] = \"INFO\""
|
||||
"dlt.config[\"RUNTIME__LOG_LEVEL\"] = \"INFO\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -470,7 +466,7 @@
|
||||
" dataset_name=\"github_data_merge\",\n",
|
||||
")\n",
|
||||
"load_info = pipeline.run(github_source())\n",
|
||||
"\n",
|
||||
"print(load_info)\n",
|
||||
"# result gets showed despite no print statement ? check dlt.log"
|
||||
]
|
||||
},
|
||||
@@ -512,9 +508,9 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import dlt\n",
|
||||
"\n",
|
||||
"os.environ[\"RUNTIME__LOG_LEVEL\"] = \"INFO\""
|
||||
"dlt.config[\"RUNTIME__LOG_LEVEL\"] = \"INFO\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -576,7 +572,8 @@
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"github_data_merge\",\n",
|
||||
")\n",
|
||||
"load_info = pipeline.run(github_source())"
|
||||
"load_info = pipeline.run(github_source())\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -596,9 +593,9 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import dlt\n",
|
||||
"\n",
|
||||
"os.environ[\"RUNTIME__LOG_LEVEL\"] = \"WARNING\"\n",
|
||||
"dlt.config[\"RUNTIME__LOG_LEVEL\"] = \"WARNING\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
@@ -607,7 +604,8 @@
|
||||
" dataset_name=\"github_data_merge\",\n",
|
||||
" progress=\"log\",\n",
|
||||
")\n",
|
||||
"load_info = pipeline.run(github_source())"
|
||||
"load_info = pipeline.run(github_source())\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -616,17 +614,8 @@
|
||||
"id": "AH3F46PaJZe4"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/11P5O2R40ExtFtPfX4o1O5mF7nFbibtuZ#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "maZdAnM0bjiv"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@@ -0,0 +1,470 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt",
|
||||
# "loguru",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "sentry-sdk",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""# **Introduction** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
In this notebook, we take a closer look at pipeline metadata and how to use it to trace and debug our pipelines.
|
||||
|
||||
First, we create the pipeline we'll inspect throughout this notebook.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""## Create the pipeline we will inspect""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import os
|
||||
from typing import Iterable, Union
|
||||
import dlt
|
||||
from dlt.sources.helpers import requests
|
||||
from dlt.extract import DltResource
|
||||
from dlt.common.typing import TDataItems
|
||||
from dlt.sources.helpers.rest_client import RESTClient
|
||||
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
|
||||
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator
|
||||
|
||||
dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("ACCESS_TOKEN")
|
||||
|
||||
@dlt.source
|
||||
def github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]:
|
||||
client = RESTClient(
|
||||
base_url="https://api.github.com",
|
||||
auth=BearerTokenAuth(token=secret_key),
|
||||
paginator=HeaderLinkPaginator(),
|
||||
)
|
||||
|
||||
@dlt.resource
|
||||
def github_pulls(
|
||||
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
|
||||
"updated_at", initial_value="2024-12-01"
|
||||
)
|
||||
) -> TDataItems:
|
||||
params = {"since": cursor_date.last_value, "status": "open"}
|
||||
for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params):
|
||||
yield page
|
||||
|
||||
return github_pulls
|
||||
|
||||
pipeline = dlt.pipeline(
|
||||
pipeline_name="github_pipeline",
|
||||
destination="duckdb",
|
||||
dataset_name="github_data",
|
||||
)
|
||||
_load_info = pipeline.run(github_source())
|
||||
|
||||
print(_load_info)
|
||||
return Union, dlt, github_source, os, pipeline
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""## Look at the data""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
import duckdb
|
||||
|
||||
conn = duckdb.connect(f"{pipeline.pipeline_name}.duckdb")
|
||||
|
||||
conn.sql("SHOW ALL TABLES").df()
|
||||
return (conn,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""More importantly, let's look at the saved load info""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(conn):
|
||||
conn.sql("select * from github_data._dlt_loads").df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""# **Tracing with Sentry**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
You can enable tracing through Sentry.
|
||||
|
||||
## What is `Sentry` 🤔
|
||||
|
||||
`Sentry` is an open-source error tracking and performance monitoring tool that helps developers **identify**, **monitor**, and **fix issues** in real-time in their applications.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import sentry_sdk
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Sentry needs to be initialized in normal scripts
|
||||
|
||||
|
||||
|
||||
```
|
||||
import sentry_sdk
|
||||
import os
|
||||
|
||||
sentry_sdk.init(
|
||||
dsn=os.getenv("RUNTIME__SENTRY_DSN"),
|
||||
traces_sample_rate=1.0 # Adjust this for performance monitoring if needed
|
||||
)
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Say, you make an error and it is caught with Sentry:
|
||||
|
||||
|
||||
|
||||
```
|
||||
try:
|
||||
1 / 0
|
||||
except ZeroDivisionError as e:
|
||||
sentry_sdk.capture_exception(e)
|
||||
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""It will then show up on your Sentry dashboard:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Even when a normal error arises after Sentry has been initiated, your program executes normally, but sends that error to your dashboard, so it can be tracked!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### In dlt, you can enable Sentry quite easily
|
||||
|
||||
You can configure the `DSN` in the `config.toml`:
|
||||
|
||||
```
|
||||
[runtime]
|
||||
|
||||
sentry_dsn="https:///<...>"
|
||||
```
|
||||
|
||||
|
||||
Alternatively, you can use environment variables. **This is what we'll be doing**:
|
||||
```
|
||||
RUNTIME__SENTRY_DSN="https:///<...>"
|
||||
```
|
||||
The Sentry client is configured after the first pipeline is created with `dlt.pipeline()`. Feel free to call `sentry_sdk.init()` again to cover your specific needs.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Let's try introducing the same error again""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, os):
|
||||
dlt.config["RUNTIME__SENTRY_DSN"] = os.getenv("SENTRY_TOKEN")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
data = {12: 34}
|
||||
|
||||
info = pipeline.run([data], table_name="issues")
|
||||
info
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""And that comes up in Sentry as well""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The message sent to Sentry is:
|
||||
```
|
||||
Job for issues.a3f927c556.insert_values failed terminally in load 1723645286.6510239 with message Constraint Error: NOT NULL constraint failed: issues.id
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""# **Logging**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
There are various environments where we would be completely lost without logs.
|
||||
|
||||
Debugging any system would be incredibly hard if we didn't know what was going on, or at what point the program ran into an error.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### Setting log levels in `dlt`""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
You can set log levels in your `config.toml` file:
|
||||
|
||||
|
||||
|
||||
```
|
||||
[runtime]
|
||||
log_level="INFO"
|
||||
```
|
||||
|
||||
`log_level` accepts the Python standard logging level names.
|
||||
|
||||
The default log level is `WARNING`.
|
||||
|
||||
**`INFO` log level is useful when diagnosing problems in production.**
|
||||
|
||||
**`CRITICAL` will disable logging.**
|
||||
|
||||
**`DEBUG` should not be used in production.**
|
||||
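As a sketch of the equivalent setting, you can also use an environment variable (dlt maps `RUNTIME__LOG_LEVEL` to `[runtime] log_level`):

```
RUNTIME__LOG_LEVEL="INFO"
```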
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""We'll be setting the log level in our environment variables:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt):
|
||||
dlt.config["RUNTIME__LOG_LEVEL"] = "INFO"
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
dlt logs to a logger named `dlt`.
|
||||
|
||||
The dlt logger is a regular Python logger, so you can configure its handlers to suit your requirements.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import logging
|
||||
|
||||
# Create a logger
|
||||
logger = logging.getLogger("dlt")
|
||||
|
||||
# Set the log level
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
# Create a file handler
|
||||
handler = logging.FileHandler("dlt.log")
|
||||
|
||||
# Add the handler to the logger
|
||||
logger.addHandler(handler)
|
||||
return (logging,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, github_source):
|
||||
pipeline_1 = dlt.pipeline(
|
||||
pipeline_name="github_issues_merge_logger",
|
||||
destination="duckdb",
|
||||
dataset_name="github_data_merge",
|
||||
)
|
||||
_load_info = pipeline_1.run(github_source())
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### Logging via `Loguru` in our GitHub example""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""let's change the logging level""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt):
|
||||
dlt.config["RUNTIME__LOG_LEVEL"] = "INFO"
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(Union, logging):
|
||||
import sys
|
||||
from loguru import logger as loguru_logger
|
||||
|
||||
class InterceptHandler(logging.Handler):
|
||||
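# loguru's catch() decorator logs any exception raised inside emit(); onerror then exits the process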
@loguru_logger.catch(default=True, onerror=lambda _: sys.exit(1))
|
||||
def emit(self, record: logging.LogRecord) -> None:
|
||||
# Get the corresponding Loguru level if it exists
|
||||
try:
|
||||
level: Union[str, int] = loguru_logger.level(
|
||||
record.levelname
|
||||
).name  # map the stdlib level name to the Loguru level name
|
||||
except ValueError:
|
||||
level = record.levelno
|
||||
(frame, depth) = (
|
||||
sys._getframe(6),
|
||||
6,
|
||||
)  # Find the caller (call frame) from which the logged message originated
|
||||
while frame and frame.f_code.co_filename == logging.__file__:
|
||||
frame = frame.f_back
|
||||
depth = depth + 1
|
||||
loguru_logger.opt(depth=depth, exception=record.exc_info).log(
|
||||
level, record.getMessage()
|
||||
)
|
||||
|
||||
logger_dlt = logging.getLogger("dlt")
|
||||
logger_dlt.addHandler(
|
||||
InterceptHandler()
|
||||
)  # route dlt's standard-library log records through Loguru
|
||||
# all logs will be written to dlt_loguru.log
|
||||
loguru_logger.add(
|
||||
"dlt_loguru.log"
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, github_source):
|
||||
pipeline_2 = dlt.pipeline(
|
||||
pipeline_name="github_issues_merge_loguru",
|
||||
destination="duckdb",
|
||||
dataset_name="github_data_merge",
|
||||
)
|
||||
_load_info = pipeline_2.run(github_source())
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""## **Logs for monitoring the progress**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, github_source):
|
||||
dlt.config["RUNTIME__LOG_LEVEL"] = "WARNING"
|
||||
pipeline_3 = dlt.pipeline(
|
||||
pipeline_name="github_issues_progress",
|
||||
destination="duckdb",
|
||||
dataset_name="github_data_merge",
|
||||
progress="log",
|
||||
)
|
||||
_load_info = pipeline_3.run(github_source())
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,7 +6,7 @@
|
||||
"id": "GNU4s2jjWTOV"
|
||||
},
|
||||
"source": [
|
||||
"# **Performance Optimization in dlt pipelines** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)"
|
||||
"# **Performance Optimization in dlt pipelines** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -94,7 +94,7 @@
|
||||
"\n",
|
||||
"We'll now look at how to optimize each of these stages individually.\n",
|
||||
"\n",
|
||||
"> If you're unfamiliar with how `pipeline.run()` works under the hood, including the **extract/normalize/load** stages and intermediary files, please complete the [Fundamentals module \"How dlt works\"](https://colab.research.google.com/drive/1geSMNRkSwAelQJKd3e8vdoHCKiHMdmIo#forceEdit=true&sandboxMode=true) first."
|
||||
"> If you're unfamiliar with how `pipeline.run()` works under the hood, including the **extract/normalize/load** stages and intermediary files, please complete the [Fundamentals module \"How dlt works\"](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) first."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -232,23 +232,27 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import multiprocessing\n",
|
||||
"import time\n",
|
||||
"import multiprocessing\n",
|
||||
"from concurrent.futures import ProcessPoolExecutor\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def compute_heavy_task() -> None:\n",
|
||||
"def compute_heavy_task() -> str:\n",
|
||||
" lines = []\n",
|
||||
" for number in range(3):\n",
|
||||
" print(\n",
|
||||
" f\"Computing in {multiprocessing.current_process().name}. {number=} => {number**2}\\n\"\n",
|
||||
" lines.append(\n",
|
||||
" f\"Computing in {multiprocessing.current_process().name}. {number=} => {number**2}\"\n",
|
||||
" )\n",
|
||||
" time.sleep(0.1)\n",
|
||||
" return \"\\n\".join(lines)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" with ProcessPoolExecutor(max_workers=4) as process_executor:\n",
|
||||
" for _ in range(4):\n",
|
||||
" process_executor.submit(compute_heavy_task)"
|
||||
" futures = [process_executor.submit(compute_heavy_task) for _ in range(4)]\n",
|
||||
" for fut in futures:\n",
|
||||
" print(fut.result())\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -450,12 +454,12 @@
|
||||
"id": "rvId84tCaH7u"
|
||||
},
|
||||
"source": [
|
||||
"- Control the [in-memory buffer size](#scrollTo=ffVpDFHfnqO-) for the extract stage\n",
|
||||
"- Control the `in-memory buffer size` for the extract stage\n",
|
||||
"- Group `dlt` resources into `dlt` sources\n",
|
||||
"- Specify the number of thread workers or..\n",
|
||||
"- When using async generators, control the number of async functions/awaitables being evaluated in parallel\n",
|
||||
"- Yield pages instead of rows\n",
|
||||
"- Customize the [size of intermediary files](#scrollTo=g9AGWfLkoAMb) created in the extract stage to control file rotation"
|
||||
"- Customize the `size of intermediary files` created in the extract stage to control file rotation"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -559,7 +563,7 @@
|
||||
" dataset_name=\"mydata\",\n",
|
||||
" dev_mode=True,\n",
|
||||
")\n",
|
||||
"load_info = pipeline.extract(buffered_resource)\n",
|
||||
"pipeline.extract(buffered_resource)\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -604,7 +608,8 @@
|
||||
" dataset_name=\"mydata\",\n",
|
||||
" dev_mode=True,\n",
|
||||
")\n",
|
||||
"load_info = pipeline.extract(buffered_resource)\n",
|
||||
"\n",
|
||||
"pipeline.extract(buffered_resource)\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -779,9 +784,7 @@
|
||||
" dev_mode=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.extract(\n",
|
||||
" [buffered_resource1, buffered_resource2, buffered_resource3]\n",
|
||||
")\n",
|
||||
"pipeline.extract([buffered_resource1, buffered_resource2, buffered_resource3])\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -825,7 +828,7 @@
|
||||
" dev_mode=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.extract(source())\n",
|
||||
"pipeline.extract(source())\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -954,7 +957,7 @@
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"load_info = pipeline.extract(source())\n",
|
||||
"pipeline.extract(source())\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -1038,7 +1041,7 @@
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"load_info = pipeline.extract(source())\n",
|
||||
"pipeline.extract(source())\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -1089,13 +1092,13 @@
|
||||
"@dlt.resource\n",
|
||||
"def sync_items() -> TDataItems:\n",
|
||||
" for i in range(10):\n",
|
||||
" time.sleep(0.5) # Blocking call\n",
|
||||
" time.sleep(0.5)\n",
|
||||
" yield i\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.transformer\n",
|
||||
"def sync_transform(item: TDataItem) -> TDataItems:\n",
|
||||
" time.sleep(0.5) # Also blocking\n",
|
||||
" time.sleep(0.5)\n",
|
||||
" return {\"row\": item}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -1130,13 +1133,13 @@
|
||||
"@dlt.resource\n",
|
||||
"async def async_items() -> TDataItems:\n",
|
||||
" for i in range(10):\n",
|
||||
" await asyncio.sleep(0.5) # Blocking\n",
|
||||
" await asyncio.sleep(0.5)\n",
|
||||
" yield i\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.transformer\n",
|
||||
"async def async_transform(item) -> TDataItems:\n",
|
||||
" await asyncio.sleep(0.5) # Non-blocking\n",
|
||||
" await asyncio.sleep(0.5)\n",
|
||||
" # just return the results, if you yield, generator will be evaluated in main thread\n",
|
||||
" return {\"row\": item}\n",
|
||||
"\n",
|
||||
@@ -1276,7 +1279,7 @@
|
||||
"@dlt.resource\n",
|
||||
"def get_users() -> TDataItems:\n",
|
||||
" for user in fetch_users():\n",
|
||||
" yield user # yields one row at a time"
|
||||
" yield user"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1354,8 +1357,8 @@
|
||||
"\n",
|
||||
"def yield_chunks(iterator: Iterator[Dict[str, int]], chunk_size=10):\n",
|
||||
" iterator = iter(iterator)\n",
|
||||
" while chunk := list(islice(iterator, chunk_size)): # <--- we slice data into chunks\n",
|
||||
" time.sleep(0.01) # Simulate slow API call\n",
|
||||
" while chunk := list(islice(iterator, chunk_size)):\n",
|
||||
" time.sleep(0.01)\n",
|
||||
" yield chunk\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -1387,7 +1390,7 @@
|
||||
" dev_mode=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.extract(source())\n",
|
||||
"pipeline.extract(source())\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -1463,7 +1466,7 @@
|
||||
"4. These files are then used in the **load** stage.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
">If you’re not yet familiar with how the **normalization stage** works in `dlt`, we recommend reviewing the [**Normalization section in the dlt Fundamentals course**](https://colab.research.google.com/drive/1geSMNRkSwAelQJKd3e8vdoHCKiHMdmIo#forceEdit=true&sandboxMode=true&scrollTo=bCeUqaW_cRSh) before diving into performance tuning. "
|
||||
">If you’re not yet familiar with how the **normalization stage** works in `dlt`, we recommend reviewing the [**Normalization section in the dlt Fundamentals course**](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) before diving into performance tuning. "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1583,8 +1586,8 @@
|
||||
"\n",
|
||||
"def yield_chunks(iterable, chunk_size=10):\n",
|
||||
" iterator = iter(iterable)\n",
|
||||
" while chunk := list(islice(iterator, chunk_size)): # <--- we slice data into chunks\n",
|
||||
" time.sleep(0.01) # Simulate slow API call\n",
|
||||
" while chunk := list(islice(iterator, chunk_size)):\n",
|
||||
" time.sleep(0.01)\n",
|
||||
" yield chunk\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -1611,7 +1614,7 @@
|
||||
" dev_mode=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.extract(source())\n",
|
||||
"pipeline.extract(source())\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -1639,7 +1642,7 @@
|
||||
"\n",
|
||||
"os.environ[\"NORMALIZE__WORKERS\"] = \"1\"\n",
|
||||
"\n",
|
||||
"load_info = pipeline.normalize()\n",
|
||||
"pipeline.normalize()\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
@@ -1710,8 +1713,8 @@
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"load_info = pipeline.extract(source())\n",
|
||||
"load_info = pipeline.normalize()\n",
|
||||
"pipeline.extract(source())\n",
|
||||
"pipeline.normalize()\n",
|
||||
"\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
@@ -1881,8 +1884,8 @@
|
||||
"\n",
|
||||
"def yield_chunks(iterable, chunk_size=10):\n",
|
||||
" iterator = iter(iterable)\n",
|
||||
" while chunk := list(islice(iterator, chunk_size)): # <--- we slice data into chunks\n",
|
||||
" time.sleep(0.01) # Simulate slow API call\n",
|
||||
" while chunk := list(islice(iterator, chunk_size)):\n",
|
||||
" time.sleep(0.01)\n",
|
||||
" yield chunk\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -2060,7 +2063,6 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Install dlt if not already installed\n",
|
||||
"%%capture\n",
|
||||
"!pip install \"dlt[duckdb]\""
|
||||
]
|
||||
@@ -2082,7 +2084,9 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"exit()"
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ.clear()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -2117,9 +2121,9 @@
|
||||
"def pagination(url):\n",
|
||||
" while True:\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" time.sleep(0.1) # Simulate delay\n",
|
||||
" time.sleep(0.1)\n",
|
||||
" response.raise_for_status()\n",
|
||||
" yield response.json() # Here we're yielding pages\n",
|
||||
" yield response.json()\n",
|
||||
"\n",
|
||||
" # Get next page\n",
|
||||
" if \"next\" not in response.links:\n",
|
||||
@@ -2201,7 +2205,7 @@
|
||||
" dev_mode=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(\n",
|
||||
"pipeline.run(\n",
|
||||
" [\n",
|
||||
" get_issues,\n",
|
||||
" get_stargazers,\n",
|
||||
@@ -2355,9 +2359,6 @@
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"improved_p = dlt.pipeline(\"test_pipeline_2\", destination=\"duckdb\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"extract_pipeline_example2\",\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
@@ -2365,7 +2366,7 @@
|
||||
" dev_mode=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(github_data())\n",
|
||||
"pipeline.run(github_data())\n",
|
||||
"print(pipeline.last_trace)"
|
||||
]
|
||||
},
|
||||
|
||||
File diff suppressed because it is too large
@@ -6,7 +6,7 @@
|
||||
"id": "pTAeTdoKJHZV"
|
||||
},
|
||||
"source": [
|
||||
"# **Quick Start** 👩💻🚀 [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb)\n",
|
||||
"# **Quick Start** 👩💻🚀 [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb)\n",
|
||||
"\n",
|
||||
"**Here, you will learn:**\n",
|
||||
"- What is dlt?\n",
|
||||
@@ -55,15 +55,6 @@
|
||||
"> **Note**: We recommend working within a virtual environment when creating Python projects. This way, all the dependencies for your current project will be isolated from packages in other projects."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "Su4oUJelKaZY"
|
||||
},
|
||||
"source": [
|
||||
"[Install](https://dlthub.com/docs/reference/installation) `dlt` with DuckDB as destination:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -180,7 +171,7 @@
|
||||
"> **What just happened?** \n",
|
||||
"> The first run of a pipeline will scan the data that goes through it and generate a schema. To convert nested data into a relational format, dlt flattens dictionaries and unpacks nested lists into sub-tables.\n",
|
||||
">\n",
|
||||
"> For this example `dlt` created a schema called 'mydata' with the table 'pokemon' in it and stored it in DuckDB.\n",
|
||||
"> For this example, `dlt` created a schema called 'mydata' with the table 'pokemon' in it and stored it in DuckDB.\n",
|
||||
">\n",
|
||||
">For detailed instructions on running a pipeline, see the documentation [here](https://dlthub.com/docs/walkthroughs/run-a-pipeline)."
|
||||
]
|
||||
@@ -191,7 +182,7 @@
|
||||
"id": "Z9ll-Ax1BxGu"
|
||||
},
|
||||
"source": [
|
||||
"Quick start was really quick, hah? It seems like some kind of magic happened.\n",
|
||||
"Quick start was really quick, huh? It seems like some kind of magic happened.\n",
|
||||
"\n",
|
||||
"We don't believe in magic! Let's start from the beginning, what is a `dlt` Pipeline?"
|
||||
]
|
||||
@@ -217,7 +208,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"another_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"resource_source\",\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"mydata\",\n",
|
||||
@@ -237,7 +228,7 @@
|
||||
"* **`dataset_name`**: This is the name of the group of tables (or dataset) where your data will be sent. You can think of a dataset like a folder that holds many files, or a schema in a relational database. You can also specify this later when you run or load the pipeline. If you don't provide a name, it will default to the name of your pipeline.\n",
|
||||
"* **`dev_mode`**: If you set this to True, dlt will add a timestamp to your dataset name every time you create a pipeline. This means a new dataset will be created each time you create a pipeline.\n",
|
||||
"\n",
|
||||
"There are more arguments, but they are for advanced use, we skip it for now."
|
||||
"There are additional arguments for advanced use, but we’ll skip them for now."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -262,7 +253,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Run the pipeline and print load info\n",
|
||||
"load_info = pipeline.run(data, table_name=\"pokemon\")\n",
|
||||
"load_info = another_pipeline.run(data, table_name=\"pokemon\")\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -309,7 +300,7 @@
|
||||
"id": "xQcYIbDbQevC"
|
||||
},
|
||||
"source": [
|
||||
"Start a connection to your database using native `duckdb` connection and look what tables were generated:"
|
||||
"Start a connection to your database using a native `duckdb` connection and see which tables were generated:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -321,17 +312,14 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import duckdb\n",
|
||||
"from google.colab import data_table\n",
|
||||
"\n",
|
||||
"data_table.enable_dataframe_formatter()\n",
|
||||
"\n",
|
||||
"# A database '<pipeline_name>.duckdb' was created in working directory so just connect to it\n",
|
||||
"\n",
|
||||
"# Connect to the DuckDB database\n",
|
||||
"conn = duckdb.connect(f\"{pipeline.pipeline_name}.duckdb\")\n",
|
||||
"conn = duckdb.connect(f\"{another_pipeline.pipeline_name}.duckdb\")\n",
|
||||
"\n",
|
||||
"# Set search path to the dataset\n",
|
||||
"conn.sql(f\"SET search_path = '{pipeline.dataset_name}'\")\n",
|
||||
"conn.sql(f\"SET search_path = '{another_pipeline.dataset_name}'\")\n",
|
||||
"\n",
|
||||
"# Describe the dataset\n",
|
||||
"conn.sql(\"DESCRIBE\").df()"
|
||||
@@ -399,7 +387,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Query data from 'pokemon' using the SQL client\n",
|
||||
"with pipeline.sql_client() as client:\n",
|
||||
"with another_pipeline.sql_client() as client:\n",
|
||||
" with client.execute_query(\"SELECT * FROM pokemon\") as cursor:\n",
|
||||
" data = cursor.df()\n",
|
||||
"\n",
|
||||
@@ -427,7 +415,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = pipeline.dataset()\n",
|
||||
"dataset = another_pipeline.dataset()\n",
|
||||
"dataset.pokemon.df()"
|
||||
]
|
||||
},
|
||||
@@ -467,17 +455,8 @@
|
||||
"id": "NYbccmLie1zm"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1tc94GvIoYXmYrjUibDhY_9iPR5zA0Eyw#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "lN6cXVfhVPmq"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
394
docs/education/dlt-fundamentals-course/lesson_1_quick_start.py
Normal file
@@ -0,0 +1,394 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt[duckdb]",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **Quick Start** 👩💻🚀 [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb)
|
||||
|
||||
**Here, you will learn:**
|
||||
- What is dlt?
|
||||
- How to run a simple pipeline with toy data.
|
||||
- How to explore the loaded data using:
|
||||
- DuckDB connection
|
||||
- dlt's sql_client
|
||||
- dlt datasets
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **What is dlt?**
|
||||
|
||||
In today's data-driven world, organizations often grapple with the challenge of efficiently **extracting, transforming,** and **loading** (ETL) data from various, often messy, data sources into well-structured, live datasets. This process can be complex, time-consuming, and prone to errors, especially when dealing with large volumes of data or nested data structures.
|
||||
|
||||
Enter **dlt**, an **open-source Python library** designed to simplify and streamline this process. **dlt can load data from** a wide range of **sources** including REST APIs, SQL databases, cloud storage, and Python data structures, among others. It offers a lightweight interface that **infers schemas** and **data types**, **normalizes** the data, and handles **nested data** structures, making it easy to use, flexible, and scalable.
|
||||
|
||||
Moreover, dlt supports a variety of **popular destinations** and allows for the addition of custom destinations to create **reverse ETL** pipelines. It can be deployed **anywhere Python runs**, be it on Airflow, serverless functions, or any other cloud deployment of your choice. With features like **schema evolution**, **data contracts** and **incremental loading**, dlt also automates pipeline maintenance, saving valuable time and resources.
|
||||
|
||||
In essence, dlt is a powerful tool that simplifies the ETL process, making it more efficient and less error-prone. It allows data teams to **focus** on leveraging the data and driving value, while ensuring effective **governance** through timely notifications of any changes.
|
||||
|
||||
[Learn more about dlt here](https://dlthub.com/docs/intro) and in this course!
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Installation**
|
||||
|
||||
> **Note**: We recommend working within a virtual environment when creating Python projects. This way, all the dependencies for your current project will be isolated from packages in other projects.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Read more about DuckDB as a destination [here](https://dlthub.com/docs/dlt-ecosystem/destinations/duckdb)."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Run a simple pipeline with toy data**
|
||||
For educational purposes, let’s start with a simple pipeline using a small dataset — Pokémon data represented as a list of Python dictionaries.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""1. Define a list of Python dictionaries, which will be your toy data:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
# Sample data containing pokemon details
|
||||
data = [
|
||||
{"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}},
|
||||
{"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}},
|
||||
{"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}},
|
||||
]
|
||||
return (data,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""2. Import `dlt` and create a simple pipeline:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import dlt
|
||||
|
||||
# Set pipeline name, destination, and dataset name
|
||||
pipeline = dlt.pipeline(
|
||||
pipeline_name="quick_start",
|
||||
destination="duckdb",
|
||||
dataset_name="mydata",
|
||||
)
|
||||
return dlt, pipeline
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""3. Run your pipeline and print the load info:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data, pipeline):
|
||||
# Run the pipeline with data and table name
|
||||
_load_info = pipeline.run(data, table_name="pokemon")
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
> **What just happened?**
|
||||
> The first run of a pipeline will scan the data that goes through it and generate a schema. To convert nested data into a relational format, dlt flattens dictionaries and unpacks nested lists into sub-tables.
|
||||
>
|
||||
> For this example, `dlt` created a schema called 'mydata' with the table 'pokemon' in it and stored it in DuckDB.
|
||||
>
|
||||
>For detailed instructions on running a pipeline, see the documentation [here](https://dlthub.com/docs/walkthroughs/run-a-pipeline).
|
||||
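>
> For instance, the flattening mentioned above turns the nested `size` dictionary from the sample data into `size__weight` and `size__height` columns of the `pokemon` table. A sketch of the inferred layout (dlt also adds its own `_dlt_load_id` and `_dlt_id` columns):
>
> ```
> pokemon(id, name, size__weight, size__height, _dlt_load_id, _dlt_id)
> ```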
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Quick start was really quick, huh? It seems like some kind of magic happened.
|
||||
|
||||
We don't believe in magic! Let's start from the beginning, what is a `dlt` Pipeline?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
## **What is a `dlt` Pipeline?**
|
||||
|
||||
A [pipeline](https://dlthub.com/docs/general-usage/pipeline) is a connection that moves data from your Python code to a destination. The pipeline accepts dlt sources or resources, as well as generators, async generators, lists, and any iterables. Once the pipeline runs, all resources are evaluated and the data is loaded at the destination.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt):
|
||||
another_pipeline = dlt.pipeline(
|
||||
pipeline_name="resource_source",
|
||||
destination="duckdb",
|
||||
dataset_name="mydata",
|
||||
dev_mode=True,
|
||||
)
|
||||
return (another_pipeline,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
You instantiate a pipeline by calling the `dlt.pipeline` function with the following arguments:
|
||||
* **`pipeline_name`**: This is the name you give to your pipeline. It helps you track and monitor your pipeline, and also helps to bring back its state and data structures for future runs. If you don't provide a name, dlt will use the name of the Python file you're running as the pipeline name.
|
||||
* **`destination`**: The name of the destination to which dlt will load the data. It may also be provided to the `run` method of the pipeline.
|
||||
* **`dataset_name`**: This is the name of the group of tables (or dataset) where your data will be sent. You can think of a dataset like a folder that holds many files, or a schema in a relational database. You can also specify this later when you run or load the pipeline. If you don't provide a name, it will default to the name of your pipeline.
|
||||
* **`dev_mode`**: If you set this to True, dlt will add a timestamp to your dataset name every time you create a pipeline. This means a new dataset will be created each time you create a pipeline.
|
||||
|
||||
There are additional arguments for advanced use, but we’ll skip them for now.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
## **Run method**
|
||||
|
||||
To load the data, you call the `run()` method and pass your data in the data argument.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(another_pipeline, data):
|
||||
# Run the pipeline and print load info
|
||||
_load_info = another_pipeline.run(data, table_name="pokemon")
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Commonly used arguments:
|
||||
|
||||
* **`data`** (the first argument) may be a dlt source, resource, generator function, or any Iterator or Iterable (e.g., a list or the result of the `map` function).
|
||||
* **`write_disposition`** controls how to write data to a table. Defaults to the value "append".
|
||||
* `append` will always add new data at the end of the table.
|
||||
* `replace` will replace existing data with new data.
|
||||
* `skip` will prevent data from loading.
|
||||
* `merge` will deduplicate and merge data based on `primary_key` and `merge_key` hints.
|
||||
* **`table_name`**: Specified when the table name cannot be inferred, e.g., from the resource or the name of the generator function.
|
||||
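For example, a minimal sketch using the pipeline defined above (the `replace` value is only illustrative):

```python
load_info = another_pipeline.run(data, table_name="pokemon", write_disposition="replace")
print(load_info)
```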
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Explore the loaded data**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(1) DuckDB Connection**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Start a connection to your database using a native `duckdb` connection and see which tables were generated:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(another_pipeline):
|
||||
import duckdb
|
||||
|
||||
# A database '<pipeline_name>.duckdb' was created in the working directory, so just connect to it
|
||||
|
||||
# Connect to the DuckDB database
|
||||
conn = duckdb.connect(f"{another_pipeline.pipeline_name}.duckdb")
|
||||
|
||||
# Set search path to the dataset
|
||||
conn.sql(f"SET search_path = '{another_pipeline.dataset_name}'")
|
||||
|
||||
# Describe the dataset
|
||||
conn.sql("DESCRIBE").df()
|
||||
return (conn,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
You can see:
|
||||
- `pokemon` table,
|
||||
|
||||
and 3 special `dlt` tables (we will discuss them later):
|
||||
- `_dlt_loads`,
|
||||
- `_dlt_pipeline_state`,
|
||||
- `_dlt_version`.
|
||||
|
||||
Let's execute a query to get all data from the `pokemon` table:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(conn):
|
||||
# Fetch all data from 'pokemon' as a DataFrame
|
||||
table = conn.sql("SELECT * FROM pokemon").df()
|
||||
|
||||
# Display the DataFrame
|
||||
table
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(2) `dlt`'s [sql_client](https://dlthub.com/docs/general-usage/dataset-access/sql-client)**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Most dlt destinations (even filesystem) use an implementation of the `SqlClientBase` class to connect to the physical destination to which your data is loaded. You can access the SQL client of your destination via the `sql_client` method on your pipeline.
|
||||
|
||||
Start a connection to your database with `pipeline.sql_client()` and execute a query to get all data from the `pokemon` table:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(another_pipeline):
|
||||
# Query data from 'pokemon' using the SQL client
|
||||
with another_pipeline.sql_client() as client:
|
||||
with client.execute_query("SELECT * FROM pokemon") as cursor:
|
||||
data_1 = cursor.df()
|
||||
# Display the data
|
||||
data_1
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(3) dlt [datasets](https://dlthub.com/docs/general-usage/dataset-access/dataset)**
|
||||
|
||||
Here's an example of how to retrieve data from a pipeline and load it into a Pandas DataFrame or a PyArrow Table.
|
||||
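The cell below uses `.df()` to get a pandas DataFrame; as a sketch, the same relation also exposes `.arrow()` if you prefer a PyArrow Table:

```python
dataset = another_pipeline.dataset()
dataset.pokemon.arrow()  # PyArrow Table instead of a pandas DataFrame
```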
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(another_pipeline):
|
||||
dataset = another_pipeline.dataset()
|
||||
dataset.pokemon.df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
# **Exercise 1**
|
||||
|
||||
Using the code from the previous cell, fetch the data from the `pokemon` table into a dataframe and count the number of columns in the table `pokemon`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""**Use this number to answer the question in the Quiz LearnWorlds Form.**"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,7 +6,7 @@
|
||||
"id": "qvMyiV0uMY-7"
|
||||
},
|
||||
"source": [
|
||||
"# **dlt sources and resources**: Create first dlt pipeline. [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)\n"
|
||||
"# **dlt sources and resources**: Create your first dlt pipeline [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -24,12 +24,12 @@
|
||||
"id": "pZCRBANQftVQ"
|
||||
},
|
||||
"source": [
|
||||
"## Recap of [Lesson 1](https://colab.research.google.com/drive/1QwlDWxX5hvwbHMkCgiF0UCzGFRMRoSPY#forceEdit=true&sandboxMode=true) 👩💻🚀\n",
|
||||
"1. Created a pipeline, loaded toy data into DuckDB, and viewed load info.\n",
|
||||
"2. Used `dlt.pipeline` and `pipeline.run` methods.\n",
|
||||
"3. Used DuckDB, `sql_client` and dlt `dataset` to view tables and query data.\n",
|
||||
"## Recap of [Lesson 1](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) 👩💻🚀\n",
|
||||
"1. Created a pipeline, loaded toy data into DuckDB, and viewed the load info.\n",
|
||||
"2. Used the `dlt.pipeline` and `pipeline.run` methods.\n",
|
||||
"3. Queried data and viewed tables with DuckDB, the `sql_client`, and the dlt `dataset`.\n",
|
||||
"\n",
|
||||
"Now we move to the next lesson to learn more details about dlt! 🚀"
|
||||
"Now, let's move on to the next lesson to learn more! 🚀"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -39,18 +39,9 @@
|
||||
},
|
||||
"source": [
|
||||
"**Here, you will learn how to:**\n",
|
||||
"- Run a simple pipeline with different types of data, such as dataframes, databases and RestAPI.\n",
|
||||
"- Run a simple pipeline with different types of data, such as dataframes, databases and REST APIs.\n",
|
||||
"- Use `dlt.resource`, `dlt.source` and `dlt.transformer`.\n",
|
||||
"- Build your first dlt pipeline for RestAPI."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "oaLSnDr9hSxE"
|
||||
},
|
||||
"source": [
|
||||
"## **Install dlt**"
|
||||
"- Build your first dlt pipeline for a REST API."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -142,7 +133,7 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"# Create a dlt resource from the data\n",
|
||||
"@dlt.resource(table_name=\"pokemon_new\") # <--- we set new table name\n",
|
||||
"@dlt.resource(table_name=\"pokemon_new\")\n",
|
||||
"def my_dict_list() -> TDataItems:\n",
|
||||
" yield data"
|
||||
]
|
||||
@@ -156,8 +147,8 @@
|
||||
"Commonly used arguments:\n",
|
||||
"\n",
|
||||
"* **`name`**: The resource name and the name of the table generated by this resource. Defaults to the decorated function name.\n",
|
||||
"* **`table_name`**: the name of the table, if different from the resource name.\n",
|
||||
"* **`write_disposition`**: controls how to write data to a table. Defaults to the value \"append\"."
|
||||
"* **`table_name`**: The name of the table, if different from the resource name.\n",
|
||||
"* **`write_disposition`**: Controls how to write data to a table. Defaults to the value \"append\"."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -232,7 +223,7 @@
|
||||
"source": [
|
||||
"---\n",
|
||||
"### Dataframes\n",
|
||||
"For creating a pipeline using dataframes, you would do:"
|
||||
"To create a pipeline using dataframes, you would do:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -268,11 +259,9 @@
|
||||
},
|
||||
"source": [
|
||||
"---\n",
|
||||
"### Database\n",
|
||||
"### Databases\n",
|
||||
"\n",
|
||||
"For creating a pipeline from an SQL database query you would:\n",
|
||||
"\n",
|
||||
"1. Install the PyMySQL package:"
|
||||
"To create a pipeline from an SQL database query you would:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -293,7 +282,7 @@
|
||||
"id": "ktAAuuJqW792"
|
||||
},
|
||||
"source": [
|
||||
"2. Create and run a pipeline to fetch data from an SQL resource and query the loaded data as follows:"
|
||||
"1. Create and run a pipeline to fetch data from an SQL resource and query the loaded data as follows:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -458,7 +447,7 @@
|
||||
"* The source Python module typically contains optional customizations and data transformations.\n",
|
||||
"* The source Python module typically contains the authentication and pagination code for a particular API.\n",
|
||||
"\n",
|
||||
"Read more about [sources](https://dlthub.com/docs/general-usage/source) and [resources](https://dlthub.com/docs/general-usage/resource) here."
|
||||
"Read more about [sources](https://dlthub.com/docs/general-usage/source) and [resources](https://dlthub.com/docs/general-usage/resource)."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -508,12 +497,12 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a pipeline\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"new_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"resource_source_new\", destination=\"duckdb\", dataset_name=\"all_data\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Run the pipeline\n",
|
||||
"load_info = pipeline.run(all_data())\n",
|
||||
"load_info = new_pipeline.run(all_data())\n",
|
||||
"\n",
|
||||
"# Print load info\n",
|
||||
"print(load_info)"
|
||||
@@ -602,8 +591,13 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@dlt.resource(table_name=\"pokemon\")\n",
|
||||
"def my_dict_list() -> TDataItems:\n",
|
||||
" yield data"
|
||||
"def my_pokemons() -> TDataItems:\n",
|
||||
" pokemons = [\n",
|
||||
" {\"id\": \"1\", \"name\": \"bulbasaur\", \"size\": {\"weight\": 6.9, \"height\": 0.7}},\n",
|
||||
" {\"id\": \"4\", \"name\": \"charmander\", \"size\": {\"weight\": 8.5, \"height\": 0.6}},\n",
|
||||
" {\"id\": \"25\", \"name\": \"pikachu\", \"size\": {\"weight\": 6, \"height\": 0.4}},\n",
|
||||
" ]\n",
|
||||
" yield pokemons"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -623,45 +617,27 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"data = [\n",
|
||||
" {\"id\": \"1\", \"name\": \"bulbasaur\", \"size\": {\"weight\": 6.9, \"height\": 0.7}},\n",
|
||||
" {\"id\": \"4\", \"name\": \"charmander\", \"size\": {\"weight\": 8.5, \"height\": 0.6}},\n",
|
||||
" {\"id\": \"25\", \"name\": \"pikachu\", \"size\": {\"weight\": 6, \"height\": 0.4}},\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Define a resource to read and write data to pokemon table\n",
|
||||
"@dlt.resource(table_name=\"pokemon\")\n",
|
||||
"def my_dict_list() -> TDataItems:\n",
|
||||
" yield data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Define a transformer to enrich pokemon data with additional details\n",
|
||||
"@dlt.transformer(data_from=my_dict_list, table_name=\"detailed_info\")\n",
|
||||
"# NOTE: the `items` argument contains data from the `my_dict_list` resource\n",
|
||||
"@dlt.transformer(data_from=my_pokemons, table_name=\"detailed_info\")\n",
|
||||
"def poke_details(\n",
|
||||
" items: TDataItems,\n",
|
||||
") -> (\n",
|
||||
" TDataItems\n",
|
||||
"): # <--- `items` is a variable and contains data from `my_dict_list` resource\n",
|
||||
") -> TDataItems:\n",
|
||||
" for item in items:\n",
|
||||
" print(\n",
|
||||
" f\"Item: {item}\\n\"\n",
|
||||
" ) # <-- print what data we get from `my_dict_list` source\n",
|
||||
" print(f\"Item: {item}\\n\")\n",
|
||||
"\n",
|
||||
" item_id = item[\"id\"]\n",
|
||||
" url = f\"https://pokeapi.co/api/v2/pokemon/{item_id}\"\n",
|
||||
" response = requests.get(url)\n",
|
||||
" details = response.json()\n",
|
||||
"\n",
|
||||
" print(f\"Details: {details}\\n\") # <--- print what data we get from API\n",
|
||||
" print(f\"Details: {details}\\n\")\n",
|
||||
"\n",
|
||||
" yield details\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Set pipeline name, destination, and dataset name\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"another_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"quick_start\",\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"pokedata\",\n",
|
||||
@@ -687,7 +663,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_info = pipeline.run(poke_details())\n",
|
||||
"load_info = another_pipeline.run(poke_details())\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -709,14 +685,20 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"@dlt.resource(table_name=\"pokemon\")\n",
|
||||
"def my_dict_list() -> TDataItems:\n",
|
||||
" yield from data # <--- This would yield one item at a time\n",
|
||||
"def my_other_pokemons() -> TDataItems:\n",
|
||||
" pokemons = [\n",
|
||||
" {\"id\": \"1\", \"name\": \"bulbasaur\", \"size\": {\"weight\": 6.9, \"height\": 0.7}},\n",
|
||||
" {\"id\": \"4\", \"name\": \"charmander\", \"size\": {\"weight\": 8.5, \"height\": 0.6}},\n",
|
||||
" {\"id\": \"25\", \"name\": \"pikachu\", \"size\": {\"weight\": 6, \"height\": 0.4}},\n",
|
||||
" ]\n",
|
||||
" yield from pokemons\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.transformer(data_from=my_dict_list, table_name=\"detailed_info\")\n",
|
||||
"def details(\n",
|
||||
"# NOTE: Transformer receives one item at a time\n",
|
||||
"@dlt.transformer(data_from=my_other_pokemons, table_name=\"detailed_info\")\n",
|
||||
"def other_poke_details(\n",
|
||||
" data_item: TDataItem,\n",
|
||||
") -> TDataItems: # <--- Transformer receives one item at a time\n",
|
||||
") -> TDataItems:\n",
|
||||
" item_id = data_item[\"id\"]\n",
|
||||
" url = f\"https://pokeapi.co/api/v2/pokemon/{item_id}\"\n",
|
||||
" response = requests.get(url)\n",
|
||||
@@ -725,7 +707,7 @@
|
||||
" yield details\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(details())\n",
|
||||
"load_info = another_pipeline.run(other_poke_details())\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -746,7 +728,8 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_info = pipeline.run(my_dict_list | details)"
|
||||
"load_info = another_pipeline.run(my_pokemons | poke_details)\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -767,7 +750,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Query the 'detailed_info' table and convert the result to a DataFrame\n",
|
||||
"pipeline.dataset().detailed_info.df()"
|
||||
"another_pipeline.dataset().detailed_info.df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -809,7 +792,7 @@
|
||||
},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## **Exercise 1: Create a pipeline for GitHub API - repos endpoint**\n",
|
||||
"## **Exercise 1: Create a pipeline for GitHub API – repos endpoint**\n",
|
||||
"\n",
|
||||
"In this exercise, you'll build a dlt pipeline to fetch data from the GitHub REST API. The goal is to learn how to use `dlt.pipeline`, `dlt.resource`, and `dlt.source` to extract and load data into a destination.\n",
|
||||
"\n",
|
||||
@@ -819,22 +802,22 @@
|
||||
"\n",
|
||||
" Visit the [GitHub REST API Docs](https://docs.github.com/en/rest) to understand the endpoint to [list public repositories](https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28) for an organization:\n",
|
||||
"\n",
|
||||
" GET https://api.github.com/orgs/{org}/repos\n",
|
||||
" `GET https://api.github.com/orgs/{org}/repos`\n",
|
||||
"\n",
|
||||
"2. **Build the Pipeline**\n",
|
||||
"2. **Build the pipeline**\n",
|
||||
"\n",
|
||||
" Write a script to:\n",
|
||||
"\n",
|
||||
" * Fetch repositories for a **dlt-hub** organization.\n",
|
||||
" * Use `dlt.resource` to define the data extraction logic.\n",
|
||||
" * Combine all resources to a single `@dlt.source`.\n",
|
||||
" * Load the data into a DuckDB database.\n",
|
||||
" - Fetch repositories for the **dlt-hub** organization.\n",
|
||||
" - Use `dlt.resource` to define the data extraction logic.\n",
|
||||
" - Combine all resources into a single `@dlt.source`.\n",
|
||||
" - Load the data into a DuckDB database.\n",
|
||||
"\n",
|
||||
"3. **Look at the data**\n",
|
||||
"3. **Inspect the data**\n",
|
||||
"\n",
|
||||
" Use `duckdb` connection, `sql_client` or `pipeline.dataset()`.\n",
|
||||
" Use a `duckdb` connection, `sql_client`, or `pipeline.dataset()`.\n",
|
||||
"\n",
|
||||
"> **Note**: For this exercise you don't need to use Auth and Pagination."
|
||||
"> **Note**: For this exercise you don't need to use authentication or pagination.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -843,7 +826,7 @@
|
||||
"id": "lcBEFsCUuylN"
|
||||
},
|
||||
"source": [
|
||||
"Play with API using requests library:\n"
|
||||
"Play with the API using the requests library:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -853,9 +836,20 @@
|
||||
"collapsed": true,
|
||||
"id": "Ws7JhfPJvRTa"
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31mRunning cells with 'dlt (Python 3.10.0)' requires the ipykernel package.\n",
|
||||
"\u001b[1;31mInstall 'ipykernel' into the Python environment. \n",
|
||||
"\u001b[1;31mCommand: '/Users/anuunchinbat/Documents/GitHub/dlt/.venv/bin/python -m pip install ipykernel -U --force-reinstall'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"from dlt.sources.helpers import requests\n",
|
||||
"\n",
|
||||
"response = requests.get(\"https://api.github.com/orgs/dlt-hub/repos\")\n",
|
||||
"response.json()[0]"
|
||||
@@ -867,7 +861,7 @@
|
||||
"id": "7PUyt5LAXEMY"
|
||||
},
|
||||
"source": [
|
||||
"In the code snippet below you will find an **example** for the **`events`** endpoint:"
|
||||
"In the code snippet below, you will find an **example** for the **`events`** endpoint:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -889,20 +883,23 @@
|
||||
" yield response.json()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# here is your code\n",
|
||||
"print(\"build the `github_repos` resource here\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.source\n",
|
||||
"def github_data() -> Iterable[DltResource]:\n",
|
||||
" return (github_events,) # github_repos\n",
|
||||
" return (github_events,)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(\"return your new resource as part of the source above\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Set pipeline name, destination, and dataset name\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"github_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"github_pipeline\", destination=\"duckdb\", dataset_name=\"github_data\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(github_data())\n",
|
||||
"load_info = github_pipeline.run(github_data())\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -913,7 +910,7 @@
|
||||
},
|
||||
"source": [
|
||||
"### Question\n",
|
||||
"How many columns has the `github_repos` table? Use `duckdb` connection, `sql_client` or `pipeline.dataset()`."
|
||||
"How many columns has the `github_repos` table? Use a `duckdb` connection, `sql_client` or `pipeline.dataset()`."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -922,14 +919,15 @@
|
||||
"id": "mYfeMBI82Tg0"
|
||||
},
|
||||
"source": [
|
||||
"## **Exercise 2: Create a pipeline for GitHub API - stargazers endpoint**\n",
|
||||
"## **Exercise 2: Create a pipeline for the GitHub API – stargazers endpoint**\n",
|
||||
"\n",
|
||||
"Create a `dlt.transformer` for the \"stargazers\" endpoint\n",
|
||||
"https://api.github.com/repos/OWNER/REPO/stargazers for `dlt-hub` organization.\n",
|
||||
"Create a `dlt.transformer` for the **\"stargazers\"** endpoint \n",
|
||||
"`https://api.github.com/repos/OWNER/REPO/stargazers` for the `dlt-hub` organization.\n",
|
||||
"\n",
|
||||
"Use `github_repos` resource as a main resource for the transformer:\n",
|
||||
"1. Get all `dlt-hub` repositories.\n",
|
||||
"2. Feed these repository names to dlt transformer and get all stargazers for all `dlt-hub` repositories."
|
||||
"Use the `github_repos` resource as the main resource for the transformer:\n",
|
||||
"\n",
|
||||
"1. Get all repositories in the `dlt-hub` organization. \n",
|
||||
"2. Feed these repository names into the `dlt` transformer and retrieve all stargazers for all `dlt-hub` repositories.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -940,7 +938,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# here is your code"
|
||||
"print(\"YOUR CODE GOES HERE\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -950,7 +948,7 @@
|
||||
},
|
||||
"source": [
|
||||
"### Question\n",
|
||||
"How many columns has the `github_stargazer` table? Use `duckdb` connection, `sql_client` or `pipeline.dataset()`."
|
||||
"How many columns has the `github_stargazer` table? Use a `duckdb` connection, `sql_client` or `pipeline.dataset()`."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -959,7 +957,7 @@
|
||||
"id": "NYbccmLie1zm"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1-jVNzMJTRYHhbRlXgGFlhMwdML1L9zMx#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)!"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -970,11 +968,13 @@
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "dlt",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
"name": "python",
|
||||
"version": "3.10.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -0,0 +1,745 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "pymysql",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""# **dlt sources and resources**: Create your first dlt pipeline [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Recap of [Lesson 1](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) 👩💻🚀
|
||||
1. Created a pipeline, loaded toy data into DuckDB, and viewed the load info.
|
||||
2. Used the `dlt.pipeline` and `pipeline.run` methods.
|
||||
3. Queried data and viewed tables with DuckDB, the `sql_client`, and the dlt `dataset`.
|
||||
|
||||
Now, let's move on to the next lesson to learn more! 🚀
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**Here, you will learn how to:**
|
||||
- Run a simple pipeline with different types of data, such as dataframes, databases and REST APIs.
|
||||
- Use `dlt.resource`, `dlt.source` and `dlt.transformer`.
|
||||
- Build your first dlt pipeline for a REST API.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **`dlt` resources**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### List of dicts
|
||||
|
||||
|
||||
In the previous lesson, we simply used a list of dictionaries that essentially represents the `pokemon` table.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import dlt
|
||||
|
||||
# Sample data containing pokemon details
data = [
{"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}},
{"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}},
{"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}},
]
# Set pipeline name, destination, and dataset name
pipeline = dlt.pipeline(
pipeline_name="quick_start", destination="duckdb", dataset_name="mydata"
)
# Run the pipeline with data and table name
_load_info = pipeline.run(data, table_name="pokemon")
print(_load_info)
|
||||
return data, dlt, pipeline
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""A better way is to wrap it in the `@dlt.resource` decorator which denotes a logical grouping of data within a data source, typically holding data of similar structure and origin:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data, dlt):
|
||||
from dlt.common.typing import TDataItems, TDataItem
|
||||
|
||||
# Create a dlt resource from the data
@dlt.resource(table_name="pokemon_new")
def my_dict_list() -> TDataItems:
yield data
|
||||
return TDataItem, TDataItems, my_dict_list
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Commonly used arguments:
|
||||
|
||||
* **`name`**: The resource name and the name of the table generated by this resource. Defaults to the decorated function name.
|
||||
* **`table_name`**: The name of the table, if different from the resource name.
|
||||
* **`write_disposition`**: Controls how to write data to a table. Defaults to the value "append".
|
||||
""")
|
||||
return
|
||||
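# --- Added example (not part of the original notebook) ----------------------------
# A minimal sketch that spells out the three arguments listed above in one resource.
# The resource name "pokemon_resource", the table "pokemon_custom" and the "replace"
# write disposition are illustrative choices, not values taken from the lesson.
@app.cell
def _(TDataItems, data, dlt, pipeline):
    @dlt.resource(
        name="pokemon_resource",  # resource name (defaults to the decorated function name)
        table_name="pokemon_custom",  # table to write to, if different from the resource name
        write_disposition="replace",  # overwrite the table on every run instead of appending
    )
    def resource_with_args() -> TDataItems:
        yield data

    print(pipeline.run(resource_with_args()))
    return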
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""> **Why is it a better way?** This allows you to use `dlt` functionalities to the fullest that follow Data Engineering best practices, including incremental loading and data contracts."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Try running the pipeline with the `my_dict_list` resource:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(my_dict_list, pipeline):
|
||||
# Run the pipeline and print load info
|
||||
_load_info = pipeline.run(my_dict_list)
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Check what was loaded to the `pokemon_new` table:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
pipeline.dataset().pokemon_new.df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Instead of a dict list, the data could also be:
|
||||
- dataframe
|
||||
- database query response
|
||||
- API request response
|
||||
- Anything you can transform into JSON/dict format
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### Dataframes
|
||||
To create a pipeline using dataframes, you would do:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, dlt, pipeline):
|
||||
import pandas as pd
|
||||
|
||||
# Define a resource to load data from a CSV
@dlt.resource(table_name="df_data")
def my_df() -> TDataItems:
sample_df = pd.read_csv(
"https://people.sc.fsu.edu/~jburkardt/data/csv/hw_200.csv"
)
yield sample_df

# Run the pipeline with the defined resource
_load_info = pipeline.run(my_df)
print(_load_info)

# Query the loaded data from 'df_data'
pipeline.dataset().df_data.df()
|
||||
return (my_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### Databases
|
||||
|
||||
To create a pipeline from an SQL database query you would:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""1. Create and run a pipeline to fetch data from an SQL resource and query the loaded data as follows:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, dlt, pipeline):
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
@dlt.resource(table_name="genome_data")
|
||||
def get_genome_data() -> TDataItems:
|
||||
engine = create_engine(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
|
||||
)
|
||||
with engine.connect() as conn:
|
||||
query = "SELECT * FROM genome LIMIT 1000"
|
||||
rows = conn.execution_options(yield_per=100).exec_driver_sql(query)
|
||||
yield from map(lambda row: dict(row._mapping), rows)
|
||||
|
||||
_load_info = pipeline.run(get_genome_data)
|
||||
print(_load_info)
|
||||
pipeline.dataset().genome_data.df()
|
||||
return (get_genome_data,)
|
||||
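# --- Added note (not part of the original notebook) -------------------------------
# dlt also ships a ready-made `sql_database` source that reflects tables and handles
# chunking for you. A sketch, assuming the current core import path and that a plain
# SQLAlchemy connection string is accepted; unlike the manual query above, this loads
# the entire `genome` table.
@app.cell
def _(pipeline):
    from dlt.sources.sql_database import sql_database

    rfam_source = sql_database(
        "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
    ).with_resources("genome")

    print(pipeline.run(rfam_source))
    return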
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### REST API
|
||||
|
||||
For REST API endpoints, create a pipeline as follows:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, dlt, pipeline):
|
||||
from dlt.sources.helpers import requests
|
||||
|
||||
# Define a resource to fetch pokemons from PokeAPI
@dlt.resource(table_name="pokemon_api")
def get_pokemon() -> TDataItems:
url = "https://pokeapi.co/api/v2/pokemon"
response = requests.get(url)
yield response.json()["results"]

# Run the pipeline using the defined resource
_load_info = pipeline.run(get_pokemon)
print(_load_info)

# Query the loaded data from 'pokemon_api' table
pipeline.dataset().pokemon_api.df()
|
||||
return get_pokemon, requests
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Try loading everything above, in a single pipeline:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(get_genome_data, get_pokemon, my_df, pipeline):
|
||||
_load_info = pipeline.run([my_df, get_genome_data, get_pokemon])
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Check which new tables were created:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
# List all table names from the database
|
||||
with pipeline.sql_client() as client:
|
||||
with client.execute_query(
|
||||
"SELECT table_name FROM information_schema.tables"
|
||||
) as table:
|
||||
print(table.df())
|
||||
return
|
||||
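# --- Added example (not part of the original notebook) ----------------------------
# The same table listing through a raw DuckDB connection, the third option mentioned
# in the exercises. This assumes dlt's default DuckDB destination layout, where the
# "quick_start" pipeline writes to a local "quick_start.duckdb" file.
@app.cell
def _():
    import duckdb

    conn = duckdb.connect("quick_start.duckdb")
    print(conn.sql("SELECT table_schema, table_name FROM information_schema.tables").df())
    conn.close()
    return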
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **`dlt` sources**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Now that there are multiple `dlt` resources, each corresponding to a separate table, we can group them into a `dlt` source."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
A source is a logical grouping of resources, e.g., endpoints of a single API. The most common approach is to define it in a separate Python module.
|
||||
|
||||
* A source is a function decorated with `@dlt.source` that returns one or more resources.
|
||||
* A source can optionally define a schema with tables, columns, performance hints, and more.
|
||||
* The source Python module typically contains optional customizations and data transformations.
|
||||
* The source Python module typically contains the authentication and pagination code for a particular API.
|
||||
|
||||
Read more about [sources](https://dlthub.com/docs/general-usage/source) and [resources](https://dlthub.com/docs/general-usage/resource).
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
You declare a source by decorating a function that returns or yields one or more resources with `@dlt.source`.
|
||||
|
||||
Here's how it's done:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, get_genome_data, get_pokemon, my_df):
|
||||
from typing import Iterable
|
||||
from dlt.extract import DltResource
|
||||
|
||||
@dlt.source
|
||||
def all_data() -> Iterable[DltResource]:
|
||||
return my_df, get_genome_data, get_pokemon
|
||||
return DltResource, Iterable, all_data
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Only using the source above, load everything into a separate database using a new pipeline:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(all_data, dlt):
|
||||
# Create a pipeline
new_pipeline = dlt.pipeline(
pipeline_name="resource_source_new",
destination="duckdb",
dataset_name="all_data",
)
# Run the pipeline
_load_info = new_pipeline.run(all_data())
# Print load info
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
> **Why does this matter?**
|
||||
- It is more efficient than running your resources separately.
|
||||
- It organizes both your schema and your code. 🙂
|
||||
- It enables the option for parallelization.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **`dlt` transformers**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
We now know that `dlt` resources can be grouped into a `dlt` source, represented as:
|
||||
|
||||
|
||||
```
|
||||
Source
|
||||
/ \
|
||||
Resource 1 ... Resource N
|
||||
|
||||
```
|
||||
|
||||
However, imagine a scenario where you need an additional step in between:
|
||||
|
||||
```
|
||||
Source
|
||||
/ \
|
||||
step \
|
||||
/ \
|
||||
Resource 1 ... Resource N
|
||||
|
||||
```
|
||||
|
||||
This step could arise, for example, in a situation where:
|
||||
|
||||
- Resource 1 returns a list of pokemon IDs, and you need to use each of those IDs to retrieve detailed information about each pokemon from a separate API endpoint.
|
||||
|
||||
In such cases, you would use `dlt` transformers — special `dlt` resources that can be fed data from another resource:
|
||||
|
||||
```
|
||||
Source
|
||||
/ \
|
||||
Transformer \
|
||||
/ \
|
||||
Resource 1 ... Resource N
|
||||
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Let’s assume Resource 1 is:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, dlt):
|
||||
@dlt.resource(table_name="pokemon")
|
||||
def my_pokemons() -> TDataItems:
|
||||
pokemons = [
|
||||
{"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}},
|
||||
{"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}},
|
||||
{"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}},
|
||||
]
|
||||
yield pokemons
|
||||
return (my_pokemons,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""We need to get detailed information about pokemons from [PokeAPI](https://pokeapi.co/) `"https://pokeapi.co/api/v2/pokemon/{id}"` based on their IDs. We would do:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, dlt, my_pokemons, requests):
|
||||
# Define a transformer to enrich pokemon data with additional details
|
||||
# NOTE: the `items` argument contains data from the `my_pokemons` resource
|
||||
@dlt.transformer(data_from=my_pokemons, table_name="detailed_info")
|
||||
def poke_details(
|
||||
items: TDataItems,
|
||||
) -> TDataItems:
|
||||
for item in items:
|
||||
print(f"Item: {item}\n")
|
||||
|
||||
item_id = item["id"]
|
||||
url = f"https://pokeapi.co/api/v2/pokemon/{item_id}"
|
||||
response = requests.get(url)
|
||||
details = response.json()
|
||||
|
||||
print(f"Details: {details}\n")
|
||||
|
||||
yield details
|
||||
|
||||
# Set pipeline name, destination, and dataset name
|
||||
another_pipeline = dlt.pipeline(
|
||||
pipeline_name="quick_start",
|
||||
destination="duckdb",
|
||||
dataset_name="pokedata",
|
||||
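# NOTE (added): dev_mode=True makes dlt work with a fresh dataset on every run
# (a datetime suffix is appended to the dataset name), so repeated demo runs start clean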
dev_mode=True,
|
||||
)
|
||||
return another_pipeline, poke_details
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Run the pipeline:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(another_pipeline, poke_details):
|
||||
_load_info = another_pipeline.run(poke_details())
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Alternatively, we could do:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItem, TDataItems, another_pipeline, dlt, requests):
|
||||
@dlt.resource(table_name="pokemon")
|
||||
def my_other_pokemons() -> TDataItems:
|
||||
pokemons = [
|
||||
{"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}},
|
||||
{"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}},
|
||||
{"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}},
|
||||
]
|
||||
yield from pokemons
|
||||
|
||||
# NOTE: Transformer receives one item at a time
@dlt.transformer(data_from=my_other_pokemons, table_name="detailed_info")
def other_poke_details(data_item: TDataItem) -> TDataItems:
item_id = data_item["id"]
url = f"https://pokeapi.co/api/v2/pokemon/{item_id}"
response = requests.get(url)
details = response.json()
|
||||
yield details
|
||||
|
||||
_load_info = another_pipeline.run(other_poke_details())
|
||||
print(_load_info)
|
||||
return
|
||||
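# --- Added illustration (not part of the original notebook) -----------------------
# Plain Python showing why the two variants above differ: `yield data` hands the
# downstream transformer the whole list at once, `yield from data` hands it one dict
# at a time.
@app.cell
def _():
    _items = [{"id": "1"}, {"id": "4"}, {"id": "25"}]

    def as_one_batch():
        yield _items  # a single value: the entire list

    def one_by_one():
        yield from _items  # three values: one dict per iteration

    print(list(as_one_batch()))  # [[{'id': '1'}, {'id': '4'}, {'id': '25'}]]
    print(list(one_by_one()))  # [{'id': '1'}, {'id': '4'}, {'id': '25'}]
    return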
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""You can also use pipe instead of `data_from`, this is useful when you want to apply `dlt.transformer` to multiple `dlt.resources`:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(another_pipeline, my_pokemons, poke_details):
|
||||
_load_info = another_pipeline.run(my_pokemons | poke_details)
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Check the loaded data:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(another_pipeline):
|
||||
# Query the 'detailed_info' table and convert the result to a DataFrame
|
||||
another_pipeline.dataset().detailed_info.df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Reduce the nesting level of generated tables**
|
||||
You can limit how deep dlt goes when generating nested tables and flattening dicts into columns. By default, the library will descend and generate nested tables for all nested lists, without limit.
|
||||
|
||||
You can set the nesting level for all resources at the source level:
|
||||
|
||||
```python
|
||||
@dlt.source(max_table_nesting=1)
|
||||
def all_data():
|
||||
return my_df, get_genome_data, get_pokemon
|
||||
```
|
||||
|
||||
or for each resource separately:
|
||||
|
||||
```python
|
||||
@dlt.resource(table_name='pokemon_new', max_table_nesting=1)
|
||||
def my_dict_list():
|
||||
yield data
|
||||
```
|
||||
|
||||
In the example above, we want only 1 level of nested tables to be generated (so there are no nested tables of a nested table). Typical settings:
|
||||
|
||||
* `max_table_nesting=0` will not generate nested tables and will not flatten dicts into columns at all. All nested data will be represented as JSON.
|
||||
* `max_table_nesting=1` will generate nested tables of root tables and nothing more. All nested data in nested tables will be represented as JSON.
|
||||
""")
|
||||
return
|
||||
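# --- Added example (not part of the original notebook) ----------------------------
# A quick check of `max_table_nesting` on a throwaway pipeline: with the nesting level
# set to 0, the nested "size" dict is kept as a single JSON column instead of being
# flattened into size__weight / size__height. The pipeline and table names below are
# illustrative.
@app.cell
def _(TDataItems, data, dlt):
    @dlt.resource(table_name="pokemon_flat_json", max_table_nesting=0)
    def pokemon_as_json() -> TDataItems:
        yield data

    nesting_pipeline = dlt.pipeline(
        pipeline_name="nesting_demo", destination="duckdb", dataset_name="nesting_demo"
    )
    print(nesting_pipeline.run(pokemon_as_json()))
    print(nesting_pipeline.dataset().pokemon_flat_json.df().columns.tolist())
    return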
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Exercise 1: Create a pipeline for GitHub API – repos endpoint**
|
||||
|
||||
In this exercise, you'll build a dlt pipeline to fetch data from the GitHub REST API. The goal is to learn how to use `dlt.pipeline`, `dlt.resource`, and `dlt.source` to extract and load data into a destination.
|
||||
|
||||
## Instructions
|
||||
|
||||
1. **Explore the GitHub API**
|
||||
|
||||
Visit the [GitHub REST API Docs](https://docs.github.com/en/rest) to understand the endpoint to [list public repositories](https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28) for an organization:
|
||||
|
||||
`GET https://api.github.com/orgs/{org}/repos`
|
||||
|
||||
2. **Build the pipeline**
|
||||
|
||||
Write a script to:
|
||||
|
||||
- Fetch repositories for the **dlt-hub** organization.
|
||||
- Use `dlt.resource` to define the data extraction logic.
|
||||
- Combine all resources into a single `@dlt.source`.
|
||||
- Load the data into a DuckDB database.
|
||||
|
||||
3. **Inspect the data**
|
||||
|
||||
Use a `duckdb` connection, `sql_client`, or `pipeline.dataset()`.
|
||||
|
||||
> **Note**: For this exercise you don't need to use authentication or pagination.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Play with the API using the requests library:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(requests):
|
||||
response = requests.get("https://api.github.com/orgs/dlt-hub/repos")
|
||||
response.json()[0]
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""In the code snippet below, you will find an **example** for the **`events`** endpoint:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(DltResource, Iterable, TDataItems, dlt, requests):
|
||||
@dlt.resource
|
||||
def github_events() -> TDataItems:
|
||||
url = "https://api.github.com/orgs/dlt-hub/events"
|
||||
response = requests.get(url)
|
||||
yield response.json()
|
||||
|
||||
print("build the `github_repos` resource here")
|
||||
|
||||
@dlt.source
|
||||
def github_data() -> Iterable[DltResource]:
|
||||
return (github_events,)
|
||||
|
||||
print("return your new resource as part of the source above")
|
||||
github_pipeline = dlt.pipeline(
|
||||
pipeline_name="github_pipeline",
|
||||
destination="duckdb",
|
||||
dataset_name="github_data",
|
||||
)
|
||||
_load_info = github_pipeline.run(github_data())
|
||||
print(_load_info)
|
||||
return
|
||||
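# --- Added sketch (not part of the original notebook) -----------------------------
# One possible shape for the `github_repos` resource asked for in Exercise 1. Without
# pagination it only returns the first page of repositories, which is enough to get
# started; the pipeline name below is illustrative.
@app.cell
def _(TDataItems, dlt, requests):
    @dlt.resource
    def github_repos() -> TDataItems:
        response = requests.get("https://api.github.com/orgs/dlt-hub/repos")
        yield response.json()

    repos_pipeline = dlt.pipeline(
        pipeline_name="github_repos_sketch",
        destination="duckdb",
        dataset_name="github_data",
    )
    print(repos_pipeline.run(github_repos()))
    return (github_repos,)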
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Question
|
||||
How many columns does the `github_repos` table have? Use a `duckdb` connection, `sql_client`, or `pipeline.dataset()`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **Exercise 2: Create a pipeline for the GitHub API – stargazers endpoint**
|
||||
|
||||
Create a `dlt.transformer` for the **"stargazers"** endpoint
|
||||
`https://api.github.com/repos/OWNER/REPO/stargazers` for the `dlt-hub` organization.
|
||||
|
||||
Use the `github_repos` resource as the main resource for the transformer:
|
||||
|
||||
1. Get all repositories in the `dlt-hub` organization.
|
||||
2. Feed these repository names into the `dlt` transformer and retrieve all stargazers for all `dlt-hub` repositories.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
print("YOUR CODE GOES HERE")
|
||||
return
|
||||
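# --- Added sketch (not part of the original notebook) -----------------------------
# One possible shape for Exercise 2, feeding repository names from the `github_repos`
# sketch above into a transformer. Without authentication you may hit GitHub's rate
# limit quickly; the pipeline name below is illustrative.
@app.cell
def _(TDataItems, dlt, github_repos, requests):
    @dlt.transformer(data_from=github_repos, table_name="github_stargazer")
    def github_stargazers(repos: TDataItems) -> TDataItems:
        for repo in repos:
            response = requests.get(
                f"https://api.github.com/repos/dlt-hub/{repo['name']}/stargazers"
            )
            yield response.json()

    stargazer_pipeline = dlt.pipeline(
        pipeline_name="github_stargazers_sketch",
        destination="duckdb",
        dataset_name="github_data",
    )
    print(stargazer_pipeline.run(github_stargazers()))
    return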
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Question
|
||||
How many columns does the `github_stargazer` table have? Use a `duckdb` connection, `sql_client`, or `pipeline.dataset()`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,13 +6,13 @@
|
||||
"id": "MfQUdpVg2Trs"
|
||||
},
|
||||
"source": [
|
||||
"# **Recap of [Lesson 2](https://colab.research.google.com/drive/1tc94GvIoYXmYrjUibDhY_9iPR5zA0Eyw#forceEdit=true&sandboxMode=true) 👩💻🚀**\n",
|
||||
"# **Recap of [Lesson 2](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) 👩💻🚀**\n",
|
||||
"\n",
|
||||
"1. Used `@dlt.resource` to load and query data like lists, dataframes, and REST API responses into DuckDB. \n",
|
||||
"1. Used `@dlt.resource` to load and query data such as lists, dataframes, and REST API responses into DuckDB. \n",
|
||||
"2. Grouped multiple resources into a single `@dlt.source` for better organization and efficiency. \n",
|
||||
"3. Used `@dlt.transformer` to process and enrich data between resources. \n",
|
||||
"\n",
|
||||
"Next: Dive deeper into building dlt pipelines using pagination, authentication and dlt configuration! 🚀"
|
||||
"Next: We'll dive deeper into building dlt pipelines using pagination, authentication, and dlt configuration! 🚀"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -23,16 +23,16 @@
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"# **Pagination & Authentication & dlt Configuration** 🤫🔩 [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)\n",
|
||||
"# **Pagination & Authentication & dlt Configuration** 🤫🔩 [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"**Here, you will learn how to:**\n",
|
||||
"- Use pagination for RestAPIs.\n",
|
||||
"- Use environment variables to handle both secrets & configs.\n",
|
||||
"**In this lesson, you will learn how to:**\n",
|
||||
"- Use pagination for REST APIs.\n",
|
||||
"- Use environment variables to manage both secrets & configs.\n",
|
||||
"- Add values to `secrets.toml` or `config.toml`.\n",
|
||||
"\n",
|
||||
"To read more about credentails refer to [dlt documentation](https://dlthub.com/docs/general-usage/credentials/) here."
|
||||
"To learn more about credentials, refer to the [dlt documentation](https://dlthub.com/docs/general-usage/credentials/)."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -41,7 +41,7 @@
|
||||
"id": "aAN9q0Kz0tt_"
|
||||
},
|
||||
"source": [
|
||||
"In previous lesson we loaded data from GitHub API to DuckDB,"
|
||||
"In the previous lesson, we loaded data from the GitHub API to DuckDB,"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -53,7 +53,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture\n",
|
||||
"!pip install dlt"
|
||||
"!pip install \"dlt[duckdb]\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -78,14 +78,14 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"# define dlt pipeline\n",
|
||||
"pipeline = dlt.pipeline(destination=\"duckdb\")\n",
|
||||
"_pipeline = dlt.pipeline(destination=\"duckdb\")\n",
|
||||
"\n",
|
||||
"# run dlt pipeline\n",
|
||||
"load_info = pipeline.run(github_events)\n",
|
||||
"load_info = _pipeline.run(github_events)\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"# explore loaded data\n",
|
||||
"pipeline.dataset().github_events.df()"
|
||||
"_pipeline.dataset().github_events.df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -94,9 +94,9 @@
|
||||
"id": "GtyMwBig37uK"
|
||||
},
|
||||
"source": [
|
||||
"You could notice that we received only 1 page, only 30 records. But this endpoint has muuuch more records in total. To get all the pages you should use a pagination.\n",
|
||||
"You may notice we received only one page — just 30 records — even though this endpoint has many more.\n",
|
||||
"\n",
|
||||
"When working with APIs like GitHub, data is often returned in pages. Pagination allows you to retrieve all the data when an endpoint limits how much can be fetched at once."
|
||||
"To fetch everything, enable pagination: many APIs (like GitHub) return results in pages and limit how much you can retrieve per request, so paginating lets you iterate through all pages to collect the full dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -114,14 +114,16 @@
|
||||
"id": "BolhMQE10Zgk"
|
||||
},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## **Pagination**\n",
|
||||
"\n",
|
||||
"GitHub has very good documentation, so it is not difficult to go through the documentation and find the relevant page: [Pagination.](https://docs.github.com/en/rest/using-the-rest-api/using-pagination-in-the-rest-api?apiVersion=2022-11-28)\n",
|
||||
"GitHub provides excellent documentation, making it easy to find the relevant section on [Pagination.](https://docs.github.com/en/rest/using-the-rest-api/using-pagination-in-the-rest-api?apiVersion=2022-11-28)\n",
|
||||
"\n",
|
||||
"It says:\n",
|
||||
">You can use the `link` header from the response to request additional pages of data.\n",
|
||||
"It explains that:\n",
|
||||
"\n",
|
||||
">The link header contains URLs that you can use to fetch additional pages of results. For example, the previous, next, first, and last page of results."
|
||||
">You can use the `Link` header from the response to request additional pages of data.\n",
|
||||
"\n",
|
||||
">The `Link` header contains URLs that let you fetch other pages of results — for example, the previous, next, first, and last pages."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -130,7 +132,7 @@
|
||||
"id": "iU-xQriAHJI2"
|
||||
},
|
||||
"source": [
|
||||
"**GitHub API Pagination example**\n",
|
||||
"**GitHub API Pagination Example**\n",
|
||||
"\n",
|
||||
"The GitHub API provides the `per_page` and `page` query parameters:\n",
|
||||
"\n",
|
||||
@@ -146,8 +148,6 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"response = requests.get(\"https://api.github.com/orgs/dlt-hub/events?per_page=10&page=1\")\n",
|
||||
"response.headers"
|
||||
]
|
||||
@@ -158,7 +158,7 @@
|
||||
"id": "ZdDGuAVJ4Qqo"
|
||||
},
|
||||
"source": [
|
||||
"Gotcha! We can see 'Link' in the headers. To get this link we can alternatively use `response.links`:"
|
||||
"Got it! We can see the `Link` field in the response headers. Alternatively, you can access it directly using `response.links`:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -169,8 +169,6 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"response = requests.get(\"https://api.github.com/orgs/dlt-hub/events?per_page=10&page=1\")\n",
|
||||
"response.links"
|
||||
]
|
||||
@@ -183,18 +181,17 @@
|
||||
"source": [
|
||||
"### **dlt RESTClient**\n",
|
||||
"\n",
|
||||
"The response includes a 'Link' header for navigating to the next page.\n",
|
||||
"So now we can implement a pagination!\n",
|
||||
"Now that we know how pagination works conceptually, let’s see how to implement it efficiently!\n",
|
||||
"\n",
|
||||
"When working with APIs, you could implement pagination using only Python and the requests library. While this approach works, it often requires writing boilerplate code for tasks like managing authentication, constructing URLs, and handling pagination logic.\n",
|
||||
"When working with APIs, you could implement pagination using only Python and the `requests` library. While this approach works, it often requires writing boilerplate code for tasks like managing authentication, constructing URLs, and handling pagination logic.\n",
|
||||
"\n",
|
||||
"More about how to build pagination with Python and `requests`:\n",
|
||||
"Learn more about building pagination with Python and `requests`:\n",
|
||||
"\n",
|
||||
"* [Link 1](https://farnamdata.com/api-pagination)\n",
|
||||
"\n",
|
||||
"* [Link 2](https://www.klamp.io/blog/python-requests-pagination-for-efficient-data-retrieval)\n",
|
||||
"\n",
|
||||
"**But!** In this lesson, we’re gonna use dlt's **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** to handle pagination seamlessly when working with REST APIs like GitHub.\n",
|
||||
"**But!** In this lesson, we’re going to use dlt's **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** to handle pagination seamlessly when working with REST APIs like GitHub.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"**Why use RESTClient?**\n",
|
||||
@@ -208,9 +205,9 @@
|
||||
"This reduces boilerplate code and lets you focus on your data pipeline logic.\n",
|
||||
"\n",
|
||||
"**Here’s how to fetch paginated data:**\n",
|
||||
"1. Import RESTClient\n",
|
||||
"2. Create the RESTClient instance\n",
|
||||
"3. Use the `paginate` method to iterate through all pages of data."
|
||||
"1. Import `RESTClient`\n",
|
||||
"2. Create a `RESTClient` instance\n",
|
||||
"3. Use the `paginate` method to iterate through all pages of data"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -238,7 +235,7 @@
|
||||
"id": "yNB8jyz5Kmo1"
|
||||
},
|
||||
"source": [
|
||||
"Pagination type was detected automatically, but you can explicitly provide it:"
|
||||
"☝️ The pagination type was detected automatically, but you can also specify it explicitly:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -249,7 +246,6 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from dlt.sources.helpers.rest_client import RESTClient\n",
|
||||
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
|
||||
"\n",
|
||||
"client = RESTClient(\n",
|
||||
@@ -264,7 +260,7 @@
|
||||
"id": "_jNBmv1qkUhk"
|
||||
},
|
||||
"source": [
|
||||
"The full list of available paginators you can see in offcial [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#paginators).\n"
|
||||
"The full list of available paginators is in the official [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#paginators).\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -278,13 +274,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "Dqi7NQtqhfeb"
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The events endpoint does not have as much data, specially if you compare it with the stargazers endpoint for the dlt repo.\n",
|
||||
"The events endpoint doesn’t contain as much data, especially compared to the stargazers endpoint of the dlt repository.\n",
|
||||
"\n",
|
||||
"If you run the pipeline for stargazers endpoint, there is a high chance that you face the **rate limit error**."
|
||||
"If you run the pipeline for the stargazers endpoint, there's a high chance that you'll face a **rate limit error**."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -295,13 +289,6 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from dlt.sources.helpers.rest_client import RESTClient\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"client = RESTClient(\n",
|
||||
" base_url=\"https://api.github.com\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for page in client.paginate(\"repos/dlt-hub/dlt/stargazers\"):\n",
|
||||
" print(page)"
|
||||
]
|
||||
@@ -324,24 +311,22 @@
|
||||
"id": "iKUgNTKuiP6w"
|
||||
},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## **Authentication**\n",
|
||||
"\n",
|
||||
"To avoid this error you can use [GitHub API Authentication](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28):\n",
|
||||
"To avoid the **rate limit error** you can use [GitHub API Authentication](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28):\n",
|
||||
"\n",
|
||||
"1. Login to your GitHub account.\n",
|
||||
"2. Generate [API token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) (classic one!).\n",
|
||||
"2. Use it as an access token for GitHub API."
|
||||
"2. Generate an [API token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) (classic).\n",
|
||||
"2. Use it as an access token for the GitHub API."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "-7ZHBjYspQxt"
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**! ATTENTION !**\n",
|
||||
"\n",
|
||||
"Never share your credentials in public and never hard-code them in your code. Use **environment variables** or **dlt secrets.toml**."
|
||||
"> **! ATTENTION !**\n",
|
||||
"> Never share your credentials publicly and never hard-code them in your code. Use **environment variables, files** or dlt's **secrets.toml**."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -350,11 +335,18 @@
|
||||
"id": "UB02kiI8ncYm"
|
||||
},
|
||||
"source": [
|
||||
"Create an environment variable for your access token.\n",
|
||||
"Create an environment variable for your access token in Colab.\n",
|
||||
"\n",
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In Molab, simply click on the `Secrets` section in the left-side menu and add your access token."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -375,7 +367,7 @@
|
||||
"id": "6bdNZJ0HqY4O"
|
||||
},
|
||||
"source": [
|
||||
"So now you can use `access_token` variable in the code below:"
|
||||
"Use the `access_token` variable in the code below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -386,13 +378,12 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from dlt.sources.helpers.rest_client import RESTClient\n",
|
||||
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"client = RESTClient(\n",
|
||||
" base_url=\"https://api.github.com\",\n",
|
||||
" auth=BearerTokenAuth(token=access_token), # <--- put your token here\n",
|
||||
" auth=BearerTokenAuth(token=access_token),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for page in client.paginate(\"repos/dlt-hub/dlt/stargazers\"):\n",
|
||||
@@ -406,7 +397,7 @@
|
||||
"id": "D7-rTvYvr05t"
|
||||
},
|
||||
"source": [
|
||||
"So now we can rewrite our GitHub dlt pipeline using the RestAPI Client and `access_token`."
|
||||
"Let's rewrite our GitHub dlt pipeline using the RestAPI Client and the `access_token`."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -418,7 +409,6 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import dlt\n",
|
||||
"from dlt.sources.helpers import requests\n",
|
||||
"from dlt.sources.helpers.rest_client import RESTClient\n",
|
||||
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
|
||||
"\n",
|
||||
@@ -435,16 +425,16 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"# define new dlt pipeline\n",
|
||||
"pipeline = dlt.pipeline(destination=\"duckdb\")\n",
|
||||
"_pipeline = dlt.pipeline(destination=\"duckdb\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# run the pipeline with the new resource\n",
|
||||
"load_info = pipeline.run(github_stargazers)\n",
|
||||
"load_info = _pipeline.run(github_stargazers)\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# explore loaded data\n",
|
||||
"pipeline.dataset().github_stargazers.df()"
|
||||
"_pipeline.dataset().github_stargazers.df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -462,6 +452,7 @@
|
||||
"id": "SxpBIZZ_yE8R"
|
||||
},
|
||||
"source": [
|
||||
"---\n",
|
||||
"## **dlt configuration and secrets**\n",
|
||||
"\n",
|
||||
"In dlt, [configurations and secrets](https://dlthub.com/docs/general-usage/credentials/) are essential for setting up data pipelines.\n",
|
||||
@@ -470,15 +461,13 @@
|
||||
"\n",
|
||||
"On the other hand, **secrets** are **sensitive** data like passwords, API keys, and private keys, which should never be hard-coded to avoid security risks.\n",
|
||||
"\n",
|
||||
"These can be set up in various ways:\n",
|
||||
"Both can be set up in various ways:\n",
|
||||
"\n",
|
||||
"* Environment variables\n",
|
||||
"* As environment variables\n",
|
||||
"* Within code using `dlt.secrets` and `dlt.config`\n",
|
||||
"* Configuration files (`secrets.toml` and `config.toml`)\n",
|
||||
"* Via configuration files (`secrets.toml` and `config.toml`)\n",
|
||||
"\n",
|
||||
"We're gonna use `dlt.secrets.value` to define credentials in resources and sources. dlt automatically **extracts** configuration settings and secrets based on flexible naming conventions. It then **injects** these values where needed in code.\n",
|
||||
"\n",
|
||||
"**Note**: It's important to note that while you can put all configurations and credentials in the `dlt.secrets` (or `secrets.toml`) if it's more convenient, credentials cannot be placed in `dlt.config` (or `config.toml`) because dlt doesn't look for them there.\n"
|
||||
"> **Note**: While you can store both configurations and credentials in `dlt.secrets` (or `secrets.toml`) if that’s more convenient, credentials cannot be placed in `dlt.config` (or `config.toml`) because dlt does not read them from there."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -487,9 +476,9 @@
|
||||
"id": "64JM2Lnlxyoa"
|
||||
},
|
||||
"source": [
|
||||
"Let's create dlt pipeline for both endpoints: `repos/dlt-hub/dlt/stargazers` and `orgs/dlt-hub/events`.\n",
|
||||
"Let's create a dlt pipeline for both endpoints: `repos/dlt-hub/dlt/stargazers` and `orgs/dlt-hub/events`.\n",
|
||||
"\n",
|
||||
"We'll use `@dlt.source` to combine all resources in one place."
|
||||
"We'll use `@dlt.source` to group both resources."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -533,7 +522,7 @@
|
||||
"id": "0h3ugsRiLhfv"
|
||||
},
|
||||
"source": [
|
||||
"Now we'll use `dlt.secrets.value` in our source to enable dlt secrets configuration. Rename `access_token` variable to `secret_key` because it's already defined.\n",
|
||||
"Now, we'll use `dlt.secrets.value` in our source, enabling dlt's automatic secrets resolution. Note that we first reset all environment variables to demonstrate what happens if dlt tries to resolve a non-existing variable:\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
@@ -545,7 +534,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"exit() # we use exit() to reset all ENVs we set"
|
||||
"os.environ.clear()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -559,7 +548,6 @@
|
||||
"from typing import Iterable\n",
|
||||
"import dlt\n",
|
||||
"from dlt.extract import DltResource\n",
|
||||
"from dlt.sources.helpers import requests\n",
|
||||
"from dlt.sources.helpers.rest_client import RESTClient\n",
|
||||
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
|
||||
"from dlt.common.typing import TDataItems\n",
|
||||
@@ -568,7 +556,7 @@
|
||||
"@dlt.source\n",
|
||||
"def github_source(\n",
|
||||
" access_token=dlt.secrets.value,\n",
|
||||
") -> Iterable[DltResource]: # <--- set the secret variable \"access_token\" here\n",
|
||||
") -> Iterable[DltResource]:\n",
|
||||
" client = RESTClient(\n",
|
||||
" base_url=\"https://api.github.com\", auth=BearerTokenAuth(token=access_token)\n",
|
||||
" )\n",
|
||||
@@ -592,7 +580,7 @@
|
||||
"id": "H-wNVUqfuD37"
|
||||
},
|
||||
"source": [
|
||||
"Configs are defined in a similar way but are accessed using `dlt.config.value`. However, since configuration variables are internally managed by `dlt`, it is unlikely that you would need to explicitly use `dlt.config.value` in most cases."
|
||||
"> Configs are defined in a similar way but are accessed using `dlt.config.value`. However, since configuration variables are internally managed by `dlt`, it is unlikely that you would need to explicitly use `dlt.config.value` in most cases."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -601,7 +589,7 @@
|
||||
"id": "shfeHo-vOcD1"
|
||||
},
|
||||
"source": [
|
||||
"If you run the pipeline with `secret_key` as `dlt.secrets.value`, you will see the following error:"
|
||||
"If you now run the pipeline, you will see the following error:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -613,11 +601,11 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# define new dlt pipeline\n",
|
||||
"pipeline = dlt.pipeline(destination=\"duckdb\")\n",
|
||||
"_pipeline = dlt.pipeline(destination=\"duckdb\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# run the pipeline with the new resource\n",
|
||||
"load_info = pipeline.run(github_source())\n",
|
||||
"load_info = _pipeline.run(github_source())\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -627,9 +615,9 @@
|
||||
"id": "GCmqzzo7OpgE"
|
||||
},
|
||||
"source": [
|
||||
"^ That is what happens if you set `dlt.secrets.value` for any variable in your dlt pipeline, but don't set the secret value up.\n",
|
||||
"That’s what happens when you use `dlt.secrets.value` for a variable in your pipeline but haven’t actually set the secret value.\n",
|
||||
"\n",
|
||||
"dlt is looking for secrets in following formats:\n",
|
||||
"When this occurs, dlt searches for the missing secret across different possible locations and naming formats, as shown below:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"ConfigFieldMissingException: Following fields are missing: ['access_token'] in configuration with spec GithubSourceConfiguration\n",
|
||||
@@ -654,10 +642,10 @@
|
||||
"id": "Ox08B2V5NCaH"
|
||||
},
|
||||
"source": [
|
||||
"To define `access_token` secret value we can use:\n",
|
||||
"To define the `access_token` secret value, we can use (as mentioned earlier):\n",
|
||||
"\n",
|
||||
"1. `dlt.secrets` in code (recommended for secret vaults or dynamic creds)\n",
|
||||
"2. Environment variables (recomnended for prod)\n",
|
||||
"2. Environment variables (recommended for prod)\n",
|
||||
"3. `secrets.toml` file (recommended for local dev)"
|
||||
]
|
||||
},
|
||||
@@ -669,7 +657,7 @@
|
||||
"source": [
|
||||
"### **Use `dlt.secrets` in code**\n",
|
||||
"\n",
|
||||
"You can easily rewrite your secret right in the Python code. It's especially convenient if you take credentials from third-party secret providers, or if you want to update credentials and configs dinamically."
|
||||
"You can easily set or update your secrets directly in Python code. This is especially convenient when retrieving credentials from third-party secret managers or when you need to update secrets and configurations dynamically."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -680,17 +668,15 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"dlt.secrets[\"access_token\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"\n",
|
||||
"# define new dlt pipeline\n",
|
||||
"pipeline = dlt.pipeline(destination=\"duckdb\")\n",
|
||||
"\n",
|
||||
"github_pipeline = dlt.pipeline(destination=\"duckdb\")\n",
|
||||
"\n",
|
||||
"# run the pipeline with the new resource\n",
|
||||
"load_info = pipeline.run(github_source())\n",
|
||||
"load_info = github_pipeline.run(github_source())\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -700,7 +686,7 @@
|
||||
"id": "GNghaiYwSBGm"
|
||||
},
|
||||
"source": [
|
||||
"Alternatively you can set:\n",
|
||||
"Alternatively, you can set:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"dlt.secrets[\"sources.access_token\"] = userdata.get('SECRET_KEY')\n",
|
||||
@@ -748,7 +734,7 @@
|
||||
"id": "Adi1RZmOvVzj"
|
||||
},
|
||||
"source": [
|
||||
"### **Exercise 2: Run pipeline with `dlt.secrets.value`**\n",
|
||||
"### **Exercise 2: Run a pipeline with `dlt.secrets.value`**\n",
|
||||
"\n",
|
||||
"Explore the cells above and answer the question below using `sql_client`.\n",
|
||||
"\n",
|
||||
@@ -763,10 +749,9 @@
|
||||
"id": "fQlOIe46ncYm"
|
||||
},
|
||||
"source": [
|
||||
"---\n",
|
||||
"### **Use environment variables**\n",
|
||||
"\n",
|
||||
"Let's set ENV in the one of the dlt formats: `ACCESS_TOKEN`.\n"
|
||||
"Let's explicitly set the environment variable for our access token in one of the formats dlt accepts: `ACCESS_TOKEN`.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -777,17 +762,16 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"os.environ[\"ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"\n",
|
||||
"# define new dlt pipeline\n",
|
||||
"pipeline = dlt.pipeline(destination=\"duckdb\")\n",
|
||||
"_pipeline = dlt.pipeline(destination=\"duckdb\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# run the pipeline with the new resource\n",
|
||||
"load_info = pipeline.run(github_source())\n",
|
||||
"load_info = _pipeline.run(github_source())\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -797,7 +781,9 @@
|
||||
"id": "ppEFU1hJPU6c"
|
||||
},
|
||||
"source": [
|
||||
"Alternatively you can set:\n",
|
||||
"Alternatively, you can set:\n",
|
||||
"\n",
|
||||
"> `userdata.get()` is Colab-specific.\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"os.environ[\"SOURCES__ACCESS_TOKEN\"] = userdata.get('SECRET_KEY')\n",
|
||||
@@ -831,7 +817,6 @@
|
||||
"id": "l7Y1oCAvJ79I"
|
||||
},
|
||||
"source": [
|
||||
"---\n",
|
||||
"### **Use dlt `secrets.toml` or `config.toml`**\n"
|
||||
]
|
||||
},
|
||||
@@ -841,7 +826,7 @@
|
||||
"id": "mNzCp5BGpDSh"
|
||||
},
|
||||
"source": [
|
||||
"> Please note that Colab is not well-suited for using `secrets.toml` or `config.toml` files. As a result, these sections will provide instructions rather than code cells, detailing how to use them in a local environment. You should test this functionality on your own machine. For Colab, it is recommended to use environment variables instead."
|
||||
"> Note that Colab is not well-suited for using `secrets.toml` or `config.toml` files. As a result, these sections will provide instructions rather than code cells, detailing how to use them in a local environment. You should test this functionality on your own machine. For Colab, it is recommended to use environment variables instead."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -862,7 +847,7 @@
|
||||
"└── my_pipeline.py\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Read more about adding [credentials](https://dlthub.com/docs/walkthroughs/add_credentials) here."
|
||||
"Read more about adding [credentials](https://dlthub.com/docs/walkthroughs/add_credentials)."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -871,7 +856,7 @@
|
||||
"id": "6bTyl229sadQ"
|
||||
},
|
||||
"source": [
|
||||
"To set credentials via TOMLs you would first add your access token to `secrets.toml`:\n",
|
||||
"To set credentials via the toml files, you would first add your access token to `secrets.toml`:\n",
|
||||
"\n",
|
||||
"```toml\n",
|
||||
"# .dlt/secrets.toml\n",
|
||||
@@ -889,13 +874,13 @@
|
||||
},
|
||||
"source": [
|
||||
"\n",
|
||||
"Alternatively you can set:\n",
|
||||
"Alternatively, you can set:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"[sources]\n",
|
||||
"secret_key = \"your_access_token\"\n",
|
||||
"```\n",
|
||||
"is equal to:\n",
|
||||
"which is equal to:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"secret_key = \"your_access_token\"\n",
|
||||
@@ -907,7 +892,7 @@
|
||||
"[sources.____main____]\n",
|
||||
"secret_key = \"your_access_token\"\n",
|
||||
"```\n",
|
||||
"and to:\n",
|
||||
"as well as:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"[sources.____main____.github_source]\n",
|
||||
@@ -922,11 +907,11 @@
|
||||
},
|
||||
"source": [
|
||||
"\n",
|
||||
"### **Configure Secrets in Colab**\n",
|
||||
"### **Configure secrets in Colab**\n",
|
||||
"\n",
|
||||
"You can configure secrets using **Secrets** sidebar. Just create a variable with the name `secrets.toml` and paste the content of the toml file from your `.dlt` folder into it. We support `config.toml` variable as well.\n",
|
||||
"You can configure secrets using the **Secrets** sidebar. Just create a variable with the name `secrets.toml` and paste the content of the toml file from your `.dlt` folder into it. We support `config.toml` variable as well.\n",
|
||||
"\n",
|
||||
"Open **Secrets** sidebar, press \"Add new secret\", create variable with name `secrets.toml` and copy-paste secrets in Value field and Enable it:\n",
|
||||
"Open the **Secrets** sidebar, press `Add new secret`, create a variable with name `secrets.toml` and copy-paste secrets in the `Value` field and click `Enable`:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"[sources]\n",
|
||||
@@ -934,7 +919,7 @@
|
||||
"```\n",
|
||||
"\n",
|
||||
"\n",
|
||||
">dlt will not reload the secrets automatically. **Please restart your interpreter** in Colab options when you add/change content of the variables above."
|
||||
">dlt will not reload the secrets automatically. **Restart your interpreter** in Colab options when you add/change the variables above."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -952,17 +937,8 @@
|
||||
"id": "NYbccmLie1zm"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1mfqZulsuFDc7h27d6joe2_Dduvl1uM-2#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "_7dLATtZkdQl"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@@ -0,0 +1,773 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt[duckdb]",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **Recap of [Lesson 2](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) 👩💻🚀**
|
||||
|
||||
1. Used `@dlt.resource` to load and query data such as lists, dataframes, and REST API responses into DuckDB.
|
||||
2. Grouped multiple resources into a single `@dlt.source` for better organization and efficiency.
|
||||
3. Used `@dlt.transformer` to process and enrich data between resources.
|
||||
|
||||
Next: We'll dive deeper into building dlt pipelines using pagination, authentication, and dlt configuration! 🚀
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# **Pagination & Authentication & dlt Configuration** 🤫🔩 [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)
|
||||
|
||||
|
||||
|
||||
**In this lesson, you will learn how to:**
|
||||
- Use pagination for REST APIs.
|
||||
- Use environment variables to manage both secrets & configs.
|
||||
- Add values to `secrets.toml` or `config.toml`.
|
||||
|
||||
To learn more about credentials, refer to the [dlt documentation](https://dlthub.com/docs/general-usage/credentials/).
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""In the previous lesson, we loaded data from the GitHub API to DuckDB,"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import dlt
|
||||
from dlt.sources.helpers import requests
|
||||
from dlt.common.typing import TDataItems
|
||||
|
||||
    # define a dlt resource
    @dlt.resource
|
||||
def github_events() -> TDataItems:
|
||||
url = "https://api.github.com/orgs/dlt-hub/events"
|
||||
_response = requests.get(url)
|
||||
yield _response.json()
|
||||
|
||||
    # define and run the dlt pipeline
    _pipeline = dlt.pipeline(destination="duckdb")
|
||||
_load_info = _pipeline.run(github_events)
|
||||
print(_load_info)
|
||||
    # explore loaded data
|
||||
_pipeline.dataset().github_events.df()
|
||||
return TDataItems, dlt, requests
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
You may notice we received only one page — just 30 records — even though this endpoint has many more.
|
||||
|
||||
To fetch everything, enable pagination: many APIs (like GitHub) return results in pages and limit how much you can retrieve per request, so paginating lets you iterate through all pages to collect the full dataset.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Pagination**
|
||||
|
||||
    GitHub provides excellent documentation, making it easy to find the relevant section on [pagination](https://docs.github.com/en/rest/using-the-rest-api/using-pagination-in-the-rest-api?apiVersion=2022-11-28).
|
||||
|
||||
It explains that:
|
||||
|
||||
>You can use the `Link` header from the response to request additional pages of data.
|
||||
|
||||
>The `Link` header contains URLs that let you fetch other pages of results — for example, the previous, next, first, and last pages.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**GitHub API Pagination Example**
|
||||
|
||||
The GitHub API provides the `per_page` and `page` query parameters:
|
||||
|
||||
* `per_page`: The number of records per page (up to 100).
|
||||
* `page`: The page number to retrieve.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(requests):
|
||||
_response = requests.get(
|
||||
"https://api.github.com/orgs/dlt-hub/events?per_page=10&page=1"
|
||||
)
|
||||
_response.headers
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Got it! We can see the `Link` field in the response headers. Alternatively, you can access it directly using `response.links`:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(requests):
|
||||
_response = requests.get(
|
||||
"https://api.github.com/orgs/dlt-hub/events?per_page=10&page=1"
|
||||
)
|
||||
_response.links
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **dlt RESTClient**
|
||||
|
||||
Now that we know how pagination works conceptually, let’s see how to implement it efficiently!
|
||||
|
||||
When working with APIs, you could implement pagination using only Python and the `requests` library. While this approach works, it often requires writing boilerplate code for tasks like managing authentication, constructing URLs, and handling pagination logic.
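    For a sense of that boilerplate, here is a rough, illustrative sketch (the helper name is made up for this example and is not part of the lesson's pipeline) of following GitHub's `Link` header by hand with plain `requests`:

    ```python
    import requests

    def fetch_all_pages(first_url: str) -> list:
        # Collect items from every page by following the `next` link in the response headers.
        items = []
        url = first_url
        while url:
            response = requests.get(url)
            response.raise_for_status()
            items.extend(response.json())
            # `response.links` parses the Link header; there is no "next" entry on the last page
            url = response.links.get("next", {}).get("url")
        return items

    events = fetch_all_pages("https://api.github.com/orgs/dlt-hub/events?per_page=100")
    ```

    dlt's `RESTClient` (shown below) wraps this kind of loop for you, together with authentication and error handling.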
|
||||
|
||||
Learn more about building pagination with Python and `requests`:
|
||||
|
||||
* [Link 1](https://farnamdata.com/api-pagination)
|
||||
|
||||
* [Link 2](https://www.klamp.io/blog/python-requests-pagination-for-efficient-data-retrieval)
|
||||
|
||||
**But!** In this lesson, we’re going to use dlt's **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** to handle pagination seamlessly when working with REST APIs like GitHub.
|
||||
|
||||
|
||||
**Why use RESTClient?**
|
||||
|
||||
RESTClient is part of dlt's helpers, making it easier to interact with REST APIs by managing repetitive tasks such as:
|
||||
|
||||
* Authentication
|
||||
* Query parameter handling
|
||||
* Pagination
|
||||
|
||||
This reduces boilerplate code and lets you focus on your data pipeline logic.
|
||||
|
||||
**Here’s how to fetch paginated data:**
|
||||
1. Import `RESTClient`
|
||||
2. Create a `RESTClient` instance
|
||||
3. Use the `paginate` method to iterate through all pages of data
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
from dlt.sources.helpers.rest_client import RESTClient
|
||||
|
||||
client = RESTClient(base_url="https://api.github.com")
|
||||
for _page in client.paginate("orgs/dlt-hub/events"):
|
||||
print(_page)
|
||||
return (RESTClient,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""☝️ The pagination type was detected automatically, but you can also specify it explicitly:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(RESTClient):
|
||||
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator
|
||||
|
||||
client_1 = RESTClient(
|
||||
base_url="https://api.github.com", paginator=HeaderLinkPaginator()
|
||||
)
|
||||
return (client_1,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""The full list of available paginators is in the official [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#paginators)."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
    The events endpoint doesn't return that much data, especially compared to the stargazers endpoint of the dlt repository.
|
||||
|
||||
If you run the pipeline for the stargazers endpoint, there's a high chance that you'll face a **rate limit error**.
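    Before triggering it, you can optionally check how much of your quota is left. This is just a side check (endpoint and response shape per GitHub's public rate-limit API):

    ```python
    import requests

    # Unauthenticated requests share a small per-IP hourly quota (60 at the time of writing).
    response = requests.get("https://api.github.com/rate_limit")
    core = response.json()["resources"]["core"]
    print(f"{core['remaining']} requests remaining; quota resets at unix time {core['reset']}")
    ```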
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(client_1):
|
||||
for _page in client_1.paginate("repos/dlt-hub/dlt/stargazers"):
|
||||
print(_page)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Exercise 1: Pagination with RESTClient**
|
||||
Explore the cells above and answer the question below.
|
||||
#### Question
|
||||
What type of pagination should we use for the GitHub API?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Authentication**
|
||||
|
||||
To avoid the **rate limit error** you can use [GitHub API Authentication](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28):
|
||||
|
||||
    1. Log in to your GitHub account.
|
||||
2. Generate an [API token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) (classic).
|
||||
    3. Use it as an access token for the GitHub API.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
> **! ATTENTION !**
|
||||
> Never share your credentials publicly and never hard-code them in your code. Use **environment variables, files** or dlt's **secrets.toml**.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Create an environment variable for your access token in Colab.
|
||||
|
||||

|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""In Molab, simply click on the `Secrets` section in the left-side menu and add your access token."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import os
|
||||
|
||||
access_token = os.getenv("SECRET_KEY")
|
||||
return access_token, os
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Use the `access_token` variable in the code below:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(RESTClient, access_token):
|
||||
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
|
||||
|
||||
client_2 = RESTClient(
|
||||
base_url="https://api.github.com", auth=BearerTokenAuth(token=access_token)
|
||||
)
|
||||
for _page in client_2.paginate("repos/dlt-hub/dlt/stargazers"):
|
||||
print(_page)
|
||||
break
|
||||
return (BearerTokenAuth,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Let's rewrite our GitHub dlt pipeline using the RestAPI Client and the `access_token`."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(BearerTokenAuth, RESTClient, TDataItems, access_token, dlt):
|
||||
@dlt.resource
|
||||
def github_stargazers() -> TDataItems:
|
||||
client = RESTClient(
|
||||
base_url="https://api.github.com", auth=BearerTokenAuth(token=access_token)
|
||||
)
|
||||
for _page in client.paginate("repos/dlt-hub/dlt/stargazers"):
|
||||
yield _page
|
||||
|
||||
_pipeline = dlt.pipeline(destination="duckdb")
|
||||
_load_info = _pipeline.run(github_stargazers)
|
||||
print(_load_info)
|
||||
_pipeline.dataset().github_stargazers.df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""You can see that all dlt [stargazers](https://github.com/dlt-hub/dlt/stargazers) were loaded into the DuckDB destination."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **dlt configuration and secrets**
|
||||
|
||||
In dlt, [configurations and secrets](https://dlthub.com/docs/general-usage/credentials/) are essential for setting up data pipelines.
|
||||
|
||||
**Configurations** are **non-sensitive** settings that define the behavior of a data pipeline, including file paths, database hosts, timeouts, API URLs, and performance settings.
|
||||
|
||||
On the other hand, **secrets** are **sensitive** data like passwords, API keys, and private keys, which should never be hard-coded to avoid security risks.
|
||||
|
||||
Both can be set up in various ways:
|
||||
|
||||
* As environment variables
|
||||
* Within code using `dlt.secrets` and `dlt.config`
|
||||
* Via configuration files (`secrets.toml` and `config.toml`)
|
||||
|
||||
> **Note**: While you can store both configurations and credentials in `dlt.secrets` (or `secrets.toml`) if that’s more convenient, credentials cannot be placed in `dlt.config` (or `config.toml`) because dlt does not read them from there.
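    As a minimal sketch of that split (the config key shown is just an example of a non-sensitive runtime setting):

    ```python
    import dlt

    # non-sensitive settings -> dlt.config (or config.toml)
    dlt.config["runtime.log_level"] = "INFO"

    # sensitive values -> dlt.secrets (or secrets.toml); never hard-code them
    dlt.secrets["sources.access_token"] = "<your token>"
    ```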
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Let's create a dlt pipeline for both endpoints: `repos/dlt-hub/dlt/stargazers` and `orgs/dlt-hub/events`.
|
||||
|
||||
We'll use `@dlt.source` to group both resources.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(BearerTokenAuth, RESTClient, TDataItems, access_token, dlt):
|
||||
from typing import Iterable
|
||||
from dlt.extract import DltResource
|
||||
|
||||
@dlt.source
|
||||
def github_source() -> Iterable[DltResource]:
|
||||
client = RESTClient(
|
||||
base_url="https://api.github.com", auth=BearerTokenAuth(token=access_token)
|
||||
)
|
||||
|
||||
@dlt.resource
|
||||
def github_events() -> TDataItems:
|
||||
for _page in client.paginate("orgs/dlt-hub/events"):
|
||||
yield _page
|
||||
|
||||
@dlt.resource
|
||||
def github_stargazers() -> TDataItems:
|
||||
for _page in client.paginate("repos/dlt-hub/dlt/stargazers"):
|
||||
yield _page
|
||||
|
||||
return (github_events, github_stargazers)
|
||||
return DltResource, Iterable
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Now, we'll use `dlt.secrets.value` in our source, enabling dlt's automatic secrets resolution. Note that we first reset all environment variables to demonstrate what happens if dlt tries to resolve a non-existing variable:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(os):
|
||||
os.environ.clear()
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(BearerTokenAuth, DltResource, Iterable, RESTClient, TDataItems, dlt):
|
||||
@dlt.source
|
||||
def github_source_1(access_token=dlt.secrets.value) -> Iterable[DltResource]:
|
||||
client = RESTClient(
|
||||
base_url="https://api.github.com", auth=BearerTokenAuth(token=access_token)
|
||||
)
|
||||
|
||||
@dlt.resource
|
||||
def github_events() -> TDataItems:
|
||||
for _page in client.paginate("orgs/dlt-hub/events"):
|
||||
yield _page
|
||||
|
||||
@dlt.resource
|
||||
def github_stargazers() -> TDataItems:
|
||||
for _page in client.paginate("repos/dlt-hub/dlt/stargazers"):
|
||||
yield _page
|
||||
|
||||
return (github_events, github_stargazers)
|
||||
return (github_source_1,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""> Configs are defined in a similar way but are accessed using `dlt.config.value`. However, since configuration variables are internally managed by `dlt`, it is unlikely that you would need to explicitly use `dlt.config.value` in most cases."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""If you now run the pipeline, you will see the following error:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, github_source_1):
|
||||
_pipeline = dlt.pipeline(destination="duckdb")
|
||||
_load_info = _pipeline.run(github_source_1())
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
That’s what happens when you use `dlt.secrets.value` for a variable in your pipeline but haven’t actually set the secret value.
|
||||
|
||||
When this occurs, dlt searches for the missing secret across different possible locations and naming formats, as shown below:
|
||||
|
||||
```python
|
||||
ConfigFieldMissingException: Following fields are missing: ['access_token'] in configuration with spec GithubSourceConfiguration
|
||||
for field "access_token" config providers and keys were tried in following order:
|
||||
In Environment Variables key DLT_COLAB_KERNEL_LAUNCHER__SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN was not found.
|
||||
In Environment Variables key DLT_COLAB_KERNEL_LAUNCHER__SOURCES____MAIN____ACCESS_TOKEN was not found.
|
||||
In Environment Variables key DLT_COLAB_KERNEL_LAUNCHER__SOURCES__ACCESS_TOKEN was not found.
|
||||
In Environment Variables key DLT_COLAB_KERNEL_LAUNCHER__ACCESS_TOKEN was not found.
|
||||
In Environment Variables key SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN was not found.
|
||||
In Environment Variables key SOURCES____MAIN____ACCESS_TOKEN was not found.
|
||||
In Environment Variables key SOURCES__ACCESS_TOKEN was not found.
|
||||
In Environment Variables key ACCESS_TOKEN was not found.
|
||||
WARNING: dlt looks for .dlt folder in your current working directory and your cwd (/content) is different from directory of your pipeline script (/usr/local/lib/python3.10/dist-packages).
|
||||
If you keep your secret files in the same folder as your pipeline script but run your script from some other folder, secrets/configs will not be found
|
||||
Please refer to https://dlthub.com/docs/general-usage/credentials for more information
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
To define the `access_token` secret value, we can use (as mentioned earlier):
|
||||
|
||||
1. `dlt.secrets` in code (recommended for secret vaults or dynamic creds)
|
||||
2. Environment variables (recommended for prod)
|
||||
3. `secrets.toml` file (recommended for local dev)
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Use `dlt.secrets` in code**
|
||||
|
||||
You can easily set or update your secrets directly in Python code. This is especially convenient when retrieving credentials from third-party secret managers or when you need to update secrets and configurations dynamically.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(access_token, dlt, github_source_1):
|
||||
dlt.secrets["access_token"] = os.getenv("SECRET_KEY")
|
||||
github_pipeline = dlt.pipeline(destination="duckdb")
|
||||
_load_info = github_pipeline.run(github_source_1())
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Alternatively, you can set:
|
||||
|
||||
```python
|
||||
dlt.secrets["sources.access_token"] = userdata.get('SECRET_KEY')
|
||||
dlt.secrets["sources.____main____.access_token"] = userdata.get('SECRET_KEY')
|
||||
dlt.secrets["sources.____main____.github_source.access_token"] = userdata.get('SECRET_KEY')
|
||||
...
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
* `sources` is a special word;
|
||||
|
||||
    * `__main__` is the Python module name;
|
||||
|
||||
    * `github_source` is the source function name;
|
||||
|
||||
* `access_token` is the secret variable name.
|
||||
|
||||
|
||||
So dlt looks for secrets according to this hierarchy:
|
||||
```
|
||||
pipeline_name
|
||||
|
|
||||
|-sources
|
||||
|
|
||||
|-<module name>
|
||||
|
|
||||
|-<source function 1 name>
|
||||
|
|
||||
|- secret variable 1
|
||||
|- secret variable 2
|
||||
```
|
||||
|
||||
To keep the **naming convention** flexible, dlt looks for a lot of **possible combinations** of key names, starting from the most specific possible path. Then, if the value is not found, it removes the right-most section and tries again.
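    For `access_token` in this notebook, that search roughly walks the following config paths, from most specific to least specific (mirroring the environment-variable list in the error message above):

    ```
    sources.____main____.github_source.access_token
    sources.____main____.access_token
    sources.access_token
    access_token
    ```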
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Exercise 2: Run a pipeline with `dlt.secrets.value`**
|
||||
|
||||
Explore the cells above and answer the question below using `sql_client`.
|
||||
|
||||
#### Question
|
||||
|
||||
Who has id=`17202864` in the `stargazers` table? Use `sql_client`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Use environment variables**
|
||||
|
||||
Let's explicitly set the environment variable for our access token in one of the formats dlt accepts: `ACCESS_TOKEN`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(access_token, dlt, github_source_1, os):
|
||||
os.environ["ACCESS_TOKEN"] = os.getenv("SECRET_KEY")
|
||||
_pipeline = dlt.pipeline(destination="duckdb")
|
||||
_load_info = _pipeline.run(github_source_1())
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Alternatively, you can set:
|
||||
|
||||
> `userdata.get()` is Colab-specific.
|
||||
|
||||
```python
|
||||
os.environ["SOURCES__ACCESS_TOKEN"] = userdata.get('SECRET_KEY')
|
||||
os.environ["SOURCES____MAIN____ACCESS_TOKEN"] = userdata.get('SECRET_KEY')
|
||||
os.environ["SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN"] = userdata.get('SECRET_KEY')
|
||||
...
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**How does it work?**
|
||||
|
||||
`dlt` **automatically extracts** configuration settings and secrets based on flexible naming conventions.
|
||||
|
||||
It then **injects** these values where needed in functions decorated with `@dlt.source`, `@dlt.resource`, or `@dlt.destination`.
|
||||
|
||||
|
||||
>dlt uses a specific naming hierarchy to search for the secrets and config values. This makes configurations and secrets easy to manage.
|
||||
>
|
||||
    > The naming convention for **environment variables** in dlt follows a specific pattern: all names are **capitalized** and sections are separated with a **double underscore** (`__`), e.g. `SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN`.
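    If it helps, the mapping from the notebook's module and function names to such a key is roughly mechanical, as this tiny illustrative helper (not part of dlt) shows; it also makes clear where the runs of underscores come from (the module name `__main__` plus the `__` separator):

    ```python
    def to_env_key(path: str) -> str:
        # "sources.__main__.github_source.access_token" -> "SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN"
        return "__".join(path.split(".")).upper()

    print(to_env_key("sources.__main__.github_source.access_token"))
    ```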
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### **Use dlt `secrets.toml` or `config.toml`**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""> Note that Colab is not well-suited for using `secrets.toml` or `config.toml` files. As a result, these sections will provide instructions rather than code cells, detailing how to use them in a local environment. You should test this functionality on your own machine. For Colab, it is recommended to use environment variables instead."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The `secrets.toml` file - along with the `config.toml` file - should be stored in the `.dlt` directory where your pipeline code is located:
|
||||
|
||||
```
|
||||
/your_project_directory
|
||||
│
|
||||
├── .dlt
|
||||
│ ├── secrets.toml
|
||||
│ └── config.toml
|
||||
│
|
||||
└── my_pipeline.py
|
||||
```
|
||||
|
||||
Read more about adding [credentials](https://dlthub.com/docs/walkthroughs/add_credentials).
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
To set credentials via the toml files, you would first add your access token to `secrets.toml`:
|
||||
|
||||
```toml
|
||||
# .dlt/secrets.toml
|
||||
|
||||
[sources]
|
||||
secret_key = "your_access_token"
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Alternatively, you can set:
|
||||
|
||||
```
|
||||
[sources]
|
||||
secret_key = "your_access_token"
|
||||
```
|
||||
which is equal to:
|
||||
|
||||
```
|
||||
secret_key = "your_access_token"
|
||||
```
|
||||
|
||||
and to:
|
||||
|
||||
```
|
||||
[sources.____main____]
|
||||
secret_key = "your_access_token"
|
||||
```
|
||||
as well as:
|
||||
|
||||
```
|
||||
[sources.____main____.github_source]
|
||||
secret_key = "your_access_token"
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Configure secrets in Colab**
|
||||
|
||||
You can configure secrets using the **Secrets** sidebar. Just create a variable with the name `secrets.toml` and paste the content of the toml file from your `.dlt` folder into it. We support `config.toml` variable as well.
|
||||
|
||||
Open the **Secrets** sidebar, press `Add new secret`, create a variable with name `secrets.toml` and copy-paste secrets in the `Value` field and click `Enable`:
|
||||
|
||||
```
|
||||
[sources]
|
||||
secret_key = "your_access_token"
|
||||
```
|
||||
|
||||
|
||||
>dlt will not reload the secrets automatically. **Restart your interpreter** in Colab options when you add/change the variables above.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,14 +6,14 @@
|
||||
"id": "yTmIgQKpV355"
|
||||
},
|
||||
"source": [
|
||||
"# **Recap of [Lesson 3](https://colab.research.google.com/drive/1-jVNzMJTRYHhbRlXgGFlhMwdML1L9zMx#forceEdit=true&sandboxMode=true) 👩💻🚀**\n",
|
||||
"# **Recap of [Lesson 3](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) 👩💻🚀**\n",
|
||||
"\n",
|
||||
"1. Used pagination for RestAPIs.\n",
|
||||
"2. Used authentication for RestAPIs.\n",
|
||||
"3. Tried dlt RESTClient.\n",
|
||||
"4. Used environment variables to handle both secrets & configs.\n",
|
||||
"5. Learned how to add values to `secrets.toml` or `config.toml`.\n",
|
||||
"6. Used `secrets.toml` ENV variable special for Colab."
|
||||
"1. Used pagination with REST APIs. \n",
|
||||
"2. Applied authentication for REST APIs. \n",
|
||||
"3. Tried the dlt `RESTClient`. \n",
|
||||
"4. Used environment variables to manage secrets and configuration. \n",
|
||||
"5. Learned how to add values to `secrets.toml` and `config.toml`. \n",
|
||||
"6. Used the special `secrets.toml` environment variable setup for Colab."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -23,22 +23,21 @@
|
||||
},
|
||||
"source": [
|
||||
"---\n",
|
||||
"# **`dlt`’s pre-built Sources and Destinations** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)\n",
|
||||
"\n",
|
||||
"# **`dlt`’s pre-built Sources and Destinations** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"**Here, you will learn:**\n",
|
||||
"- How to initialize verified sources;\n",
|
||||
"- Built-in `rest_api` source.\n",
|
||||
"- Built-in `sql_database` source.\n",
|
||||
"- Built-in `filesystem` source.\n",
|
||||
"- How to initialize verified sources.\n",
|
||||
"- The built-in `rest_api` source.\n",
|
||||
"- The built-in `sql_database` source.\n",
|
||||
"- The built-in `filesystem` source.\n",
|
||||
"- How to switch between destinations.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"Our verified sources are the simplest way to get started with building your stack. Choose from any of our fully customizable 30+ pre-built sources, such as any SQL database, Google Sheets, Salesforce and others.\n",
|
||||
"Our verified sources are the simplest way to start building your stack. Choose from any of our fully customizable 30+ pre-built sources, such as SQL databases, Google Sheets, Salesforce, and more.\n",
|
||||
"\n",
|
||||
"With our numerous destinations you can load data to a local database, warehouse or a data lake. Choose from Snowflake, Databricks and more."
|
||||
"With our numerous destinations, you can load data into a local database, data warehouse, or data lake. Choose from Snowflake, Databricks, and many others."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -76,15 +75,6 @@
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "cNs9mHKaEaTE"
|
||||
},
|
||||
"source": [
|
||||
"### Step 0: Install dlt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -125,7 +115,7 @@
|
||||
"source": [
|
||||
"This command shows all available verified sources and their short descriptions. For each source, it checks if your local `dlt` version requires an update and prints the relevant warning.\n",
|
||||
"\n",
|
||||
"Consider an example of a pipeline for the GitHub API:\n",
|
||||
"Consider an example pipeline for the GitHub API:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"Available dlt single file templates:\n",
|
||||
@@ -144,7 +134,7 @@
|
||||
"\n",
|
||||
"### Step 1. Initialize the source\n",
|
||||
"\n",
|
||||
"This command will initialize the pipeline example with GitHub API as the source and DuckBD as the destination:"
|
||||
"This command will initialize the pipeline example with the GitHub API as the source and DuckBD as the destination:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -165,10 +155,11 @@
|
||||
},
|
||||
"source": [
|
||||
"Now, check your files on the left side bar. It should contain all the necessary files to run your GitHub API -> DuckDB pipeline:\n",
|
||||
"* `.dlt` folder for `secrets.toml` and `config.toml`;\n",
|
||||
"* pipeline script `github_api_pipeline.py`;\n",
|
||||
"* requirements.txt;\n",
|
||||
"* `.gitignore`."
|
||||
"\n",
|
||||
"- The `.dlt` folder containing `secrets.toml` and `config.toml`\n",
|
||||
"- The pipeline script `github_api_pipeline.py`\n",
|
||||
"- `requirements.txt`\n",
|
||||
"- `.gitignore`"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -193,7 +184,7 @@
|
||||
"- Adjust the pipeline script as needed\n",
|
||||
"- Run the pipeline script\n",
|
||||
"\n",
|
||||
"> In certain cases, you can adjust the verified source code."
|
||||
"> If needed, you can adjust the verified source code."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -213,7 +204,8 @@
|
||||
"id": "Rr3RWZSHcnSs"
|
||||
},
|
||||
"source": [
|
||||
"From the code we can see that this pipeline loads **only \"issues\" endpoint**, you can adjust this code as you wish: add new endpoints, add additional logic, add transformations, etc."
|
||||
"From the code, we can see that this pipeline loads **only the `\"issues\"` endpoint**. \n",
|
||||
"You can adjust this code as needed: add new endpoints, include additional logic, apply transformations, and more."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -224,9 +216,10 @@
|
||||
"source": [
|
||||
"### Step 2. Add credentials\n",
|
||||
"\n",
|
||||
"In Colab is more convenient to use ENVs. In the previous lesson you learned how to configure dlt resource via environment variable.\n",
|
||||
"In Colab (or Molab), it is more convenient to use environment variables or `dlt.secrets`.\n",
|
||||
"\n",
|
||||
"In the pipeline above, the `access_token` parameter is set to `dlt.secrets.value`, which means you need to configure this variable:\n",
|
||||
"\n",
|
||||
"In the pipeline above we can see that `access_token` variable is `dlt.secrets.value`, it means we should configure this variable.\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"@dlt.resource(write_disposition=\"replace\")\n",
|
||||
@@ -243,10 +236,10 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import dlt\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"os.environ[\"SOURCES__ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")"
|
||||
"dlt.secrets[\"SOURCES__ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -284,13 +277,13 @@
|
||||
"id": "imvWv_2Cbumt"
|
||||
},
|
||||
"source": [
|
||||
"From the pipeline output we can take pipeline information like pipeline_name, dataset_name, destination path, etc.\n",
|
||||
"\n",
|
||||
"From the pipeline output, we can get information such as the pipeline name, dataset name, destination path, and more.\n",
|
||||
"\n",
|
||||
"> Pipeline **github_api_pipeline** load step completed in 1.23 seconds \n",
|
||||
"1 load package(s) were loaded to destination duckdb and into dataset **github_api_data**\n",
|
||||
"The duckdb destination used duckdb:////content/**github_api_pipeline.duckdb** location to store data\n",
|
||||
"Load package 1733848559.8195539 is LOADED and contains no failed jobs\n"
|
||||
"> 1 load package was loaded to the DuckDB destination and into the dataset **github_api_data**. \n",
|
||||
"> The DuckDB destination used `duckdb:////content/**github_api_pipeline.duckdb**` as the storage location. \n",
|
||||
"> Load package `1733848559.8195539` is **LOADED** and contains no failed jobs.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -301,7 +294,7 @@
|
||||
"source": [
|
||||
"## Step 4: Explore your data\n",
|
||||
"\n",
|
||||
"Let's explore what tables were created in duckdb."
|
||||
"Let's explore what tables were created in the destination."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -348,18 +341,18 @@
|
||||
"source": [
|
||||
"## **[RestAPI source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic)**\n",
|
||||
"\n",
|
||||
"`rest_api` is a generic source that you can use to create a `dlt` source from a REST API using a declarative configuration. The majority of REST APIs behave in a similar way; this `dlt` source attempts to provide a declarative way to define a `dlt` source for those APIs.\n",
|
||||
"`rest_api` is a generic source that lets you create a `dlt` source from any REST API using a declarative configuration. Since most REST APIs follow similar patterns, this source provides a convenient way to define your integration declaratively.\n",
|
||||
"\n",
|
||||
"Using a [declarative configuration](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic#source-configuration), you can define:\n",
|
||||
"Using a [declarative configuration](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic#source-configuration), you can specify:\n",
|
||||
"\n",
|
||||
"- the API endpoints to pull data from,\n",
|
||||
"- their relationships,\n",
|
||||
"- how to handle pagination,\n",
|
||||
"- authentication.\n",
|
||||
"\n",
|
||||
"dlt will take care of the rest: **unnesting the data, inferring the schema**, etc., and **writing to the destination**\n",
|
||||
"`dlt` handles the rest for you: **unnesting the data, inferring the schema**, and **writing it to the destination**.\n",
|
||||
"\n",
|
||||
"In previous lesson you've already met Rest API Client. `dlt`’s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low level abstraction** that powers the REST API Source."
|
||||
"In the previous lesson, you already used the REST API Client. `dlt`’s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low-level abstraction** that powers the RestAPI source.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -368,8 +361,9 @@
|
||||
"id": "SqoKS0mNdFOd"
|
||||
},
|
||||
"source": [
|
||||
"### Initialize `rest_api` template\n",
|
||||
"You can initialize `rest_api` **template** using `init` command:"
|
||||
"### Initialize the `rest_api` template\n",
|
||||
"\n",
|
||||
"You can initialize the `rest_api` **template** using the `init` command:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -389,15 +383,13 @@
|
||||
"id": "MJ89LnH91GQh"
|
||||
},
|
||||
"source": [
|
||||
"In the `rest_api_pipeline.py` script you will find sources for GitHub API and for PokeAPI, which were defined using `rest_api` source and `RESTAPIConfig`.\n",
|
||||
"\n",
|
||||
"Since the `rest_api` source is a **built-in source**, you don't have to initialize it. You can **import** it from `dlt.sources` and use it immediately.\n",
|
||||
"In the `rest_api_pipeline.py` script, you will find sources for both the GitHub API and the PokeAPI, defined using the `rest_api` source and `RESTAPIConfig`.\n",
|
||||
"\n",
|
||||
"Since the `rest_api` source is a **built-in source**, you don't need to initialize it. You can simply **import** it from `dlt.sources` and start using it.\n",
|
||||
"\n",
|
||||
"### Example\n",
|
||||
"\n",
|
||||
"Here's a simplified example of how to configure the REST API source to load `issues` and issue `comments` from GitHub API:\n",
|
||||
"\n"
|
||||
"Here is a simplified example of how to configure the REST API source to load `issues` and issue `comments` from the GitHub API:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -416,13 +408,11 @@
|
||||
" \"client\": {\n",
|
||||
" \"base_url\": \"https://api.github.com\",\n",
|
||||
" \"auth\": {\n",
|
||||
" \"token\": dlt.secrets[\n",
|
||||
" \"sources.access_token\"\n",
|
||||
" ], # <--- we already configured access_token above\n",
|
||||
" \"token\": dlt.secrets[\"sources.access_token\"],\n",
|
||||
" },\n",
|
||||
" \"paginator\": \"header_link\", # <---- set up paginator type\n",
|
||||
" \"paginator\": \"header_link\",\n",
|
||||
" },\n",
|
||||
" \"resources\": [ # <--- list resources\n",
|
||||
" \"resources\": [\n",
|
||||
" {\n",
|
||||
" \"name\": \"issues\",\n",
|
||||
" \"endpoint\": {\n",
|
||||
@@ -433,40 +423,32 @@
|
||||
" },\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"issue_comments\", # <-- here we declare dlt.transformer\n",
|
||||
" \"name\": \"issue_comments\",\n",
|
||||
" \"endpoint\": {\n",
|
||||
" \"path\": \"repos/dlt-hub/dlt/issues/{issue_number}/comments\",\n",
|
||||
" \"params\": {\n",
|
||||
" \"issue_number\": {\n",
|
||||
" \"type\": (\n",
|
||||
" \"resolve\"\n",
|
||||
" ), # <--- use type 'resolve' to resolve {issue_number} for transformer\n",
|
||||
" \"type\": (\"resolve\"),\n",
|
||||
" \"resource\": \"issues\",\n",
|
||||
" \"field\": \"number\",\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"contributors\",\n",
|
||||
" \"endpoint\": {\n",
|
||||
" \"path\": \"repos/dlt-hub/dlt/contributors\",\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
" ],\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"github_source = rest_api_source(config)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"rest_api_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"rest_api_github\",\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"rest_api_data\",\n",
|
||||
" dev_mode=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(github_source)\n",
|
||||
"load_info = rest_api_pipeline.run(github_source)\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -478,7 +460,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline.dataset().issues.df()"
|
||||
"rest_api_pipeline.dataset().issues.df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -487,12 +469,12 @@
|
||||
"id": "mQuK4l23c8Of"
|
||||
},
|
||||
"source": [
|
||||
"### **Exercise 1: Run rest_api source**\n",
|
||||
"### **Exercise 1: Run `rest_api` source**\n",
|
||||
"\n",
|
||||
"Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.\n",
|
||||
"\n",
|
||||
"#### Question\n",
|
||||
"How many columns has the `issues` table?"
|
||||
"#### **Question**\n",
|
||||
"How many columns does the `issues` table have?"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -501,15 +483,16 @@
|
||||
"id": "UTKIM2ntOIrh"
|
||||
},
|
||||
"source": [
|
||||
"### **Exercise 2: Create dlt source with rest_api**\n",
|
||||
"### **Exercise 2: Create a dlt source with `rest_api`**\n",
|
||||
"\n",
|
||||
"Add `contributors` endpoint for dlt repository to the `rest_api` configuration:\n",
|
||||
"- resource name is \"contributors\"\n",
|
||||
"- endpoint path : \"repos/dlt-hub/dlt/contributors\"\n",
|
||||
"- no parameters\n",
|
||||
"Add the `contributors` endpoint for the `dlt` repository to the `rest_api` configuration:\n",
|
||||
"\n",
|
||||
"#### Question\n",
|
||||
"How many columns has the `contributors` table?"
|
||||
"- Resource name: **\"contributors\"**\n",
|
||||
"- Endpoint path: **\"repos/dlt-hub/dlt/contributors\"**\n",
|
||||
"- No parameters\n",
|
||||
"\n",
|
||||
"#### **Question**\n",
|
||||
"How many columns does the `contributors` table have?\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -536,9 +519,9 @@
|
||||
"id": "bHcBOhgVdmZH"
|
||||
},
|
||||
"source": [
|
||||
"### Initialize `sql_database` template\n",
|
||||
"### Initialize the `sql_database` template\n",
|
||||
"\n",
|
||||
"Initialize dlt template for `sql_database` using `init` command:"
|
||||
"Initialize the `dlt` template for `sql_database` using the `init` command:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -569,9 +552,9 @@
|
||||
"source": [
|
||||
"### Example\n",
|
||||
"\n",
|
||||
"The example below will show you how you can use dlt to load data from a SQL Database (PostgreSQL, MySQL, SQLight, Oracle, IBM DB2, etc.) into destination.\n",
|
||||
"The example below shows how you can use dlt to load data from a SQL database (PostgreSQL, MySQL, SQLite, Oracle, IBM DB2, etc.) into a destination.\n",
|
||||
"\n",
|
||||
"To make it easy to reproduce, we will be loading data from the [public MySQL RFam database](https://docs.rfam.org/en/latest/database.html) into a local DuckDB instance."
|
||||
"To make it easy to reproduce, we will load data from the [public MySQL Rfam database](https://docs.rfam.org/en/latest/database.html) into a local DuckDB instance."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -582,6 +565,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture\n",
|
||||
"!pip install pymysql"
|
||||
]
|
||||
},
|
||||
@@ -595,21 +579,21 @@
|
||||
"source": [
|
||||
"from dlt.sources.sql_database import sql_database\n",
|
||||
"\n",
|
||||
"source = sql_database(\n",
|
||||
"sql_source = sql_database(\n",
|
||||
" \"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam\",\n",
|
||||
" table_names=[\n",
|
||||
" \"family\",\n",
|
||||
" ],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"sql_db_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"sql_database_example\",\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"sql_data\",\n",
|
||||
" dev_mode=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(source)\n",
|
||||
"load_info = sql_db_pipeline.run(sql_source)\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -619,11 +603,11 @@
|
||||
"id": "pjyJyF4Ofyuu"
|
||||
},
|
||||
"source": [
|
||||
"### **Exercise 3: Run sql_database source**\n",
|
||||
"### **Exercise 3: Run `sql_database` source**\n",
|
||||
"\n",
|
||||
"Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.\n",
|
||||
"\n",
|
||||
"#### Question\n",
|
||||
"#### **Question**\n",
|
||||
"How many columns does the `family` table have?"
|
||||
]
|
||||
},
|
||||
@@ -671,9 +655,9 @@
|
||||
"id": "HfLjS_raUH9G"
|
||||
},
|
||||
"source": [
|
||||
"### Initialize `filesystem` template\n",
|
||||
"### Initialize the `filesystem` template\n",
|
||||
"\n",
|
||||
"Initialize dlt template for `filesystem` using `init` command:"
|
||||
"Initialize the dlt template for `filesystem` using the `init` command:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -715,7 +699,19 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!mkdir -p local_data && wget -O local_data/userdata.parquet https://www.timestored.com/data/sample/userdata.parquet"
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"folder_name = \"local_data\"\n",
|
||||
"os.makedirs(folder_name, exist_ok=True)\n",
|
||||
"full_path = os.path.abspath(folder_name)\n",
|
||||
"\n",
|
||||
"url = \"https://www.timestored.com/data/sample/userdata.parquet\"\n",
|
||||
"resp = requests.get(url)\n",
|
||||
"resp.raise_for_status()\n",
|
||||
"\n",
|
||||
"with open(f\"{full_path}/userdata.parquet\", \"wb\") as f:\n",
|
||||
" f.write(resp.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -729,14 +725,12 @@
|
||||
"import dlt\n",
|
||||
"from dlt.sources.filesystem import filesystem, read_parquet\n",
|
||||
"\n",
|
||||
"filesystem_resource = filesystem(\n",
|
||||
" bucket_url=\"/content/local_data\", file_glob=\"**/*.parquet\"\n",
|
||||
")\n",
|
||||
"filesystem_resource = filesystem(bucket_url=full_path, file_glob=\"**/*.parquet\")\n",
|
||||
"filesystem_pipe = filesystem_resource | read_parquet()\n",
|
||||
"\n",
|
||||
"# We load the data into the table_name table\n",
|
||||
"pipeline = dlt.pipeline(pipeline_name=\"my_pipeline\", destination=\"duckdb\")\n",
|
||||
"load_info = pipeline.run(filesystem_pipe.with_name(\"userdata\"))\n",
|
||||
"fs_pipeline = dlt.pipeline(pipeline_name=\"my_pipeline\", destination=\"duckdb\")\n",
|
||||
"load_info = fs_pipeline.run(filesystem_pipe.with_name(\"userdata\"))\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -746,12 +740,12 @@
|
||||
"id": "0jzeZeINEzQb"
|
||||
},
|
||||
"source": [
|
||||
"### **Exercise 4: Run filesystem source**\n",
|
||||
"### **Exercise 4: Run `filesystem` source**\n",
|
||||
"\n",
|
||||
"Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.\n",
|
||||
"\n",
|
||||
"#### Question\n",
|
||||
"How many columns does the `userdata` table have?"
|
||||
"#### **Question**\n",
|
||||
"How many columns does the `userdata` table have?\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -760,7 +754,8 @@
|
||||
"id": "o4SGNHSkF7_Y"
|
||||
},
|
||||
"source": [
|
||||
"How to configure **Cloud Storage** you can read in the official [dlt documentation](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/basic#configuration)."
|
||||
"You can read how to configure **Cloud Storage** in the official \n",
|
||||
"[dlt documentation](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/basic#configuration).\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -769,9 +764,7 @@
|
||||
"id": "M03Zc9l7Y6Ue"
|
||||
},
|
||||
"source": [
|
||||
"# **Built-in Destinations**\n",
|
||||
"\n",
|
||||
"https://dlthub.com/docs/dlt-ecosystem/destinations/"
|
||||
"# [**Built-in Destinations**](https://dlthub.com/docs/dlt-ecosystem/destinations/)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -797,9 +790,12 @@
|
||||
"id": "BWAnIbicE4XC"
|
||||
},
|
||||
"source": [
|
||||
"TBH this is a matter of simply going through the [documentation](https://dlthub.com/docs/dlt-ecosystem/destinations/) 👀, but to sum it up:\n",
|
||||
"- Most likely the destination where you want to load data is already a `dlt` integration that undergoes several hundred automated tests every day.\n",
|
||||
"- If not, you can simply define a custom destination and still be able to benefit from most `dlt`-specific features. FYI, custom destinations will be covered in the next Advanced course, so we expect you to come back for the second part..."
|
||||
"To be honest, this is simply a matter of going through the \n",
|
||||
"[documentation](https://dlthub.com/docs/dlt-ecosystem/destinations/) 👀, but to sum it up:\n",
|
||||
"\n",
|
||||
"- Most likely, the destination where you want to load data is already a `dlt` integration that undergoes several hundred automated tests every day.\n",
|
||||
"- If not, you can define a custom destination and still benefit from most `dlt`-specific features. \n",
|
||||
" *FYI: custom destinations will be covered in the next Advanced course — so we expect you to come back for part two…*\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -810,7 +806,7 @@
|
||||
"source": [
|
||||
"## **Choosing a destination**\n",
|
||||
"\n",
|
||||
"Switching between destinations in dlt is incredibly straightforward—simply modify the `destination` parameter in your pipeline configuration. For example:"
|
||||
"Switching between destinations in `dlt` is incredibly straightforward. Simply modify the `destination` parameter in your pipeline configuration. For example:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -821,17 +817,19 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"data_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"data_pipeline\",\n",
|
||||
" destination=\"duckdb\", # <--- to test pipeline locally\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"data\",\n",
|
||||
")\n",
|
||||
"print(data_pipeline.destination.destination_type)\n",
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"data_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"data_pipeline\",\n",
|
||||
" destination=\"bigquery\", # <--- to run pipeline in production\n",
|
||||
" destination=\"bigquery\",\n",
|
||||
" dataset_name=\"data\",\n",
|
||||
")"
|
||||
")\n",
|
||||
"print(data_pipeline.destination.destination_type)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -869,7 +867,7 @@
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"BUCKET_URL\"] = \"/content\""
|
||||
"os.environ[\"BUCKET_URL\"] = \"./content\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -902,13 +900,11 @@
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"fs_pipeline\",\n",
|
||||
" destination=\"filesystem\", # <--- change destination to 'filesystem'\n",
|
||||
" destination=\"filesystem\",\n",
|
||||
" dataset_name=\"fs_data\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(\n",
|
||||
" source, loader_file_format=\"parquet\"\n",
|
||||
") # <--- choose a file format: parquet, csv or jsonl\n",
|
||||
"load_info = pipeline.run(source, loader_file_format=\"parquet\")\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -929,7 +925,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! ls fs_data/family"
|
||||
"! ls ./content/fs_data/family"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -991,7 +987,7 @@
|
||||
"load_info = pipeline.run(\n",
|
||||
" source,\n",
|
||||
" loader_file_format=\"parquet\",\n",
|
||||
" table_format=\"iceberg\", # <--- choose a table format: delta or iceberg\n",
|
||||
" table_format=\"iceberg\",\n",
|
||||
")\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
@@ -1004,9 +1000,9 @@
|
||||
"source": [
|
||||
"**Note:**\n",
|
||||
"\n",
|
||||
"Open source version of dlt supports basic functionality for **iceberg**, but the dltHub team is currently working on an **extended** and **more powerful** integration with iceberg.\n",
|
||||
"The open-source version of dlt supports basic functionality for **Iceberg**, but the dltHub team is currently working on an **extended** and **more powerful** Iceberg integration.\n",
|
||||
"\n",
|
||||
"[Join the waiting list to learn more about dlt+ and Iceberg.](https://info.dlthub.com/waiting-list)"
|
||||
"[Join the waiting list to learn more about dltHub and Iceberg.](https://info.dlthub.com/waiting-list)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1017,9 +1013,12 @@
|
||||
"source": [
|
||||
"# **Spoiler: Custom Sources & Destinations**\n",
|
||||
"\n",
|
||||
"`dlt` tried to simplify as much as possible both the process of creating sources ([RestAPI Client](https://dlthub.com/docs/general-usage/http/rest-client), [rest_api source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api)) and [custom destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/destination).\n",
|
||||
"`dlt` aims to simplify the process of creating both custom sources \n",
|
||||
"([REST API Client](https://dlthub.com/docs/general-usage/http/rest-client), \n",
|
||||
"[`rest_api` source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api)) \n",
|
||||
"and [custom destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/destination).\n",
|
||||
"\n",
|
||||
"We will look at this topic in more detail in the next Advanced course."
|
||||
"We will explore this topic in more detail in the next Advanced course.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1028,17 +1027,8 @@
|
||||
"id": "NYbccmLie1zm"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1Zf24gIVMNNj9j-gtXFl8p0orI9ttySDn#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "wrVnW2UdVjV4"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@@ -0,0 +1,826 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt[duckdb]",
|
||||
# "dlt[pyiceberg]",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "pymysql",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **Recap of [Lesson 3](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) 👩💻🚀**
|
||||
|
||||
1. Used pagination with REST APIs.
|
||||
2. Applied authentication for REST APIs.
|
||||
3. Tried the dlt `RESTClient`.
|
||||
4. Used environment variables to manage secrets and configuration.
|
||||
5. Learned how to add values to `secrets.toml` and `config.toml`.
|
||||
6. Used the special `secrets.toml` environment variable setup for Colab.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
# **`dlt`’s pre-built Sources and Destinations** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)
|
||||
|
||||
|
||||
**Here, you will learn:**
|
||||
- How to initialize verified sources.
|
||||
- The built-in `rest_api` source.
|
||||
- The built-in `sql_database` source.
|
||||
- The built-in `filesystem` source.
|
||||
- How to switch between destinations.
|
||||
|
||||
---
|
||||
|
||||
Our verified sources are the simplest way to start building your stack. Choose from any of our fully customizable 30+ pre-built sources, such as SQL databases, Google Sheets, Salesforce, and more.
|
||||
|
||||
With our numerous destinations, you can load data into a local database, data warehouse, or data lake. Choose from Snowflake, Databricks, and many others.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **Existing verified sources**
|
||||
To use an [existing verified source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/), just run the `dlt init` command.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
There's a base project for each `dlt` verified source + destination combination, which you can adjust according to your needs.
|
||||
|
||||
These base projects can be initialized with a simple command:
|
||||
|
||||
```
|
||||
dlt init <verified-source> <destination>
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""List all verified sources:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import subprocess
|
||||
|
||||
subprocess.run(["dlt", "init", "--list-sources"], check=True)
|
||||
return (subprocess,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
This command shows all available verified sources and their short descriptions. For each source, it checks if your local `dlt` version requires an update and prints the relevant warning.
|
||||
|
||||
Consider an example pipeline for the GitHub API:
|
||||
|
||||
```
|
||||
Available dlt single file templates:
|
||||
---
|
||||
arrow: The Arrow Pipeline Template will show how to load and transform arrow tables.
|
||||
dataframe: The DataFrame Pipeline Template will show how to load and transform pandas dataframes.
|
||||
debug: The Debug Pipeline Template will load a column with each datatype to your destination.
|
||||
default: The Intro Pipeline Template contains the example from the docs intro page
|
||||
fruitshop: The Default Pipeline Template provides a simple starting point for your dlt pipeline
|
||||
|
||||
---> github_api: The Github API templates provides a starting point to read data from REST APIs with REST Client helper
|
||||
requests: The Requests Pipeline Template provides a simple starting point for a dlt pipeline with the requests library
|
||||
```
|
||||
|
||||
### Step 1. Initialize the source
|
||||
|
||||
This command will initialize the pipeline example with the GitHub API as the source and DuckDB as the destination:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(
|
||||
["dlt", "--non-interactive", "init", "github_api", "duckdb"], check=True
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Now, check the file browser in the left sidebar. It should contain all the necessary files to run your GitHub API -> DuckDB pipeline:
|
||||
|
||||
- The `.dlt` folder containing `secrets.toml` and `config.toml`
|
||||
- The pipeline script `github_api_pipeline.py`
|
||||
- `requirements.txt`
|
||||
- `.gitignore`
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(["ls", "-a"], check=True)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
What you would normally do with the project:
|
||||
- Add your credentials and define configurations
|
||||
- Adjust the pipeline script as needed
|
||||
- Run the pipeline script
|
||||
|
||||
> If needed, you can adjust the verified source code.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(["cat", "github_api_pipeline.py"], check=True)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
From the code, we can see that this pipeline loads **only the `"issues"` endpoint**.
|
||||
You can adjust this code as needed: add new endpoints, include additional logic, apply transformations, and more.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Step 2. Add credentials
|
||||
|
||||
In Colab (or Molab), it is more convenient to use environment variables or `dlt.secrets`.
|
||||
|
||||
In the pipeline above, the `access_token` parameter is set to `dlt.secrets.value`, which means you need to configure this variable:
|
||||
|
||||
|
||||
```python
|
||||
@dlt.resource(write_disposition="replace")
|
||||
def github_api_resource(access_token: Optional[str] = dlt.secrets.value):
|
||||
...
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(os):
|
||||
import dlt
|
||||
|
||||
dlt.secrets["SOURCES__ACCESS_TOKEN"] = os.getenv("SECRET_KEY")
|
||||
return (dlt,)
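

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    Outside of a notebook you would normally not set the token in code at all. As a rough sketch (the placeholder value is hypothetical), the same secret can be provided through an environment variable or through `.dlt/secrets.toml` under the `sources.access_token` key before you run the pipeline script:

    ```python
    import os

    import dlt

    # Option 1: export SOURCES__ACCESS_TOKEN in your shell, or set it here
    os.environ["SOURCES__ACCESS_TOKEN"] = "<your GitHub token>"  # hypothetical placeholder

    # Option 2: set it programmatically via dlt.secrets (same key, dotted notation)
    dlt.secrets["sources.access_token"] = "<your GitHub token>"  # hypothetical placeholder
    ```
    """)
    return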
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### Step 3. Run the pipeline""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Let's run the pipeline!""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(["python", "github_api_pipeline.py"], check=True)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
From the pipeline output, we can get information such as the pipeline name, dataset name, destination path, and more.
|
||||
|
||||
> Pipeline **github_api_pipeline** load step completed in 1.23 seconds
|
||||
> 1 load package was loaded to the DuckDB destination and into the dataset **github_api_data**.
|
||||
> The DuckDB destination used `duckdb:////content/**github_api_pipeline.duckdb**` as the storage location.
|
||||
> Load package `1733848559.8195539` is **LOADED** and contains no failed jobs.
|
||||
""")
|
||||
return
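

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    The summary above is just the printed form of the `LoadInfo` object returned by `pipeline.run()`. Because the script ran in a subprocess, we only see its stdout here, but when you run a pipeline in the same process you can inspect that object programmatically. A minimal sketch (attribute names may differ slightly between `dlt` versions):

    ```python
    # assuming `load_info` was returned by pipeline.run() in this process
    print(load_info.dataset_name)     # dataset the packages were loaded into
    print(load_info.loads_ids)        # ids of the load packages
    load_info.raise_on_failed_jobs()  # raise if any job failed to load
    ```
    """)
    return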
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Step 4. Explore your data
|
||||
|
||||
Let's explore what tables were created in the destination.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import duckdb
|
||||
|
||||
conn = duckdb.connect("github_api_pipeline.duckdb")
|
||||
conn.sql("SET search_path = 'github_api_data'")
|
||||
conn.sql("DESCRIBE").df()
|
||||
return (conn,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(conn):
|
||||
data_table = conn.sql("SELECT * FROM github_api_resource").df()
|
||||
data_table
|
||||
return
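

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    The exercises below also mention `pipeline.sql_client()`. It opens a SQL connection to whatever destination a pipeline uses, so the same query works on DuckDB, BigQuery, and so on. A minimal sketch, assuming `pipeline` is a pipeline object whose dataset contains the table you want to inspect:

    ```python
    with pipeline.sql_client() as client:
        with client.execute_query("SELECT COUNT(*) FROM github_api_resource") as cursor:
            print(cursor.fetchall())
    ```
    """)
    return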
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""# **Built-in sources: RestAPI, SQL database & Filesystem**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **[RestAPI source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic)**
|
||||
|
||||
`rest_api` is a generic source that lets you create a `dlt` source from any REST API using a declarative configuration. Since most REST APIs follow similar patterns, this source provides a convenient way to define your integration declaratively.
|
||||
|
||||
Using a [declarative configuration](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic#source-configuration), you can specify:
|
||||
|
||||
- the API endpoints to pull data from,
|
||||
- their relationships,
|
||||
- how to handle pagination,
|
||||
- authentication.
|
||||
|
||||
`dlt` handles the rest for you: **unnesting the data, inferring the schema**, and **writing it to the destination**.
|
||||
|
||||
In the previous lesson, you already used the REST API Client. `dlt`’s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low-level abstraction** that powers the RestAPI source.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Initialize the `rest_api` template
|
||||
|
||||
You can initialize the `rest_api` **template** using the `init` command:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(
|
||||
["dlt", "init", "rest_api", "duckdb"], input="y\n", text=True, check=True
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
In the `rest_api_pipeline.py` script, you will find sources for both the GitHub API and the PokeAPI, defined using the `rest_api` source and `RESTAPIConfig`.
|
||||
|
||||
Since the `rest_api` source is a **built-in source**, you don't need to initialize it. You can simply **import** it from `dlt.sources` and start using it.
|
||||
|
||||
### Example
|
||||
|
||||
Here is a simplified example of how to configure the REST API source to load `issues` and issue `comments` from the GitHub API:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt):
|
||||
from dlt.sources.rest_api import RESTAPIConfig, rest_api_source
|
||||
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator
|
||||
|
||||
config: RESTAPIConfig = {
|
||||
"client": {
|
||||
"base_url": "https://api.github.com",
|
||||
"auth": {"token": dlt.secrets["sources.access_token"]},
|
||||
"paginator": "header_link",
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"name": "issues",
|
||||
"endpoint": {
|
||||
"path": "repos/dlt-hub/dlt/issues",
|
||||
"params": {"state": "open"},
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "issue_comments",
|
||||
"endpoint": {
|
||||
"path": "repos/dlt-hub/dlt/issues/{issue_number}/comments",
|
||||
"params": {
|
||||
"issue_number": {
|
||||
"type": "resolve",
|
||||
"resource": "issues",
|
||||
"field": "number",
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
}
|
||||
github_source = rest_api_source(config)
|
||||
rest_api_pipeline = dlt.pipeline(
|
||||
pipeline_name="rest_api_github",
|
||||
destination="duckdb",
|
||||
dataset_name="rest_api_data",
|
||||
dev_mode=True,
|
||||
)
|
||||
_load_info = rest_api_pipeline.run(github_source)
|
||||
print(_load_info)
|
||||
return (rest_api_pipeline,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(rest_api_pipeline):
|
||||
rest_api_pipeline.dataset().issues.df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Exercise 1: Run `rest_api` source**
|
||||
|
||||
Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.
|
||||
|
||||
#### **Question**
|
||||
How many columns does the `issues` table have?
|
||||
""")
|
||||
return
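

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    A sketch of one way to answer this kind of question (it prints the count rather than giving it away): read the table into a DataFrame with `pipeline.dataset()` and count its columns.

    ```python
    issues_df = rest_api_pipeline.dataset().issues.df()
    print(len(issues_df.columns))
    ```
    """)
    return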
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Exercise 2: Create a dlt source with `rest_api`**
|
||||
|
||||
Add the `contributors` endpoint for the `dlt` repository to the `rest_api` configuration:
|
||||
|
||||
- Resource name: **"contributors"**
|
||||
- Endpoint path: **"repos/dlt-hub/dlt/contributors"**
|
||||
- No parameters
|
||||
|
||||
#### **Question**
|
||||
How many columns does the `contributors` table have?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **[SQL Databases source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/sql_database/)**
|
||||
|
||||
SQL databases are database management systems (DBMS) that store data in a structured format, commonly used for efficient and reliable data retrieval.
|
||||
|
||||
The `sql_database` verified source loads data to your specified destination using one of the following backends:
|
||||
* SQLAlchemy,
|
||||
* PyArrow,
|
||||
* pandas,
|
||||
* ConnectorX.
|
||||
""")
|
||||
return
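

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    The backend is chosen with the `backend` argument of `sql_database()`; SQLAlchemy is the default. A minimal sketch using the same public Rfam database as the example below (assuming the extra dependencies for the chosen backend are installed):

    ```python
    from dlt.sources.sql_database import sql_database

    # the pyarrow backend yields Arrow tables, which is usually faster for large extracts
    arrow_source = sql_database(
        "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
        table_names=["family"],
        backend="pyarrow",
    )
    ```
    """)
    return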
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Initialize the `sql_database` template
|
||||
|
||||
Initialize the `dlt` template for `sql_database` using the `init` command:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(
|
||||
["dlt", "init", "sql_database", "duckdb"], input="y\n", text=True, check=True
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""The `sql_database` source is also a **built-in source**, you don't have to initialize it, just **import** it from `dlt.sources`."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Example
|
||||
|
||||
The example below shows how you can use dlt to load data from a SQL database (PostgreSQL, MySQL, SQLite, Oracle, IBM DB2, etc.) into a destination.
|
||||
|
||||
To make it easy to reproduce, we will load data from the [public MySQL Rfam database](https://docs.rfam.org/en/latest/database.html) into a local DuckDB instance.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt):
|
||||
from dlt.sources.sql_database import sql_database
|
||||
|
||||
sql_source = sql_database(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
|
||||
table_names=["family"],
|
||||
)
|
||||
sql_db_pipeline = dlt.pipeline(
|
||||
pipeline_name="sql_database_example",
|
||||
destination="duckdb",
|
||||
dataset_name="sql_data",
|
||||
dev_mode=True,
|
||||
)
|
||||
_load_info = sql_db_pipeline.run(sql_source)
|
||||
print(_load_info)
|
||||
return (sql_database,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Exercise 3: Run `sql_database` source**
|
||||
|
||||
Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.
|
||||
|
||||
#### **Question**
|
||||
How many columns does the `family` table have?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **[Filesystem source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/)**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The filesystem source allows seamless loading of files from the following locations:
|
||||
|
||||
* AWS S3
|
||||
* Google Cloud Storage
|
||||
* Google Drive
|
||||
* Azure Blob Storage
|
||||
* remote filesystem (via SFTP)
|
||||
* local filesystem
|
||||
|
||||
The filesystem source natively supports CSV, Parquet, and JSONL files and allows customization for loading any type of structured file.
|
||||
|
||||
|
||||
**How filesystem source works**
|
||||
|
||||
The Filesystem source doesn't just give you an easy way to load data from both remote and local files — it also comes with a powerful set of tools that let you customize the loading process to fit your specific needs.
|
||||
|
||||
The filesystem source loads data in two steps:
|
||||
|
||||
1. It accesses the files in your remote or local file storage **without** actually **reading** the content yet. At this point, you can filter files by metadata or name. You can also set up incremental loading to load only new files.
|
||||
2. The **transformer** **reads** the files' content and yields the records. At this step, you can filter out the actual data, enrich records with metadata from files, or perform incremental loading based on the file content.
|
||||
""")
|
||||
return
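

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    A minimal sketch of those two steps, using the built-in `read_csv` transformer and an incremental hint on the files' `modification_date` (the bucket URL is a hypothetical placeholder):

    ```python
    import dlt
    from dlt.sources.filesystem import filesystem, read_csv

    # Step 1: list matching files; their content is not read yet
    files = filesystem(bucket_url="file://path/to/data", file_glob="*.csv")
    # only pick up files that changed since the previous run
    files.apply_hints(incremental=dlt.sources.incremental("modification_date"))

    # Step 2: the transformer reads the files and yields records
    csv_pipe = files | read_csv()
    ```
    """)
    return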
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Initialize the `filesystem` template
|
||||
|
||||
Initialize the dlt template for `filesystem` using the `init` command:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(
|
||||
["dlt", "init", "filesystem", "duckdb"], input="y\n", text=True, check=True
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""The `filesystem` source is also a **built-in source**, you don't have to initialize it, just **import** it from `dlt.sources`."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Example
|
||||
|
||||
To illustrate how this **built-in source** works, we first download a sample file to the local filesystem.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import os
|
||||
import requests
|
||||
|
||||
folder_name = "local_data"
|
||||
os.makedirs(folder_name, exist_ok=True)
|
||||
full_path = os.path.abspath(folder_name)
|
||||
|
||||
url = "https://www.timestored.com/data/sample/userdata.parquet"
|
||||
resp = requests.get(url)
|
||||
resp.raise_for_status()
|
||||
|
||||
with open(f"{full_path}/userdata.parquet", "wb") as f:
|
||||
f.write(resp.content)
|
||||
return full_path, os
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, full_path):
|
||||
from dlt.sources.filesystem import filesystem, read_parquet
|
||||
|
||||
filesystem_resource = filesystem(bucket_url=full_path, file_glob="**/*.parquet")
|
||||
filesystem_pipe = filesystem_resource | read_parquet()
|
||||
fs_pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb")
|
||||
_load_info = fs_pipeline.run(filesystem_pipe.with_name("userdata"))
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Exercise 4: Run `filesystem` source**
|
||||
|
||||
Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.
|
||||
|
||||
#### **Question**
|
||||
How many columns does the `userdata` table have?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
You can read how to configure **Cloud Storage** in the official
|
||||
[dlt documentation](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/basic#configuration).
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""# [**Built-in Destinations**](https://dlthub.com/docs/dlt-ecosystem/destinations/)"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Exploring `dlt` destinations**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
To be honest, this is simply a matter of going through the
|
||||
[documentation](https://dlthub.com/docs/dlt-ecosystem/destinations/) 👀, but to sum it up:
|
||||
|
||||
- Most likely, the destination where you want to load data is already a `dlt` integration that undergoes several hundred automated tests every day.
|
||||
- If not, you can define a custom destination and still benefit from most `dlt`-specific features.
|
||||
*FYI: custom destinations will be covered in the next Advanced course — so we expect you to come back for part two…*
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **Choosing a destination**
|
||||
|
||||
Switching between destinations in `dlt` is incredibly straightforward. Simply modify the `destination` parameter in your pipeline configuration. For example:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt):
|
||||
data_pipeline = dlt.pipeline(
|
||||
pipeline_name="data_pipeline",
|
||||
destination="duckdb",
|
||||
dataset_name="data",
|
||||
)
|
||||
print(data_pipeline.destination.destination_type)
|
||||
|
||||
data_pipeline = dlt.pipeline(
|
||||
pipeline_name="data_pipeline",
|
||||
destination="bigquery",
|
||||
dataset_name="data",
|
||||
)
|
||||
print(data_pipeline.destination.destination_type)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""This flexibility allows you to easily transition from local development to production-grade environments."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **Filesystem destination**
|
||||
|
||||
The `filesystem` destination enables you to load data into **files stored locally** or in **cloud storage** solutions, making it an excellent choice for lightweight testing, prototyping, or file-based workflows.
|
||||
|
||||
Below is an **example** demonstrating how to use the `filesystem` destination to load data in **Parquet** format:
|
||||
|
||||
* Step 1: Set up a local bucket or cloud directory for storing files
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(os):
|
||||
os.environ["BUCKET_URL"] = "./content"
|
||||
return
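

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    The `BUCKET_URL` environment variable is just one way to configure the location. As a sketch, you could also pass it explicitly through the destination factory (or set `bucket_url` under `[destination.filesystem]` in `config.toml`); the pipeline name below is hypothetical so it does not clash with the one used in this lesson:

    ```python
    import dlt
    from dlt.destinations import filesystem as filesystem_destination

    explicit_fs_pipeline = dlt.pipeline(
        pipeline_name="fs_pipeline_explicit",
        destination=filesystem_destination(bucket_url="./content"),
        dataset_name="fs_data",
    )
    ```
    """)
    return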
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""* Step 2: Define the data source""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, sql_database):
|
||||
source = sql_database(
|
||||
"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
|
||||
table_names=["family"],
|
||||
)
|
||||
pipeline = dlt.pipeline(
|
||||
pipeline_name="fs_pipeline", destination="filesystem", dataset_name="fs_data"
|
||||
)
|
||||
_load_info = pipeline.run(source, loader_file_format="parquet")
|
||||
print(_load_info)
|
||||
return pipeline, source
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Look at the files:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(["ls", "./content/fs_data/family"], check=True)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Look at the loaded data:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
# explore loaded data
|
||||
pipeline.dataset().family.df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Table formats: [Delta tables & Iceberg](https://dlthub.com/docs/dlt-ecosystem/destinations/delta-iceberg)**
|
||||
|
||||
dlt supports writing **Delta** and **Iceberg** tables when using the `filesystem` destination.
|
||||
|
||||
**How it works:**
|
||||
|
||||
dlt uses the `deltalake` and `pyiceberg` libraries to write Delta and Iceberg tables, respectively. One or multiple Parquet files are prepared during the extract and normalize steps. In the load step, these Parquet files are exposed as an Arrow data structure and fed into `deltalake` or `pyiceberg`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline, source):
|
||||
_load_info = pipeline.run(
|
||||
source, loader_file_format="parquet", table_format="iceberg"
|
||||
)
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**Note:**
|
||||
|
||||
The open-source version of dlt supports basic functionality for **Iceberg**, but the dltHub team is currently working on an **extended** and **more powerful** Iceberg integration.
|
||||
|
||||
[Join the waiting list to learn more about dltHub and Iceberg.](https://info.dlthub.com/waiting-list)
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **Spoiler: Custom Sources & Destinations**
|
||||
|
||||
`dlt` aims to simplify the process of creating both custom sources
|
||||
([REST API Client](https://dlthub.com/docs/general-usage/http/rest-client),
|
||||
[`rest_api` source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api))
|
||||
and [custom destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/destination).
|
||||
|
||||
We will explore this topic in more detail in the next Advanced course.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,14 +6,14 @@
|
||||
"id": "h93BcC8SX2fj"
|
||||
},
|
||||
"source": [
|
||||
"# **Recap of [Lesson 4](https://colab.research.google.com/drive/1mfqZulsuFDc7h27d6joe2_Dduvl1uM-2#forceEdit=true&sandboxMode=true) 👩💻🚀**\n",
|
||||
"# **Recap of [Lesson 4](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) 👩💻🚀**\n",
|
||||
"\n",
|
||||
"1. Listed all available verified sources;\n",
|
||||
"2. Initialized `github_api` verified source;\n",
|
||||
"3. Explored built-in `rest_api` source.\n",
|
||||
"4. Explored built-in `sql_database` source.\n",
|
||||
"5. Explored built-in `filesystem` source.\n",
|
||||
"6. Learned how to switch between destinations."
|
||||
"1. Listed all available verified sources.\n",
|
||||
"2. Initialized the `github_api` verified source.\n",
|
||||
"3. Explored the built-in `rest_api` source.\n",
|
||||
"4. Explored the built-in `sql_database` source.\n",
|
||||
"5. Explored the built-in `filesystem` source.\n",
|
||||
"6. Learned how to switch between destinations.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -24,7 +24,7 @@
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"# **Write Disposition and Incremental Loading** ⚙️🧠 [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)\n",
|
||||
"# **Write Disposition and Incremental Loading** ⚙️🧠 [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"**Here, you will learn:**\n",
|
||||
@@ -52,19 +52,17 @@
|
||||
"id": "5ThZzzAwqLnn"
|
||||
},
|
||||
"source": [
|
||||
"Write disposition in the context of the dlt library defines how the data should be written to the destination. There are three types of write dispositions:\n",
|
||||
"A **write disposition** in the context of the `dlt` library defines how data should be written to the destination. There are three types:\n",
|
||||
"\n",
|
||||
"* **Append**: This is the **default** disposition. It will append the data to the existing data in the destination.\n",
|
||||
"- **Append**: The **default** disposition. It appends new data to the existing data in the destination.\n",
|
||||
"\n",
|
||||
"* **Replace**: This disposition replaces the data in the destination with the data from the resource. It **deletes** all the data and **recreates** the schema before loading the data.\n",
|
||||
"- **Replace**: This disposition replaces all existing data at the destination with the new data from the resource. It **deletes** all previous data and **recreates** the schema before loading.\n",
|
||||
"\n",
|
||||
"* **Merge**: This write disposition merges the data from the resource with the data at the destination. For the merge disposition, you need to specify a `primary_key` for the resource.\n",
|
||||
"- **Merge**: This disposition merges incoming data with existing data at the destination. For `merge`, you must specify a `primary_key` for the resource.\n",
|
||||
"\n",
|
||||
"The write disposition you choose depends on the dataset and how you can extract it. For more details, you can refer to the [Incremental loading page](https://dlthub.com/docs/general-usage/incremental-loading).\n",
|
||||
"The choice of write disposition depends on your dataset and how you extract it. For more details, refer to the [Incremental loading page](https://dlthub.com/docs/general-usage/incremental-loading).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"A `write_disposition` in `dlt` can specified in the resource decorator:\n",
|
||||
"You can specify a `write_disposition` in the resource decorator:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"@dlt.resource(write_disposition=\"append\")\n",
|
||||
@@ -79,25 +77,7 @@
|
||||
"load_info = pipeline.run(my_resource, write_disposition=\"replace\")\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"> In case you specify both, the write disposition specified at the pipeline run level will override the write disposition specified at the resource level."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "SpEU7xzw9lZL"
|
||||
},
|
||||
"source": [
|
||||
"### **0. Install dlt**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "Su4oUJelKaZY"
|
||||
},
|
||||
"source": [
|
||||
"Install `dlt` with DuckDB as a destination as per usual:"
|
||||
"> If both are specified, the write disposition at the pipeline run level overrides the one set at the resource level."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -128,7 +108,7 @@
|
||||
"id": "5IpPPDpVrU75"
|
||||
},
|
||||
"source": [
|
||||
"As we already have said `append` is a default loading behavior. Now we will explore how this write disposition works."
|
||||
"As we have already said, `append` is the default loading behavior. Now we will explore how this write disposition works."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -162,7 +142,7 @@
|
||||
"id": "CltUh8t6rGUP"
|
||||
},
|
||||
"source": [
|
||||
"We create dlt pipeline as usual and load this data into DuckDB."
|
||||
"We create a `dlt` pipeline as usual and load this data into DuckDB."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -179,23 +159,23 @@
|
||||
"\n",
|
||||
"@dlt.resource(\n",
|
||||
" name=\"pokemon\",\n",
|
||||
" write_disposition=\"append\", # <--- add new argument into decorator\n",
|
||||
" write_disposition=\"append\",\n",
|
||||
")\n",
|
||||
"def pokemon() -> TDataItems:\n",
|
||||
"def append_pokemon() -> TDataItems:\n",
|
||||
" yield data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"poke_pipeline\",\n",
|
||||
"append_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"append_poke_pipeline\",\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"pokemon_data\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = append_pipeline.run(append_pokemon)\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"# explore loaded data\n",
|
||||
"pipeline.dataset().pokemon.df()"
|
||||
"append_pipeline.dataset().pokemon.df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -204,9 +184,9 @@
|
||||
"id": "Wtz2oUpCs7Ay"
|
||||
},
|
||||
"source": [
|
||||
"Run this example **twice**, and you'll notice that each time a copy of the data is added to your tables. We call this load mode **append**. It is very useful.\n",
|
||||
"Run this example **twice**, and you'll notice that each time a copy of the data is added to your tables. We call this load mode **append**, and it is very useful.\n",
|
||||
"\n",
|
||||
"Example use case: when you have a new folder created daily with json file logs, and you want to ingest them incrementally."
|
||||
"Example use case: when you have a new folder created daily with JSON log files, and you want to ingest them incrementally.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -217,11 +197,11 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = append_pipeline.run(append_pokemon)\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"# explore loaded data\n",
|
||||
"pipeline.dataset().pokemon.df()"
|
||||
"append_pipeline.dataset().pokemon.df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -240,7 +220,7 @@
|
||||
"id": "Njz_qUcpDtTW"
|
||||
},
|
||||
"source": [
|
||||
"Perhaps this duplicated data is not what you want to get in your work projects. For example, if your data was updated, how we can refresh it in the database? One method is to tell dlt to **replace** the data in existing tables by using **write_disposition**."
|
||||
"Perhaps this duplicated data is not what you want in your work projects. For example, if your data was updated, how can we refresh it in the database? One way is to tell `dlt` to **replace** the data in the existing tables by using a **write_disposition**.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -256,23 +236,23 @@
|
||||
"\n",
|
||||
"@dlt.resource(\n",
|
||||
" name=\"pokemon\",\n",
|
||||
" write_disposition=\"replace\", # <--- change 'append' to 'replace'\n",
|
||||
" write_disposition=\"replace\",\n",
|
||||
")\n",
|
||||
"def pokemon() -> TDataItems:\n",
|
||||
"def replace_pokemon() -> TDataItems:\n",
|
||||
" yield data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"poke_pipeline\",\n",
|
||||
"replace_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"replace_poke_pipeline\",\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"pokemon_data\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = replace_pipeline.run(replace_pokemon)\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"# explore loaded data\n",
|
||||
"pipeline.dataset().pokemon.df()"
|
||||
"replace_pipeline.dataset().pokemon.df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -292,11 +272,11 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = replace_pipeline.run(replace_pokemon)\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"# explore loaded data\n",
|
||||
"pipeline.dataset().pokemon.df()"
|
||||
"replace_pipeline.dataset().pokemon.df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -305,7 +285,7 @@
|
||||
"id": "aPjezxijt_mz"
|
||||
},
|
||||
"source": [
|
||||
"TAADA! No duplicates, your data was [fully refreshed](https://dlthub.com/docs/general-usage/full-loading)."
|
||||
"TADA! No duplicates, your data was [fully refreshed](https://dlthub.com/docs/general-usage/full-loading)."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -364,24 +344,24 @@
|
||||
"\n",
|
||||
"@dlt.resource(\n",
|
||||
" name=\"pokemon\",\n",
|
||||
" write_disposition=\"merge\", # <--- change 'replace' to 'merge'\n",
|
||||
" primary_key=\"id\", # <--- add primary_key\n",
|
||||
" write_disposition=\"merge\",\n",
|
||||
" primary_key=\"id\",\n",
|
||||
")\n",
|
||||
"def pokemon() -> TDataItems:\n",
|
||||
"def merge_pokemon() -> TDataItems:\n",
|
||||
" yield data\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"merge_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"poke_pipeline_merge\",\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"pokemon_data\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = merge_pipeline.run(merge_pokemon)\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"# explore loaded data\n",
|
||||
"pipeline.dataset().pokemon.df()"
|
||||
"merge_pipeline.dataset().pokemon.df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -431,24 +411,24 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# We added `created_at` field to the data\n",
|
||||
"data = [\n",
|
||||
"created_data = [\n",
|
||||
" {\n",
|
||||
" \"id\": \"1\",\n",
|
||||
" \"name\": \"bulbasaur\",\n",
|
||||
" \"size\": {\"weight\": 6.9, \"height\": 0.7},\n",
|
||||
" \"created_at\": \"2024-12-01\", # <------- new field\n",
|
||||
" \"created_at\": \"2024-12-01\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"id\": \"4\",\n",
|
||||
" \"name\": \"charmander\",\n",
|
||||
" \"size\": {\"weight\": 8.5, \"height\": 0.6},\n",
|
||||
" \"created_at\": \"2024-09-01\", # <------- new field\n",
|
||||
" \"created_at\": \"2024-09-01\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"id\": \"25\",\n",
|
||||
" \"name\": \"pikachu\",\n",
|
||||
" \"size\": {\"weight\": 6, \"height\": 0.4},\n",
|
||||
" \"created_at\": \"2023-06-01\", # <------- new field\n",
|
||||
" \"created_at\": \"2023-06-01\",\n",
|
||||
" },\n",
|
||||
"]"
|
||||
]
|
||||
@@ -459,11 +439,11 @@
|
||||
"id": "EO63mHgE_Oya"
|
||||
},
|
||||
"source": [
|
||||
"**The goal**: Load only Pokémon caught after January 1, 2024, skipping the ones you already have.\n",
|
||||
"**The goal**: Load only Pokémons caught after January 1, 2024, skipping the ones you already have.\n",
|
||||
"\n",
|
||||
"### **Step 2: Defining the incremental logic**\n",
|
||||
"\n",
|
||||
"Using `dlt`, we set up an [incremental filter](https://www.google.com/url?q=https://dlthub.com/docs/general-usage/incremental-loading%23incremental-loading-with-a-cursor-field&sa=D&source=editors&ust=1734717286675253&usg=AOvVaw3rAF3y3p86sGt49ImCTgon) to only fetch Pokémon caught after a certain date:\n",
|
||||
"Using `dlt`, we set up an [incremental filter](https://www.google.com/url?q=https://dlthub.com/docs/general-usage/incremental-loading%23incremental-loading-with-a-cursor-field&sa=D&source=editors&ust=1734717286675253&usg=AOvVaw3rAF3y3p86sGt49ImCTgon) to only fetch Pokémons caught after a certain date:\n",
|
||||
"```python\n",
|
||||
"cursor_date = dlt.sources.incremental(\"created_at\", initial_value=\"2024-01-01\")\n",
|
||||
"```\n",
|
||||
@@ -489,12 +469,12 @@
|
||||
" name=\"pokemon\",\n",
|
||||
" write_disposition=\"append\",\n",
|
||||
")\n",
|
||||
"def pokemon(\n",
|
||||
"def incremental_pokemon(\n",
|
||||
" cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(\n",
|
||||
" \"created_at\", initial_value=\"2024-01-01\"\n",
|
||||
" )\n",
|
||||
") -> TDataItems:\n",
|
||||
" yield data"
|
||||
" yield created_data"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -524,17 +504,17 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"incremental_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"poke_pipeline_incremental\",\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"pokemon_data\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = incremental_pipeline.run(incremental_pokemon)\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"# explore loaded data\n",
|
||||
"pipeline.dataset().pokemon.df()"
|
||||
"incremental_pipeline.dataset().pokemon.df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -584,7 +564,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = incremental_pipeline.run(incremental_pokemon)\n",
|
||||
"print(load_info)"
|
||||
]
|
||||
},
|
||||
@@ -619,21 +599,21 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# We added `created_at` field to the data\n",
|
||||
"data = [\n",
|
||||
"# We added `updated_at` field to the data\n",
|
||||
"updated_data = [\n",
|
||||
" {\n",
|
||||
" \"id\": \"1\",\n",
|
||||
" \"name\": \"bulbasaur\",\n",
|
||||
" \"size\": {\"weight\": 6.9, \"height\": 0.7},\n",
|
||||
" \"created_at\": \"2024-12-01\",\n",
|
||||
" \"updated_at\": \"2024-12-01\", # <------- new field\n",
|
||||
" \"updated_at\": \"2024-12-01\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"id\": \"4\",\n",
|
||||
" \"name\": \"charmander\",\n",
|
||||
" \"size\": {\"weight\": 8.5, \"height\": 0.6},\n",
|
||||
" \"created_at\": \"2024-09-01\",\n",
|
||||
" \"updated_at\": \"2024-09-01\", # <------- new field\n",
|
||||
" \"updated_at\": \"2024-09-01\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"id\": \"25\",\n",
|
||||
@@ -641,9 +621,9 @@
|
||||
" \"size\": {\n",
|
||||
" \"weight\": 9,\n",
|
||||
" \"height\": 0.4,\n",
|
||||
" }, # <----- pikachu gained weight from 6 to 9\n",
|
||||
" },\n",
|
||||
" \"created_at\": \"2023-06-01\",\n",
|
||||
" \"updated_at\": \"2024-12-16\", # <------- new field, information about pikachu has updated\n",
|
||||
" \"updated_at\": \"2024-12-16\",\n",
|
||||
" },\n",
|
||||
"]"
|
||||
]
|
||||
@@ -670,14 +650,15 @@
|
||||
"\n",
|
||||
"@dlt.resource(\n",
|
||||
" name=\"pokemon\",\n",
|
||||
" write_disposition=\"merge\", # <--- change write disposition from 'append' to 'merge'\n",
|
||||
" primary_key=\"id\", # <--- set a primary key\n",
|
||||
" write_disposition=\"merge\",\n",
|
||||
" primary_key=\"id\",\n",
|
||||
")\n",
|
||||
"def pokemon(\n",
|
||||
"def dedup_pokemon(\n",
|
||||
" data: TDataItems,\n",
|
||||
" cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(\n",
|
||||
" \"updated_at\", initial_value=\"2024-01-01\"\n",
|
||||
" )\n",
|
||||
") -> TDataItems: # <--- change the cursor name from 'created_at' to 'updated_at'\n",
|
||||
" ),\n",
|
||||
") -> TDataItems:\n",
|
||||
" yield data"
|
||||
]
|
||||
},
|
||||
@@ -698,17 +679,17 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pipeline = dlt.pipeline(\n",
|
||||
"dedup_pipeline = dlt.pipeline(\n",
|
||||
" pipeline_name=\"poke_pipeline_dedup\",\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"pokemon_data\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = dedup_pipeline.run(dedup_pokemon(updated_data))\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"# explore loaded data\n",
|
||||
"pipeline.dataset().pokemon.df()"
|
||||
"dedup_pipeline.dataset().pokemon.df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -717,7 +698,7 @@
|
||||
"id": "omG1cgzcrqOs"
|
||||
},
|
||||
"source": [
|
||||
"All Pokémon are processed because this is the pipeline’s first run.\n",
|
||||
"All Pokémons are processed because this is the pipeline’s first run.\n",
|
||||
"\n",
|
||||
"Now, let’s say Pikachu goes to gym and sheds some weight (down to 7.5), and the `updated_at` field is set to `2024-12-23`."
|
||||
]
|
||||
@@ -730,8 +711,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# We added `created_at` field to the data\n",
|
||||
"data = [\n",
|
||||
"reupdated_data = [\n",
|
||||
" {\n",
|
||||
" \"id\": \"1\",\n",
|
||||
" \"name\": \"bulbasaur\",\n",
|
||||
@@ -749,9 +729,9 @@
|
||||
" {\n",
|
||||
" \"id\": \"25\",\n",
|
||||
" \"name\": \"pikachu\",\n",
|
||||
" \"size\": {\"weight\": 7.5, \"height\": 0.4}, # <--- pikachu lost weight\n",
|
||||
" \"size\": {\"weight\": 7.5, \"height\": 0.4},\n",
|
||||
" \"created_at\": \"2023-06-01\",\n",
|
||||
" \"updated_at\": \"2024-12-23\", # <--- data about his weight was updated a week later\n",
|
||||
" \"updated_at\": \"2024-12-23\",\n",
|
||||
" },\n",
|
||||
"]"
|
||||
]
|
||||
@@ -773,11 +753,11 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_info = pipeline.run(pokemon)\n",
|
||||
"load_info = dedup_pipeline.run(dedup_pokemon(reupdated_data))\n",
|
||||
"print(load_info)\n",
|
||||
"\n",
|
||||
"# explore loaded data\n",
|
||||
"pipeline.dataset().pokemon.df()"
|
||||
"dedup_pipeline.dataset().pokemon.df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -786,10 +766,10 @@
|
||||
"id": "u2hZHn_EowBd"
|
||||
},
|
||||
"source": [
|
||||
"**What happens?**\n",
|
||||
"**What happened?**\n",
|
||||
"\n",
|
||||
"* The pipeline detects that `updated_at` for Bulbasaur and Charmander hasn’t changed—they’re skipped.\n",
|
||||
"* Pikachu’s record is updated to reflect the latest weight.\n",
|
||||
"* The pipeline detected that `updated_at` for Bulbasaur and Charmander hasn’t changed—they’re skipped.\n",
|
||||
"* Pikachu’s record was updated to reflect the latest weight.\n",
|
||||
"\n",
|
||||
"You can see that the **`_dlt_load_id`** for Bulbasaur and Charmander remained the same, but for Pikachu it was changed since only the updated Pikachu data was loaded into the destination."
|
||||
]
|
||||
@@ -800,28 +780,17 @@
|
||||
"id": "pufZ_GWPxqEQ"
|
||||
},
|
||||
"source": [
|
||||
"The **`dlt.sources.incremental`** instance above has the next attributes:\n",
|
||||
"The **`dlt.sources.incremental`** instance above has the following attributes:\n",
|
||||
"\n",
|
||||
"* **`cursor_date.initial_value`** which is always equal to \"2024-01-01\" passed in the constructor;\n",
|
||||
"* **`cursor_date.start_value`** a maximum `updated_at` value from the previous run or the `initial_value` on the first run;\n",
|
||||
"* **`cursor_date.last_value`** a \"real-time\" `updated_at` value updated with each yielded item or page. Before the first yield, it equals `start_value`;\n",
|
||||
"* **`cursor_date.end_value`** (here not used) marking the end of the backfill range.\n",
|
||||
"* **`cursor_date.end_value`** (not used here) marking the end of the backfill range.\n",
|
||||
"\n",
|
||||
"## **Example**\n",
|
||||
"You can use them in the resource code to make **more efficient requests**. Take look at the GitHub API example:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "l4C_IFK7G4m9"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"exit() # we use exit() to reset all ENVs we set"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -838,10 +807,9 @@
|
||||
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
|
||||
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"os.environ[\"SOURCES__ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"dlt.secrets[\"SOURCES__ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.source\n",
|
||||
@@ -859,9 +827,7 @@
|
||||
" )\n",
|
||||
" ) -> TDataItems:\n",
|
||||
" params = {\n",
|
||||
" \"since\": (\n",
|
||||
" cursor_date.last_value\n",
|
||||
" ), # <--- use last_value to request only new data from API\n",
|
||||
" \"since\": (cursor_date.last_value),\n",
|
||||
" \"status\": \"open\",\n",
|
||||
" }\n",
|
||||
" for page in client.paginate(\"repos/dlt-hub/dlt/issues\", params=params):\n",
|
||||
@@ -885,9 +851,9 @@
|
||||
"id": "5d1J5DPX3Dn3"
|
||||
},
|
||||
"source": [
|
||||
"Pay attention how we use **since** GitHub API parameter and `cursor_date.last_value` to tell GitHub which issues we are interested in. `cursor_date.last_value` holds the last `cursor_date` value from the previous run.\n",
|
||||
"Pay attention to how we use the **since** GitHub API parameter and `cursor_date.last_value` to tell GitHub which issues we are interested in. `cursor_date.last_value` holds the last `cursor_date` value from the previous run.\n",
|
||||
"\n",
|
||||
"Run the pipeline again and make sure that **no data was loaded**."
|
||||
"Run the pipeline again and make sure that **no data is loaded**."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -934,12 +900,12 @@
|
||||
"\n",
|
||||
"Transform your GitHub API pipeline to use incremental loading. This means:\n",
|
||||
"\n",
|
||||
"* Implement new `dlt.resource` for `pulls/comments` (List comments for Pull Requests) endpoint.\n",
|
||||
"* Implement a new `dlt.resource` for `pulls/comments` (List comments for Pull Requests) endpoint.\n",
|
||||
"* Fetch only pulls comments updated after the last pipeline run.\n",
|
||||
"* Use the `updated_at` field from the GitHub API as the incremental cursor.\n",
|
||||
"* [Endpoint documentation](https://docs.github.com/en/rest/pulls/comments?apiVersion=2022-11-28#list-review-comments-in-a-repository)\n",
|
||||
"* Endpoint URL: `https://api.github.com/repos/OWNER/REPO/pulls/comments`\n",
|
||||
"* Use `since` parameter - only show results that were last updated after the given time - and `last_value`.\n",
|
||||
"* Use the `since` parameter - only show results that were last updated after the given time - and `last_value`.\n",
|
||||
"* `initial_value` is `2024-12-01`.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -954,17 +920,8 @@
|
||||
"id": "NYbccmLie1zm"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1geSMNRkSwAelQJKd3e8vdoHCKiHMdmIo#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "SVyiG5wRVo1B"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@@ -0,0 +1,743 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt[duckdb]",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **Recap of [Lesson 4](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) 👩💻🚀**
|
||||
|
||||
1. Listed all available verified sources.
|
||||
2. Initialized the `github_api` verified source.
|
||||
3. Explored the built-in `rest_api` source.
|
||||
4. Explored the built-in `sql_database` source.
|
||||
5. Explored the built-in `filesystem` source.
|
||||
6. Learned how to switch between destinations.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# **Write Disposition and Incremental Loading** ⚙️🧠 [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)
|
||||
|
||||
|
||||
**Here, you will learn:**
|
||||
- `dlt` write dispositions:
|
||||
- Append
|
||||
- Replace
|
||||
- Merge
|
||||
- What incremental loading is
|
||||
- How to update and deduplicate your data
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **`dlt` write dispositions**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
A **write disposition** in the context of the `dlt` library defines how data should be written to the destination. There are three types:
|
||||
|
||||
- **Append**: The **default** disposition. It appends new data to the existing data in the destination.
|
||||
|
||||
- **Replace**: This disposition replaces all existing data at the destination with the new data from the resource. It **deletes** all previous data and **recreates** the schema before loading.
|
||||
|
||||
- **Merge**: This disposition merges incoming data with existing data at the destination. For `merge`, you must specify a `primary_key` for the resource.
|
||||
|
||||
The choice of write disposition depends on your dataset and how you extract it. For more details, refer to the [Incremental loading page](https://dlthub.com/docs/general-usage/incremental-loading).
|
||||
|
||||
You can specify a `write_disposition` in the resource decorator:
|
||||
|
||||
```python
|
||||
@dlt.resource(write_disposition="append")
|
||||
def my_resource():
|
||||
...
|
||||
yield data
|
||||
```
|
||||
|
||||
Or directly in the pipeline run:
|
||||
|
||||
```python
|
||||
load_info = pipeline.run(my_resource, write_disposition="replace")
|
||||
```
|
||||
|
||||
> If both are specified, the write disposition at the pipeline run level overrides the one set at the resource level.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **1. Append**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""As we have already said, `append` is the default loading behavior. Now we will explore how this write disposition works."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Let's remember our Quick Start data sample with pokemons:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
# Sample data containing pokemon details
|
||||
data = [
|
||||
{"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}},
|
||||
{"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}},
|
||||
{"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}},
|
||||
]
|
||||
return (data,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""We create a `dlt` pipeline as usual and load this data into DuckDB."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data):
|
||||
import dlt
|
||||
from dlt.common.typing import TDataItems
|
||||
|
||||
@dlt.resource(name="pokemon", write_disposition="append")
|
||||
def append_pokemon() -> TDataItems:
|
||||
yield data
|
||||
|
||||
append_pipeline = dlt.pipeline(
|
||||
pipeline_name="append_poke_pipeline",
|
||||
destination="duckdb",
|
||||
dataset_name="pokemon_data",
|
||||
)
|
||||
_load_info = append_pipeline.run(append_pokemon)
|
||||
print(_load_info)
|
||||
# explore loaded data
|
||||
append_pipeline.dataset().pokemon.df()
|
||||
return TDataItems, append_pipeline, append_pokemon, dlt
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Run this example **twice**, and you'll notice that each time a copy of the data is added to your tables. We call this load mode **append**, and it is very useful.
|
||||
|
||||
Example use case: when you have a new folder created daily with JSON log files, and you want to ingest them incrementally.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(append_pipeline, append_pokemon):
|
||||
_load_info = append_pipeline.run(append_pokemon)
|
||||
print(_load_info)
|
||||
# explore loaded data
|
||||
append_pipeline.dataset().pokemon.df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **2. Replace**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Perhaps this duplicated data is not what you want in your work projects. For example, if your data was updated, how can we refresh it in the database? One way is to tell `dlt` to **replace** the data in the existing tables by using a **write_disposition**."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, data, dlt):
|
||||
@dlt.resource(name="pokemon", write_disposition="replace")
|
||||
def replace_pokemon() -> TDataItems:
|
||||
yield data
|
||||
|
||||
replace_pipeline = dlt.pipeline(
|
||||
pipeline_name="replace_poke_pipeline",
|
||||
destination="duckdb",
|
||||
dataset_name="pokemon_data",
|
||||
)
|
||||
_load_info = replace_pipeline.run(replace_pokemon)
|
||||
print(_load_info)
|
||||
replace_pipeline.dataset().pokemon.df()
|
||||
return replace_pipeline, replace_pokemon
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Run it again:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(replace_pipeline, replace_pokemon):
|
||||
_load_info = replace_pipeline.run(replace_pokemon)
|
||||
print(_load_info)
|
||||
# explore loaded data
|
||||
replace_pipeline.dataset().pokemon.df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""TADA! No duplicates, your data was [fully refreshed](https://dlthub.com/docs/general-usage/full-loading)."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **3. [Merge](https://dlthub.com/docs/general-usage/incremental-loading#merge-incremental-loading)**
|
||||
|
||||
Consider a scenario where the data in the source has been updated, but you want to avoid reloading the entire dataset.
|
||||
|
||||
|
||||
|
||||
Merge write disposition is used to merge new data into the destination, using a `merge_key` and/or **deduplicating**/**upserting** new data using a `primary_key`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The **merge** write disposition can be useful in several situations:
|
||||
|
||||
1. If you have a dataset where records are frequently updated and you want to reflect these changes in your database, the `merge` write disposition can be used. It will **update the existing records** with the new data instead of creating duplicate entries.
|
||||
|
||||
2. If your data source occasionally sends **duplicate records**, the merge write disposition can help handle this. It uses a `primary_key` to identify unique records, so if a duplicate record (with the same `primary_key`) is encountered, it will be merged with the existing record instead of creating a new one.
|
||||
|
||||
3. If you are dealing with **Slowly Changing Dimensions** (SCD) where the attribute of a record changes over time and you want to maintain a history of these changes, you can use the `merge` write disposition with the scd2 strategy.
|
||||
|
||||
|
||||
When using the merge disposition, you need to specify a `primary_key` or `merge_key` for the resource.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, data, dlt):
|
||||
@dlt.resource(name="pokemon", write_disposition="merge", primary_key="id")
|
||||
def merge_pokemon() -> TDataItems:
|
||||
yield data
|
||||
|
||||
merge_pipeline = dlt.pipeline(
|
||||
pipeline_name="poke_pipeline_merge",
|
||||
destination="duckdb",
|
||||
dataset_name="pokemon_data",
|
||||
)
|
||||
_load_info = merge_pipeline.run(merge_pokemon)
|
||||
print(_load_info)
|
||||
merge_pipeline.dataset().pokemon.df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The merge write disposition can be used with three different strategies:
|
||||
|
||||
* delete-insert (default strategy)
|
||||
* scd2
|
||||
* upsert
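
For example, a non-default strategy can be selected by passing a dictionary as the write disposition. This is a minimal sketch reusing the `data` sample from above; the dictionary form follows the `dlt` merge-strategy docs, so verify it against your `dlt` version:

```python
@dlt.resource(
    name="pokemon",
    primary_key="id",
    write_disposition={"disposition": "merge", "strategy": "upsert"},
)
def upsert_pokemon():
    yield data
```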
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Incremental Loading**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Incremental loading is the act of loading only new or changed data and not old records that we already loaded.
|
||||
|
||||
Imagine you’re a Pokémon trainer trying to catch ‘em all. You don’t want to keep visiting the same old PokéStops, catching the same old Bulbasaurs—you only want to find new and exciting Pokémon that have appeared since your last trip. That’s what incremental loading is all about: collecting only the new data that’s been added or changed, without wasting your Poké Balls (or database resources) on what you already have.
|
||||
|
||||
In this example, we have a dataset of Pokémon, each with a **unique ID**, their **name**, **size** (height and weight), and **when** they were "caught" (`created_at` field).
|
||||
|
||||
### **Step 1: Adding the `created_at` Field**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
# We added `created_at` field to the data
|
||||
created_data = [
|
||||
{
|
||||
"id": "1",
|
||||
"name": "bulbasaur",
|
||||
"size": {"weight": 6.9, "height": 0.7},
|
||||
"created_at": "2024-12-01",
|
||||
},
|
||||
{
|
||||
"id": "4",
|
||||
"name": "charmander",
|
||||
"size": {"weight": 8.5, "height": 0.6},
|
||||
"created_at": "2024-09-01",
|
||||
},
|
||||
{
|
||||
"id": "25",
|
||||
"name": "pikachu",
|
||||
"size": {"weight": 6, "height": 0.4},
|
||||
"created_at": "2023-06-01",
|
||||
},
|
||||
]
|
||||
return (created_data,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**The goal**: Load only Pokémons caught after January 1, 2024, skipping the ones you already have.
|
||||
|
||||
### **Step 2: Defining the incremental logic**
|
||||
|
||||
Using `dlt`, we set up an [incremental filter](https://dlthub.com/docs/general-usage/incremental-loading#incremental-loading-with-a-cursor-field) to only fetch Pokémons caught after a certain date:
|
||||
```python
|
||||
cursor_date = dlt.sources.incremental("created_at", initial_value="2024-01-01")
|
||||
```
|
||||
This tells `dlt`:
|
||||
- **Start date**: January 1, 2024 (`initial_value`).
|
||||
- **Field to track**: `created_at` (our timestamp).
|
||||
|
||||
As you run the pipeline repeatedly, `dlt` will keep track of the latest `created_at` value processed. It will skip records older than this date in future runs.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, created_data, dlt):
|
||||
@dlt.resource(name="pokemon", write_disposition="append")
|
||||
def incremental_pokemon(
|
||||
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
|
||||
"created_at", initial_value="2024-01-01"
|
||||
)
|
||||
) -> TDataItems:
|
||||
yield created_data
|
||||
return (incremental_pokemon,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""We use the `@dlt.resource` decorator to declare table **name** to which data will be loaded and **write disposition**, which is **append** by default."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Step 3: Running the pipeline**
|
||||
Finally, we run our pipeline and load the fresh Pokémon data:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt, incremental_pokemon):
|
||||
incremental_pipeline = dlt.pipeline(
|
||||
pipeline_name="poke_pipeline_incremental",
|
||||
destination="duckdb",
|
||||
dataset_name="pokemon_data",
|
||||
)
|
||||
_load_info = incremental_pipeline.run(incremental_pokemon)
|
||||
print(_load_info)
|
||||
# explore loaded data
|
||||
incremental_pipeline.dataset().pokemon.df()
|
||||
return (incremental_pipeline,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
This:
|
||||
1. Loads **only Charmander and Bulbasaur** (caught after 2024-01-01).
|
||||
2. Skips Pikachu because it’s old news.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Only data for 2024 year was loaded.""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Run the same pipeline again. The pipeline will detect that there are **no new records** based on the `created_at` field and the incremental cursor. As a result, **no new data will be loaded** into the destination:
|
||||
>0 load package(s) were loaded
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(incremental_pipeline, incremental_pokemon):
|
||||
_load_info = incremental_pipeline.run(incremental_pokemon)
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### **Why incremental loading matters**
|
||||
|
||||
* **Efficiency**. Skip redundant data, saving time and resources.
|
||||
* **Scalability**. Handle growing datasets without bottlenecks.
|
||||
* **Automation**. Let the tool track changes for you—no manual effort.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **Update and deduplicate your data**
|
||||
The script above finds new pokemons and adds them to the database. It will ignore any updates to records that were already loaded.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
# We added `updated_at` field to the data
|
||||
updated_data = [
|
||||
{
|
||||
"id": "1",
|
||||
"name": "bulbasaur",
|
||||
"size": {"weight": 6.9, "height": 0.7},
|
||||
"created_at": "2024-12-01",
|
||||
"updated_at": "2024-12-01",
|
||||
},
|
||||
{
|
||||
"id": "4",
|
||||
"name": "charmander",
|
||||
"size": {"weight": 8.5, "height": 0.6},
|
||||
"created_at": "2024-09-01",
|
||||
"updated_at": "2024-09-01",
|
||||
},
|
||||
{
|
||||
"id": "25",
|
||||
"name": "pikachu",
|
||||
"size": {
|
||||
"weight": 9,
|
||||
"height": 0.4,
|
||||
},
|
||||
"created_at": "2023-06-01",
|
||||
"updated_at": "2024-12-16",
|
||||
},
|
||||
]
|
||||
return (updated_data,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Get always fresh content of all the pokemons: combine an **incremental load** with **merge** write disposition, like in the script below."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, dlt):
|
||||
@dlt.resource(name="pokemon", write_disposition="merge", primary_key="id")
|
||||
def dedup_pokemon(
|
||||
data: TDataItems,
|
||||
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
|
||||
"updated_at", initial_value="2024-01-01"
|
||||
),
|
||||
) -> TDataItems:
|
||||
yield data
|
||||
return (dedup_pokemon,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""The incremental cursor keeps an eye on the `updated_at` field. Every time the pipeline runs, it only processes records with `updated_at` values greater than the last run."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dedup_pokemon, dlt, updated_data):
|
||||
dedup_pipeline = dlt.pipeline(
|
||||
pipeline_name="poke_pipeline_dedup",
|
||||
destination="duckdb",
|
||||
dataset_name="pokemon_data",
|
||||
)
|
||||
_load_info = dedup_pipeline.run(dedup_pokemon(updated_data))
|
||||
print(_load_info)
|
||||
# explore loaded data
|
||||
dedup_pipeline.dataset().pokemon.df()
|
||||
return (dedup_pipeline,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
All Pokémons are processed because this is the pipeline’s first run.
|
||||
|
||||
Now, let’s say Pikachu goes to the gym and sheds some weight (down to 7.5), and the `updated_at` field is set to `2024-12-23`.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
reupdated_data = [
|
||||
{
|
||||
"id": "1",
|
||||
"name": "bulbasaur",
|
||||
"size": {"weight": 6.9, "height": 0.7},
|
||||
"created_at": "2024-12-01",
|
||||
"updated_at": "2024-12-01",
|
||||
},
|
||||
{
|
||||
"id": "4",
|
||||
"name": "charmander",
|
||||
"size": {"weight": 8.5, "height": 0.6},
|
||||
"created_at": "2024-09-01",
|
||||
"updated_at": "2024-09-01",
|
||||
},
|
||||
{
|
||||
"id": "25",
|
||||
"name": "pikachu",
|
||||
"size": {"weight": 7.5, "height": 0.4},
|
||||
"created_at": "2023-06-01",
|
||||
"updated_at": "2024-12-23",
|
||||
},
|
||||
]
|
||||
return (reupdated_data,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Run the same pipeline:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dedup_pipeline, dedup_pokemon, reupdated_data):
|
||||
_load_info = dedup_pipeline.run(dedup_pokemon(reupdated_data))
|
||||
print(_load_info)
|
||||
# explore loaded data
|
||||
dedup_pipeline.dataset().pokemon.df()
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**What happened?**
|
||||
|
||||
* The pipeline detected that `updated_at` for Bulbasaur and Charmander hasn’t changed—they’re skipped.
|
||||
* Pikachu’s record was updated to reflect the latest weight.
|
||||
|
||||
You can see that the **`_dlt_load_id`** for Bulbasaur and Charmander remained the same, but for Pikachu it was changed since only the updated Pikachu data was loaded into the destination.
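
To check this yourself, project just the name and load id columns from the dataset (a small sketch using the `dedup_pipeline` defined above):

```python
# each row keeps the id of the load package that last wrote it;
# Pikachu's _dlt_load_id should differ from Bulbasaur's and Charmander's
dedup_pipeline.dataset().pokemon.df()[["name", "_dlt_load_id"]]
```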
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The **`dlt.sources.incremental`** instance above has the following attributes:
|
||||
|
||||
* **`cursor_date.initial_value`** which is always equal to "2024-01-01" passed in the constructor;
|
||||
* **`cursor_date.start_value`** the maximum `updated_at` value from the previous run or the `initial_value` on the first run;
|
||||
* **`cursor_date.last_value`** a "real-time" `updated_at` value updated with each yielded item or page. Before the first yield, it equals `start_value`;
|
||||
* **`cursor_date.end_value`** (not used here) marking the end of the backfill range.
|
||||
|
||||
## **Example**
|
||||
You can use them in the resource code to make **more efficient requests**. Take a look at the GitHub API example:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TDataItems, dlt, os):
|
||||
from typing import Iterable
|
||||
from dlt.extract import DltResource
|
||||
from dlt.sources.helpers import requests
|
||||
from dlt.sources.helpers.rest_client import RESTClient
|
||||
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
|
||||
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator
|
||||
|
||||
dlt.secrets["SOURCES__ACCESS_TOKEN"] = os.getenv("SECRET_KEY")
|
||||
|
||||
@dlt.source
|
||||
def github_source(access_token: str = dlt.secrets.value) -> Iterable[DltResource]:
|
||||
client = RESTClient(
|
||||
base_url="https://api.github.com",
|
||||
auth=BearerTokenAuth(token=access_token),
|
||||
paginator=HeaderLinkPaginator(),
|
||||
)
|
||||
|
||||
@dlt.resource(name="issues", write_disposition="merge", primary_key="id")
|
||||
def github_issues(
|
||||
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
|
||||
"updated_at", initial_value="2024-12-01"
|
||||
)
|
||||
) -> TDataItems:
|
||||
params = {"since": cursor_date.last_value, "status": "open"}
|
||||
for page in client.paginate("repos/dlt-hub/dlt/issues", params=params):
|
||||
yield page
|
||||
|
||||
return github_issues
|
||||
|
||||
pipeline = dlt.pipeline(pipeline_name="github_incr", destination="duckdb")
|
||||
_load_info = pipeline.run(github_source())
|
||||
print(_load_info)
|
||||
return github_source, pipeline
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Pay attention to how we use the **since** GitHub API parameter and `cursor_date.last_value` to tell GitHub which issues we are interested in. `cursor_date.last_value` holds the last `cursor_date` value from the previous run.
|
||||
|
||||
Run the pipeline again and make sure that **no data is loaded**.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(github_source, pipeline):
|
||||
# run the pipeline with the new resource
|
||||
_load_info = pipeline.run(github_source())
|
||||
print(_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **Apply Hints**
|
||||
|
||||
Alternatively, you can use `apply_hints` on a resource to define an incremental field:
|
||||
|
||||
```python
|
||||
resource = resource()
|
||||
resource.apply_hints(incremental=dlt.sources.incremental("updated_at"))
|
||||
```
|
||||
|
||||
When you apply an incremental hint using `apply_hints`, the source still performs a full extract. The incremental hint is used by `dlt` to filter the data after it has been extracted, before it is loaded into the destination.
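
As a concrete sketch (reusing the `append_pokemon` resource and `append_pipeline` defined earlier in this lesson), the hint can be attached to a resource instance before running the pipeline:

```python
resource = append_pokemon()
resource.apply_hints(
    incremental=dlt.sources.incremental("created_at", initial_value="2024-01-01")
)
load_info = append_pipeline.run(resource)
```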
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **Exercise 1: Make the GitHub API pipeline incremental**
|
||||
|
||||
In the previous lessons, you built a pipeline to pull data from the GitHub API. Now, let’s level it up by making it incremental, so it fetches only new or updated data.
|
||||
|
||||
|
||||
Transform your GitHub API pipeline to use incremental loading. This means:
|
||||
|
||||
* Implement a new `dlt.resource` for `pulls/comments` (List comments for Pull Requests) endpoint.
|
||||
* Fetch only pull request comments updated after the last pipeline run.
|
||||
* Use the `updated_at` field from the GitHub API as the incremental cursor.
|
||||
* [Endpoint documentation](https://docs.github.com/en/rest/pulls/comments?apiVersion=2022-11-28#list-review-comments-in-a-repository)
|
||||
* Endpoint URL: `https://api.github.com/repos/OWNER/REPO/pulls/comments`
|
||||
* Use the `since` parameter - only show results that were last updated after the given time - and `last_value`.
|
||||
* `initial_value` is `2024-12-01`.
|
||||
|
||||
|
||||
### Question
|
||||
|
||||
How many columns does the `comments` table have?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,14 +6,14 @@
|
||||
"id": "h93BcC8SX2fj"
|
||||
},
|
||||
"source": [
|
||||
"# **Recap of [Lesson 5](https://colab.research.google.com/drive/1Zf24gIVMNNj9j-gtXFl8p0orI9ttySDn#forceEdit=true&sandboxMode=true) 👩💻🚀**\n",
|
||||
"# **Recap of [Lesson 5](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) 👩💻🚀**\n",
|
||||
"\n",
|
||||
"1. Explored 3 dlt write dispositions:\n",
|
||||
" * append;\n",
|
||||
" * replace;\n",
|
||||
" * merge.\n",
|
||||
"2. Learned how to update and depuplicate data\n",
|
||||
"3. Created incremental pipeline\n"
|
||||
"1. Explored 3 `dlt` write dispositions: \n",
|
||||
" - append \n",
|
||||
" - replace \n",
|
||||
" - merge \n",
|
||||
"2. Learned how to update and deduplicate data \n",
|
||||
"3. Created an incremental pipeline\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -22,16 +22,16 @@
|
||||
"id": "26boldDvOn0R"
|
||||
},
|
||||
"source": [
|
||||
"# **How dlt works** 🧠🧠 [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)\n",
|
||||
"# **How dlt works** 🧠🧠 [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"**Here, you will learn:**\n",
|
||||
"- Three main steps:\n",
|
||||
" - Extract;\n",
|
||||
" - Normalize;\n",
|
||||
" - Load. \n",
|
||||
"- Some default behaviour.\n",
|
||||
"- About file formats."
|
||||
"- The 3 main steps of a pipeline run: \n",
|
||||
" - Extract \n",
|
||||
" - Normalize \n",
|
||||
" - Load \n",
|
||||
"- Some default behaviors \n",
|
||||
"- Supported file formats"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -42,7 +42,7 @@
|
||||
"source": [
|
||||
"## **Introduction**\n",
|
||||
"\n",
|
||||
"The main building block of dlt is the **pipeline**, which orchestrates the loading of data from your source into your destination in three discrete steps when you call its **run** method."
|
||||
"The main building block of `dlt` is the **pipeline**, which orchestrates the loading of data from your source into your destination in three discrete steps when you call its **run** method."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -60,8 +60,8 @@
|
||||
"id": "Xh6CKQATb63X"
|
||||
},
|
||||
"source": [
|
||||
"# **Understing `pipeline.run()`**\n",
|
||||
" The `pipeline.run()` method executes the entire pipeline, encompassing the [`extract`](#scrollTo=4C0U1dnwZxAB), [`normalize`](#scrollTo=bCeUqaW_cRSh), and [`load`](#scrollTo=Rn6cUc0OcWsk) stages."
|
||||
"# **Understanding `pipeline.run()`**\n",
|
||||
" The `pipeline.run()` method executes the entire pipeline, encompassing the `extract`, `normalize`, and `load` stages."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -89,7 +89,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture\n",
|
||||
"!pip install -U dlt"
|
||||
"!pip install dlt"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -136,14 +136,14 @@
|
||||
"The `progress=\"log\"` argument in the `dlt.pipeline` configuration enables detailed logging of the pipeline’s progress during execution. These logs provide visibility into the pipeline’s operations, showing how data flows through the **Extract**, **Normalize**, and **Load** phases. The logs include real-time metrics such as resource or file counts, time elapsed, processing rates, memory usage, and CPU utilization.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"dlt supports 4 progress monitors out of the box:\n",
|
||||
"`dlt` supports 4 progress monitors out of the box:\n",
|
||||
"\n",
|
||||
"* `enlighten` - a status bar with progress bars that also allows for logging.\n",
|
||||
"* `tqdm` - the most popular Python progress bar lib, proven to work in Notebooks.\n",
|
||||
"* `alive_progress` - with the most fancy animations.\n",
|
||||
"* `log` - dumps the progress information to log, console, or text stream. the most useful on production optionally adds memory and CPU usage stats.\n",
|
||||
"* `log` — dumps progress information to a log, console, or text stream; most useful in production, and can optionally include memory and CPU usage stats.\n",
|
||||
"\n",
|
||||
"For more information read the [official documentation](https://dlthub.com/docs/general-usage/pipeline#display-the-loading-progress)."
|
||||
"For more information read the [official documentation](https://dlthub.com/docs/general-usage/pipeline#monitor-the-loading-progress)."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -154,7 +154,7 @@
|
||||
"source": [
|
||||
"## **Extract**\n",
|
||||
"\n",
|
||||
"Extract can be run individually with the `extract` command on the pipeline:\n",
|
||||
"Extract can be run individually with the `extract` method on the pipeline:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"pipeline.extract(data)\n",
|
||||
@@ -712,17 +712,8 @@
|
||||
"id": "NYbccmLie1zm"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "rZpSep8SV1SZ"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
663
docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py
Normal file
@@ -0,0 +1,663 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **Recap of [Lesson 5](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) 👩💻🚀**
|
||||
|
||||
1. Explored 3 `dlt` write dispositions:
|
||||
- append
|
||||
- replace
|
||||
- merge
|
||||
2. Learned how to update and deduplicate data
|
||||
3. Created an incremental pipeline
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **How dlt works** 🧠🧠 [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)
|
||||
|
||||
|
||||
**Here, you will learn:**
|
||||
- The 3 main steps of a pipeline run:
|
||||
- Extract
|
||||
- Normalize
|
||||
- Load
|
||||
- Some default behaviors
|
||||
- Supported file formats
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **Introduction**
|
||||
|
||||
The main building block of `dlt` is the **pipeline**, which orchestrates the loading of data from your source into your destination in three discrete steps when you call its **run** method.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""So, let's take a step back and walk through the internal steps of `pipeline.run()`, identifying methods to optimize each one."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **Understanding `pipeline.run()`**
|
||||
The `pipeline.run()` method executes the entire pipeline, encompassing the `extract`, `normalize`, and `load` stages.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Consider this intentionally short example:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import dlt
|
||||
|
||||
pipeline = dlt.pipeline(
|
||||
pipeline_name="my_pipeline", destination="duckdb", progress="log"
|
||||
)
|
||||
|
||||
load_info = pipeline.run(
|
||||
[
|
||||
{"id": 1},
|
||||
{"id": 2},
|
||||
{"id": 3, "nested": [{"id": 1}, {"id": 2}]},
|
||||
],
|
||||
table_name="items",
|
||||
)
|
||||
print(load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
This is what happens when the `run` method is executed:
|
||||
|
||||
1. **Extract** - Fully extracts the data from your source to your hard drive. In the example above, an implicit source with one resource with 3 items is created and extracted.
|
||||
2. **Normalize** - Inspects and normalizes your data and computes a schema compatible with your destination. For the example above, the normalizer will detect one column `id` of type `int` in one table named `items`, it will furthermore detect a nested list in table items and unnest it into a child table named `items__nested`.
|
||||
3. **Load** - Runs schema migrations if necessary on your destination and loads your data into the destination. For the example above, a new dataset on a local duckdb database is created that contains the two tables discovered in the previous steps.
|
||||
|
||||
|
||||
## **Display the loading progress**
|
||||
Notice how we use `progress="log"` here.
|
||||
|
||||
The `progress="log"` argument in the `dlt.pipeline` configuration enables detailed logging of the pipeline’s progress during execution. These logs provide visibility into the pipeline’s operations, showing how data flows through the **Extract**, **Normalize**, and **Load** phases. The logs include real-time metrics such as resource or file counts, time elapsed, processing rates, memory usage, and CPU utilization.
|
||||
|
||||
|
||||
`dlt` supports 4 progress monitors out of the box:
|
||||
|
||||
* `enlighten` - a status bar with progress bars that also allows for logging.
|
||||
* `tqdm` - the most popular Python progress bar lib, proven to work in Notebooks.
|
||||
* `alive_progress` - with the most fancy animations.
|
||||
* `log` — dumps progress information to a log, console, or text stream; most useful in production, and can optionally include memory and CPU usage stats.
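
Switching to another monitor is just a matter of passing its name when creating the pipeline (a small sketch; the chosen library, e.g. `tqdm`, must be installed separately):

```python
pipeline = dlt.pipeline(
    pipeline_name="my_pipeline",
    destination="duckdb",
    progress="tqdm",  # or "enlighten", "alive_progress", "log"
)
```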
|
||||
|
||||
For more information read the [official documentation](https://dlthub.com/docs/general-usage/pipeline#monitor-the-loading-progress).
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **Extract**
|
||||
|
||||
Extract can be run individually with the `extract` method on the pipeline:
|
||||
|
||||
```python
|
||||
pipeline.extract(data)
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### **What happens at the extraction stage?**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
When the `pipeline.run()` method is executed, it first performs the `extract` stage, during which the following occurs:
|
||||
|
||||
1. Data is fetched and stored in an in-memory buffer.
|
||||
2. When the buffer reaches its capacity, the data inside it is written to an intermediary file, and the buffer is cleared for the next set of data items.
|
||||
3. If a size is specified for intermediary files and the intermediary file in question reaches this size, a new intermediary file is opened for further data.
|
||||
|
||||
```
|
||||
API Data
|
||||
| (extract)
|
||||
Buffer
|
||||
(resources) / | ... | \
|
||||
extracted data in local storage
|
||||
|
||||
```
|
||||
|
||||
The **number** of intermediate **files** depends on the number of **resources** and whether **file rotation** is enabled.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### **Default behaviour at the extraction stage**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
- The in-memory buffer is set to `5000` items.
|
||||
- By default, **intermediary files are not rotated**. If you do not explicitly set a size for an intermediary file with `file_max_items=100000`, `dlt` will create a **single file** for a resource, regardless of the number of records it contains, even if it reaches millions.
|
||||
- By default, intermediary files at the extract stage use a custom version of the JSONL format.
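
Both the buffer size and the file rotation threshold mentioned above are configurable. A minimal sketch, assuming the global `[data_writer]` section from the `dlt` performance configuration (treat the exact keys as an assumption and check the performance docs for your version):

```py
[data_writer]
# in-memory buffer size before a flush to an intermediary file
buffer_max_items=5000
# rotate intermediary files once they contain this many items
file_max_items=100000
```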
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **Normalize**
|
||||
|
||||
Normalize can be run individually with the `normalize` command on the pipeline. Normalize is dependent on having a completed extract phase and will not do anything if there is no extracted data.
|
||||
|
||||
```py
|
||||
pipeline.normalize()
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### **What happens at the normalization stage?**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
In the `normalize` stage, `dlt` first transforms the structure of the input data. This transformed data is then converted into a relational structure that can be easily loaded into the destination. In detail, here's what happens during this stage:
|
||||
|
||||
1. Intermediary files are sent from the `extract` stage to the `normalize` stage.
|
||||
2. During the normalization step, `dlt` processes one intermediate file at a time within its own in-memory buffer.
|
||||
3. When the buffer reaches its capacity, the normalized data inside it is written to an intermediary file, and the buffer is cleared for the next set of data items.
|
||||
4. If a size is specified for intermediary files in the normalize stage and the intermediary file in question reaches this size, a new intermediary file is opened.
|
||||
|
||||
```
|
||||
(extract)
|
||||
API Data --> extracted files in local storage
|
||||
/ | \ (normalize)
|
||||
one file ... one file
|
||||
/ | \ / | \
|
||||
normalized files normalized files
|
||||
|
||||
```
|
||||
|
||||
|
||||
The **number** of intermediate **files** depends on the number of **resources** and whether **file rotation** is enabled.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### **Default behaviour at the normalization stage**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
- The in-memory buffer is set to `5000`, just like at the extraction stage.
|
||||
- By default, **intermediary files are not rotated** either. If you do not explicitly set a size for an intermediary file with `file_max_items=100000`, `dlt` will create a **single file** for a resource, regardless of the number of records it contains, even if it reaches millions.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## **Load**
|
||||
|
||||
Load can be run individually with the `load` command on the pipeline. Load is dependent on having a completed normalize phase and will not do anything if there is no normalized data.
|
||||
|
||||
```py
|
||||
pipeline.load()
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### **What happens at the loading stage?**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The `load` stage is responsible for taking the normalized data and loading it into your chosen destination:
|
||||
|
||||
1. All intermediary files from a single source are combined into a single load package.
|
||||
2. All load packages are then loaded into the destination.
|
||||
|
||||
|
||||
```
|
||||
(extract) (normalize)
|
||||
API Data --> extracted files --> normalized files
|
||||
/ | ... | \ (load)
|
||||
one normalized file ... one file
|
||||
\ | ... | /
|
||||
destination
|
||||
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### **Default behaviour at the loading stage**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""- Loading happens in `20` threads, each loading a single file.""")
|
||||
return
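

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
The number of load workers is also configurable. A minimal sketch, assuming the `[load]` section from the `dlt` performance configuration (verify the exact key for your `dlt` version):

```py
[load]
workers=20
```
""")
    return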
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""## **Intermediary file formats**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Intermediary files at the extract stage use a custom version of the JSONL format, while the loader files - files created at the normalize stage - can take 4 different formats."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### **JSONL**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**Definition**: JSON Delimited is a file format that stores several JSON documents in one file. The JSON documents are separated by a new line.
|
||||
|
||||
**Compression:** enabled by default.
|
||||
|
||||
**Data type handling:**
|
||||
|
||||
- `datetime` and `date` are stored as ISO strings;
|
||||
- `decimal` is stored as a text representation of a decimal number;
|
||||
- `binary` is stored as a base64 encoded string;
|
||||
- `HexBytes` is stored as a hex encoded string;
|
||||
- `complex` is serialized as a string.
|
||||
|
||||
**By default used by:**
|
||||
|
||||
- Bigquery
|
||||
- Snowflake
|
||||
- Filesystem
|
||||
|
||||
**Configuration**:
|
||||
|
||||
- Directly in the `pipeline.run()`:
|
||||
|
||||
```py
|
||||
info = pipeline.run(some_source(), loader_file_format="jsonl")
|
||||
```
|
||||
|
||||
- In `config.toml` or `secrets.toml`:
|
||||
|
||||
```py
|
||||
[normalize]
|
||||
loader_file_format="jsonl"
|
||||
```
|
||||
|
||||
- Via environment variables:
|
||||
|
||||
```py
|
||||
export NORMALIZE__LOADER_FILE_FORMAT="jsonl"
|
||||
```
|
||||
|
||||
- Specify directly in the resource decorator:
|
||||
|
||||
```py
|
||||
@dlt.resource(file_format="jsonl")
|
||||
def generate_rows():
|
||||
...
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### **Parquet**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**Definition**: Apache Parquet is a free and open-source column-oriented data storage format in the Apache Hadoop ecosystem.
|
||||
|
||||
**Prerequisite:** To use this format, you need the `pyarrow` package. You can install it as a `dlt` extra as well:
|
||||
|
||||
```py
|
||||
pip install "dlt[parquet]"
|
||||
|
||||
```
|
||||
|
||||
**Default version**: 2.4, which coerces timestamps to microseconds and silently truncates nanoseconds for better compatibility with databases and pandas.
|
||||
|
||||
**Supported by:**
|
||||
|
||||
- Bigquery
|
||||
- DuckDB
|
||||
- Snowflake
|
||||
- Filesystem
|
||||
- Athena
|
||||
- Databricks
|
||||
- Synapse
|
||||
|
||||
**Configuration**:
|
||||
|
||||
- Directly in the `pipeline.run()`:
|
||||
|
||||
```py
|
||||
info = pipeline.run(some_source(), loader_file_format="parquet")
|
||||
```
|
||||
|
||||
- In `config.toml` or `secrets.toml`:
|
||||
|
||||
```py
|
||||
[normalize]
|
||||
loader_file_format="parquet"
|
||||
```
|
||||
|
||||
- Via environment variables:
|
||||
|
||||
```py
|
||||
export NORMALIZE__LOADER_FILE_FORMAT="parquet"
|
||||
```
|
||||
|
||||
- Specify directly in the resource decorator:
|
||||
|
||||
```py
|
||||
@dlt.resource(file_format="parquet")
|
||||
def generate_rows():
|
||||
...
|
||||
```
|
||||
|
||||
|
||||
**Destination AutoConfig**:
|
||||
|
||||
`dlt` automatically configures the Parquet writer based on the destination's capabilities:
|
||||
|
||||
- Selects the appropriate decimal type and sets the correct precision and scale for accurate numeric data storage, including handling very small units like Wei.
|
||||
|
||||
- Adjusts the timestamp resolution (seconds, microseconds, or nanoseconds) to match what the destination supports.
|
||||
|
||||
|
||||
**Writer settings:**
|
||||
|
||||
`dlt` uses the pyarrow Parquet writer for file creation. You can adjust the writer's behavior with the following options:
|
||||
|
||||
- `flavor` adjusts schema and compatibility settings for different target systems. Defaults to None (pyarrow default).
|
||||
- `version` selects Parquet logical types based on the Parquet format version. Defaults to "2.6".
|
||||
- `data_page_size` sets the target size for data pages within a column chunk (in bytes). Defaults to None.
|
||||
- `timestamp_timezone` specifies the timezone; defaults to UTC.
|
||||
- `coerce_timestamps` sets the timestamp resolution (s, ms, us, ns).
|
||||
- `allow_truncated_timestamps` raises an error if precision is lost on truncated timestamps.
|
||||
|
||||
**Example configurations:**
|
||||
|
||||
- In `configs.toml` or `secrets.toml`:
|
||||
```py
|
||||
[normalize.data_writer]
|
||||
# the default values
|
||||
flavor="spark"
|
||||
version="2.4"
|
||||
data_page_size=1048576
|
||||
timestamp_timezone="Europe/Berlin"
|
||||
```
|
||||
|
||||
- Via environment variables:
|
||||
```py
|
||||
export NORMALIZE__DATA_WRITER__FLAVOR="spark"
|
||||
```
|
||||
|
||||
|
||||
**Timestamps and timezones**
|
||||
|
||||
`dlt` adds UTC adjustments to all timestamps, creating timezone-aware timestamp columns in destinations (except DuckDB).
|
||||
|
||||
**Disable timezone/UTC adjustments:**
|
||||
|
||||
- Set `flavor` to spark to use the deprecated `int96` timestamp type without logical adjustments.
|
||||
|
||||
- Set `timestamp_timezone` to an empty string (`DATA_WRITER__TIMESTAMP_TIMEZONE=""`) to generate logical timestamps without UTC adjustment.
|
||||
|
||||
By default, pyarrow converts timezone-aware DateTime objects to UTC and stores them in Parquet without timezone information.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### **CSV**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**Supported by:**
|
||||
|
||||
- PostgreSQL
|
||||
- Filesystem
|
||||
- Snowflake
|
||||
|
||||
**Configuration**:
|
||||
|
||||
- Directly in the `pipeline.run()`:
|
||||
|
||||
```py
|
||||
info = pipeline.run(some_source(), loader_file_format="csv")
|
||||
```
|
||||
|
||||
- In `config.toml` or `secrets.toml`:
|
||||
|
||||
```py
|
||||
[normalize]
|
||||
loader_file_format="csv"
|
||||
```
|
||||
|
||||
- Via environment variables:
|
||||
|
||||
```py
|
||||
export NORMALIZE__LOADER_FILE_FORMAT="csv"
|
||||
```
|
||||
|
||||
- Specify directly in the resource decorator:
|
||||
|
||||
```py
|
||||
@dlt.resource(file_format="csv")
|
||||
def generate_rows():
|
||||
...
|
||||
```
|
||||
|
||||
|
||||
**Two implementations**:
|
||||
|
||||
1. `pyarrow` csv writer - a very fast, multithreaded writer for arrow tables
|
||||
- binary columns are supported only if they contain valid UTF-8 characters
|
||||
- complex (nested, struct) types are not supported
|
||||
2. `python stdlib writer` - a csv writer included in the Python standard library for Python objects
|
||||
|
||||
- binary columns are supported only if they contain valid UTF-8 characters (easy to add more encodings)
|
||||
- complex columns are dumped with `json.dumps`
|
||||
- None values are always quoted
|
||||
|
||||
**Default settings:**
|
||||
|
||||
- separators are commas
|
||||
- quotes are " and are escaped as ""
|
||||
- NULL values are written both as empty strings and as empty tokens
|
||||
- UNIX new lines are used
|
||||
- dates are represented as ISO 8601
|
||||
- quoting style is "when needed"
|
||||
|
||||
**Adjustable settings:**
|
||||
|
||||
- `delimiter`: change the delimiting character (default: ',')
|
||||
- `include_header`: include the header row (default: True)
|
||||
- `quoting`: `quote_all` - all values are quoted, `quote_needed` - quote only values that need quoting (default: `quote_needed`)
|
||||
|
||||
```py
|
||||
[normalize.data_writer]
|
||||
delimiter="|"
|
||||
include_header=false
|
||||
quoting="quote_all"
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```py
|
||||
NORMALIZE__DATA_WRITER__DELIMITER=|
|
||||
NORMALIZE__DATA_WRITER__INCLUDE_HEADER=False
|
||||
NORMALIZE__DATA_WRITER__QUOTING=quote_all
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""### **SQL INSERT File Format**""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
This file format contains an INSERT...VALUES statement to be executed on the destination during the `load` stage.
|
||||
|
||||
Additional data types are stored as follows:
|
||||
|
||||
- `datetime` and `date` are stored as ISO strings;
|
||||
- `decimal` is stored as a text representation of a decimal number;
|
||||
- `binary` storage depends on the format accepted by the destination;
|
||||
- `complex` storage also depends on the format accepted by the destination.
|
||||
|
||||
This file format is compressed by default.
|
||||
|
||||
**Default for:**
|
||||
|
||||
1. DuckDB
|
||||
2. PostgreSQL
|
||||
3. Redshift
|
||||
|
||||
**Supported by:**
|
||||
|
||||
1. Filesystem
|
||||
|
||||
**Configuration**:
|
||||
|
||||
- Directly in the `pipeline.run()`:
|
||||
|
||||
```py
|
||||
info = pipeline.run(some_source(), loader_file_format="insert_values")
|
||||
```
|
||||
|
||||
- In `config.toml` or `secrets.toml`:
|
||||
|
||||
```py
|
||||
[normalize]
|
||||
loader_file_format="insert_values"
|
||||
```
|
||||
|
||||
- Via environment variables:
|
||||
|
||||
```py
|
||||
export NORMALIZE__LOADER_FILE_FORMAT="insert_values"
|
||||
```
|
||||
|
||||
- Specify directly in the resource decorator:
|
||||
|
||||
```py
|
||||
@dlt.resource(file_format="insert_values")
|
||||
def generate_rows():
|
||||
...
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,14 +6,14 @@
|
||||
"id": "h93BcC8SX2fj"
|
||||
},
|
||||
"source": [
|
||||
"# **Recap of [Lesson 6](https://colab.research.google.com/drive/1geSMNRkSwAelQJKd3e8vdoHCKiHMdmIo#forceEdit=true&sandboxMode=true) 👩💻🚀**\n",
|
||||
"# **Recap of [Lesson 6](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) 👩💻🚀**\n",
|
||||
"\n",
|
||||
"1. Learned how dlt works under the hood;\n",
|
||||
"2. Explored 3 main steps:\n",
|
||||
" * Extract;\n",
|
||||
" * Normalize;\n",
|
||||
" * Load.\n",
|
||||
"3. Learned which file formats dlt supports."
|
||||
"1. Learned how `dlt` works under the hood. \n",
|
||||
"2. Explored the 3 main steps of a pipeline run: \n",
|
||||
" - Extract \n",
|
||||
" - Normalize \n",
|
||||
" - Load \n",
|
||||
"3. Learned which file formats `dlt` supports."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -24,7 +24,7 @@
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"# **Inspecting & Adjusting Schema** 🧠🧠 [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)\n",
|
||||
"# **Inspecting & Adjusting Schema** 🧠🧠 [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"**Here, you will learn or refresh your knowledge on:**\n",
|
||||
@@ -56,7 +56,7 @@
|
||||
"id": "1vRudCVb9zII"
|
||||
},
|
||||
"source": [
|
||||
"Let's load some GitHub data to DuckDB to inspect the schema in different ways. First we need to install dlt with DuckDB:"
|
||||
"Let's load some GitHub data to DuckDB to inspect the schema in different ways."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -68,7 +68,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%capture\n",
|
||||
"!pip install -U dlt"
|
||||
"!pip install dlt"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -77,7 +77,7 @@
|
||||
"id": "DKvf4NWW-U9V"
|
||||
},
|
||||
"source": [
|
||||
"Define a dlt resource that fetches pull requests and wrap it in a dlt source, create a pipeline and run it:"
|
||||
"Define a `dlt` resource that fetches pull requests and wrap it in a `dlt` source, create a pipeline and run it:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -100,7 +100,7 @@
|
||||
"import os\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.source\n",
|
||||
@@ -259,7 +259,7 @@
|
||||
" pipeline_name=\"github_pipeline2\",\n",
|
||||
" destination=\"duckdb\",\n",
|
||||
" dataset_name=\"github_data\",\n",
|
||||
" export_schema_path=\"schemas/export\", # <--- dir path for a schema export\n",
|
||||
" export_schema_path=\"schemas/export\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -308,7 +308,9 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!ls schemas/export && cat schemas/export/github_source.schema.yaml"
|
||||
"print(os.listdir(\"schemas/export\"))\n",
|
||||
"with open(\"schemas/export/github_source.schema.yaml\") as f:\n",
|
||||
" print(f.read())"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -957,7 +959,8 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!cat schemas/export/github_source.schema.yaml"
|
||||
"with open(\"schemas/export/github_source.schema.yaml\") as f:\n",
|
||||
" print(f.read())"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -977,17 +980,8 @@
|
||||
"id": "NYbccmLie1zm"
|
||||
},
|
||||
"source": [
|
||||
"✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1jp5UtydA3x9cAq-fbW2tRmAOl4LMZqM1#forceEdit=true&sandboxMode=true)!"
|
||||
"✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "gxU44wP9GvG6"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
@@ -0,0 +1,882 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **Recap of [Lesson 6](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) 👩💻🚀**
|
||||
|
||||
1. Learned how `dlt` works under the hood.
|
||||
2. Explored the 3 main steps of a pipeline run:
|
||||
- Extract
|
||||
- Normalize
|
||||
- Load
|
||||
3. Learned which file formats `dlt` supports.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# **Inspecting & Adjusting Schema** 🧠🧠 [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)
|
||||
|
||||
|
||||
**Here, you will learn or refresh your knowledge on:**
|
||||
- Methods to inspect a schema
|
||||
- The components of a schema
|
||||
- How to modify a schema
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Methods to inspect a schema**
|
||||
|
||||
- **What's a schema?** The schema describes the structure of normalized data (e.g. tables, columns, data types, etc.). `dlt` generates schemas from the data during the normalization process.
|
||||
|
||||
- **How can you inspect a schema in `dlt`?** There are multiple ways:
|
||||
- CLI
|
||||
- Python
|
||||
- Export schema directly
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Let's load some GitHub data to DuckDB to inspect the schema in different ways."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Define a `dlt` resource that fetches pull requests and wrap it in a `dlt` source, create a pipeline and run it:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
from typing import Iterable
|
||||
import dlt
|
||||
from dlt.common.typing import TDataItems
|
||||
from dlt.extract import DltResource
|
||||
from dlt.sources.helpers import requests
|
||||
from dlt.sources.helpers.rest_client import RESTClient
|
||||
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
|
||||
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator
|
||||
|
||||
import os
|
||||
|
||||
dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY")
|
||||
|
||||
@dlt.source
|
||||
def github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]:
|
||||
client = RESTClient(
|
||||
base_url="https://api.github.com",
|
||||
auth=BearerTokenAuth(token=secret_key),
|
||||
paginator=HeaderLinkPaginator(),
|
||||
)
|
||||
|
||||
@dlt.resource
|
||||
def github_pulls(
|
||||
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
|
||||
"updated_at", initial_value="2024-12-01"
|
||||
)
|
||||
) -> TDataItems:
|
||||
params = {"since": cursor_date.last_value, "status": "open"}
|
||||
for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params):
|
||||
yield page
|
||||
|
||||
return github_pulls
|
||||
|
||||
# define new dlt pipeline
|
||||
pipeline = dlt.pipeline(
|
||||
pipeline_name="github_pipeline1",
|
||||
destination="duckdb",
|
||||
dataset_name="github_data",
|
||||
)
|
||||
|
||||
# run the pipeline with the new resource
|
||||
load_info = pipeline.run(github_source())
|
||||
print(load_info)
|
||||
return dlt, github_source, load_info, os
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(0) CLI**
|
||||
|
||||
Let's first try the CLI command `dlt pipeline -v <pipeline_name> load-package`, which is used to inspect a load package in verbose mode.
|
||||
|
||||
> In the context of the `dlt` library, a load package is a collection of jobs with data for particular tables. The `-v` flag stands for verbose, which means the command will provide more detailed output.
|
||||
|
||||
Specifically, this command will show the schema changes introduced in the load package for the given pipeline.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import subprocess
|
||||
|
||||
subprocess.run(
|
||||
["dlt", "pipeline", "-v", "github_pipeline1", "load-package"], check=True
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(1) Python**
|
||||
|
||||
Alternatively, we can inspect the schema object from load info with:
|
||||
|
||||
```python
|
||||
print(load_info.load_packages[0].schema)
|
||||
```
|
||||
|
||||
which has the following public methods and attributes:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(load_info):
|
||||
# This code snippet just prints out the public methods and attributes of the schema object in load info
|
||||
all_attributes_methods = dir(load_info.load_packages[0].schema)
|
||||
public_attributes_methods = [
|
||||
attr for attr in all_attributes_methods if not attr.startswith("_")
|
||||
]
|
||||
|
||||
print(f"{'Attribute/Method':<50} {'Type':<10}")
|
||||
print("-" * 40)
|
||||
for attr in public_attributes_methods:
|
||||
attr_value = getattr(load_info.load_packages[0].schema, attr)
|
||||
if callable(attr_value):
|
||||
print(f"{attr:<50} {'method':<10}")
|
||||
else:
|
||||
print(f"{attr:<50} {'attribute':<10}")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Let's use the `to_pretty_json` method and print the schema:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(load_info):
|
||||
print(load_info.load_packages[0].schema.to_pretty_json())
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(2) Exporting schema**
|
||||
|
||||
> Exporting the data schema directly into a file might be even more straightforward than the two previous approaches.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""The instruction to export a schema should be provided at the beginning when creating a pipeline:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt):
|
||||
pipeline_1 = dlt.pipeline(
|
||||
pipeline_name="github_pipeline2",
|
||||
destination="duckdb",
|
||||
dataset_name="github_data",
|
||||
export_schema_path="schemas/export",
|
||||
)
|
||||
return (pipeline_1,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Run the pipeline:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(github_source, pipeline_1):
|
||||
load_info_1 = pipeline_1.run(github_source())
|
||||
print(load_info_1)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Check if the schema was exported.""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(os):
|
||||
print(os.listdir("schemas/export"))
|
||||
with open("schemas/export/github_source.schema.yaml") as _f:
|
||||
print(_f.read())
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **The components of a schema**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""> Since we learned the ways we can inspect the schema, it's important to actually understand what it contains to be able to meaningfully adjust it later."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
A schema (in YAML format) looks something like this:
|
||||
|
||||
```yaml
|
||||
version: 2
|
||||
version_hash: wdIt+pExjT8Mj1ygQEMhq3E3SXtNBuIbHg0fDz9xD9I=
|
||||
engine_version: 11
|
||||
name: github_source
|
||||
tables:
|
||||
_dlt_version:
|
||||
...
|
||||
_dlt_loads:
|
||||
...
|
||||
github_pulls:
|
||||
...
|
||||
settings:
|
||||
detections:
|
||||
- iso_timestamp
|
||||
default_hints:
|
||||
not_null:
|
||||
- _dlt_id
|
||||
- _dlt_root_id
|
||||
- _dlt_parent_id
|
||||
- _dlt_list_idx
|
||||
- _dlt_load_id
|
||||
parent_key:
|
||||
- _dlt_parent_id
|
||||
root_key:
|
||||
- _dlt_root_id
|
||||
unique:
|
||||
- _dlt_id
|
||||
row_key:
|
||||
- _dlt_id
|
||||
normalizers:
|
||||
names: snake_case
|
||||
json:
|
||||
module: dlt.common.normalizers.json.relational
|
||||
previous_hashes:
|
||||
- 0WLnuf3Jh1J1XsbVrV2eB824Z6heOlf5o912i1v3tho=
|
||||
- 0d1z0RFV2O0OvfEWkebtSjxrCjjiyv1lOeNiF0V8Lws=
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(0) Schema version hash**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The schema hash, denoted by `version_hash`, is generated from the actual schema content, excluding the hash values and version of the schema.
|
||||
|
||||
Each time the schema is changed, a new hash is produced.
|
||||
|
||||
> Note that during the initial run (the first pipeline run), the version will be 2, and there will be two previous hashes because the schema is updated during both the extract and normalize stages. You can rely on the version number to determine how many times the schema has been changed, but keep in mind that it stops being reliable when parallelization is introduced.
|
||||
""")
|
||||
return
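

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
To make this concrete, here is a minimal sketch (it assumes the `pipeline` object from the run earlier in this lesson is still available) that reads these fields from the pipeline's default schema:

```python
# Sketch: inspect the schema version metadata of the pipeline defined above.
schema = pipeline.default_schema

print(schema.name)          # github_source
print(schema.version)       # bumped whenever the schema changes
print(schema.version_hash)  # hash of the current schema content

# previous_hashes is part of the stored schema document (see the YAML above)
print(schema.to_dict().get("previous_hashes"))
```
    """)
    return
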
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Each version hash is then stored in the `_dlt_version` table.""")
|
||||
return
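

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
As a quick check, you can query that table with the pipeline's SQL client. This is a sketch: it assumes the DuckDB pipeline from the beginning of this lesson has already loaded data and that the standard `_dlt_version` columns are present.

```python
# Sketch: list the schema versions dlt has recorded for this pipeline's dataset.
with pipeline.sql_client() as client:
    rows = client.execute_sql(
        "SELECT version, version_hash, inserted_at FROM _dlt_version ORDER BY inserted_at"
    )
    for row in rows:
        print(row)
```
    """)
    return
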
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
On subsequent runs, `dlt` checks if the generated schema hash is stored in this table. If it is not, `dlt` concludes that the schema has changed and migrates the destination accordingly.
|
||||
|
||||
- If multiple pipelines are sending data to the same dataset and there is a clash in table names, a single table with the union of the columns will be created.
|
||||
- If columns clash and have different types or other incompatible characteristics, the load may fail if the data cannot be coerced.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(1) Naming convention**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Each schema contains a naming convention that is denoted in the following way when the schema is exported:
|
||||
|
||||
```yaml
|
||||
...
|
||||
normalizers:
|
||||
names: snake_case # naming convention
|
||||
...
|
||||
```
|
||||
The naming convention is particularly useful if the identifiers of the data to be loaded (e.g., keys in JSON files) need to match the namespace of the destination (such as Redshift, which accepts case-insensitive alphanumeric identifiers with a maximum of 127 characters). This convention is used by `dlt` to translate between these identifiers and namespaces.
|
||||
|
||||
The standard behavior of `dlt` is to use the same naming convention for all destinations, ensuring that users always see the same tables and columns in their databases.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The default naming convention is `snake_case`:
|
||||
|
||||
- Removes all ASCII characters except alphanumerics and underscores.
|
||||
- Adds an underscore (`_`) if the name starts with a number.
|
||||
- Multiple underscores (`_`) are reduced to a single underscore.
|
||||
- The parent-child relationship is expressed as a double underscore (`__`) in names.
|
||||
- The identifier is shortened if it exceeds the length allowed at the destination.
|
||||
|
||||
> If you provide any schema elements that contain identifiers via decorators or arguments (e.g., `table_name` or `columns`), all the names used will be converted according to the naming convention when added to the schema. For example, if you execute `dlt.run(..., table_name="CamelCaseTableName")`, the data will be loaded into `camel_case_table_name`.
|
||||
""")
|
||||
return
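

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
As a small illustration, here is a sketch with made-up data (the pipeline, dataset, and table names below exist only for this example) showing `snake_case` normalization in action:

```python
# Sketch: dlt normalizes table and column identifiers to snake_case by default.
naming_pipeline = dlt.pipeline(
    pipeline_name="naming_demo",
    destination="duckdb",
    dataset_name="naming_demo_data",
)
naming_pipeline.run(
    [{"UserName": "alice", "SignUpDate": "2024-12-01"}],
    table_name="CamelCaseTableName",
)

# The table is created as `camel_case_table_name`,
# with columns `user_name` and `sign_up_date`.
print(naming_pipeline.default_schema.data_table_names())
```
    """)
    return
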
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
To retain the original naming convention, you can define the following in your `config.toml`:
|
||||
|
||||
```toml
|
||||
[schema]
|
||||
naming="direct"
|
||||
```
|
||||
|
||||
or use an environment variable as:
|
||||
|
||||
```
|
||||
SCHEMA__NAMING=direct
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(2) Schema settings**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The `settings` section of the schema file allows you to define various global rules that impact how tables and columns are inferred from data.
|
||||
|
||||
```yaml
|
||||
settings:
|
||||
detections:
|
||||
...
|
||||
default_hints:
|
||||
...
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**1. Detections**
|
||||
|
||||
You can define a set of functions that will be used to infer the data type of the column from a value. These functions are executed sequentially from top to bottom on the list.
|
||||
|
||||
```yaml
|
||||
settings:
|
||||
detections:
|
||||
- timestamp # detects int and float values that can be interpreted as timestamps within a 5-year range and converts them
|
||||
- iso_timestamp # detects ISO 8601 strings and converts them to timestamp
|
||||
  - iso_date # detects strings representing an ISO-like date (excluding timestamps) and, if so, converts to date
|
||||
- large_integer # detects integers too large for 64-bit and classifies as "wei" or converts to text if extremely large
|
||||
- hexbytes_to_text # detects HexBytes objects and converts them to text
|
||||
- wei_to_double # detects Wei values and converts them to double for aggregate non-financial reporting
|
||||
```
|
||||
|
||||
> `iso_timestamp` detector is enabled by default.
|
||||
|
||||
Detectors can be removed or added directly in code:
|
||||
|
||||
```python
|
||||
source = source()
|
||||
source.schema.remove_type_detection("iso_timestamp")
|
||||
source.schema.add_type_detection("timestamp")
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**2. Column hint rules**
|
||||
|
||||
The `default_hints` section in the schema file is used to define global rules that apply to newly inferred columns.
|
||||
|
||||
> These rules are applied **after normalization**, meaning after the naming convention is applied!
|
||||
|
||||
|
||||
By default, the schema adopts column hint rules from the json (relational) normalizer so that the columns added by the normalizer are hinted correctly:
|
||||
|
||||
```yaml
|
||||
settings:
|
||||
default_hints:
|
||||
foreign_key:
|
||||
- _dlt_parent_id
|
||||
not_null:
|
||||
- _dlt_id
|
||||
- _dlt_root_id
|
||||
- _dlt_parent_id
|
||||
- _dlt_list_idx
|
||||
- _dlt_load_id
|
||||
unique:
|
||||
- _dlt_id
|
||||
root_key:
|
||||
- _dlt_root_id
|
||||
```
|
||||
|
||||
|
||||
You can define column names with regular expressions as well.
|
||||
|
||||
```yaml
|
||||
settings:
|
||||
default_hints:
|
||||
partition:
|
||||
- re:_timestamp$ # add partition hint to all columns ending with _timestamp
|
||||
```
|
||||
|
||||
Column hints can be added directly in code:
|
||||
|
||||
```python
|
||||
source = data_source()
|
||||
# this will update existing hints with the hints passed
|
||||
source.schema.merge_hints({"partition": ["re:_timestamp$"]})
|
||||
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**3. Preferred data types**
|
||||
|
||||
In the `preferred_types` section, you can define rules that will set the data type for newly created columns. On the left side, you specify a rule for a column name, and on the right side, you define the corresponding data type. You can use column names directly or with regular expressions to match them.
|
||||
|
||||
```yaml
|
||||
settings:
|
||||
preferred_types:
|
||||
re:timestamp: timestamp
|
||||
inserted_at: timestamp
|
||||
created_at: timestamp
|
||||
updated_at: timestamp
|
||||
```
|
||||
Above, we prefer the `timestamp` data type for all columns containing the `timestamp` substring and define exact matches for certain columns.
|
||||
|
||||
Preferred data types can be added directly in code as well:
|
||||
|
||||
```python
|
||||
source = data_source()
|
||||
source.schema.update_preferred_types(
|
||||
{
|
||||
"re:timestamp": "timestamp",
|
||||
"inserted_at": "timestamp",
|
||||
"created_at": "timestamp",
|
||||
"updated_at": "timestamp",
|
||||
}
|
||||
)
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **How to modify a schema**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Speaking of data types... you can directly apply data types and hints to your resources, bypassing the need for importing and adjusting schemas. This approach is ideal for rapid prototyping and handling data sources with dynamic schema requirements.
|
||||
|
||||
The two main approaches are:
|
||||
|
||||
- Using the `columns` argument in the `dlt.resource` decorator.
|
||||
- Using the `apply_hints` method.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(0) `@dlt.resource(columns=...)`**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
This code snippet sets up a nullable boolean column named `my_column` directly in the decorator.
|
||||
|
||||
```python
|
||||
@dlt.resource(name='my_table', columns={"my_column": {"data_type": "bool", "nullable": True}})
|
||||
def my_resource():
|
||||
for i in range(10):
|
||||
yield {'my_column': i % 2 == 0}
|
||||
```
|
||||
""")
|
||||
return
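

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
A possible follow-up (a sketch; it reuses the `my_resource` definition above together with a throwaway DuckDB pipeline) is to run the resource and look the column up in the resulting schema:

```python
# Sketch: run the hinted resource and check the column definition in the schema.
hints_pipeline = dlt.pipeline(
    pipeline_name="columns_hint_demo",
    destination="duckdb",
    dataset_name="columns_hint_data",
)
hints_pipeline.run(my_resource())

# The column should come out as a nullable boolean.
print(hints_pipeline.default_schema.get_table("my_table")["columns"]["my_column"])
```
    """)
    return
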
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(1) `apply_hints`**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The `apply_hints` method is used to programmatically **set** or **adjust** various aspects of your data resources or pipeline. It can be used in several ways:
|
||||
|
||||
* You can use `apply_hints` to **directly define data types** and their properties, such as nullability, on a resource. This eliminates the dependency on external schema files.
|
||||
|
||||
* When **dealing with dynamically generated resources** or needing to set hints programmatically, `apply_hints` is your go-to tool.
|
||||
|
||||
* `apply_hints` can be used to **load your data incrementally**. For example, you can load only files that have been updated since the last time dlt processed them, or load only the new or updated records by looking at a specific column.
|
||||
|
||||
* You can **set or update the table name, columns, and other schema elements** while your resource is executing and already yielding data. Such changes are merged with the existing schema in the same way `apply_hints` works.
|
||||
|
||||
|
||||
It’s especially useful for applying hints across multiple collections or tables at once.
|
||||
|
||||
For example, to apply a complex data type across all collections from a MongoDB source:
|
||||
|
||||
```python
|
||||
all_collections = ["collection1", "collection2", "collection3"] # replace with your actual collection names
|
||||
source_data = mongodb().with_resources(*all_collections)
|
||||
|
||||
for col in all_collections:
|
||||
source_data.resources[col].apply_hints(columns={"column_name": {"data_type": "complex"}})
|
||||
|
||||
pipeline = dlt.pipeline(
|
||||
pipeline_name="mongodb_pipeline",
|
||||
destination="duckdb",
|
||||
dataset_name="mongodb_data"
|
||||
)
|
||||
load_info = pipeline.run(source_data)
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(2) Adjusting schema settings**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""> Maybe you've noticed, but there several ways to adjust your schema settings directly in code were already covered. This is just a recap. You can go back directly to the Schema Settings section."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Detectors can be removed or added directly in code:
|
||||
|
||||
```python
|
||||
source = source()
|
||||
source.schema.remove_type_detection("iso_timestamp")
|
||||
source.schema.add_type_detection("timestamp")
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Column hints can be added directly in code:
|
||||
|
||||
```python
|
||||
source = data_source()
|
||||
# this will update existing hints with the hints passed
|
||||
source.schema.merge_hints({"partition": ["re:_timestamp$"]})
|
||||
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Preferred data types can be added directly in code as well:
|
||||
|
||||
```python
|
||||
source = data_source()
|
||||
source.schema.update_preferred_types(
|
||||
{
|
||||
"re:timestamp": "timestamp",
|
||||
"inserted_at": "timestamp",
|
||||
"created_at": "timestamp",
|
||||
"updated_at": "timestamp",
|
||||
}
|
||||
)
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(3) Importing a schema**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""> We mentioned that you can export a schema. In a similar fashion you can import a schema. The usual approach to use this functionaility is to export the schema first, make the adjustments and put the adjusted schema into the corresponding import folder."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""The instruction to import a schema should be provided at the beginning when creating a pipeline:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(dlt):
|
||||
pipeline_2 = dlt.pipeline(
|
||||
pipeline_name="github_pipeline3",
|
||||
destination="duckdb",
|
||||
dataset_name="github_data",
|
||||
export_schema_path="schemas/export",
|
||||
import_schema_path="schemas/import",
|
||||
)
|
||||
return (pipeline_2,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Let's make an initial pipeline run to export schema into the file."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(github_source, pipeline_2):
|
||||
# run the pipeline with the new resource
|
||||
load_info_2 = pipeline_2.run(github_source())
|
||||
print(load_info_2)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Look at the "Files" in the left sidebar, see the `schema` folder, and `export` and `import` folders inside."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Now, both folders contain identical schema files.
|
||||
|
||||
### **Exercise 1: Adjust import schema**
|
||||
|
||||
**Adjust the import schema** by adding a description to the **`github_pulls`** table.
|
||||
|
||||
|
||||
```yaml
|
||||
github_pulls:
|
||||
columns:
|
||||
updated_at:
|
||||
incremental: true
|
||||
write_disposition: append
|
||||
resource: github_pulls
|
||||
description: Table contains all pull requests information from dlt repository
|
||||
```
|
||||
""")
|
||||
return
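

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
If you prefer to apply the change programmatically instead of editing the YAML by hand, a sketch along these lines works (it assumes PyYAML is available, which `dlt` already depends on, and that the import schema file was created by the run above):

```python
# Sketch: add a table description to the import schema file.
import yaml

path = "schemas/import/github_source.schema.yaml"

with open(path) as f:
    schema_doc = yaml.safe_load(f)

schema_doc["tables"]["github_pulls"]["description"] = (
    "Table contains all pull requests information from dlt repository"
)

with open(path, "w") as f:
    yaml.safe_dump(schema_doc, f)
```
    """)
    return
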
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Run the pipeline:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(github_source, pipeline_2):
|
||||
load_info_3 = pipeline_2.run(github_source())
|
||||
print(load_info_3)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Check the exported schema file. It should now contain a description for the `github_pulls` table."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
with open("schemas/export/github_source.schema.yaml") as _f:
|
||||
print(_f.read())
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Question
|
||||
|
||||
What **data type** does the column `version` in the `_dlt_version` table have?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)!"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -6,12 +6,12 @@
|
||||
"id": "h93BcC8SX2fj"
|
||||
},
|
||||
"source": [
|
||||
"# **Recap of [Lesson 7](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r#forceEdit=true&sandboxMode=true) 👩💻🚀**\n",
|
||||
"# **Recap of [Lesson 7](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) 👩💻🚀**\n",
|
||||
"\n",
|
||||
"1. Learned what is a schema.\n",
|
||||
"1. Learned what a schema is.\n",
|
||||
"2. Explored schema settings and components.\n",
|
||||
"3. Learned how to retrieve dlt pipeline schema.\n",
|
||||
"4. Learned how to adjust schema."
|
||||
"3. Learned how to retrieve a dlt pipeline schema.\n",
|
||||
"4. Learned how to adjust the schema."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -22,13 +22,13 @@
|
||||
"source": [
|
||||
"---\n",
|
||||
"\n",
|
||||
"# **Understanding Pipeline Metadata and State** 👻📄 [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)\n",
|
||||
"# **Understanding Pipeline Metadata and State** 👻📄 [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"**Here, you will learn or brush up on:**\n",
|
||||
"- What's pipeline metadata\n",
|
||||
"- What pipeline metadata is\n",
|
||||
"- Exploring pipeline metadata from load info\n",
|
||||
"- Exploring pipeline metadate from trace\n",
|
||||
"- Exploring pipeline metadata from trace\n",
|
||||
"- Exploring pipeline metadata from state"
|
||||
]
|
||||
},
|
||||
@@ -48,16 +48,16 @@
|
||||
"id": "nFZNlDb1Y7ZH"
|
||||
},
|
||||
"source": [
|
||||
"Metadata is basically data about data.\n",
|
||||
"**Metadata** is essentially *data about data*.\n",
|
||||
"\n",
|
||||
"Pipeline Metadata is data about your data pipeline. This can be useful if you want to know things like:\n",
|
||||
"**Pipeline metadata** is data about your data pipeline. This is useful when you want to know things like:\n",
|
||||
"\n",
|
||||
"- When your pipeline first ran\n",
|
||||
"- When your pipeline last ran\n",
|
||||
"- Information about your source or destination\n",
|
||||
"- Processing time\n",
|
||||
"- Or information that you yourself may want to add to the metadata\n",
|
||||
"- And much more!\n"
|
||||
"- Custom metadata you add yourself\n",
|
||||
"- And much more!"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -73,9 +73,9 @@
|
||||
"id": "wY2ySVotY-JU"
|
||||
},
|
||||
"source": [
|
||||
" `dlt` allows you to be able to view all this metadata through various options!\n",
|
||||
"`dlt` allows you to view all this metadata through various options!\n",
|
||||
"\n",
|
||||
"This notebook will walk you through those options. Namely:\n",
|
||||
"This notebook will walk you through those options, namely:\n",
|
||||
"\n",
|
||||
"- Load info\n",
|
||||
"- Trace\n",
|
||||
@@ -88,7 +88,7 @@
|
||||
"id": "JTR2acUYZbku"
|
||||
},
|
||||
"source": [
|
||||
"Let's load some GitHub data to DuckDB to inspect the pipeline metadata in different ways. First we need to install dlt with DuckDB:"
|
||||
"Let's load some GitHub data into DuckDB to inspect the pipeline metadata in different ways."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -109,7 +109,7 @@
|
||||
"id": "AhU2JVjTZn_j"
|
||||
},
|
||||
"source": [
|
||||
"Define a dlt resource that fetches Pull Requests and wrap it in a dlt source:"
|
||||
"Define a `dlt` resource that fetches Pull Requests and wrap it in a `dlt` source:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -129,10 +129,9 @@
|
||||
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
|
||||
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.source\n",
|
||||
@@ -206,13 +205,13 @@
|
||||
"id": "NA2dPY3_a2Ue"
|
||||
},
|
||||
"source": [
|
||||
"From the [`Inspecting & Adjusting Schema`](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r) Colab we've already learned that we can see which schema changes a load package has introduced with the command:\n",
|
||||
"From the [`Inspecting & Adjusting Schema`](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) section, we've already learned that we can see which schema changes a load package introduced with the command:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"dlt pipeline -v <pipeline_name> load-package\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"The verbose flag only accounts for the schema changes, so if we run it without the flag, we will still see the most recent load package info:"
|
||||
"The verbose flag only shows schema changes, so if we run it **without** the flag, we will still see the most recent load package info:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -232,9 +231,9 @@
|
||||
"id": "w9ztJjzWcB3q"
|
||||
},
|
||||
"source": [
|
||||
"The `load_id` of a particular package is added to the top data tables (parent tables) and to the special `_dlt_loads` table with a status of 0 when the load process is fully completed. The `_dlt_loads` table tracks complete loads and allows chaining transformations on top of them.\n",
|
||||
"The `load_id` of a particular package is added to the top data tables (parent tables) and to the special `_dlt_loads` table with a status of `0` when the load process is fully completed. The `_dlt_loads` table tracks completed loads and allows chaining transformations on top of them.\n",
|
||||
"\n",
|
||||
"We can also see load package info with a specific load id:"
|
||||
"We can also view load package info for a specific `load_id` (replace the value with the one output above):\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -264,12 +263,12 @@
|
||||
"id": "Lg1lg6FVdKLl"
|
||||
},
|
||||
"source": [
|
||||
"From the [`Inspecting & Adjusting Schema`](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r?usp=sharing) Colab we've also learned that a schema can be accessed with:\n",
|
||||
"From the [`Inspecting & Adjusting Schema`](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) section, we've also learned that a schema can be accessed with:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"print(load_info.load_packages[0].schema)\n",
|
||||
"```\n",
|
||||
"Similarly if we drop the schema part, we will just get the load package info:"
|
||||
"Similarly, if we drop the schema part, we will get the load package info:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -351,7 +350,7 @@
|
||||
"id": "P3_rFHz6elTy"
|
||||
},
|
||||
"source": [
|
||||
"You can access pipeline trace using the command:\n",
|
||||
"You can access the pipeline trace using the command:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
@@ -365,7 +364,7 @@
|
||||
"id": "E2B3-30Yezbi"
|
||||
},
|
||||
"source": [
|
||||
"Try on the github issues pipeline:"
|
||||
"Try running it on the github issues pipeline:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -458,7 +457,7 @@
|
||||
"id": "XMsVhKYHff20"
|
||||
},
|
||||
"source": [
|
||||
"In particular how many rows of data were normalized:"
|
||||
"How many rows of data were normalized:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -513,17 +512,19 @@
|
||||
},
|
||||
"source": [
|
||||
"**When to use pipeline state**\n",
|
||||
"- dlt uses the state internally to implement last value incremental loading. This use case should cover around 90% of your needs to use the pipeline state.\n",
|
||||
"- `dlt` uses the state internally to implement last value incremental loading. This use case should cover around 90% of your needs to use the pipeline state.\n",
|
||||
"- Store a list of already requested entities if the list is not much bigger than 100k elements.\n",
|
||||
"- Store large dictionaries of last values if you are not able to implement it with the standard incremental construct.\n",
|
||||
"- Store the custom fields dictionaries, dynamic configurations and other source-scoped state.\n",
|
||||
"\n",
|
||||
"**When not to use pipeline state**\n",
|
||||
"\n",
|
||||
"Do not use dlt state when it may grow to millions of elements. Do you plan to store modification timestamps of all of your millions of user records? This is probably a bad idea! In that case you could:\n",
|
||||
"Do not use `dlt` state when it may grow to millions of elements. \n",
|
||||
"For example, storing modification timestamps for millions of user records is a bad idea. \n",
|
||||
"In that case, you could:\n",
|
||||
"\n",
|
||||
"- Store the state in dynamo-db, redis etc. taking into the account that if the extract stage fails you'll end with invalid state.\n",
|
||||
"- Use your loaded data as the state. dlt exposes the current pipeline via dlt.current.pipeline() from which you can obtain sqlclient and load the data of interest. In that case try at least to process your user records in batches."
|
||||
"- Store the state in DynamoDB, Redis, etc., keeping in mind that if the extract stage fails, you may end up with invalid state.\n",
|
||||
"- Use your loaded data as the state. `dlt` exposes the current pipeline via `dlt.current.pipeline()`, from which you can obtain a `sql_client` and load the data you need. If you choose this approach, try to process your user records in batches."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -634,10 +635,9 @@
|
||||
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
|
||||
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.source\n",
|
||||
@@ -696,7 +696,7 @@
|
||||
"id": "UEBszW96bX1F"
|
||||
},
|
||||
"source": [
|
||||
"In the state you will see the new items:"
|
||||
"In the state, you will see the new items:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -748,10 +748,9 @@
|
||||
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
|
||||
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.source\n",
|
||||
@@ -826,11 +825,10 @@
|
||||
"id": "im-o7K5IkoW5"
|
||||
},
|
||||
"source": [
|
||||
"You can also access the source-scoped state with `dlt.current.source_state()` which can be shared across resources of a particular source and is also available **read-only** in the source-decorated functions. The most common use case for the source-scoped state is to store mapping of custom fields to their displayable names.\n",
|
||||
"You can also access the source-scoped state with `dlt.current.source_state()` which can be shared across resources of a particular source and is also available **read-only** in the source-decorated functions. The most common use case for the source-scoped state is to store the mapping of custom fields to their displayable names.\n",
|
||||
"\n",
|
||||
"Let's read some custom keys from the state:\n",
|
||||
"Let's read some custom keys from the state with:\n",
|
||||
"```python\n",
|
||||
"# Let's read some custom state information\n",
|
||||
"source_new_keys = dlt.current.source_state().get(\"resources\", {}).get(\"github_pulls\", {}).get(\"new_key\")\n",
|
||||
"```\n",
|
||||
"Full example:"
|
||||
@@ -850,10 +848,9 @@
|
||||
"from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n",
|
||||
"from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from google.colab import userdata\n",
|
||||
"\n",
|
||||
"os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dlt.source\n",
|
||||
@@ -915,17 +912,24 @@
|
||||
"id": "WIhvQCY_lEaB"
|
||||
},
|
||||
"source": [
|
||||
"What if you run your pipeline on, for example, Airflow where every task gets a clean filesystem and pipeline working directory is always deleted?\n",
|
||||
"What if you run your pipeline on, for example, Airflow, where every task gets a clean filesystem and the pipeline working directory is always deleted?\n",
|
||||
"\n",
|
||||
"**dlt loads** your **state** into the destination **together** with all other **data** and when faced with a clean start, it will try to restore state from the destination.\n",
|
||||
"**dlt loads** your **state** into the destination **together** with all other **data**, and when starting from a clean slate, it will try to restore the state from the destination.\n",
|
||||
"\n",
|
||||
"The remote state is identified by pipeline name, the destination location (as given by the credentials) and destination dataset. To re-use **the same state**, use **the same pipeline name** and destination.\n",
|
||||
"The remote state is identified by the pipeline name, the destination location (as defined by the credentials), and the destination dataset. \n",
|
||||
"To reuse **the same state**, use **the same pipeline name** and the same destination.\n",
|
||||
"\n",
|
||||
"The state is stored in the `_dlt_pipeline_state` table at the destination and contains information about the pipeline, pipeline run (that the state belongs to) and state blob.\n",
|
||||
"The state is stored in the `_dlt_pipeline_state` table at the destination and contains information about the pipeline, the pipeline run (to which the state belongs), and the state blob.\n",
|
||||
"\n",
|
||||
"dlt has `dlt pipeline <pipeline name> sync` command where you can request the state back from that table.\n",
|
||||
"`dlt` provides the command:\n",
|
||||
"\n",
|
||||
"💡 If you can keep the pipeline working directory across the runs, you can disable the state sync by setting `restore_from_destination=false` i.e. in your `config.toml`."
|
||||
"```\n",
|
||||
"dlt pipeline <pipeline name> sync\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"which retrieves the state from that table.\n",
|
||||
"\n",
|
||||
"💡 If you can keep the pipeline working directory across runs, you can disable state sync by setting `restore_from_destination = false` in your `config.toml`."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -937,11 +941,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import duckdb\n",
|
||||
"from google.colab import data_table\n",
|
||||
"from IPython.display import display\n",
|
||||
"\n",
|
||||
"data_table.enable_dataframe_formatter()\n",
|
||||
"\n",
|
||||
"# a database 'chess_pipeline.duckdb' was created in working directory so just connect to it\n",
|
||||
"conn = duckdb.connect(f\"{pipeline.pipeline_name}.duckdb\")\n",
|
||||
"conn.sql(f\"SET search_path = '{pipeline.dataset_name}'\")\n",
|
||||
@@ -955,7 +956,7 @@
|
||||
"id": "YIy5yLOAlJ9M"
|
||||
},
|
||||
"source": [
|
||||
"Column \"state\" is compressed json dictionary."
|
||||
"The \"state\" column is a compressed json dictionary."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -998,14 +999,14 @@
|
||||
"source": [
|
||||
"**To fully reset the state:**\n",
|
||||
"\n",
|
||||
"Drop the destination dataset to fully reset the pipeline.\n",
|
||||
"Set the `dev_mode` flag when creating pipeline.\n",
|
||||
"Use the `dlt pipeline drop --drop-all` command to drop state and tables for a given schema name.\n",
|
||||
"- Drop the destination dataset to fully reset the pipeline. \n",
|
||||
"- Set the `dev_mode` flag when creating the pipeline. \n",
|
||||
"- Use the `dlt pipeline drop --drop-all` command to drop state and tables for a given schema name.\n",
|
||||
"\n",
|
||||
"**To partially reset the state:**\n",
|
||||
"\n",
|
||||
"Use the `dlt pipeline drop <resource_name>` command to drop state and tables for a given resource.\n",
|
||||
"Use the `dlt pipeline drop --state-paths` command to reset the state at given path without touching the tables and data."
|
||||
"- Use the `dlt pipeline drop <resource_name>` command to drop state and tables for a given resource. \n",
|
||||
"- Use the `dlt pipeline drop --state-paths` command to reset the state at a given path without touching the tables or data."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1014,9 +1015,9 @@
|
||||
"id": "fUuRzapCl8pC"
|
||||
},
|
||||
"source": [
|
||||
"**Example for partial reset:**\n",
|
||||
"**Example for a partial reset:**\n",
|
||||
"\n",
|
||||
"> in an ipynb environment, when the duckdb connection we opened is not yet closed -> close the connection before attempting to edit the pipeline through the CLI"
|
||||
"> In an ipynb environment, when the duckdb connection we opened is not yet closed -> close the connection before attempting to edit the pipeline through the CLI."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1058,7 +1059,7 @@
|
||||
"id": "NYbccmLie1zm"
|
||||
},
|
||||
"source": [
|
||||
"🎊🎊🎊 That is actually it! We hope you enjoyed this course and learned more about dlt! 🎊🎊🎊\n",
|
||||
"🎊🎊🎊 That's it! We hope you enjoyed this course and learned more about `dlt`! 🎊🎊🎊\n",
|
||||
"\n",
|
||||
"Please share your feedback with us: [Feedback Google Form](https://forms.gle/1NYrGcRj5gLQ4WDt8) 🌼"
|
||||
]
|
||||
|
||||
@@ -0,0 +1,884 @@
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "dlt[duckdb]",
|
||||
# "numpy",
|
||||
# "pandas",
|
||||
# "sqlalchemy",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.17.4"
|
||||
app = marimo.App()
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# **Recap of [Lesson 7](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) 👩💻🚀**
|
||||
|
||||
1. Learned what a schema is.
|
||||
2. Explored schema settings and components.
|
||||
3. Learned how to retrieve a dlt pipeline schema.
|
||||
4. Learned how to adjust the schema.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# **Understanding Pipeline Metadata and State** 👻📄 [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)
|
||||
|
||||
|
||||
**Here, you will learn or brush up on:**
|
||||
- What pipeline metadata is
|
||||
- Exploring pipeline metadata from load info
|
||||
- Exploring pipeline metadata from trace
|
||||
- Exploring pipeline metadata from state
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Pipeline Metadata**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**Metadata** is essentially *data about data*.
|
||||
|
||||
**Pipeline metadata** is data about your data pipeline. This is useful when you want to know things like:
|
||||
|
||||
- When your pipeline first ran
|
||||
- When your pipeline last ran
|
||||
- Information about your source or destination
|
||||
- Processing time
|
||||
- Custom metadata you add yourself
|
||||
- And much more!
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
`dlt` allows you to view all this metadata through various options!
|
||||
|
||||
This notebook will walk you through those options, namely:
|
||||
|
||||
- Load info
|
||||
- Trace
|
||||
- State
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Let's load some GitHub data into DuckDB to inspect the pipeline metadata in different ways."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""Define a `dlt` resource that fetches Pull Requests and wrap it in a `dlt` source:"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(os):
|
||||
from typing import Iterable
|
||||
import dlt
|
||||
from dlt.extract import DltResource
|
||||
from dlt.common.typing import TDataItems
|
||||
from dlt.sources.helpers import requests
|
||||
from dlt.sources.helpers.rest_client import RESTClient
|
||||
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth
|
||||
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator
|
||||
|
||||
dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY")
|
||||
|
||||
@dlt.source
|
||||
def _github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]:
|
||||
client = RESTClient(
|
||||
base_url="https://api.github.com",
|
||||
auth=BearerTokenAuth(token=secret_key),
|
||||
paginator=HeaderLinkPaginator(),
|
||||
)
|
||||
|
||||
@dlt.resource
|
||||
def github_pulls(
|
||||
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
|
||||
"updated_at", initial_value="2024-12-01"
|
||||
)
|
||||
) -> TDataItems:
|
||||
params = {"since": cursor_date.last_value, "status": "open"}
|
||||
for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params):
|
||||
yield page
|
||||
|
||||
return github_pulls
|
||||
|
||||
    # define new dlt pipeline
    pipeline = dlt.pipeline(
|
||||
pipeline_name="github_pipeline",
|
||||
destination="duckdb",
|
||||
dataset_name="github_data",
|
||||
)
|
||||
    # run the pipeline with the new resource
    load_info = pipeline.run(_github_source())
|
||||
print(load_info)
|
||||
return (
|
||||
BearerTokenAuth,
|
||||
DltResource,
|
||||
HeaderLinkPaginator,
|
||||
Iterable,
|
||||
RESTClient,
|
||||
TDataItems,
|
||||
dlt,
|
||||
load_info,
|
||||
pipeline,
|
||||
)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Load info**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
`Load Info:` This is a collection of useful information about the recently loaded data. It includes details like the pipeline and dataset name, destination information, and a list of loaded packages with their statuses, file sizes, types, and error messages (if any).
|
||||
|
||||
`Load Package:` A load package is a collection of jobs with data for specific tables, generated during each execution of the pipeline. Each package is uniquely identified by a `load_id`.
|
||||
""")
|
||||
return
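

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
Before switching to the CLI, here is a small sketch of what you can read straight off the `load_info` object returned by the run above (if your `dlt` version exposes different attributes, `load_info.asdict()` shows everything available):

```python
# Sketch: a few fields available on the load info of the run above.
print(load_info.pipeline.pipeline_name)  # which pipeline produced this load
print(load_info.dataset_name)            # where the data went
print(load_info.first_run)               # True on the very first run of the pipeline
print(load_info.loads_ids)               # load_id of every package in this run

for package in load_info.load_packages:
    print(package.load_id, package.state)
```
    """)
    return
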
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(0) CLI**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
From the [`Inspecting & Adjusting Schema`](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) section, we've already learned that we can see which schema changes a load package introduced with the command:
|
||||
|
||||
```
|
||||
dlt pipeline -v <pipeline_name> load-package
|
||||
```
|
||||
|
||||
The verbose flag only shows schema changes, so if we run it **without** the flag, we will still see the most recent load package info:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import subprocess
|
||||
|
||||
subprocess.run(["dlt", "pipeline", "github_pipeline", "load-package"], check=True)
|
||||
return (subprocess,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The `load_id` of a particular package is added to the top data tables (parent tables) and to the special `_dlt_loads` table with a status of `0` when the load process is fully completed. The `_dlt_loads` table tracks completed loads and allows chaining transformations on top of them.
|
||||
|
||||
We can also view load package info for a specific `load_id` (replace the value with the one output above):
|
||||
""")
|
||||
return
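

@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
Instead of hard-coding the `load_id` (as the CLI cell below does), you can also take it from `load_info` and ask the pipeline for the package directly; a sketch:

```python
# Sketch: fetch the load package info for the load_id of the run above.
load_id = load_info.loads_ids[0]
print(pipeline.get_load_package_info(load_id))
```
    """)
    return
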
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(
|
||||
["dlt", "pipeline", "github_pipeline", "load-package", "1741348101.3398592"],
|
||||
check=True,
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(1) Python**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
From the [`Inspecting & Adjusting Schema`](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) section, we've also learned that a schema can be accessed with:
|
||||
|
||||
```python
|
||||
print(load_info.load_packages[0].schema)
|
||||
```
|
||||
Similarly, if we drop the schema part, we will get the load package info:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(load_info):
|
||||
print(load_info.load_packages[0])
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""which has the following public methods and attributes:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(load_info):
|
||||
    # This code snippet just prints out the public methods and attributes of the load package object in load info
|
||||
all_attributes_methods = dir(load_info.load_packages[0])
|
||||
public_attributes_methods = [
|
||||
attr for attr in all_attributes_methods if not attr.startswith("_")
|
||||
]
|
||||
|
||||
print(f"{'Attribute/Method':<50} {'Type':<10}")
|
||||
print("-" * 40)
|
||||
for attr in public_attributes_methods:
|
||||
attr_value = getattr(load_info.load_packages[0], attr)
|
||||
if callable(attr_value):
|
||||
print(f"{attr:<50} {'method':<10}")
|
||||
else:
|
||||
print(f"{attr:<50} {'attribute':<10}")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **Trace**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""`Trace`: A trace is a detailed record of the execution of a pipeline. It provides rich information on the pipeline processing steps: **extract**, **normalize**, and **load**. It also shows the last `load_info`."""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(0) CLI**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
You can access the pipeline trace using the command:
|
||||
|
||||
|
||||
```
|
||||
dlt pipeline <pipeline_name> trace
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Try running it on the github issues pipeline:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(["dlt", "pipeline", "github_pipeline", "trace"], check=True)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(1) Python**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""We can also print out the trace in code:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
# print human friendly trace information
|
||||
print(pipeline.last_trace)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Separately receive the extract stage info:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
    # print human friendly extract information
|
||||
print(pipeline.last_trace.last_extract_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""As well as the normalization stage info with:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
# print human friendly normalization information
|
||||
print(pipeline.last_trace.last_normalize_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""How many rows of data were normalized:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
# access row counts dictionary of normalize info
|
||||
print(pipeline.last_trace.last_normalize_info.row_counts)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""And finally the load stage info:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline):
|
||||
# print human friendly load information
|
||||
print(pipeline.last_trace.last_load_info)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
## **State**
|
||||
|
||||
[`The pipeline state`](https://dlthub.com/docs/general-usage/state) is a Python dictionary that lives alongside your data. You can store values in it during a pipeline run, and then retrieve them in the next pipeline run. It's used for tasks like preserving the "last value" or similar loading checkpoints, and it gets committed atomically with the data. The state is stored locally in the pipeline working directory and is also stored at the destination for future runs.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**When to use pipeline state**
|
||||
- `dlt` uses the state internally to implement last-value incremental loading. This use case should cover around 90% of your pipeline state needs.
|
||||
- Store a list of already requested entities if the list is not much bigger than 100k elements.
|
||||
- Store large dictionaries of last values if you are not able to implement it with the standard incremental construct.
|
||||
- Store custom field dictionaries, dynamic configurations, and other source-scoped state.
|
||||
|
||||
**When not to use pipeline state**
|
||||
|
||||
Do not use `dlt` state when it may grow to millions of elements.
|
||||
For example, storing modification timestamps for millions of user records is a bad idea.
|
||||
In that case, you could:
|
||||
|
||||
- Store the state in DynamoDB, Redis, etc., keeping in mind that if the extract stage fails, you may end up with invalid state.
|
||||
- Use your loaded data as the state. `dlt` exposes the current pipeline via `dlt.current.pipeline()`, from which you can obtain a `sql_client` and load the data you need. If you choose this approach, try to process your user records in batches.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(0) CLI**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(["dlt", "pipeline", "-v", "github_pipeline", "info"], check=True)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **(1) Python**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import json
|
||||
|
||||
def read_state(filepath: str) -> str:
|
||||
with open(filepath, "r", encoding="utf-8") as file:
|
||||
data = json.load(file)
|
||||
pretty_json = json.dumps(data, indent=4)
|
||||
return pretty_json
|
||||
return (read_state,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(read_state):
|
||||
# stored in your default pipelines folder
|
||||
print(read_state("/var/dlt/pipelines/github_pipeline/state.json"))
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **Modify State**
|
||||
|
||||
The pipeline state is a Python dictionary that lives alongside your data; you can store values in it and, on the next pipeline run, request them back.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
#### **(0) Resource state**
|
||||
|
||||
You can **read** and **write** the state in your resources using:
|
||||
|
||||
```python
|
||||
dlt.current.resource_state().get(key)
|
||||
```
|
||||
and
|
||||
|
||||
```python
|
||||
dlt.current.resource_state().setdefault(key, value)
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(
|
||||
BearerTokenAuth,
|
||||
DltResource,
|
||||
HeaderLinkPaginator,
|
||||
Iterable,
|
||||
RESTClient,
|
||||
TDataItems,
|
||||
dlt,
|
||||
os,
|
||||
):
|
||||
dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY")
|
||||
|
||||
@dlt.source
|
||||
def _github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]:
|
||||
client = RESTClient(
|
||||
base_url="https://api.github.com",
|
||||
auth=BearerTokenAuth(token=secret_key),
|
||||
paginator=HeaderLinkPaginator(),
|
||||
)
|
||||
|
||||
@dlt.resource
|
||||
def github_pulls(
|
||||
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
|
||||
"updated_at", initial_value="2024-12-01"
|
||||
)
|
||||
) -> TDataItems:
|
||||
dlt.current.resource_state().setdefault(
|
||||
"new_key", ["first_value", "second_value"]
|
||||
)
|
||||
params = {"since": cursor_date.last_value, "status": "open"}
|
||||
for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params):
|
||||
yield page
|
||||
|
||||
return github_pulls
|
||||
|
||||
pipeline_1 = dlt.pipeline(
|
||||
pipeline_name="github_pipeline",
|
||||
destination="duckdb",
|
||||
dataset_name="github_data",
|
||||
)
|
||||
load_info_1 = pipeline_1.run(_github_source())
|
||||
print(load_info_1)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(read_state):
|
||||
print(read_state("/var/dlt/pipelines/github_pipeline/state.json"))
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""In the state, you will see the new items:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r""""""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
You can modify any item in the state dict:
|
||||
|
||||
```python
|
||||
new_keys = dlt.current.resource_state().setdefault("new_key", ["first_value", "second_value"])
|
||||
|
||||
if "something_happend":
|
||||
new_keys.append("third_value")
|
||||
|
||||
incremental_dict = dlt.current.resource_state().get("incremental")
|
||||
incremental_dict.update({"second_new_key": "fourth_value"})
|
||||
```
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""Full example:""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(
|
||||
BearerTokenAuth,
|
||||
DltResource,
|
||||
HeaderLinkPaginator,
|
||||
Iterable,
|
||||
RESTClient,
|
||||
TDataItems,
|
||||
dlt,
|
||||
os,
|
||||
):
|
||||
dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY")
|
||||
|
||||
@dlt.source
|
||||
def _github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]:
|
||||
client = RESTClient(
|
||||
base_url="https://api.github.com",
|
||||
auth=BearerTokenAuth(token=secret_key),
|
||||
paginator=HeaderLinkPaginator(),
|
||||
)
|
||||
|
||||
@dlt.resource
|
||||
def github_pulls(
|
||||
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
|
||||
"updated_at", initial_value="2024-12-01"
|
||||
)
|
||||
) -> TDataItems:
|
||||
new_keys = dlt.current.resource_state().setdefault(
|
||||
"new_key", ["first_value", "second_value"]
|
||||
)
|
||||
if "something_happened":
|
||||
new_keys.append("third_value")
|
||||
incremental_dict = dlt.current.resource_state().get("incremental")
|
||||
incremental_dict.update({"second_new_key": "fourth_value"})
|
||||
params = {"since": cursor_date.last_value, "status": "open"}
|
||||
for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params):
|
||||
yield page
|
||||
|
||||
return github_pulls
|
||||
|
||||
pipeline_2 = dlt.pipeline(
|
||||
pipeline_name="github_pipeline",
|
||||
destination="duckdb",
|
||||
dataset_name="github_data",
|
||||
)
|
||||
load_info_2 = pipeline_2.run(_github_source())
|
||||
print(load_info_2)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(read_state):
|
||||
print(read_state("/var/dlt/pipelines/github_pipeline/state.json"))
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
#### **(1) Source state**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
You can also access the source-scoped state with `dlt.current.source_state()` which can be shared across resources of a particular source and is also available **read-only** in the source-decorated functions. The most common use case for the source-scoped state is to store the mapping of custom fields to their displayable names.
|
||||
|
||||
Let's read some custom keys from the state with:
|
||||
```python
|
||||
source_new_keys = dlt.current.source_state().get("resources", {}).get("github_pulls", {}).get("new_key")
|
||||
```
|
||||
Full example:
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(
|
||||
BearerTokenAuth,
|
||||
DltResource,
|
||||
HeaderLinkPaginator,
|
||||
Iterable,
|
||||
RESTClient,
|
||||
TDataItems,
|
||||
dlt,
|
||||
os,
|
||||
):
|
||||
dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY")
|
||||
|
||||
@dlt.source
|
||||
def _github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]:
|
||||
client = RESTClient(
|
||||
base_url="https://api.github.com",
|
||||
auth=BearerTokenAuth(token=secret_key),
|
||||
paginator=HeaderLinkPaginator(),
|
||||
)
|
||||
|
||||
@dlt.resource
|
||||
def github_pulls(
|
||||
cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(
|
||||
"updated_at", initial_value="2024-12-01"
|
||||
)
|
||||
) -> TDataItems:
|
||||
params = {"since": cursor_date.last_value, "status": "open"}
|
||||
for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params):
|
||||
yield page
|
||||
source_new_keys = (
|
||||
dlt.current.source_state()
|
||||
.get("resources", {})
|
||||
.get("github_pulls", {})
|
||||
.get("new_key")
|
||||
)
|
||||
print("My custom values: ", source_new_keys)
|
||||
|
||||
return github_pulls
|
||||
|
||||
pipeline_3 = dlt.pipeline(
|
||||
pipeline_name="github_pipeline",
|
||||
destination="duckdb",
|
||||
dataset_name="github_data",
|
||||
)
|
||||
load_info_3 = pipeline_3.run(_github_source())
|
||||
print(load_info_3)
|
||||
return (pipeline_3,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **Sync State**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
What if you run your pipeline on, for example, Airflow, where every task gets a clean filesystem and the pipeline working directory is always deleted?
|
||||
|
||||
**dlt loads** your **state** into the destination **together** with all other **data**, and when starting from a clean slate, it will try to restore the state from the destination.
|
||||
|
||||
The remote state is identified by the pipeline name, the destination location (as defined by the credentials), and the destination dataset.
|
||||
To reuse **the same state**, use **the same pipeline name** and the same destination.
|
||||
|
||||
The state is stored in the `_dlt_pipeline_state` table at the destination and contains information about the pipeline, the pipeline run (to which the state belongs), and the state blob.
|
||||
|
||||
`dlt` provides the command:
|
||||
|
||||
```
|
||||
dlt pipeline <pipeline name> sync
|
||||
```
|
||||
|
||||
which retrieves the state from that table.
|
||||
|
||||
💡 If you can keep the pipeline working directory across runs, you can disable state sync by setting `restore_from_destination = false` in your `config.toml`, as shown below.
|
||||
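A minimal `config.toml` sketch (placed in the usual `.dlt` folder):

```toml
# .dlt/config.toml
restore_from_destination = false
```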
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pipeline_3):
|
||||
import duckdb
|
||||
from IPython.display import display
|
||||
|
||||
conn = duckdb.connect(f"{pipeline_3.pipeline_name}.duckdb")
|
||||
# a database 'github_pipeline.duckdb' was created in the working directory, so just connect to it
|
||||
conn.sql(f"SET search_path = '{pipeline_3.dataset_name}'")
|
||||
stats_table = conn.sql("SELECT * FROM _dlt_pipeline_state").df()
|
||||
display(stats_table)
|
||||
return (conn,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""The "state" column is a compressed json dictionary.""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
|index|version|engine\_version|pipeline\_name|state|created\_at|version\_hash|\_dlt\_load\_id|\_dlt\_id|
|
||||
|---|---|---|---|---|---|---|---|---|
|
||||
|0|1|4|github\_pipeline|eNplkN....6+/m/QA7mbNc|2025-03-10 14:02:34\.340458+00:00|pnp+9AIA5jAGx5LKon6zWmPnfYVb10ROa5aIKjv9O0I=|1741615353\.5473728|FOzn5XuSZ/y/BQ|
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(
|
||||
["dlt", "--non-interactive", "pipeline", "github_pipeline", "sync"], check=True
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
### **Reset State**
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**To fully reset the state:**
|
||||
|
||||
- Drop the destination dataset to fully reset the pipeline.
|
||||
- Set the `dev_mode` flag when creating the pipeline.
|
||||
- Use the `dlt pipeline drop --drop-all` command to drop state and tables for a given schema name.
|
||||
|
||||
**To partially reset the state:**
|
||||
|
||||
- Use the `dlt pipeline drop <resource_name>` command to drop state and tables for a given resource.
|
||||
- Use the `dlt pipeline drop --state-paths` command to reset the state at a given path without touching the tables or data (see the example below).
|
||||
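For example, a hypothetical partial reset of a single state path (the path is illustrative; adjust it to your own state layout):

```
dlt pipeline github_pipeline drop --state-paths "resources.github_pulls.new_key"
```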
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
**Example for a partial reset:**
|
||||
|
||||
> In a notebook environment, if the duckdb connection we opened above is still open, close it before attempting to edit the pipeline through the CLI.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(conn):
|
||||
conn.close()
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(
|
||||
["dlt", "pipeline", "github_pipeline", "drop", "github_pulls"],
|
||||
input="y\n",
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(subprocess):
|
||||
subprocess.run(["dlt", "pipeline", "-v", "github_pipeline", "info"], check=True)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
🎊🎊🎊 That's it! We hope you enjoyed this course and learned more about `dlt`! 🎊🎊🎊
|
||||
|
||||
Please share your feedback with us: [Feedback Google Form](https://forms.gle/1NYrGcRj5gLQ4WDt8) 🌼
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
return (mo,)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
@@ -43,6 +43,7 @@ dependencies = [
|
||||
"regex>=2025.10.23",
|
||||
"pytest-forked>=1.6.0",
|
||||
"databind>=4.5.2",
|
||||
"marimo>=0.17.4",
|
||||
]
|
||||
|
||||
|
||||
|
||||
2
docs/uv.lock
generated
@@ -1121,6 +1121,7 @@ dependencies = [
|
||||
{ name = "google-api-python-client" },
|
||||
{ name = "google-auth-oauthlib" },
|
||||
{ name = "lancedb" },
|
||||
{ name = "marimo" },
|
||||
{ name = "modal" },
|
||||
{ name = "mypy" },
|
||||
{ name = "nbqa" },
|
||||
@@ -1160,6 +1161,7 @@ requires-dist = [
|
||||
{ name = "google-api-python-client", specifier = ">=1.7.11" },
|
||||
{ name = "google-auth-oauthlib", specifier = ">=1.0.0,<2" },
|
||||
{ name = "lancedb", marker = "python_full_version < '3.13'", specifier = ">=0.8.2" },
|
||||
{ name = "marimo", specifier = ">=0.17.4" },
|
||||
{ name = "modal", specifier = ">=0.64.170" },
|
||||
{ name = "modal", specifier = ">=1.2.1" },
|
||||
{ name = "mypy", specifier = ">=1.11.0,<1.13.0" },
|
||||
|
||||
@@ -10,34 +10,34 @@ In this course, you'll go far beyond the basics. You’ll build production-grade
|
||||
|
||||
## Lessons
|
||||
|
||||
### **Lesson 1: Custom Sources – REST APIs & RESTClient** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb)
|
||||
### **Lesson 1: Custom Sources – REST APIs & RESTClient** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb)
|
||||
|
||||
Learn how to build flexible REST API connectors from scratch using `@dlt.resource` and the powerful `RESTClient`.
|
||||
|
||||
### **Lesson 2: Custom Sources – SQL Databases** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)
|
||||
### **Lesson 2: Custom Sources – SQL Databases** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)
|
||||
|
||||
Connect to any SQL-compatible database, reflect table schemas, write query adapters, and selectively ingest data using `sql_database`.
|
||||
|
||||
### **Lesson 3: Custom Sources – Filesystems & Cloud Storage** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)
|
||||
### **Lesson 3: Custom Sources – Filesystems & Cloud Storage** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)
|
||||
Build sources that read from local or remote files (S3, GCS, Azure).
|
||||
|
||||
### **Lesson 4: Custom Destinations – Reverse ETL** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)
|
||||
### **Lesson 4: Custom Destinations – Reverse ETL** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)
|
||||
Use `@dlt.destination` to send data back to APIs like Notion, Slack, or Airtable. Learn batching, retries, and idempotent patterns.
|
||||
|
||||
### **Lesson 5: Transforming Data Before & After Load** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)
|
||||
### **Lesson 5: Transforming Data Before & After Load** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)
|
||||
|
||||
Learn when and how to apply `add_map`, `add_filter`, `@dlt.transformer`, or even post-load transformations via SQL or Ibis. Control exactly how your data looks.
|
||||
|
||||
### **Lesson 6: Write Disposition Strategies & Advanced Tricks** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)
|
||||
### **Lesson 6: Write Disposition Strategies & Advanced Tricks** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)
|
||||
Understand how to use `replace` and `merge`, and combine them with schema hints and incremental loading.
|
||||
|
||||
### **Lesson 7: Data Contracts** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)
|
||||
### **Lesson 7: Data Contracts** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)
|
||||
Define expectations on schema, enforce data types and behaviors, and lock down your schema evolution. Ensure reliable downstream use of your data.
|
||||
|
||||
### **Lesson 8: Logging & Tracing** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)
|
||||
### **Lesson 8: Logging & Tracing** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)
|
||||
Track every step of your pipeline: from extraction to load. Use logs, traces, and metadata to debug and analyze performance.
|
||||
|
||||
### **Lesson 9: Performance Optimization** [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)
|
||||
### **Lesson 9: Performance Optimization** [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)
|
||||
Handle large datasets, tune buffer sizes, parallelize resource extraction, optimize memory usage, and reduce pipeline runtime.
|
||||
|
||||
## Homework & Certification
|
||||
|
||||
@@ -10,42 +10,41 @@ In this course you will learn the fundamentals of `dlt` alongside some of the mo
|
||||
|
||||
## Lessons
|
||||
|
||||
### Lesson 1: Quick Start [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb)
|
||||
### Lesson 1: Quick Start [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb)
|
||||
|
||||
Discover what dlt is, run your first pipeline with toy data, and explore it like a pro using DuckDB, `sql_client`, and dlt datasets!
|
||||
|
||||
### Lesson 2: dlt Resources and Sources [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)
|
||||
### Lesson 2: dlt Resources and Sources [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)
|
||||
|
||||
Learn to run pipelines with diverse data sources (dataframes, databases, and REST APIs),
|
||||
master `dlt.resource`, `dlt.source`, and `dlt.transformer`, and create your first REST API pipeline!
|
||||
|
||||
### Lesson 3: Pagination & Authentication & dlt Configuration [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)
|
||||
### Lesson 3: Pagination & Authentication & dlt Configuration [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)
|
||||
|
||||
|
||||
Since it is never a good idea to put your API keys into your code publicly, different environments provide different methods to set and access these secret keys, and `dlt` is no different.
|
||||
Master pagination and authentication for REST APIs, explore dlt's RESTClient and manage secrets and configs.
|
||||
|
||||
### Lesson 4: Using dlt's pre-built Sources and Destinations [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)
|
||||
### Lesson 4: Using dlt's pre-built Sources and Destinations [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)
|
||||
|
||||
|
||||
Now that you have taken a data source and loaded it into a `duckdb` destination, it is time to look at what other possibilities `dlt` offers.
|
||||
In this notebook we will take a look at pre-built verified sources and destinations and how to use them.
|
||||
|
||||
### Lesson 5: Write disposition and incremental loading [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)
|
||||
|
||||
### Lesson 5: Write disposition and incremental loading [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)
|
||||
|
||||
Learn to control data behavior with dlt write dispositions (Append, Replace, Merge), master incremental loading, and efficiently update and deduplicate your datasets.
|
||||
|
||||
### Lesson 6: How dlt works [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)
|
||||
|
||||
### Lesson 6: How dlt works [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)
|
||||
|
||||
Discover the magic behind `dlt`! Learn its three main steps — Extract, Normalize, Load — along with default behaviors and supported file formats.
|
||||
|
||||
### Lesson 7: Inspecting & Adjusting Schema [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)
|
||||
### Lesson 7: Inspecting & Adjusting Schema [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)
|
||||
|
||||
|
||||
dlt creates and manages the schema automatically, but what if you want to control it yourself? Explore the schema and customize it to your needs easily with dlt!
|
||||
|
||||
### Lesson 8: Understanding Pipeline State & Metadata [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)
|
||||
### Lesson 8: Understanding Pipeline State & Metadata [](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.py) [](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)
|
||||
|
||||
|
||||
Having learnt about pipelines and how to move data from one place to another, we now look at information about the pipeline itself: the metadata of a pipeline that can be accessed and edited through dlt.
|
||||
|
||||
@@ -249,6 +249,8 @@ dev = [
|
||||
"pydoclint>=0.6.5,<0.7",
|
||||
"types-paramiko>=3.5.0.20250708",
|
||||
"graphviz>=0.21",
|
||||
# limits sqlglot - remove when #3489 is fixed
|
||||
"sqlglot<28.1",
|
||||
]
|
||||
|
||||
# NOTE: those dependencies are used to test built in sources
|
||||
|
||||
2
uv.lock
generated
@@ -2282,6 +2282,7 @@ dev = [
|
||||
{ name = "requests-mock" },
|
||||
{ name = "ruff" },
|
||||
{ name = "sqlfluff" },
|
||||
{ name = "sqlglot" },
|
||||
{ name = "types-cachetools" },
|
||||
{ name = "types-click" },
|
||||
{ name = "types-deprecated" },
|
||||
@@ -2487,6 +2488,7 @@ dev = [
|
||||
{ name = "requests-mock", specifier = ">=1.10.0,<2" },
|
||||
{ name = "ruff", specifier = ">=0.3.2,<0.4" },
|
||||
{ name = "sqlfluff", specifier = ">=2.3.2,<3" },
|
||||
{ name = "sqlglot", specifier = "<28.1" },
|
||||
{ name = "types-cachetools", specifier = ">=4.2.9" },
|
||||
{ name = "types-click", specifier = ">=7.1.8,<8" },
|
||||
{ name = "types-deprecated", specifier = ">=1.2.9.2,<2" },
|
||||
|
||||