diff --git a/.github/workflows/test_docs.yml b/.github/workflows/test_docs.yml index 1215557b3..32aa213c3 100644 --- a/.github/workflows/test_docs.yml +++ b/.github/workflows/test_docs.yml @@ -106,3 +106,9 @@ jobs: - name: run docs preprocessor run: cd docs && make preprocess-docs + + - name: test preprocess_to_molab + run: cd docs && make test-preprocess-molabs + + - name: Ensure marimo notebooks are up-to-date + run: cd docs && make validate-molabs diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index c6d55991f..04d95555d 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -40,7 +40,8 @@ except ModuleNotFoundError: raise MissingDependencyException( "dlt pyarrow helpers", [f"{version.DLT_PKG_NAME}[parquet]"], - "Install pyarrow to be allow to load arrow tables, panda frames and to use parquet files.", + "Install pyarrow to be allowed to load arrow tables, panda frames and to use parquet" + " files.", ) import ctypes diff --git a/docs/Makefile b/docs/Makefile index 8f517fb0c..f55d90cef 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -27,10 +27,9 @@ test-examples: ## Tests the examples in the examples folder test-snippets: ## Tests the snippets in the snippets folder cd website/docs && uv run pytest --ignore=node_modules -format: ## Formats the docs tooling, notebooks, and examples +format: ## Formats the docs tooling, website, examples, and notebooks uv run black docs_tools website examples - uv run black education --ipynb - + uv run black education/*/*.ipynb --ipynb generate-api-ref: ## Generates the API reference documentation from dlt codebase for website cd docs_tools/api_docs && uv run pydoc-markdown @@ -43,3 +42,14 @@ preprocess-docs: ## Preprocesses the docs pages, copies docs to docs_processed preprocess-docs-watch: ## Preprocesses the docs pages, copies docs to docs_processed folder and inserts snippets and tuba links and watches for changes uv run preprocess-docs --watch +test-preprocess-molabs: ## Tests functions used to build Molabs + uv run pytest docs_tools/education/tests + +build-molabs: ## Format the notebooks files first and build Molabs + uv run black education/*/*.ipynb --ipynb + uv run python docs_tools/education/preprocess_to_molab.py + uv run black education/*/*.py + uv run marimo check education/*/*.py --fix --quiet + +validate-molabs: build-molabs ## Validate marimo notebooks are up-to-date + git diff --quiet --exit-code -- education/ diff --git a/docs/docs_tools/education/__init__.py b/docs/docs_tools/education/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/docs/docs_tools/education/preprocess_to_molab.py b/docs/docs_tools/education/preprocess_to_molab.py new file mode 100644 index 000000000..3230f3e3f --- /dev/null +++ b/docs/docs_tools/education/preprocess_to_molab.py @@ -0,0 +1,290 @@ +import json +import re +import shlex +import subprocess +from pathlib import Path +from typing import Dict, Any + +EDUCATION_NOTEBOOKS_DIR = Path(__file__).parent.parent.parent / "education" +TEMP_IPYNB_FILE_PREIFX = "tmp" + +MUST_INSTALL_PACKAGES = {"numpy", "pandas", "sqlalchemy"} + + +def replace_colab_imports_in_notebook(notebook_dict: Dict[str, Any]) -> Dict[str, Any]: + """ + Remove Google Colab-specific imports and replace Colab API calls with standard Python. + + Google Colab provides special APIs like `google.colab.userdata` for accessing secrets + that don't exist outside the Colab environment. 
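+    For example, a Colab cell line such as `api_key = userdata.get("API_KEY")` only works
+    inside Colab; after preprocessing it becomes `api_key = os.getenv("API_KEY")`.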
This function: + - Removes: `from google.colab import userdata` (and similar imports) + - Replaces: `userdata.get(...)` → `os.getenv(...)` + + Args: + notebook_dict: Notebook as a Python dictionary + + Returns: + Modified notebook dictionary + """ + for cell in notebook_dict.get("cells", []): + if cell.get("cell_type") == "code": + source = cell.get("source", []) + if isinstance(source, list): + # Remove lines with Google Colab imports + source = [ + line + for line in source + if not re.match(r"^\s*from google\.colab import", line) + ] + # Replace userdata.get with os.getenv + source = [ + line.replace("userdata.get(", "os.getenv(") for line in source + ] + cell["source"] = source + + return notebook_dict + + +def process_shell_commands_in_notebook( + notebook_dict: Dict[str, Any] +) -> tuple[Dict[str, Any], set[str]]: + """ + Convert Jupyter shell commands to Python subprocess calls and extract dependencies. + + Jupyter/Colab notebooks support shell commands with `!` syntax (e.g., `!pip install dlt`), + but this is IPython-specific magic syntax that doesn't work in standard Python or Marimo. + This function: + - Extracts package names from `!pip install` commands for dependency tracking + - Converts other `!command` shell commands to `subprocess.run()` calls + - Removes notebook-specific magic commands (e.g., `%%capture`) + + Args: + notebook_dict: Notebook as a Python dictionary + + Returns: + Tuple of (modified notebook dict, set of package names extracted from pip install commands) + """ + packages: set[str] = set() + subprocess_imported: bool = False + + for cell in notebook_dict.get("cells", []): + if cell.get("cell_type") == "code": + cell_code = cell.get("source", []) + new_cell_code = [] + + for line in cell_code: + stripped = line.strip() + + # skip magic commands + if stripped.startswith("%%capture"): + continue + + # extract packages from pip install + if stripped.startswith("!pip install"): + match = re.search(r"!pip install\s+(.+?)(?:\n|$)", stripped) + if match: + cleaned = ( + match.group(1).strip().replace('"', "").replace("'", "") + ) + # Remove spaces around commas in brackets + cleaned = re.sub(r"\[\s*", "[", cleaned) # Remove space after [ + cleaned = re.sub( + r"\s*\]", "]", cleaned + ) # Remove space before ] + cleaned = re.sub( + r",\s+", ",", cleaned + ) # Remove space after commas + + pkgs = [ + p.strip() + for p in cleaned.split() + if p.strip() and not p.startswith("-") + ] # Filter flags + packages.update(pkgs) + continue + + # convert other shell commands + elif stripped.startswith("!"): + if not subprocess_imported: + new_cell_code.append("import subprocess\n") + subprocess_imported = True + cmd = stripped[1:] + new_line = _build_subprocess_line(cmd) + "\n" + new_cell_code.append(new_line) + + else: + new_cell_code.append(line) + + cell["source"] = new_cell_code + + return notebook_dict, packages + + +def add_inline_dependencies_to_content(packages: set[str], py_content: str) -> str: + """ + Add PEP 723 inline script metadata block with dependencies. + + Marimo/Molab can automatically install packages when they're declared using PEP 723 + inline script metadata. 
The dependency list includes: + - Packages extracted from !pip install commands in the original notebook + - MUST_INSTALL_PACKAGES (core dependencies required for all notebooks) + + Args: + packages: Set of package names to include (will be merged with MUST_INSTALL_PACKAGES) + py_content: The Python file content as a string + + Returns: + Python content with PEP 723 metadata block prepended + + NOTE: Without this, users would need to go through a step of manually installing packages before running + the notebook (Marimo will try to install missing imports, which is not exactly nice for a smooth experience. + Also, some libraries used under the hood are not directly imported and are not caught by Marimo). + + Format: + # /// script + # dependencies = [ + # "package1", + # "package2", + # ] + # /// + """ + packages = packages.copy() # Don't mutate the input set + packages.update(MUST_INSTALL_PACKAGES) + if not packages: + return py_content + + pkg_lines = "\n".join(f'# "{pkg}",' for pkg in sorted(packages)) + deps_block = f"""# /// script +# dependencies = [ +{pkg_lines} +# ] +# /// + +""" + + return deps_block + py_content + + +def read_notebook(ipynb_path: Path) -> Dict[str, Any]: + """ + Read a Jupyter notebook file and return as a dictionary. + + Args: + ipynb_path: Path to the .ipynb file + + Returns: + Notebook data as a Python dictionary + """ + data: Dict[str, Any] = json.loads(ipynb_path.read_text(encoding="utf-8")) + return data + + +def write_notebook(notebook_dict: Dict[str, Any], output_path: Path) -> None: + """ + Write a notebook dictionary to a file. + + Args: + notebook_dict: Notebook data as a Python dictionary + output_path: Path where the notebook should be written + """ + output_path.write_text( + json.dumps(notebook_dict, indent=1, ensure_ascii=False), encoding="utf-8" + ) + + +def convert_notebook_to_marimo(temp_ipynb_path: Path) -> str: + """ + Convert a Jupyter notebook to Marimo Python format using marimo CLI. + + Args: + temp_ipynb_path: Path to the temporary preprocessed notebook + + Returns: + Marimo Python file content as a string + """ + result = subprocess.run( + ["marimo", "convert", str(temp_ipynb_path)], + capture_output=True, + text=True, + check=True, + ) + return result.stdout + + +def write_python_file(content: str, output_path: Path) -> None: + """ + Write Python content to a file. + + Args: + content: Python file content as a string + output_path: Path where the file should be written + """ + output_path.write_text(content, encoding="utf-8") + + +def _build_subprocess_line(cmd: str) -> str: + """ + Generate a subprocess.run() call string from a shell command. + + This helper converts various shell command patterns to their Python subprocess + equivalents, handling special cases like piped input. 
+ + Conversion rules: + - Simple commands: `command arg` → `subprocess.run(['command', 'arg'], check=True)` + - Yes piping: `yes | command` → `subprocess.run(['command'], input='y\\n', ...)` + - No piping: `no | command` → `subprocess.run(['command'], input='n\\n', ...)` + - Complex pipes: `cmd1 | cmd2` → `subprocess.run('cmd1 | cmd2', shell=True, ...)` + + Args: + cmd: The shell command string (without the leading `!`) + + Returns: + A string containing Python code for subprocess.run() + """ + cmd = cmd.strip() + + # No pipe → simple list argv + if "|" not in cmd: + argv = shlex.split(cmd) + return f"subprocess.run({argv!r}, check=True)" + + # Split pipe + left, right = map(str.strip, cmd.split("|", 1)) + left_lower = left.lower() + + # yes | command → feed "y\n" + if left_lower == "yes": + argv = shlex.split(right) + return f"subprocess.run({argv!r}, input='y\\n', text=True, check=True)" + + # no | command → feed "n\n" + if left_lower == "no": + argv = shlex.split(right) + return f"subprocess.run({argv!r}, input='n\\n', text=True, check=True)" + + # generic pipe: shell=True fallback + return f"subprocess.run({cmd!r}, shell=True, check=True)" + + +if __name__ == "__main__": + for ipynb_file in EDUCATION_NOTEBOOKS_DIR.glob("*/*.ipynb"): + # 1. Read notebook file + notebook_dict = read_notebook(ipynb_file) + # 2. Replace Colab imports + notebook_dict = replace_colab_imports_in_notebook(notebook_dict) + # 3. Process shell commands + notebook_dict, packages = process_shell_commands_in_notebook(notebook_dict) + # 4. Write temporary notebook + temp_ipynb_file = ipynb_file.with_name( + f"{TEMP_IPYNB_FILE_PREIFX}_{ipynb_file.name}" + ) + write_notebook(notebook_dict, temp_ipynb_file) + # 5. Convert to Marimo format + py_content = convert_notebook_to_marimo(temp_ipynb_file) + # 6. Add inline dependencies + py_content_with_deps = add_inline_dependencies_to_content(packages, py_content) + # 7. Write final Python file + output_path = ipynb_file.with_suffix(".py") + write_python_file(py_content_with_deps, output_path) + # 8. 
Clean up temporary files
+        temp_ipynb_file.unlink()
diff --git a/docs/docs_tools/education/tests/__init__.py b/docs/docs_tools/education/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/docs/docs_tools/education/tests/test_preprocess_to_molab.py b/docs/docs_tools/education/tests/test_preprocess_to_molab.py
new file mode 100644
index 000000000..c3513e83a
--- /dev/null
+++ b/docs/docs_tools/education/tests/test_preprocess_to_molab.py
@@ -0,0 +1,108 @@
+import pytest
+from docs_tools.education.preprocess_to_molab import (
+    replace_colab_imports_in_notebook,
+    process_shell_commands_in_notebook,
+    add_inline_dependencies_to_content,
+)
+
+
+def test_replace_colab_imports() -> None:
+    """Ensure that Colab-specific imports are removed and converted where necessary."""
+    notebook = {
+        "cells": [
+            {
+                "cell_type": "code",
+                "source": [
+                    "from google.colab import userdata\n",
+                    "api_key = userdata.get('API_KEY')\n",
+                    "print(api_key)\n",
+                ],
+            },
+        ]
+    }
+    result = replace_colab_imports_in_notebook(notebook)
+    assert result == {
+        "cells": [
+            {
+                "cell_type": "code",
+                "source": [
+                    "api_key = os.getenv('API_KEY')\n",
+                    "print(api_key)\n",
+                ],
+            },
+        ]
+    }
+
+
+def test_process_shell_commands_in_notebook() -> None:
+    """Ensure that pip install commands are removed and shell commands are converted."""
+    notebook = {
+        "cells": [
+            {
+                "cell_type": "code",
+                "source": [
+                    "!pip install dlt\n",
+                    "!pip install dlt[bigquery,postgres]\n",
+                    "!pip install requests==2.28.0\n",
+                    "!pip install -q scikit-learn\n",
+                ],
+            },
+            {
+                "cell_type": "code",
+                "source": [
+                    "!ls -la\n",
+                    "!pwd\n",
+                    "!yes | dlt init source destination\n",
+                    "!no | some_command --flag\n",
+                    "!cat file.txt | grep pattern\n",
+                    "%%capture\n",
+                    "print('hello')\n",
+                ],
+            },
+        ]
+    }
+
+    result, packages = process_shell_commands_in_notebook(notebook)
+    assert packages == {
+        "dlt",
+        "dlt[bigquery,postgres]",
+        "requests==2.28.0",
+        "scikit-learn",
+    }
+    assert result == {
+        "cells": [
+            {"cell_type": "code", "source": []},
+            {
+                "cell_type": "code",
+                "source": [
+                    "import subprocess\n",
+                    "subprocess.run(['ls', '-la'], check=True)\n",
+                    "subprocess.run(['pwd'], check=True)\n",
+                    "subprocess.run(['dlt', 'init', 'source', 'destination'], input='y\\n', text=True, check=True)\n",
+                    "subprocess.run(['some_command', '--flag'], input='n\\n', text=True, check=True)\n",
+                    "subprocess.run('cat file.txt | grep pattern', shell=True, check=True)\n",
+                    "print('hello')\n",
+                ],
+            },
+        ]
+    }
+
+
+def test_add_inline_dependencies_to_content() -> None:
+    """Ensure that the PEP 723 metadata block is correctly added and includes MUST_INSTALL_PACKAGES."""
+    packages = {"requests", "dlt[bigquery,postgres]"}
+    py_content = "import marimo\n"
+    result = add_inline_dependencies_to_content(packages, py_content)
+    expected = """# /// script
+# dependencies = [
+# "dlt[bigquery,postgres]",
+# "numpy",
+# "pandas",
+# "requests",
+# "sqlalchemy",
+# ]
+# ///
+
+import marimo
+"""
+    assert result == expected
diff --git a/docs/education/README.md b/docs/education/README.md
new file mode 100644
index 000000000..73908b9db
--- /dev/null
+++ b/docs/education/README.md
@@ -0,0 +1,31 @@
+# Adding New Notebooks
+
+## Overview
+
+The `.py` files in this directory are **auto-generated** from `.ipynb` files. Only edit the `.ipynb` files.
+
+ +To regenerate `.py` files: +```bash +make build-molabs +``` + +Preprocessing logic: [`docs/docs_tools/education/`](../docs_tools/education/) + +## Things to consider + +To ensure compatibility with both **Google Colab** and **Marimo/Molab**: + +### 1. **No inline comments** +Bad: `x = 5 # comment` +Good: Separate line comments + +**Why:** `marimo convert` scatters inline comments + +## Workflow + +1. Create/edit `.ipynb` in the course folder +2. Follow guidelines above +3. Run `make build-molabs` to generate `.py` files +4. Test both versions (Colab and Molab) +5. Commit both `.ipynb` and `.py` files +6. Make changes to the processing logic in `docs/docs_tools/education/` if necessary. \ No newline at end of file diff --git a/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb b/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb index aa52e30d6..2c41d22e1 100644 --- a/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb +++ b/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb @@ -6,7 +6,7 @@ "id": "TKD-8-XUjqU4" }, "source": [ - "# **Building custom sources with [dlt REST API source](https://dlthub.com/docs/devel/dlt-ecosystem/verified-sources/rest_api/basic) and [RESTClient](https://dlthub.com/docs/devel/general-usage/http/rest-client)** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb)" + "# **Building custom sources with [dlt REST API source](https://dlthub.com/docs/devel/dlt-ecosystem/verified-sources/rest_api/basic) and [RESTClient](https://dlthub.com/docs/devel/general-usage/http/rest-client)** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb)" ] }, { @@ -46,7 +46,9 @@ "We constructed a custom source for the **GitHub API** using the `RESTClient` class, decorators like `@dlt.resource` and `@dlt.source`, and manual pagination handling.\n", "\n", "\n", - "#### **Example**" + "#### **Example**\n", + "\n", + "> Don't forget to use your [GitHub API token](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28) below! 
" ] }, { @@ -81,7 +83,7 @@ "from google.colab import userdata\n", "\n", "\n", - "os.environ[\"ACCESS_TOKEN\"] = userdata.get(\"ACCESS_TOKEN\")\n", + "dlt.secrets[\"ACCESS_TOKEN\"] = userdata.get(\"ACCESS_TOKEN\")\n", "\n", "\n", "@dlt.source\n", @@ -148,7 +150,7 @@ " \"client\": {\n", " \"base_url\": \"https://api.github.com\",\n", " \"auth\": {\n", - " \"token\": dlt.secrets[\"access_token\"], # Access token configured above\n", + " \"token\": dlt.secrets[\"access_token\"],\n", " },\n", " \"paginator\": \"header_link\",\n", " },\n", @@ -182,14 +184,14 @@ "\n", "git_source = rest_api_source(config)\n", "\n", - "pipeline = dlt.pipeline(\n", + "rest_api_pipeline = dlt.pipeline(\n", " pipeline_name=\"rest_api_github\",\n", " destination=\"duckdb\",\n", " dataset_name=\"rest_api_data\",\n", " dev_mode=True,\n", ")\n", "\n", - "load_info = pipeline.run(git_source)\n", + "load_info = rest_api_pipeline.run(git_source)\n", "print(load_info)" ] }, @@ -212,7 +214,7 @@ "source": [ "If you don't like black boxes and prefer lower-level building blocks, then our `RESTClient` is perfect for you!\n", "\n", - "The `RESTClient` class offers an Pythonic interface for interacting with RESTful APIs, including features like:\n", + "The `RESTClient` class offers a Pythonic interface for interacting with RESTful APIs, including features like:\n", "\n", "- automatic pagination,\n", "- various authentication mechanisms,\n", @@ -225,7 +227,7 @@ "- How to build a custom `@dlt.source`\n", "- How to run the pipeline and inspect the data\n", "\n", - "For more information, read `dlt` [REST API Client](https://dlthub.com/devel/general-usage/http/rest-client) official documentation." + "For more information, read `dlt`'s official documentation for the [REST API Client](https://dlthub.com/devel/general-usage/http/rest-client)." ] }, { @@ -248,11 +250,10 @@ "source": [ "from dlt.sources.helpers.rest_client import RESTClient\n", "from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n", - "from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator\n", "from google.colab import userdata\n", "\n", "\n", - "os.environ[\"ACCESS_TOKEN\"] = userdata.get(\"ACCESS_TOKEN\")\n", + "dlt.secrets[\"ACCESS_TOKEN\"] = userdata.get(\"ACCESS_TOKEN\")\n", "\n", "\n", "client = RESTClient(\n", @@ -335,7 +336,7 @@ "\n", "#### **Authentication Details:**\n", "\n", - "To use NewsAPI, you must register for a **free account** and obtain an API key. This key is required for all endpoints and must be included as a query parameter in your request:\n", + "To use the NewsAPI, you must register for a **free account** and obtain an API key. This key is required for all endpoints and must be included as a query parameter in your request:\n", "\n", "```http\n", "GET /v2/everything?q=python&page=1&apiKey=YOUR_API_KEY\n", @@ -357,7 +358,7 @@ "\n", "1. **Sign up** at [https://newsapi.org/register](https://newsapi.org/register)\n", "2. Copy your **API key** from your dashboard\n", - "3. Save your **API key** in Colab Secrets (side-bar on the right) as NEWS_API_KEY\n", + "3. 
Save your **API key** in Colab (or Molab) Secrets (side-bar on the right) as NEWS_API_KEY\n", "\n", "\n", "### **How we chose the right authenticator for NewsAPI**\n", @@ -423,12 +424,12 @@ "\n", "api_key = userdata.get(\"NEWS_API_KEY\")\n", "\n", - "client = RESTClient(\n", + "news_api_client = RESTClient(\n", " base_url=\"https://newsapi.org/v2/\",\n", " auth=APIKeyAuth(name=\"apiKey\", api_key=api_key, location=\"query\"),\n", ")\n", "\n", - "response = client.get(\"everything\", params={\"q\": \"python\", \"page\": 1})\n", + "response = news_api_client.get(\"everything\", params={\"q\": \"python\", \"page\": 1})\n", "print(response.json())" ] }, @@ -503,16 +504,24 @@ }, "outputs": [], "source": [ - "page_iterator = client.paginate(\"everything\", params={\"q\": \"python\", \"page\": 1})\n", + "page_iterator = news_api_client.paginate(\n", + " \"everything\", params={\"q\": \"python\", \"page\": 1}\n", + ")\n", "# prints the original request object\n", "print(next(page_iterator).request)\n", - "page_iterator = client.paginate(\"everything\", params={\"q\": \"python\", \"page\": 1})\n", + "page_iterator = news_api_client.paginate(\n", + " \"everything\", params={\"q\": \"python\", \"page\": 1}\n", + ")\n", "# prints the raw HTTP response\n", "print(next(page_iterator).response)\n", - "page_iterator = client.paginate(\"everything\", params={\"q\": \"python\", \"page\": 1})\n", + "page_iterator = news_api_client.paginate(\n", + " \"everything\", params={\"q\": \"python\", \"page\": 1}\n", + ")\n", "# prints the paginator that was used\n", "print(next(page_iterator).paginator)\n", - "page_iterator = client.paginate(\"everything\", params={\"q\": \"python\", \"page\": 1})\n", + "page_iterator = news_api_client.paginate(\n", + " \"everything\", params={\"q\": \"python\", \"page\": 1}\n", + ")\n", "# prints the authentication class used\n", "print(next(page_iterator).auth)" ] @@ -545,7 +554,7 @@ "### **Question 1:**\n", "\n", "\n", - "Which paginator is used by `client.paginate()` by default in the example above?\n", + "Which paginator is used by `news_api_client.paginate()` by default in the example above?\n", "\n", "\n", ">Answer this question and select the correct option in the homework Google Form.\n" @@ -627,19 +636,19 @@ "api_key = userdata.get(\"NEWS_API_KEY\")\n", "\n", "\n", - "client = RESTClient(\n", + "another_client = RESTClient(\n", " base_url=\"https://newsapi.org/v2/\",\n", " auth=APIKeyAuth(name=\"apiKey\", api_key=api_key, location=\"query\"),\n", " paginator=PageNumberPaginator(\n", - " base_page=1, # NewsAPI starts paging from 1\n", - " page_param=\"page\", # Matches the API spec\n", - " total_path=None, # Set it to None explicitly\n", - " stop_after_empty_page=True, # Stop if no articles returned\n", - " maximum_page=4, # Optional limit for dev/testing\n", + " base_page=1,\n", + " page_param=\"page\",\n", + " total_path=None,\n", + " stop_after_empty_page=True,\n", + " maximum_page=4,\n", " ),\n", ")\n", "\n", - "for page in client.paginate(\n", + "for page in another_client.paginate(\n", " \"everything\", params={\"q\": \"python\", \"pageSize\": 5, \"language\": \"en\"}\n", "):\n", " for article in page:\n", @@ -670,14 +679,14 @@ "from dlt.sources.helpers.rest_client import RESTClient\n", "from dlt.sources.helpers.rest_client.auth import APIKeyAuth\n", "\n", - "os.environ[\"API_KEY\"] = userdata.get(\"NEWS_API_KEY\")\n", + "dlt.secrets[\"NEWS_API_KEY\"] = userdata.get(\"NEWS_API_KEY\")\n", "\n", "\n", "@dlt.resource(write_disposition=\"replace\", 
name=\"python_articles\")\n", - "def get_articles(api_key: str = dlt.secrets.value) -> Iterator[TDataItems]:\n", + "def get_articles(news_api_key: str = dlt.secrets.value) -> Iterator[TDataItems]:\n", " client = RESTClient(\n", " base_url=\"https://newsapi.org/v2/\",\n", - " auth=APIKeyAuth(name=\"apiKey\", api_key=api_key, location=\"query\"),\n", + " auth=APIKeyAuth(name=\"apiKey\", api_key=news_api_key, location=\"query\"),\n", " paginator=PageNumberPaginator(\n", " base_page=1,\n", " page_param=\"page\",\n", @@ -715,11 +724,11 @@ "from dlt.sources.helpers.rest_client import RESTClient\n", "from dlt.sources.helpers.rest_client.auth import APIKeyAuth\n", "\n", - "os.environ[\"API_KEY\"] = userdata.get(\"NEWS_API_KEY\")\n", + "dlt.secrets[\"NEWS_API_KEY\"] = userdata.get(\"NEWS_API_KEY\")\n", "\n", "\n", "@dlt.resource(write_disposition=\"replace\", name=\"top_articles\")\n", - "def get_top_articles(api_key: str = dlt.secrets.value) -> Iterator[TDataItems]:\n", + "def get_top_articles(news_api_key: str = dlt.secrets.value) -> Iterator[TDataItems]:\n", " client = RESTClient(\n", " base_url=\"https://newsapi.org/v2/\",\n", " auth=APIKeyAuth(name=\"apiKey\", api_key=api_key, location=\"query\"),\n", @@ -759,8 +768,8 @@ "outputs": [], "source": [ "@dlt.source\n", - "def newsapi_source(api_key: str = dlt.secrets.value) -> Iterable[DltResource]:\n", - " return [get_articles(api_key=api_key), get_top_articles(api_key=api_key)]" + "def newsapi_source(news_api_key: str = dlt.secrets.value) -> Iterable[DltResource]:\n", + " return [get_articles(news_api_key), get_top_articles(news_api_key)]" ] }, { @@ -843,7 +852,7 @@ "\n", "dlt will take care of the rest: **unnesting the data, inferring the schema**, etc., and **writing to the destination**\n", "\n", - "In previous section you've already met Rest API Client. `dlt`’s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low level abstraction** that powers the REST API Source.\n", + "In the previous section, you've already learned about the Rest API Client. `dlt`’s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low level abstraction** that powers the REST API Source.\n", "\n", "\n" ] @@ -909,7 +918,7 @@ "source": [ "### **RESTAPIConfig**\n", "\n", - "The central object when working with `rest_api_source` is the `RESTAPIConfig`. This is a declarative Python dictionary that tells dlt everything it needs to know about the API you are connecting to.\n", + "The central object when working with the `rest_api_source` is the `RESTAPIConfig`. This is a declarative Python dictionary that tells dlt everything it needs to know about the API you are connecting to.\n", "\n", "It defines:\n", "- how to connect to the API (base URL, authentication)\n", @@ -1045,7 +1054,7 @@ " pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n", ")\n", "\n", - "load_info = pipeline.run(news_source)\n", + "pipeline.run(news_source)\n", "print(pipeline.last_trace)" ] }, @@ -1081,7 +1090,7 @@ "}\n", "```\n", "\n", - "This ensures every request has `?apiKey=...` added. It's simple and secure, especially when storing the key in ENVs or Colab's secret manager.\n", + "This ensures every request has `?apiKey=...` added. It's simple and secure, especially when storing the key in ENVs or Colab or Molab's secret manager.\n", "\n", "\n", "The available authentication methods you can find in [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#authentication)." 
@@ -1122,12 +1131,12 @@ "\n", "news_source = rest_api_source(news_config)\n", "\n", - "pipeline = dlt.pipeline(\n", + "another_pipeline = dlt.pipeline(\n", " pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n", ")\n", "\n", - "load_info = pipeline.run(news_source)\n", - "print(pipeline.last_trace)" + "another_pipeline.run(news_source)\n", + "print(another_pipeline.last_trace)" ] }, { @@ -1202,7 +1211,7 @@ " pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n", ")\n", "\n", - "load_info = pipeline.run(news_source)\n", + "pipeline.run(news_source)\n", "print(pipeline.last_trace)" ] }, @@ -1292,7 +1301,7 @@ " pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n", ")\n", "\n", - "load_info = pipeline.run(news_source)\n", + "pipeline.run(news_source)\n", "print(pipeline.last_trace)" ] }, @@ -1318,7 +1327,7 @@ "- dlt will remember the last `publishedAt` seen\n", "- On the next run, it will only request articles newer than that\n", "\n", - "This is optional and depends on your usage pattern.\n" + "This is optional and depends on your usage pattern." ] }, { @@ -1331,8 +1340,14 @@ "source": [ "import dlt\n", "from dlt.sources.rest_api import rest_api_source\n", + "from datetime import datetime, timedelta, timezone\n", "from google.colab import userdata\n", "\n", + "# the free plan of newsapi.org only allows you to fetch news from a maximum of 1 month ago\n", + "one_month_ago = datetime.now(timezone.utc) - timedelta(days=30)\n", + "initial_from = one_month_ago.replace(microsecond=0).strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n", + "\n", + "\n", "api_key = userdata.get(\"NEWS_API_KEY\")\n", "\n", "\n", @@ -1365,7 +1380,7 @@ " \"from\": {\n", " \"type\": \"incremental\",\n", " \"cursor_path\": \"publishedAt\",\n", - " \"initial_value\": \"2025-04-15T00:00:00Z\",\n", + " \"initial_value\": initial_from,\n", " },\n", " },\n", " },\n", @@ -1379,11 +1394,11 @@ " pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n", ")\n", "\n", - "load_info = pipeline.run(news_source)\n", + "pipeline.run(news_source)\n", "print(pipeline.last_trace)\n", "\n", "# Run the pipeline one more time\n", - "load_info = pipeline.run(news_source)\n", + "pipeline.run(news_source)\n", "print(pipeline.last_trace)" ] }, @@ -1471,7 +1486,7 @@ " \"from\": {\n", " \"type\": \"incremental\",\n", " \"cursor_path\": \"publishedAt\",\n", - " \"initial_value\": \"2025-04-15T00:00:00Z\",\n", + " \"initial_value\": initial_from,\n", " },\n", " },\n", " },\n", @@ -1485,11 +1500,11 @@ " pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n", ")\n", "\n", - "load_info = pipeline.run(news_source)\n", + "pipeline.run(news_source)\n", "print(pipeline.last_trace)\n", "\n", "# Run the pipeline one more time\n", - "load_info = pipeline.run(news_source)\n", + "pipeline.run(news_source)\n", "print(pipeline.last_trace)" ] }, @@ -1580,7 +1595,7 @@ " \"from\": {\n", " \"type\": \"incremental\",\n", " \"cursor_path\": \"publishedAt\",\n", - " \"initial_value\": \"2025-04-15T00:00:00Z\",\n", + " \"initial_value\": initial_from,\n", " },\n", " },\n", " },\n", @@ -1601,7 +1616,7 @@ " pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n", ")\n", "\n", - "load_info = pipeline.run(news_source)\n", + "pipeline.run(news_source)\n", "print(pipeline.last_trace)\n", "\n", "pipeline.dataset().top_headlines.df().head()" @@ -1672,9 +1687,10 @@ }, "outputs": [], "source": [ - "def debug_response(\n", - " response: 
requests.Response, *args: Any, **kwargs: Any\n", - ") -> requests.Response:\n", + "from dlt.sources.helpers.requests import Response\n", + "\n", + "\n", + "def debug_response(response: Response, *args: Any, **kwargs: Any) -> Response:\n", " print(\"Intercepted:\", response.status_code)\n", " return response" ] @@ -1728,7 +1744,7 @@ " \"response_actions\": [\n", " {\n", " \"status_code\": 200,\n", - " \"action\": debug_response, # <--- add some action\n", + " \"action\": debug_response,\n", " },\n", " ],\n", " \"params\": {\n", @@ -1736,7 +1752,7 @@ " \"from\": {\n", " \"type\": \"incremental\",\n", " \"cursor_path\": \"publishedAt\",\n", - " \"initial_value\": \"2025-04-15T00:00:00Z\",\n", + " \"initial_value\": initial_from,\n", " },\n", " },\n", " },\n", @@ -1757,7 +1773,7 @@ " pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n", ")\n", "\n", - "load_info = pipeline.run(news_source)\n", + "pipeline.run(news_source)\n", "print(pipeline.last_trace)\n", "\n", "pipeline.dataset().news_articles.df().head()" @@ -1807,8 +1823,8 @@ }, "outputs": [], "source": [ - "def lower_title(record: TDataItem) -> TDataItem:\n", - " record[\"title\"] = record[\"title\"].lower()\n", + "def lower_title(record: dict[str, Any]) -> dict[str, Any]:\n", + " record[\"title\"] = str(record[\"title\"]).lower()\n", " return record" ] }, @@ -1857,8 +1873,8 @@ " {\n", " \"name\": \"news_articles\",\n", " \"processing_steps\": [\n", - " {\"filter\": lambda x: len(x[\"author\"]) > 0}, # <--- add filter\n", - " {\"map\": lower_title}, # <--- add some transformation\n", + " {\"filter\": lambda x: len(x[\"author\"]) > 0},\n", + " {\"map\": lower_title},\n", " ],\n", " \"endpoint\": {\n", " \"path\": \"everything\",\n", @@ -1873,7 +1889,7 @@ " \"from\": {\n", " \"type\": \"incremental\",\n", " \"cursor_path\": \"publishedAt\",\n", - " \"initial_value\": \"2025-04-15T00:00:00Z\",\n", + " \"initial_value\": initial_from,\n", " },\n", " },\n", " },\n", @@ -1894,7 +1910,7 @@ " pipeline_name=\"news_pipeline\", destination=\"duckdb\", dataset_name=\"news\"\n", ")\n", "\n", - "load_info = pipeline.run(news_source)\n", + "pipeline.run(news_source)\n", "print(pipeline.last_trace)\n", "\n", "pipeline.dataset().news_articles.df().head()" @@ -1944,15 +1960,15 @@ "\n", "### Requirements:\n", "1. Use `rest_api_source` to define your source config.\n", - "2. This API uses **pagination**. Figure out what type is it.\n", + "2. This API uses **pagination**. Figure out what type it is.\n", "3. Add incremental loading to `orders`, starting from `2017-08-01` and using `ordered_at` as the cursor.\n", "4. Add `processing_steps` to `orders`:\n", - " - Remove records from orders which `order_total` > 500.\n", + " - Remove records from orders for which it is true that `order_total` > 500.\n", "\n", "\n", "\n", "### Question:\n", - "How many rows does resulted table `orders` contain?\n" + "How many rows does the resulting table `orders` contain?\n" ] }, { @@ -1972,7 +1988,7 @@ "id": "70D6czgeId7F" }, "source": [ - "✅ ▶ Well done! Go to [the next lesson.](https://colab.research.google.com/drive/1lQ8VkrGJwZMsVtbkuYympcvbv0_CCgYo#forceEdit=true&sandboxMode=true)" + "✅ ▶ Well done! 
Go to [the next lesson.](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)" ] }, { diff --git a/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.py b/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.py new file mode 100644 index 000000000..f7b94eada --- /dev/null +++ b/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.py @@ -0,0 +1,1621 @@ +# /// script +# dependencies = [ +# "dlt[duckdb]", +# "numpy", +# "pandas", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""# **Building custom sources with [dlt REST API source](https://dlthub.com/docs/devel/dlt-ecosystem/verified-sources/rest_api/basic) and [RESTClient](https://dlthub.com/docs/devel/general-usage/http/rest-client)** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""# New section""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Recap** + + In the **[dlt Fundamentals](https://github.com/dlt-hub/dlthub-education/tree/main/courses/dlt_fundamentals_dec_2024)** course, we learned two primary ways to build sources for REST APIs: + + 1. **Using low-level dlt decorators** (`@dlt.source` and `@dlt.resource`) with [`RESTClient`](https://dlthub.com/docs/devel/general-usage/http/rest-client). + 2. **Using the built-in [`rest_api` source](https://dlthub.com/docs/devel/dlt-ecosystem/verified-sources/rest_api/basic)** with declarative configuration. + + --- + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **1. Building sources with low-level dlt decorators** + + We constructed a custom source for the **GitHub API** using the `RESTClient` class, decorators like `@dlt.resource` and `@dlt.source`, and manual pagination handling. + + + #### **Example** + + > Don't forget to use your [GitHub API token](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28) below! 
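+    >
+    > In Molab, you can add it as a secret (environment variable) named `ACCESS_TOKEN`; the next code cell reads it with `os.getenv("ACCESS_TOKEN")` and hands it to `dlt.secrets`.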
+ """) + return + + +@app.cell +def _(): + from typing import Iterator, Any, Iterable + import os + import dlt + from dlt.common.typing import TDataItems, TDataItem + from dlt.sources import DltResource + from dlt.sources.helpers import requests + from dlt.sources.helpers.rest_client import RESTClient + from dlt.sources.helpers.rest_client.auth import BearerTokenAuth + from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator + + dlt.secrets["ACCESS_TOKEN"] = os.getenv("ACCESS_TOKEN") + + @dlt.source + def github_source(access_token: str = dlt.secrets.value) -> Iterable[DltResource]: + client = RESTClient( + base_url="https://api.github.com", + auth=BearerTokenAuth(token=access_token), + paginator=HeaderLinkPaginator(), + ) + + @dlt.resource + def github_events() -> Iterator[TDataItems]: + for page in client.paginate("orgs/dlt-hub/events"): + yield page + + @dlt.resource + def github_stargazers() -> Iterator[TDataItems]: + for page in client.paginate("repos/dlt-hub/dlt/stargazers"): + yield page + + return (github_events, github_stargazers) + + pipeline = dlt.pipeline( + pipeline_name="rest_client_github", + destination="duckdb", + dataset_name="rest_client_data", + dev_mode=True, + ) + _load_info = pipeline.run(github_source()) + print(_load_info) + return ( + Any, + BearerTokenAuth, + DltResource, + HeaderLinkPaginator, + Iterable, + Iterator, + RESTClient, + TDataItems, + dlt, + os, + ) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + ### **2. Building sources with `rest_api` source** + + The **`rest_api` source** provides a higher-level, declarative approach to building sources for REST APIs. It's particularly suited for REST APIs with predictable structures and behaviors. + + + #### **Example** + """) + return + + +@app.cell +def _(dlt): + from dlt.sources.rest_api import RESTAPIConfig, rest_api_source + + config: RESTAPIConfig = { + "client": { + "base_url": "https://api.github.com", + "auth": {"token": dlt.secrets["access_token"]}, + "paginator": "header_link", + }, + "resources": [ + { + "name": "issues", + "endpoint": { + "path": "repos/dlt-hub/dlt/issues", + "params": {"state": "open"}, + }, + }, + { + "name": "issue_comments", + "endpoint": { + "path": "repos/dlt-hub/dlt/issues/{issue_number}/comments", + "params": { + "issue_number": { + "type": "resolve", + "resource": "issues", + "field": "number", + } + }, + }, + }, + { + "name": "contributors", + "endpoint": {"path": "repos/dlt-hub/dlt/contributors"}, + }, + ], + } + git_source = rest_api_source(config) + rest_api_pipeline = dlt.pipeline( + pipeline_name="rest_api_github", + destination="duckdb", + dataset_name="rest_api_data", + dev_mode=True, + ) + _load_info = rest_api_pipeline.run(git_source) + print(_load_info) + return RESTAPIConfig, rest_api_source + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **REST API Client by `dlt`** + + `dlt`’s REST API Client is the low level abstraction that powers the REST API Source. You can use it in your imperative code for more automation and brevity, if you do not wish to use the higher level declarative interface. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + If you don't like black boxes and prefer lower-level building blocks, then our `RESTClient` is perfect for you! + + The `RESTClient` class offers a Pythonic interface for interacting with RESTful APIs, including features like: + + - automatic pagination, + - various authentication mechanisms, + - customizable request/response handling. 
+ + ### What you’ll learn + + - How to authenticate with your API key + - How to fetch paginated results using `RESTClient` + - How to build a custom `@dlt.source` + - How to run the pipeline and inspect the data + + For more information, read `dlt`'s official documentation for the [REST API Client](https://dlthub.com/devel/general-usage/http/rest-client). + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## **1. Creating a RESTClient instance**""") + return + + +@app.cell +def _(BearerTokenAuth, HeaderLinkPaginator, RESTClient, dlt, os): + dlt.secrets["ACCESS_TOKEN"] = os.getenv("ACCESS_TOKEN") + client = RESTClient( + base_url="https://api.github.com", + headers={"User-Agent": "MyApp/1.0"}, + auth=BearerTokenAuth(dlt.secrets["access_token"]), + paginator=HeaderLinkPaginator(), + data_selector="data", + ) + client.get("repos/dlt-hub/dlt/issues").json() # session=MyCustomSession() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The `RESTClient` class is initialized with the following parameters: + + - `base_url`: The root URL of the API. All requests will be made relative to this URL. + - `headers`: Default headers to include in every request. This can be used to set common headers like `User-Agent` or other custom headers. + - `auth`: The authentication configuration. See the [Authentication](https://dlthub.com/docs/general-usage/http/rest-client#authentication) section for more details. + - `paginator`: A paginator instance for handling paginated responses. See the [Paginators](https://dlthub.com/docs/general-usage/http/rest-client#paginators) section below. + - `data_selector`: A [JSONPath selector](https://github.com/h2non/jsonpath-ng?tab=readme-ov-file#jsonpath-syntax) for extracting data from the responses. This defines a way to extract the data from the response JSON. Only used when paginating. + - `session`: An optional session for making requests. This should be a [Requests session](https://requests.readthedocs.io/en/latest/api/#requests.Session) instance that can be used to set up custom request behavior for the client. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **2. Add authentication** + + The RESTClient supports various authentication strategies, such as bearer tokens, API keys, and HTTP basic auth, configured through the `auth` parameter of both the `RESTClient` and the `paginate()` method. + + The **available authentication methods** are defined in the `dlt.sources.helpers.rest_client.auth` module: + + - [BearerTokenAuth](https://dlthub.com/docs/devel/general-usage/http/rest-client#bearer-token-authentication) + - [APIKeyAuth](https://dlthub.com/docs/devel/general-usage/http/rest-client#api-key-authentication) + - [HttpBasicAuth](https://dlthub.com/docs/devel/general-usage/http/rest-client#http-basic-authentication) + - [OAuth2ClientCredentials](https://dlthub.com/docs/devel/general-usage/http/rest-client#oauth-20-authorization) + + For specific use cases, you can [implement custom authentication](https://dlthub.com/docs/devel/general-usage/http/rest-client#implementing-custom-authentication) by subclassing the `AuthConfigBase` class from the [`dlt.sources.helpers.rest_client.auth`](https://github.com/dlt-hub/dlt/blob/devel/dlt/sources/helpers/rest_client/auth.py) module. + For specific flavors of OAuth 2.0, you can [implement custom OAuth 2.0](https://dlthub.com/docs/devel/general-usage/http/rest-client#oauth-20-authorization) by subclassing `OAuth2ClientCredentials`. 
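+
+    For illustration, a custom scheme could look roughly like the sketch below (the class name `CustomHeaderAuth` and the `X-Custom-Key` header are just placeholders, and it assumes `AuthConfigBase` exposes the standard `requests`-style `__call__` hook):
+
+    ```python
+    from dlt.sources.helpers.rest_client.auth import AuthConfigBase
+
+    class CustomHeaderAuth(AuthConfigBase):
+        # illustrative only: send an API key in a custom request header
+        def __init__(self, token: str) -> None:
+            self.token = token
+
+        def __call__(self, request):
+            # modify the outgoing request and return it, as requests-style auth hooks do
+            request.headers["X-Custom-Key"] = self.token
+            return request
+    ```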
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_1_Custom_sources_RestAPI_source_and_RESTClient_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_1_Custom_sources_RestAPI_source_and_RESTClient_img1.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### 📰 **NewsAPI overview** + + - **Base URL:** `https://newsapi.org/v2/` + - **Authentication:** API key passed in query string as `apiKey` + - **Documentation:** [NewsAPI Docs](https://newsapi.org/docs) + + | Endpoint | Description | Auth Required | Response | + |-------------------|------------------------------------------|---------------|--------------| + | `/everything` | Search for news articles by query string | ✅ Yes | JSON object with `articles[]` | + | `/top-headlines` | Latest headlines filtered by region/topic| ✅ Yes | JSON object with `articles[]` | + | `/sources` | List of available news sources | ✅ Yes | JSON object with `sources[]` | + + + #### **Authentication Details:** + + To use the NewsAPI, you must register for a **free account** and obtain an API key. This key is required for all endpoints and must be included as a query parameter in your request: + + ```http + GET /v2/everything?q=python&page=1&apiKey=YOUR_API_KEY + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Prerequisites:** + + To securely access the NewsAPI in your dlt project: + + 1. **Sign up** at [https://newsapi.org/register](https://newsapi.org/register) + 2. Copy your **API key** from your dashboard + 3. Save your **API key** in Colab (or Molab) Secrets (side-bar on the right) as NEWS_API_KEY + + + ### **How we chose the right authenticator for NewsAPI** + + NewsAPI uses a **simple API key-based scheme**. You sign up, get a key, and send it with every request. + + There are two supported ways to send this key: + + - In a **query string**, like `?apiKey=...` + - Or in the **Authorization header**, as a Bearer token + + We are using the **query string method**, because: + + - It's supported on **all plans**, including the free tier + - It's more transparent — you can inspect the request URL and see the key + - It's easier to test manually in a browser or terminal + + + **Using `APIKeyAuth` simplifies request setup** + + Instead of manually appending the key to every URL, we use dlt’s built-in `APIKeyAuth`: + + ```python + APIKeyAuth(name="apiKey", api_key=api_key, location="query") + ``` + + This means: + + - `name="apiKey"` tells it what the key is called (NewsAPI expects `apiKey`) + - `location="query"` means the key will be added to the URL as a query parameter: + + ``` + https://newsapi.org/v2/everything?q=python&apiKey=your_key + ``` + """) + return + + +@app.cell +def _(RESTClient, os): + from dlt.sources.helpers.rest_client.auth import APIKeyAuth + + api_key = os.getenv("NEWS_API_KEY") + news_api_client = RESTClient( + base_url="https://newsapi.org/v2/", + auth=APIKeyAuth(name="apiKey", api_key=api_key, location="query"), + ) + response = news_api_client.get("everything", params={"q": "python", "page": 1}) + print(response.json()) + return APIKeyAuth, news_api_client + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""This authenticates every request by adding `?apiKey=your_key` to the URL.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **3. 
Add pagination** + + The `RESTClient` supports automatic pagination of API responses via the `paginate()` method, which can be customized using a built-in or custom paginator. + + You specify the paginator using the `paginator` parameter of the `RESTClient` or directly in the `paginate()` method. + + The **available pagination strategies** are defined in the `dlt.sources.helpers.rest_client.paginators` module and cover the most common pagination patterns used in REST APIs: + + - [`PageNumberPaginator`](https://dlthub.com/docs/general-usage/http/rest-client#pagenumberpaginator) – uses `page=N`, optionally with `pageSize` or `limit` + - [`OffsetPaginator`](https://dlthub.com/docs/general-usage/http/rest-client#offsetpaginator) – uses `offset` and `limit` + - [`JSONLinkPaginator`](https://dlthub.com/docs/general-usage/http/rest-client#jsonresponsepaginator) – follows a `next` URL in the response body + - [`HeaderLinkPaginator`](https://dlthub.com/docs/general-usage/http/rest-client#headerlinkpaginator) – follows a `Link` header (used by GitHub and others) + - [`JSONResponseCursorPaginator`](https://dlthub.com/docs/general-usage/http/rest-client#jsonresponsecursorpaginator) – uses a cursor from the response body + + Each paginator knows how to update the request to get the next page of results, and will continue until: + + - no more pages are available, + - a configurable `maximum_page` or `maximum_offset` is reached, + - or the API response is empty (depending on paginator behavior). + + + > If a `paginator` is not specified, the `paginate()` method will attempt to **automatically detect** the pagination mechanism used by the API. If the API uses a standard pagination mechanism like having a `next` link in the response's headers or JSON body, the `paginate()` method will handle this automatically. Otherwise, you can specify a paginator object explicitly or implement a custom paginator. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **PageData** + + When using `client.paginate(...)` in dlt, you don’t just get a stream of data — each **page** returned is a rich object called `PageData`, and it gives you full access to the internals of the request, response, and pagination state. + + This is especially useful for **debugging**, **tracing**, or building custom logic. + + + The `PageData` is a list-like object that contains the following attributes: + + - `request`: The original request object. + - `response`: The response object. + - `paginator`: The paginator object used to paginate the response. + - `auth`: The authentication object used for the request. + + Let’s walk through an example. 
+ """) + return + + +@app.cell +def _(news_api_client): + page_iterator = news_api_client.paginate( + "everything", params={"q": "python", "page": 1} + ) + # prints the original request object + print(next(page_iterator).request) + page_iterator = news_api_client.paginate( + "everything", params={"q": "python", "page": 1} + ) + # prints the raw HTTP response + print(next(page_iterator).response) + page_iterator = news_api_client.paginate( + "everything", params={"q": "python", "page": 1} + ) + # prints the paginator that was used + print(next(page_iterator).paginator) + page_iterator = news_api_client.paginate( + "everything", params={"q": "python", "page": 1} + ) + # prints the authentication class used + print(next(page_iterator).auth) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Log Warning explained** + + ``` + [WARNING] Fallback paginator used: SinglePagePaginator... + ``` + + This warning means: + + - dlt tried to guess the pagination method but failed + - It will make only **one request** + - You won’t get multiple pages of data unless you configure a paginator explicitly + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Question 1:** + + + Which paginator is used by `news_api_client.paginate()` by default in the example above? + + + >Answer this question and select the correct option in the homework Google Form. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **How we chose the right paginator for NewsAPI** + + When using `RESTClient` to extract data from paginated APIs, one of the first decisions you must make is: + **"What type of pagination does this API use?"** + This determines which paginator to plug into the client. + + --- + + **Step 1: Read the API docs** + + From the [NewsAPI documentation](https://newsapi.org/docs/endpoints/everything), we learn: + + - Pagination is done via two query parameters: + - `page` → page number (starts at 1) + - `pageSize` → how many articles per page (max 100) + - Example request: + ``` + GET /v2/everything?q=bitcoin&page=2&pageSize=20&apiKey=... + ``` + + There is **no "next" URL**, no cursor, no `offset`. + + This is **classic page-number pagination.** + + --- + + **Step 2: Understand response behavior** + + Each response includes: + + ```json + { + "status": "ok", + "totalResults": 1532, + "articles": [ ... ] + } + ``` + + But: + - The API **does not tell us how many total pages exist**. + - We only know how many total results there are. 
+ + So we either: + - Compute total pages: `ceil(totalResults / pageSize)` + *(But that requires looking into the first page’s body)* + - **Or we keep requesting pages until we get an empty list.** + + --- + + **Step 3: Choose `PageNumberPaginator`** + + This is exactly what `PageNumberPaginator` is made for: + """) + return + + +@app.cell +def _(APIKeyAuth, RESTClient, os): + from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator + + api_key_1 = os.getenv("NEWS_API_KEY") + another_client = RESTClient( + base_url="https://newsapi.org/v2/", + auth=APIKeyAuth(name="apiKey", api_key=api_key_1, location="query"), + paginator=PageNumberPaginator( + base_page=1, + page_param="page", + total_path=None, + stop_after_empty_page=True, + maximum_page=4, + ), + ) + for page in another_client.paginate( + "everything", params={"q": "python", "pageSize": 5, "language": "en"} + ): + for article in page: + print(article["title"]) + return PageNumberPaginator, api_key_1 + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **4. Wrap into a dlt Resource** + + Let’s turn this into a dlt pipeline resource: + """) + return + + +@app.cell +def _( + APIKeyAuth, + Iterator, + PageNumberPaginator, + RESTClient, + TDataItems, + dlt, + os, +): + dlt.secrets["NEWS_API_KEY"] = os.getenv("NEWS_API_KEY") + + @dlt.resource(write_disposition="replace", name="python_articles") + def get_articles(news_api_key: str = dlt.secrets.value) -> Iterator[TDataItems]: + client = RESTClient( + base_url="https://newsapi.org/v2/", + auth=APIKeyAuth(name="apiKey", api_key=news_api_key, location="query"), + paginator=PageNumberPaginator( + base_page=1, + page_param="page", + total_path=None, + stop_after_empty_page=True, + maximum_page=4, + ), + ) + for page in client.paginate( + "everything", params={"q": "python", "pageSize": 5, "language": "en"} + ): + yield page + return (get_articles,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## **5. Add `top-headlines` Resource**""") + return + + +@app.cell +def _( + APIKeyAuth, + Iterator, + PageNumberPaginator, + RESTClient, + TDataItems, + api_key_1, + dlt, + os, +): + dlt.secrets["NEWS_API_KEY"] = os.getenv("NEWS_API_KEY") + + @dlt.resource(write_disposition="replace", name="top_articles") + def get_top_articles(news_api_key: str = dlt.secrets.value) -> Iterator[TDataItems]: + client = RESTClient( + base_url="https://newsapi.org/v2/", + auth=APIKeyAuth(name="apiKey", api_key=api_key_1, location="query"), + paginator=PageNumberPaginator( + base_page=1, + page_param="page", + total_path=None, + stop_after_empty_page=True, + maximum_page=4, + ), + ) + for page in client.paginate( + "top-headlines", params={"pageSize": 5, "language": "en"} + ): + yield page + return (get_top_articles,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **6. Create a reusable Source** + + Now bundle both resources into a single `@dlt.source`: + """) + return + + +@app.cell +def _(DltResource, Iterable, dlt, get_articles, get_top_articles): + @dlt.source + def newsapi_source(news_api_key: str = dlt.secrets.value) -> Iterable[DltResource]: + return [get_articles(news_api_key), get_top_articles(news_api_key)] + return (newsapi_source,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## **7. 
Run the pipeline**""") + return + + +@app.cell +def _(dlt, newsapi_source): + pipeline_1 = dlt.pipeline( + pipeline_name="newsapi_pipeline", destination="duckdb", dataset_name="news_data" + ) + info = pipeline_1.run(newsapi_source()) + print(info) + return (pipeline_1,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## **8. Explore data**""") + return + + +@app.cell +def _(pipeline_1): + pipeline_1.dataset().python_articles.df().head() + return + + +@app.cell +def _(pipeline_1): + pipeline_1.dataset().top_articles.df().head() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Create custom source using `dlt` and [`rest_api` source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic)** + + `rest_api` is a generic source that you can use to create a `dlt` source from a REST API using a declarative configuration. The majority of REST APIs behave in a similar way; this `dlt` source attempts to provide a declarative way to define a `dlt` source for those APIs. + + Using a [declarative configuration](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic#source-configuration), you can define: + + - the API endpoints to pull data from, + - their relationships, + - how to handle pagination, + - authentication, + - data transformation, + - incremental loading. + + dlt will take care of the rest: **unnesting the data, inferring the schema**, etc., and **writing to the destination** + + In the previous section, you've already learned about the Rest API Client. `dlt`’s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low level abstraction** that powers the REST API Source. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **What you’ll learn** + + This section will teach you how to create a reusable, authenticated, and paginated pipeline using the `rest_api_source` module in dlt. Our example will use the [NewsAPI](https://newsapi.org), which provides access to thousands of news articles via a REST API. + + We'll walk step-by-step through: + - Setting up the source configuration + - Authenticating with an API key + - Configuring pagination + - Building a working `dlt` pipeline + - Inspecting and transforming the response + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Reminder: **About NewsAPI** + + - **Base URL:** `https://newsapi.org/v2/` + - **Authentication:** API key passed in query string as `apiKey` + - **Documentation:** [NewsAPI Docs](https://newsapi.org/docs) + + | Endpoint | Description | Auth Required | Response | + |-------------------|------------------------------------------|---------------|--------------| + | `/everything` | Search for news articles by keyword | ✅ Yes | JSON with `articles[]` | + | `/top-headlines` | Latest headlines filtered by region/topic| ✅ Yes | JSON with `articles[]` | + | `/sources` | List of available news sources | ✅ Yes | JSON with `sources[]` | + + To access the API, register for a **free account** at [newsapi.org](https://newsapi.org/register) and copy your personal API key. + + Add this key to your Colab secrets. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **1. Define the source configuration** + + We'll now build the complete configuration step-by-step. This gives you control over authentication, pagination, filters, and even incremental loading. 
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **RESTAPIConfig** + + The central object when working with the `rest_api_source` is the `RESTAPIConfig`. This is a declarative Python dictionary that tells dlt everything it needs to know about the API you are connecting to. + + It defines: + - how to connect to the API (base URL, authentication) + - what endpoints to call (resources) + - how to paginate + - how to filter or sort the data + - how to extract the actual data from responses + + ```python + import dlt + from dlt.sources.rest_api import rest_api_source + + # Define config + news_config = { + "client": { + "base_url": ..., + "auth": ... + }, + "resources": [ + ... + ] + } + + # Create source + news_source = rest_api_source(news_config) + + # Create pipeline + pipeline = dlt.pipeline( + pipeline_name="news_pipeline", + destination="duckdb", + dataset_name="news" + ) + + # Run it + load_info = pipeline.run(news_source) + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You can start with just these fields and then add pagination, schema hints, transformations, and more as needed. + + To extract data from a REST API using `dlt`, we define a configuration dictionary that follows the `RESTAPIConfig` structure. + This configuration describes: + + - how to connect to the API (base URL, headers, auth) + - what resources to extract (endpoints) + - how to paginate, filter, and process responses + + At a high level, the configuration has two required keys: + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **`client`** + This defines the shared connection details for all requests: + - `base_url`: The root URL for the API + - `auth`: (Optional) Authentication method to use — such as API key or token + - `headers`: (Optional) Custom headers for requests + - `paginator`: (Optional) Default paginator for all resources + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **`resources`** + A list of resource definitions. Each resource becomes a table in your destination. + A resource includes: + - `name`: Table name for the resource + - `endpoint`: Path to the endpoint, query parameters, pagination config + - `write_disposition`: How to load the data (`append`, `merge`, `replace`) + - `primary_key`: Optional key used when merging + - `data_selector`: JSONPath to extract data from the response (e.g., "articles") + - `processing_steps`: Optional filters and transformations + - `response_actions`: Optional hooks to inspect or alter the HTTP response + + Let’s build a real-world configuration step-by-step using NewsAPI. + """) + return + + +@app.cell +def _(RESTAPIConfig, dlt, rest_api_source): + _news_config: RESTAPIConfig = { + "client": {"base_url": "https://newsapi.org/v2/"}, + "resources": [ + { + "name": "news_articles", + "endpoint": {"path": "everything", "params": {"q": "python"}}, + } + ], + } + _news_source = rest_api_source(_news_config) + pipeline_2 = dlt.pipeline( + pipeline_name="news_pipeline", destination="duckdb", dataset_name="news" + ) + pipeline_2.run(_news_source) + print(pipeline_2.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Question 2: + + What error was thrown in the example above? + + >Answer this question and select the correct option in the homework Google Form. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **2. Add authentication** + + NewsAPI requires an API key to be sent with every request. 
We use dlt's built-in `api_key` authentication method, which places the key into the query string automatically: + + ```python + "auth": { + "type": "api_key", + "name": "apiKey", + "api_key": "your_key", + "location": "query", + } + ``` + + This ensures every request has `?apiKey=...` added. It's simple and secure, especially when storing the key in ENVs or Colab or Molab's secret manager. + + + The available authentication methods you can find in [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#authentication). + """) + return + + +@app.cell +def _(dlt, os, rest_api_source): + api_key_2 = os.getenv("NEWS_API_KEY") + _news_config = { + "client": { + "base_url": "https://newsapi.org/v2/", + "auth": { + "type": "api_key", + "name": "apiKey", + "api_key": api_key_2, + "location": "query", + }, + }, + "resources": [ + { + "name": "news_articles", + "endpoint": {"path": "everything", "params": {"q": "python"}}, + } + ], + } + _news_source = rest_api_source(_news_config) + another_pipeline = dlt.pipeline( + pipeline_name="news_pipeline", destination="duckdb", dataset_name="news" + ) + another_pipeline.run(_news_source) + print(another_pipeline.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **3. Add pagination** + + The REST API source will try to automatically handle pagination for you. This works by detecting the pagination details from the first API response. Unfortunately, it doesn't work for NewsAPI. + + NewsAPI uses page-based pagination. We use the built-in `PageNumberPaginator` to automatically paginate through pages until results run out: + + + ```python + "paginator": { + "type": "page_number", + "page_param": "page", + "stop_after_empty_page": True, + "total_path": None, + "maximum_page": 3, + }, + ``` + + This will fetch up to 3 pages of results, stopping early if a page is empty. + """) + return + + +@app.cell +def _(dlt, os, rest_api_source): + api_key_3 = os.getenv("NEWS_API_KEY") + _news_config = { + "client": { + "base_url": "https://newsapi.org/v2/", + "auth": { + "type": "api_key", + "name": "apiKey", + "api_key": api_key_3, + "location": "query", + }, + "paginator": { + "base_page": 1, + "type": "page_number", + "page_param": "page", + "total_path": None, + "maximum_page": 3, + }, + }, + "resources": [ + { + "name": "news_articles", + "endpoint": {"path": "everything", "params": {"q": "python"}}, + } + ], + } + _news_source = rest_api_source(_news_config) + pipeline_3 = dlt.pipeline( + pipeline_name="news_pipeline", destination="duckdb", dataset_name="news" + ) + pipeline_3.run(_news_source) + print(pipeline_3.last_trace) + return (pipeline_3,) + + +@app.cell +def _(pipeline_3): + pipeline_3.dataset().news_articles.df().head() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **4. Add order, filtering via params** + We can filter articles using query parameters supported by NewsAPI: + + ```python + "params": { + "q": "python", + "language": "en", + "pageSize": 20, + }, + ``` + + - `q`: search keyword (e.g. 
"python") + - `language`: filter by article language + - `pageSize`: number of articles per page (max 100) + """) + return + + +@app.cell +def _(dlt, os, rest_api_source): + api_key_4 = os.getenv("NEWS_API_KEY") + _news_config = { + "client": { + "base_url": "https://newsapi.org/v2/", + "auth": { + "type": "api_key", + "name": "apiKey", + "api_key": api_key_4, + "location": "query", + }, + "paginator": { + "base_page": 1, + "type": "page_number", + "page_param": "page", + "total_path": None, + "maximum_page": 3, + }, + }, + "resources": [ + { + "name": "news_articles", + "endpoint": { + "path": "everything", + "params": {"q": "python", "language": "en", "pageSize": 20}, + }, + } + ], + } + _news_source = rest_api_source(_news_config) + pipeline_4 = dlt.pipeline( + pipeline_name="news_pipeline", destination="duckdb", dataset_name="news" + ) + pipeline_4.run(_news_source) + print(pipeline_4.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **5. Incremental loading** + + Although NewsAPI does not support true incremental loading via cursors, you can simulate it using the `from` or `to` date filters and dlt's `incremental` loader: + + ```python + "from": { + "type": "incremental", + "cursor_path": "publishedAt", + "initial_value": "2024-01-01T00:00:00Z", + }, + ``` + + This setup means: + - dlt will remember the last `publishedAt` seen + - On the next run, it will only request articles newer than that + + This is optional and depends on your usage pattern. + """) + return + + +@app.cell +def _(dlt, os, rest_api_source): + from datetime import datetime, timedelta, timezone + + one_month_ago = datetime.now(timezone.utc) - timedelta(days=30) + initial_from = one_month_ago.replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ") + api_key_5 = os.getenv("NEWS_API_KEY") + _news_config = { + "client": { + "base_url": "https://newsapi.org/v2/", + "auth": { + "type": "api_key", + "name": "apiKey", + "api_key": api_key_5, + "location": "query", + }, + "paginator": { + "base_page": 1, + "type": "page_number", + "page_param": "page", + "total_path": None, + "maximum_page": 3, + }, + }, + "resources": [ + { + "name": "news_articles", + "endpoint": { + "path": "everything", + "params": { + "q": "python", + "language": "en", + "pageSize": 20, + "from": { + "type": "incremental", + "cursor_path": "publishedAt", + "initial_value": initial_from, + }, + }, + }, + } + ], + } + _news_source = rest_api_source(_news_config) + pipeline_5 = dlt.pipeline( + pipeline_name="news_pipeline", destination="duckdb", dataset_name="news" + ) + pipeline_5.run(_news_source) + print(pipeline_5.last_trace) + pipeline_5.run(_news_source) + print(pipeline_5.last_trace) + return (initial_from,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## **6. 
Add more endpoints**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Set defaults + + First, set some defaults for all endpoints: + + ```python + "resource_defaults": { + "primary_key": "id", + "write_disposition": "merge", + "endpoint": { + "params": { + "per_page": 100, + }, + }, + }, + ``` + """) + return + + +@app.cell +def _(dlt, initial_from, os, rest_api_source): + api_key_6 = os.getenv("NEWS_API_KEY") + _news_config = { + "client": { + "base_url": "https://newsapi.org/v2/", + "auth": { + "type": "api_key", + "name": "apiKey", + "api_key": api_key_6, + "location": "query", + }, + "paginator": { + "base_page": 1, + "type": "page_number", + "page_param": "page", + "total_path": None, + "maximum_page": 3, + }, + }, + "resource_defaults": { + "write_disposition": "append", + "endpoint": {"params": {"language": "en", "pageSize": 20}}, + }, + "resources": [ + { + "name": "news_articles", + "endpoint": { + "path": "everything", + "params": { + "q": "python", + "from": { + "type": "incremental", + "cursor_path": "publishedAt", + "initial_value": initial_from, + }, + }, + }, + } + ], + } + _news_source = rest_api_source(_news_config) + pipeline_6 = dlt.pipeline( + pipeline_name="news_pipeline", destination="duckdb", dataset_name="news" + ) + pipeline_6.run(_news_source) + print(pipeline_6.last_trace) + pipeline_6.run(_news_source) + print(pipeline_6.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + `resource_defaults` contains the default values to configure the dlt resources returned by this source. + + `resources` object contains the configuration for each resource. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Add same level endpoint + + To load additional endpoints like `/top-headlines` or `/sources`, you can simply add more entries to the `resources` list: + ```python + { + "name": "top_headlines", + "endpoint": { + "path": "top-headlines", + "params": {"country": "us", "pageSize": 10}, + "paginator": {"type": "page_number", "page_param": "page"} + }, + "primary_key": "url", + "write_disposition": "append", + "data_selector": "articles" + } + ``` + """) + return + + +@app.cell +def _(dlt, initial_from, os, rest_api_source): + api_key_7 = os.getenv("NEWS_API_KEY") + _news_config = { + "client": { + "base_url": "https://newsapi.org/v2/", + "auth": { + "type": "api_key", + "name": "apiKey", + "api_key": api_key_7, + "location": "query", + }, + "paginator": { + "base_page": 1, + "type": "page_number", + "page_param": "page", + "total_path": None, + "maximum_page": 3, + }, + }, + "resource_defaults": { + "write_disposition": "append", + "endpoint": {"params": {"language": "en", "pageSize": 20}}, + }, + "resources": [ + { + "name": "news_articles", + "endpoint": { + "path": "everything", + "params": { + "q": "python", + "from": { + "type": "incremental", + "cursor_path": "publishedAt", + "initial_value": initial_from, + }, + }, + }, + }, + { + "name": "top_headlines", + "endpoint": {"path": "top-headlines", "params": {"country": "us"}}, + }, + ], + } + _news_source = rest_api_source(_news_config) + pipeline_7 = dlt.pipeline( + pipeline_name="news_pipeline", destination="duckdb", dataset_name="news" + ) + pipeline_7.run(_news_source) + print(pipeline_7.last_trace) + pipeline_7.dataset().top_headlines.df().head() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## Advanced""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Response actions + + The `response_actions` 
field in the endpoint configuration allows you to specify how to **handle specific responses** or all responses from the API. + + For example: + - Responses with specific status codes or content substrings can be ignored. + - All responses or only responses with specific status codes or content substrings can be transformed with a custom callable, such as a function. This callable is passed on to the requests library as a response hook. The callable can modify the response object and has to return it for the modifications to take effect. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ```python + "resources": [ + { + "name": "news_articles", + "endpoint": { + "path": "everything", + "response_actions": [ + { + "status_code": 200, + "content": "some text", + "action": do_something, + }, + ], + }, + }, + ``` + + Fields: + + * `status_code` (int, optional): The HTTP status code to match. + * `content` (str, optional): A substring to search for in the response content. + * `action` (str or Callable or List[Callable], optional): The action to take when the condition is met. Currently supported actions: + "ignore": Ignore the response. + a callable accepting and returning the response object. + a list of callables, each accepting and returning the response object. + """) + return + + +@app.cell +def _(Any): + from dlt.sources.helpers.requests import Response + + def debug_response(response: Response, *args: Any, **kwargs: Any) -> Response: + print("Intercepted:", response.status_code) + return response + return (debug_response,) + + +@app.cell +def _(debug_response, dlt, initial_from, os, rest_api_source): + api_key_8 = os.getenv("NEWS_API_KEY") + _news_config = { + "client": { + "base_url": "https://newsapi.org/v2/", + "auth": { + "type": "api_key", + "name": "apiKey", + "api_key": api_key_8, + "location": "query", + }, + "paginator": { + "base_page": 1, + "type": "page_number", + "page_param": "page", + "total_path": None, + "maximum_page": 3, + }, + }, + "resource_defaults": { + "write_disposition": "append", + "endpoint": {"params": {"language": "en", "pageSize": 20}}, + }, + "resources": [ + { + "name": "news_articles", + "endpoint": { + "path": "everything", + "response_actions": [ + {"status_code": 200, "action": debug_response} + ], + "params": { + "q": "python", + "from": { + "type": "incremental", + "cursor_path": "publishedAt", + "initial_value": initial_from, + }, + }, + }, + }, + { + "name": "top_headlines", + "endpoint": {"path": "top-headlines", "params": {"country": "us"}}, + }, + ], + } + _news_source = rest_api_source(_news_config) + pipeline_8 = dlt.pipeline( + pipeline_name="news_pipeline", destination="duckdb", dataset_name="news" + ) + pipeline_8.run(_news_source) + print(pipeline_8.last_trace) + pipeline_8.dataset().news_articles.df().head() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Processing steps: filter and transform data + The `processing_steps` field in the resource configuration allows you to **apply transformations** to the data fetched from the API before it is loaded into your destination. + + This is useful when you need to + - **filter out** certain records, + - **modify the data** structure, + - **anonymize** sensitive information. + + Each processing step is a dictionary specifying the type of operation (filter or map) and the function to apply. Steps apply in the order they are listed. 
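+
+    For instance, a `map` step can anonymize a field before it reaches the destination. A minimal sketch (the `anonymize_author` helper below is only illustrative and is not part of this lesson's pipeline):
+
+    ```python
+    import hashlib
+
+    def anonymize_author(record):
+        # Replace the author name with a stable hash so records stay comparable
+        if record.get("author"):
+            record["author"] = hashlib.sha256(record["author"].encode()).hexdigest()
+        return record
+
+    # Added alongside the other steps; remember that order matters:
+    # "processing_steps": [{"map": anonymize_author}, {"filter": ...}]
+    ```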
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ```python + "resources": [ + { + "name": "news_articles", + "processing_steps": [ + {"filter": lambda x: len(x["author"]) > 0}, + {"map": lower_title}, + ], + }, + ], + ``` + """) + return + + +@app.cell +def _(Any): + def lower_title(record: dict[str, Any]) -> dict[str, Any]: + record["title"] = str(record["title"]).lower() + return record + return (lower_title,) + + +@app.cell +def _(debug_response, dlt, initial_from, lower_title, os, rest_api_source): + api_key_9 = os.getenv("NEWS_API_KEY") + _news_config = { + "client": { + "base_url": "https://newsapi.org/v2/", + "auth": { + "type": "api_key", + "name": "apiKey", + "api_key": api_key_9, + "location": "query", + }, + "paginator": { + "base_page": 1, + "type": "page_number", + "page_param": "page", + "total_path": None, + "maximum_page": 3, + }, + }, + "resource_defaults": { + "write_disposition": "append", + "endpoint": {"params": {"language": "en", "pageSize": 20}}, + }, + "resources": [ + { + "name": "news_articles", + "processing_steps": [ + {"filter": lambda x: len(x["author"]) > 0}, + {"map": lower_title}, + ], + "endpoint": { + "path": "everything", + "response_actions": [ + {"status_code": 200, "action": debug_response} + ], + "params": { + "q": "python", + "from": { + "type": "incremental", + "cursor_path": "publishedAt", + "initial_value": initial_from, + }, + }, + }, + }, + { + "name": "top_headlines", + "endpoint": {"path": "top-headlines", "params": {"country": "us"}}, + }, + ], + } + _news_source = rest_api_source(_news_config) + pipeline_9 = dlt.pipeline( + pipeline_name="news_pipeline", destination="duckdb", dataset_name="news" + ) + pipeline_9.run(_news_source) + print(pipeline_9.last_trace) + pipeline_9.dataset().news_articles.df().head() + return (pipeline_9,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # Links + + More Information about how to build efficient data pipelines you can find in our official documentation: + - `dlt` [Getting Started](https://dlthub.com/docs/getting-started), + - [REST API Source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api), + - [REST API Client](https://dlthub.com/docs/general-usage/http/rest-client), + - `dlt` [Sources](https://dlthub.com/docs/general-usage/source) and [Resources](https://dlthub.com/docs/general-usage/resource), + - [Incremental loading](https://dlthub.com/docs/general-usage/incremental-loading), + - Our pre-built [Verified Sources](https://dlthub.com/docs/dlt-ecosystem/verified-sources/), + - Available [Destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/). + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_1_Custom_sources_RestAPI_source_and_RESTClient_img2](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_1_Custom_sources_RestAPI_source_and_RESTClient_img2.jpeg)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # Exercise 1 + + Your task is to create a `rest_api_source` configuration for the public **Jaffle Shop API**. This exercise will help you apply what you’ve learned: + + ### API details: + - **Base URL:** `https://jaffle-shop.scalevector.ai/api/v1` + - **Docs:** [https://jaffle-shop.scalevector.ai/docs](https://jaffle-shop.scalevector.ai/docs) + + ### Endpoints to load: + - `/orders` + + ### Requirements: + 1. Use `rest_api_source` to define your source config. + 2. This API uses **pagination**. Figure out what type it is. + 3. 
Add incremental loading to `orders`, starting from `2017-08-01` and using `ordered_at` as the cursor. + 4. Add `processing_steps` to `orders`: + - Remove records from orders for which it is true that `order_total` > 500. + + + + ### Question: + How many rows does the resulting table `orders` contain? + """) + return + + +@app.cell +def _(pipeline_9): + pipeline_9.dataset().orders.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Well done! Go to [the next lesson.](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb b/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb index 95111d28d..cab413ba3 100644 --- a/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb +++ b/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb @@ -6,7 +6,7 @@ "id": "NvaKFdYx-kbG" }, "source": [ - "# Building custom sources using SQL Databases [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)\n", + "# Building custom sources using SQL Databases [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb)\n", "\n", "This lesson covers building flexible and powerful custom sources using the `sql_database` verified source.\n" ] @@ -32,15 +32,6 @@ "- How to load only new data with incremental loading\n" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "4PRqLBIQA7rj" - }, - "source": [ - "Setup & install dlt:" - ] - }, { "cell_type": "code", "execution_count": null, @@ -198,7 +189,7 @@ "id": "YjPZMS6DWVNN" }, "source": [ - "Let's save this filtered data:" + "Let's load this filtered data:" ] }, { @@ -209,7 +200,7 @@ }, "outputs": [], "source": [ - "info = pipeline.run(filtered_resource, table_name=\"bacterias\")\n", + "info = pipeline.run(filtered_resource, table_name=\"bacteria\")\n", "print(info)" ] }, @@ -230,7 +221,7 @@ }, "outputs": [], "source": [ - "pipeline.dataset().bacterias.df().head()" + "pipeline.dataset().bacteria.df().head()" ] }, { @@ -241,7 +232,7 @@ "source": [ "### **Question 1**:\n", "\n", - "How many rows are present in the `bacterias` table?\n", + "How many rows are present in the `bacteria` table?\n", "\n", ">Answer this question and select the correct option in the homework Quiz.\n" ] @@ -278,8 +269,10 @@ "\n", "\n", "def add_max_timestamp(table: Table) -> Any:\n", - " max_ts 
= sa.func.greatest(table.c.created, table.c.updated).label(\"max_timestamp\")\n", - " subq = sa.select(*table.c, max_ts).subquery()\n", + " max_ts = sa.func.greatest(table.columns.created, table.columns.updated).label(\n", + " \"max_timestamp\"\n", + " )\n", + " subq = sa.select(*table.columns, max_ts).subquery()\n", " return subq" ] }, @@ -476,7 +469,7 @@ "\n", "We'll also be looking at where these incremental values are stored.\n", "\n", - "Hint: they are stored in [dlt state](https://dlthub.com/docs/general-usage/state)." + "Hint: they are stored in the [dlt state](https://dlthub.com/docs/general-usage/state)." ] }, { @@ -583,17 +576,8 @@ "id": "IkvUgaRhI6iY" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1P8pOw9C6J9555o2jhZydESVuVb-3z__y#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)!" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Iz0lz3QhJEvv" - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.py b/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.py new file mode 100644 index 000000000..383ef8d5f --- /dev/null +++ b/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.py @@ -0,0 +1,435 @@ +# /// script +# dependencies = [ +# "dlt", +# "duckdb", +# "numpy", +# "pandas", +# "pymysql", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # Building custom sources using SQL Databases [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) + + This lesson covers building flexible and powerful custom sources using the `sql_database` verified source. 
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_2_Custom_sources_SQL_Databases_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_2_Custom_sources_SQL_Databases_img1.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## What you will learn + + - How to build a custom pipeline using SQL sources + - How to use `query_adapter_callback`, `table_adapter_callback`, and `type_adapter_callback` + - How to load only new data with incremental loading + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## Step 1: Load data from SQL Databases""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""We’ll use the [Rfam MySQL public DB](https://docs.rfam.org/en/latest/database.html) and load it into DuckDB:""" + ) + return + + +@app.cell +def _(): + from typing import Any + from dlt.sources.sql_database import sql_database + import dlt + + _source = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", + table_names=["family"], + ) + pipeline = dlt.pipeline( + pipeline_name="sql_database_example", + destination="duckdb", + dataset_name="sql_data", + dev_mode=True, + ) + load_info = pipeline.run(_source) + print(load_info) + return Any, dlt, pipeline, sql_database + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Explore the `family` table:""") + return + + +@app.cell +def _(pipeline): + pipeline.dataset().family.df().head() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 2: Customize SQL queries with `query_adapter_callback` + + You can fully rewrite or modify the SQL SELECT statement per table. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### Filter rows using a WHERE clause""") + return + + +@app.cell +def _(): + from sqlalchemy import text + from dlt.sources.sql_database.helpers import SelectClause, Table + + def query_adapter_callback(query: SelectClause, table: Table) -> SelectClause: + return text(f"SELECT * FROM {table.fullname} WHERE rfam_id like '%bacteria%'") + return Table, query_adapter_callback + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""To be able to use `sql_database` and not have to declare the connection string each time, we save it as an environment variable. This can also (should preferably) be done in `secrets.toml`""" + ) + return + + +@app.cell +def _(): + import os + + os.environ[ + "SOURCES__SQL_DATABASE__CREDENTIALS" + ] = "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam" + return + + +@app.cell +def _(query_adapter_callback, sql_database): + filtered_resource = sql_database( + query_adapter_callback=query_adapter_callback, table_names=["family"] + ) + return (filtered_resource,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Let's load this filtered data:""") + return + + +@app.cell +def _(filtered_resource, pipeline): + _info = pipeline.run(filtered_resource, table_name="bacteria") + print(_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Explore the data:""") + return + + +@app.cell +def _(pipeline): + pipeline.dataset().bacteria.df().head() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Question 1**: + + How many rows are present in the `bacteria` table? + + >Answer this question and select the correct option in the homework Quiz. 
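+
+    A quick way to check your answer locally (assuming the pipeline above has been run) is to count the rows of the loaded table:
+
+    ```python
+    # Row count of the `bacteria` table loaded above
+    print(len(pipeline.dataset().bacteria.df()))
+    ```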
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 3: Modify table schema with `table_adapter_callback` + + Add columns, change types, or transform schema using this hook. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### Example: Add computed column `max_timestamp`""") + return + + +@app.cell +def _(Any, Table): + import sqlalchemy as sa + + def add_max_timestamp(table: Table) -> Any: + max_ts = sa.func.greatest(table.columns.created, table.columns.updated).label( + "max_timestamp" + ) + subq = sa.select(*table.columns, max_ts).subquery() + return subq + return add_max_timestamp, sa + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Use it with `sql_table`:""") + return + + +@app.cell +def _(add_max_timestamp, dlt, pipeline): + from dlt.sources.sql_database import sql_table + + table = sql_table( + table="family", + table_adapter_callback=add_max_timestamp, + incremental=dlt.sources.incremental("max_timestamp"), + ) + _info = pipeline.run(table, table_name="family_with_max_timestamp") + print(_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Let's check out if this column exists!""") + return + + +@app.cell +def _(pipeline): + pipeline.dataset().family_with_max_timestamp.df().head() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 4: Adapt column data types with `type_adapter_callback` + + When the default types don’t match what you want in the destination, you can remap them. + + Let's look at the schema that has already been loaded: + """) + return + + +@app.cell +def _(pipeline): + schema = pipeline.default_schema.to_dict()["tables"]["family"]["columns"] + for _column in schema: + print(schema[_column]["name"], ":", schema[_column]["data_type"]) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Lets change `hmm_lambda` from decimal to float. 
+ + 💡 Quick fyi: The `float` data type is: + - Fast and uses less space + - But it's approximate — you may get 0.30000000000000004 instead of 0.3 + - Bad for money, great for probabilities, large numeric ranges, scientific values + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### Example: Change data types""") + return + + +@app.cell +def _(Any, sa): + from sqlalchemy.types import Float + + def type_adapter_callback(sql_type: Any) -> Any: + if isinstance(sql_type, sa.Numeric): + return Float + return sql_type + return (type_adapter_callback,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Use it with `sql_database`:""") + return + + +@app.cell +def _(pipeline, sql_database, type_adapter_callback): + new_source = sql_database( + type_adapter_callback=type_adapter_callback, table_names=["family"] + ) + _info = pipeline.run(new_source, table_name="type_changed_family") + print(_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""👀 Can you see how the column data types have changed?""") + return + + +@app.cell +def _(pipeline): + schema1 = pipeline.default_schema.to_dict()["tables"]["family"]["columns"] + schema2 = pipeline.default_schema.to_dict()["tables"]["type_changed_family"][ + "columns" + ] + _column = "trusted_cutoff" + print( + "For table 'family':", + schema1[_column]["name"], + ":", + schema1[_column]["data_type"], + ) + print( + "For table 'type_changed_family':", + schema2[_column]["name"], + ":", + schema2[_column]["data_type"], + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Question 2**: + + How many columns had their type changed in the `type_changed_family` table? + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 5: Incremental loads with `sql_database` + Track only new rows using a timestamp or ID column. + + We'll also be looking at where these incremental values are stored. + + Hint: they are stored in the [dlt state](https://dlthub.com/docs/general-usage/state). 
+ """) + return + + +@app.cell +def _(): + import json + + with open( + "/var/dlt/pipelines/sql_database_example/state.json", "r", encoding="utf-8" + ) as _f: + _data = json.load(_f) + _data["sources"]["sql_database"]["resources"]["family"]["incremental"].keys() + return (json,) + + +@app.cell +def _(dlt, pipeline, sql_database): + import pendulum + + _source = sql_database(table_names=["family"]) + _source.family.apply_hints( + incremental=dlt.sources.incremental( + "updated", initial_value=pendulum.datetime(2024, 1, 1) + ) + ) + _info = pipeline.run(_source) + print(_info) + return + + +@app.cell +def _(json): + with open( + "/var/dlt/pipelines/sql_database_example/state.json", "r", encoding="utf-8" + ) as _f: + _data = json.load(_f) + _data["sources"]["sql_database"]["resources"]["family"]["incremental"].keys() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## **Rename tables for `sql_database` source**""") + return + + +@app.cell +def _(dlt, sql_database): + _source = sql_database(table_names=["family"]) + for _resource_name, resource in _source.resources.items(): + resource.apply_hints(table_name=f"xxxx__{resource.name}") + pipeline_1 = dlt.pipeline( + pipeline_name="sql_db_prefixed_tables", + destination="duckdb", + dataset_name="renamed_tables", + ) + print(pipeline_1.run(_source)) + pipeline_1.dataset().row_counts().df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb b/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb index 27ba3b465..d66fc4b27 100644 --- a/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb +++ b/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb @@ -6,7 +6,7 @@ "id": "8ucJBHffzqYB" }, "source": [ - "# Building Custom Sources with the Filesystem in `dlt` [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)" + "# Building Custom Sources with the Filesystem in `dlt` [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)" ] }, { @@ -24,8 +24,6 @@ "id": "F5ayDx9Nz1ts" }, "source": [ - "You will learn how to:\n", - 
"\n", "- Use the `filesystem` resource to build real custom sources\n", "- Apply filters to file metadata (name, size, date)\n", "- Implement and register custom transformers\n", @@ -42,15 +40,6 @@ "## Setup: Download real data" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "siTnHHjg1fSK" - }, - "source": [ - "Install dlt" - ] - }, { "cell_type": "code", "execution_count": null, @@ -80,7 +69,14 @@ }, "outputs": [], "source": [ - "!mkdir -p local_data && wget -O local_data/userdata.parquet https://www.timestored.com/data/sample/userdata.parquet" + "import urllib.request\n", + "import os\n", + "\n", + "os.makedirs(\"local_data\", exist_ok=True)\n", + "\n", + "url = \"https://www.timestored.com/data/sample/userdata.parquet\"\n", + "dest = \"local_data/userdata.parquet\"\n", + "urllib.request.urlretrieve(url, dest)" ] }, { @@ -277,7 +273,9 @@ "\n", "\n", "# Download a JSON file\n", - "!wget -O local_data/sample.json https://jsonplaceholder.typicode.com/users\n", + "url = \"https://jsonplaceholder.typicode.com/users\"\n", + "dest = \"local_data/sample.json\"\n", + "urllib.request.urlretrieve(url, dest)\n", "\n", "fs = filesystem(bucket_url=\"./local_data\", file_glob=\"sample.json\")\n", "pipeline = dlt.pipeline(\"json_pipeline\", destination=\"duckdb\")\n", @@ -366,7 +364,7 @@ "id": "XoWLhw7DLg7i" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/14br3TZTRFwTSwpDyom7fxlZCeRF4efMk#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)!" ] }, { @@ -375,15 +373,6 @@ "source": [ "![Lesson_3_Custom_sources_Filesystem_and_cloud_storage_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_3_Custom_sources_Filesystem_and_cloud_storage_img1.webp)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rBJ9K3XwMhZW" - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.py b/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.py new file mode 100644 index 000000000..64dc878e9 --- /dev/null +++ b/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.py @@ -0,0 +1,301 @@ +# /// script +# dependencies = [ +# "dlt[duckdb]", +# "numpy", +# "pandas", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""# Building Custom Sources with the Filesystem in `dlt` [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## What you will learn""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + - Use the `filesystem` 
resource to build real custom sources + - Apply filters to file metadata (name, size, date) + - Implement and register custom transformers + - Enrich records with file metadata + - Use incremental loading both for files and content + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## Setup: Download real data""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""We’ll use a real `.parquet` file from [TimeStored.com](https://www.timestored.com/data/sample/userdata.parquet)""" + ) + return + + +@app.cell +def _(): + import urllib.request + import os + + os.makedirs("local_data", exist_ok=True) + _url = "https://www.timestored.com/data/sample/userdata.parquet" + _dest = "local_data/userdata.parquet" + urllib.request.urlretrieve(_url, _dest) + return os, urllib + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 1: Load Parquet file from Local Filesystem + + **What the script below does**: Lists and reads all `.parquet` files in `./local_data` and loads them into a table named `userdata`. + """) + return + + +@app.cell +def _(): + import dlt + from dlt.sources.filesystem import filesystem, read_parquet + + _fs = filesystem(bucket_url="./local_data", file_glob="**/*.parquet") + # Point to the local file directory + parquet_data = _fs | read_parquet() + pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb") + # Add a transformer + _load_info = pipeline.run(parquet_data.with_name("userdata")) + print(_load_info) + # Create and run pipeline + # Inspect data + pipeline.dataset().userdata.df().head() + return dlt, filesystem, pipeline, read_parquet + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Question 1**: + + In the `my_pipeline` pipeline, and the `userdata` dataset, what is the ratio of men:women in decimal? + """) + return + + +@app.cell +def _(pipeline): + # check out the numbers below and answer 👀 + df = pipeline.dataset().userdata.df() + df.groupby("gender").describe() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 2: Enrich records with file metadata + + Let’s add the file name to every record to track the data origin. 
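+
+    After the cell below has run, you can confirm the enrichment by peeking at the new column (a quick check; the column name assumes the transformer defined below):
+
+    ```python
+    # Each row should now carry the name of the file it came from
+    pipeline_1.dataset().userdata.df()["source_file"].head()
+    ```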
+ """) + return + + +@app.cell +def _(dlt, filesystem): + from dlt.common.typing import TDataItems + + @dlt.transformer() + def read_parquet_with_filename(files: TDataItems) -> TDataItems: + import pyarrow.parquet as pq + + for file_item in files: + with file_item.open() as f: + table = pq.read_table(f).to_pandas() + table["source_file"] = file_item["file_name"] + yield table.to_dict(orient="records") + + _fs = filesystem(bucket_url="./local_data", file_glob="*.parquet") + pipeline_1 = dlt.pipeline("meta_pipeline", destination="duckdb") + _load_info = pipeline_1.run( + (_fs | read_parquet_with_filename()).with_name("userdata") + ) + print(_load_info) + return (TDataItems,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## Step 3: Filter files by metadata""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Only load files matching custom logic:""") + return + + +@app.cell +def _(dlt, filesystem, read_parquet): + _fs = filesystem(bucket_url="./local_data", file_glob="**/*.parquet") + _fs.add_filter(lambda f: "user" in f["file_name"] and f["size_in_bytes"] < 1000000) + pipeline_2 = dlt.pipeline("filtered_pipeline", destination="duckdb") + _load_info = pipeline_2.run((_fs | read_parquet()).with_name("userdata_filtered")) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 4: Load files incrementally + Avoid reprocessing the same file twice. + """) + return + + +@app.cell +def _(dlt, filesystem, read_parquet): + _fs = filesystem(bucket_url="./local_data", file_glob="**/*.parquet") + _fs.apply_hints(incremental=dlt.sources.incremental("modification_date")) + data = (_fs | read_parquet()).with_name("userdata") + pipeline_3 = dlt.pipeline("incremental_pipeline", destination="duckdb") + _load_info = pipeline_3.run(data) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 5: Create a custom transformer + + Let’s read structured data from `.json` files. + """) + return + + +@app.cell +def _(TDataItems, dlt, filesystem, urllib): + @dlt.transformer(standalone=True) + def read_json(items: TDataItems) -> TDataItems: + from dlt.common import json + + for file_obj in items: + with file_obj.open() as f: + yield json.load(f) + + _url = "https://jsonplaceholder.typicode.com/users" + _dest = "local_data/sample.json" + urllib.request.urlretrieve(_url, _dest) + _fs = filesystem(bucket_url="./local_data", file_glob="sample.json") + pipeline_4 = dlt.pipeline("json_pipeline", destination="duckdb") + _load_info = pipeline_4.run((_fs | read_json()).with_name("users")) + print(_load_info) + return (pipeline_4,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + 📁 You will see that this file also exists in your local_data directory. + + > A **standalone** resource is defined on a function that is top-level in a module (not an inner function) that accepts config and secrets values. Additionally, if the standalone flag is specified, the decorated function signature and docstring will be preserved. `dlt.resource` will just wrap the decorated function, and the user must call the wrapper to get the actual resource. + + Let's inspect the `users` table in your DuckDB dataset: + """) + return + + +@app.cell +def _(pipeline_4): + pipeline_4.dataset().users.df().head() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 6: Copy files before loading + + Copy files locally as part of the pipeline. This is useful for backups or post-processing. 
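+
+    Once the cell below has run, a simple sanity check is to list the copied files (a minimal sketch using the standard library; it assumes the `copied` directory created by `copy_local`):
+
+    ```python
+    import os
+    print(os.listdir("copied"))  # should include userdata.parquet
+    ```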
+ """) + return + + +@app.cell +def _(dlt, filesystem, os): + from dlt.common.storages.fsspec_filesystem import FileItemDict + + def copy_local(item: FileItemDict) -> FileItemDict: + local_path = os.path.join("copied", item["file_name"]) + os.makedirs(os.path.dirname(local_path), exist_ok=True) + item.fsspec.download(item["file_url"], local_path) + return item + + _fs = filesystem(bucket_url="./local_data", file_glob="**/*.parquet").add_map( + copy_local + ) + pipeline_5 = dlt.pipeline("copy_pipeline", destination="duckdb") + _load_info = pipeline_5.run(_fs.with_name("copied_files")) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Next steps + + - Try building a transformer for `.xml` using `xmltodict` + - Combine multiple directories or buckets in a single pipeline + - Explore [more examples](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/advanced) + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)!""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_3_Custom_sources_Filesystem_and_cloud_storage_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_3_Custom_sources_Filesystem_and_cloud_storage_img1.webp)""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb b/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb index 778ed14da..4a37867ad 100644 --- a/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb +++ b/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb @@ -6,7 +6,7 @@ "id": "eZpIGo3Fg8hR" }, "source": [ - "# Custom destinations & Reverse ETL [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)\n", + "# Custom destinations & Reverse ETL [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb)\n", "\n", "---\n", "\n", @@ -15,7 +15,7 @@ "- What reverse ETL means in practice \n", "- How to build custom destinations with `@dlt.destination` \n", "- How batching works \n", - "- How to push real data from Rfam database to Notion \n", + "- How to push real data from the Rfam database to Notion \n", "\n", "---\n" ] @@ -237,8 +237,8 @@ "\n", "### 4.1. Step 1: Create a database in Notion\n", "\n", - "1. Create empty database. 
[Notion documentation.](https://super.so/blog/6-steps-to-creating-databases-in-notion)\n", - "2. [Create integration](https://www.notion.so/profile/integrations) in your Notion Workspace.\n", + "1. Create empty an database. [Notion documentation.](https://super.so/blog/6-steps-to-creating-databases-in-notion)\n", + "2. [Create an integration](https://www.notion.so/profile/integrations) in your Notion Workspace.\n", "3. Connect your database to the integration.\n", "4. Create 3 columns: Accession (title), ID (text), Description (text)" ] @@ -263,7 +263,7 @@ "id": "0AdDovQklsE9" }, "source": [ - "### 4.2. Step 2: Install and configure" + "### 4.2. Step 2: Configure" ] }, { @@ -289,7 +289,7 @@ "2. Set your credentials either in:\n", " - `~/.dlt/secrets.toml` \n", " - or environment variables\n", - " - or (**in our case**) in Colab Secrets\n", + " - or (**in our case**) in Colab or Molab Secrets\n", "\n", " ```toml\n", " [destination.notion]\n", @@ -344,7 +344,7 @@ "id": "C0r_R3M_6ePP" }, "source": [ - "You can also check if your integration works via `curl`:\n", + "You can also check if your integration works via the requests library:\n", "1. Modify Bearer token\n", "2. Modify \"query\" if you database have another name" ] @@ -357,7 +357,24 @@ }, "outputs": [], "source": [ - "! curl -X POST 'https://api.notion.com/v1/search' -H 'Authorization: Bearer '\"ntn_q5_your_token_o5xQLn1sewnep6\"'' -H 'Content-Type: application/json' -H 'Notion-Version: 2022-06-28' --data '{\"query\": \"Advanced\", \"filter\": {\"value\": \"database\", \"property\": \"object\"}, \"sort\": {\"direction\":\"ascending\", \"timestamp\":\"last_edited_time\"}}'" + "import requests\n", + "\n", + "url = \"https://api.notion.com/v1/search\"\n", + "\n", + "headers = {\n", + " \"Authorization\": \"Bearer ntn_q5_your_token_o5xQLn1sewnep6\",\n", + " \"Content-Type\": \"application/json\",\n", + " \"Notion-Version\": \"2022-06-28\",\n", + "}\n", + "\n", + "data = {\n", + " \"query\": \"Advanced\",\n", + " \"filter\": {\"value\": \"database\", \"property\": \"object\"},\n", + " \"sort\": {\"direction\": \"ascending\", \"timestamp\": \"last_edited_time\"},\n", + "}\n", + "\n", + "response = requests.post(url, headers=headers, json=data)\n", + "print(response.json())" ] }, { @@ -424,8 +441,8 @@ "from notion_client import Client\n", "from google.colab import userdata\n", "\n", - "os.environ[\"DESTINATION__NOTION__NOTION_AUTH\"] = userdata.get(\"NOTION_AUTHENTICATION\")\n", - "os.environ[\"DESTINATION__NOTION__NOTION_PAGE_ID\"] = userdata.get(\"NOTION_PAGE_ID\")\n", + "dlt.secrets[\"DESTINATION__NOTION__NOTION_AUTH\"] = userdata.get(\"NOTION_AUTHENTICATION\")\n", + "dlt.secrets[\"DESTINATION__NOTION__NOTION_PAGE_ID\"] = userdata.get(\"NOTION_PAGE_ID\")\n", "\n", "\n", "@dlt.destination(name=\"notion\")\n", @@ -522,17 +539,8 @@ "id": "nJach4xBFfva" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1--wNVd26TqNolnnECnUYZqeE2CXOeVZE#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)!" 
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vmz0tMhcmwPh" - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.py b/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.py new file mode 100644 index 000000000..f14c87230 --- /dev/null +++ b/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.py @@ -0,0 +1,464 @@ +# /// script +# dependencies = [ +# "dlt", +# "dlt[duckdb]", +# "notion-client", +# "numpy", +# "pandas", +# "pymysql", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # Custom destinations & Reverse ETL [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) + + --- + + ## What you’ll learn + + - What reverse ETL means in practice + - How to build custom destinations with `@dlt.destination` + - How batching works + - How to push real data from the Rfam database to Notion + + --- + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_4_Destinations_Reverse_ETL_img](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **1. Concept: What is a custom destination?** + + Normally, dlt sends your data to databases like BigQuery or Postgres. + + But with `@dlt.destination`, you can **intercept the normalized data** and send it wherever you want: + - APIs (Notion, Slack, Airtable) + - Message queues (Kafka, SQS) + - Logging systems + - Custom data sinks + + All you have to do is define a function like: + + ```python + @dlt.destination + def my_destination(items, table): + ... + ``` + + And dlt will call this for every batch of data extracted and normalized. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + ## **2. Simple example: print data rows** + + ### Code example: + """) + return + + +@app.cell +def _(): + import dlt + from dlt.common.typing import TDataItems + from dlt.common.schema import TTableSchema + + @dlt.destination(batch_size=5) + def print_sink(items: TDataItems, table: TTableSchema) -> None: + print(f"\nTable: {table['name']}") + for item in items: + print(item) + + @dlt.resource + def simple_data() -> TDataItems: + yield [{"id": i, "value": f"row-{i}"} for i in range(12)] + + _pipeline = dlt.pipeline("print_example", destination=print_sink) + _pipeline.run(simple_data()) + print(_pipeline.last_trace) + return TDataItems, TTableSchema, dlt, simple_data + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **What’s happening?** + + - `simple_data()` yields 12 small records. + - The data goes through **normalization** (converted to rows + types). + - `@dlt.destination(batch_size=5)` groups these rows into batches of 5. + - For each batch, `print_sink()` is called. 
+ - The `table` parameter tells you which table the batch belongs to. + + + **Why this is important?** + + - This is the **simplest possible custom destination.** + - You’re in control: log, debug, or route data per table. + - It introduces how dlt structures the data and calls your function. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Question 1: + + In the following example, how many times will the function be called? + """) + return + + +@app.cell +def _(TDataItems, TTableSchema, dlt): + @dlt.destination(batch_size=2) + def new_print_sink(items: TDataItems, table: TTableSchema) -> None: + print(items) + + @dlt.resource + def new_simple_data() -> TDataItems: + yield [{"id": i} for i in range(6)] + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **3. How batching works** + + By default `batch_size` is 10. + + + Let’s tweak just one thing: + """) + return + + +@app.cell +def _(TDataItems, TTableSchema, dlt, simple_data): + @dlt.destination(batch_size=1) + def print_each_row(items: TDataItems, table: TTableSchema) -> None: + print(f"Got one row from table {table['name']}:") + print(items) + + _pipeline = dlt.pipeline("print_example", destination=print_each_row) + _pipeline.run(simple_data()) + print(_pipeline.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Now, dlt calls your function **once per row** instead of per 5 rows. + + Useful if: + - Your API doesn’t support bulk inserts. + - You want fine-grained control or retries. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **4. Real-world project: Rfam database → Notion** + + Let’s build a real pipeline that fetches data from database and **sends it to Notion**. + + ### Why Notion? + + - Notion is a great tool for product/dev teams. + - But dlt doesn’t support Notion as a *destination*. + - So, we’ll build that ourselves. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### 4.1. Step 1: Create a database in Notion + + 1. Create empty an database. [Notion documentation.](https://super.so/blog/6-steps-to-creating-databases-in-notion) + 2. [Create an integration](https://www.notion.so/profile/integrations) in your Notion Workspace. + 3. Connect your database to the integration. + 4. Create 3 columns: Accession (title), ID (text), Description (text) + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_4_Destinations_Reverse_ETL_img2](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img2.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_4_Destinations_Reverse_ETL_img3](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img3.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### 4.2. Step 2: Configure""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + 2. 
Set your credentials either in: + - `~/.dlt/secrets.toml` + - or environment variables + - or (**in our case**) in Colab or Molab Secrets + + ```toml + [destination.notion] + notion_auth = "" + notion_page_id = "" + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""- Save your [Notion authentication token](https://developers.notion.com/docs/authorization#internal-integration-auth-flow-set-up) and the [ID of the page](https://developers.notion.com/docs/working-with-page-content#creating-a-page-with-content) where you want to create a database in your Colab secrets:""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_4_Destinations_Reverse_ETL_img4](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img4.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_4_Destinations_Reverse_ETL_img5](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img5.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_4_Destinations_Reverse_ETL_img6](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img6.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""> Make sure to [connect the page](https://www.notion.so/help/add-and-manage-connections-with-the-api#add-connections-to-pages) to the integration associated with the token!""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You can also check if your integration works via the requests library: + 1. Modify Bearer token + 2. Modify "query" if you database have another name + """) + return + + +@app.cell +def _(): + import requests + + url = "https://api.notion.com/v1/search" + + headers = { + "Authorization": "Bearer ntn_q5_your_token_o5xQLn1sewnep6", + "Content-Type": "application/json", + "Notion-Version": "2022-06-28", + } + + data = { + "query": "Advanced", + "filter": {"value": "database", "property": "object"}, + "sort": {"direction": "ascending", "timestamp": "last_edited_time"}, + } + + response = requests.post(url, headers=headers, json=data) + print(response.json()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### 4.3. Step 3: Get data from Rfam database + + Let's use `query_callback` and limit the number of data rows: + """) + return + + +@app.cell +def _(): + import os + import sqlalchemy as sa + from sqlalchemy import text + from dlt.sources.sql_database import sql_database + from dlt.sources.sql_database.helpers import SelectClause, Table + + def limit_rows(query: SelectClause, table: Table) -> SelectClause: + return text(f"SELECT * FROM {table.fullname} LIMIT 20") + + source = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", + table_names=["family"], + query_adapter_callback=limit_rows, + ) + return os, source + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### 4.4. 
Step 4: Define Notion destination""") + return + + +@app.cell +def _(TDataItems, TTableSchema, dlt, os): + from notion_client import Client + + dlt.secrets["DESTINATION__NOTION__NOTION_AUTH"] = os.getenv("NOTION_AUTHENTICATION") + dlt.secrets["DESTINATION__NOTION__NOTION_PAGE_ID"] = os.getenv("NOTION_PAGE_ID") + + @dlt.destination(name="notion") + def push_to_notion( + items: TDataItems, + table: TTableSchema, + notion_auth: str = dlt.secrets.value, + notion_page_id: str = dlt.secrets.value, + ) -> None: + client = Client(auth=notion_auth) + print(len(items)) + for item in items: + client.pages.create( + parent={"database_id": notion_page_id}, + properties={ + "Accession": {"title": [{"text": {"content": item["rfam_acc"]}}]}, + "ID": {"rich_text": [{"text": {"content": item["rfam_id"]}}]}, + "Description": { + "rich_text": [{"text": {"content": item["description"]}}] + }, + }, + ) + return (push_to_notion,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **What’s happening?** + + - dlt will call `push_to_notion()` with one batch of records at a time. + - For each record, we create a page in Notion. + - Credentials and database ID come from `secrets.toml` or env vars. + + **Why this is useful?** + + - You just turned your pipeline into a full **reverse ETL** job. + - No need for Airbyte or writing custom orchestration scripts. + - It’s reusable and works with dlt’s retry logic, state management, and transformations. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### 4.5. Step 5: Run the pipeline""") + return + + +@app.cell +def _(dlt, push_to_notion, source): + _pipeline = dlt.pipeline( + "notion_pipeline", destination=push_to_notion, progress="log" + ) + _pipeline.run(source, table_name="rfam_family") + print(_pipeline.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_4_Destinations_Reverse_ETL_img7](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_4_Destinations_Reverse_ETL_img7.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## 5. Reliability and state + + ### What if Notion fails mid-run? + + - dlt **retries batches** up to 5 times. + - You can restart the pipeline and it will continue from the failed batch. + - But you must make your destination **idempotent** (i.e., safe to re-run the same input). 
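+    One possible way to approach idempotency here (a hedged sketch, not part of the destination defined in Step 4) is to look up each record by its `Accession` title before writing, and update the existing page instead of creating a duplicate. The helper name `upsert_page` is hypothetical; it assumes the standard `notion-client` query/update endpoints and the same `client` object used in Step 4:
+
+    ```python
+    def upsert_page(client, database_id: str, item: dict) -> None:
+        properties = {
+            "Accession": {"title": [{"text": {"content": item["rfam_acc"]}}]},
+            "ID": {"rich_text": [{"text": {"content": item["rfam_id"]}}]},
+            "Description": {"rich_text": [{"text": {"content": item["description"]}}]},
+        }
+        # Look for an existing page with the same Accession (the title property)
+        existing = client.databases.query(
+            database_id=database_id,
+            filter={"property": "Accession", "title": {"equals": item["rfam_acc"]}},
+        )
+        if existing["results"]:
+            # Re-running the same batch updates the page instead of duplicating it
+            client.pages.update(
+                page_id=existing["results"][0]["id"], properties=properties
+            )
+        else:
+            client.pages.create(
+                parent={"database_id": database_id}, properties=properties
+            )
+    ```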
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb b/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb index 2f7ea18a0..938925928 100644 --- a/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb +++ b/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb @@ -6,17 +6,17 @@ "id": "CbFVutT06Cqq" }, "source": [ - "# Transforming and filtering the data [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)\n", + "# Transforming and filtering the data [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb)\n", "\n", "In this lesson, we will take a look at various ways of doing data transformations and filtering of the data during and after the ingestion.\n", "\n", "dlt provides several ways of doing it during the ingestion:\n", - "1. With custom query (applicable for `sql_database` source).\n", - "2. With dlt special functions (`add_map` and `add_filter`).\n", + "1. With a custom query (applicable for `sql_database` source).\n", + "2. With special dlt functions (`add_map` and `add_filter`).\n", "3. Via `@dlt.transformers`.\n", "4. With `pipeline.dataset()`.\n", "\n", - "Let's review and compare those methods." + "Let's review and compare these methods." ] }, { @@ -116,8 +116,8 @@ "outputs": [], "source": [ "with pipeline.sql_client() as client:\n", - " with client.execute_query(\"SELECT * FROM genome\") as table:\n", - " genome = table.df()\n", + " with client.execute_query(\"SELECT * FROM genome\") as my_table:\n", + " genome = my_table.df()\n", "genome" ] }, @@ -139,8 +139,8 @@ "outputs": [], "source": [ "with pipeline.sql_client() as client:\n", - " with client.execute_query(\"SELECT COUNT(*) AS total_rows FROM genome\") as table:\n", - " print(table.df())" + " with client.execute_query(\"SELECT COUNT(*) AS total_rows FROM genome\") as my_table:\n", + " print(my_table.df())" ] }, { @@ -158,7 +158,7 @@ "id": "edAUbOHXuwlL" }, "source": [ - "Imagine a use-case where we're only interested in getting the genome data for bacterias. 
In this case, ingesting the whole `genome` table would be an unnecessary use of time and compute resources." + "Imagine a use-case where we're only interested in getting the genome data for bacteria. In this case, ingesting the whole `genome` table would be an unnecessary use of time and compute resources." ] }, { @@ -172,8 +172,8 @@ "with pipeline.sql_client() as client:\n", " with client.execute_query(\n", " \"SELECT COUNT(*) AS total_rows FROM genome WHERE kingdom='bacteria'\"\n", - " ) as table:\n", - " print(table.df())" + " ) as my_table:\n", + " print(my_table.df())" ] }, { @@ -190,20 +190,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "F8A675ZXTCn9" - }, + "metadata": {}, "outputs": [], "source": [ "from dlt.sources.sql_database.helpers import Table, SelectAny, SelectClause\n", "\n", "\n", "def query_adapter_callback(query: SelectAny, table: Table) -> SelectAny:\n", - " if table.name == \"genome\":\n", - " # Only select rows where the column kingdom has value \"bacteria\"\n", - " return query.where(table.c.kingdom == \"bacteria\")\n", - " # Use the original query for other tables\n", - " return query" + " return query.where(table.c.kingdom == \"bacteria\") if table.name else query" ] }, { @@ -240,8 +234,7 @@ " dataset_name=\"sql_data\",\n", ")\n", "\n", - "load_info = pipeline.run(source, write_disposition=\"replace\")\n", - "\n", + "pipeline.run(source, write_disposition=\"replace\")\n", "print(pipeline.last_trace)" ] }, @@ -305,16 +298,16 @@ "with pipeline.sql_client() as client:\n", " with client.execute_query(\n", " \"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM clan\"\n", - " ) as table:\n", + " ) as my_table:\n", " print(\"Table clan:\")\n", - " print(table.df())\n", + " print(my_table.df())\n", " print(\"\\n\")\n", "\n", " with client.execute_query(\n", " \"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM genome\"\n", - " ) as table:\n", + " ) as my_table:\n", " print(\"Table genome:\")\n", - " print(table.df())" + " print(my_table.df())" ] }, { @@ -373,9 +366,9 @@ "outputs": [], "source": [ "with pipeline.sql_client() as client:\n", - " with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as table:\n", + " with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as my_table:\n", " print(\"Table clan:\")\n", - " print(table.df())" + " print(my_table.df())" ] }, { @@ -465,9 +458,9 @@ "outputs": [], "source": [ "with pipeline.sql_client() as client:\n", - " with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as table:\n", + " with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as my_table:\n", " print(\"Table clan:\")\n", - " clan = table.df()\n", + " clan = my_table.df()\n", "\n", "clan" ] @@ -546,9 +539,9 @@ "outputs": [], "source": [ "with pipeline.sql_client() as client:\n", - " with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as table:\n", + " with client.execute_query(\"SELECT DISTINCT author FROM clan LIMIT 5\") as my_table:\n", " print(\"Table clan:\")\n", - " print(table.df())" + " print(my_table.df())" ] }, { @@ -596,8 +589,8 @@ "\n", "resource.add_map(add_greeting)\n", "\n", - "for row in resource():\n", - " print(row)" + "for _row in resource():\n", + " print(_row)" ] }, { @@ -680,7 +673,7 @@ ")\n", "source.genome.add_filter(lambda item: item[\"kingdom\"] == \"bacteria\")\n", "\n", - "load_info = pipeline.run(source, write_disposition=\"replace\")\n", + "pipeline.run(source, 
write_disposition=\"replace\")\n", "\n", "print(pipeline.last_trace)" ] @@ -696,9 +689,9 @@ "with pipeline.sql_client() as client:\n", " with client.execute_query(\n", " \"SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM genome\"\n", - " ) as table:\n", + " ) as my_table:\n", " print(\"Table genome:\")\n", - " genome_count = table.df()\n", + " genome_count = my_table.df()\n", "genome_count" ] }, @@ -753,8 +746,7 @@ ")\n", "source.genome.add_limit(1)\n", "\n", - "load_info = pipeline.run(source, write_disposition=\"replace\")\n", - "\n", + "pipeline.run(source, write_disposition=\"replace\")\n", "print(pipeline.last_trace)" ] }, @@ -767,8 +759,8 @@ "outputs": [], "source": [ "with pipeline.sql_client() as client:\n", - " with client.execute_query(\"SELECT * FROM genome\") as table:\n", - " genome_limited = table.df()\n", + " with client.execute_query(\"SELECT * FROM genome\") as my_table:\n", + " genome_limited = my_table.df()\n", "genome_limited" ] }, @@ -824,7 +816,7 @@ " dev_mode=True,\n", ")\n", "\n", - "info = pipeline.run([genome_resource, genome_resource | batch_stats])\n", + "pipeline.run([genome_resource, genome_resource | batch_stats])\n", "print(pipeline.last_trace)" ] }, @@ -837,8 +829,8 @@ "outputs": [], "source": [ "with pipeline.sql_client() as client:\n", - " with client.execute_query(\"SELECT * FROM batch_stats\") as table:\n", - " res = table.df()\n", + " with client.execute_query(\"SELECT * FROM batch_stats\") as my_table:\n", + " res = my_table.df()\n", "res" ] }, @@ -879,16 +871,16 @@ "# NOTE: this is the duckdb sql dialect, other destinations may use different expressions\n", "with pipeline.sql_client() as client:\n", " client.execute_sql(\n", - " \"\"\" CREATE OR REPLACE TABLE genome_length AS\n", - " SELECT\n", - " SUM(total_length) AS total_total_length,\n", - " AVG(total_length) AS average_total_length\n", - " FROM\n", - " genome\n", - " \"\"\"\n", + " (\n", + " \"CREATE OR REPLACE TABLE genome_length AS \"\n", + " \"SELECT \"\n", + " \" SUM(total_length) AS total_total_length, \"\n", + " \" AVG(total_length) AS average_total_length \"\n", + " \"FROM genome\"\n", + " )\n", " )\n", - " with client.execute_query(\"SELECT * FROM genome_length\") as table:\n", - " genome_length = table.df()\n", + " with client.execute_query(\"SELECT * FROM genome_length\") as my_table:\n", + " genome_length = my_table.df()\n", "\n", "genome_length" ] @@ -1068,7 +1060,7 @@ "id": "AH3F46PaJZe4" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1XT1xUIQIWj0nPWOmTixThgdXzi4vudce#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)!" 
] } ], diff --git a/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.py b/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.py new file mode 100644 index 000000000..e1bb40286 --- /dev/null +++ b/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.py @@ -0,0 +1,765 @@ +# /// script +# dependencies = [ +# "dlt[sql_database,duckdb]", +# "ibis-framework[duckdb]", +# "numpy", +# "pandas", +# "pymysql", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # Transforming and filtering the data [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) + + In this lesson, we will take a look at various ways of doing data transformations and filtering of the data during and after the ingestion. + + dlt provides several ways of doing it during the ingestion: + 1. With a custom query (applicable for `sql_database` source). + 2. With special dlt functions (`add_map` and `add_filter`). + 3. Via `@dlt.transformers`. + 4. With `pipeline.dataset()`. + + Let's review and compare these methods. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## What you’ll learn: + + - How to limit rows at the source with SQL queries. + - How to apply custom Python logic per record. + - How to write transformations using functional and declarative APIs. + - How to access and query your loaded data using `.dataset()`. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## Setup and initial Load""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + We will be using the `sql_database` source as an example and will connect to the public [MySQL RFam](https://www.google.com/url?q=https%3A%2F%2Fwww.google.com%2Furl%3Fq%3Dhttps%253A%252F%252Fdocs.rfam.org%252Fen%252Flatest%252Fdatabase.html) database. The RFam database contains publicly accessible scientific data on RNA structures. 
+ + Let's perform an initial load: + """) + return + + +@app.cell +def _(): + import dlt + from dlt.sources.sql_database import sql_database + + _source = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", + table_names=["family", "genome"], + ) + pipeline = dlt.pipeline( + pipeline_name="sql_database_pipeline", + destination="duckdb", + dataset_name="sql_data", + ) + _load_info = pipeline.run(_source) + print(_load_info) + return dlt, pipeline, sql_database + + +@app.cell +def _(pipeline): + with pipeline.sql_client() as _client: + with _client.execute_query("SELECT * FROM genome") as _my_table: + genome = _my_table.df() + genome + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""You can check your data count using `sql_client`:""") + return + + +@app.cell +def _(pipeline): + with pipeline.sql_client() as _client: + with _client.execute_query( + "SELECT COUNT(*) AS total_rows FROM genome" + ) as _my_table: + print(_my_table.df()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""## **1. Filtering the data during the ingestion with `query_adapter_callback`**""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Imagine a use-case where we're only interested in getting the genome data for bacteria. In this case, ingesting the whole `genome` table would be an unnecessary use of time and compute resources.""" + ) + return + + +@app.cell +def _(pipeline): + with pipeline.sql_client() as _client: + with _client.execute_query( + "SELECT COUNT(*) AS total_rows FROM genome WHERE kingdom='bacteria'" + ) as _my_table: + print(_my_table.df()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + When ingesting data using the `sql_database` source, dlt runs a `SELECT` statement in the back, and using the `query_adapter_callback` parameter makes it possible to pass a `WHERE` clause inside the underlying `SELECT` statement. + + In this example, only the table `genome` is filtered on the column `kingdom` + """) + return + + +@app.cell +def _(): + from dlt.sources.sql_database.helpers import Table, SelectAny, SelectClause + + def query_adapter_callback(query: SelectAny, table: Table) -> SelectAny: + return query.where(table.c.kingdom == "bacteria") if table.name else query + return SelectAny, SelectClause, Table, query_adapter_callback + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Attach it:""") + return + + +@app.cell +def _(dlt, query_adapter_callback, sql_database): + _source = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", + table_names=["genome"], + query_adapter_callback=query_adapter_callback, + ) + pipeline_1 = dlt.pipeline( + pipeline_name="sql_database_pipeline_filtered", + destination="duckdb", + dataset_name="sql_data", + ) + pipeline_1.run(_source, write_disposition="replace") + print(pipeline_1.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + In the snippet above we created an SQL VIEW in your source database and extracted data from it. In that case, dlt will infer all column types and read data in shape you define in a view without any further customization. 
+ + If creating a view is not feasible, you can fully rewrite the automatically generated query with an extended version of `query_adapter_callback`: + """) + return + + +@app.cell +def _(SelectAny, SelectClause, Table, dlt, sql_database): + import sqlalchemy as sa + + def query_adapter_callback_1(query: SelectAny, table: Table) -> SelectClause: + if table.name == "genome": + return sa.text(f"SELECT * FROM {table.fullname} WHERE kingdom='bacteria'") + return query + + _source = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", + table_names=["genome", "clan"], + query_adapter_callback=query_adapter_callback_1, + ) + pipeline_2 = dlt.pipeline( + pipeline_name="sql_database_pipeline_filtered", + destination="duckdb", + dataset_name="sql_data", + ) + _load_info = pipeline_2.run(_source, write_disposition="replace") + print(_load_info) + return (pipeline_2,) + + +@app.cell +def _(pipeline_2): + with pipeline_2.sql_client() as _client: + with _client.execute_query( + "SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM clan" + ) as _my_table: + print("Table clan:") + print(_my_table.df()) + print("\n") + with _client.execute_query( + "SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM genome" + ) as _my_table: + print("Table genome:") + print(_my_table.df()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## **2. Transforming the data after extract and before load**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Since dlt is a Python library, it gives you a lot of control over the extracted data. + + You can attach any number of transformations that are evaluated on an item-per-item basis to your resource. The available transformation types: + + * `map` - transform the data item (resource.add_map). + * `filter` - filter the data item (resource.add_filter). + * `yield map` - a map that returns an iterator (so a single row may generate many rows - resource.add_yield_map). + * `limit` - limits the number of records processed by a resource. Useful for testing or reducing data volume during development. + + For example, if we wanted to anonymize sensitive data before it is loaded into the destination, then we can write a python function for it and apply it to source or resource using the `.add_map()` method. + + [dlt documentation.](https://dlthub.com/docs/general-usage/resource#filter-transform-and-pivot-data) + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### Using `add_map`""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""In the table `clan`, we notice that there is a column `author` that we would like to anonymize.""" + ) + return + + +@app.cell +def _(pipeline_2): + with pipeline_2.sql_client() as _client: + with _client.execute_query( + "SELECT DISTINCT author FROM clan LIMIT 5" + ) as _my_table: + print("Table clan:") + print(_my_table.df()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""We write a function in python that anonymizes a string""") + return + + +@app.cell +def _(): + import hashlib + from dlt.common.typing import TDataItem + + def pseudonymize_name(row: TDataItem) -> TDataItem: + """ + Pseudonymization is a deterministic type of PII-obscuring. + Its role is to allow identifying users by their hash, + without revealing the underlying info. 
+ """ + # add a constant salt to generate + salt = "WI@N57%zZrmk#88c" + salted_string = row["author"] + salt + sh = hashlib.sha256() + sh.update(salted_string.encode()) + hashed_string = sh.digest().hex() + row["author"] = hashed_string + return row + return TDataItem, hashlib, pseudonymize_name + + +@app.cell +def _(dlt, pseudonymize_name, sql_database): + pipeline_3 = dlt.pipeline( + pipeline_name="sql_database_pipeline_anonymized", + destination="duckdb", + dataset_name="sql_data", + ) + _source = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", + table_names=["clan"], + ) + _source.clan.add_map(pseudonymize_name) + _info = pipeline_3.run(_source) + print(_info) + return (pipeline_3,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""After the pipeline has run, we can observe that the author column has been anonymized.""" + ) + return + + +@app.cell +def _(pipeline_3): + with pipeline_3.sql_client() as _client: + with _client.execute_query( + "SELECT DISTINCT author FROM clan LIMIT 5" + ) as _my_table: + print("Table clan:") + clan = _my_table.df() + clan + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""**Note:** If you're using the `pyarrow` or `connectorx` backend, the data is not processed item-by-item. Instead they're processed in batches, therefore your function should be adjusted. For example, for PyArrow chunks the function could be changed as follows:""" + ) + return + + +@app.cell +def _(dlt, hashlib, sql_database): + import pyarrow as pa + import pyarrow.compute as pc + + def pseudonymize_name_pyarrow(table: pa.Table) -> pa.Table: + """ + Pseudonymizes the 'author' column in a PyArrow Table. + """ + salt = "WI@N57%zZrmk#88c" + _df = table.to_pandas() + _df["author"] = ( + _df["author"] + .astype(str) + .apply(lambda x: hashlib.sha256((x + salt).encode()).hexdigest()) + ) + new_table = pa.Table.from_pandas(_df) + return new_table + + pipeline_4 = dlt.pipeline( + pipeline_name="sql_database_pipeline_anonymized1", + destination="duckdb", + dataset_name="sql_data", + ) + _source = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", + table_names=["clan"], + backend="pyarrow", + ) + _source.clan.add_map(pseudonymize_name_pyarrow) + _info = pipeline_4.run(_source) + print(_info) + return (pipeline_4,) + + +@app.cell +def _(pipeline_4): + with pipeline_4.sql_client() as _client: + with _client.execute_query( + "SELECT DISTINCT author FROM clan LIMIT 5" + ) as _my_table: + print("Table clan:") + print(_my_table.df()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### `add_map` vs `add_yield_map` + + The difference between `add_map` and `add_yield_map` matters when a transformation returns multiple records from a single input. + + #### **`add_map`** + - Use `add_map` when you want to transform each item into exactly one item. + - Think of it like modifying or enriching a row. + - You use a regular function that returns one modified item. + - Great for adding fields or changing structure. + + #### Example + """) + return + + +@app.cell +def _(TDataItem, dlt): + from dlt.common.typing import TDataItems + + @dlt.resource + def _resource() -> TDataItems: + yield [{"name": "Alice"}, {"name": "Bob"}] + + def add_greeting(item: TDataItem) -> TDataItem: + item["greeting"] = f"Hello, {item['name']}!" 
+ return item + + _resource.add_map(add_greeting) + for _row in _resource(): + print(_row) + return (TDataItems,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + #### **`add_yield_map`** + - Use `add_yield_map` when you want to turn one item into multiple items, or possibly no items. + - Your function is a generator that uses yield. + - Great for pivoting nested data, flattening lists, or filtering rows. + + #### Example + """) + return + + +@app.cell +def _(TDataItem, TDataItems, dlt): + @dlt.resource + def _resource() -> TDataItems: + yield [ + {"name": "Alice", "hobbies": ["reading", "chess"]}, + {"name": "Bob", "hobbies": ["cycling"]}, + ] + + def expand_hobbies(item: TDataItem) -> TDataItem: + for hobby in item["hobbies"]: + yield {"name": item["name"], "hobby": hobby} + + _resource.add_yield_map(expand_hobbies) + for row in _resource(): + print(row) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Using `add_filter` + `add_filter` function can be used similarly. The difference is that `add_filter` expects a function that returns a boolean value for each item. For example, to implement the same filtering we did with a query callback, we can use: + """) + return + + +@app.cell +def _(dlt, sql_database): + import time + + _source = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", + table_names=["genome"], + ) + pipeline_5 = dlt.pipeline( + pipeline_name="sql_database_pipeline_filtered", + destination="duckdb", + dataset_name="sql_data", + ) + _source.genome.add_filter(lambda item: item["kingdom"] == "bacteria") + pipeline_5.run(_source, write_disposition="replace") + print(pipeline_5.last_trace) + return (pipeline_5,) + + +@app.cell +def _(pipeline_5): + with pipeline_5.sql_client() as _client: + with _client.execute_query( + "SELECT COUNT(*) AS total_rows, MAX(_dlt_load_id) as latest_load_id FROM genome" + ) as _my_table: + print("Table genome:") + genome_count = _my_table.df() + genome_count + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Question 1: + + What is a `total_rows` in the example above? + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Using `add_limit` + + If your resource loads thousands of pages of data from a REST API or millions of rows from a database table, you may want to sample just a fragment of it in order to quickly see the dataset with example data and test your transformations, etc. + + To do this, you limit how many items will be yielded by a resource (or source) by calling the `add_limit` method. This method will close the generator that produces the data after the limit is reached. + """) + return + + +@app.cell +def _(dlt, sql_database): + _source = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", + table_names=["genome"], + chunk_size=10, + ) + pipeline_6 = dlt.pipeline( + pipeline_name="sql_database_pipeline_filtered", + destination="duckdb", + dataset_name="sql_data", + ) + _source.genome.add_limit(1) + pipeline_6.run(_source, write_disposition="replace") + print(pipeline_6.last_trace) + return (pipeline_6,) + + +@app.cell +def _(pipeline_6): + with pipeline_6.sql_client() as _client: + with _client.execute_query("SELECT * FROM genome") as _my_table: + genome_limited = _my_table.df() + genome_limited + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **3. 
Transforming data with `@dlt.transformer`** + + The main purpose of transformers is to create children tables with additional data requests, but they can also be used for data transformations especially if you want to keep the original data as well. + """) + return + + +@app.cell +def _(TDataItem, TDataItems, dlt, sql_database): + @dlt.transformer() + def batch_stats(items: TDataItems) -> TDataItem: + """ + Pseudonymization is a deterministic type of PII-obscuring. + Its role is to allow identifying users by their hash, + without revealing the underlying info. + """ + yield { + "batch_length": len(items), + "max_length": max([item["total_length"] for item in items]), + } + + genome_resource = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", chunk_size=10000 + ).genome + pipeline_7 = dlt.pipeline( + pipeline_name="sql_database_pipeline_with_transformers1", + destination="duckdb", + dataset_name="sql_data", + dev_mode=True, + ) + pipeline_7.run([genome_resource, genome_resource | batch_stats]) + print(pipeline_7.last_trace) # add a constant salt to generate + return (pipeline_7,) + + +@app.cell +def _(pipeline_7): + with pipeline_7.sql_client() as _client: + with _client.execute_query("SELECT * FROM batch_stats") as _my_table: + res = _my_table.df() + res + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **4. Transforming data after the load** + + Another possibility for data transformation is transforming data after the load. dlt provides several way of doing it: + + * using `sql_client`, + * via `.dataset()` and ibis integration, + * via [dbt integration](https://dlthub.com/docs/dlt-ecosystem/transformations/dbt/). + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### SQL client + + You already saw examples of using dlt's SQL client. dlt gives you an opportunity to connect to your destination and execute any SQL query. + """) + return + + +@app.cell +def _(pipeline_7): + with pipeline_7.sql_client() as _client: + _client.execute_sql( + "CREATE OR REPLACE TABLE genome_length AS SELECT SUM(total_length) AS total_total_length, AVG(total_length) AS average_total_length FROM genome" + ) + with _client.execute_query("SELECT * FROM genome_length") as _my_table: + genome_length = _my_table.df() + genome_length + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Accessing loaded data with `pipeline.dataset()` + + Use `pipeline.dataset()` to inspect and work with your data in Python after loading. 
+ """) + return + + +@app.cell +def _(pipeline_7): + dataset = pipeline_7.dataset() + # List tables + dataset.row_counts().df() + return (dataset,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Note that `row_counts` didn't return the new table `genome_length`,""" + ) + return + + +@app.cell +def _(dataset): + # Access as pandas + _df = dataset["genome"].df() + _df + return + + +@app.cell +def _(dataset): + # Access as Arrow + arrow_table = dataset["genome_length"].arrow() + arrow_table + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""You can also filter, limit, and select columns:""") + return + + +@app.cell +def _(dataset): + _df = dataset["genome"].select("kingdom", "ncbi_id").limit(10).df() + _df + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""To iterate over large data:""") + return + + +@app.cell +def _(dataset): + for chunk in dataset["genome"].iter_df(chunk_size=500): + print(chunk.head()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""For more advanced users, this interface supports **Ibis expressions**, joins, and subqueries.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Ibis integration + + Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/). + + [dlt provides a way to use Ibis expressions natively](https://dlthub.com/docs/general-usage/dataset-access/ibis-backend) with a lot of destinations. Supported ones are: + * Snowflake + * DuckDB + * MotherDuck + * Postgres + * Redshift + * Clickhouse + * MSSQL (including Synapse) + * BigQuery + """) + return + + +@app.cell +def _(pipeline_7): + # get the dataset from the pipeline + dataset_1 = pipeline_7.dataset() + dataset_name = pipeline_7.dataset_name + ibis_connection = dataset_1.ibis() + # get the native ibis connection from the dataset + print(ibis_connection.list_tables(database=dataset_name)) + table = ibis_connection.table("batch_stats", database=dataset_name) + # list all tables in the dataset + # NOTE: You need to provide the dataset name to ibis, in ibis datasets are named databases + # get the items table + # print the first 2 rows + print(table.limit(2).execute()) # # type: ignore[attr-defined] + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb b/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb index e2ec3d5a4..38eebf04a 100644 --- a/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb +++ b/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb @@ -6,7 +6,7 @@ "id": "_dbt9Ilnmktb" }, "source": [ - "# Merge and replace strategies & Advanced tricks [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [![GitHub 
badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)\n" + "# Merge and replace strategies & Advanced tricks [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)\n" ] }, { @@ -46,7 +46,7 @@ "\n", "\n", "\n", - "A `write_disposition` in `dlt` can specified in the resource decorator:\n", + "A `write_disposition` in `dlt` can be specified in the resource decorator:\n", "\n", "```python\n", "@dlt.resource(write_disposition=\"append\")\n", @@ -153,17 +153,17 @@ " - Append\n", " - Replace\n", " - Merge\n", - "- What incremental loading is.\n", + "- What incremental loading is\n", "\n", "**Now, we will cover** the different strategies for `merge` write disposition:\n", - "- `delete-insert` strategy.\n", - "- `upsert` strategy.\n", - "- `SCD2` strategy.\n", + "- `delete-insert` strategy\n", + "- `upsert` strategy\n", + "- `SCD2` strategy\n", "\n", - "We also will take a look at\n", - "* Hard deletes.\n", - "* Falling back for incremental cursors.\n", - "* Backfills." + "We will also take a look at:\n", + "* Hard deletes\n", + "* Falling back for incremental cursors\n", + "* Backfills" ] }, { @@ -258,9 +258,7 @@ "]\n", "\n", "\n", - "dlt.secrets[\n", - " \"destination.replace_strategy\"\n", - "] = \"truncate-and-insert\" # <--- set the replace strategy using TOML, ENVs or Python\n", + "dlt.secrets[\"destination.replace_strategy\"] = \"truncate-and-insert\"\n", "\n", "pipeline = dlt.pipeline(\n", " pipeline_name=\"pokemon_load_1\",\n", @@ -268,7 +266,7 @@ " dataset_name=\"pokemon_data_1\",\n", ")\n", "\n", - "load_info = pipeline.run(data, table_name=\"pokemon\", write_disposition=\"replace\")\n", + "pipeline.run(data, table_name=\"pokemon\", write_disposition=\"replace\")\n", "print(pipeline.last_trace)" ] }, @@ -350,9 +348,7 @@ "]\n", "\n", "\n", - "dlt.secrets[\n", - " \"destination.replace_strategy\"\n", - "] = \"insert-from-staging\" # <--- set the replace strategy using TOML, ENVs or Python\n", + "dlt.secrets[\"destination.replace_strategy\"] = \"insert-from-staging\"\n", "\n", "pipeline = dlt.pipeline(\n", " pipeline_name=\"pokemon_load_2\",\n", @@ -360,8 +356,7 @@ " dataset_name=\"pokemon_data_2\",\n", ")\n", "\n", - "load_info = pipeline.run(data, table_name=\"pokemon\", write_disposition=\"replace\")\n", - "\n", + "pipeline.run(data, table_name=\"pokemon\", write_disposition=\"replace\")\n", "print(pipeline.last_trace)" ] }, @@ -391,7 +386,7 @@ "\n", "In this example, the `insert-from-staging` strategy will load the pokemon data **into a staging table** in the `pokemon_data_2_staging` schema in DuckDB (or any other destination you choose). 
\n", "\n", - "Let's check the content of this table:" + "Let's check the contents of this table:" ] }, { @@ -558,7 +553,7 @@ " write_disposition=\"merge\",\n", " primary_key=\"id\",\n", ")\n", - "def pokemon() -> TDataItems:\n", + "def pokemon(data: TDataItems) -> TDataItems:\n", " yield data\n", "\n", "\n", @@ -568,7 +563,7 @@ " dataset_name=\"pokemon_data\",\n", ")\n", "\n", - "load_info = pipeline.run(pokemon)\n", + "load_info = pipeline.run(pokemon(data))\n", "print(load_info)\n", "\n", "# explore loaded data\n", @@ -645,7 +640,7 @@ " \"id\": \"25\",\n", " \"name\": \"pikachu\",\n", " \"size\": {\"weight\": 7.5, \"height\": 0.4},\n", - " }, # <--- Pikachu's weight has increased\n", + " },\n", "]" ] }, @@ -666,7 +661,7 @@ }, "outputs": [], "source": [ - "load_info = pipeline.run(pokemon)\n", + "load_info = pipeline.run(pokemon(data))\n", "print(load_info)\n", "\n", "# explore loaded data\n", @@ -729,7 +724,7 @@ "id": "S06hBVpXgmqF" }, "source": [ - "We see that only new row is in the staging table. Since we used primary key, `dlt` deleted the previous entry of Pikachu and then inserted the new one." + "We see that only the new row is in the staging table. Since we used primary key, `dlt` deleted the previous entry of Pikachu and then inserted the new one." ] }, { @@ -892,12 +887,12 @@ "@dlt.resource(\n", " name=\"pokemon\",\n", " write_disposition={\n", - " \"disposition\": \"merge\", # <--- specifies that existing data should be merged\n", - " \"strategy\": \"scd2\", # <--- enables SCD2 tracking, which keeps historical records of changes\n", + " \"disposition\": \"merge\",\n", + " \"strategy\": \"scd2\",\n", " },\n", " primary_key=\"id\",\n", ")\n", - "def pokemon() -> TDataItems:\n", + "def pokemon(data: TDataItems) -> TDataItems:\n", " yield data\n", "\n", "\n", @@ -908,7 +903,7 @@ ")\n", "\n", "\n", - "load_info = pipeline.run(pokemon)\n", + "load_info = pipeline.run(pokemon(data))\n", "print(load_info)" ] }, @@ -972,7 +967,7 @@ " \"id\": \"25\",\n", " \"name\": \"pikachu\",\n", " \"size\": {\"weight\": 6, \"height\": 0.4},\n", - " }, # <--- weight has changed back\n", + " },\n", "]" ] }, @@ -993,7 +988,7 @@ }, "outputs": [], "source": [ - "load_info = pipeline.run(pokemon)\n", + "load_info = pipeline.run(pokemon(data))\n", "print(load_info)" ] }, @@ -1075,19 +1070,19 @@ " \"name\": \"bulbasaur\",\n", " \"size\": {\"weight\": 6.9, \"height\": 0.7},\n", " \"deleted_flag\": True,\n", - " }, # <--- should be deleted\n", + " },\n", " {\n", " \"id\": \"4\",\n", " \"name\": \"charmander\",\n", " \"size\": {\"weight\": 8.5, \"height\": 0.6},\n", " \"deleted_flag\": None,\n", - " }, # <--- should be kept\n", + " },\n", " {\n", " \"id\": \"25\",\n", " \"name\": \"pikachu\",\n", " \"size\": {\"weight\": 6, \"height\": 0.4},\n", " \"deleted_flag\": False,\n", - " }, # <--- should be kept\n", + " },\n", "]" ] }, @@ -1106,9 +1101,9 @@ " name=\"pokemon\",\n", " write_disposition=\"merge\",\n", " primary_key=\"id\",\n", - " columns={\"deleted_flag\": {\"hard_delete\": True}}, # <--- set columns argument\n", + " columns={\"deleted_flag\": {\"hard_delete\": True}},\n", ")\n", - "def pokemon() -> TDataItems:\n", + "def pokemon(data: TDataItems) -> TDataItems:\n", " yield data\n", "\n", "\n", @@ -1119,7 +1114,7 @@ ")\n", "\n", "\n", - "load_info = pipeline.run(pokemon)\n", + "load_info = pipeline.run(pokemon(data))\n", "print(load_info)" ] }, @@ -1160,7 +1155,7 @@ " \"name\": \"pikachu\",\n", " \"size\": {\"weight\": 6, \"height\": 0.4},\n", " \"deleted_flag\": True,\n", - " }, # <--- set to True\n", 
+ " },\n", "]" ] }, @@ -1172,7 +1167,7 @@ }, "outputs": [], "source": [ - "load_info = pipeline.run(pokemon)\n", + "load_info = pipeline.run(pokemon(data))\n", "print(load_info)" ] }, @@ -1236,19 +1231,19 @@ " \"name\": \"pikachu\",\n", " \"size\": {\"weight\": 6, \"height\": 0.4},\n", " \"deleted_flag\": None,\n", - " }, # <--- will be filtered out\n", + " },\n", " {\n", " \"id\": \"25\",\n", " \"name\": \"pikachu\",\n", " \"size\": {\"weight\": 7, \"height\": 0.4},\n", " \"deleted_flag\": True,\n", - " }, # <--- will be removed\n", + " },\n", " {\n", " \"id\": \"25\",\n", " \"name\": \"pikachu\",\n", " \"size\": {\"weight\": 8, \"height\": 0.4},\n", " \"deleted_flag\": None,\n", - " }, # <--- will be loaded\n", + " },\n", "]" ] }, @@ -1279,9 +1274,9 @@ " columns={\n", " \"deleted_flag\": {\"hard_delete\": True},\n", " \"size__weight\": {\"dedup_sort\": \"desc\"},\n", - " }, # <-- desc means that the record with the highest value remains.\n", + " },\n", ")\n", - "def pokemon() -> TDataItems:\n", + "def pokemon(data: TDataItems) -> TDataItems:\n", " yield data\n", "\n", "\n", @@ -1292,7 +1287,7 @@ ")\n", "\n", "\n", - "load_info = pipeline.run(pokemon)\n", + "load_info = pipeline.run(pokemon(data))\n", "print(load_info)\n", "\n", "pipeline.dataset().pokemon.df()" @@ -1381,7 +1376,7 @@ " \"size\": {\"weight\": 6, \"height\": 0.4},\n", " \"created_at\": 3,\n", " \"updated_at\": None,\n", - " }, # <--- Incremental cursor is None\n", + " },\n", "]" ] }, @@ -1396,12 +1391,13 @@ "import dlt\n", "\n", "\n", - "@dlt.resource\n", + "@dlt.resource(name=\"pokemon\")\n", "def pokemon(\n", + " data: TDataItems,\n", " updated_at: dlt.sources.incremental[int] = dlt.sources.incremental(\n", " \"updated_at\", on_cursor_value_missing=\"include\"\n", - " )\n", - ") -> TDataItems: # <--- we want to include all data rows even if cursor is missing\n", + " ),\n", + ") -> TDataItems:\n", " yield data\n", "\n", "\n", @@ -1412,7 +1408,7 @@ ")\n", "\n", "\n", - "load_info = pipeline.run(pokemon)\n", + "load_info = pipeline.run(pokemon(data))\n", "print(load_info)\n", "\n", "pipeline.dataset().pokemon.df()" @@ -1474,7 +1470,7 @@ " \"size\": {\"weight\": 6, \"height\": 0.4},\n", " \"created_at\": 3,\n", " \"updated_at\": None,\n", - " }, # <--- Incremental cursor is None\n", + " },\n", "]" ] }, @@ -1488,6 +1484,7 @@ "source": [ "@dlt.resource\n", "def some_data(\n", + " data: TDataItems,\n", " updated_at: dlt.sources.incremental[int] = dlt.sources.incremental(\"updated_at\"),\n", ") -> TDataItems:\n", " yield data\n", @@ -1495,9 +1492,7 @@ "\n", "def set_default_updated_at(record: TDataItem) -> TDataItems:\n", " if record.get(\"updated_at\") is None:\n", - " record[\"updated_at\"] = record.get(\n", - " \"created_at\"\n", - " ) # <--- use 'created_at' instead of missing 'updated_at'\n", + " record[\"updated_at\"] = record.get(\"created_at\")\n", " return record" ] }, @@ -1510,7 +1505,7 @@ "outputs": [], "source": [ "# Modifies records before the incremental processing\n", - "with_default_values = some_data().add_map(set_default_updated_at, insert_at=1)" + "with_default_values = some_data(data).add_map(set_default_updated_at, insert_at=1)" ] }, { @@ -1542,7 +1537,7 @@ "outputs": [], "source": [ "# Removes records before the incremental processing\n", - "without_none = some_data().add_filter(\n", + "without_none = some_data(data).add_filter(\n", " lambda r: r.get(\"updated_at\") is not None, insert_at=1\n", ")" ] @@ -1641,9 +1636,10 @@ "\n", "@dlt.resource\n", "def some_data(\n", + " data: TDataItems,\n", " updated_at: 
dlt.sources.incremental[int] = dlt.sources.incremental(\n", " \"created_at\", initial_value=0, end_value=2\n", - " )\n", + " ),\n", ") -> TDataItems:\n", " yield data" ] @@ -1662,7 +1658,7 @@ " dataset_name=\"pokemon_inc_wd\",\n", ")\n", "\n", - "load_info = pipeline.run(some_data, table_name=\"pokemon\")\n", + "load_info = pipeline.run(some_data(data), table_name=\"pokemon\")\n", "print(load_info)\n", "\n", "pipeline.dataset().pokemon.df()" @@ -1752,7 +1748,7 @@ "continue_load_flag = True\n", "\n", "while continue_load_flag:\n", - " load_info = pipeline.run(source.genome.add_limit(10))\n", + " pipeline.run(source.genome.add_limit(10))\n", " continue_load_flag = (\n", " my_table_name in pipeline.last_trace.last_normalize_info.row_counts.keys()\n", " )\n", @@ -1772,17 +1768,8 @@ "id": "AH3F46PaJZe4" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1mC09rjkheo92-ycjjq0AlIzgwJC8-ZMX#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)!" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "K4smMmlfMysW" - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.py b/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.py new file mode 100644 index 000000000..e2a952bcc --- /dev/null +++ b/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.py @@ -0,0 +1,1379 @@ +# /// script +# dependencies = [ +# "dlt", +# "dlt[duckdb]", +# "numpy", +# "pandas", +# "pymysql", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""# Merge and replace strategies & Advanced tricks [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""# **Recap**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **`dlt` write dispositions** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Write disposition in the context of the dlt library defines how the data should be written to the destination. There are three types of write dispositions: + + * **Append**: This is the **default** disposition. It will append the data to the existing data in the destination. + + * **Replace**: This disposition replaces the data in the destination with the data from the resource. It **deletes** all the data and **recreates** the schema before loading the data. + + * **Merge**: This write disposition merges the data from the resource with the data at the destination. 
For the merge disposition, you need to specify a `primary_key` for the resource. + + The write disposition you choose depends on the dataset and how you can extract it. For more details, you can refer to the [Incremental loading page](https://dlthub.com/docs/general-usage/incremental-loading). + + + + A `write_disposition` in `dlt` can be specified in the resource decorator: + + ```python + @dlt.resource(write_disposition="append") + def my_resource(): + ... + yield data + ``` + + Or directly in the pipeline run: + + ```python + load_info = pipeline.run(my_resource, write_disposition="replace") + ``` + + > In case you specify both, the write disposition specified at the pipeline run level will override the write disposition specified at the resource level. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + ### **Replace** + + The `replace` strategy in the dlt library is used for **full loading** of data. This strategy completely overwrites the existing data with the new dataset. It's useful when you want to refresh the entire table with the latest data. It's important to note that this strategy technically does not load only new data but instead reloads all data: old and new. + + **Example: E-commerce Product Catalog refresh** + + - A large retailer (e.g., Amazon) needs to refresh their product catalog daily. + + - Using a replace strategy ensures all product details are up-to-date and no longer available products are removed. + + - Critical for ensuring customers see only valid products and prices. + + **Importance**: Guarantees consistency and accuracy in fast-changing datasets where full refreshes are simpler than merging updates. + + **Risks**: Data downtime if the refresh fails or takes too long. + + In dlt, you can control how the data is loaded into the destination table by setting the `write_disposition` parameter in the resource configuration. When you set the `write_disposition` to `replace`, it replaces the data in the destination table with the new data. + + For more details, you can refer to the following documentation pages: + + - [Full loading](https://dlthub.com/docs/general-usage/full-loading) + - [Write dispositions](https://dlthub.com/docs/general-usage/incremental-loading#the-3-write-dispositions) + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **Merge** + + Consider a scenario where the data in the source has been updated, but you want to avoid reloading the entire dataset. + + **Example: Customer data integration (e.g., Salesforce CRM)** + + - A business integrates Salesforce CRM data with its data warehouse. + + - Sales representatives continuously update customer profiles. A merge strategy ensures that only the changed records are updated without affecting the entire dataset. + + - Useful for integrating various CRM systems where incremental updates are preferred over full reloads. + + Merge write disposition is used to merge new data into the destination, using a `merge_key` and/or **deduplicating**/**upserting** new data using a `primary_key`. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The **merge** write disposition can be useful in several situations: + + 1. If you have a dataset where records are frequently updated and you want to reflect these changes in your database, the merge write disposition can be used. It will **update the existing records** with the new data instead of creating duplicate entries. + + 2. 
If your data source occasionally sends **duplicate records**, the merge write disposition can help handle this. It uses a `primary_key` to identify unique records, so if a duplicate record (with the same `primary_key`) is encountered, it will be merged with the existing record instead of creating a new one. + + 3. If you are dealing with **Slowly Changing Dimensions** (SCD) where the attribute of a record changes over time and you want to maintain a history of these changes, you can use the merge write disposition with the scd2 strategy. + + + When using the merge disposition, you need to specify a `primary_key` or `merge_key` for the resource. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + # **More about write dispositions and incremental loading** ⚙️🧠 + + **In the dlt Fundamentals course we've already discussed:** + - `dlt` write dispositions: + - Append + - Replace + - Merge + - What incremental loading is + + **Now, we will cover** the different strategies for `merge` write disposition: + - `delete-insert` strategy + - `upsert` strategy + - `SCD2` strategy + + We will also take a look at: + * Hard deletes + * Falling back for incremental cursors + * Backfills + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + ## **Replace strategies** + + In this lesson, we will explore the concept of **full loading**, where we completely reload the data of our tables, removing all existing data and replacing it with new data from our source. + + + We will also delve into the different replace strategies that dlt implements for doing a full load on your table: + - `truncate-and-insert`, + - `insert-from-staging`, + - `staging-optimized`. + + + + Each of these strategies has its own unique characteristics and use cases, and we will discuss them in detail. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **I. Truncate-and-Insert Strategy** + + **Overview** + + The `truncate-and-insert` strategy is the **default** replace strategy in dlt and is the **fastest** of all three strategies. This strategy is particularly useful when you want to completely refresh your data and you don't need to maintain the existing data in your tables during the load process. + + When you load data with the `truncate-and-insert` strategy, the destination tables will be truncated at the beginning of the load. This means that all existing data in the tables will be removed. After truncating the tables, the new data will be inserted. The insertion of new data happens consecutively but not within the same transaction. + + **Example: Daily ETL job for financial reports (e.g., Bloomberg Terminal Data)** + + - Daily financial summaries are generated and processed overnight. + + - Using `truncate-and-insert`, the pipeline ensures that analysts work with the most recent data every morning. + + **Configuration** + + You can select the `truncate-and-insert` strategy with a setting in your `config.toml` file. + + ```yaml + [destination] + replace_strategy = "truncate-and-insert" + ``` + + **Limitations** + + However, it's important to note that the **downside** of this strategy is that your **tables will have no data for a while** until the load is completed. You may end up with new data in some tables and no data in other tables if the load fails during the run. Such an incomplete load may be detected by checking if the `_dlt_loads` table contains a load id from `_dlt_load_id` of the replaced tables. 
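+    A minimal sketch of that check, assuming the replaced table is called `pokemon` as in the examples below (dlt's `_dlt_loads` table exposes `load_id` and `status` columns):
+
+    ```python
+    with pipeline.sql_client() as client:
+        with client.execute_query(
+            "SELECT load_id, status FROM _dlt_loads "
+            "WHERE load_id IN (SELECT DISTINCT _dlt_load_id FROM pokemon)"
+        ) as cur:
+            # If the table's latest _dlt_load_id has no matching row here,
+            # the replace did not complete
+            print(cur.df())
+    ```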
If you prefer to have no data downtime, please use one of the other strategies. + + + Here's an example of how to use the `truncate-and-insert` strategy with the Pokemon data: + """) + return + + +@app.cell +def _(): + from typing import List, Dict, Any + import dlt + from datetime import datetime + + data: List[Dict[str, Any]] = [ + {"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}}, + {"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}}, + {"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}}, + ] + + dlt.secrets["destination.replace_strategy"] = "truncate-and-insert" + + pipeline = dlt.pipeline( + pipeline_name="pokemon_load_1", + destination="duckdb", + dataset_name="pokemon_data_1", + ) + + pipeline.run(data, table_name="pokemon", write_disposition="replace") + print(pipeline.last_trace) + return dlt, pipeline + + +@app.cell +def _(pipeline): + with pipeline.sql_client() as _client: + with _client.execute_query("SHOW ALL TABLES") as _table: + _tables = _table.df() + _tables + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""In this example, we're using the `replace_strategy="truncate-and-insert"` parameter in the pipeline method to indicate that we want to use the `truncate-and-insert` strategy for replacing data.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **II. Insert-from-staging Strategy** + + **Overview** + + The `insert-from-staging` strategy is used when you want to maintain a consistent state between nested and root tables at all times, with zero downtime. This strategy loads all new data into staging tables away from your final destination tables and then truncates and inserts the new data in one transaction. + + **Example: Airline reservation systems (e.g., Amadeus, Sabre)** + + - Ensuring that updated flight availability information doesn't interrupt user queries during ingestion. + + - Data is first written to staging tables and only swapped to production tables when the operation is complete. + + **Configuration** + + You can select the `insert-from-staging` strategy with a setting in your `config.toml` file. If you do not select a strategy, dlt will default to `truncate-and-insert`. + + ```yaml + [destination] + replace_strategy = "insert-from-staging" + ``` + + **Limitations** + + The `insert-from-staging` strategy, while ensuring zero downtime and maintaining a consistent state between nested and root tables, is **the slowest** of all three strategies. It loads all new data into staging tables away from your final destination tables and then truncates and inserts the new data in one transaction. This process can be time-consuming, especially for large datasets. 
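+    Besides `config.toml`, the same setting can be provided in code or via an environment variable; the code cells in this lesson do exactly that with `dlt.secrets`. A sketch (the environment-variable spelling follows dlt's usual `SECTION__KEY` convention):
+
+    ```python
+    import dlt
+
+    # set the replace strategy programmatically, as the cells below do
+    dlt.secrets["destination.replace_strategy"] = "insert-from-staging"
+
+    # or export it before running the pipeline, e.g. in your shell:
+    #   export DESTINATION__REPLACE_STRATEGY="insert-from-staging"
+    ```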
+ + Here's an example of how you can use this strategy: + """) + return + + +@app.cell +def _(dlt): + data_1 = [ + {"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}}, + {"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}}, + {"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}}, + ] + dlt.secrets["destination.replace_strategy"] = "insert-from-staging" + pipeline_1 = dlt.pipeline( + pipeline_name="pokemon_load_2", + destination="duckdb", + dataset_name="pokemon_data_2", + ) + pipeline_1.run(data_1, table_name="pokemon", write_disposition="replace") + print(pipeline_1.last_trace) + return (pipeline_1,) + + +@app.cell +def _(pipeline_1): + with pipeline_1.sql_client() as _client: + with _client.execute_query("SHOW ALL TABLES") as _table: + _tables = _table.df() + _tables + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + We see the introduction of the [staging](https://dlthub.com/docs/dlt-ecosystem/staging) schema called `pokemon_data_2_staging`. + + + In this example, the `insert-from-staging` strategy will load the pokemon data **into a staging table** in the `pokemon_data_2_staging` schema in DuckDB (or any other destination you choose). + + Let's check the contents of this table: + """) + return + + +@app.cell +def _(pipeline_1): + with pipeline_1.sql_client() as _client: + with _client.execute_query( + "SELECT * from pokemon_data_2_staging.pokemon" + ) as _table: + _tables = _table.df() + _tables + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""We see that the staging table contains all the data we loaded.""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + dlt will then **truncate** the destination table and **insert** the new data in one transaction, ensuring that the destination dataset is always in a consistent state. + + For more details about the `insert-from-staging` strategy, you can refer to the [dlt documentation.](https://dlthub.com/docs/general-usage/full-loading#the-insert-from-staging-strategy) + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **III. Staging-optimized Strategy** + + + The `staging-optimized` replace strategy is one of the three strategies implemented by dlt for doing a full load on your table. + + **Overview** + + The `staging-optimized` strategy **combines the benefits** of the `insert-from-staging` strategy with certain optimizations for **faster** loading on some destinations. However, it comes with a **trade-off**: destination tables may be dropped and recreated in some cases. This means that any views or other constraints you have placed on those tables will be dropped with the table. + + If you have a setup where you need to retain your destination tables, you should not use the `staging-optimized` strategy. On the other hand, if you do not care about tables being dropped but need the benefits of the `insert-from-staging` with some performance (and cost) saving opportunities, this strategy is a good choice. + + **Example: Data warehousing for Business Intelligence (e.g., Snowflake, BigQuery)** + + - When refreshing tables with daily marketing analytics, staging-optimized strategy uses clone operations. + + - Clone operations in platforms like Snowflake are fast and cost-effective since they avoid data copying. 
+ + **How it works** + + The `staging-optimized` strategy behaves differently across destinations: + + - **Postgres**: After loading the new data into the staging tables, the destination tables will be dropped and replaced by the staging tables. No data needs to be moved, so this strategy is almost as fast as `truncate-and-insert`. + + - **BigQuery**: After loading the new data into the staging tables, the destination tables will be dropped and recreated with a clone command from the staging tables. This is a low-cost and fast way to create a second independent table from the data of another. You can learn more about table cloning on BigQuery [here](https://dlthub.com/docs/dlt-ecosystem/destinations/bigquery). + + - **Snowflake**: After loading the new data into the staging tables, the destination tables will be dropped and recreated with a clone command from the staging tables. This is a low-cost and fast way to create a second independent table from the data of another. You can learn more about table cloning on Snowflake [here](https://dlthub.com/docs/dlt-ecosystem/destinations/snowflake). + + - For all **other destinations**, please look at their respective [documentation](https://dlthub.com/docs/dlt-ecosystem/destinations/) pages to see if and how the `staging-optimized` strategy is implemented. If it is not implemented, dlt will fall back to the `insert-from-staging` strategy. + + **Configuration** + + You can select the `staging-optimized` strategy with a setting in your `config.toml` file. If you do not select a strategy, dlt will default to `truncate-and-insert`. + + ```yaml + [destination] + # Set the optimized replace strategy + replace_strategy = "staging-optimized" + ``` + + **Limitations** + + It's important to note that the `staging-optimized` replace strategy is **not implemented for all destinations**. For example, DuckDB doesn't support this strategy, that's why we skip the code example. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Merge strategies** + + Append and replace write dispositions are quite simple to use, but with `merge` you need to be more careful. 
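+    In particular, `merge` relies on key hints. A minimal sketch of what a merge resource typically looks like (the resource, key, and field names below are placeholders):
+
+    ```python
+    @dlt.resource(
+        name="my_table",
+        write_disposition={"disposition": "merge", "strategy": "delete-insert"},
+        primary_key="id",          # deduplicates and matches records
+        # merge_key="batch_day",   # optionally delete whole partitions before insert
+    )
+    def my_table_resource():
+        yield [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]
+    ```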
+ + Let's create an example database + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Let's remember our Pokemon data sample from the dlt Fundamentals course:""" + ) + return + + +@app.cell +def _(): + # Sample data containing pokemon details + data_2 = [ + {"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}}, + {"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}}, + {"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}}, + ] + return (data_2,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Load this data into duckdb with merge write disposition.""") + return + + +@app.cell +def _(data_2, dlt): + from dlt.common.typing import TDataItems, TDataItem + + @dlt.resource(name="pokemon", write_disposition="merge", primary_key="id") + def pokemon(data: TDataItems) -> TDataItems: + yield data + + pipeline_2 = dlt.pipeline( + pipeline_name="poke_pipeline_merge", + destination="duckdb", + dataset_name="pokemon_data", + ) + _load_info = pipeline_2.run(pokemon(data_2)) + print(_load_info) + pipeline_2.dataset().pokemon.df() + return TDataItem, TDataItems, pipeline_2, pokemon + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The merge write disposition can be used with three different strategies: + + * delete-insert (default strategy) + * scd2 + * upsert + + + Let's explore these strategies closer. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **I. `delete-insert` strategy**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Overview** + + The `merge` write disposition has `delete-insert` as the default strategy. Since we haven't specified a strategy in the previous example, this is what was used by default under the hood. + + The `delete-insert` strategy loads data to a **`staging`** dataset, deduplicates the `staging` data if a `primary_key` is provided, **deletes** the data from the destination using `merge_key` and `primary_key`, and then **inserts** the new records. + + > The `merge_key` is used in the `delete-insert` strategy to determine which records to delete from the destination before inserting the new records. + + **Example: Streaming analytics (e.g., Kafka → Data Warehouse)** + + - Streaming logs are ingested with a `delete-insert` strategy to remove outdated entries and ensure only fresh data remains. + + - Used when a `merge_key` is provided, allowing old entries to be purged before new ones are inserted. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Imagine that we want to load only updated data:""") + return + + +@app.cell +def _(): + # Sample data containing pokemon details + data_3 = [{"id": "25", "name": "pikachu", "size": {"weight": 7.5, "height": 0.4}}] + return (data_3,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Run the pipeline again:""") + return + + +@app.cell +def _(data_3, pipeline_2, pokemon): + _load_info = pipeline_2.run(pokemon(data_3)) + print(_load_info) + pipeline_2.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Data was updated, pikachu data has changed, now he has a different `_dlt_load_id`. 
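+    One quick way to confirm this is to compare the load id stored on each row (a sketch against the `pipeline_2` dataset created above):
+
+    ```python
+    with pipeline_2.sql_client() as client:
+        with client.execute_query(
+            "SELECT id, name, _dlt_load_id FROM pokemon ORDER BY id"
+        ) as cursor:
+            print(cursor.df())  # pikachu carries the newer _dlt_load_id
+    ```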
+ + Let's check what happened in the database in the previous run: + """) + return + + +@app.cell +def _(pipeline_2): + with pipeline_2.sql_client() as _client: + with _client.execute_query("SHOW ALL TABLES") as _table: + _tables = _table.df() + _tables + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""We see agian the staging schema called `pokemon_data_staging`. Let's check the content:""" + ) + return + + +@app.cell +def _(pipeline_2): + with pipeline_2.sql_client() as _client: + with _client.execute_query( + "SELECT * from pokemon_data_staging.pokemon" + ) as _table: + _tables = _table.df() + _tables + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""We see that only the new row is in the staging table. Since we used primary key, `dlt` deleted the previous entry of Pikachu and then inserted the new one.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **II. `upsert` strategy**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Overview** + + The upsert merge strategy does `primary_key` based upserts: + + - update record if key exists in target table + - insert record if key does not exist in target table + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ``` + @dlt.resource( + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key="my_primary_key" + ) + def my_upsert_resource(): + ... + ... + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Difference between upsert and delete-insert** + + 1. needs a `primary_key` + 2. expects this `primary_key` to be unique (`dlt` does not deduplicate) + 3. does not support `merge_key` + 4. uses MERGE or UPDATE operations to process updates + + + **Example: Customer data management (e.g., HubSpot, Salesforce)** + + - Continuous synchronization of customer profiles across multiple systems. + + - Any update to an existing customer is reflected without deleting unrelated data. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""***Not supported in DuckDB.** List of supported destinations can be found in [docs](https://dlthub.com/docs/general-usage/incremental-loading#upsert-strategy).""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **III. `SCD2` strategy**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Overview** + + `dlt` can create Slowly Changing Dimensions Type 2 (SCD2) destination tables for dimension tables that change in the source. + + The resource is expected to provide a full extract of the source table each run. + + A row hash is stored in `_dlt_id` and used as surrogate key to identify source records that have been inserted, updated, or deleted. + + **Example: Financial transaction systems (e.g., Mastercard, Visa)** + + - Keeping history of account balances over time for auditing purposes. + + - Allows analysts to trace how data evolved, which is critical for compliance and troubleshooting. 
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Before running the pipeline, let's re-use our small Pokemon dataset:""" + ) + return + + +@app.cell +def _(): + data_4 = [ + {"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}}, + {"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}}, + {"id": "25", "name": "pikachu", "size": {"weight": 7.5, "height": 0.4}}, + ] + return (data_4,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Now, run the pipeline with merge disposition and SCD2 strategy:""") + return + + +@app.cell +def _(TDataItems, data_4, dlt): + @dlt.resource( + name="pokemon", + write_disposition={"disposition": "merge", "strategy": "scd2"}, + primary_key="id", + ) + def pokemon_1(data: TDataItems) -> TDataItems: + yield data + + pipeline_3 = dlt.pipeline( + pipeline_name="pokemon_pipeline", + destination="duckdb", + dataset_name="pokemon_scd2", + ) + _load_info = pipeline_3.run(pokemon_1(data_4)) + print(_load_info) + return pipeline_3, pokemon_1 + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Check what happened:""") + return + + +@app.cell +def _(pipeline_3): + # explore loaded data + pipeline_3.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + New columns were created: + + - `_dlt_valid_from` – The timestamp when this record was first inserted into the table. + - All records have the same value, which is when the pipeline first processed them. + + - `_dlt_valid_to` – The timestamp when this record was considered outdated. + - NaT (Not a Time) means that these records are currently active and have not been superseded by newer versions. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Modify the dataset by changing Pikachu weight again. This simulates a change in source data that should be tracked by SCD2:""" + ) + return + + +@app.cell +def _(): + data_5 = [ + {"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}}, + {"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}}, + {"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}}, + ] + return (data_5,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Run the pipeline again with the modified dataset:""") + return + + +@app.cell +def _(data_5, pipeline_3, pokemon_1): + _load_info = pipeline_3.run(pokemon_1(data_5)) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Check the database:""") + return + + +@app.cell +def _(pipeline_3): + pipeline_3.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""SCD2 created a new row for Pikachu with updated `size_weight` to 6.0 while keeping the historical record.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Hard-deletes** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The `hard_delete` column hint can be used to delete records from the destination dataset. The behavior of the delete mechanism depends on the data type of the column marked with the hint: + + * `bool` type: only `True` leads to a delete, `None` and `False` values are disregarded. + * Other types: each not `None` value leads to a delete. + + Each record in the destination table with the same `primary_key` or `merge_key` as a record in the source dataset that's marked as a delete will be deleted. + + Deletes are propagated to any nested table that might exist. 
For each record that gets deleted in the root table, all corresponding records in the nested table(s) will also be deleted. + + **Example: User account deletion (GDPR Compliance)** + + - An online social platform (e.g., Instagram, Facebook) allows users to permanently delete their accounts. + + - When a user requests account deletion, their data must be removed from the production dataset to comply with GDPR or CCPA requirements. + + - By marking records with a `deleted_flag = True`, the system ensures the user’s data is completely removed from the production tables during the next load operation. + """) + return + + +@app.cell +def _(): + data_6 = [ + { + "id": "1", + "name": "bulbasaur", + "size": {"weight": 6.9, "height": 0.7}, + "deleted_flag": True, + }, + { + "id": "4", + "name": "charmander", + "size": {"weight": 8.5, "height": 0.6}, + "deleted_flag": None, + }, + { + "id": "25", + "name": "pikachu", + "size": {"weight": 6, "height": 0.4}, + "deleted_flag": False, + }, + ] + return (data_6,) + + +@app.cell +def _(TDataItems, data_6, dlt): + @dlt.resource( + name="pokemon", + write_disposition="merge", + primary_key="id", + columns={"deleted_flag": {"hard_delete": True}}, + ) + def pokemon_2(data: TDataItems) -> TDataItems: + yield data + + pipeline_4 = dlt.pipeline( + pipeline_name="pokemon_pipeline", + destination="duckdb", + dataset_name="pokemon_hd", + ) + _load_info = pipeline_4.run(pokemon_2(data_6)) + print(_load_info) + return pipeline_4, pokemon_2 + + +@app.cell +def _(pipeline_4): + pipeline_4.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Bulbasaur wasn't loaded at all. + + Let's see if can remove data from loaded data: + """) + return + + +@app.cell +def _(): + data_7 = [ + { + "id": "25", + "name": "pikachu", + "size": {"weight": 6, "height": 0.4}, + "deleted_flag": True, + } + ] + return (data_7,) + + +@app.cell +def _(data_7, pipeline_4, pokemon_2): + _load_info = pipeline_4.run(pokemon_2(data_7)) + print(_load_info) + return + + +@app.cell +def _(pipeline_4): + pipeline_4.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Pikachu record was deleted from loaded data.""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + #### **Deduplication** + + By default, `primary_key` deduplication is arbitrary. You can pass the `dedup_sort` column hint with a value of `desc` or `asc` to influence which record remains after deduplication. + + - Using `desc`, the records sharing the same `primary_key` are sorted in **descending** order before deduplication, making sure the record with the highest value for the column with the `dedup_sort` hint remains. + + - `asc` has the opposite behavior. + + + **Example: Email marketing platforms (e.g., Mailchimp, SendGrid)** + + - Users may accidentally submit the same email address multiple times during a signup process. + + - When ingesting these signups, using deduplication ensures that only unique email addresses are retained. + + - The `dedup_sort` hint allows prioritization of the latest record. + + The example data below contains three rows of information about Pikachu. 
+ """) + return + + +@app.cell +def _(): + data_8 = [ + { + "id": "25", + "name": "pikachu", + "size": {"weight": 6, "height": 0.4}, + "deleted_flag": None, + }, + { + "id": "25", + "name": "pikachu", + "size": {"weight": 7, "height": 0.4}, + "deleted_flag": True, + }, + { + "id": "25", + "name": "pikachu", + "size": {"weight": 8, "height": 0.4}, + "deleted_flag": None, + }, + ] + return (data_8,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""This will insert one record (the one with size__weight = 8).""") + return + + +@app.cell +def _(TDataItems, data_8, dlt): + @dlt.resource( + name="pokemon", + write_disposition="merge", + primary_key="id", + columns={ + "deleted_flag": {"hard_delete": True}, + "size__weight": {"dedup_sort": "desc"}, + }, + ) + def pokemon_3(data: TDataItems) -> TDataItems: + yield data + + pipeline_5 = dlt.pipeline( + pipeline_name="pokemon_pipeline", + destination="duckdb", + dataset_name="pokemon_hd", + ) + _load_info = pipeline_5.run(pokemon_3(data_8)) + print(_load_info) + pipeline_5.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""The row with the largest value of "size__weight" 8.0 remains.""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Missing incremental cursor path** + + You can customize the incremental processing of dlt by setting the parameter `on_cursor_value_missing`. + + When loading incrementally with the default settings, there are two assumptions: + + * Each row contains the cursor path. + * Each row is expected to contain a value at the cursor path that is not `None`. + + **Example: IoT device data ingestion (e.g., Smart Homes)** + + - IoT devices (e.g., thermostats, cameras) send data continuously. + + - Due to network failures or device malfunctions, some records may lack timestamps or have None as their cursor value. + + - Using `on_cursor_value_missing="include"` ensures that such data is not discarded by default, allowing for later inspection and processing. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + To process a data set where some records **do not include the incremental cursor path** or where the values at the cursor path are **None**, there are the following four options: + + * Configure the incremental load to **raise** an exception in case there is a row where the cursor path is missing or has the value `None` using + - `incremental(..., on_cursor_value_missing="raise")`. + + - This is the **default** behavior. + * Configure the incremental load to **tolerate** the missing cursor path and `None` values using + - `incremental(..., on_cursor_value_missing="include")`. + * Configure the incremental load to **exclude** the missing cursor path and `None` values using + - `incremental(..., on_cursor_value_missing="exclude")`. 
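+    For comparison, a minimal sketch of the `exclude` variant, which silently drops rows without a usable cursor value (the full, runnable `include` example follows right below):
+
+    ```python
+    @dlt.resource(name="pokemon")
+    def pokemon_exclude_missing(
+        data: TDataItems,
+        updated_at: dlt.sources.incremental[int] = dlt.sources.incremental(
+            "updated_at", on_cursor_value_missing="exclude"
+        ),
+    ) -> TDataItems:
+        yield data  # rows where "updated_at" is missing or None are dropped
+    ```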
+ + Here is an example of including rows where the **incremental cursor value** is **missing** or **None**: + """) + return + + +@app.cell +def _(): + data_9 = [ + { + "id": "1", + "name": "bulbasaur", + "size": {"weight": 6.9, "height": 0.7}, + "created_at": 1, + "updated_at": 1, + }, + { + "id": "4", + "name": "charmander", + "size": {"weight": 8.5, "height": 0.6}, + "created_at": 2, + "updated_at": 2, + }, + { + "id": "25", + "name": "pikachu", + "size": {"weight": 6, "height": 0.4}, + "created_at": 3, + "updated_at": None, + }, + ] + return (data_9,) + + +@app.cell +def _(TDataItems, data_9, dlt): + @dlt.resource(name="pokemon") + def pokemon_4( + data: TDataItems, + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", on_cursor_value_missing="include" + ), + ) -> TDataItems: + yield data + + pipeline_6 = dlt.pipeline( + pipeline_name="pokemon_pipeline", + destination="duckdb", + dataset_name="pokemon_inc", + ) + _load_info = pipeline_6.run(pokemon_4(data_9)) + print(_load_info) + pipeline_6.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You can also define a [fall back column](https://dlthub.com/docs/devel/general-usage/incremental-loading#transform-records-before-incremental-processing) for an incremental cursor, as described below. + + ## **Transform records before incremental processing** + + If you want to load data that includes `None` values, you can transform the records before the incremental processing. + You can add steps to the pipeline that [filter, transform, or pivot your data](https://dlthub.com/docs/devel/general-usage/resource#filter-transform-and-pivot-data). + + In the following example + - the step of data yielding is at `index = 0`, + - the custom transformation at `index = 1`, + - and the incremental processing at `index = 2`. + + + > **Caution!** + > + >It is important to set the `insert_at` parameter of the `add_map` function to control the order of execution and ensure that your custom steps are executed before the incremental processing starts. + + + See below how you can modify rows before the incremental processing using `add_map()` and filter rows using `add_filter()`. 
+ """) + return + + +@app.cell +def _(): + data_10 = [ + { + "id": "1", + "name": "bulbasaur", + "size": {"weight": 6.9, "height": 0.7}, + "created_at": 1, + "updated_at": 1, + }, + { + "id": "4", + "name": "charmander", + "size": {"weight": 8.5, "height": 0.6}, + "created_at": 2, + "updated_at": 2, + }, + { + "id": "25", + "name": "pikachu", + "size": {"weight": 6, "height": 0.4}, + "created_at": 3, + "updated_at": None, + }, + ] + return (data_10,) + + +@app.cell +def _(TDataItem, TDataItems, dlt): + @dlt.resource + def some_data( + data: TDataItems, + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at" + ), + ) -> TDataItems: + yield data + + def set_default_updated_at(record: TDataItem) -> TDataItems: + if record.get("updated_at") is None: + record["updated_at"] = record.get("created_at") + return record + return set_default_updated_at, some_data + + +@app.cell +def _(data_10, set_default_updated_at, some_data): + # Modifies records before the incremental processing + with_default_values = some_data(data_10).add_map( + set_default_updated_at, insert_at=1 + ) + return (with_default_values,) + + +@app.cell +def _(dlt, with_default_values): + pipeline_7 = dlt.pipeline( + pipeline_name="pokemon_pipeline_wd", + destination="duckdb", + dataset_name="pokemon_inc_wd", + ) + _load_info = pipeline_7.run(with_default_values, table_name="pokemon") + print(_load_info) + pipeline_7.dataset().pokemon.df() + return + + +@app.cell +def _(data_10, some_data): + # Removes records before the incremental processing + without_none = some_data(data_10).add_filter( + lambda r: r.get("updated_at") is not None, insert_at=1 + ) + return (without_none,) + + +@app.cell +def _(dlt, without_none): + pipeline_8 = dlt.pipeline( + pipeline_name="pokemon_pipeline_wn", + destination="duckdb", + dataset_name="pokemon_inc_wn", + ) + _load_info = pipeline_8.run(without_none, table_name="pokemon") + print(_load_info) + pipeline_8.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Backfilling** + + ### Using `end_value` for backfill + + You can specify both initial and end dates when defining incremental loading. Let's go back to our Pokemon example: + """) + return + + +@app.cell +def _(): + data_11 = [ + { + "id": "1", + "name": "bulbasaur", + "size": {"weight": 6.9, "height": 0.7}, + "created_at": 1, + "updated_at": 1, + }, + { + "id": "4", + "name": "charmander", + "size": {"weight": 8.5, "height": 0.6}, + "created_at": 2, + "updated_at": 2, + }, + { + "id": "25", + "name": "pikachu", + "size": {"weight": 6, "height": 0.4}, + "created_at": 3, + "updated_at": 3, + }, + ] + return (data_11,) + + +@app.cell +def _(TDataItems, dlt): + @dlt.resource + def some_data_1( + data: TDataItems, + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "created_at", initial_value=0, end_value=2 + ), + ) -> TDataItems: + yield data + return (some_data_1,) + + +@app.cell +def _(data_11, dlt, some_data_1): + pipeline_9 = dlt.pipeline( + pipeline_name="pokemon_pipeline_wd", + destination="duckdb", + dataset_name="pokemon_inc_wd", + ) + _load_info = pipeline_9.run(some_data_1(data_11), table_name="pokemon") + print(_load_info) + pipeline_9.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Above, we use the `initial_value` and `end_value` arguments of the `incremental` to define the range of issues that we want to retrieve + and pass this range to the Github API (`since` and `until`). 
As in the examples above, `dlt` will make sure that only the issues from + the defined range are returned. + + Please note that when `end_date` is specified, `dlt` **will not modify the existing incremental state**. The backfill is **stateless** and: + 1. You can run backfill and incremental load in parallel (i.e., in an Airflow DAG) in a single pipeline. + 2. You can partition your backfill into several smaller chunks and run them in parallel as well. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Note that dlt's incremental filtering considers the ranges half-closed. `initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps. This behaviour can be changed with the `range_start` (default `"closed"`) and `range_end` (default `"open"`) arguments.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""## **Load a large dataset using incremental loading and add_limits**""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Specifically for the `sql_database` source you can utilize another possible approach - load data in fixed chunks using `chunk_size` parameter.""" + ) + return + + +@app.cell +def _(dlt): + from dlt.sources.sql_database import sql_database + + source = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", chunk_size=1000 + ).with_resources("genome") + source.genome.apply_hints( + incremental=dlt.sources.incremental("updated", row_order="asc") + ) + pipeline_10 = dlt.pipeline( + pipeline_name="sql_database_pipeline", + destination="duckdb", + dataset_name="sql_data", + ) + my_table_name = "genome" + continue_load_flag = True + while continue_load_flag: + pipeline_10.run(source.genome.add_limit(10)) + continue_load_flag = ( + my_table_name + in pipeline_10.last_trace.last_normalize_info.row_counts.keys() + ) + print(pipeline_10.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_6_Write_disposition_strategies_%26_Advanced_tricks_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_6_Write_disposition_strategies_%26_Advanced_tricks_img1.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb b/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb index 696f3d011..c18582a51 100644 --- a/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb +++ b/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb @@ -6,7 +6,7 @@ "id": "Wat0fkM3BHwn" }, "source": [ - "# **Introduction** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)\n", + "# **Introduction** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.py) [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb)\n", "\n", "`dlt` offers powerful tools for schema configuration, giving you control over your data processing. You can export and import schemas for easy adjustments and apply specific settings directly to resources for precise data normalization. Plus, you can set data contracts to ensure your data meets your expectations... 👀\n" ] @@ -35,7 +35,7 @@ "source": [ "When you run a pipeline, `dlt` internally generates a `<>.schema.json` file. You can export this file to a specific location in YAML format by specifying `export_schema_path=\"schemas/export\"` in your pipeline.\n", "\n", - "See [dlt Fundamentals: Lesson 7](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r#forceEdit=true&sandboxMode=true)\n" + "See [dlt Fundamentals: Lesson 7](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)\n" ] }, { @@ -167,18 +167,9 @@ "\n", "Data contracts are rules that help control how your data schema changes over time. They are particularly useful for maintaining the integrity and consistency of your data as it evolves.\n", "\n", - "`dlt` allows you to implement these data contracts at various levels, including the [table level](#scrollTo=zzVNMHgqNEYr), [column level](#scrollTo=Bq_9SNOMQGk_), and [data type level](#scrollTo=H9eMPvlOQHrJ). This provides granular control over how different parts of your schema evolve.\n", + "`dlt` allows you to implement these data contracts at various levels, including the `table level`, `column level`, and `data type level`. This provides granular control over how different parts of your schema evolve.\n", "\n", - "> **Note**: This Colab is based on `dlt`'s [schema contracts doc page](https://dlthub.com/docs/general-usage/schema-contracts) and includes additional code examples. It's still a good idea to check out the doc page for all the details." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g2XDHclpusOU" - }, - "source": [ - "To get started with data contracts, first install `dlt`:" + "> **Note**: This Colab (or Molab) is based on `dlt`'s [schema contracts doc page](https://dlthub.com/docs/general-usage/schema-contracts) and includes additional code examples. It's still a good idea to check out the doc page for all the details." 
] }, { @@ -190,8 +181,6 @@ "outputs": [], "source": [ "%%capture\n", - "\n", - "# Install dlt\n", "!pip install dlt[duckdb]" ] }, @@ -468,13 +457,13 @@ "load_info = column_pipeline.run(\n", " discard_row(\n", " [\n", - " {\"id\": 3, \"name\": \"Sam\", \"age\": 30}, # This row will be loaded\n", + " {\"id\": 3, \"name\": \"Sam\", \"age\": 30},\n", " {\n", " \"id\": 4,\n", " \"name\": \"Kate\",\n", " \"age\": 79,\n", " \"phone\": \"123-456-7890\",\n", - " }, # This row will not be loaded\n", + " },\n", " ]\n", " ),\n", " table_name=\"users\",\n", @@ -711,8 +700,8 @@ "load_info = data_type_pipeline.run(\n", " discard_row(\n", " [\n", - " {\"id\": 3, \"name\": \"Sam\", \"age\": \"35\"}, # This row will be loaded\n", - " {\"id\": 4, \"name\": \"Kate\", \"age\": \"seventy\"}, # This row will not be loaded\n", + " {\"id\": 3, \"name\": \"Sam\", \"age\": \"35\"},\n", + " {\"id\": 4, \"name\": \"Kate\", \"age\": \"seventy\"},\n", " ]\n", " ),\n", " table_name=\"users\",\n", @@ -940,17 +929,8 @@ "id": "AH3F46PaJZe4" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1YCjHWMyOO9QGC66t1a5bIxL-ZUeVKViR#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)!" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6_6WprxWXhXi" - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/education/dlt-advanced-course/lesson_7_data_contracts.py b/docs/education/dlt-advanced-course/lesson_7_data_contracts.py new file mode 100644 index 000000000..e5a62449c --- /dev/null +++ b/docs/education/dlt-advanced-course/lesson_7_data_contracts.py @@ -0,0 +1,780 @@ +# /// script +# dependencies = [ +# "dlt[duckdb]", +# "numpy", +# "pandas", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Introduction** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) + + `dlt` offers powerful tools for schema configuration, giving you control over your data processing. You can export and import schemas for easy adjustments and apply specific settings directly to resources for precise data normalization. Plus, you can set data contracts to ensure your data meets your expectations... 👀 + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_7_Data_Contracts_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_7_Data_Contracts_img1.webp)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""# [Refresher] **Understanding schema**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + When you run a pipeline, `dlt` internally generates a `<>.schema.json` file. You can export this file to a specific location in YAML format by specifying `export_schema_path="schemas/export"` in your pipeline. 
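+    For example (a sketch; the pipeline name and path are arbitrary and the folder is created on the first run):
+
+    ```python
+    import dlt
+
+    pipeline = dlt.pipeline(
+        pipeline_name="schema_export_demo",
+        destination="duckdb",
+        dataset_name="mydata",
+        export_schema_path="schemas/export",  # YAML schemas are written here after each run
+    )
+    ```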
+ + See [dlt Fundamentals: Lesson 7](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + This YAML file will look something like: + + ```yaml + version: 2 # version of the schema + version_hash: xmTG0tOmE40LvzY2DbPBOnRaNNK8YlLpVP1PMO0YgyE= # hash of the actual schema content + engine_version: 9. # shema engine version of dlt + name: quick_start + tables: + _dlt_version: + ... + _dlt_loads: + ... + _dlt_pipeline_state: + ... + issues: + columns: + url: + data_type: text + nullable: true + repository_url: + data_type: text + nullable: true + labels_url: + data_type: text + nullable: true + ... + write_disposition: append + resource: get_issues + x-normalizer: + seen-data: true + issues__assignees: + columns: + ... + parent: issues + + settings: + detections: + - iso_timestamp + default_hints: + not_null: + - _dlt_id + - _dlt_root_id + - _dlt_parent_id + - _dlt_list_idx + - _dlt_load_id + foreign_key: + - _dlt_parent_id + root_key: + - _dlt_root_id + unique: + - _dlt_id + normalizers: + names: snake_case # naming convention + json: + module: dlt.common.normalizers.json.relational + previous_hashes: + - O4M6U4KA32Xz4Vrdcqo4XPBPFVcK1FZbgRu5qcMfjn4= + - 0DQRnVWANYV21yD0T5nsoUtdTeq0/jIOYMUxpPE6Fcc= + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## **Tables and columns**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + A `table schema` may have the following properties: + + - `name` + - `description` + - `parent`: The name of the parent table if this is a child table. + - `columns`: A list of column schemas defining the table's structure. + - `write_disposition`: A hint telling `dlt` how new data coming into the table should be loaded. + + + A `column schema` may have the following properties: + + - `name` + - `description` + - `data_type` + - `precision`: Defines the precision for text, timestamp, time, bigint, binary, and decimal types. + - `scale`: Defines the scale for the decimal type. + - `is_variant`: Indicates that the column was generated as a variant of another column. + + A `column schema` may have the following basic hints: + + - `nullable` + - `primary_key` + - `merge_key`: Marks the column as part of the merge key used for incremental loads. + - `foreign_key` + - `root_key`: Marks the column as part of a root key, a type of foreign key that always refers to the root table. + - `unique` + + + A `column schema` may have the following performance hints: + + - `partition`: Marks the column to be used for partitioning data. + - `cluster`: Marks the column to be used for clustering data. + - `sort`: : Marks the column as sortable or ordered; on some destinations, this may generate an index, even if the column is not unique. + + > Each destination can interpret these performance hints in its own way. For example, the `cluster` hint is used by Redshift to define table distribution, by BigQuery to specify a cluster column, and is ignored by DuckDB and Postgres when creating tables. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Data contracts** + + Data contracts are rules that help control how your data schema changes over time. They are particularly useful for maintaining the integrity and consistency of your data as it evolves. 
+ + `dlt` allows you to implement these data contracts at various levels, including the `table level`, `column level`, and `data type level`. This provides granular control over how different parts of your schema evolve. + + > **Note**: This Colab (or Molab) is based on `dlt`'s [schema contracts doc page](https://dlthub.com/docs/general-usage/schema-contracts) and includes additional code examples. It's still a good idea to check out the doc page for all the details. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ###**Table level** + + On the table level, you can specify `evolve` or `freeze` as part of the schema contract. + + - `evolve`: Allows the creation of new tables within the schema. + - `freeze`: Prevents any changes to the schema, ensuring no new tables can be added. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Before diving into the modes above, let's load some sample data into a DuckDB database. + > You'll find the database stored in the `Files` section on the left sidebar. + """) + return + + +@app.cell +def _(): + import dlt + + data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}] + # Sample data to be loaded + table_pipeline = dlt.pipeline( + pipeline_name="data_contracts_table_level", + destination="duckdb", + dataset_name="mydata", + ) + _load_info = table_pipeline.run(data, table_name="users") + # Create a dlt pipeline + print(_load_info) + # Load the data to the "users" table + # Print the row counts for each table that was loaded in the last run of the pipeline + print( + "\nNumber of new rows loaded into each table: ", + table_pipeline.last_trace.last_normalize_info.row_counts, + ) + return data, dlt, table_pipeline + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Now, try out the `evolve` mode at the table level by loading the same sample data into the same database, but this time into a new table called `new_users`.""" + ) + return + + +@app.cell +def _(data, dlt, table_pipeline): + from dlt.common.typing import TDataItems + + @dlt.resource(schema_contract={"tables": "evolve"}) + # Define a dlt resource that allows the creation of new tables + def allow_new_tables(input_data: TDataItems) -> TDataItems: + yield input_data + + _load_info = table_pipeline.run(allow_new_tables(data), table_name="new_users") + print(_load_info) + # Run the pipeline again with the above dtl resource to load the same data into a new table "new_users" + # Print the row counts for each table that was loaded in the last run of the pipeline + print( + "\nNumber of new rows loaded into each table: ", + table_pipeline.last_trace.last_normalize_info.row_counts, + ) + return (TDataItems,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""The `freeze` mode at the table level, as mentioned earlier, won't allow any changes to the schema, so the pipeline run below that tries to create another table with the name `newest_users` will fail 👇""" + ) + return + + +@app.cell +def _(TDataItems, data, dlt, table_pipeline): + # Define a dlt resource that prevents any changes to the schema at the table level (no new tables can be added) + @dlt.resource(schema_contract={"tables": "freeze"}) + def no_new_tables(input_data: TDataItems) -> TDataItems: + yield input_data + + _load_info = table_pipeline.run(no_new_tables(data), table_name="newest_users") + # Now, run the pipeline with the resource above, attempting to load the same data into "newest_users". + # This will fail, as new tables can't be added. 
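+    # (With {"tables": "freeze"} the run above raises a schema-contract violation,
+    # so the print below is not reached.)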
+ print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ###**Column level** + At the column level, you can specify: + - `evolve`: Allows for the addition of new columns or changes in the existing ones. + - `freeze`: Prevents any changes to the existing columns. + - `discard_row`: Skips rows that have new columns but loads those that follow the existing schema. + - `discard_value`: Doesn't skip entire rows. Instead, it only skips the values of new columns, loading the rest of the row data. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Just like we did in the previous section, let's first load some sample data into a new database using a new pipeline. + + > After you run the following code snippet, a new `data_contracts_column_level.duckdb` file should appear in `Files`. + """) + return + + +@app.cell +def _(dlt): + column_pipeline = dlt.pipeline( + pipeline_name="data_contracts_column_level", + destination="duckdb", + dataset_name="mydata", + ) + _load_info = column_pipeline.run([{"id": 1, "name": "Alice"}], table_name="users") + print(_load_info) + return (column_pipeline,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""View the loaded data using `dlt`'s `sql_client()`.""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Alternatively, you can simply use the DuckDB client.""") + return + + +@app.cell +def _(column_pipeline): + import duckdb + + _conn = duckdb.connect(f"{column_pipeline.pipeline_name}.duckdb") + _conn.sql("SELECT * FROM mydata.users").df() + return (duckdb,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Assume that Alice ☝️ is the first user at your imaginary company, and you have now decided to collect users' ages as well. + + When you load the information for your second user, Bob, who also provided his age 👇, the schema contract at the column level set to `evolve` will allow `dlt` to automatically adjust the schema in the destination database by adding a new column for "age". + """) + return + + +@app.cell +def _(TDataItems, column_pipeline, dlt, duckdb): + # Define dlt resource that allows new columns in the data + @dlt.resource(schema_contract={"columns": "evolve"}) + def allow_new_columns(input_data: TDataItems) -> TDataItems: + yield input_data + + _load_info = column_pipeline.run( + allow_new_columns([{"id": 2, "name": "Bob", "age": 35}]), table_name="users" + ) + print(_load_info) + # Now, load a new row into the same table, "users", which includes an additional column "age" + print("\n") + _conn = duckdb.connect(f"{column_pipeline.pipeline_name}.duckdb") + # View the data that has been loaded + _conn.sql("SELECT * FROM mydata.users").df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Now, imagine your business partner, with whom you started the company, began requiring phone numbers from users. However, you weren't informed of this requriement and want to first load the data of users who provided their info before this change, i.e., users who did NOT provide their phone numbers. + + In this case, you would use the `discard_row` mode - which will only load Sam's data 👇 because he didn't provide a phone number, and therefore his data complies with the schema. 
+ """) + return + + +@app.cell +def _(TDataItems, column_pipeline, dlt, duckdb): + # Define a dlt resource that skips rows that have new columns but loads those that follow the existing schema + @dlt.resource(schema_contract={"columns": "discard_row"}) + def _discard_row(input_data: TDataItems) -> TDataItems: + yield input_data + + _load_info = column_pipeline.run( + _discard_row( + [ + {"id": 3, "name": "Sam", "age": 30}, + {"id": 4, "name": "Kate", "age": 79, "phone": "123-456-7890"}, + ] + ), + table_name="users", + ) + print(_load_info) + # Attempt to load two additional rows. Only the row that follows the existing schema will be loaded + print("\n") + _conn = duckdb.connect(f"{column_pipeline.pipeline_name}.duckdb") + # View the data that has been loaded + _conn.sql("SELECT * FROM mydata.users").df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Due to some unknown reasons, you've suddenly decided that phone numbers are irrelevant altogether. From now on, you want to load all new data but without the "phone" column. + + To achieve this, you can use the `discard_value` mode - which will load both Sarah's and Violetta's data 👇, regardless of whether either of them provided a phone number. However, the phone number column itself will be discarded. + """) + return + + +@app.cell +def _(TDataItems, column_pipeline, dlt, duckdb): + # Define a dlt resource that only skips the values of new columns, loading the rest of the row data + @dlt.resource(schema_contract={"columns": "discard_value"}) + def _discard_value(input_data: TDataItems) -> TDataItems: + yield input_data + + _load_info = column_pipeline.run( + _discard_value( + [ + {"id": 5, "name": "Sarah", "age": "23"}, + {"id": 6, "name": "Violetta", "age": "22", "phone": "666-513-4510"}, + ] + ), + table_name="users", + ) + print(_load_info) + # Load two additional rows. Since we're using the "discard_value" resource, both rows will be added + # However, the "phone" column in the second row will be ignored and not loaded + print("\n") + _conn = duckdb.connect(f"{column_pipeline.pipeline_name}.duckdb") + # View the data that has been loaded + _conn.sql("SELECT * FROM mydata.users").df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Eventually you decide that users' id, name and age are the only things you need for your obscure business... + + So, you set the mode to `freeze`, forbidding any changes to the table schema. The attempt to violate the schema contract, as shown below 👇, will fail. + """) + return + + +@app.cell +def _(TDataItems, column_pipeline, dlt): + # Define a dlt resource that does not allow new columns in the data + @dlt.resource(schema_contract={"columns": "freeze"}) + def no_new_columns(input_data: TDataItems) -> TDataItems: + yield input_data + + _load_info = column_pipeline.run( + no_new_columns([{"id": 7, "name": "Lisa", "age": 40, "phone": "098-765-4321"}]), + table_name="users", + ) + # Attempt to load a row with additional columns when the column contract is set to freeze + # This will fail as no new columns are allowed. + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Data type level** + At this level, you can choose: + - `evolve`: Allows any data type. This may result with variant columns upstream. + - `freeze`: Prevents any changes to the existing data types. + - `discard_row`: Omits rows with unverifiable data types. + - `discard_value`: Replaces unverifiable values with None, but retains the rest of the row data. 
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + (*No imaginary situations in this section for the sake of variety and ease* ... 👀) + + Load a sample row entry into a new database using a new pipeline. + """) + return + + +@app.cell +def _(dlt, duckdb): + data_type_pipeline = dlt.pipeline( + pipeline_name="data_contracts_data_type", + destination="duckdb", + dataset_name="mydata", + ) + _load_info = data_type_pipeline.run( + [{"id": 1, "name": "Alice", "age": 24}], table_name="users" + ) + print(_load_info) + print("\n") + _conn = duckdb.connect(f"{data_type_pipeline.pipeline_name}.duckdb") + _conn.sql("SELECT * FROM mydata.users").df() + return (data_type_pipeline,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Before trying out the `evolve` mode at the data type level 👇, take a moment to understand how variant columns mentioned earlier are created: + - **TLDR:** `dlt` creates a new column when the data type of a field in the incoming data can't be validated against the existing data type in the destination table. + - These variant columns will be named following the pattern `__v_`, where `original_name` is the existing column name (with the data type clash) and `type` is the name of the new data type stored in the variant column. + + In the example below, even though Bob's age is passed as a string, it can be validated as an integer, so it won't cause any problems. + """) + return + + +@app.cell +def _(TDataItems, data_type_pipeline, dlt, duckdb): + # Define dlt resource that accepts all data types + @dlt.resource(schema_contract={"data_type": "evolve"}) + def allow_any_data_type(input_data: TDataItems) -> TDataItems: + yield input_data + + _load_info = data_type_pipeline.run( + allow_any_data_type([{"id": 2, "name": "Bob", "age": "35"}]), table_name="users" + ) + print(_load_info) + # Now, load a new row where the "age" column is passed as a string but will be validated and stored as an integer + print("\n") + _conn = duckdb.connect(f"{data_type_pipeline.pipeline_name}.duckdb") + # If you pass the age as "thirty-five", a new variant column will be added + # Note: Running the uncommented code below may affect subsequent steps, so proceed with caution + # load_info = data_type_pipeline.run(allow_any_data_type([{"id": 2, "name": "Bob", "age": "thirty-five"}]), table_name="users") + # print(load_info) + # View the data that has been loaded + _conn.sql("SELECT * FROM mydata.users").df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""But if we ran the commented-out pipeline, this would be the outcome with an additional variant column:""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_7_Data_Contracts_img2](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_7_Data_Contracts_img2.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""The `discard_row` mode at the data type level functions similarly to how it does at the column level. The only difference is that it discards rows with diverging data types instead of columns. 
As a result, you will see that Kate's data will not be loaded 👇.""" + ) + return + + +@app.cell +def _(TDataItems, data_type_pipeline, dlt, duckdb): + # Define dlt resource that omits rows with unverifiable data types + @dlt.resource(schema_contract={"data_type": "discard_row"}) + def _discard_row(input_data: TDataItems) -> TDataItems: + yield input_data + + _load_info = data_type_pipeline.run( + _discard_row( + [ + {"id": 3, "name": "Sam", "age": "35"}, + {"id": 4, "name": "Kate", "age": "seventy"}, + ] + ), + table_name="users", + ) + print(_load_info) + # Attempt to load two additional rows. Only the row where all column types can be validated will be loaded + print("\n") + _conn = duckdb.connect(f"{data_type_pipeline.pipeline_name}.duckdb") + # View the data that has been loaded + _conn.sql("SELECT * FROM mydata.users").df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""The same goes for the `discard_value` mode. However, note that when applied at the data type level, it will replace non-validating row items with `None`. So, in this example, Violetta's age will be set to `None` 👇.""" + ) + return + + +@app.cell +def _(TDataItems, data_type_pipeline, dlt, duckdb): + # Define a dlt resource that replaces unverifiable values with None, but retains the rest of the row data + @dlt.resource(schema_contract={"data_type": "discard_value"}) + def _discard_value(input_data: TDataItems) -> TDataItems: + yield input_data + + _load_info = data_type_pipeline.run( + _discard_value( + [ + {"id": 5, "name": "Sarah", "age": 23}, + {"id": 6, "name": "Violetta", "age": "twenty-eight"}, + ] + ), + table_name="users", + ) + print(_load_info) + # Load two additional rows. Since we're using the "discard_value" resource, both rows will be added + # However, the "age" value "twenty-eight" in the second row will be ignored and not loaded + print("\n") + _conn = duckdb.connect(f"{data_type_pipeline.pipeline_name}.duckdb") + # View the data that has been loaded + _conn.sql("SELECT * FROM mydata.users").df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""The `freeze` mode prohibits any changes to the data types of existing columns and will result in an error if there is a "breach in contract". The example below will fail.""" + ) + return + + +@app.cell +def _(TDataItems, data_type_pipeline, dlt): + # Define dlt resource that prevents any changes to the existing data types + @dlt.resource(schema_contract={"data_type": "freeze"}) + def no_data_type_changes(input_data: TDataItems) -> TDataItems: + yield input_data + + _load_info = data_type_pipeline.run( + no_data_type_changes([{"id": 7, "name": "Lisa", "age": "forty"}]), + table_name="users", + ) + # Attempt to load a row with a column value that can't be validated, in this case "forty" + # This will fail as no data type changes are allowed with the "no_data_type_changes" resource + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""# **Pydantic Models**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Pydantic models can also be used to [define table schemas and validate incoming data](https://dlthub.com/docs/general-usage/resource#define-a-schema-with-pydantic). + They can be passed directly to the "columns" argument of a `dlt` resource: + ```python + class User(BaseModel): + id: int + name: str + tags: List[str] + email: Optional[str] + address: Address + status: Union[int, str] + + @dlt.resource(name="user", columns=User) + def get_users(): + ... 
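+    # sketch only: assumes `from pydantic import BaseModel`, the typing imports for
+    # List / Optional / Union, and an `Address(BaseModel)` model defined elsewhere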
+ ``` + This will set the schema contract to align with the default Pydantic behavior: + ```python + { + "tables": "evolve", + "columns": "discard_value", + "data_type": "freeze" + } + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + If you happen to pass a `schema_contract` explicitly along with the `columns` argument to a `dlt` resource, the following happens: + + - `tables`: The contract will not impact the Pydantic model and will be applied when a new table is created. + - `columns`: The modes for columns are mapped into the `extra` modes of Pydantic. If your models contain other models, `dlt` will apply this setting recursively. The contract for columns is applied when a new column is created on an existing table. + +
+

    | Column Mode   | Pydantic Extra |
    |---------------|----------------|
    | evolve        | allow          |
    | freeze        | forbid         |
    | discard_value | ignore         |
    | discard_row   | forbid         |

+ + - `data_type`: This supports the following modes for Pydantic: + 1. `evolve` will synthesize a lenient model that allows for any data type. It may result in variant columns upstream. + 2. `freeze` will re-raise a ValidationException. + 3. `discard_row` will remove the non-validating data items. + 4. `discard_value` is not currently supported. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""# **Good to Know**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + - Unless you specify a schema contract, settings will default to `evolve` on all levels. + + - The `schema_contract` argument accepts two forms: + 1. Full form: A detailed mapping of schema entities to their respective contract modes. + ```python + schema_contract={"tables": "freeze", "columns": "freeze", "data_type": "freeze"} + ``` + 2. Shorthand form: A single contract mode that will be uniformly applied to all schema entities. + ```python + schema_contract="freeze" + ``` + + - Schema contracts can be defined for: + 1. `dlt` resources: The contract applies to the corresponding table and any child tables. + ```python + @dlt.resource(schema_contract={"columns": "evolve"}) + def items(): + ... + ``` + 2. `dlt` sources: The contract serves as a default for all resources within that source. + ```python + @dlt.source(schema_contract="freeze") + def source(): + ... + ``` + 3. The `pipeline.run()`: This contract overrides any existing schema contracts. + ```python + pipeline.run(source(), schema_contract="freeze") + ``` + + - You can change the contract on a `dlt` source via its `schema_contract` property. + ```python + source = dlt.source(...) + source.schema_contract = {"tables": "evolve", "columns": "freeze", "data_type": "discard_row"} + ``` + + - To update the contract for `dlt` resources, use `apply_hints`. + ```python + resource.apply_hints(schema_contract={"tables": "evolve", "columns": "freeze"}) + ``` + + - For the `discard_row` method at the table level, if there are two tables in a parent-child relationship, such as `users` and `users__addresses`, and the contract is violated in the child table, the row in the child table (`users__addresses`) will be discarded, while the corresponding parent row in the `users` table will still be loaded. + + - If a table is a `new table` that hasn't been created on the destination yet, `dlt` will allow the creation of new columns. During the first pipeline run, the column mode is temporarily changed to `evolve` and then reverted back to the original mode. Following tables are considered new: + 1. Child tables inferred the nested data. + 2. Dynamic tables created from the data during extraction. + 3. Tables containing incomplete columns - columns without a data type bound to them. + + > Note that tables with columns defined with Pydantic models are not considered new. 
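    Returning to the parent-child `discard_row` note above, here is a minimal sketch of the table layout it describes (the resource and field names are purely illustrative):

    ```python
    @dlt.resource(schema_contract={"tables": "discard_row"})
    def users():
        # the nested "addresses" items land in the child table users__addresses
        yield {"id": 1, "name": "Ann", "addresses": [{"city": "Berlin"}]}
    ```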
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb b/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb index 3f746ac49..c796f3fda 100644 --- a/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb +++ b/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb @@ -6,7 +6,7 @@ "id": "y0sqFhxJnH5r" }, "source": [ - "# **Introduction** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)" + "# **Introduction** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)" ] }, { @@ -49,6 +49,7 @@ }, "outputs": [], "source": [ + "import os\n", "from typing import Iterable, Union\n", "import dlt\n", "from dlt.sources.helpers import requests\n", @@ -58,10 +59,9 @@ "from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n", "from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n", "\n", - "import os\n", "from google.colab import userdata\n", "\n", - "os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"ACCESS_TOKEN\")\n", + "dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"ACCESS_TOKEN\")\n", "\n", "\n", "@dlt.source\n", @@ -162,10 +162,7 @@ "\n", "## What is `Sentry` 🤔\n", "\n", - "`Sentry` is an open-source error tracking and performance monitoring tool that helps developers **identify**, **monitor**, and **fix issues** in real-time in their applications.\n", - "\n", - "\n", - "Remember, `dlt` does not have the `Sentry` client as a dependency. You need to install it." + "`Sentry` is an open-source error tracking and performance monitoring tool that helps developers **identify**, **monitor**, and **fix issues** in real-time in their applications." 
] }, { @@ -297,10 +294,9 @@ }, "outputs": [], "source": [ - "import os\n", "from google.colab import userdata\n", "\n", - "os.environ[\"RUNTIME__SENTRY_DSN\"] = userdata.get(\"SENTRY_TOKEN\")" + "dlt.config[\"RUNTIME__SENTRY_DSN\"] = userdata.get(\"SENTRY_TOKEN\")" ] }, { @@ -416,9 +412,9 @@ }, "outputs": [], "source": [ - "import os\n", + "import dlt\n", "\n", - "os.environ[\"RUNTIME__LOG_LEVEL\"] = \"INFO\"" + "dlt.config[\"RUNTIME__LOG_LEVEL\"] = \"INFO\"" ] }, { @@ -470,7 +466,7 @@ " dataset_name=\"github_data_merge\",\n", ")\n", "load_info = pipeline.run(github_source())\n", - "\n", + "print(load_info)\n", "# result gets showed despite no print statement ? check dlt.log" ] }, @@ -512,9 +508,9 @@ }, "outputs": [], "source": [ - "import os\n", + "import dlt\n", "\n", - "os.environ[\"RUNTIME__LOG_LEVEL\"] = \"INFO\"" + "dlt.config[\"RUNTIME__LOG_LEVEL\"] = \"INFO\"" ] }, { @@ -576,7 +572,8 @@ " destination=\"duckdb\",\n", " dataset_name=\"github_data_merge\",\n", ")\n", - "load_info = pipeline.run(github_source())" + "load_info = pipeline.run(github_source())\n", + "print(load_info)" ] }, { @@ -596,9 +593,9 @@ }, "outputs": [], "source": [ - "import os\n", + "import dlt\n", "\n", - "os.environ[\"RUNTIME__LOG_LEVEL\"] = \"WARNING\"\n", + "dlt.config[\"RUNTIME__LOG_LEVEL\"] = \"WARNING\"\n", "\n", "\n", "pipeline = dlt.pipeline(\n", @@ -607,7 +604,8 @@ " dataset_name=\"github_data_merge\",\n", " progress=\"log\",\n", ")\n", - "load_info = pipeline.run(github_source())" + "load_info = pipeline.run(github_source())\n", + "print(load_info)" ] }, { @@ -616,17 +614,8 @@ "id": "AH3F46PaJZe4" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/11P5O2R40ExtFtPfX4o1O5mF7nFbibtuZ#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)!" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "maZdAnM0bjiv" - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.py b/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.py new file mode 100644 index 000000000..7eb2d2f7a --- /dev/null +++ b/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.py @@ -0,0 +1,470 @@ +# /// script +# dependencies = [ +# "dlt", +# "loguru", +# "numpy", +# "pandas", +# "sentry-sdk", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""# **Introduction** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + In this notebook, we focus more on pipeline metadata, and how to use that to be able to trace and debug our pipelines. + + First, we create the pipeline we'll inspect throughout this notebook. 
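    The next cells read a GitHub token from the `ACCESS_TOKEN` environment variable. If it is not already set in your environment, you can set it before running, for example (a placeholder value, not a real token):

    ```python
    import os

    os.environ["ACCESS_TOKEN"] = "<your GitHub personal access token>"
    ```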
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## Create the pipeline we will inspect""") + return + + +@app.cell +def _(): + import os + from typing import Iterable, Union + import dlt + from dlt.sources.helpers import requests + from dlt.extract import DltResource + from dlt.common.typing import TDataItems + from dlt.sources.helpers.rest_client import RESTClient + from dlt.sources.helpers.rest_client.auth import BearerTokenAuth + from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator + + dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("ACCESS_TOKEN") + + @dlt.source + def github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]: + client = RESTClient( + base_url="https://api.github.com", + auth=BearerTokenAuth(token=secret_key), + paginator=HeaderLinkPaginator(), + ) + + @dlt.resource + def github_pulls( + cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental( + "updated_at", initial_value="2024-12-01" + ) + ) -> TDataItems: + params = {"since": cursor_date.last_value, "status": "open"} + for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params): + yield page + + return github_pulls + + pipeline = dlt.pipeline( + pipeline_name="github_pipeline", + destination="duckdb", + dataset_name="github_data", + ) + _load_info = pipeline.run(github_source()) + # define new dlt pipeline + # run the pipeline with the new resource + print(_load_info) + return Union, dlt, github_source, os, pipeline + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## Look at the data""") + return + + +@app.cell +def _(pipeline): + import duckdb + + conn = duckdb.connect(f"{pipeline.pipeline_name}.duckdb") + + conn.sql("SHOW ALL TABLES").df() + return (conn,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""More importantly, let's look at the saved load info""") + return + + +@app.cell +def _(conn): + conn.sql("select * from github_data._dlt_loads").df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""# **Tracing with Sentry**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You can enable tracing through Sentry. + + ## What is `Sentry` 🤔 + + `Sentry` is an open-source error tracking and performance monitoring tool that helps developers **identify**, **monitor**, and **fix issues** in real-time in their applications. 
+ """) + return + + +@app.cell +def _(): + import sentry_sdk + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Sentry needs to be initialized in normal scripts + + + + ``` + import sentry_sdk + import os + + sentry_sdk.init( + dsn=os.getenv("RUNTIME__SENTRY_DSN"), + traces_sample_rate=1.0 # Adjust this for performance monitoring if needed + ) + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Say, you make an error and it is caught with Sentry: + + + + ``` + try: + 1 / 0 + except ZeroDivisionError as e: + sentry_sdk.capture_exception(e) + + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""It will then show up on your Sentry dashboard:""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_8_Logging_%26_Tracing_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_8_Logging_%26_Tracing_img1.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Even when a normal error arises after Sentry has been initiated, your program executes normally, but sends that error to your dashboard, so it can be tracked!""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### In dlt, you can enable Sentry quite easily + + You can configure the `DSN` in the `config.toml`: + + ``` + [runtime] + + sentry_dsn="https:///<...>" + ``` + + + Alternatively, you can use environment variables. **This is what we'll be doing**: + ``` + RUNTIME__SENTRY_DSN="https:///<...>" + ``` + The entry client is configured after the first pipeline is created with `dlt.pipeline()`. Feel free to use `sentry_sdk` init again to cover your specific needs. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Let's try introducing the same error again""") + return + + +@app.cell +def _(dlt, os): + dlt.config["RUNTIME__SENTRY_DSN"] = os.getenv("SENTRY_TOKEN") + return + + +@app.cell +def _(pipeline): + data = {12: 34} + + info = pipeline.run([data], table_name="issues") + info + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""And that comes up in Sentry as well""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_8_Logging_%26_Tracing_img2](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_8_Logging_%26_Tracing_img2.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The message sent to Sentry is: + ``` + Job for issues.a3f927c556.insert_values failed terminally in load 1723645286.6510239 with message Constraint Error: NOT NULL constraint failed: issues.id + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""# **Logging**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + There are various environments where we would be completely lost without logs. + + Debugging any system would be incredibly hard if we didn't know what was going on, or at what point the program ran into an error. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### Setting log levels in `dlt`""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You can set log levels in your `config.toml` file: + + + + ``` + [runtime] + log_level="INFO" + ``` + + `log_level` accepts the Python standard logging level names. + + The default log level is `WARNING`. 
+ + **`INFO` log level is useful when diagnosing problems in production.** + + **`CRITICAL` will disable logging.** + + **`DEBUG` should not be used in production.** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""We'll be setting the log level in our environment variables:""") + return + + +@app.cell +def _(dlt): + dlt.config["RUNTIME__LOG_LEVEL"] = "INFO" + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + dlt logs to a logger named `dlt`. + + dlt logger uses a regular python logger so you can configure the handlers as per your requirement. + """) + return + + +@app.cell +def _(): + import logging + + # Create a logger + logger = logging.getLogger("dlt") + + # Set the log level + logger.setLevel(logging.INFO) + + # Create a file handler + handler = logging.FileHandler("dlt.log") + + # Add the handler to the logger + logger.addHandler(handler) + return (logging,) + + +@app.cell +def _(dlt, github_source): + pipeline_1 = dlt.pipeline( + pipeline_name="github_issues_merge_logger", + destination="duckdb", + dataset_name="github_data_merge", + ) + _load_info = pipeline_1.run(github_source()) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### Logging via `Loguru` in our GitHub example""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""let's change the logging level""") + return + + +@app.cell +def _(dlt): + dlt.config["RUNTIME__LOG_LEVEL"] = "INFO" + return + + +@app.cell +def _(Union, logging): + import sys + from loguru import logger as loguru_logger + + class InterceptHandler(logging.Handler): + @loguru_logger.catch(default=True, onerror=lambda _: sys.exit(1)) + def emit(self, record: logging.LogRecord) -> None: + # parent class logging.Handler processes log messages + try: + level: Union[str, int] = loguru_logger.level( + record.levelname + ).name # decorator provided by loguru that catches any exceptions in the decorated function and logs them + except ValueError: + level = record.levelno + (frame, depth) = ( + sys._getframe(6), + 6, + ) # Get corresponding Loguru level if it exists. + while frame and frame.f_code.co_filename == logging.__file__: + frame = frame.f_back + depth = depth + 1 + loguru_logger.opt(depth=depth, exception=record.exc_info).log( + level, record.getMessage() + ) + + logger_dlt = logging.getLogger("dlt") + logger_dlt.addHandler( + InterceptHandler() + ) # Find caller (call frame) from where originated the logged message. 
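    # From this point on, records emitted by the standard "dlt" logger are routed through Loguru.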
+ # all logs will be written to dlt_loguru.log + loguru_logger.add( + "dlt_loguru.log" + ) # logs the message using loguru, with the level, exception information, and depth + return + + +@app.cell +def _(dlt, github_source): + pipeline_2 = dlt.pipeline( + pipeline_name="github_issues_merge_loguru", + destination="duckdb", + dataset_name="github_data_merge", + ) + _load_info = pipeline_2.run(github_source()) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## **Logs for monitoring the progress**""") + return + + +@app.cell +def _(dlt, github_source): + dlt.config["RUNTIME__LOG_LEVEL"] = "WARNING" + pipeline_3 = dlt.pipeline( + pipeline_name="github_issues_progress", + destination="duckdb", + dataset_name="github_data_merge", + progress="log", + ) + _load_info = pipeline_3.run(github_source()) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb b/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb index c8606fa76..472b8e022 100644 --- a/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb +++ b/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb @@ -6,7 +6,7 @@ "id": "GNU4s2jjWTOV" }, "source": [ - "# **Performance Optimization in dlt pipelines** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)" + "# **Performance Optimization in dlt pipelines** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)" ] }, { @@ -94,7 +94,7 @@ "\n", "We'll now look at how to optimize each of these stages individually.\n", "\n", - "> If you're unfamiliar with how `pipeline.run()` works under the hood, including the **extract/normalize/load** stages and intermediary files, please complete the [Fundamentals module \"How dlt works\"](https://colab.research.google.com/drive/1geSMNRkSwAelQJKd3e8vdoHCKiHMdmIo#forceEdit=true&sandboxMode=true) first." + "> If you're unfamiliar with how `pipeline.run()` works under the hood, including the **extract/normalize/load** stages and intermediary files, please complete the [Fundamentals module \"How dlt works\"](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) first." 
] }, { @@ -232,23 +232,27 @@ }, "outputs": [], "source": [ - "import multiprocessing\n", "import time\n", + "import multiprocessing\n", "from concurrent.futures import ProcessPoolExecutor\n", "\n", "\n", - "def compute_heavy_task() -> None:\n", + "def compute_heavy_task() -> str:\n", + " lines = []\n", " for number in range(3):\n", - " print(\n", - " f\"Computing in {multiprocessing.current_process().name}. {number=} => {number**2}\\n\"\n", + " lines.append(\n", + " f\"Computing in {multiprocessing.current_process().name}. {number=} => {number**2}\"\n", " )\n", " time.sleep(0.1)\n", + " return \"\\n\".join(lines)\n", "\n", "\n", "if __name__ == \"__main__\":\n", " with ProcessPoolExecutor(max_workers=4) as process_executor:\n", - " for _ in range(4):\n", - " process_executor.submit(compute_heavy_task)" + " futures = [process_executor.submit(compute_heavy_task) for _ in range(4)]\n", + " for fut in futures:\n", + " print(fut.result())\n", + " print()" ] }, { @@ -450,12 +454,12 @@ "id": "rvId84tCaH7u" }, "source": [ - "- Control the [in-memory buffer size](#scrollTo=ffVpDFHfnqO-) for the extract stage\n", + "- Control the `in-memory buffer size` for the extract stage\n", "- Group `dlt` resources into `dlt` sources\n", "- Specify the number of thread workers or..\n", "- When using async generators, control the number of async functions/awaitables being evaluated in parallel\n", "- Yield pages instead of rows\n", - "- Customize the [size of intermediary files](#scrollTo=g9AGWfLkoAMb) created in the extract stage to control file rotation" + "- Customize the `size of intermediary files` created in the extract stage to control file rotation" ] }, { @@ -559,7 +563,7 @@ " dataset_name=\"mydata\",\n", " dev_mode=True,\n", ")\n", - "load_info = pipeline.extract(buffered_resource)\n", + "pipeline.extract(buffered_resource)\n", "print(pipeline.last_trace)" ] }, @@ -604,7 +608,8 @@ " dataset_name=\"mydata\",\n", " dev_mode=True,\n", ")\n", - "load_info = pipeline.extract(buffered_resource)\n", + "\n", + "pipeline.extract(buffered_resource)\n", "print(pipeline.last_trace)" ] }, @@ -779,9 +784,7 @@ " dev_mode=True,\n", ")\n", "\n", - "load_info = pipeline.extract(\n", - " [buffered_resource1, buffered_resource2, buffered_resource3]\n", - ")\n", + "pipeline.extract([buffered_resource1, buffered_resource2, buffered_resource3])\n", "print(pipeline.last_trace)" ] }, @@ -825,7 +828,7 @@ " dev_mode=True,\n", ")\n", "\n", - "load_info = pipeline.extract(source())\n", + "pipeline.extract(source())\n", "print(pipeline.last_trace)" ] }, @@ -954,7 +957,7 @@ ")\n", "\n", "\n", - "load_info = pipeline.extract(source())\n", + "pipeline.extract(source())\n", "print(pipeline.last_trace)" ] }, @@ -1038,7 +1041,7 @@ ")\n", "\n", "\n", - "load_info = pipeline.extract(source())\n", + "pipeline.extract(source())\n", "print(pipeline.last_trace)" ] }, @@ -1089,13 +1092,13 @@ "@dlt.resource\n", "def sync_items() -> TDataItems:\n", " for i in range(10):\n", - " time.sleep(0.5) # Blocking call\n", + " time.sleep(0.5)\n", " yield i\n", "\n", "\n", "@dlt.transformer\n", "def sync_transform(item: TDataItem) -> TDataItems:\n", - " time.sleep(0.5) # Also blocking\n", + " time.sleep(0.5)\n", " return {\"row\": item}\n", "\n", "\n", @@ -1130,13 +1133,13 @@ "@dlt.resource\n", "async def async_items() -> TDataItems:\n", " for i in range(10):\n", - " await asyncio.sleep(0.5) # Blocking\n", + " await asyncio.sleep(0.5)\n", " yield i\n", "\n", "\n", "@dlt.transformer\n", "async def async_transform(item) -> TDataItems:\n", - " await 
asyncio.sleep(0.5) # Non-blocking\n", + " await asyncio.sleep(0.5)\n", " # just return the results, if you yield, generator will be evaluated in main thread\n", " return {\"row\": item}\n", "\n", @@ -1276,7 +1279,7 @@ "@dlt.resource\n", "def get_users() -> TDataItems:\n", " for user in fetch_users():\n", - " yield user # yields one row at a time" + " yield user" ] }, { @@ -1354,8 +1357,8 @@ "\n", "def yield_chunks(iterator: Iterator[Dict[str, int]], chunk_size=10):\n", " iterator = iter(iterator)\n", - " while chunk := list(islice(iterator, chunk_size)): # <--- we slice data into chunks\n", - " time.sleep(0.01) # Simulate slow API call\n", + " while chunk := list(islice(iterator, chunk_size)):\n", + " time.sleep(0.01)\n", " yield chunk\n", "\n", "\n", @@ -1387,7 +1390,7 @@ " dev_mode=True,\n", ")\n", "\n", - "load_info = pipeline.extract(source())\n", + "pipeline.extract(source())\n", "print(pipeline.last_trace)" ] }, @@ -1463,7 +1466,7 @@ "4. These files are then used in the **load** stage.\n", "\n", "\n", - ">If you’re not yet familiar with how the **normalization stage** works in `dlt`, we recommend reviewing the [**Normalization section in the dlt Fundamentals course**](https://colab.research.google.com/drive/1geSMNRkSwAelQJKd3e8vdoHCKiHMdmIo#forceEdit=true&sandboxMode=true&scrollTo=bCeUqaW_cRSh) before diving into performance tuning. " + ">If you’re not yet familiar with how the **normalization stage** works in `dlt`, we recommend reviewing the [**Normalization section in the dlt Fundamentals course**](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) before diving into performance tuning. " ] }, { @@ -1583,8 +1586,8 @@ "\n", "def yield_chunks(iterable, chunk_size=10):\n", " iterator = iter(iterable)\n", - " while chunk := list(islice(iterator, chunk_size)): # <--- we slice data into chunks\n", - " time.sleep(0.01) # Simulate slow API call\n", + " while chunk := list(islice(iterator, chunk_size)):\n", + " time.sleep(0.01)\n", " yield chunk\n", "\n", "\n", @@ -1611,7 +1614,7 @@ " dev_mode=True,\n", ")\n", "\n", - "load_info = pipeline.extract(source())\n", + "pipeline.extract(source())\n", "print(pipeline.last_trace)" ] }, @@ -1639,7 +1642,7 @@ "\n", "os.environ[\"NORMALIZE__WORKERS\"] = \"1\"\n", "\n", - "load_info = pipeline.normalize()\n", + "pipeline.normalize()\n", "print(pipeline.last_trace)" ] }, @@ -1710,8 +1713,8 @@ ")\n", "\n", "\n", - "load_info = pipeline.extract(source())\n", - "load_info = pipeline.normalize()\n", + "pipeline.extract(source())\n", + "pipeline.normalize()\n", "\n", "print(pipeline.last_trace)" ] @@ -1881,8 +1884,8 @@ "\n", "def yield_chunks(iterable, chunk_size=10):\n", " iterator = iter(iterable)\n", - " while chunk := list(islice(iterator, chunk_size)): # <--- we slice data into chunks\n", - " time.sleep(0.01) # Simulate slow API call\n", + " while chunk := list(islice(iterator, chunk_size)):\n", + " time.sleep(0.01)\n", " yield chunk\n", "\n", "\n", @@ -2060,7 +2063,6 @@ }, "outputs": [], "source": [ - "# Install dlt if not already installed\n", "%%capture\n", "!pip install \"dlt[duckdb]\"" ] @@ -2082,7 +2084,9 @@ }, "outputs": [], "source": [ - "exit()" + "import os\n", + "\n", + "os.environ.clear()" ] }, { @@ -2117,9 +2121,9 @@ "def pagination(url):\n", " while True:\n", " response = requests.get(url, headers=headers)\n", - " time.sleep(0.1) # Simulate delay\n", + " time.sleep(0.1)\n", " response.raise_for_status()\n", - " yield response.json() # Here we're yielding pages\n", + " yield 
response.json()\n", "\n", " # Get next page\n", " if \"next\" not in response.links:\n", @@ -2201,7 +2205,7 @@ " dev_mode=True,\n", ")\n", "\n", - "load_info = pipeline.run(\n", + "pipeline.run(\n", " [\n", " get_issues,\n", " get_stargazers,\n", @@ -2355,9 +2359,6 @@ " )\n", "\n", "\n", - "improved_p = dlt.pipeline(\"test_pipeline_2\", destination=\"duckdb\")\n", - "\n", - "\n", "pipeline = dlt.pipeline(\n", " pipeline_name=\"extract_pipeline_example2\",\n", " destination=\"duckdb\",\n", @@ -2365,7 +2366,7 @@ " dev_mode=True,\n", ")\n", "\n", - "load_info = pipeline.run(github_data())\n", + "pipeline.run(github_data())\n", "print(pipeline.last_trace)" ] }, diff --git a/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.py b/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.py new file mode 100644 index 000000000..4627b550f --- /dev/null +++ b/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.py @@ -0,0 +1,1970 @@ +# /// script +# dependencies = [ +# "dlt", +# "dlt[duckdb]", +# "numpy", +# "pandas", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""# **Performance Optimization in dlt pipelines** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Introduction** + + Sometimes you have to slow down in order to speed up... + + This lesson teaches you how to make dlt pipelines faster by optimizing each internal stage of execution. You’ll learn how to tune memory, enable parallelism, and reduce runtime using real examples. + + We will walk through the internal steps of `pipeline.run()` again, but this time focusing only on performance optimization techniques. + + Read more in the [dlt performance docs](https://dlthub.com/docs/general-usage/performance). + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_9_Performance_optimisation_img1](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_9_Performance_optimisation_img1.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Already covered in Fundamentals** + + - Basic structure of `pipeline.run()`. + + - Default behavior of **extract/normalize/load**. + + - Example with nested rows and `items__nested` tables. + + - Overview of file formats (jsonl, parquet, etc.). + + - Progress logging and pipeline introspection. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **In the Advanced Performance Optimization lesson** + + - Optimize memory with buffer tuning. + + - Yield pages instead of rows. + + - Control threading and multiprocessing. + + - Tune file rotation for parallelism. + + - Run multiple pipelines in one process. + + - Spawn method on Linux. + + - Real GitHub pipeline performance demo. 
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + ## Understanding `pipeline.run()` for Performance + + When you call `pipeline.run()`, dlt goes through three stages: + + 1. **Extract** – fetch data and write intermediary files. + 2. **Normalize** – transform and flatten the data. + 3. **Load** – load data into the destination. + + We'll now look at how to optimize each of these stages individually. + + > If you're unfamiliar with how `pipeline.run()` works under the hood, including the **extract/normalize/load** stages and intermediary files, please complete the [Fundamentals module "How dlt works"](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) first. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_9_Performance_optimisation_img2](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_9_Performance_optimisation_img2.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Before we dive into parallelism in dlt...** + + To get the most out of parallelism features in `dlt`, it's helpful to quickly refresh how parallelism works in Python in general. + + Python isn't truly multithreaded by default due to the Global Interpreter Lock (GIL), but there are multiple ways to run tasks concurrently: using **threads**, **processes**, or **async**. + + Each has its own strengths, and `dlt` actually uses all of them depending on the stage: threads for extracting and loading, and processes for normalization. + + Let’s take a quick look at how these work under the hood. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Parallelism in Python** + + + Python is single-threaded by default. That means only one operation happens at a time, even on multi-core CPUs. This becomes a bottleneck for: + + - API calls + - file I/O + - database queries + - and anything that waits instead of computes + + Parallelism solves this by doing *many things at once*. It’s essential when building efficient data pipelines, like those with `dlt`. + + + ## **Types of parallelism in Python** + + There are 3 main types. Each has different use cases. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_9_Performance_optimisation_img3](https://storage.googleapis.com/dlt-blog-images/dlt-advanced-course/Lesson_9_Performance_optimisation_img3.webp)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### 1. **Threading** + - Best for I/O-bound tasks (e.g., reading from APIs or files). + - Uses the `threading` or [`concurrent.futures.ThreadPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor). + + + + ### **Why Python has multithreading — but only one thread runs Python code at a time** + + - Python does support multithreading, and you can create multiple threads with `threading.Thread()`. + + - But in CPython, the standard Python implementation, there’s something called the **Global Interpreter Lock (GIL)**. + + - The GIL makes sure that only **one thread** can execute Python bytecode at a time — even on multi-core CPUs. + + - So if you create 5 threads, Python will **run them one by one**, rapidly switching between them — not in true parallel. + + - It still counts as “multithreading” because threads **exist and run**, but they’re **not truly concurrent** for Python code execution. 
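    A rough way to see the GIL's effect yourself (an illustrative sketch, timings vary by machine): a CPU-bound function gains little or nothing from threads, unlike the I/O-style examples below that mostly sleep.

    ```python
    import time
    from concurrent.futures import ThreadPoolExecutor

    def cpu_task() -> int:
        # pure Python computation, so the GIL serializes it across threads
        return sum(i * i for i in range(2_000_000))

    start = time.perf_counter()
    for _ in range(4):
        cpu_task()
    print("sequential:", round(time.perf_counter() - start, 2), "s")

    start = time.perf_counter()
    with ThreadPoolExecutor(max_workers=4) as pool:
        list(pool.map(lambda _: cpu_task(), range(4)))
    print("4 threads: ", round(time.perf_counter() - start, 2), "s")
    ```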
+ + **Example 1:** + + In this example, `threaded_function` prints the values zero to two that your for loop assigns to the loop variable number. Using a `ThreadPoolExecutor`, four threads are created to execute the threaded function. `ThreadPoolExecutor` is configured to run a maximum of four threads concurrently with `max_workers=4`, and each worker thread is named with a “Worker” prefix, as in `thread_name_prefix="Worker"`. + """) + return + + +@app.cell +def _(): + import threading + import time + from concurrent.futures import ThreadPoolExecutor + + def threaded_function() -> None: + for _number in range(3): + print( + f"Printing from {threading.current_thread().name}. number={_number!r}" + ) + time.sleep(0.1) + + with ThreadPoolExecutor( + max_workers=4, thread_name_prefix="Worker" + ) as executor: # Simulate slow API call + for _ in range(4): + executor.submit(threaded_function) + return (time,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### 2. **Multiprocessing** + - Best for CPU-bound tasks (e.g., compressing, parsing, transforming). + - Uses `multiprocessing` or [`concurrent.futures.ProcessPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html#processpoolexecutor). + + Example 1: + + In this example, `compute_heavy_task` squares numbers from 0 to 2 and prints the process name it runs on. We use `ProcessPoolExecutor` to run 4 processes in parallel, each computing the task independently. + """) + return + + +@app.cell +def _(time): + import multiprocessing + from concurrent.futures import ProcessPoolExecutor + + def compute_heavy_task() -> str: + lines = [] + for _number in range(3): + lines.append( + f"Computing in {multiprocessing.current_process().name}. number={_number!r} => {_number ** 2}" + ) + time.sleep(0.1) + return "\n".join(lines) + + if __name__ == "__main__": + with ProcessPoolExecutor(max_workers=4) as _process_executor: + futures = [_process_executor.submit(compute_heavy_task) for _ in range(4)] + for fut in futures: + print(fut.result()) + print() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Example 2:""") + return + + +@app.cell +def _(): + import concurrent.futures + import math + + PRIMES = [ + 112272535095293, + 112582705942171, + 112272535095293, + 115280095190773, + 115797848077099, + 1099726899285419, + ] + + def is_prime(n: int) -> bool: + if n < 2: + return False + if n == 2: + return True + if n % 2 == 0: + return False + sqrt_n = int(math.floor(math.sqrt(n))) + for i in range(3, sqrt_n + 1, 2): + if n % i == 0: + return False + return True + + with concurrent.futures.ProcessPoolExecutor() as _process_executor: + for _number, prime in zip(PRIMES, _process_executor.map(is_prime, PRIMES)): + print("%d is prime: %s" % (_number, prime)) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### 3. **AsyncIO** + - Great for many concurrent tasks that *wait* (e.g., HTTP, sockets). + - Lightweight and fast. Single-threaded but concurrent. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Example 1:""") + return + + +@app.cell +async def _(): + import asyncio + + async def _main() -> None: + await asyncio.sleep(1) + print("hello") + + _loop = asyncio.get_running_loop() + # In Colab, you'll need to get a handle of the current running loop first. + await _loop.create_task(_main()) + return (asyncio,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Example 2: + + The order of this output is **the heart of async IO**. 
Talking to each of the calls to `count()` is a single event loop, or coordinator. + + When each task reaches `await asyncio.sleep(1)`, the function yells up to the event loop and gives control back to it, saying, **“I’m going to be sleeping for 1 second. Go ahead and let something else meaningful be done in the meantime.”** + """) + return + + +@app.cell +async def _(asyncio, time): + async def count() -> None: + print("One") + await asyncio.sleep(1) + print("Two") + + async def _main() -> None: + await asyncio.gather(count(), count(), count()) + + s = time.perf_counter() + _loop = asyncio.get_running_loop() + await _loop.create_task(_main()) + elapsed = time.perf_counter() - s + print(f"executed in {elapsed:0.2f} seconds.") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + # **Parallelism in dlt** + + In `dlt`, parallelism is baked in: + + - **Extraction**: via threads (`parallelized=True` in `@dlt.resource`) or async generators. + - **Normalization**: via process pools. + - **Loading**: via threads. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + ## **Extract** + + The extract stage fetches data and writes it to intermediary files. This phase is usually **I/O-bound** — lots of small writes or slow network calls can slow it down. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Default behaviour** + + - The in-memory buffer is set to `5000` items. + - By default, **intermediary files are not rotated**. If you do not explicitly set a size for an intermediary file with `file_max_items=100000`, `dlt` will create a **single file** for a resource, regardless of the number of records it contains, even if it reaches millions. + - By default, intermediary files at the extract stage use a custom version of the JSONL format. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **How to optimize extraction?**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + - Control the `in-memory buffer size` for the extract stage + - Group `dlt` resources into `dlt` sources + - Specify the number of thread workers or.. + - When using async generators, control the number of async functions/awaitables being evaluated in parallel + - Yield pages instead of rows + - Customize the `size of intermediary files` created in the extract stage to control file rotation + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **IMPORTANT: Start simple. dlt has smart defaults** + + Before you dive into tuning buffers, tweaking file sizes, and parallelizing every little thing — consider this: + + > **`dlt` comes with well-thought-out defaults that work great for most cases.** + + The default settings are: + - Conservative enough to work on a laptop. + - Efficient enough to run production loads for many use cases. + - Safe to experiment with incrementally. + + #### When to start tweaking? + + Once you’ve: + - Run your pipeline end-to-end successfully. + - Noticed slowdowns at scale. + - Understood which part of the pipeline (extract, normalize, load) is the bottleneck. + + > **Start with the defaults. Measure. Then tune.** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **1. [Use a larger In-Memory Buffer](https://dlthub.com/docs/reference/performance#overall-memory-and-disk-management)** + + dlt **buffers** data **in memory** to speed up processing and uses the file system to pass data between the **extract** and **normalize** stages. 
+ + You can control **the size of the buffers** and **the size and number of the files** to fine-tune memory and CPU usage. These settings also impact parallelism. + + The size of the buffers is controlled by specifying **the number of data items** held in them. Data is appended to open files when the item buffer is full, after which the buffer is cleared. + + By default, dlt **buffers 5000 items** before writing to disk. Increase this value to reduce disk I/O and improve speed. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Example 1: + + We set the buffer size to 1. dlt will extract data **row by row** and write each row to an intermediary file one by one. + + This also **disables multithreading** — when the buffer size is 1, the number of extract workers is effectively limited to 1. + """) + return + + +@app.cell +def _(): + from typing import Dict, Iterator + import os + import dlt + from dlt.common.typing import TDataItems + + os.environ["DATA_WRITER__BUFFER_MAX_ITEMS"] = "1" + + def _get_rows(limit: int) -> Iterator[Dict[str, int]]: + yield from map(lambda n: {"row": n}, range(limit)) + + @dlt.resource() + def _buffered_resource() -> TDataItems: + for row in _get_rows(500000): + yield row + + pipeline = dlt.pipeline( + pipeline_name="extract_pipeline1", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + pipeline.extract(_buffered_resource) + print(pipeline.last_trace) + return Dict, Iterator, TDataItems, dlt, os + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Example 2: + + Increase the number of buffer items. + """) + return + + +@app.cell +def _(Dict, Iterator, TDataItems, dlt, os): + os.environ["DATA_WRITER__BUFFER_MAX_ITEMS"] = "5000" + + def _get_rows(limit: int) -> Iterator[Dict[str, int]]: + yield from map(lambda n: {"row": n}, range(limit)) + + @dlt.resource() + def _buffered_resource() -> TDataItems: + for row in _get_rows(500000): + yield row + + pipeline_1 = dlt.pipeline( + pipeline_name="extract_pipeline2", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + pipeline_1.extract(_buffered_resource) + print(pipeline_1.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Explanation:** The buffer collects many items in memory before writing them to disk. A larger buffer means fewer writes, which saves I/O time and makes the extract stage faster. This is especially helpful when extracting a large number of small records. + + **Downside:** High buffer size increases memory usage. If the machine has limited RAM, it could cause memory pressure or crashes. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **IMPORTANT: Performance measurements in Google Colab may be unreliable.** + + Even with large buffer sizes, timing results in Colab can vary significantly between runs. This is because Colab runs on shared cloud infrastructure, where CPU, memory, and disk I/O are not guaranteed and may fluctuate at any time. + + You might observe: + + - Slower or inconsistent extract times + + - Unpredictable delays due to resource throttling or background activity + + For **reliable performance** testing, always run your dlt pipelines on a **local machine**, where you control the environment and system resources are stable. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + #### **Excercise 1** + + Play with `BUFFER_MAX_ITEMS` parameter. Run your pipeline and measure the time. 
+ + Don’t expect linear speed-up — larger buffers may **slow things down** depending on your system. + + At some point, increasing the buffer size will **stop making things faster**. After that threshold, you’ll hit diminishing returns, and performance may plateau or even degrade. The optimal value depends on your machine’s I/O and memory characteristics. + """) + return + + +@app.cell +def _(Dict, Iterator, TDataItems, dlt, os, time): + import matplotlib.pyplot as plt + + def _get_rows(limit: int) -> Iterator[Dict[str, int]]: + yield from map(lambda n: {"row": n}, range(limit)) + + def measure_extract_time(buffer_size: int) -> float: + os.environ["DATA_WRITER__BUFFER_MAX_ITEMS"] = str(buffer_size) + + @dlt.resource() + def _buffered_resource() -> TDataItems: + for row in _get_rows(500000): + yield row + + pipeline = dlt.pipeline( + pipeline_name=f"extract_pipeline_{buffer_size}", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + start_time = time.time() + pipeline.extract(_buffered_resource) + return time.time() - start_time + + buffer_sizes = [1, 10, 100, 1000, 5000, 10000, 50000, 100000, 500000] + times = [measure_extract_time(size) for size in buffer_sizes] + plt.figure(figsize=(10, 6)) + plt.plot(buffer_sizes, times, marker="o") + plt.xlabel("BUFFER_MAX_ITEMS") + plt.ylabel("Time to Extract (seconds)") + plt.title("Effect of Buffer Size on Extraction Time") + plt.grid(True) + plt.xscale("log") + plt.show() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **2. Group Resources into Sources** + + In `dlt`, each **resource** is treated as a separate unit during extraction. If you pass multiple resources directly to `pipeline.extract()`, `dlt` handles them independently — each with its own extract process and context. + + To **optimize performance**, especially during the extract stage, it's often better to **group related resources into a single source**. This allows `dlt` to: + - Run extraction more efficiently + - Reuse shared context (like API sessions or connections) + - Avoid overhead from managing multiple resource objects individually + - Enable better parallelism and state management + + + Example without grouping: + """) + return + + +@app.cell +def _(): + exit() + return + + +@app.cell +def _(Dict, Iterator, TDataItems, dlt): + def _get_rows(limit: int) -> Iterator[Dict[str, int]]: + yield from map(lambda n: {"row": n}, range(limit)) + + @dlt.resource(name="resource1") + def buffered_resource1() -> TDataItems: + for row in _get_rows(500000): + yield row + + @dlt.resource(name="resource2") + def buffered_resource2() -> TDataItems: + for row in _get_rows(500000): + yield row + + @dlt.resource(name="resource3") + def buffered_resource3() -> TDataItems: + for row in _get_rows(500000): + yield row + + pipeline_2 = dlt.pipeline( + pipeline_name="extract_pipeline4", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + pipeline_2.extract([buffered_resource1, buffered_resource2, buffered_resource3]) + print(pipeline_2.last_trace) + return buffered_resource1, buffered_resource2, buffered_resource3 + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + This works, but each resource is treated separately. For large datasets or many resources, this adds extract overhead. 
+ + + Example with grouping: + """) + return + + +@app.cell +def _(buffered_resource1, buffered_resource2, buffered_resource3, dlt): + from typing import Iterable + from dlt.extract import DltResource + from threading import currentThread + + @dlt.source + def source() -> Iterable[DltResource]: + return (buffered_resource1, buffered_resource2, buffered_resource3) + + pipeline_3 = dlt.pipeline( + pipeline_name="extract_pipeline4", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + pipeline_3.extract(source()) + print(pipeline_3.last_trace) + return DltResource, Iterable + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + This version: + - Groups all resources into a single source + - Allows `dlt` to optimize scheduling and state tracking + - Reduces overhead during extraction and improves throughput + + #### **What to expect** + + - **Grouped resources** may not show a big speed increase in small examples. + - However, **it unlocks `dlt`'s parallel extraction engine**: when grouped into a single `@dlt.source`, `dlt` can schedule their execution in a shared thread pool. + - This is essential when working with: + - Many resources + - Slow APIs + - IO-bound extractors + - High data volumes + + #### **Note**: + Even if timing results look similar in this example, grouping into a source is what **enables true concurrent resource execution**. Without it, `dlt` treats each resource as an isolated unit and may serialize extraction. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **3. Enable parallel threaded extraction** + + When extracting data from **multiple sources**, you usually want them to be processed **at the same time**, not one after another. This is especially useful when: + + - Calling **slow APIs** + - Working with **multiple endpoints** + - Extracting from **databases with many tables** + + Use multiple threads to fetch data from different resources with `parallelized=True`. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Set the number of parallel threads with: + + ```python + os.environ["EXTRACT__WORKERS"] = "3" + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Simulate Slow APIs with `time.sleep`** + + We’ll simulate API latency by adding a `time.sleep(0.01)` delay before yielding each row. This mimics a network call taking ~10ms. + + We’ll then parallelize the resources using `parallelized=True` and observe the thread behavior using `threading.currentThread()`. 
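    The cells that follow use `threading.current_thread()`, the non-deprecated spelling of `currentThread()`. They also rely on the default size of the extract thread pool; if you want to cap it explicitly, set the worker count mentioned above before extracting (the value is illustrative):

    ```python
    import os

    os.environ["EXTRACT__WORKERS"] = "3"
    ```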
+ """) + return + + +@app.cell +def _(Dict, DltResource, Iterable, Iterator, TDataItems, dlt, time): + from threading import current_thread + + def _get_rows(limit: int) -> Iterator[Dict[str, int]]: + yield from map(lambda n: {"row": n}, range(limit)) + + @dlt.resource(name="resource1", parallelized=False) + def buffered_resource1_1() -> TDataItems: + for row in _get_rows(100): + time.sleep(0.01) + print(f"resource1 in thread {current_thread().name}") + yield row + + @dlt.resource(name="resource2", parallelized=False) + def buffered_resource2_1() -> TDataItems: + for row in _get_rows(100): + time.sleep(0.01) + print(f"resource2 in thread {current_thread().name}") + yield row + + @dlt.resource(name="resource3", parallelized=False) + def buffered_resource3_1() -> TDataItems: + for row in _get_rows(100): + time.sleep(0.01) + print(f"resource3 in thread {current_thread().name}") + yield row + + @dlt.source + def source_1() -> Iterable[DltResource]: + return (buffered_resource1_1, buffered_resource2_1, buffered_resource3_1) + + pipeline_4 = dlt.pipeline( + pipeline_name="extract_pipeline4", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + pipeline_4.extract(source_1()) + print(pipeline_4.last_trace) + return (current_thread,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **What does it mean?** + + dlt is extracting rows in a [**round-robin**](https://dlthub.com/docs/reference/performance#resources-extraction-fifo-vs-round-robin) fashion — one row from each resource in turn — all within the `MainThread`. Since there’s no parallelization, the resources share a single thread and are executed sequentially. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Now let’s enable multithreading.** + + In the previous example, all resources ran sequentially in the main thread. This time, we add `parallelized=True` to each resource — allowing `dlt` to extract from all three **at the same time**, using separate threads. + + You’ll see the difference immediately in the output: each resource prints from a different thread, confirming that extraction is now concurrent. 
+ """) + return + + +@app.cell +def _( + Dict, + DltResource, + Iterable, + Iterator, + TDataItems, + current_thread, + dlt, + time, +): + def _get_rows(limit: int) -> Iterator[Dict[str, int]]: + yield from map(lambda n: {"row": n}, range(limit)) + + @dlt.resource(name="resource1", parallelized=True) + def buffered_resource1_2() -> TDataItems: + for row in _get_rows(100): + time.sleep(0.01) + print(f"resource1 in thread {current_thread().name}") + yield row + + @dlt.resource(name="resource2", parallelized=True) + def buffered_resource2_2() -> TDataItems: + for row in _get_rows(100): + time.sleep(0.01) + print(f"resource2 in thread {current_thread().name}") + yield row + + @dlt.resource(name="resource3", parallelized=True) + def buffered_resource3_2() -> TDataItems: + for row in _get_rows(100): + time.sleep(0.01) + print(f"resource3 in thread {current_thread().name}") + yield row + + @dlt.source + def source_2() -> Iterable[DltResource]: + return (buffered_resource1_2, buffered_resource2_2, buffered_resource3_2) + + pipeline_5 = dlt.pipeline( + pipeline_name="extract_pipeline4", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + pipeline_5.extract(source_2()) + print(pipeline_5.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Explanation:** Each worker runs in a separate thread, allowing several resources to extract data at the same time. This is critical for reducing bottlenecks when working with slow APIs or large resource sets. + + **Downside:** More threads increase CPU load. Poorly written thread-unsafe code or thread contention may degrade performance instead of improving it. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **Async**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The example below does the same but using an async generator as the main resource and async/await and futures pool for the transformer. + + **Example 1 — Synchronous execution (sequential, slow)** + """) + return + + +@app.cell +def _(TDataItems, dlt, time): + from dlt.common.typing import TDataItem + + @dlt.resource + def sync_items() -> TDataItems: + for i in range(10): + time.sleep(0.5) + yield i + + @dlt.transformer + def sync_transform(item: TDataItem) -> TDataItems: + time.sleep(0.5) + return {"row": item} + + _start = time.time() + result = list(sync_items() | sync_transform) + print(f"Sync result: {result}") + print("Sync elapsed time:", round(time.time() - _start, 2), "seconds") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""**Example 2 — Asynchronous execution (concurrent, fast)**""") + return + + +@app.cell +def _(TDataItems, asyncio, dlt, time): + @dlt.resource + async def async_items() -> TDataItems: + for i in range(10): + await asyncio.sleep(0.5) + yield i + + @dlt.transformer + async def async_transform(item) -> TDataItems: + await asyncio.sleep(0.5) + return {"row": item} + + _start = time.time() + print(list(async_items() | async_transform)) + print("Async elapsed time:", round(time.time() - _start, 2), "seconds") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Breakdown of time** + + - `async_items()` yields 10 items → takes ~5s total (0.5s × 10) + + - `async_transform()` is fast once it starts — runs in parallel + + - So total time is: + + - ~5s to yield + + - `+` ~0.5s to process the last batch of transformer calls + + - ➜ ~5.5–6 seconds total + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **4. 
Yielding chunks instead of rows** + + In real-world data extraction, especially from APIs, **data is typically returned in pages** — for example, 100 users per request. These pages are already **natural chunks**, so there's no reason to extract and yield each row from the page individually. + + Instead of doing something like: + + ```python + for item in page: + yield item # ❌ inefficient + ``` + + You should just do: + + ```python + yield page # ✅ fast and efficient + ``` + + This small change makes a big difference in performance. Yielding full pages (chunks) reduces the number of Python function calls and lets `dlt` process your data more efficiently — especially during buffering and writing stages. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + #### **What is `yield` in Python?** + + In Python, `yield` turns a function into a **generator**. + + Instead of returning a full list of results at once, it gives back **one item at a time**, each time the function is called again. + + This is useful when: + - You work with large datasets + - You don’t want to keep everything in memory + - You want to stream values as they are produced + + #### Example + """) + return + + +@app.cell +def _(Iterator): + def count_up_to(n: int) -> Iterator[int]: + for i in range(n): + yield i + return (count_up_to,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Calling `count_up_to(3)` returns a generator:""") + return + + +@app.cell +def _(count_up_to): + for _number in count_up_to(3): + print(_number) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + #### **Yielding rows in dlt** + + This is what you usually see in basic educational `dlt` pipelines: + """) + return + + +@app.cell +def _(Dict, TDataItems, dlt): + from typing import List + + def fetch_users() -> List[Dict[str, int]]: + return [{"id": 1}, {"id": 2}, {"id": 3}] + + @dlt.resource + def get_users() -> TDataItems: + for user in fetch_users(): + yield user + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + #### Problem + + This creates a **high number of `yield` calls** — each row is passed into the extract pipeline one at a time. While dlt buffers rows before writing, each row still incurs the cost of a Python function call and per-item processing inside the pipeline. + + This adds overhead, especially with millions of rows. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + #### **What does “Yield Chunks” mean?** + + Instead of: + + ```python + yield {"id": 1} + yield {"id": 2} + ``` + + Do this: + + ```python + yield [{"id": 1}, {"id": 2}] # yield a list of rows + ``` + + We call this **page/chunk-based yielding**. + + You still use `yield`, but now each yield returns **a batch of rows**, not just one. 
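+
+    For example, the `get_users` resource from above needs only a one-line change: yield the whole page returned by `fetch_users()` instead of looping over it (a minimal sketch):
+
+    ```python
+    @dlt.resource
+    def get_users() -> TDataItems:
+        yield fetch_users()  # one yield per page instead of one per row
+    ```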
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + #### **How to yield chunks** + + Here’s how you chunk your data with `islice` from `itertools`: + """) + return + + +@app.cell +def _(Dict, Iterator, dlt, time): + from itertools import islice + + def _get_rows(limit: int) -> Iterator[Dict[str, int]]: + yield from map(lambda n: {"row": n}, range(limit)) + + def _yield_chunks(iterator: Iterator[Dict[str, int]], chunk_size=10): + iterator = iter(iterator) + while chunk := list(islice(iterator, chunk_size)): + time.sleep(0.01) + yield chunk + + @dlt.resource(name="resource1", parallelized=True) + def buffered_resource1_3(): + yield from _yield_chunks(_get_rows(100), chunk_size=10) + + @dlt.resource(name="resource2", parallelized=True) + def buffered_resource2_3(): + yield from _yield_chunks(_get_rows(100), chunk_size=10) + + @dlt.resource(name="resource3", parallelized=True) + def buffered_resource3_3(): + yield from _yield_chunks(_get_rows(100), chunk_size=10) + + @dlt.source + def source_3(): + return (buffered_resource1_3, buffered_resource2_3, buffered_resource3_3) + + pipeline_6 = dlt.pipeline( + pipeline_name="extract_pipeline4", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + pipeline_6.extract(source_3()) + print(pipeline_6.last_trace) + return (islice,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Such a crazy speed improvement! You'll notice the difference even more as your data size grows.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **5. Enable file rotation for large datasets** + + By default, `dlt` writes all extracted data from a resource into **one large intermediary file**. If your resource yields millions of rows, that means: + - Only **one normalization worker** will be able to process that file + - You’ll lose all benefits of **parallel processing** in later stages + + To fix this, you can **enable file rotation** by setting a file size limit. For example: + + ```python + os.environ["EXTRACT__DATA_WRITER__FILE_MAX_ITEMS"] = "100000" + ``` + + This means: + - Every 100,000 items, a new intermediary file will be created + - If you have 1,000,000 rows, you'll end up with 10 files + - Later, these files can be processed **in parallel** during normalization and load + + File rotation is essential for scaling up performance when dealing with large datasets. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **6. Avoid unnecessary transformation in the resource** + Keep your resource logic simple and fast — avoid costly computation or transformation in the generator itself. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## **Normalize**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **What happens at the normalization stage?** + + After data is extracted, `dlt` transforms it into a **relational format** suitable for loading into databases. This happens in the **normalize stage**: + + 1. Extracted files are passed to the **normalization process pool**. + 2. Each file is read, schema is resolved, and data is transformed. + 3. Rows are buffered and written into **normalized intermediary files**. + 4. These files are then used in the **load** stage. 
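+
+    In the examples below, we trigger the stages one by one so that each of them can be timed on its own, roughly like this (the pipeline and source names here are placeholders):
+
+    ```python
+    pipeline = dlt.pipeline(pipeline_name="perf_demo", destination="duckdb")
+    pipeline.extract(my_source())  # extract stage only
+    pipeline.normalize()           # normalize stage only
+    pipeline.load()                # load stage only
+    print(pipeline.last_trace)     # shows how long each stage took
+    ```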
+ + + >If you’re not yet familiar with how the **normalization stage** works in `dlt`, we recommend reviewing the [**Normalization section in the dlt Fundamentals course**](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) before diving into performance tuning. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Default behavior** + + - **Buffer size**: 5,000 items + - **Parallelism**: Off by default (runs in main process) + - **File rotation**: Off by default — all rows written into one file + - **Compression**: On by default + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Why normalization may be slow** + + If you process a lot of data in one file and use just one CPU, normalization becomes a bottleneck: + - File parsing and transformation are **CPU-heavy** + - Without parallelism, large files block the pipeline + - Compression slows it further if not needed + + > File parsing and transformation are **CPU-heavy**, especially when dealing with **deeply nested structures** (which must be flattened into multiple tables) and **automatic data type inference** (which inspects each value to determine its type). + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### How to optimize normalization + + 1. **Enable parallelism**: Use multiple processes + ```python + os.environ['NORMALIZE__WORKERS'] = '3' + ``` + + 2. **Disable compression (for debugging or speed)**: + ```python + os.environ['NORMALIZE_DATA_WRITER__DISABLE_COMPRESSION'] = 'true' + ``` + + 3. **Control buffer size** (optional): + ```python + os.environ['NORMALIZE__DATA_WRITER__BUFFER_MAX_ITEMS'] = '10000' + ``` + + 4. **Enable file rotation** (if you have one big file): + ```python + os.environ['NORMALIZE__DATA_WRITER__FILE_MAX_ITEMS'] = '100000' + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **1. Parallel normalization** + + Let’s measure normalization performance with and without parallelism. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + #### **a. Normalize with 1 worker (default)** + + First, we run extraction: + """) + return + + +@app.cell +def _(dlt, islice, time): + def _get_rows(limit): + yield from map(lambda n: {"row": n}, range(limit)) + + def _yield_chunks(iterable, chunk_size=10): + iterator = iter(iterable) + while chunk := list(islice(iterator, chunk_size)): + time.sleep(0.01) + yield chunk + + @dlt.resource(name="resource1", parallelized=True) + def buffered_resource1_4(): + yield from _yield_chunks(_get_rows(1000000), chunk_size=10000) + + @dlt.resource(name="resource2", parallelized=True) + def buffered_resource2_4(): + yield from _yield_chunks(_get_rows(1000000), chunk_size=10000) + + @dlt.source + def source_4(): + return (buffered_resource1_4, buffered_resource2_4) + + pipeline_7 = dlt.pipeline( + pipeline_name="extract_pipeline_w1", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + pipeline_7.extract(source_4()) + print(pipeline_7.last_trace) + return pipeline_7, source_4 + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + As mentioned earlier, each file created during the extract stage is sent to the process pool of the normalization stage. Since file rotation has not been enabled at the extract stage, each resource is written to a separate intermediary file. This results in **three files**, which can be **normalized in parallel**. 
+ + First, let's measure the time taken with a single process worker. + """) + return + + +@app.cell +def _(os, pipeline_7): + os.environ["NORMALIZE__WORKERS"] = "1" + pipeline_7.normalize() + print(pipeline_7.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Oh, that took way longer than extraction, right? + + Yep, that’s totally normal. The **normalization step does the heavy lifting**: + - flattening nested data, + - figuring out types, + - generating tables. + + It’s often **the slowest part** of the pipeline, so don’t be surprised if it takes most of the time. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + #### **b. Normalize with 2 workers** + + Now, let's try more process workers. + + Unfortunately, Colab gives us only **2 CPU cores**. + That means running normalization with more than 2 workers won’t help (and might even slow things down). + Let’s stick with **2 workers** to get the best performance from what we’ve got! + + + Note that we are running the extract stage again with a new pipeline, because normalizing already normalized data would not be meaningful. + """) + return + + +@app.cell +def _(os): + os.cpu_count() + return + + +@app.cell +def _(dlt, os, source_4): + # Set the number of process workers to 2 + os.environ["NORMALIZE__WORKERS"] = "2" + pipeline_8 = dlt.pipeline( + pipeline_name="extract_pipeline_w2", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + pipeline_8.extract(source_4()) + pipeline_8.normalize() + print(pipeline_8.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + #### What to expect + + With parallel workers: + - The total time to normalize **drops significantly** + - CPU usage will increase (expected!) + - Logs may show multiple files being processed at the same time + + + #### ✅ Rule of thumb: + Use more workers and rotate files if you have: + - Large data + - Multiple extracted files + - A machine with multiple CPU cores + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **2. Enable file rotation for large datasets** + + By default, all normalized data goes into **one big file** — which means **only one process** can handle it. That kills parallelism. + + To fix this, set: + + ```python + os.environ["NORMALIZE__DATA_WRITER__FILE_MAX_ITEMS"] = "100000" + ``` + + Now `dlt` will: + - Split data into smaller files (e.g., 10 files for 1M rows) + - Load them **in parallel** using multiple workers + - Speed up loading + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## **Load**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **What happens at the loading stage?** + + + + After data is normalized, `dlt` takes the resulting files and sends them to your **destination** (e.g., DuckDB, BigQuery, Redshift). + + This stage uses a **thread pool**, where: + 1. Each thread loads one normalized file at a time. + 2. Files from the same source are bundled into a **load package**. + 3. Packages are loaded into the destination concurrently. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Default behavior** + + - `dlt` uses **20 threads** by default + - Each thread processes one file + - All file contents are already normalized — there’s no parsing or schema detection at this point, so it’s mostly **I/O-bound** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **How to optimize loading?** + + + 1. 
**Control the number of threads** + Set this based on your destination’s capacity: + + ```python + os.environ["LOAD__WORKERS"] = "4" + ``` + + 2. **Rotate files during normalization** + If all your data is in **one big file**, you’ll still have only **one load job**. To unlock real parallelism: + ```python + os.environ["NORMALIZE__DATA_WRITER__FILE_MAX_ITEMS"] = "100000" + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **Example**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Now that we have two pipelines from the previous steps, let's use the first one to load data with only one thread. This means all normalized files will be loaded sequentially.""" + ) + return + + +@app.cell +def _(dlt, islice, time): + def _get_rows(limit): + yield from map(lambda n: {"row": n}, range(limit)) + + def _yield_chunks(iterable, chunk_size=10): + iterator = iter(iterable) + while chunk := list(islice(iterator, chunk_size)): + time.sleep(0.01) + yield chunk + + @dlt.resource(name="resource1", parallelized=True) + def buffered_resource1_5(): + yield from _yield_chunks(_get_rows(1000000), chunk_size=10000) + + @dlt.resource(name="resource2", parallelized=True) + def buffered_resource2_5(): + yield from _yield_chunks(_get_rows(1000000), chunk_size=10000) + + @dlt.resource(name="resource3", parallelized=True) + def buffered_resource3_4(): + yield from _yield_chunks(_get_rows(1000000), chunk_size=10000) + + @dlt.source + def source_5(): + return (buffered_resource1_5, buffered_resource2_5, buffered_resource3_4) + return (source_5,) + + +@app.cell +def _(dlt, os, source_5): + # Set the number of thread workers to 1 + os.environ["LOAD__WORKERS"] = "1" + pipeline_9 = dlt.pipeline( + pipeline_name="extract_pipeline_load1", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + pipeline_9.extract(source_5()) + pipeline_9.normalize() + pipeline_9.load() + print(pipeline_9.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""> Step load COMPLETED in 1 minute and 24.07 seconds.""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Next, use the second pipeline to load data with 3 threads, allowing the normalized files to be loaded in parallel.""" + ) + return + + +@app.cell +def _(dlt, os, source_5): + # Set the number of thread workers to 3 + os.environ["LOAD__WORKERS"] = "3" + pipeline_10 = dlt.pipeline( + pipeline_name="extract_pipeline_load2", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + pipeline_10.extract(source_5()) + pipeline_10.normalize() + pipeline_10.load() + print(pipeline_10.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Step load COMPLETED in 59.89 seconds.""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Voila! ⭐""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### What to expect + + - More threads = faster load, **if you have enough files** + - If there’s only one file, you won’t see a speedup + - Use **file rotation** in normalization to split the load into chunks + + > The **load stage is I/O-bound**, but that doesn't mean “more files is always better.” + Reading and loading many small files adds overhead too. + So use file rotation wisely: create **enough files to allow parallelism**, but not so many that it slows things down. + **Look at how much data you have**, and tune `FILE_MAX_ITEMS` accordingly. 
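+
+    For roughly 1M rows per resource, one possible starting point could look like this (illustrative values; always measure on your own data):
+
+    ```python
+    os.environ["NORMALIZE__DATA_WRITER__FILE_MAX_ITEMS"] = "100000"  # ~10 load jobs per 1M rows
+    os.environ["LOAD__WORKERS"] = "10"  # enough threads to load those files concurrently
+    ```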
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Can you spot the bottleneck?** + + Ask yourself: + - Is my pipeline slow because of waiting on I/O or doing heavy computations? + - Am I yielding too many tiny objects one-by-one instead of batches? + - Is my API async? If not, can I enable `parallelized=True` safely in my resources? + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""# **GitHub example**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""In this example, we'll optimize a pipeline that loads data from seven different GitHub endpoints.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Clear the runtime to reset configurations:""") + return + + +@app.cell +def _(os): + os.environ.clear() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + We'll first define the resources without parallelization. + + > Since we are already yielding pages, the chunking method is implemented. However, with a manageable number of entries, the impact of chunking may be negligible. + """) + return + + +@app.cell +def _(dlt, os, time): + import requests + + github_token = os.getenv("ACCESS_TOKEN") + headers = {"Authorization": f"token {github_token}"} + + def pagination(url): + while True: + response = requests.get(url, headers=headers) + time.sleep(0.1) + response.raise_for_status() + yield response.json() + if "next" not in response.links: + break + url = response.links["next"]["url"] + + @dlt.resource(table_name="issues", write_disposition="merge", primary_key="id") + def get_issues( + updated_at=dlt.sources.incremental( + "updated_at", initial_value="1970-01-01T00:00:00Z" + ) + ): + url = f"https://api.github.com/repos/dlt-hub/dlt/issues?since={updated_at.last_value}&per_page=100sort=updated" # Get next page + yield pagination(url) + + @dlt.resource(table_name="stargazers", write_disposition="merge", primary_key="id") + def get_stargazers(): + url = "https://api.github.com/repos/dlt-hub/dlt/stargazers?per_page=100" + yield pagination(url) + + @dlt.resource( + table_name="pull_requests", write_disposition="merge", primary_key="id" + ) + def get_pulls( + updated_at=dlt.sources.incremental( + "updated_at", initial_value="1970-01-01T00:00:00Z" + ) + ): + url = f"https://api.github.com/repos/dlt-hub/dlt/pulls?since={updated_at.last_value}&per_page=100&sort=updated" + yield pagination(url) + + @dlt.resource(table_name="commits", write_disposition="merge", primary_key="sha") + def get_commits(): + url = "https://api.github.com/repos/dlt-hub/dlt/commits?per_page=100" + yield pagination(url) + + @dlt.resource(table_name="branches", write_disposition="merge", primary_key="name") + def get_branches(): + url = "https://api.github.com/repos/dlt-hub/dlt/branches?per_page=100" + yield pagination(url) + + @dlt.resource( + table_name="contributors", write_disposition="merge", primary_key="id" + ) + def get_contributors(): + url = "https://api.github.com/repos/dlt-hub/dlt/contributors?per_page=100" + yield pagination(url) + + @dlt.resource(table_name="labels", write_disposition="merge", primary_key="id") + def get_labels(): + url = "https://api.github.com/repos/dlt-hub/dlt/labels?per_page=100" + yield pagination(url) + return ( + get_branches, + get_commits, + get_contributors, + get_issues, + get_labels, + get_pulls, + get_stargazers, + pagination, + ) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Run the pipeline with the resources defined above:""") + return + + +@app.cell +def _( + dlt, + 
get_branches, + get_commits, + get_contributors, + get_issues, + get_labels, + get_pulls, + get_stargazers, +): + pipeline_11 = dlt.pipeline( + pipeline_name="extract_pipeline_example1", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + pipeline_11.run( + [ + get_issues, + get_stargazers, + get_pulls, + get_branches, + get_contributors, + get_labels, + get_commits, + ] + ) + return (pipeline_11,) + + +@app.cell +def _(pipeline_11): + print(pipeline_11.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Let's redefine our resources with parallelization, wrap them in a single source, and increase the number of normalization, as well as extract workers. + + > Since the default number of load workers is by default set to 20, there's probably no need to modify it. + + While we could optimize the configuration of intermediary file sizes more effectively if we knew the exact number of items each endpoint returns, let's start by experimenting with an arbitrary value of 200 for the data writers, which should be more or less suitable to enable enough parallelization. + """) + return + + +@app.cell +def _(os): + os.environ["EXTRACT__WORKERS"] = "7" + os.environ["NORMALIZE__WORKERS"] = "2" + return + + +@app.cell +def _(dlt, pagination): + @dlt.resource( + table_name="issues", + write_disposition="merge", + primary_key="id", + parallelized=True, + ) + def get_issues_2( + updated_at=dlt.sources.incremental( + "updated_at", initial_value="1970-01-01T00:00:00Z" + ) + ): + url = f"https://api.github.com/repos/dlt-hub/dlt/issues?since={updated_at.last_value}&per_page=100sort=updated" + yield pagination(url) + + @dlt.resource( + table_name="stargazers", + write_disposition="merge", + primary_key="id", + parallelized=True, + ) + def get_stargazers_2(): + url = "https://api.github.com/repos/dlt-hub/dlt/stargazers?per_page=100" + yield pagination(url) + + @dlt.resource( + table_name="pull_requests", + write_disposition="merge", + primary_key="id", + parallelized=True, + ) + def get_pulls_2( + updated_at=dlt.sources.incremental( + "updated_at", initial_value="1970-01-01T00:00:00Z" + ) + ): + url = f"https://api.github.com/repos/dlt-hub/dlt/pulls?since={updated_at.last_value}&per_page=100&sort=updated" + yield pagination(url) + + @dlt.resource( + table_name="commits", + write_disposition="merge", + primary_key="sha", + parallelized=True, + ) + def get_commits_2(): + url = "https://api.github.com/repos/dlt-hub/dlt/commits?per_page=100" + yield pagination(url) + + @dlt.resource( + table_name="branches", + write_disposition="merge", + primary_key="name", + parallelized=True, + ) + def get_branches_2(): + url = "https://api.github.com/repos/dlt-hub/dlt/branches?per_page=100" + yield pagination(url) + + @dlt.resource( + table_name="contributors", + write_disposition="merge", + primary_key="id", + parallelized=True, + ) + def get_contributors_2(): + url = "https://api.github.com/repos/dlt-hub/dlt/contributors?per_page=100" + yield pagination(url) + + @dlt.resource( + table_name="labels", + write_disposition="merge", + primary_key="id", + parallelized=True, + ) + def get_labels_2(): + url = "https://api.github.com/repos/dlt-hub/dlt/labels?per_page=100" + yield pagination(url) + + @dlt.source + def github_data(): + return ( + get_issues_2, + get_stargazers_2, + get_pulls_2, + get_branches_2, + get_contributors_2, + get_labels_2, + get_commits_2, + ) + + pipeline_12 = dlt.pipeline( + pipeline_name="extract_pipeline_example2", + destination="duckdb", + 
dataset_name="mydata", + dev_mode=True, + ) + pipeline_12.run(github_data()) + print(pipeline_12.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Homework: Speed up your pipeline** + + ### **Goal** + + Use the public **Jaffle Shop API** to build a `dlt` pipeline and apply everything you've learned about performance: + + - Chunking + - Parallelism + - Buffer control + - File rotation + - Worker tuning + + Your task is to **make the pipeline as fast as possible**, while keeping the results correct. + + + + ### **What you’ll need** + + - API base: `https://jaffle-shop.scalevector.ai/api/v1` + - Docs: [https://jaffle-shop.scalevector.ai/docs](https://jaffle-shop.scalevector.ai/docs) + - Start with these endpoints: + - `/customers` + - `/orders` + - `/products` + + Each of them returns **paged responses** — so you'll need to handle pagination. + + + + ### **What to implement** + + 1. **Extract** from the API using `dlt` + - Use `dlt.resource` and [`RESTClient`](https://dlthub.com/docs/devel/general-usage/http/rest-client) with proper pagination + + 2. **Apply all performance techniques** + - Group resources into sources + - Yield **chunks/pages**, not single rows + - Use `parallelized=True` + - Set `EXTRACT__WORKERS`, `NORMALIZE__WORKERS`, and `LOAD__WORKERS` + - Tune buffer sizes and enable **file rotation** + + 3. **Measure performance** + - Time the extract, normalize, and load stages separately + - Compare a naive version vs. optimized version + - Log thread info or `pipeline.last_trace` if helpful + + + ### **Deliverables** + + Share your code as a Google Colab or [GitHub Gist](https://gist.github.com/) in Homework Google Form. **This step is required for certification.** + + + It should include: + - Working pipeline for at least 2 endpoints + - Before/after timing comparison + - A short explanation of what changes made the biggest difference if there're any differences + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""And remember: **Start with the defaults. Measure. 
Then tune.**""") + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb b/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb index 13bb06aec..d29d479cf 100644 --- a/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb +++ b/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb @@ -6,7 +6,7 @@ "id": "pTAeTdoKJHZV" }, "source": [ - "# **Quick Start** 👩‍💻🚀 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb)\n", + "# **Quick Start** 👩‍💻🚀 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb)\n", "\n", "**Here, you will learn:**\n", "- What is dlt?\n", @@ -55,15 +55,6 @@ "> **Note**: We recommend working within a virtual environment when creating Python projects. This way, all the dependencies for your current project will be isolated from packages in other projects." ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "Su4oUJelKaZY" - }, - "source": [ - "[Install](https://dlthub.com/docs/reference/installation) `dlt` with DuckDB as destination:" - ] - }, { "cell_type": "code", "execution_count": null, @@ -180,7 +171,7 @@ "> **What just happened?** \n", "> The first run of a pipeline will scan the data that goes through it and generate a schema. To convert nested data into a relational format, dlt flattens dictionaries and unpacks nested lists into sub-tables.\n", ">\n", - "> For this example `dlt` created a schema called 'mydata' with the table 'pokemon' in it and stored it in DuckDB.\n", + "> For this example, `dlt` created a schema called 'mydata' with the table 'pokemon' in it and stored it in DuckDB.\n", ">\n", ">For detailed instructions on running a pipeline, see the documentation [here](https://dlthub.com/docs/walkthroughs/run-a-pipeline)." ] @@ -191,7 +182,7 @@ "id": "Z9ll-Ax1BxGu" }, "source": [ - "Quick start was really quick, hah? It seems like some kind of magic happened.\n", + "Quick start was really quick, huh? It seems like some kind of magic happened.\n", "\n", "We don't believe in magic! Let's start from the beginning, what is a `dlt` Pipeline?" ] @@ -217,7 +208,7 @@ }, "outputs": [], "source": [ - "pipeline = dlt.pipeline(\n", + "another_pipeline = dlt.pipeline(\n", " pipeline_name=\"resource_source\",\n", " destination=\"duckdb\",\n", " dataset_name=\"mydata\",\n", @@ -237,7 +228,7 @@ "* **`dataset_name`**: This is the name of the group of tables (or dataset) where your data will be sent. You can think of a dataset like a folder that holds many files, or a schema in a relational database. 
You can also specify this later when you run or load the pipeline. If you don't provide a name, it will default to the name of your pipeline.\n", "* **`dev_mode`**: If you set this to True, dlt will add a timestamp to your dataset name every time you create a pipeline. This means a new dataset will be created each time you create a pipeline.\n", "\n", - "There are more arguments, but they are for advanced use, we skip it for now." + "There are additional arguments for advanced use, but we’ll skip them for now." ] }, { @@ -262,7 +253,7 @@ "outputs": [], "source": [ "# Run the pipeline and print load info\n", - "load_info = pipeline.run(data, table_name=\"pokemon\")\n", + "load_info = another_pipeline.run(data, table_name=\"pokemon\")\n", "print(load_info)" ] }, @@ -309,7 +300,7 @@ "id": "xQcYIbDbQevC" }, "source": [ - "Start a connection to your database using native `duckdb` connection and look what tables were generated:" + "Start a connection to your database using a native `duckdb` connection and see which tables were generated:" ] }, { @@ -321,17 +312,14 @@ "outputs": [], "source": [ "import duckdb\n", - "from google.colab import data_table\n", - "\n", - "data_table.enable_dataframe_formatter()\n", "\n", "# A database '.duckdb' was created in working directory so just connect to it\n", "\n", "# Connect to the DuckDB database\n", - "conn = duckdb.connect(f\"{pipeline.pipeline_name}.duckdb\")\n", + "conn = duckdb.connect(f\"{another_pipeline.pipeline_name}.duckdb\")\n", "\n", "# Set search path to the dataset\n", - "conn.sql(f\"SET search_path = '{pipeline.dataset_name}'\")\n", + "conn.sql(f\"SET search_path = '{another_pipeline.dataset_name}'\")\n", "\n", "# Describe the dataset\n", "conn.sql(\"DESCRIBE\").df()" @@ -399,7 +387,7 @@ "outputs": [], "source": [ "# Query data from 'pokemon' using the SQL client\n", - "with pipeline.sql_client() as client:\n", + "with another_pipeline.sql_client() as client:\n", " with client.execute_query(\"SELECT * FROM pokemon\") as cursor:\n", " data = cursor.df()\n", "\n", @@ -427,7 +415,7 @@ }, "outputs": [], "source": [ - "dataset = pipeline.dataset()\n", + "dataset = another_pipeline.dataset()\n", "dataset.pokemon.df()" ] }, @@ -467,17 +455,8 @@ "id": "NYbccmLie1zm" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1tc94GvIoYXmYrjUibDhY_9iPR5zA0Eyw#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)!" 
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lN6cXVfhVPmq" - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/education/dlt-fundamentals-course/lesson_1_quick_start.py b/docs/education/dlt-fundamentals-course/lesson_1_quick_start.py new file mode 100644 index 000000000..a8fa0bcd4 --- /dev/null +++ b/docs/education/dlt-fundamentals-course/lesson_1_quick_start.py @@ -0,0 +1,394 @@ +# /// script +# dependencies = [ +# "dlt[duckdb]", +# "numpy", +# "pandas", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Quick Start** 👩‍💻🚀 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) + + **Here, you will learn:** + - What is dlt? + - How to run a simple pipeline with toy data. + - How to explore the loaded data using: + - DuckDB connection + - dlt's sql_client + - dlt datasets + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **What is dlt?** + + In today's data-driven world, organizations often grapple with the challenge of efficiently **extracting, transforming,** and **loading** (ETL) data from various, often messy, data sources into well-structured, live datasets. This process can be complex, time-consuming, and prone to errors, especially when dealing with large volumes of data or nested data structures. + + Enter **dlt**, an **open-source Python library** designed to simplify and streamline this process. **dlt can load data from** a wide range of **sources** including REST APIs, SQL databases, cloud storage, and Python data structures, among others. It offers a lightweight interface that **infers schemas** and **data types**, **normalizes** the data, and handles **nested data** structures, making it easy to use, flexible, and scalable. + + Moreover, dlt supports a variety of **popular destinations** and allows for the addition of custom destinations to create **reverse ETL** pipelines. It can be deployed **anywhere Python runs**, be it on Airflow, serverless functions, or any other cloud deployment of your choice. With features like **schema evolution**, **data contracts** and **incremental loading**, dlt also automates pipeline maintenance, saving valuable time and resources. + + In essence, dlt is a powerful tool that simplifies the ETL process, making it more efficient and less error-prone. It allows data teams to **focus** on leveraging the data and driving value, while ensuring effective **governance** through timely notifications of any changes. + + [Learn more about dlt here](https://dlthub.com/docs/intro) and in this course! 
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_1_Quick_start_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_1_Quick_start_img1.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Installation** + + > **Note**: We recommend working within a virtual environment when creating Python projects. This way, all the dependencies for your current project will be isolated from packages in other projects. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Read more about DuckDB as a destination [here](https://dlthub.com/docs/dlt-ecosystem/destinations/duckdb).""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Run a simple pipeline with toy data** + For educational purposes, let’s start with a simple pipeline using a small dataset — Pokémon data represented as a list of Python dictionaries. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""1. Define a list of Python dictionaries, which will be your toy data:""" + ) + return + + +@app.cell +def _(): + # Sample data containing pokemon details + data = [ + {"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}}, + {"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}}, + {"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}}, + ] + return (data,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""2. Import `dlt` and create a simple pipeline:""") + return + + +@app.cell +def _(): + import dlt + + # Set pipeline name, destination, and dataset name + pipeline = dlt.pipeline( + pipeline_name="quick_start", + destination="duckdb", + dataset_name="mydata", + ) + return dlt, pipeline + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""3. Run your pipeline and print the load info:""") + return + + +@app.cell +def _(data, pipeline): + # Run the pipeline with data and table name + _load_info = pipeline.run(data, table_name="pokemon") + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + > **What just happened?** + > The first run of a pipeline will scan the data that goes through it and generate a schema. To convert nested data into a relational format, dlt flattens dictionaries and unpacks nested lists into sub-tables. + > + > For this example, `dlt` created a schema called 'mydata' with the table 'pokemon' in it and stored it in DuckDB. + > + >For detailed instructions on running a pipeline, see the documentation [here](https://dlthub.com/docs/walkthroughs/run-a-pipeline). + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Quick start was really quick, huh? It seems like some kind of magic happened. + + We don't believe in magic! Let's start from the beginning, what is a `dlt` Pipeline? + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + ## **What is a `dlt` Pipeline?** + + A [pipeline](https://dlthub.com/docs/general-usage/pipeline) is a connection that moves data from your Python code to a destination. The pipeline accepts dlt sources or resources, as well as generators, async generators, lists, and any iterables. Once the pipeline runs, all resources are evaluated and the data is loaded at the destination. 
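+
+    For example, a plain generator can be passed to `run()` just like the list we used above (a minimal sketch):
+
+    ```python
+    def pokemon_rows():
+        yield {"id": "1", "name": "bulbasaur"}
+        yield {"id": "25", "name": "pikachu"}
+
+    pipeline.run(pokemon_rows(), table_name="pokemon")
+    ```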
+ """) + return + + +@app.cell +def _(dlt): + another_pipeline = dlt.pipeline( + pipeline_name="resource_source", + destination="duckdb", + dataset_name="mydata", + dev_mode=True, + ) + return (another_pipeline,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You instantiate a pipeline by calling the `dlt.pipeline` function with the following arguments: + * **`pipeline_name`**: This is the name you give to your pipeline. It helps you track and monitor your pipeline, and also helps to bring back its state and data structures for future runs. If you don't provide a name, dlt will use the name of the Python file you're running as the pipeline name. + * **`destination`**: a name of the destination to which dlt will load the data. It may also be provided to the run method of the pipeline. + * **`dataset_name`**: This is the name of the group of tables (or dataset) where your data will be sent. You can think of a dataset like a folder that holds many files, or a schema in a relational database. You can also specify this later when you run or load the pipeline. If you don't provide a name, it will default to the name of your pipeline. + * **`dev_mode`**: If you set this to True, dlt will add a timestamp to your dataset name every time you create a pipeline. This means a new dataset will be created each time you create a pipeline. + + There are additional arguments for advanced use, but we’ll skip them for now. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + ## **Run method** + + To load the data, you call the `run()` method and pass your data in the data argument. + """) + return + + +@app.cell +def _(another_pipeline, data): + # Run the pipeline and print load info + _load_info = another_pipeline.run(data, table_name="pokemon") + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Commonly used arguments: + + * **`data`** (the first argument) may be a dlt source, resource, generator function, or any Iterator or Iterable (i.e., a list or the result of the map function). + * **`write_disposition`** controls how to write data to a table. Defaults to the value "append". + * `append` will always add new data at the end of the table. + * `replace` will replace existing data with new data. + * `skip` will prevent data from loading. + * `merge` will deduplicate and merge data based on `primary_key` and `merge_key` hints. + * **`table_name`**: specified in cases when the table name cannot be inferred, i.e., from the resources or name of the generator function. 
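+
+    For example, to overwrite the table contents on every run instead of appending (a minimal sketch):
+
+    ```python
+    load_info = another_pipeline.run(data, table_name="pokemon", write_disposition="replace")
+    print(load_info)
+    ```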
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Explore the loaded data** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(1) DuckDB Connection** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Start a connection to your database using a native `duckdb` connection and see which tables were generated:""" + ) + return + + +@app.cell +def _(another_pipeline): + import duckdb + + # A database '.duckdb' was created in working directory so just connect to it + + # Connect to the DuckDB database + conn = duckdb.connect(f"{another_pipeline.pipeline_name}.duckdb") + + # Set search path to the dataset + conn.sql(f"SET search_path = '{another_pipeline.dataset_name}'") + + # Describe the dataset + conn.sql("DESCRIBE").df() + return (conn,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You can see: + - `pokemon` table, + + and 3 special `dlt` tables (we will discuss them later): + - `_dlt_loads`, + - `_dlt_pipeline_state`, + - `_dlt_version`. + + Let's execute a query to get all data from the `pokemon` table: + """) + return + + +@app.cell +def _(conn): + # Fetch all data from 'pokemon' as a DataFrame + table = conn.sql("SELECT * FROM pokemon").df() + + # Display the DataFrame + table + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(2) `dlt`'s [sql_client](https://dlthub.com/docs/general-usage/dataset-access/sql-client)** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Most dlt destinations (even filesystem) use an implementation of the `SqlClientBase` class to connect to the physical destination to which your data is loaded. You can access the SQL client of your destination via the `sql_client` method on your pipeline. + + Start a connection to your database with `pipeline.sql_client()` and execute a query to get all data from the `pokemon` table: + """) + return + + +@app.cell +def _(another_pipeline): + # Query data from 'pokemon' using the SQL client + with another_pipeline.sql_client() as client: + with client.execute_query("SELECT * FROM pokemon") as cursor: + data_1 = cursor.df() + # Display the data + data_1 + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(3) dlt [datasets](https://dlthub.com/docs/general-usage/dataset-access/dataset)** + + Here's an example of how to retrieve data from a pipeline and load it into a Pandas DataFrame or a PyArrow Table. + """) + return + + +@app.cell +def _(another_pipeline): + dataset = another_pipeline.dataset() + dataset.pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + # **Exercise 1** + + Using the code from the previous cell, fetch the data from the `pokemon` table into a dataframe and count the number of columns in the table `pokemon`. 
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""**Use this number to answer the question in the Quiz LearnWorlds Form.**""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_1_Quick_start_img2.jpeg](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_1_Quick_start_img2.jpeg)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb b/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb index 6419d4e6c..1a2243b49 100644 --- a/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb +++ b/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb @@ -6,7 +6,7 @@ "id": "qvMyiV0uMY-7" }, "source": [ - "# **dlt sources and resources**: Create first dlt pipeline. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)\n" + "# **dlt sources and resources**: Create your first dlt pipeline [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)\n" ] }, { @@ -24,12 +24,12 @@ "id": "pZCRBANQftVQ" }, "source": [ - "## Recap of [Lesson 1](https://colab.research.google.com/drive/1QwlDWxX5hvwbHMkCgiF0UCzGFRMRoSPY#forceEdit=true&sandboxMode=true) 👩‍💻🚀\n", - "1. Created a pipeline, loaded toy data into DuckDB, and viewed load info.\n", - "2. Used `dlt.pipeline` and `pipeline.run` methods.\n", - "3. Used DuckDB, `sql_client` and dlt `dataset` to view tables and query data.\n", + "## Recap of [Lesson 1](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) 👩‍💻🚀\n", + "1. Created a pipeline, loaded toy data into DuckDB, and viewed the load info.\n", + "2. Used the `dlt.pipeline` and `pipeline.run` methods.\n", + "3. Queried data and viewed tables with DuckDB, the `sql_client`, and the dlt `dataset`.\n", "\n", - "Now we move to the next lesson to learn more details about dlt! 🚀" + "Now, let's move on to the next lesson to learn more! 
🚀" ] }, { @@ -39,18 +39,9 @@ }, "source": [ "**Here, you will learn how to:**\n", - "- Run a simple pipeline with different types of data, such as dataframes, databases and RestAPI.\n", + "- Run a simple pipeline with different types of data, such as dataframes, databases and REST APIs.\n", "- Use `dlt.resource`, `dlt.source` and `dlt.transformer`.\n", - "- Build your first dlt pipeline for RestAPI." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oaLSnDr9hSxE" - }, - "source": [ - "## **Install dlt**" + "- Build your first dlt pipeline for a REST API." ] }, { @@ -142,7 +133,7 @@ "\n", "\n", "# Create a dlt resource from the data\n", - "@dlt.resource(table_name=\"pokemon_new\") # <--- we set new table name\n", + "@dlt.resource(table_name=\"pokemon_new\")\n", "def my_dict_list() -> TDataItems:\n", " yield data" ] @@ -156,8 +147,8 @@ "Commonly used arguments:\n", "\n", "* **`name`**: The resource name and the name of the table generated by this resource. Defaults to the decorated function name.\n", - "* **`table_name`**: the name of the table, if different from the resource name.\n", - "* **`write_disposition`**: controls how to write data to a table. Defaults to the value \"append\"." + "* **`table_name`**: The name of the table, if different from the resource name.\n", + "* **`write_disposition`**: Controls how to write data to a table. Defaults to the value \"append\"." ] }, { @@ -232,7 +223,7 @@ "source": [ "---\n", "### Dataframes\n", - "For creating a pipeline using dataframes, you would do:" + "To create a pipeline using dataframes, you would do:" ] }, { @@ -268,11 +259,9 @@ }, "source": [ "---\n", - "### Database\n", + "### Databases\n", "\n", - "For creating a pipeline from an SQL database query you would:\n", - "\n", - "1. Install the PyMySQL package:" + "To create a pipeline from an SQL database query you would:" ] }, { @@ -293,7 +282,7 @@ "id": "ktAAuuJqW792" }, "source": [ - "2. Create and run a pipeline to fetch data from an SQL resource and query the loaded data as follows:" + "1. Create and run a pipeline to fetch data from an SQL resource and query the loaded data as follows:" ] }, { @@ -458,7 +447,7 @@ "* The source Python module typically contains optional customizations and data transformations.\n", "* The source Python module typically contains the authentication and pagination code for a particular API.\n", "\n", - "Read more about [sources](https://dlthub.com/docs/general-usage/source) and [resources](https://dlthub.com/docs/general-usage/resource) here." + "Read more about [sources](https://dlthub.com/docs/general-usage/source) and [resources](https://dlthub.com/docs/general-usage/resource)." 
] }, { @@ -508,12 +497,12 @@ "outputs": [], "source": [ "# Create a pipeline\n", - "pipeline = dlt.pipeline(\n", + "new_pipeline = dlt.pipeline(\n", " pipeline_name=\"resource_source_new\", destination=\"duckdb\", dataset_name=\"all_data\"\n", ")\n", "\n", "# Run the pipeline\n", - "load_info = pipeline.run(all_data())\n", + "load_info = new_pipeline.run(all_data())\n", "\n", "# Print load info\n", "print(load_info)" @@ -602,8 +591,13 @@ "outputs": [], "source": [ "@dlt.resource(table_name=\"pokemon\")\n", - "def my_dict_list() -> TDataItems:\n", - " yield data" + "def my_pokemons() -> TDataItems:\n", + " pokemons = [\n", + " {\"id\": \"1\", \"name\": \"bulbasaur\", \"size\": {\"weight\": 6.9, \"height\": 0.7}},\n", + " {\"id\": \"4\", \"name\": \"charmander\", \"size\": {\"weight\": 8.5, \"height\": 0.6}},\n", + " {\"id\": \"25\", \"name\": \"pikachu\", \"size\": {\"weight\": 6, \"height\": 0.4}},\n", + " ]\n", + " yield pokemons" ] }, { @@ -623,45 +617,27 @@ }, "outputs": [], "source": [ - "import requests\n", - "\n", - "data = [\n", - " {\"id\": \"1\", \"name\": \"bulbasaur\", \"size\": {\"weight\": 6.9, \"height\": 0.7}},\n", - " {\"id\": \"4\", \"name\": \"charmander\", \"size\": {\"weight\": 8.5, \"height\": 0.6}},\n", - " {\"id\": \"25\", \"name\": \"pikachu\", \"size\": {\"weight\": 6, \"height\": 0.4}},\n", - "]\n", - "\n", - "\n", - "# Define a resource to read and write data to pokemon table\n", - "@dlt.resource(table_name=\"pokemon\")\n", - "def my_dict_list() -> TDataItems:\n", - " yield data\n", - "\n", - "\n", "# Define a transformer to enrich pokemon data with additional details\n", - "@dlt.transformer(data_from=my_dict_list, table_name=\"detailed_info\")\n", + "# NOTE: the `items` argument contains data from the `my_dict_list` resource\n", + "@dlt.transformer(data_from=my_pokemons, table_name=\"detailed_info\")\n", "def poke_details(\n", " items: TDataItems,\n", - ") -> (\n", - " TDataItems\n", - "): # <--- `items` is a variable and contains data from `my_dict_list` resource\n", + ") -> TDataItems:\n", " for item in items:\n", - " print(\n", - " f\"Item: {item}\\n\"\n", - " ) # <-- print what data we get from `my_dict_list` source\n", + " print(f\"Item: {item}\\n\")\n", "\n", " item_id = item[\"id\"]\n", " url = f\"https://pokeapi.co/api/v2/pokemon/{item_id}\"\n", " response = requests.get(url)\n", " details = response.json()\n", "\n", - " print(f\"Details: {details}\\n\") # <--- print what data we get from API\n", + " print(f\"Details: {details}\\n\")\n", "\n", " yield details\n", "\n", "\n", "# Set pipeline name, destination, and dataset name\n", - "pipeline = dlt.pipeline(\n", + "another_pipeline = dlt.pipeline(\n", " pipeline_name=\"quick_start\",\n", " destination=\"duckdb\",\n", " dataset_name=\"pokedata\",\n", @@ -687,7 +663,7 @@ }, "outputs": [], "source": [ - "load_info = pipeline.run(poke_details())\n", + "load_info = another_pipeline.run(poke_details())\n", "print(load_info)" ] }, @@ -709,14 +685,20 @@ "outputs": [], "source": [ "@dlt.resource(table_name=\"pokemon\")\n", - "def my_dict_list() -> TDataItems:\n", - " yield from data # <--- This would yield one item at a time\n", + "def my_other_pokemons() -> TDataItems:\n", + " pokemons = [\n", + " {\"id\": \"1\", \"name\": \"bulbasaur\", \"size\": {\"weight\": 6.9, \"height\": 0.7}},\n", + " {\"id\": \"4\", \"name\": \"charmander\", \"size\": {\"weight\": 8.5, \"height\": 0.6}},\n", + " {\"id\": \"25\", \"name\": \"pikachu\", \"size\": {\"weight\": 6, \"height\": 0.4}},\n", + " ]\n", + " yield from pokemons\n", 
"\n", "\n", - "@dlt.transformer(data_from=my_dict_list, table_name=\"detailed_info\")\n", - "def details(\n", + "# NOTE: Transformer receives one item at a time\n", + "@dlt.transformer(data_from=my_other_pokemons, table_name=\"detailed_info\")\n", + "def other_poke_details(\n", " data_item: TDataItem,\n", - ") -> TDataItems: # <--- Transformer receives one item at a time\n", + ") -> TDataItems:\n", " item_id = data_item[\"id\"]\n", " url = f\"https://pokeapi.co/api/v2/pokemon/{item_id}\"\n", " response = requests.get(url)\n", @@ -725,7 +707,7 @@ " yield details\n", "\n", "\n", - "load_info = pipeline.run(details())\n", + "load_info = another_pipeline.run(other_poke_details())\n", "print(load_info)" ] }, @@ -746,7 +728,8 @@ }, "outputs": [], "source": [ - "load_info = pipeline.run(my_dict_list | details)" + "load_info = another_pipeline.run(my_pokemons | poke_details)\n", + "print(load_info)" ] }, { @@ -767,7 +750,7 @@ "outputs": [], "source": [ "# Query the 'detailed_info' table and convert the result to a DataFrame\n", - "pipeline.dataset().detailed_info.df()" + "another_pipeline.dataset().detailed_info.df()" ] }, { @@ -809,7 +792,7 @@ }, "source": [ "---\n", - "## **Exercise 1: Create a pipeline for GitHub API - repos endpoint**\n", + "## **Exercise 1: Create a pipeline for GitHub API – repos endpoint**\n", "\n", "In this exercise, you'll build a dlt pipeline to fetch data from the GitHub REST API. The goal is to learn how to use `dlt.pipeline`, `dlt.resource`, and `dlt.source` to extract and load data into a destination.\n", "\n", @@ -817,24 +800,24 @@ "\n", "1. **Explore the GitHub API**\n", "\n", - " Visit the [GitHub REST API Docs](https://docs.github.com/en/rest) to understand the endpoint to [list public repositories](https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28) for an organization:\n", + " Visit the [GitHub REST API Docs](https://docs.github.com/en/rest) to understand the endpoint to [list public repositories](https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28) for an organization:\n", "\n", - " GET https://api.github.com/orgs/{org}/repos\n", + " `GET https://api.github.com/orgs/{org}/repos`\n", "\n", - "2. **Build the Pipeline**\n", + "2. **Build the pipeline**\n", "\n", - " Write a script to:\n", + " Write a script to:\n", "\n", - " * Fetch repositories for a **dlt-hub** organization.\n", - " * Use `dlt.resource` to define the data extraction logic.\n", - " * Combine all resources to a single `@dlt.source`.\n", - " * Load the data into a DuckDB database.\n", + " - Fetch repositories for the **dlt-hub** organization.\n", + " - Use `dlt.resource` to define the data extraction logic.\n", + " - Combine all resources into a single `@dlt.source`.\n", + " - Load the data into a DuckDB database.\n", "\n", - "3. **Look at the data**\n", + "3. **Inspect the data**\n", "\n", - " Use `duckdb` connection, `sql_client` or `pipeline.dataset()`.\n", + " Use a `duckdb` connection, `sql_client`, or `pipeline.dataset()`.\n", "\n", - "> **Note**: For this exercise you don't need to use Auth and Pagination." 
+ "> **Note**: For this exercise you don't need to use authentication or pagination.\n" ] }, { @@ -843,7 +826,7 @@ "id": "lcBEFsCUuylN" }, "source": [ - "Play with API using requests library:\n" + "Play with the API using the requests library:\n" ] }, { @@ -853,9 +836,20 @@ "collapsed": true, "id": "Ws7JhfPJvRTa" }, - "outputs": [], + "outputs": [ + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mRunning cells with 'dlt (Python 3.10.0)' requires the ipykernel package.\n", + "\u001b[1;31mInstall 'ipykernel' into the Python environment. \n", + "\u001b[1;31mCommand: '/Users/anuunchinbat/Documents/GitHub/dlt/.venv/bin/python -m pip install ipykernel -U --force-reinstall'" + ] + } + ], "source": [ - "import requests\n", + "from dlt.sources.helpers import requests\n", "\n", "response = requests.get(\"https://api.github.com/orgs/dlt-hub/repos\")\n", "response.json()[0]" @@ -867,7 +861,7 @@ "id": "7PUyt5LAXEMY" }, "source": [ - "In the code snippet below you will find an **example** for the **`events`** endpoint:" + "In the code snippet below, you will find an **example** for the **`events`** endpoint:" ] }, { @@ -889,20 +883,23 @@ " yield response.json()\n", "\n", "\n", - "# here is your code\n", + "print(\"build the `github_repos` resource here\")\n", "\n", "\n", "@dlt.source\n", "def github_data() -> Iterable[DltResource]:\n", - " return (github_events,) # github_repos\n", + " return (github_events,)\n", + "\n", + "\n", + "print(\"return your new resource as part of the source above\")\n", "\n", "\n", "# Set pipeline name, destination, and dataset name\n", - "pipeline = dlt.pipeline(\n", + "github_pipeline = dlt.pipeline(\n", " pipeline_name=\"github_pipeline\", destination=\"duckdb\", dataset_name=\"github_data\"\n", ")\n", "\n", - "load_info = pipeline.run(github_data())\n", + "load_info = github_pipeline.run(github_data())\n", "print(load_info)" ] }, @@ -913,7 +910,7 @@ }, "source": [ "### Question\n", - "How many columns has the `github_repos` table? Use `duckdb` connection, `sql_client` or `pipeline.dataset()`." + "How many columns has the `github_repos` table? Use a `duckdb` connection, `sql_client` or `pipeline.dataset()`." ] }, { @@ -922,14 +919,15 @@ "id": "mYfeMBI82Tg0" }, "source": [ - "## **Exercise 2: Create a pipeline for GitHub API - stargazers endpoint**\n", + "## **Exercise 2: Create a pipeline for the GitHub API – stargazers endpoint**\n", "\n", - "Create a `dlt.transformer` for the \"stargazers\" endpoint\n", - "https://api.github.com/repos/OWNER/REPO/stargazers for `dlt-hub` organization.\n", + "Create a `dlt.transformer` for the **\"stargazers\"** endpoint \n", + "`https://api.github.com/repos/OWNER/REPO/stargazers` for the `dlt-hub` organization.\n", "\n", - "Use `github_repos` resource as a main resource for the transformer:\n", - "1. Get all `dlt-hub` repositories.\n", - "2. Feed these repository names to dlt transformer and get all stargazers for all `dlt-hub` repositories." + "Use the `github_repos` resource as the main resource for the transformer:\n", + "\n", + "1. Get all repositories in the `dlt-hub` organization. \n", + "2. Feed these repository names into the `dlt` transformer and retrieve all stargazers for all `dlt-hub` repositories.\n" ] }, { @@ -940,7 +938,7 @@ }, "outputs": [], "source": [ - "# here is your code" + "print(\"YOUR CODE GOES HERE\")" ] }, { @@ -950,7 +948,7 @@ }, "source": [ "### Question\n", - "How many columns has the `github_stargazer` table? 
Use `duckdb` connection, `sql_client` or `pipeline.dataset()`." + "How many columns has the `github_stargazer` table? Use a `duckdb` connection, `sql_client` or `pipeline.dataset()`." ] }, { @@ -959,7 +957,7 @@ "id": "NYbccmLie1zm" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1-jVNzMJTRYHhbRlXgGFlhMwdML1L9zMx#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)!" ] } ], @@ -970,11 +968,13 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "dlt", + "language": "python", "name": "python3" }, "language_info": { - "name": "python" + "name": "python", + "version": "3.10.0" } }, "nbformat": 4, diff --git a/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.py b/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.py new file mode 100644 index 000000000..dd170f138 --- /dev/null +++ b/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.py @@ -0,0 +1,745 @@ +# /// script +# dependencies = [ +# "dlt", +# "numpy", +# "pandas", +# "pymysql", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""# **dlt sources and resources**: Create your first dlt pipeline [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_2_dlt_sources_and_resources_Create_first_dlt_pipeline_img1.png](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_2_dlt_sources_and_resources_Create_first_dlt_pipeline_img1.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Recap of [Lesson 1](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) 👩‍💻🚀 + 1. Created a pipeline, loaded toy data into DuckDB, and viewed the load info. + 2. Used the `dlt.pipeline` and `pipeline.run` methods. + 3. Queried data and viewed tables with DuckDB, the `sql_client`, and the dlt `dataset`. + + Now, let's move on to the next lesson to learn more! 🚀 + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Here, you will learn how to:** + - Run a simple pipeline with different types of data, such as dataframes, databases and REST APIs. + - Use `dlt.resource`, `dlt.source` and `dlt.transformer`. + - Build your first dlt pipeline for a REST API. 
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **`dlt` resources** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### List of dicts + + + In the previous lesson, we simply used a list of dictionaries that essentially represents the `pokemon` table. + """) + return + + +@app.cell +def _(): + import dlt + + data = [ + {"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}}, + {"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}}, + {"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}}, + ] + # Sample data containing pokemon details + pipeline = dlt.pipeline( + pipeline_name="quick_start", destination="duckdb", dataset_name="mydata" + ) + _load_info = pipeline.run(data, table_name="pokemon") + # Set pipeline name, destination, and dataset name + # Run the pipeline with data and table name + print(_load_info) + return data, dlt, pipeline + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""A better way is to wrap it in the `@dlt.resource` decorator which denotes a logical grouping of data within a data source, typically holding data of similar structure and origin:""" + ) + return + + +@app.cell +def _(data, dlt): + from dlt.common.typing import TDataItems, TDataItem + + @dlt.resource(table_name="pokemon_new") + def my_dict_list() -> TDataItems: + # Create a dlt resource from the data + yield data + return TDataItem, TDataItems, my_dict_list + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Commonly used arguments: + + * **`name`**: The resource name and the name of the table generated by this resource. Defaults to the decorated function name. + * **`table_name`**: The name of the table, if different from the resource name. + * **`write_disposition`**: Controls how to write data to a table. Defaults to the value "append". 
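+
+    A minimal sketch putting these arguments together (the resource name and `write_disposition="replace"` are illustrative choices, not something this lesson requires):
+
+    ```python
+    @dlt.resource(name="pokemon_named", table_name="pokemon_new", write_disposition="replace")
+    def my_named_resource():
+        # same toy data as above; the table is created as `pokemon_new` and fully replaced on each run
+        yield data
+    ```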
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""> **Why is it a better way?** This allows you to use `dlt` functionalities to the fullest that follow Data Engineering best practices, including incremental loading and data contracts.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Try running the pipeline with the `my_dict_list` resource:""") + return + + +@app.cell +def _(my_dict_list, pipeline): + # Run the pipeline and print load info + _load_info = pipeline.run(my_dict_list) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Check what was loaded to the `pokemon_new` table:""") + return + + +@app.cell +def _(pipeline): + pipeline.dataset().pokemon_new.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Instead of a dict list, the data could also be a/an: + - dataframe + - database query response + - API request response + - Anything you can transform into JSON/dict format + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### Dataframes + To create a pipeline using dataframes, you would do: + """) + return + + +@app.cell +def _(TDataItems, dlt, pipeline): + import pandas as pd + + @dlt.resource(table_name="df_data") + # Define a resource to load data from a CSV + def my_df() -> TDataItems: + sample_df = pd.read_csv( + "https://people.sc.fsu.edu/~jburkardt/data/csv/hw_200.csv" + ) + yield sample_df + + _load_info = pipeline.run(my_df) + print(_load_info) + # Run the pipeline with the defined resource + # Query the loaded data from 'df_data' + pipeline.dataset().df_data.df() + return (my_df,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### Databases + + To create a pipeline from an SQL database query you would: + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""1. 
Create and run a pipeline to fetch data from an SQL resource and query the loaded data as follows:""" + ) + return + + +@app.cell +def _(TDataItems, dlt, pipeline): + from sqlalchemy import create_engine + + @dlt.resource(table_name="genome_data") + def get_genome_data() -> TDataItems: + engine = create_engine( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam" + ) + with engine.connect() as conn: + query = "SELECT * FROM genome LIMIT 1000" + rows = conn.execution_options(yield_per=100).exec_driver_sql(query) + yield from map(lambda row: dict(row._mapping), rows) + + _load_info = pipeline.run(get_genome_data) + print(_load_info) + pipeline.dataset().genome_data.df() + return (get_genome_data,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### REST API + + For REST API endpoints, create a pipeline as follows: + """) + return + + +@app.cell +def _(TDataItems, dlt, pipeline): + from dlt.sources.helpers import requests + + @dlt.resource(table_name="pokemon_api") + # Define a resource to fetch pokemons from PokeAPI + def get_pokemon() -> TDataItems: + url = "https://pokeapi.co/api/v2/pokemon" + response = requests.get(url) + yield response.json()["results"] + + _load_info = pipeline.run(get_pokemon) + print(_load_info) + # Run the pipeline using the defined resource + # Query the loaded data from 'pokemon_api' table + pipeline.dataset().pokemon_api.df() + return get_pokemon, requests + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Try loading everything above, in a single pipeline:""") + return + + +@app.cell +def _(get_genome_data, get_pokemon, my_df, pipeline): + _load_info = pipeline.run([my_df, get_genome_data, get_pokemon]) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Check which new tables were created:""") + return + + +@app.cell +def _(pipeline): + # List all table names from the database + with pipeline.sql_client() as client: + with client.execute_query( + "SELECT table_name FROM information_schema.tables" + ) as table: + print(table.df()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **`dlt` sources** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Now that there are multiple `dlt` resources, each corresponding to a separate table, we can group them into a `dlt` source.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_2_dlt_sources_and_resources_Create_first_dlt_pipeline_img2](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_2_dlt_sources_and_resources_Create_first_dlt_pipeline_img2.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + A source is a logical grouping of resources, e.g., endpoints of a single API. The most common approach is to define it in a separate Python module. + + * A source is a function decorated with `@dlt.source` that returns one or more resources. + * A source can optionally define a schema with tables, columns, performance hints, and more. + * The source Python module typically contains optional customizations and data transformations. + * The source Python module typically contains the authentication and pagination code for a particular API. + + Read more about [sources](https://dlthub.com/docs/general-usage/source) and [resources](https://dlthub.com/docs/general-usage/resource). 
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You declare a source by decorating a function that returns or yields one or more resources with `@dlt.source`. + + Here's how it's done: + """) + return + + +@app.cell +def _(dlt, get_genome_data, get_pokemon, my_df): + from typing import Iterable + from dlt.extract import DltResource + + @dlt.source + def all_data() -> Iterable[DltResource]: + return my_df, get_genome_data, get_pokemon + return DltResource, Iterable, all_data + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Only using the source above, load everything into a separate database using a new pipeline:""" + ) + return + + +@app.cell +def _(all_data, dlt): + # Create a pipeline + new_pipeline = dlt.pipeline( + pipeline_name="resource_source_new", + destination="duckdb", + dataset_name="all_data", + ) + _load_info = new_pipeline.run(all_data()) + # Run the pipeline + # Print load info + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + > **Why does this matter?**: + - It is more efficient than running your resources separately. + - It organizes both your schema and your code. 🙂 + - It enables the option for parallelization. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **`dlt` transformers** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + We now know that `dlt` resources can be grouped into a `dlt` source, represented as: + + + ``` + Source + / \ + Resource 1 ... Resource N + + ``` + + However, imagine a scenario where you need an additional step in between: + + ``` + Source + / \ + step \ + / \ + Resource 1 ... Resource N + + ``` + + This step could arise, for example, in a situation where: + + - Resource 1 returns a list of pokemons IDs, and you need to use each of those IDs to retrieve detailed information about the pokemons from a separate API endpoint. + + In such cases, you would use `dlt` transformers — special `dlt` resources that can be fed data from another resource: + + ``` + Source + / \ + Transformer \ + / \ + Resource 1 ... Resource N + + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Let’s assume Resource 1 is:""") + return + + +@app.cell +def _(TDataItems, dlt): + @dlt.resource(table_name="pokemon") + def my_pokemons() -> TDataItems: + pokemons = [ + {"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}}, + {"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}}, + {"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}}, + ] + yield pokemons + return (my_pokemons,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""We need to get detailed information about pokemons from [PokeAPI](https://pokeapi.co/) `"https://pokeapi.co/api/v2/pokemon/{id}"` based on their IDs. 
We would do:""" + ) + return + + +@app.cell +def _(TDataItems, dlt, my_pokemons, requests): + # Define a transformer to enrich pokemon data with additional details + # NOTE: the `items` argument contains data from the `my_dict_list` resource + @dlt.transformer(data_from=my_pokemons, table_name="detailed_info") + def poke_details( + items: TDataItems, + ) -> TDataItems: + for item in items: + print(f"Item: {item}\n") + + item_id = item["id"] + url = f"https://pokeapi.co/api/v2/pokemon/{item_id}" + response = requests.get(url) + details = response.json() + + print(f"Details: {details}\n") + + yield details + + # Set pipeline name, destination, and dataset name + another_pipeline = dlt.pipeline( + pipeline_name="quick_start", + destination="duckdb", + dataset_name="pokedata", + dev_mode=True, + ) + return another_pipeline, poke_details + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Run the pipeline:""") + return + + +@app.cell +def _(another_pipeline, poke_details): + _load_info = another_pipeline.run(poke_details()) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Alternatively, we could do:""") + return + + +@app.cell +def _(TDataItem, TDataItems, another_pipeline, dlt, requests): + @dlt.resource(table_name="pokemon") + def my_other_pokemons() -> TDataItems: + pokemons = [ + {"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}}, + {"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}}, + {"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}}, + ] + yield from pokemons + + @dlt.transformer(data_from=my_other_pokemons, table_name="detailed_info") + def other_poke_details(data_item: TDataItem) -> TDataItems: + item_id = data_item["id"] + url = f"https://pokeapi.co/api/v2/pokemon/{item_id}" + response = requests.get(url) + # NOTE: Transformer receives one item at a time + details = response.json() + yield details + + _load_info = another_pipeline.run(other_poke_details()) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""You can also use pipe instead of `data_from`, this is useful when you want to apply `dlt.transformer` to multiple `dlt.resources`:""" + ) + return + + +@app.cell +def _(another_pipeline, my_pokemons, poke_details): + _load_info = another_pipeline.run(my_pokemons | poke_details) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Check the loaded data:""") + return + + +@app.cell +def _(another_pipeline): + # Query the 'detailed_info' table and convert the result to a DataFrame + another_pipeline.dataset().detailed_info.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Reduce the nesting level of generated tables** + You can limit how deep dlt goes when generating nested tables and flattening dicts into columns. By default, the library will descend and generate nested tables for all nested lists, without limit. + + You can set nesting level for all resources on the source level: + + ```python + @dlt.source(max_table_nesting=1) + def all_data(): + return my_df, get_genome_data, get_pokemon + ``` + + or for each resource separately: + + ```python + @dlt.resource(table_name='pokemon_new', max_table_nesting=1) + def my_dict_list(): + yield data + ``` + + In the example above, we want only 1 level of nested tables to be generated (so there are no nested tables of a nested table). 
Typical settings: + + * `max_table_nesting=0` will not generate nested tables and will not flatten dicts into columns at all. All nested data will be represented as JSON. + * `max_table_nesting=1` will generate nested tables of root tables and nothing more. All nested data in nested tables will be represented as JSON. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Exercise 1: Create a pipeline for GitHub API – repos endpoint** + + In this exercise, you'll build a dlt pipeline to fetch data from the GitHub REST API. The goal is to learn how to use `dlt.pipeline`, `dlt.resource`, and `dlt.source` to extract and load data into a destination. + + ## Instructions + + 1. **Explore the GitHub API** + + Visit the [GitHub REST API Docs](https://docs.github.com/en/rest) to understand the endpoint to [list public repositories](https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28) for an organization: + + `GET https://api.github.com/orgs/{org}/repos` + + 2. **Build the pipeline** + + Write a script to: + + - Fetch repositories for the **dlt-hub** organization. + - Use `dlt.resource` to define the data extraction logic. + - Combine all resources into a single `@dlt.source`. + - Load the data into a DuckDB database. + + 3. **Inspect the data** + + Use a `duckdb` connection, `sql_client`, or `pipeline.dataset()`. + + > **Note**: For this exercise you don't need to use authentication or pagination. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Play with the API using the requests library:""") + return + + +@app.cell +def _(requests): + response = requests.get("https://api.github.com/orgs/dlt-hub/repos") + response.json()[0] + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""In the code snippet below, you will find an **example** for the **`events`** endpoint:""" + ) + return + + +@app.cell +def _(DltResource, Iterable, TDataItems, dlt, requests): + @dlt.resource + def github_events() -> TDataItems: + url = "https://api.github.com/orgs/dlt-hub/events" + response = requests.get(url) + yield response.json() + + print("build the `github_repos` resource here") + + @dlt.source + def github_data() -> Iterable[DltResource]: + return (github_events,) + + print("return your new resource as part of the source above") + github_pipeline = dlt.pipeline( + pipeline_name="github_pipeline", + destination="duckdb", + dataset_name="github_data", + ) + _load_info = github_pipeline.run(github_data()) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Question + How many columns has the `github_repos` table? Use a `duckdb` connection, `sql_client` or `pipeline.dataset()`. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Exercise 2: Create a pipeline for the GitHub API – stargazers endpoint** + + Create a `dlt.transformer` for the **"stargazers"** endpoint + `https://api.github.com/repos/OWNER/REPO/stargazers` for the `dlt-hub` organization. + + Use the `github_repos` resource as the main resource for the transformer: + + 1. Get all repositories in the `dlt-hub` organization. + 2. Feed these repository names into the `dlt` transformer and retrieve all stargazers for all `dlt-hub` repositories. + """) + return + + +@app.cell +def _(): + print("YOUR CODE GOES HERE") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Question + How many columns has the `github_stargazer` table? Use a `duckdb` connection, `sql_client` or `pipeline.dataset()`. 
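+
+    One possible way to check the column count (a sketch; it assumes your pipeline object is called `pipeline` and that the stargazers data landed in a table called `github_stargazer`, as in the question above):
+
+    ```python
+    # load the table into a DataFrame and count its columns
+    df = pipeline.dataset().github_stargazer.df()
+    print(len(df.columns))
+    ```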
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb b/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb index bebb4334e..5c8e4c6d3 100644 --- a/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb +++ b/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb @@ -6,13 +6,13 @@ "id": "MfQUdpVg2Trs" }, "source": [ - "# **Recap of [Lesson 2](https://colab.research.google.com/drive/1tc94GvIoYXmYrjUibDhY_9iPR5zA0Eyw#forceEdit=true&sandboxMode=true) 👩‍💻🚀**\n", + "# **Recap of [Lesson 2](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) 👩‍💻🚀**\n", "\n", - "1. Used `@dlt.resource` to load and query data like lists, dataframes, and REST API responses into DuckDB. \n", - "2. Grouped multiple resources into a single `@dlt.source` for better organization and efficiency. \n", - "3. Used `@dlt.transformer` to process and enrich data between resources. \n", + "1. Used `@dlt.resource` to load and query data such as lists, dataframes, and REST API responses into DuckDB. \n", + "2. Grouped multiple resources into a single `@dlt.source` for better organization and efficiency. \n", + "3. Used `@dlt.transformer` to process and enrich data between resources. \n", "\n", - "Next: Dive deeper into building dlt pipelines using pagination, authentication and dlt configuration! 🚀" + "Next: We'll dive deeper into building dlt pipelines using pagination, authentication, and dlt configuration! 
🚀" ] }, { @@ -23,16 +23,16 @@ "source": [ "---\n", "\n", - "# **Pagination & Authentication & dlt Configuration** 🤫🔩 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)\n", + "# **Pagination & Authentication & dlt Configuration** 🤫🔩 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb)\n", "\n", "\n", "\n", - "**Here, you will learn how to:**\n", - "- Use pagination for RestAPIs.\n", - "- Use environment variables to handle both secrets & configs.\n", + "**In this lesson, you will learn how to:**\n", + "- Use pagination for REST APIs.\n", + "- Use environment variables to manage both secrets & configs.\n", "- Add values to `secrets.toml` or `config.toml`.\n", "\n", - "To read more about credentails refer to [dlt documentation](https://dlthub.com/docs/general-usage/credentials/) here." + "To learn more about credentials, refer to the [dlt documentation](https://dlthub.com/docs/general-usage/credentials/)." ] }, { @@ -41,7 +41,7 @@ "id": "aAN9q0Kz0tt_" }, "source": [ - "In previous lesson we loaded data from GitHub API to DuckDB," + "In the previous lesson, we loaded data from the GitHub API to DuckDB," ] }, { @@ -53,7 +53,7 @@ "outputs": [], "source": [ "%%capture\n", - "!pip install dlt" + "!pip install \"dlt[duckdb]\"" ] }, { @@ -78,14 +78,14 @@ "\n", "\n", "# define dlt pipeline\n", - "pipeline = dlt.pipeline(destination=\"duckdb\")\n", + "_pipeline = dlt.pipeline(destination=\"duckdb\")\n", "\n", "# run dlt pipeline\n", - "load_info = pipeline.run(github_events)\n", + "load_info = _pipeline.run(github_events)\n", "print(load_info)\n", "\n", "# explore loaded data\n", - "pipeline.dataset().github_events.df()" + "_pipeline.dataset().github_events.df()" ] }, { @@ -94,9 +94,9 @@ "id": "GtyMwBig37uK" }, "source": [ - "You could notice that we received only 1 page, only 30 records. But this endpoint has muuuch more records in total. To get all the pages you should use a pagination.\n", + "You may notice we received only one page — just 30 records — even though this endpoint has many more.\n", "\n", - "When working with APIs like GitHub, data is often returned in pages. Pagination allows you to retrieve all the data when an endpoint limits how much can be fetched at once." + "To fetch everything, enable pagination: many APIs (like GitHub) return results in pages and limit how much you can retrieve per request, so paginating lets you iterate through all pages to collect the full dataset." 
] }, { @@ -114,14 +114,16 @@ "id": "BolhMQE10Zgk" }, "source": [ + "---\n", "## **Pagination**\n", "\n", - "GitHub has very good documentation, so it is not difficult to go through the documentation and find the relevant page: [Pagination.](https://docs.github.com/en/rest/using-the-rest-api/using-pagination-in-the-rest-api?apiVersion=2022-11-28)\n", + "GitHub provides excellent documentation, making it easy to find the relevant section on [Pagination.](https://docs.github.com/en/rest/using-the-rest-api/using-pagination-in-the-rest-api?apiVersion=2022-11-28)\n", "\n", - "It says:\n", - ">You can use the `link` header from the response to request additional pages of data.\n", + "It explains that:\n", "\n", - ">The link header contains URLs that you can use to fetch additional pages of results. For example, the previous, next, first, and last page of results." + ">You can use the `Link` header from the response to request additional pages of data.\n", + "\n", + ">The `Link` header contains URLs that let you fetch other pages of results — for example, the previous, next, first, and last pages." ] }, { @@ -130,7 +132,7 @@ "id": "iU-xQriAHJI2" }, "source": [ - "**GitHub API Pagination example**\n", + "**GitHub API Pagination Example**\n", "\n", "The GitHub API provides the `per_page` and `page` query parameters:\n", "\n", @@ -146,8 +148,6 @@ }, "outputs": [], "source": [ - "import requests\n", - "\n", "response = requests.get(\"https://api.github.com/orgs/dlt-hub/events?per_page=10&page=1\")\n", "response.headers" ] @@ -158,7 +158,7 @@ "id": "ZdDGuAVJ4Qqo" }, "source": [ - "Gotcha! We can see 'Link' in the headers. To get this link we can alternatively use `response.links`:" + "Got it! We can see the `Link` field in the response headers. Alternatively, you can access it directly using `response.links`:" ] }, { @@ -169,8 +169,6 @@ }, "outputs": [], "source": [ - "import requests\n", - "\n", "response = requests.get(\"https://api.github.com/orgs/dlt-hub/events?per_page=10&page=1\")\n", "response.links" ] @@ -183,18 +181,17 @@ "source": [ "### **dlt RESTClient**\n", "\n", - "The response includes a 'Link' header for navigating to the next page.\n", - "So now we can implement a pagination!\n", + "Now that we know how pagination works conceptually, let’s see how to implement it efficiently!\n", "\n", - "When working with APIs, you could implement pagination using only Python and the requests library. While this approach works, it often requires writing boilerplate code for tasks like managing authentication, constructing URLs, and handling pagination logic.\n", + "When working with APIs, you could implement pagination using only Python and the `requests` library. 
While this approach works, it often requires writing boilerplate code for tasks like managing authentication, constructing URLs, and handling pagination logic.\n", "\n", - "More about how to build pagination with Python and `requests`:\n", + "Learn more about building pagination with Python and `requests`:\n", "\n", "* [Link 1](https://farnamdata.com/api-pagination)\n", "\n", "* [Link 2](https://www.klamp.io/blog/python-requests-pagination-for-efficient-data-retrieval)\n", "\n", - "**But!** In this lesson, we’re gonna use dlt's **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** to handle pagination seamlessly when working with REST APIs like GitHub.\n", + "**But!** In this lesson, we’re going to use dlt's **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** to handle pagination seamlessly when working with REST APIs like GitHub.\n", "\n", "\n", "**Why use RESTClient?**\n", @@ -208,9 +205,9 @@ "This reduces boilerplate code and lets you focus on your data pipeline logic.\n", "\n", "**Here’s how to fetch paginated data:**\n", - "1. Import RESTClient\n", - "2. Create the RESTClient instance\n", - "3. Use the `paginate` method to iterate through all pages of data." + "1. Import `RESTClient`\n", + "2. Create a `RESTClient` instance\n", + "3. Use the `paginate` method to iterate through all pages of data" ] }, { @@ -238,7 +235,7 @@ "id": "yNB8jyz5Kmo1" }, "source": [ - "Pagination type was detected automatically, but you can explicitly provide it:" + "☝️ The pagination type was detected automatically, but you can also specify it explicitly:" ] }, { @@ -249,7 +246,6 @@ }, "outputs": [], "source": [ - "from dlt.sources.helpers.rest_client import RESTClient\n", "from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n", "\n", "client = RESTClient(\n", @@ -264,7 +260,7 @@ "id": "_jNBmv1qkUhk" }, "source": [ - "The full list of available paginators you can see in offcial [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#paginators).\n" + "The full list of available paginators is in the official [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#paginators).\n" ] }, { @@ -278,13 +274,11 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Dqi7NQtqhfeb" - }, + "metadata": {}, "source": [ - "The events endpoint does not have as much data, specially if you compare it with the stargazers endpoint for the dlt repo.\n", + "The events endpoint doesn’t contain as much data, especially compared to the stargazers endpoint of the dlt repository.\n", "\n", - "If you run the pipeline for stargazers endpoint, there is a high chance that you face the **rate limit error**." + "If you run the pipeline for the stargazers endpoint, there's a high chance that you'll face a **rate limit error**." 
] }, { @@ -295,13 +289,6 @@ }, "outputs": [], "source": [ - "from dlt.sources.helpers.rest_client import RESTClient\n", - "\n", - "\n", - "client = RESTClient(\n", - " base_url=\"https://api.github.com\",\n", - ")\n", - "\n", "for page in client.paginate(\"repos/dlt-hub/dlt/stargazers\"):\n", " print(page)" ] @@ -324,24 +311,22 @@ "id": "iKUgNTKuiP6w" }, "source": [ + "---\n", "## **Authentication**\n", "\n", - "To avoid this error you can use [GitHub API Authentication](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28):\n", + "To avoid the **rate limit error** you can use [GitHub API Authentication](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28):\n", "\n", "1. Login to your GitHub account.\n", - "2. Generate [API token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) (classic one!).\n", - "2. Use it as an access token for GitHub API." + "2. Generate an [API token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) (classic).\n", + "2. Use it as an access token for the GitHub API." ] }, { "cell_type": "markdown", - "metadata": { - "id": "-7ZHBjYspQxt" - }, + "metadata": {}, "source": [ - "**! ATTENTION !**\n", - "\n", - "Never share your credentials in public and never hard-code them in your code. Use **environment variables** or **dlt secrets.toml**." + "> **! ATTENTION !**\n", + "> Never share your credentials publicly and never hard-code them in your code. Use **environment variables, files** or dlt's **secrets.toml**." ] }, { @@ -350,11 +335,18 @@ "id": "UB02kiI8ncYm" }, "source": [ - "Create an environment variable for your access token.\n", + "Create an environment variable for your access token in Colab.\n", "\n", "![Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img3](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img3.webp)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In Molab, simply click on the `Secrets` section in the left-side menu and add your access token." + ] + }, { "cell_type": "code", "execution_count": null, @@ -375,7 +367,7 @@ "id": "6bdNZJ0HqY4O" }, "source": [ - "So now you can use `access_token` variable in the code below:" + "Use the `access_token` variable in the code below:" ] }, { @@ -386,13 +378,12 @@ }, "outputs": [], "source": [ - "from dlt.sources.helpers.rest_client import RESTClient\n", "from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n", "\n", "\n", "client = RESTClient(\n", " base_url=\"https://api.github.com\",\n", - " auth=BearerTokenAuth(token=access_token), # <--- put your token here\n", + " auth=BearerTokenAuth(token=access_token),\n", ")\n", "\n", "for page in client.paginate(\"repos/dlt-hub/dlt/stargazers\"):\n", @@ -406,7 +397,7 @@ "id": "D7-rTvYvr05t" }, "source": [ - "So now we can rewrite our GitHub dlt pipeline using the RestAPI Client and `access_token`." + "Let's rewrite our GitHub dlt pipeline using the RestAPI Client and the `access_token`." 
] }, { @@ -418,7 +409,6 @@ "outputs": [], "source": [ "import dlt\n", - "from dlt.sources.helpers import requests\n", "from dlt.sources.helpers.rest_client import RESTClient\n", "from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n", "\n", @@ -435,16 +425,16 @@ "\n", "\n", "# define new dlt pipeline\n", - "pipeline = dlt.pipeline(destination=\"duckdb\")\n", + "_pipeline = dlt.pipeline(destination=\"duckdb\")\n", "\n", "\n", "# run the pipeline with the new resource\n", - "load_info = pipeline.run(github_stargazers)\n", + "load_info = _pipeline.run(github_stargazers)\n", "print(load_info)\n", "\n", "\n", "# explore loaded data\n", - "pipeline.dataset().github_stargazers.df()" + "_pipeline.dataset().github_stargazers.df()" ] }, { @@ -462,6 +452,7 @@ "id": "SxpBIZZ_yE8R" }, "source": [ + "---\n", "## **dlt configuration and secrets**\n", "\n", "In dlt, [configurations and secrets](https://dlthub.com/docs/general-usage/credentials/) are essential for setting up data pipelines.\n", @@ -470,15 +461,13 @@ "\n", "On the other hand, **secrets** are **sensitive** data like passwords, API keys, and private keys, which should never be hard-coded to avoid security risks.\n", "\n", - "These can be set up in various ways:\n", + "Both can be set up in various ways:\n", "\n", - "* Environment variables\n", + "* As environment variables\n", "* Within code using `dlt.secrets` and `dlt.config`\n", - "* Configuration files (`secrets.toml` and `config.toml`)\n", + "* Via configuration files (`secrets.toml` and `config.toml`)\n", "\n", - "We're gonna use `dlt.secrets.value` to define credentials in resources and sources. dlt automatically **extracts** configuration settings and secrets based on flexible naming conventions. It then **injects** these values where needed in code.\n", - "\n", - "**Note**: It's important to note that while you can put all configurations and credentials in the `dlt.secrets` (or `secrets.toml`) if it's more convenient, credentials cannot be placed in `dlt.config` (or `config.toml`) because dlt doesn't look for them there.\n" + "> **Note**: While you can store both configurations and credentials in `dlt.secrets` (or `secrets.toml`) if that’s more convenient, credentials cannot be placed in `dlt.config` (or `config.toml`) because dlt does not read them from there." ] }, { @@ -487,9 +476,9 @@ "id": "64JM2Lnlxyoa" }, "source": [ - "Let's create dlt pipeline for both endpoints: `repos/dlt-hub/dlt/stargazers` and `orgs/dlt-hub/events`.\n", + "Let's create a dlt pipeline for both endpoints: `repos/dlt-hub/dlt/stargazers` and `orgs/dlt-hub/events`.\n", "\n", - "We'll use `@dlt.source` to combine all resources in one place." + "We'll use `@dlt.source` to group both resources." ] }, { @@ -533,7 +522,7 @@ "id": "0h3ugsRiLhfv" }, "source": [ - "Now we'll use `dlt.secrets.value` in our source to enable dlt secrets configuration. Rename `access_token` variable to `secret_key` because it's already defined.\n", + "Now, we'll use `dlt.secrets.value` in our source, enabling dlt's automatic secrets resolution. 
Note that we first reset all environment variables to demonstrate what happens if dlt tries to resolve a non-existing variable:\n", "\n" ] }, @@ -545,7 +534,7 @@ }, "outputs": [], "source": [ - "exit() # we use exit() to reset all ENVs we set" + "os.environ.clear()" ] }, { @@ -559,7 +548,6 @@ "from typing import Iterable\n", "import dlt\n", "from dlt.extract import DltResource\n", - "from dlt.sources.helpers import requests\n", "from dlt.sources.helpers.rest_client import RESTClient\n", "from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n", "from dlt.common.typing import TDataItems\n", @@ -568,7 +556,7 @@ "@dlt.source\n", "def github_source(\n", " access_token=dlt.secrets.value,\n", - ") -> Iterable[DltResource]: # <--- set the secret variable \"access_token\" here\n", + ") -> Iterable[DltResource]:\n", " client = RESTClient(\n", " base_url=\"https://api.github.com\", auth=BearerTokenAuth(token=access_token)\n", " )\n", @@ -592,7 +580,7 @@ "id": "H-wNVUqfuD37" }, "source": [ - "Configs are defined in a similar way but are accessed using `dlt.config.value`. However, since configuration variables are internally managed by `dlt`, it is unlikely that you would need to explicitly use `dlt.config.value` in most cases." + "> Configs are defined in a similar way but are accessed using `dlt.config.value`. However, since configuration variables are internally managed by `dlt`, it is unlikely that you would need to explicitly use `dlt.config.value` in most cases." ] }, { @@ -601,7 +589,7 @@ "id": "shfeHo-vOcD1" }, "source": [ - "If you run the pipeline with `secret_key` as `dlt.secrets.value`, you will see the following error:" + "If you now run the pipeline, you will see the following error:" ] }, { @@ -613,11 +601,11 @@ "outputs": [], "source": [ "# define new dlt pipeline\n", - "pipeline = dlt.pipeline(destination=\"duckdb\")\n", + "_pipeline = dlt.pipeline(destination=\"duckdb\")\n", "\n", "\n", "# run the pipeline with the new resource\n", - "load_info = pipeline.run(github_source())\n", + "load_info = _pipeline.run(github_source())\n", "print(load_info)" ] }, @@ -627,9 +615,9 @@ "id": "GCmqzzo7OpgE" }, "source": [ - "^ That is what happens if you set `dlt.secrets.value` for any variable in your dlt pipeline, but don't set the secret value up.\n", + "That’s what happens when you use `dlt.secrets.value` for a variable in your pipeline but haven’t actually set the secret value.\n", "\n", - "dlt is looking for secrets in following formats:\n", + "When this occurs, dlt searches for the missing secret across different possible locations and naming formats, as shown below:\n", "\n", "```python\n", "ConfigFieldMissingException: Following fields are missing: ['access_token'] in configuration with spec GithubSourceConfiguration\n", @@ -654,10 +642,10 @@ "id": "Ox08B2V5NCaH" }, "source": [ - "To define `access_token` secret value we can use:\n", + "To define the `access_token` secret value, we can use (as mentioned earlier):\n", "\n", "1. `dlt.secrets` in code (recommended for secret vaults or dynamic creds)\n", - "2. Environment variables (recomnended for prod)\n", + "2. Environment variables (recommended for prod)\n", "3. `secrets.toml` file (recommended for local dev)" ] }, @@ -669,7 +657,7 @@ "source": [ "### **Use `dlt.secrets` in code**\n", "\n", - "You can easily rewrite your secret right in the Python code. It's especially convenient if you take credentials from third-party secret providers, or if you want to update credentials and configs dinamically." 
+ "You can easily set or update your secrets directly in Python code. This is especially convenient when retrieving credentials from third-party secret managers or when you need to update secrets and configurations dynamically." ] }, { @@ -680,17 +668,15 @@ }, "outputs": [], "source": [ - "import os\n", "from google.colab import userdata\n", "\n", "dlt.secrets[\"access_token\"] = userdata.get(\"SECRET_KEY\")\n", "\n", "# define new dlt pipeline\n", - "pipeline = dlt.pipeline(destination=\"duckdb\")\n", - "\n", + "github_pipeline = dlt.pipeline(destination=\"duckdb\")\n", "\n", "# run the pipeline with the new resource\n", - "load_info = pipeline.run(github_source())\n", + "load_info = github_pipeline.run(github_source())\n", "print(load_info)" ] }, @@ -700,7 +686,7 @@ "id": "GNghaiYwSBGm" }, "source": [ - "Alternatively you can set:\n", + "Alternatively, you can set:\n", "\n", "```python\n", "dlt.secrets[\"sources.access_token\"] = userdata.get('SECRET_KEY')\n", @@ -748,7 +734,7 @@ "id": "Adi1RZmOvVzj" }, "source": [ - "### **Exercise 2: Run pipeline with `dlt.secrets.value`**\n", + "### **Exercise 2: Run a pipeline with `dlt.secrets.value`**\n", "\n", "Explore the cells above and answer the question below using `sql_client`.\n", "\n", @@ -763,10 +749,9 @@ "id": "fQlOIe46ncYm" }, "source": [ - "---\n", "### **Use environment variables**\n", "\n", - "Let's set ENV in the one of the dlt formats: `ACCESS_TOKEN`.\n" + "Let's explicitly set the environment variable for our access token in one of the formats dlt accepts: `ACCESS_TOKEN`.\n" ] }, { @@ -777,17 +762,16 @@ }, "outputs": [], "source": [ - "import os\n", "from google.colab import userdata\n", "\n", "os.environ[\"ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")\n", "\n", "# define new dlt pipeline\n", - "pipeline = dlt.pipeline(destination=\"duckdb\")\n", + "_pipeline = dlt.pipeline(destination=\"duckdb\")\n", "\n", "\n", "# run the pipeline with the new resource\n", - "load_info = pipeline.run(github_source())\n", + "load_info = _pipeline.run(github_source())\n", "print(load_info)" ] }, @@ -797,7 +781,9 @@ "id": "ppEFU1hJPU6c" }, "source": [ - "Alternatively you can set:\n", + "Alternatively, you can set:\n", + "\n", + "> `userdata.get()` is Colab-specific.\n", "\n", "```python\n", "os.environ[\"SOURCES__ACCESS_TOKEN\"] = userdata.get('SECRET_KEY')\n", @@ -831,7 +817,6 @@ "id": "l7Y1oCAvJ79I" }, "source": [ - "---\n", "### **Use dlt `secrets.toml` or `config.toml`**\n" ] }, @@ -841,7 +826,7 @@ "id": "mNzCp5BGpDSh" }, "source": [ - "> Please note that Colab is not well-suited for using `secrets.toml` or `config.toml` files. As a result, these sections will provide instructions rather than code cells, detailing how to use them in a local environment. You should test this functionality on your own machine. For Colab, it is recommended to use environment variables instead." + "> Note that Colab is not well-suited for using `secrets.toml` or `config.toml` files. As a result, these sections will provide instructions rather than code cells, detailing how to use them in a local environment. You should test this functionality on your own machine. For Colab, it is recommended to use environment variables instead." ] }, { @@ -862,7 +847,7 @@ "└── my_pipeline.py\n", "```\n", "\n", - "Read more about adding [credentials](https://dlthub.com/docs/walkthroughs/add_credentials) here." + "Read more about adding [credentials](https://dlthub.com/docs/walkthroughs/add_credentials)." 
] }, { @@ -871,7 +856,7 @@ "id": "6bTyl229sadQ" }, "source": [ - "To set credentials via TOMLs you would first add your access token to `secrets.toml`:\n", + "To set credentials via the toml files, you would first add your access token to `secrets.toml`:\n", "\n", "```toml\n", "# .dlt/secrets.toml\n", @@ -889,13 +874,13 @@ }, "source": [ "\n", - "Alternatively you can set:\n", + "Alternatively, you can set:\n", "\n", "```\n", "[sources]\n", "secret_key = \"your_access_token\"\n", "```\n", - "is equal to:\n", + "which is equal to:\n", "\n", "```\n", "secret_key = \"your_access_token\"\n", @@ -907,7 +892,7 @@ "[sources.____main____]\n", "secret_key = \"your_access_token\"\n", "```\n", - "and to:\n", + "as well as:\n", "\n", "```\n", "[sources.____main____.github_source]\n", @@ -922,11 +907,11 @@ }, "source": [ "\n", - "### **Configure Secrets in Colab**\n", + "### **Configure secrets in Colab**\n", "\n", - "You can configure secrets using **Secrets** sidebar. Just create a variable with the name `secrets.toml` and paste the content of the toml file from your `.dlt` folder into it. We support `config.toml` variable as well.\n", + "You can configure secrets using the **Secrets** sidebar. Just create a variable with the name `secrets.toml` and paste the content of the toml file from your `.dlt` folder into it. We support `config.toml` variable as well.\n", "\n", - "Open **Secrets** sidebar, press \"Add new secret\", create variable with name `secrets.toml` and copy-paste secrets in Value field and Enable it:\n", + "Open the **Secrets** sidebar, press `Add new secret`, create a variable with name `secrets.toml` and copy-paste secrets in the `Value` field and click `Enable`:\n", "\n", "```\n", "[sources]\n", @@ -934,7 +919,7 @@ "```\n", "\n", "\n", - ">dlt will not reload the secrets automatically. **Please restart your interpreter** in Colab options when you add/change content of the variables above." + ">dlt will not reload the secrets automatically. **Restart your interpreter** in Colab options when you add/change the variables above." ] }, { @@ -952,17 +937,8 @@ "id": "NYbccmLie1zm" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1mfqZulsuFDc7h27d6joe2_Dduvl1uM-2#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)!" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_7dLATtZkdQl" - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.py b/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.py new file mode 100644 index 000000000..89d6012e6 --- /dev/null +++ b/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.py @@ -0,0 +1,773 @@ +# /// script +# dependencies = [ +# "dlt[duckdb]", +# "numpy", +# "pandas", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Recap of [Lesson 2](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) 👩‍💻🚀** + + 1. Used `@dlt.resource` to load and query data such as lists, dataframes, and REST API responses into DuckDB. + 2. 
Grouped multiple resources into a single `@dlt.source` for better organization and efficiency. + 3. Used `@dlt.transformer` to process and enrich data between resources. + + Next: We'll dive deeper into building dlt pipelines using pagination, authentication, and dlt configuration! 🚀 + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + # **Pagination & Authentication & dlt Configuration** 🤫🔩 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) + + + + **In this lesson, you will learn how to:** + - Use pagination for REST APIs. + - Use environment variables to manage both secrets & configs. + - Add values to `secrets.toml` or `config.toml`. + + To learn more about credentials, refer to the [dlt documentation](https://dlthub.com/docs/general-usage/credentials/). + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""In the previous lesson, we loaded data from the GitHub API to DuckDB,""" + ) + return + + +@app.cell +def _(): + import dlt + from dlt.sources.helpers import requests + from dlt.common.typing import TDataItems + + @dlt.resource + # define dlt resources + def github_events() -> TDataItems: + url = "https://api.github.com/orgs/dlt-hub/events" + _response = requests.get(url) + yield _response.json() + + _pipeline = dlt.pipeline(destination="duckdb") + _load_info = _pipeline.run(github_events) + print(_load_info) + # define dlt pipeline + # run dlt pipeline + # explore loaded data + _pipeline.dataset().github_events.df() + return TDataItems, dlt, requests + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You may notice we received only one page — just 30 records — even though this endpoint has many more. + + To fetch everything, enable pagination: many APIs (like GitHub) return results in pages and limit how much you can retrieve per request, so paginating lets you iterate through all pages to collect the full dataset. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img1.webp)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Pagination** + + GitHub provides excellent documentation, making it easy to find the relevant section on [Pagination.](https://docs.github.com/en/rest/using-the-rest-api/using-pagination-in-the-rest-api?apiVersion=2022-11-28) + + It explains that: + + >You can use the `Link` header from the response to request additional pages of data. + + >The `Link` header contains URLs that let you fetch other pages of results — for example, the previous, next, first, and last pages. 
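+
+    For illustration only (the URLs below are made up), a `Link` header looks roughly like this:
+
+    ```
+    Link: <https://api.github.com/organizations/.../events?per_page=10&page=2>; rel="next",
+          <https://api.github.com/organizations/.../events?per_page=10&page=10>; rel="last"
+    ```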
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **GitHub API Pagination Example** + + The GitHub API provides the `per_page` and `page` query parameters: + + * `per_page`: The number of records per page (up to 100). + * `page`: The page number to retrieve. + """) + return + + +@app.cell +def _(requests): + _response = requests.get( + "https://api.github.com/orgs/dlt-hub/events?per_page=10&page=1" + ) + _response.headers + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Got it! We can see the `Link` field in the response headers. Alternatively, you can access it directly using `response.links`:""" + ) + return + + +@app.cell +def _(requests): + _response = requests.get( + "https://api.github.com/orgs/dlt-hub/events?per_page=10&page=1" + ) + _response.links + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **dlt RESTClient** + + Now that we know how pagination works conceptually, let’s see how to implement it efficiently! + + When working with APIs, you could implement pagination using only Python and the `requests` library. While this approach works, it often requires writing boilerplate code for tasks like managing authentication, constructing URLs, and handling pagination logic. + + Learn more about building pagination with Python and `requests`: + + * [Link 1](https://farnamdata.com/api-pagination) + + * [Link 2](https://www.klamp.io/blog/python-requests-pagination-for-efficient-data-retrieval) + + **But!** In this lesson, we’re going to use dlt's **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** to handle pagination seamlessly when working with REST APIs like GitHub. + + + **Why use RESTClient?** + + RESTClient is part of dlt's helpers, making it easier to interact with REST APIs by managing repetitive tasks such as: + + * Authentication + * Query parameter handling + * Pagination + + This reduces boilerplate code and lets you focus on your data pipeline logic. + + **Here’s how to fetch paginated data:** + 1. Import `RESTClient` + 2. Create a `RESTClient` instance + 3. Use the `paginate` method to iterate through all pages of data + """) + return + + +@app.cell +def _(): + from dlt.sources.helpers.rest_client import RESTClient + + client = RESTClient(base_url="https://api.github.com") + for _page in client.paginate("orgs/dlt-hub/events"): + print(_page) + return (RESTClient,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""☝️ The pagination type was detected automatically, but you can also specify it explicitly:""" + ) + return + + +@app.cell +def _(RESTClient): + from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator + + client_1 = RESTClient( + base_url="https://api.github.com", paginator=HeaderLinkPaginator() + ) + return (client_1,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""The full list of available paginators is in the official [dlt documentation](https://dlthub.com/docs/general-usage/http/rest-client#paginators).""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img2](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img2.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The events endpoint doesn’t contain as much data, especially compared to the stargazers endpoint of the dlt repository. 
+ + If you run the pipeline for the stargazers endpoint, there's a high chance that you'll face a **rate limit error**. + """) + return + + +@app.cell +def _(client_1): + for _page in client_1.paginate("repos/dlt-hub/dlt/stargazers"): + print(_page) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Exercise 1: Pagination with RESTClient** + Explore the cells above and answer the question below. + #### Question + What type of pagination should we use for the GitHub API? + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Authentication** + + To avoid the **rate limit error** you can use [GitHub API Authentication](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28): + + 1. Login to your GitHub account. + 2. Generate an [API token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) (classic). + 2. Use it as an access token for the GitHub API. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + > **! ATTENTION !** + > Never share your credentials publicly and never hard-code them in your code. Use **environment variables, files** or dlt's **secrets.toml**. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Create an environment variable for your access token in Colab. + + ![Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img3](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img3.webp) + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""In Molab, simply click on the `Secrets` section in the left-side menu and add your access token.""" + ) + return + + +@app.cell +def _(): + import os + + access_token = os.getenv("SECRET_KEY") + return access_token, os + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Use the `access_token` variable in the code below:""") + return + + +@app.cell +def _(RESTClient, access_token): + from dlt.sources.helpers.rest_client.auth import BearerTokenAuth + + client_2 = RESTClient( + base_url="https://api.github.com", auth=BearerTokenAuth(token=access_token) + ) + for _page in client_2.paginate("repos/dlt-hub/dlt/stargazers"): + print(_page) + break + return (BearerTokenAuth,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Let's rewrite our GitHub dlt pipeline using the RestAPI Client and the `access_token`.""" + ) + return + + +@app.cell +def _(BearerTokenAuth, RESTClient, TDataItems, access_token, dlt): + @dlt.resource + def github_stargazers() -> TDataItems: + client = RESTClient( + base_url="https://api.github.com", auth=BearerTokenAuth(token=access_token) + ) + for _page in client.paginate("repos/dlt-hub/dlt/stargazers"): + yield _page + + _pipeline = dlt.pipeline(destination="duckdb") + _load_info = _pipeline.run(github_stargazers) + print(_load_info) + _pipeline.dataset().github_stargazers.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""You can see that all dlt [stargazers](https://github.com/dlt-hub/dlt/stargazers) were loaded into the DuckDB destination.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **dlt configuration and secrets** + + In dlt, [configurations and secrets](https://dlthub.com/docs/general-usage/credentials/) are essential for setting up data pipelines. 
+ + **Configurations** are **non-sensitive** settings that define the behavior of a data pipeline, including file paths, database hosts, timeouts, API URLs, and performance settings. + + On the other hand, **secrets** are **sensitive** data like passwords, API keys, and private keys, which should never be hard-coded to avoid security risks. + + Both can be set up in various ways: + + * As environment variables + * Within code using `dlt.secrets` and `dlt.config` + * Via configuration files (`secrets.toml` and `config.toml`) + + > **Note**: While you can store both configurations and credentials in `dlt.secrets` (or `secrets.toml`) if that’s more convenient, credentials cannot be placed in `dlt.config` (or `config.toml`) because dlt does not read them from there. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Let's create a dlt pipeline for both endpoints: `repos/dlt-hub/dlt/stargazers` and `orgs/dlt-hub/events`. + + We'll use `@dlt.source` to group both resources. + """) + return + + +@app.cell +def _(BearerTokenAuth, RESTClient, TDataItems, access_token, dlt): + from typing import Iterable + from dlt.extract import DltResource + + @dlt.source + def github_source() -> Iterable[DltResource]: + client = RESTClient( + base_url="https://api.github.com", auth=BearerTokenAuth(token=access_token) + ) + + @dlt.resource + def github_events() -> TDataItems: + for _page in client.paginate("orgs/dlt-hub/events"): + yield _page + + @dlt.resource + def github_stargazers() -> TDataItems: + for _page in client.paginate("repos/dlt-hub/dlt/stargazers"): + yield _page + + return (github_events, github_stargazers) + return DltResource, Iterable + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Now, we'll use `dlt.secrets.value` in our source, enabling dlt's automatic secrets resolution. Note that we first reset all environment variables to demonstrate what happens if dlt tries to resolve a non-existing variable:""" + ) + return + + +@app.cell +def _(os): + os.environ.clear() + return + + +@app.cell +def _(BearerTokenAuth, DltResource, Iterable, RESTClient, TDataItems, dlt): + @dlt.source + def github_source_1(access_token=dlt.secrets.value) -> Iterable[DltResource]: + client = RESTClient( + base_url="https://api.github.com", auth=BearerTokenAuth(token=access_token) + ) + + @dlt.resource + def github_events() -> TDataItems: + for _page in client.paginate("orgs/dlt-hub/events"): + yield _page + + @dlt.resource + def github_stargazers() -> TDataItems: + for _page in client.paginate("repos/dlt-hub/dlt/stargazers"): + yield _page + + return (github_events, github_stargazers) + return (github_source_1,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""> Configs are defined in a similar way but are accessed using `dlt.config.value`. However, since configuration variables are internally managed by `dlt`, it is unlikely that you would need to explicitly use `dlt.config.value` in most cases.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""If you now run the pipeline, you will see the following error:""") + return + + +@app.cell +def _(dlt, github_source_1): + _pipeline = dlt.pipeline(destination="duckdb") + _load_info = _pipeline.run(github_source_1()) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + That’s what happens when you use `dlt.secrets.value` for a variable in your pipeline but haven’t actually set the secret value. 
+ + When this occurs, dlt searches for the missing secret across different possible locations and naming formats, as shown below: + + ```python + ConfigFieldMissingException: Following fields are missing: ['access_token'] in configuration with spec GithubSourceConfiguration + for field "access_token" config providers and keys were tried in following order: + In Environment Variables key DLT_COLAB_KERNEL_LAUNCHER__SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN was not found. + In Environment Variables key DLT_COLAB_KERNEL_LAUNCHER__SOURCES____MAIN____ACCESS_TOKEN was not found. + In Environment Variables key DLT_COLAB_KERNEL_LAUNCHER__SOURCES__ACCESS_TOKEN was not found. + In Environment Variables key DLT_COLAB_KERNEL_LAUNCHER__ACCESS_TOKEN was not found. + In Environment Variables key SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN was not found. + In Environment Variables key SOURCES____MAIN____ACCESS_TOKEN was not found. + In Environment Variables key SOURCES__ACCESS_TOKEN was not found. + In Environment Variables key ACCESS_TOKEN was not found. + WARNING: dlt looks for .dlt folder in your current working directory and your cwd (/content) is different from directory of your pipeline script (/usr/local/lib/python3.10/dist-packages). + If you keep your secret files in the same folder as your pipeline script but run your script from some other folder, secrets/configs will not be found + Please refer to https://dlthub.com/docs/general-usage/credentials for more information + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + To define the `access_token` secret value, we can use (as mentioned earlier): + + 1. `dlt.secrets` in code (recommended for secret vaults or dynamic creds) + 2. Environment variables (recommended for prod) + 3. `secrets.toml` file (recommended for local dev) + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Use `dlt.secrets` in code** + + You can easily set or update your secrets directly in Python code. This is especially convenient when retrieving credentials from third-party secret managers or when you need to update secrets and configurations dynamically. + """) + return + + +@app.cell +def _(dlt, github_source_1, os): + dlt.secrets["access_token"] = os.getenv("SECRET_KEY") + github_pipeline = dlt.pipeline(destination="duckdb") + _load_info = github_pipeline.run(github_source_1()) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Alternatively, you can set: + + ```python + dlt.secrets["sources.access_token"] = userdata.get('SECRET_KEY') + dlt.secrets["sources.____main____.access_token"] = userdata.get('SECRET_KEY') + dlt.secrets["sources.____main____.github_source.access_token"] = userdata.get('SECRET_KEY') + ... + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + * `sources` is a special word; + + * `__main__` is a python module name; + + * `github_source` is the resource name; + + * `access_token` is the secret variable name. + + + So dlt looks for secrets according to this hierarchy: + ``` + pipeline_name + | + |-sources + | + |- + | + |- + | + |- secret variable 1 + |- secret variable 2 + ``` + + To keep the **naming convention** flexible, dlt looks for a lot of **possible combinations** of key names, starting from the most specific possible path. Then, if the value is not found, it removes the right-most section and tries again. 
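A simplified sketch of that fallback order (illustrative only — this is not dlt's implementation, and dlt additionally tries pipeline-name-prefixed variants first, as visible in the error trace above):

```python
def candidate_env_keys(sections: list[str], field: str) -> list[str]:
    # Build environment-variable candidates from most to least specific:
    # drop the right-most section on each pass, keeping the field name last.
    candidates = []
    for i in range(len(sections), -1, -1):
        parts = [s.upper() for s in sections[:i]] + [field.upper()]
        candidates.append("__".join(parts))
    return candidates


print(candidate_env_keys(["sources", "__main__", "github_source"], "access_token"))
# ['SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN',
#  'SOURCES____MAIN____ACCESS_TOKEN',
#  'SOURCES__ACCESS_TOKEN',
#  'ACCESS_TOKEN']
```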
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Exercise 2: Run a pipeline with `dlt.secrets.value`** + + Explore the cells above and answer the question below using `sql_client`. + + #### Question + + Who has id=`17202864` in the `stargazers` table? Use `sql_client`. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Use environment variables** + + Let's explicitly set the environment variable for our access token in one of the formats dlt accepts: `ACCESS_TOKEN`. + """) + return + + +@app.cell +def _(dlt, github_source_1, os): + os.environ["ACCESS_TOKEN"] = os.getenv("SECRET_KEY") + _pipeline = dlt.pipeline(destination="duckdb") + _load_info = _pipeline.run(github_source_1()) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Alternatively, you can set: + + > `userdata.get()` is Colab-specific. + + ```python + os.environ["SOURCES__ACCESS_TOKEN"] = userdata.get('SECRET_KEY') + os.environ["SOURCES____MAIN____ACCESS_TOKEN"] = userdata.get('SECRET_KEY') + os.environ["SOURCES____MAIN____GITHUB_SOURCE__ACCESS_TOKEN"] = userdata.get('SECRET_KEY') + ... + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **How does it work?** + + `dlt` **automatically extracts** configuration settings and secrets based on flexible naming conventions. + + It then **injects** these values where needed in functions decorated with `@dlt.source`, `@dlt.resource`, or `@dlt.destination`. + + + >dlt uses a specific naming hierarchy to search for the secrets and config values. This makes configurations and secrets easy to manage. + > + > The naming convention for **environment variables** in dlt follows a specific pattern. All names are **capitalized** and sections are separated with **double underscores** __ , e.g. `SOURCES____MAIN____GITHUB_SOURCE__SECRET_KEY`. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **Use dlt `secrets.toml` or `config.toml`**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""> Note that Colab is not well-suited for using `secrets.toml` or `config.toml` files. As a result, these sections will provide instructions rather than code cells, detailing how to use them in a local environment. You should test this functionality on your own machine. For Colab, it is recommended to use environment variables instead.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The `secrets.toml` file - along with the `config.toml` file - should be stored in the `.dlt` directory where your pipeline code is located: + + ``` + /your_project_directory + │ + ├── .dlt + │ ├── secrets.toml + │ └── config.toml + │ + └── my_pipeline.py + ``` + + Read more about adding [credentials](https://dlthub.com/docs/walkthroughs/add_credentials). 
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + To set credentials via the toml files, you would first add your access token to `secrets.toml`: + + ```toml + # .dlt/secrets.toml + + [sources] + secret_key = "your_access_token" + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Alternatively, you can set: + + ``` + [sources] + secret_key = "your_access_token" + ``` + which is equal to: + + ``` + secret_key = "your_access_token" + ``` + + and to: + + ``` + [sources.____main____] + secret_key = "your_access_token" + ``` + as well as: + + ``` + [sources.____main____.github_source] + secret_key = "your_access_token" + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Configure secrets in Colab** + + You can configure secrets using the **Secrets** sidebar. Just create a variable with the name `secrets.toml` and paste the content of the toml file from your `.dlt` folder into it. We support `config.toml` variable as well. + + Open the **Secrets** sidebar, press `Add new secret`, create a variable with name `secrets.toml` and copy-paste secrets in the `Value` field and click `Enable`: + + ``` + [sources] + secret_key = "your_access_token" + ``` + + + >dlt will not reload the secrets automatically. **Restart your interpreter** in Colab options when you add/change the variables above. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img4](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_3_Pagination_%26_Authentication_%26_dlt_Configuration_img4.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb b/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb index 2f0fb99f2..6ab2546ba 100644 --- a/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb +++ b/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb @@ -6,14 +6,14 @@ "id": "yTmIgQKpV355" }, "source": [ - "# **Recap of [Lesson 3](https://colab.research.google.com/drive/1-jVNzMJTRYHhbRlXgGFlhMwdML1L9zMx#forceEdit=true&sandboxMode=true) 👩‍💻🚀**\n", + "# **Recap of [Lesson 3](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) 👩‍💻🚀**\n", "\n", - "1. Used pagination for RestAPIs.\n", - "2. Used authentication for RestAPIs.\n", - "3. Tried dlt RESTClient.\n", - "4. Used environment variables to handle both secrets & configs.\n", - "5. Learned how to add values to `secrets.toml` or `config.toml`.\n", - "6. Used `secrets.toml` ENV variable special for Colab." + "1. Used pagination with REST APIs. \n", + "2. Applied authentication for REST APIs. \n", + "3. Tried the dlt `RESTClient`. \n", + "4. Used environment variables to manage secrets and configuration. \n", + "5. Learned how to add values to `secrets.toml` and `config.toml`. \n", + "6. Used the special `secrets.toml` environment variable setup for Colab." 
] }, { @@ -23,22 +23,21 @@ }, "source": [ "---\n", - "# **`dlt`’s pre-built Sources and Destinations** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)\n", - "\n", + "# **`dlt`’s pre-built Sources and Destinations** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb)\n", "\n", "\n", "**Here, you will learn:**\n", - "- How to initialize verified sources;\n", - "- Built-in `rest_api` source.\n", - "- Built-in `sql_database` source.\n", - "- Built-in `filesystem` source.\n", + "- How to initialize verified sources.\n", + "- The built-in `rest_api` source.\n", + "- The built-in `sql_database` source.\n", + "- The built-in `filesystem` source.\n", "- How to switch between destinations.\n", "\n", "---\n", "\n", - "Our verified sources are the simplest way to get started with building your stack. Choose from any of our fully customizable 30+ pre-built sources, such as any SQL database, Google Sheets, Salesforce and others.\n", + "Our verified sources are the simplest way to start building your stack. Choose from any of our fully customizable 30+ pre-built sources, such as SQL databases, Google Sheets, Salesforce, and more.\n", "\n", - "With our numerous destinations you can load data to a local database, warehouse or a data lake. Choose from Snowflake, Databricks and more." + "With our numerous destinations, you can load data into a local database, data warehouse, or data lake. Choose from Snowflake, Databricks, and many others." ] }, { @@ -76,15 +75,6 @@ "```" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "cNs9mHKaEaTE" - }, - "source": [ - "### Step 0: Install dlt" - ] - }, { "cell_type": "code", "execution_count": null, @@ -125,7 +115,7 @@ "source": [ "This command shows all available verified sources and their short descriptions. For each source, it checks if your local `dlt` version requires an update and prints the relevant warning.\n", "\n", - "Consider an example of a pipeline for the GitHub API:\n", + "Consider an example pipeline for the GitHub API:\n", "\n", "```\n", "Available dlt single file templates:\n", @@ -144,7 +134,7 @@ "\n", "### Step 1. Initialize the source\n", "\n", - "This command will initialize the pipeline example with GitHub API as the source and DuckBD as the destination:" + "This command will initialize the pipeline example with the GitHub API as the source and DuckBD as the destination:" ] }, { @@ -165,10 +155,11 @@ }, "source": [ "Now, check your files on the left side bar. 
It should contain all the necessary files to run your GitHub API -> DuckDB pipeline:\n", - "* `.dlt` folder for `secrets.toml` and `config.toml`;\n", - "* pipeline script `github_api_pipeline.py`;\n", - "* requirements.txt;\n", - "* `.gitignore`." + "\n", + "- The `.dlt` folder containing `secrets.toml` and `config.toml`\n", + "- The pipeline script `github_api_pipeline.py`\n", + "- `requirements.txt`\n", + "- `.gitignore`" ] }, { @@ -193,7 +184,7 @@ "- Adjust the pipeline script as needed\n", "- Run the pipeline script\n", "\n", - "> In certain cases, you can adjust the verified source code." + "> If needed, you can adjust the verified source code." ] }, { @@ -213,7 +204,8 @@ "id": "Rr3RWZSHcnSs" }, "source": [ - "From the code we can see that this pipeline loads **only \"issues\" endpoint**, you can adjust this code as you wish: add new endpoints, add additional logic, add transformations, etc." + "From the code, we can see that this pipeline loads **only the `\"issues\"` endpoint**. \n", + "You can adjust this code as needed: add new endpoints, include additional logic, apply transformations, and more." ] }, { @@ -224,9 +216,10 @@ "source": [ "### Step 2. Add credentials\n", "\n", - "In Colab is more convenient to use ENVs. In the previous lesson you learned how to configure dlt resource via environment variable.\n", + "In Colab (or Molab), it is more convenient to use environment variables or `dlt.secrets`.\n", + "\n", + "In the pipeline above, the `access_token` parameter is set to `dlt.secrets.value`, which means you need to configure this variable:\n", "\n", - "In the pipeline above we can see that `access_token` variable is `dlt.secrets.value`, it means we should configure this variable.\n", "\n", "```python\n", "@dlt.resource(write_disposition=\"replace\")\n", @@ -243,10 +236,10 @@ }, "outputs": [], "source": [ - "import os\n", + "import dlt\n", "from google.colab import userdata\n", "\n", - "os.environ[\"SOURCES__ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")" + "dlt.secrets[\"SOURCES__ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")" ] }, { @@ -284,13 +277,13 @@ "id": "imvWv_2Cbumt" }, "source": [ - "From the pipeline output we can take pipeline information like pipeline_name, dataset_name, destination path, etc.\n", + "From the pipeline output, we can get information such as the pipeline name, dataset name, destination path, and more.\n", "\n", - "\n", - "> Pipeline **github_api_pipeline** load step completed in 1.23 seconds\n", - "1 load package(s) were loaded to destination duckdb and into dataset **github_api_data**\n", - "The duckdb destination used duckdb:////content/**github_api_pipeline.duckdb** location to store data\n", - "Load package 1733848559.8195539 is LOADED and contains no failed jobs\n" + "> Pipeline **github_api_pipeline** load step completed in 1.23 seconds \n", + "> 1 load package was loaded to the DuckDB destination and into the dataset **github_api_data**. \n", + "> The DuckDB destination used `duckdb:////content/**github_api_pipeline.duckdb**` as the storage location. \n", + "> Load package `1733848559.8195539` is **LOADED** and contains no failed jobs.\n", + "\n" ] }, { @@ -301,7 +294,7 @@ "source": [ "## Step 4: Explore your data\n", "\n", - "Let's explore what tables were created in duckdb." + "Let's explore what tables were created in the destination." 
] }, { @@ -348,18 +341,18 @@ "source": [ "## **[RestAPI source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic)**\n", "\n", - "`rest_api` is a generic source that you can use to create a `dlt` source from a REST API using a declarative configuration. The majority of REST APIs behave in a similar way; this `dlt` source attempts to provide a declarative way to define a `dlt` source for those APIs.\n", + "`rest_api` is a generic source that lets you create a `dlt` source from any REST API using a declarative configuration. Since most REST APIs follow similar patterns, this source provides a convenient way to define your integration declaratively.\n", "\n", - "Using a [declarative configuration](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic#source-configuration), you can define:\n", + "Using a [declarative configuration](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic#source-configuration), you can specify:\n", "\n", "- the API endpoints to pull data from,\n", "- their relationships,\n", "- how to handle pagination,\n", "- authentication.\n", "\n", - "dlt will take care of the rest: **unnesting the data, inferring the schema**, etc., and **writing to the destination**\n", + "`dlt` handles the rest for you: **unnesting the data, inferring the schema**, and **writing it to the destination**.\n", "\n", - "In previous lesson you've already met Rest API Client. `dlt`’s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low level abstraction** that powers the REST API Source." + "In the previous lesson, you already used the REST API Client. `dlt`’s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low-level abstraction** that powers the RestAPI source.\n" ] }, { @@ -368,8 +361,9 @@ "id": "SqoKS0mNdFOd" }, "source": [ - "### Initialize `rest_api` template\n", - "You can initialize `rest_api` **template** using `init` command:" + "### Initialize the `rest_api` template\n", + "\n", + "You can initialize the `rest_api` **template** using the `init` command:\n" ] }, { @@ -389,15 +383,13 @@ "id": "MJ89LnH91GQh" }, "source": [ - "In the `rest_api_pipeline.py` script you will find sources for GitHub API and for PokeAPI, which were defined using `rest_api` source and `RESTAPIConfig`.\n", - "\n", - "Since the `rest_api` source is a **built-in source**, you don't have to initialize it. You can **import** it from `dlt.sources` and use it immediately.\n", + "In the `rest_api_pipeline.py` script, you will find sources for both the GitHub API and the PokeAPI, defined using the `rest_api` source and `RESTAPIConfig`.\n", "\n", + "Since the `rest_api` source is a **built-in source**, you don't need to initialize it. 
You can simply **import** it from `dlt.sources` and start using it.\n", "\n", "### Example\n", "\n", - "Here's a simplified example of how to configure the REST API source to load `issues` and issue `comments` from GitHub API:\n", - "\n" + "Here is a simplified example of how to configure the REST API source to load `issues` and issue `comments` from the GitHub API:\n" ] }, { @@ -416,13 +408,11 @@ " \"client\": {\n", " \"base_url\": \"https://api.github.com\",\n", " \"auth\": {\n", - " \"token\": dlt.secrets[\n", - " \"sources.access_token\"\n", - " ], # <--- we already configured access_token above\n", + " \"token\": dlt.secrets[\"sources.access_token\"],\n", " },\n", - " \"paginator\": \"header_link\", # <---- set up paginator type\n", + " \"paginator\": \"header_link\",\n", " },\n", - " \"resources\": [ # <--- list resources\n", + " \"resources\": [\n", " {\n", " \"name\": \"issues\",\n", " \"endpoint\": {\n", @@ -433,40 +423,32 @@ " },\n", " },\n", " {\n", - " \"name\": \"issue_comments\", # <-- here we declare dlt.transformer\n", + " \"name\": \"issue_comments\",\n", " \"endpoint\": {\n", " \"path\": \"repos/dlt-hub/dlt/issues/{issue_number}/comments\",\n", " \"params\": {\n", " \"issue_number\": {\n", - " \"type\": (\n", - " \"resolve\"\n", - " ), # <--- use type 'resolve' to resolve {issue_number} for transformer\n", + " \"type\": (\"resolve\"),\n", " \"resource\": \"issues\",\n", " \"field\": \"number\",\n", " },\n", " },\n", " },\n", " },\n", - " {\n", - " \"name\": \"contributors\",\n", - " \"endpoint\": {\n", - " \"path\": \"repos/dlt-hub/dlt/contributors\",\n", - " },\n", - " },\n", " ],\n", "}\n", "\n", "github_source = rest_api_source(config)\n", "\n", "\n", - "pipeline = dlt.pipeline(\n", + "rest_api_pipeline = dlt.pipeline(\n", " pipeline_name=\"rest_api_github\",\n", " destination=\"duckdb\",\n", " dataset_name=\"rest_api_data\",\n", " dev_mode=True,\n", ")\n", "\n", - "load_info = pipeline.run(github_source)\n", + "load_info = rest_api_pipeline.run(github_source)\n", "print(load_info)" ] }, @@ -478,7 +460,7 @@ }, "outputs": [], "source": [ - "pipeline.dataset().issues.df()" + "rest_api_pipeline.dataset().issues.df()" ] }, { @@ -487,12 +469,12 @@ "id": "mQuK4l23c8Of" }, "source": [ - "### **Exercise 1: Run rest_api source**\n", + "### **Exercise 1: Run `rest_api` source**\n", "\n", "Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.\n", "\n", - "#### Question\n", - "How many columns has the `issues` table?" + "#### **Question**\n", + "How many columns does the `issues` table have?" ] }, { @@ -501,15 +483,16 @@ "id": "UTKIM2ntOIrh" }, "source": [ - "### **Exercise 2: Create dlt source with rest_api**\n", + "### **Exercise 2: Create a dlt source with `rest_api`**\n", "\n", - "Add `contributors` endpoint for dlt repository to the `rest_api` configuration:\n", - "- resource name is \"contributors\"\n", - "- endpoint path : \"repos/dlt-hub/dlt/contributors\"\n", - "- no parameters\n", + "Add the `contributors` endpoint for the `dlt` repository to the `rest_api` configuration:\n", "\n", - "#### Question\n", - "How many columns has the `contributors` table?" 
+ "- Resource name: **\"contributors\"**\n", + "- Endpoint path: **\"repos/dlt-hub/dlt/contributors\"**\n", + "- No parameters\n", + "\n", + "#### **Question**\n", + "How many columns does the `contributors` table have?\n" ] }, { @@ -536,9 +519,9 @@ "id": "bHcBOhgVdmZH" }, "source": [ - "### Initialize `sql_database` template\n", + "### Initialize the `sql_database` template\n", "\n", - "Initialize dlt template for `sql_database` using `init` command:" + "Initialize the `dlt` template for `sql_database` using the `init` command:\n" ] }, { @@ -569,9 +552,9 @@ "source": [ "### Example\n", "\n", - "The example below will show you how you can use dlt to load data from a SQL Database (PostgreSQL, MySQL, SQLight, Oracle, IBM DB2, etc.) into destination.\n", + "The example below shows how you can use dlt to load data from a SQL database (PostgreSQL, MySQL, SQLite, Oracle, IBM DB2, etc.) into a destination.\n", "\n", - "To make it easy to reproduce, we will be loading data from the [public MySQL RFam database](https://docs.rfam.org/en/latest/database.html) into a local DuckDB instance." + "To make it easy to reproduce, we will load data from the [public MySQL Rfam database](https://docs.rfam.org/en/latest/database.html) into a local DuckDB instance." ] }, { @@ -582,6 +565,7 @@ }, "outputs": [], "source": [ + "%%capture\n", "!pip install pymysql" ] }, @@ -595,21 +579,21 @@ "source": [ "from dlt.sources.sql_database import sql_database\n", "\n", - "source = sql_database(\n", + "sql_source = sql_database(\n", " \"mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam\",\n", " table_names=[\n", " \"family\",\n", " ],\n", ")\n", "\n", - "pipeline = dlt.pipeline(\n", + "sql_db_pipeline = dlt.pipeline(\n", " pipeline_name=\"sql_database_example\",\n", " destination=\"duckdb\",\n", " dataset_name=\"sql_data\",\n", " dev_mode=True,\n", ")\n", "\n", - "load_info = pipeline.run(source)\n", + "load_info = sql_db_pipeline.run(sql_source)\n", "print(load_info)" ] }, @@ -619,11 +603,11 @@ "id": "pjyJyF4Ofyuu" }, "source": [ - "### **Exercise 3: Run sql_database source**\n", + "### **Exercise 3: Run `sql_database` source**\n", "\n", "Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.\n", "\n", - "#### Question\n", + "#### **Question**\n", "How many columns does the `family` table have?" 
] }, @@ -671,9 +655,9 @@ "id": "HfLjS_raUH9G" }, "source": [ - "### Initialize `filesystem` template\n", + "### Initialize the `filesystem` template\n", "\n", - "Initialize dlt template for `filesystem` using `init` command:" + "Initialize the dlt template for `filesystem` using the `init` command:\n" ] }, { @@ -715,7 +699,19 @@ }, "outputs": [], "source": [ - "!mkdir -p local_data && wget -O local_data/userdata.parquet https://www.timestored.com/data/sample/userdata.parquet" + "import os\n", + "import requests\n", + "\n", + "folder_name = \"local_data\"\n", + "os.makedirs(folder_name, exist_ok=True)\n", + "full_path = os.path.abspath(folder_name)\n", + "\n", + "url = \"https://www.timestored.com/data/sample/userdata.parquet\"\n", + "resp = requests.get(url)\n", + "resp.raise_for_status()\n", + "\n", + "with open(f\"{full_path}/userdata.parquet\", \"wb\") as f:\n", + " f.write(resp.content)" ] }, { @@ -729,14 +725,12 @@ "import dlt\n", "from dlt.sources.filesystem import filesystem, read_parquet\n", "\n", - "filesystem_resource = filesystem(\n", - " bucket_url=\"/content/local_data\", file_glob=\"**/*.parquet\"\n", - ")\n", + "filesystem_resource = filesystem(bucket_url=full_path, file_glob=\"**/*.parquet\")\n", "filesystem_pipe = filesystem_resource | read_parquet()\n", "\n", "# We load the data into the table_name table\n", - "pipeline = dlt.pipeline(pipeline_name=\"my_pipeline\", destination=\"duckdb\")\n", - "load_info = pipeline.run(filesystem_pipe.with_name(\"userdata\"))\n", + "fs_pipeline = dlt.pipeline(pipeline_name=\"my_pipeline\", destination=\"duckdb\")\n", + "load_info = fs_pipeline.run(filesystem_pipe.with_name(\"userdata\"))\n", "print(load_info)" ] }, @@ -746,12 +740,12 @@ "id": "0jzeZeINEzQb" }, "source": [ - "### **Exercise 4: Run filesystem source**\n", + "### **Exercise 4: Run `filesystem` source**\n", "\n", "Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`.\n", "\n", - "#### Question\n", - "How many columns does the `userdata` table have?" + "#### **Question**\n", + "How many columns does the `userdata` table have?\n" ] }, { @@ -760,7 +754,8 @@ "id": "o4SGNHSkF7_Y" }, "source": [ - "How to configure **Cloud Storage** you can read in the official [dlt documentation](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/basic#configuration)." + "You can read how to configure **Cloud Storage** in the official \n", + "[dlt documentation](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/basic#configuration).\n" ] }, { @@ -769,9 +764,7 @@ "id": "M03Zc9l7Y6Ue" }, "source": [ - "# **Built-in Destinations**\n", - "\n", - "https://dlthub.com/docs/dlt-ecosystem/destinations/" + "# [**Built-in Destinations**](https://dlthub.com/docs/dlt-ecosystem/destinations/)\n" ] }, { @@ -797,9 +790,12 @@ "id": "BWAnIbicE4XC" }, "source": [ - "TBH this is a matter of simply going through the [documentation](https://dlthub.com/docs/dlt-ecosystem/destinations/) 👀, but to sum it up:\n", - "- Most likely the destination where you want to load data is already a `dlt` integration that undergoes several hundred automated tests every day.\n", - "- If not, you can simply define a custom destination and still be able to benefit from most `dlt`-specific features. FYI, custom destinations will be covered in the next Advanced course, so we expect you to come back for the second part..." 
+ "To be honest, this is simply a matter of going through the \n", + "[documentation](https://dlthub.com/docs/dlt-ecosystem/destinations/) 👀, but to sum it up:\n", + "\n", + "- Most likely, the destination where you want to load data is already a `dlt` integration that undergoes several hundred automated tests every day.\n", + "- If not, you can define a custom destination and still benefit from most `dlt`-specific features. \n", + " *FYI: custom destinations will be covered in the next Advanced course — so we expect you to come back for part two…*\n" ] }, { @@ -810,7 +806,7 @@ "source": [ "## **Choosing a destination**\n", "\n", - "Switching between destinations in dlt is incredibly straightforward—simply modify the `destination` parameter in your pipeline configuration. For example:" + "Switching between destinations in `dlt` is incredibly straightforward. Simply modify the `destination` parameter in your pipeline configuration. For example:" ] }, { @@ -821,17 +817,19 @@ }, "outputs": [], "source": [ - "pipeline = dlt.pipeline(\n", + "data_pipeline = dlt.pipeline(\n", " pipeline_name=\"data_pipeline\",\n", - " destination=\"duckdb\", # <--- to test pipeline locally\n", + " destination=\"duckdb\",\n", " dataset_name=\"data\",\n", ")\n", + "print(data_pipeline.destination.destination_type)\n", "\n", - "pipeline = dlt.pipeline(\n", + "data_pipeline = dlt.pipeline(\n", " pipeline_name=\"data_pipeline\",\n", - " destination=\"bigquery\", # <--- to run pipeline in production\n", + " destination=\"bigquery\",\n", " dataset_name=\"data\",\n", - ")" + ")\n", + "print(data_pipeline.destination.destination_type)" ] }, { @@ -869,7 +867,7 @@ "source": [ "import os\n", "\n", - "os.environ[\"BUCKET_URL\"] = \"/content\"" + "os.environ[\"BUCKET_URL\"] = \"./content\"" ] }, { @@ -902,13 +900,11 @@ "\n", "pipeline = dlt.pipeline(\n", " pipeline_name=\"fs_pipeline\",\n", - " destination=\"filesystem\", # <--- change destination to 'filesystem'\n", + " destination=\"filesystem\",\n", " dataset_name=\"fs_data\",\n", ")\n", "\n", - "load_info = pipeline.run(\n", - " source, loader_file_format=\"parquet\"\n", - ") # <--- choose a file format: parquet, csv or jsonl\n", + "load_info = pipeline.run(source, loader_file_format=\"parquet\")\n", "print(load_info)" ] }, @@ -929,7 +925,7 @@ }, "outputs": [], "source": [ - "! ls fs_data/family" + "! 
ls ./content/fs_data/family" ] }, { @@ -991,7 +987,7 @@ "load_info = pipeline.run(\n", " source,\n", " loader_file_format=\"parquet\",\n", - " table_format=\"iceberg\", # <--- choose a table format: delta or iceberg\n", + " table_format=\"iceberg\",\n", ")\n", "print(load_info)" ] @@ -1004,9 +1000,9 @@ "source": [ "**Note:**\n", "\n", - "Open source version of dlt supports basic functionality for **iceberg**, but the dltHub team is currently working on an **extended** and **more powerful** integration with iceberg.\n", + "The open-source version of dlt supports basic functionality for **Iceberg**, but the dltHub team is currently working on an **extended** and **more powerful** Iceberg integration.\n", "\n", - "[Join the waiting list to learn more about dlt+ and Iceberg.](https://info.dlthub.com/waiting-list)" + "[Join the waiting list to learn more about dltHub and Iceberg.](https://info.dlthub.com/waiting-list)\n" ] }, { @@ -1017,9 +1013,12 @@ "source": [ "# **Spoiler: Custom Sources & Destinations**\n", "\n", - "`dlt` tried to simplify as much as possible both the process of creating sources ([RestAPI Client](https://dlthub.com/docs/general-usage/http/rest-client), [rest_api source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api)) and [custom destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/destination).\n", + "`dlt` aims to simplify the process of creating both custom sources \n", + "([REST API Client](https://dlthub.com/docs/general-usage/http/rest-client), \n", + "[`rest_api` source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api)) \n", + "and [custom destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/destination).\n", "\n", - "We will look at this topic in more detail in the next Advanced course." + "We will explore this topic in more detail in the next Advanced course.\n" ] }, { @@ -1028,17 +1027,8 @@ "id": "NYbccmLie1zm" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1Zf24gIVMNNj9j-gtXFl8p0orI9ttySDn#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)!" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrVnW2UdVjV4" - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.py b/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.py new file mode 100644 index 000000000..e901c9b94 --- /dev/null +++ b/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.py @@ -0,0 +1,826 @@ +# /// script +# dependencies = [ +# "dlt[duckdb]", +# "dlt[pyiceberg]", +# "numpy", +# "pandas", +# "pymysql", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Recap of [Lesson 3](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) 👩‍💻🚀** + + 1. Used pagination with REST APIs. + 2. Applied authentication for REST APIs. + 3. Tried the dlt `RESTClient`. + 4. Used environment variables to manage secrets and configuration. + 5. Learned how to add values to `secrets.toml` and `config.toml`. + 6. 
Used the special `secrets.toml` environment variable setup for Colab. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + # **`dlt`’s pre-built Sources and Destinations** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) + + + **Here, you will learn:** + - How to initialize verified sources. + - The built-in `rest_api` source. + - The built-in `sql_database` source. + - The built-in `filesystem` source. + - How to switch between destinations. + + --- + + Our verified sources are the simplest way to start building your stack. Choose from any of our fully customizable 30+ pre-built sources, such as SQL databases, Google Sheets, Salesforce, and more. + + With our numerous destinations, you can load data into a local database, data warehouse, or data lake. Choose from Snowflake, Databricks, and many others. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_4_Using_pre_build_sources_and_destinations_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_4_Using_pre_build_sources_and_destinations_img1.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Existing verified sources** + To use an [existing verified source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/), just run the `dlt init` command. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + There's a base project for each `dlt` verified source + destination combination, which you can adjust according to your needs. + + These base project can be initialized with a simple command: + + ``` + dlt init + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""List all verified sources:""") + return + + +@app.cell +def _(): + import subprocess + + subprocess.run(["dlt", "init", "--list-sources"], check=True) + return (subprocess,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + This command shows all available verified sources and their short descriptions. For each source, it checks if your local `dlt` version requires an update and prints the relevant warning. + + Consider an example pipeline for the GitHub API: + + ``` + Available dlt single file templates: + --- + arrow: The Arrow Pipeline Template will show how to load and transform arrow tables. + dataframe: The DataFrame Pipeline Template will show how to load and transform pandas dataframes. + debug: The Debug Pipeline Template will load a column with each datatype to your destination. 
+ default: The Intro Pipeline Template contains the example from the docs intro page + fruitshop: The Default Pipeline Template provides a simple starting point for your dlt pipeline + + ---> github_api: The Github API templates provides a starting + + point to read data from REST APIs with REST Client helper + requests: The Requests Pipeline Template provides a simple starting point for a dlt pipeline with the requests library + ``` + + ### Step 1. Initialize the source + + This command will initialize the pipeline example with the GitHub API as the source and DuckBD as the destination: + """) + return + + +@app.cell +def _(subprocess): + subprocess.run( + ["dlt", "--non-interactive", "init", "github_api", "duckdb"], check=True + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Now, check your files on the left side bar. It should contain all the necessary files to run your GitHub API -> DuckDB pipeline: + + - The `.dlt` folder containing `secrets.toml` and `config.toml` + - The pipeline script `github_api_pipeline.py` + - `requirements.txt` + - `.gitignore` + """) + return + + +@app.cell +def _(subprocess): + subprocess.run(["ls", "-a"], check=True) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + What you would normally do with the project: + - Add your credentials and define configurations + - Adjust the pipeline script as needed + - Run the pipeline script + + > If needed, you can adjust the verified source code. + """) + return + + +@app.cell +def _(subprocess): + subprocess.run(["cat", "github_api_pipeline.py"], check=True) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + From the code, we can see that this pipeline loads **only the `"issues"` endpoint**. + You can adjust this code as needed: add new endpoints, include additional logic, apply transformations, and more. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Step 2. Add credentials + + In Colab (or Molab), it is more convenient to use environment variables or `dlt.secrets`. + + In the pipeline above, the `access_token` parameter is set to `dlt.secrets.value`, which means you need to configure this variable: + + + ```python + @dlt.resource(write_disposition="replace") + def github_api_resource(access_token: Optional[str] = dlt.secrets.value): + ... + ``` + """) + return + + +@app.cell +def _(os): + import dlt + + dlt.secrets["SOURCES__ACCESS_TOKEN"] = os.getenv("SECRET_KEY") + return (dlt,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### Step 3. Run the pipeline""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Let's run the pipeline!""") + return + + +@app.cell +def _(subprocess): + subprocess.run(["python", "github_api_pipeline.py"], check=True) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + From the pipeline output, we can get information such as the pipeline name, dataset name, destination path, and more. + + > Pipeline **github_api_pipeline** load step completed in 1.23 seconds + > 1 load package was loaded to the DuckDB destination and into the dataset **github_api_data**. + > The DuckDB destination used `duckdb:////content/**github_api_pipeline.duckdb**` as the storage location. + > Load package `1733848559.8195539` is **LOADED** and contains no failed jobs. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 4: Explore your data + + Let's explore what tables were created in the destination. 
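An alternative to connecting to the DuckDB file directly (as the next cell does) is to attach to the pipeline and query it through dlt's own `sql_client`, which the exercises below also refer to. A sketch, assuming the template's default pipeline name (`github_api_pipeline`) and dataset name (`github_api_data`):

```python
import dlt

# Attach to the pipeline that github_api_pipeline.py created.
attached = dlt.attach(pipeline_name="github_api_pipeline")

# The sql_client issues queries against the pipeline's destination (DuckDB here).
with attached.sql_client() as sql_client:
    tables = sql_client.execute_sql(
        "SELECT table_name FROM information_schema.tables "
        "WHERE table_schema = 'github_api_data'"
    )
    print(tables)
```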
+ """) + return + + +@app.cell +def _(): + import duckdb + + conn = duckdb.connect("github_api_pipeline.duckdb") + conn.sql("SET search_path = 'github_api_data'") + conn.sql("DESCRIBE").df() + return (conn,) + + +@app.cell +def _(conn): + data_table = conn.sql("SELECT * FROM github_api_resource").df() + data_table + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""# **Built-in sources: RestAPI, SQL database & Filesystem**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **[RestAPI source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic)** + + `rest_api` is a generic source that lets you create a `dlt` source from any REST API using a declarative configuration. Since most REST APIs follow similar patterns, this source provides a convenient way to define your integration declaratively. + + Using a [declarative configuration](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api/basic#source-configuration), you can specify: + + - the API endpoints to pull data from, + - their relationships, + - how to handle pagination, + - authentication. + + `dlt` handles the rest for you: **unnesting the data, inferring the schema**, and **writing it to the destination**. + + In the previous lesson, you already used the REST API Client. `dlt`’s **[RESTClient](https://dlthub.com/docs/general-usage/http/rest-client)** is the **low-level abstraction** that powers the RestAPI source. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Initialize the `rest_api` template + + You can initialize the `rest_api` **template** using the `init` command: + """) + return + + +@app.cell +def _(subprocess): + subprocess.run( + ["dlt", "init", "rest_api", "duckdb"], input="y\n", text=True, check=True + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + In the `rest_api_pipeline.py` script, you will find sources for both the GitHub API and the PokeAPI, defined using the `rest_api` source and `RESTAPIConfig`. + + Since the `rest_api` source is a **built-in source**, you don't need to initialize it. You can simply **import** it from `dlt.sources` and start using it. 
+ + ### Example + + Here is a simplified example of how to configure the REST API source to load `issues` and issue `comments` from the GitHub API: + """) + return + + +@app.cell +def _(dlt): + from dlt.sources.rest_api import RESTAPIConfig, rest_api_source + from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator + + config: RESTAPIConfig = { + "client": { + "base_url": "https://api.github.com", + "auth": {"token": dlt.secrets["sources.access_token"]}, + "paginator": "header_link", + }, + "resources": [ + { + "name": "issues", + "endpoint": { + "path": "repos/dlt-hub/dlt/issues", + "params": {"state": "open"}, + }, + }, + { + "name": "issue_comments", + "endpoint": { + "path": "repos/dlt-hub/dlt/issues/{issue_number}/comments", + "params": { + "issue_number": { + "type": "resolve", + "resource": "issues", + "field": "number", + } + }, + }, + }, + ], + } + github_source = rest_api_source(config) + rest_api_pipeline = dlt.pipeline( + pipeline_name="rest_api_github", + destination="duckdb", + dataset_name="rest_api_data", + dev_mode=True, + ) + _load_info = rest_api_pipeline.run(github_source) + print(_load_info) + return (rest_api_pipeline,) + + +@app.cell +def _(rest_api_pipeline): + rest_api_pipeline.dataset().issues.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Exercise 1: Run `rest_api` source** + + Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`. + + #### **Question** + How many columns does the `issues` table have? + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Exercise 2: Create a dlt source with `rest_api`** + + Add the `contributors` endpoint for the `dlt` repository to the `rest_api` configuration: + + - Resource name: **"contributors"** + - Endpoint path: **"repos/dlt-hub/dlt/contributors"** + - No parameters + + #### **Question** + How many columns does the `contributors` table have? + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **[SQL Databases source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/sql_database/)** + + SQL databases are management systems (DBMS) that store data in a structured format, commonly used for efficient and reliable data retrieval. + + The `sql_database` verified source loads data to your specified destination using one of the following backends: + * SQLAlchemy, + * PyArrow, + * pandas, + * ConnectorX. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Initialize the `sql_database` template + + Initialize the `dlt` template for `sql_database` using the `init` command: + """) + return + + +@app.cell +def _(subprocess): + subprocess.run( + ["dlt", "init", "sql_database", "duckdb"], input="y\n", text=True, check=True + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""The `sql_database` source is also a **built-in source**, you don't have to initialize it, just **import** it from `dlt.sources`.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Example + + The example below shows how you can use dlt to load data from a SQL database (PostgreSQL, MySQL, SQLite, Oracle, IBM DB2, etc.) into a destination. + + To make it easy to reproduce, we will load data from the [public MySQL Rfam database](https://docs.rfam.org/en/latest/database.html) into a local DuckDB instance. 
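One knob worth knowing before the example: the extraction backend listed earlier (SQLAlchemy, PyArrow, pandas, ConnectorX) can be chosen explicitly. A sketch, assuming the `backend` argument documented for `sql_database` (PyArrow must be installed for this variant):

```python
from dlt.sources.sql_database import sql_database

# Same Rfam database as in the example below, but extracted as Arrow tables,
# which is usually faster and more type-faithful for larger tables.
arrow_source = sql_database(
    "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",
    table_names=["family"],
    backend="pyarrow",
)
```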
+ """) + return + + +@app.cell +def _(dlt): + from dlt.sources.sql_database import sql_database + + sql_source = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", + table_names=["family"], + ) + sql_db_pipeline = dlt.pipeline( + pipeline_name="sql_database_example", + destination="duckdb", + dataset_name="sql_data", + dev_mode=True, + ) + _load_info = sql_db_pipeline.run(sql_source) + print(_load_info) + return (sql_database,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Exercise 3: Run `sql_database` source** + + Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`. + + #### **Question** + How many columns does the `family` table have? + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **[Filesystem source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/)** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The filesystem source allows seamless loading of files from the following locations: + + * AWS S3 + * Google Cloud Storage + * Google Drive + * Azure Blob Storage + * remote filesystem (via SFTP) + * local filesystem + + The filesystem source natively supports CSV, Parquet, and JSONL files and allows customization for loading any type of structured file. + + + **How filesystem source works** + + The Filesystem source doesn't just give you an easy way to load data from both remote and local files — it also comes with a powerful set of tools that let you customize the loading process to fit your specific needs. + + Filesystem source loads data in two steps: + + 1. It accesses the files in your remote or local file storage **without** actually **reading** the content yet. At this point, you can filter files by metadata or name. You can also set up incremental loading to load only new files. + 2. The **transformer** **reads** the files' content and yields the records. At this step, you can filter out the actual data, enrich records with metadata from files, or perform incremental loading based on the file content. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Initialize the `filesystem` template + + Initialize the dlt template for `filesystem` using the `init` command: + """) + return + + +@app.cell +def _(subprocess): + subprocess.run( + ["dlt", "init", "filesystem", "duckdb"], input="y\n", text=True, check=True + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""The `filesystem` source is also a **built-in source**, you don't have to initialize it, just **import** it from `dlt.sources`.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Example + + To illustrate how this **built-in source** works, we first download some file to the local (Colab) filesystem. 
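Before the download, a short aside on the two-step behaviour described above: because step 1 only lists files, you can narrow the listing before any content is read. A minimal sketch, assuming the resource's generic `add_filter` helper and the `file_name` field of the yielded file items:

```python
from dlt.sources.filesystem import filesystem, read_parquet

# Step 1: list matching files only; nothing is downloaded or parsed yet.
files = filesystem(bucket_url="local_data", file_glob="**/*.parquet")
files.add_filter(lambda item: item["file_name"].startswith("userdata"))

# Step 2: the transformer reads just the files that survived the filter.
filtered_pipe = files | read_parquet()
```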
+ """) + return + + +@app.cell +def _(): + import os + import requests + + folder_name = "local_data" + os.makedirs(folder_name, exist_ok=True) + full_path = os.path.abspath(folder_name) + + url = "https://www.timestored.com/data/sample/userdata.parquet" + resp = requests.get(url) + resp.raise_for_status() + + with open(f"{full_path}/userdata.parquet", "wb") as f: + f.write(resp.content) + return full_path, os + + +@app.cell +def _(dlt, full_path): + from dlt.sources.filesystem import filesystem, read_parquet + + filesystem_resource = filesystem(bucket_url=full_path, file_glob="**/*.parquet") + filesystem_pipe = filesystem_resource | read_parquet() + fs_pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb") + _load_info = fs_pipeline.run(filesystem_pipe.with_name("userdata")) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Exercise 4: Run `filesystem` source** + + Explore the cells above and answer the question below using `sql_client` or `pipeline.dataset()`. + + #### **Question** + How many columns does the `userdata` table have? + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You can read how to configure **Cloud Storage** in the official + [dlt documentation](https://dlthub.com/docs/dlt-ecosystem/verified-sources/filesystem/basic#configuration). + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""# [**Built-in Destinations**](https://dlthub.com/docs/dlt-ecosystem/destinations/)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_4_Using_pre_build_sources_and_destinations_img2](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_4_Using_pre_build_sources_and_destinations_img2.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Exploring `dlt` destinations** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + To be honest, this is simply a matter of going through the + [documentation](https://dlthub.com/docs/dlt-ecosystem/destinations/) 👀, but to sum it up: + + - Most likely, the destination where you want to load data is already a `dlt` integration that undergoes several hundred automated tests every day. + - If not, you can define a custom destination and still benefit from most `dlt`-specific features. + *FYI: custom destinations will be covered in the next Advanced course — so we expect you to come back for part two…* + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Choosing a destination** + + Switching between destinations in `dlt` is incredibly straightforward. Simply modify the `destination` parameter in your pipeline configuration. 
For example: + """) + return + + +@app.cell +def _(dlt): + data_pipeline = dlt.pipeline( + pipeline_name="data_pipeline", + destination="duckdb", + dataset_name="data", + ) + print(data_pipeline.destination.destination_type) + + data_pipeline = dlt.pipeline( + pipeline_name="data_pipeline", + destination="bigquery", + dataset_name="data", + ) + print(data_pipeline.destination.destination_type) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""This flexibility allows you to easily transition from local development to production-grade environments.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Filesystem destination** + + The `filesystem` destination enables you to load data into **files stored locally** or in **cloud storage** solutions, making it an excellent choice for lightweight testing, prototyping, or file-based workflows. + + Below is an **example** demonstrating how to use the `filesystem` destination to load data in **Parquet** format: + + * Step 1: Set up a local bucket or cloud directory for storing files + """) + return + + +@app.cell +def _(os): + os.environ["BUCKET_URL"] = "./content" + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""* Step 2: Define the data source""") + return + + +@app.cell +def _(dlt, sql_database): + source = sql_database( + "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", + table_names=["family"], + ) + pipeline = dlt.pipeline( + pipeline_name="fs_pipeline", destination="filesystem", dataset_name="fs_data" + ) + _load_info = pipeline.run(source, loader_file_format="parquet") + print(_load_info) + return pipeline, source + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Look at the files:""") + return + + +@app.cell +def _(subprocess): + subprocess.run(["ls", "./content/fs_data/family"], check=True) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Look at the loaded data:""") + return + + +@app.cell +def _(pipeline): + # explore loaded data + pipeline.dataset().family.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Table formats: [Delta tables & Iceberg](https://dlthub.com/docs/dlt-ecosystem/destinations/delta-iceberg)** + + dlt supports writing **Delta** and **Iceberg** tables when using the `filesystem` destination. + + **How it works:** + + dlt uses the `deltalake` and `pyiceberg` libraries to write Delta and Iceberg tables, respectively. One or multiple Parquet files are prepared during the extract and normalize steps. In the load step, these Parquet files are exposed as an Arrow data structure and fed into `deltalake` or `pyiceberg`. + """) + return + + +@app.cell +def _(pipeline, source): + _load_info = pipeline.run( + source, loader_file_format="parquet", table_format="iceberg" + ) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Note:** + + The open-source version of dlt supports basic functionality for **Iceberg**, but the dltHub team is currently working on an **extended** and **more powerful** Iceberg integration. 
+ + [Join the waiting list to learn more about dltHub and Iceberg.](https://info.dlthub.com/waiting-list) + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Spoiler: Custom Sources & Destinations** + + `dlt` aims to simplify the process of creating both custom sources + ([REST API Client](https://dlthub.com/docs/general-usage/http/rest-client), + [`rest_api` source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/rest_api)) + and [custom destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/destination). + + We will explore this topic in more detail in the next Advanced course. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb b/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb index 73e9ece91..ebd2130ac 100644 --- a/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb +++ b/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb @@ -6,14 +6,14 @@ "id": "h93BcC8SX2fj" }, "source": [ - "# **Recap of [Lesson 4](https://colab.research.google.com/drive/1mfqZulsuFDc7h27d6joe2_Dduvl1uM-2#forceEdit=true&sandboxMode=true) 👩‍💻🚀**\n", + "# **Recap of [Lesson 4](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) 👩‍💻🚀**\n", "\n", - "1. Listed all available verified sources;\n", - "2. Initialized `github_api` verified source;\n", - "3. Explored built-in `rest_api` source.\n", - "4. Explored built-in `sql_database` source.\n", - "5. Explored built-in `filesystem` source.\n", - "6. Learned how to switch between destinations." + "1. Listed all available verified sources.\n", + "2. Initialized the `github_api` verified source.\n", + "3. Explored the built-in `rest_api` source.\n", + "4. Explored the built-in `sql_database` source.\n", + "5. Explored the built-in `filesystem` source.\n", + "6. 
Learned how to switch between destinations.\n" ] }, { @@ -24,7 +24,7 @@ "source": [ "---\n", "\n", - "# **Write Disposition and Incremental Loading** ⚙️🧠 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)\n", + "# **Write Disposition and Incremental Loading** ⚙️🧠 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb)\n", "\n", "\n", "**Here, you will learn:**\n", @@ -52,19 +52,17 @@ "id": "5ThZzzAwqLnn" }, "source": [ - "Write disposition in the context of the dlt library defines how the data should be written to the destination. There are three types of write dispositions:\n", + "A **write disposition** in the context of the `dlt` library defines how data should be written to the destination. There are three types:\n", "\n", - "* **Append**: This is the **default** disposition. It will append the data to the existing data in the destination.\n", + "- **Append**: The **default** disposition. It appends new data to the existing data in the destination.\n", "\n", - "* **Replace**: This disposition replaces the data in the destination with the data from the resource. It **deletes** all the data and **recreates** the schema before loading the data.\n", + "- **Replace**: This disposition replaces all existing data at the destination with the new data from the resource. It **deletes** all previous data and **recreates** the schema before loading.\n", "\n", - "* **Merge**: This write disposition merges the data from the resource with the data at the destination. For the merge disposition, you need to specify a `primary_key` for the resource.\n", + "- **Merge**: This disposition merges incoming data with existing data at the destination. For `merge`, you must specify a `primary_key` for the resource.\n", "\n", - "The write disposition you choose depends on the dataset and how you can extract it. For more details, you can refer to the [Incremental loading page](https://dlthub.com/docs/general-usage/incremental-loading).\n", + "The choice of write disposition depends on your dataset and how you extract it. 
For more details, refer to the [Incremental loading page](https://dlthub.com/docs/general-usage/incremental-loading).\n", "\n", - "\n", - "\n", - "A `write_disposition` in `dlt` can specified in the resource decorator:\n", + "You can specify a `write_disposition` in the resource decorator:\n", "\n", "```python\n", "@dlt.resource(write_disposition=\"append\")\n", @@ -79,25 +77,7 @@ "load_info = pipeline.run(my_resource, write_disposition=\"replace\")\n", "```\n", "\n", - "> In case you specify both, the write disposition specified at the pipeline run level will override the write disposition specified at the resource level." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SpEU7xzw9lZL" - }, - "source": [ - "### **0. Install dlt**" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Su4oUJelKaZY" - }, - "source": [ - "Install `dlt` with DuckDB as a destination as per usual:" + "> If both are specified, the write disposition at the pipeline run level overrides the one set at the resource level." ] }, { @@ -128,7 +108,7 @@ "id": "5IpPPDpVrU75" }, "source": [ - "As we already have said `append` is a default loading behavior. Now we will explore how this write disposition works." + "As we have already said, `append` is the default loading behavior. Now we will explore how this write disposition works." ] }, { @@ -162,7 +142,7 @@ "id": "CltUh8t6rGUP" }, "source": [ - "We create dlt pipeline as usual and load this data into DuckDB." + "We create a `dlt` pipeline as usual and load this data into DuckDB." ] }, { @@ -179,23 +159,23 @@ "\n", "@dlt.resource(\n", " name=\"pokemon\",\n", - " write_disposition=\"append\", # <--- add new argument into decorator\n", + " write_disposition=\"append\",\n", ")\n", - "def pokemon() -> TDataItems:\n", + "def append_pokemon() -> TDataItems:\n", " yield data\n", "\n", "\n", - "pipeline = dlt.pipeline(\n", - " pipeline_name=\"poke_pipeline\",\n", + "append_pipeline = dlt.pipeline(\n", + " pipeline_name=\"append_poke_pipeline\",\n", " destination=\"duckdb\",\n", " dataset_name=\"pokemon_data\",\n", ")\n", "\n", - "load_info = pipeline.run(pokemon)\n", + "load_info = append_pipeline.run(append_pokemon)\n", "print(load_info)\n", "\n", "# explore loaded data\n", - "pipeline.dataset().pokemon.df()" + "append_pipeline.dataset().pokemon.df()" ] }, { @@ -204,9 +184,9 @@ "id": "Wtz2oUpCs7Ay" }, "source": [ - "Run this example **twice**, and you'll notice that each time a copy of the data is added to your tables. We call this load mode **append**. It is very useful.\n", + "Run this example **twice**, and you'll notice that each time a copy of the data is added to your tables. We call this load mode **append**, and it is very useful.\n", "\n", - "Example use case: when you have a new folder created daily with json file logs, and you want to ingest them incrementally." + "Example use case: when you have a new folder created daily with JSON log files, and you want to ingest them incrementally.\n" ] }, { @@ -217,11 +197,11 @@ }, "outputs": [], "source": [ - "load_info = pipeline.run(pokemon)\n", + "load_info = append_pipeline.run(append_pokemon)\n", "print(load_info)\n", "\n", "# explore loaded data\n", - "pipeline.dataset().pokemon.df()" + "append_pipeline.dataset().pokemon.df()" ] }, { @@ -240,7 +220,7 @@ "id": "Njz_qUcpDtTW" }, "source": [ - "Perhaps this duplicated data is not what you want to get in your work projects. For example, if your data was updated, how we can refresh it in the database? 
One method is to tell dlt to **replace** the data in existing tables by using **write_disposition**." + "Perhaps this duplicated data is not what you want in your work projects. For example, if your data was updated, how can we refresh it in the database? One way is to tell `dlt` to **replace** the data in the existing tables by using a **write_disposition**.\n" ] }, { @@ -256,23 +236,23 @@ "\n", "@dlt.resource(\n", " name=\"pokemon\",\n", - " write_disposition=\"replace\", # <--- change 'append' to 'replace'\n", + " write_disposition=\"replace\",\n", ")\n", - "def pokemon() -> TDataItems:\n", + "def replace_pokemon() -> TDataItems:\n", " yield data\n", "\n", "\n", - "pipeline = dlt.pipeline(\n", - " pipeline_name=\"poke_pipeline\",\n", + "replace_pipeline = dlt.pipeline(\n", + " pipeline_name=\"replace_poke_pipeline\",\n", " destination=\"duckdb\",\n", " dataset_name=\"pokemon_data\",\n", ")\n", "\n", - "load_info = pipeline.run(pokemon)\n", + "load_info = replace_pipeline.run(replace_pokemon)\n", "print(load_info)\n", "\n", "# explore loaded data\n", - "pipeline.dataset().pokemon.df()" + "replace_pipeline.dataset().pokemon.df()" ] }, { @@ -292,11 +272,11 @@ }, "outputs": [], "source": [ - "load_info = pipeline.run(pokemon)\n", + "load_info = replace_pipeline.run(replace_pokemon)\n", "print(load_info)\n", "\n", "# explore loaded data\n", - "pipeline.dataset().pokemon.df()" + "replace_pipeline.dataset().pokemon.df()" ] }, { @@ -305,7 +285,7 @@ "id": "aPjezxijt_mz" }, "source": [ - "TAADA! No duplicates, your data was [fully refreshed](https://dlthub.com/docs/general-usage/full-loading)." + "TADA! No duplicates, your data was [fully refreshed](https://dlthub.com/docs/general-usage/full-loading)." ] }, { @@ -364,24 +344,24 @@ "\n", "@dlt.resource(\n", " name=\"pokemon\",\n", - " write_disposition=\"merge\", # <--- change 'replace' to 'merge'\n", - " primary_key=\"id\", # <--- add primary_key\n", + " write_disposition=\"merge\",\n", + " primary_key=\"id\",\n", ")\n", - "def pokemon() -> TDataItems:\n", + "def merge_pokemon() -> TDataItems:\n", " yield data\n", "\n", "\n", - "pipeline = dlt.pipeline(\n", + "merge_pipeline = dlt.pipeline(\n", " pipeline_name=\"poke_pipeline_merge\",\n", " destination=\"duckdb\",\n", " dataset_name=\"pokemon_data\",\n", ")\n", "\n", - "load_info = pipeline.run(pokemon)\n", + "load_info = merge_pipeline.run(merge_pokemon)\n", "print(load_info)\n", "\n", "# explore loaded data\n", - "pipeline.dataset().pokemon.df()" + "merge_pipeline.dataset().pokemon.df()" ] }, { @@ -431,24 +411,24 @@ "outputs": [], "source": [ "# We added `created_at` field to the data\n", - "data = [\n", + "created_data = [\n", " {\n", " \"id\": \"1\",\n", " \"name\": \"bulbasaur\",\n", " \"size\": {\"weight\": 6.9, \"height\": 0.7},\n", - " \"created_at\": \"2024-12-01\", # <------- new field\n", + " \"created_at\": \"2024-12-01\",\n", " },\n", " {\n", " \"id\": \"4\",\n", " \"name\": \"charmander\",\n", " \"size\": {\"weight\": 8.5, \"height\": 0.6},\n", - " \"created_at\": \"2024-09-01\", # <------- new field\n", + " \"created_at\": \"2024-09-01\",\n", " },\n", " {\n", " \"id\": \"25\",\n", " \"name\": \"pikachu\",\n", " \"size\": {\"weight\": 6, \"height\": 0.4},\n", - " \"created_at\": \"2023-06-01\", # <------- new field\n", + " \"created_at\": \"2023-06-01\",\n", " },\n", "]" ] @@ -459,11 +439,11 @@ "id": "EO63mHgE_Oya" }, "source": [ - "**The goal**: Load only Pokémon caught after January 1, 2024, skipping the ones you already have.\n", + "**The goal**: Load only Pokémons caught after 
January 1, 2024, skipping the ones you already have.\n", "\n", "### **Step 2: Defining the incremental logic**\n", "\n", - "Using `dlt`, we set up an [incremental filter](https://www.google.com/url?q=https://dlthub.com/docs/general-usage/incremental-loading%23incremental-loading-with-a-cursor-field&sa=D&source=editors&ust=1734717286675253&usg=AOvVaw3rAF3y3p86sGt49ImCTgon) to only fetch Pokémon caught after a certain date:\n", + "Using `dlt`, we set up an [incremental filter](https://www.google.com/url?q=https://dlthub.com/docs/general-usage/incremental-loading%23incremental-loading-with-a-cursor-field&sa=D&source=editors&ust=1734717286675253&usg=AOvVaw3rAF3y3p86sGt49ImCTgon) to only fetch Pokémons caught after a certain date:\n", "```python\n", "cursor_date = dlt.sources.incremental(\"created_at\", initial_value=\"2024-01-01\")\n", "```\n", @@ -489,12 +469,12 @@ " name=\"pokemon\",\n", " write_disposition=\"append\",\n", ")\n", - "def pokemon(\n", + "def incremental_pokemon(\n", " cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(\n", " \"created_at\", initial_value=\"2024-01-01\"\n", " )\n", ") -> TDataItems:\n", - " yield data" + " yield created_data" ] }, { @@ -524,17 +504,17 @@ }, "outputs": [], "source": [ - "pipeline = dlt.pipeline(\n", + "incremental_pipeline = dlt.pipeline(\n", " pipeline_name=\"poke_pipeline_incremental\",\n", " destination=\"duckdb\",\n", " dataset_name=\"pokemon_data\",\n", ")\n", "\n", - "load_info = pipeline.run(pokemon)\n", + "load_info = incremental_pipeline.run(incremental_pokemon)\n", "print(load_info)\n", "\n", "# explore loaded data\n", - "pipeline.dataset().pokemon.df()" + "incremental_pipeline.dataset().pokemon.df()" ] }, { @@ -584,7 +564,7 @@ }, "outputs": [], "source": [ - "load_info = pipeline.run(pokemon)\n", + "load_info = incremental_pipeline.run(incremental_pokemon)\n", "print(load_info)" ] }, @@ -619,21 +599,21 @@ }, "outputs": [], "source": [ - "# We added `created_at` field to the data\n", - "data = [\n", + "# We added `updated_at` field to the data\n", + "updated_data = [\n", " {\n", " \"id\": \"1\",\n", " \"name\": \"bulbasaur\",\n", " \"size\": {\"weight\": 6.9, \"height\": 0.7},\n", " \"created_at\": \"2024-12-01\",\n", - " \"updated_at\": \"2024-12-01\", # <------- new field\n", + " \"updated_at\": \"2024-12-01\",\n", " },\n", " {\n", " \"id\": \"4\",\n", " \"name\": \"charmander\",\n", " \"size\": {\"weight\": 8.5, \"height\": 0.6},\n", " \"created_at\": \"2024-09-01\",\n", - " \"updated_at\": \"2024-09-01\", # <------- new field\n", + " \"updated_at\": \"2024-09-01\",\n", " },\n", " {\n", " \"id\": \"25\",\n", @@ -641,9 +621,9 @@ " \"size\": {\n", " \"weight\": 9,\n", " \"height\": 0.4,\n", - " }, # <----- pikachu gained weight from 6 to 9\n", + " },\n", " \"created_at\": \"2023-06-01\",\n", - " \"updated_at\": \"2024-12-16\", # <------- new field, information about pikachu has updated\n", + " \"updated_at\": \"2024-12-16\",\n", " },\n", "]" ] @@ -670,14 +650,15 @@ "\n", "@dlt.resource(\n", " name=\"pokemon\",\n", - " write_disposition=\"merge\", # <--- change write disposition from 'append' to 'merge'\n", - " primary_key=\"id\", # <--- set a primary key\n", + " write_disposition=\"merge\",\n", + " primary_key=\"id\",\n", ")\n", - "def pokemon(\n", + "def dedup_pokemon(\n", + " data: TDataItems,\n", " cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental(\n", " \"updated_at\", initial_value=\"2024-01-01\"\n", - " )\n", - ") -> TDataItems: # <--- change the cursor name from 'created_at' to 
'updated_at'\n", + " ),\n", + ") -> TDataItems:\n", " yield data" ] }, @@ -698,17 +679,17 @@ }, "outputs": [], "source": [ - "pipeline = dlt.pipeline(\n", + "dedup_pipeline = dlt.pipeline(\n", " pipeline_name=\"poke_pipeline_dedup\",\n", " destination=\"duckdb\",\n", " dataset_name=\"pokemon_data\",\n", ")\n", "\n", - "load_info = pipeline.run(pokemon)\n", + "load_info = dedup_pipeline.run(dedup_pokemon(updated_data))\n", "print(load_info)\n", "\n", "# explore loaded data\n", - "pipeline.dataset().pokemon.df()" + "dedup_pipeline.dataset().pokemon.df()" ] }, { @@ -717,7 +698,7 @@ "id": "omG1cgzcrqOs" }, "source": [ - "All Pokémon are processed because this is the pipeline’s first run.\n", + "All Pokémons are processed because this is the pipeline’s first run.\n", "\n", "Now, let’s say Pikachu goes to gym and sheds some weight (down to 7.5), and the `updated_at` field is set to `2024-12-23`." ] @@ -730,8 +711,7 @@ }, "outputs": [], "source": [ - "# We added `created_at` field to the data\n", - "data = [\n", + "reupdated_data = [\n", " {\n", " \"id\": \"1\",\n", " \"name\": \"bulbasaur\",\n", @@ -749,9 +729,9 @@ " {\n", " \"id\": \"25\",\n", " \"name\": \"pikachu\",\n", - " \"size\": {\"weight\": 7.5, \"height\": 0.4}, # <--- pikachu lost weight\n", + " \"size\": {\"weight\": 7.5, \"height\": 0.4},\n", " \"created_at\": \"2023-06-01\",\n", - " \"updated_at\": \"2024-12-23\", # <--- data about his weight was updated a week later\n", + " \"updated_at\": \"2024-12-23\",\n", " },\n", "]" ] @@ -773,11 +753,11 @@ }, "outputs": [], "source": [ - "load_info = pipeline.run(pokemon)\n", + "load_info = dedup_pipeline.run(dedup_pokemon(reupdated_data))\n", "print(load_info)\n", "\n", "# explore loaded data\n", - "pipeline.dataset().pokemon.df()" + "dedup_pipeline.dataset().pokemon.df()" ] }, { @@ -786,10 +766,10 @@ "id": "u2hZHn_EowBd" }, "source": [ - "**What happens?**\n", + "**What happened?**\n", "\n", - "* The pipeline detects that `updated_at` for Bulbasaur and Charmander hasn’t changed—they’re skipped.\n", - "* Pikachu’s record is updated to reflect the latest weight.\n", + "* The pipeline detected that `updated_at` for Bulbasaur and Charmander hasn’t changed—they’re skipped.\n", + "* Pikachu’s record was updated to reflect the latest weight.\n", "\n", "You can see that the **`_dlt_load_id`** for Bulbasaur and Charmander remained the same, but for Pikachu it was changed since only the updated Pikachu data was loaded into the destination." ] @@ -800,28 +780,17 @@ "id": "pufZ_GWPxqEQ" }, "source": [ - "The **`dlt.sources.incremental`** instance above has the next attributes:\n", + "The **`dlt.sources.incremental`** instance above has the following attributes:\n", "\n", "* **`cursor_date.initial_value`** which is always equal to \"2024-01-01\" passed in the constructor;\n", "* **`cursor_date.start_value`** a maximum `updated_at` value from the previous run or the `initial_value` on the first run;\n", "* **`cursor_date.last_value`** a \"real-time\" `updated_at` value updated with each yielded item or page. Before the first yield, it equals `start_value`;\n", - "* **`cursor_date.end_value`** (here not used) marking the end of the backfill range.\n", + "* **`cursor_date.end_value`** (not used here) marking the end of the backfill range.\n", "\n", "## **Example**\n", "You can use them in the resource code to make **more efficient requests**. 
Take look at the GitHub API example:" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "l4C_IFK7G4m9" - }, - "outputs": [], - "source": [ - "exit() # we use exit() to reset all ENVs we set" - ] - }, { "cell_type": "code", "execution_count": null, @@ -838,10 +807,9 @@ "from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n", "from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n", "\n", - "import os\n", "from google.colab import userdata\n", "\n", - "os.environ[\"SOURCES__ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")\n", + "dlt.secrets[\"SOURCES__ACCESS_TOKEN\"] = userdata.get(\"SECRET_KEY\")\n", "\n", "\n", "@dlt.source\n", @@ -859,9 +827,7 @@ " )\n", " ) -> TDataItems:\n", " params = {\n", - " \"since\": (\n", - " cursor_date.last_value\n", - " ), # <--- use last_value to request only new data from API\n", + " \"since\": (cursor_date.last_value),\n", " \"status\": \"open\",\n", " }\n", " for page in client.paginate(\"repos/dlt-hub/dlt/issues\", params=params):\n", @@ -885,9 +851,9 @@ "id": "5d1J5DPX3Dn3" }, "source": [ - "Pay attention how we use **since** GitHub API parameter and `cursor_date.last_value` to tell GitHub which issues we are interested in. `cursor_date.last_value` holds the last `cursor_date` value from the previous run.\n", + "Pay attention to how we use the **since** GitHub API parameter and `cursor_date.last_value` to tell GitHub which issues we are interested in. `cursor_date.last_value` holds the last `cursor_date` value from the previous run.\n", "\n", - "Run the pipeline again and make sure that **no data was loaded**." + "Run the pipeline again and make sure that **no data is loaded**." ] }, { @@ -934,12 +900,12 @@ "\n", "Transform your GitHub API pipeline to use incremental loading. This means:\n", "\n", - "* Implement new `dlt.resource` for `pulls/comments` (List comments for Pull Requests) endpoint.\n", + "* Implement a new `dlt.resource` for `pulls/comments` (List comments for Pull Requests) endpoint.\n", "* Fetch only pulls comments updated after the last pipeline run.\n", "* Use the `updated_at` field from the GitHub API as the incremental cursor.\n", "* [Endpoint documentation](https://docs.github.com/en/rest/pulls/comments?apiVersion=2022-11-28#list-review-comments-in-a-repository)\n", "* Endpoint URL: `https://api.github.com/repos/OWNER/REPO/pulls/comments`\n", - "* Use `since` parameter - only show results that were last updated after the given time - and `last_value`.\n", + "* Use the `since` parameter - only show results that were last updated after the given time - and `last_value`.\n", "* `initial_value` is `2024-12-01`.\n", "\n", "\n", @@ -954,17 +920,8 @@ "id": "NYbccmLie1zm" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1geSMNRkSwAelQJKd3e8vdoHCKiHMdmIo#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)!" 
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SVyiG5wRVo1B" - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.py b/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.py new file mode 100644 index 000000000..902bb7fc5 --- /dev/null +++ b/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.py @@ -0,0 +1,743 @@ +# /// script +# dependencies = [ +# "dlt[duckdb]", +# "numpy", +# "pandas", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Recap of [Lesson 4](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) 👩‍💻🚀** + + 1. Listed all available verified sources. + 2. Initialized the `github_api` verified source. + 3. Explored the built-in `rest_api` source. + 4. Explored the built-in `sql_database` source. + 5. Explored the built-in `filesystem` source. + 6. Learned how to switch between destinations. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + # **Write Disposition and Incremental Loading** ⚙️🧠 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) + + + **Here, you will learn:** + - `dlt` write dispositions: + - Append + - Replace + - Merge + - What incremental loading is + - How to update and deduplicate your data + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **`dlt` write dispositions** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + A **write disposition** in the context of the `dlt` library defines how data should be written to the destination. There are three types: + + - **Append**: The **default** disposition. It appends new data to the existing data in the destination. + + - **Replace**: This disposition replaces all existing data at the destination with the new data from the resource. It **deletes** all previous data and **recreates** the schema before loading. + + - **Merge**: This disposition merges incoming data with existing data at the destination. For `merge`, you must specify a `primary_key` for the resource. + + The choice of write disposition depends on your dataset and how you extract it. For more details, refer to the [Incremental loading page](https://dlthub.com/docs/general-usage/incremental-loading). + + You can specify a `write_disposition` in the resource decorator: + + ```python + @dlt.resource(write_disposition="append") + def my_resource(): + ... 
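        # write_disposition controls what happens to rows already in the destination:
        # "append" adds the yielded rows, "replace" rebuilds the table, "merge" deduplicates by primary_key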
+ yield data + ``` + + Or directly in the pipeline run: + + ```python + load_info = pipeline.run(my_resource, write_disposition="replace") + ``` + + > If both are specified, the write disposition at the pipeline run level overrides the one set at the resource level. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **1. Append** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""As we have already said, `append` is the default loading behavior. Now we will explore how this write disposition works.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Let's remember our Quick Start data sample with pokemons:""") + return + + +@app.cell +def _(): + # Sample data containing pokemon details + data = [ + {"id": "1", "name": "bulbasaur", "size": {"weight": 6.9, "height": 0.7}}, + {"id": "4", "name": "charmander", "size": {"weight": 8.5, "height": 0.6}}, + {"id": "25", "name": "pikachu", "size": {"weight": 6, "height": 0.4}}, + ] + return (data,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""We create a `dlt` pipeline as usual and load this data into DuckDB.""" + ) + return + + +@app.cell +def _(data): + import dlt + from dlt.common.typing import TDataItems + + @dlt.resource(name="pokemon", write_disposition="append") + def append_pokemon() -> TDataItems: + yield data + + append_pipeline = dlt.pipeline( + pipeline_name="append_poke_pipeline", + destination="duckdb", + dataset_name="pokemon_data", + ) + _load_info = append_pipeline.run(append_pokemon) + print(_load_info) + # explore loaded data + append_pipeline.dataset().pokemon.df() + return TDataItems, append_pipeline, append_pokemon, dlt + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Run this example **twice**, and you'll notice that each time a copy of the data is added to your tables. We call this load mode **append**, and it is very useful. + + Example use case: when you have a new folder created daily with JSON log files, and you want to ingest them incrementally. + """) + return + + +@app.cell +def _(append_pipeline, append_pokemon): + _load_info = append_pipeline.run(append_pokemon) + print(_load_info) + # explore loaded data + append_pipeline.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **2. Replace** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Perhaps this duplicated data is not what you want in your work projects. For example, if your data was updated, how can we refresh it in the database? One way is to tell `dlt` to **replace** the data in the existing tables by using a **write_disposition**.""" + ) + return + + +@app.cell +def _(TDataItems, data, dlt): + @dlt.resource(name="pokemon", write_disposition="replace") + def replace_pokemon() -> TDataItems: + yield data + + replace_pipeline = dlt.pipeline( + pipeline_name="replace_poke_pipeline", + destination="duckdb", + dataset_name="pokemon_data", + ) + _load_info = replace_pipeline.run(replace_pokemon) + print(_load_info) + replace_pipeline.dataset().pokemon.df() + return replace_pipeline, replace_pokemon + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Run it again:""") + return + + +@app.cell +def _(replace_pipeline, replace_pokemon): + _load_info = replace_pipeline.run(replace_pokemon) + print(_load_info) + # explore loaded data + replace_pipeline.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""TADA! 
No duplicates, your data was [fully refreshed](https://dlthub.com/docs/general-usage/full-loading).""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **3. [Merge](https://dlthub.com/docs/general-usage/incremental-loading#merge-incremental-loading)** + + Consider a scenario where the data in the source has been updated, but you want to avoid reloading the entire dataset. + + + + Merge write disposition is used to merge new data into the destination, using a `merge_key` and/or **deduplicating**/**upserting** new data using a `primary_key`. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_5_Write_disposition_and_incremental_loading_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_5_Write_disposition_and_incremental_loading_img1.jpeg)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The **merge** write disposition can be useful in several situations: + + 1. If you have a dataset where records are frequently updated and you want to reflect these changes in your database, the `merge` write disposition can be used. It will **update the existing records** with the new data instead of creating duplicate entries. + + 2. If your data source occasionally sends **duplicate records**, the merge write disposition can help handle this. It uses a `primary_key` to identify unique records, so if a duplicate record (with the same `primary_key`) is encountered, it will be merged with the existing record instead of creating a new one. + + 3. If you are dealing with **Slowly Changing Dimensions** (SCD) where the attribute of a record changes over time and you want to maintain a history of these changes, you can use the `merge` write disposition with the scd2 strategy. + + + When using the merge disposition, you need to specify a `primary_key` or `merge_key` for the resource. + """) + return + + +@app.cell +def _(TDataItems, data, dlt): + @dlt.resource(name="pokemon", write_disposition="merge", primary_key="id") + def merge_pokemon() -> TDataItems: + yield data + + merge_pipeline = dlt.pipeline( + pipeline_name="poke_pipeline_merge", + destination="duckdb", + dataset_name="pokemon_data", + ) + _load_info = merge_pipeline.run(merge_pokemon) + print(_load_info) + merge_pipeline.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The merge write disposition can be used with three different strategies: + + * delete-insert (default strategy) + * scd2 + * upsert + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Incremental Loading** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Incremental loading is the act of loading only new or changed data and not old records that we already loaded. + + Imagine you’re a Pokémon trainer trying to catch ‘em all. You don’t want to keep visiting the same old PokéStops, catching the same old Bulbasaurs—you only want to find new and exciting Pokémon that have appeared since your last trip. That’s what incremental loading is all about: collecting only the new data that’s been added or changed, without wasting your Poké Balls (or database resources) on what you already have. + + In this example, we have a dataset of Pokémon, each with a **unique ID**, their **name**, **size** (height and weight), and **when** they were "caught" (`created_at` field). 
+ + ### **Step 1: Adding the `created_at` Field** + """) + return + + +@app.cell +def _(): + # We added `created_at` field to the data + created_data = [ + { + "id": "1", + "name": "bulbasaur", + "size": {"weight": 6.9, "height": 0.7}, + "created_at": "2024-12-01", + }, + { + "id": "4", + "name": "charmander", + "size": {"weight": 8.5, "height": 0.6}, + "created_at": "2024-09-01", + }, + { + "id": "25", + "name": "pikachu", + "size": {"weight": 6, "height": 0.4}, + "created_at": "2023-06-01", + }, + ] + return (created_data,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **The goal**: Load only Pokémons caught after January 1, 2024, skipping the ones you already have. + + ### **Step 2: Defining the incremental logic** + + Using `dlt`, we set up an [incremental filter](https://www.google.com/url?q=https://dlthub.com/docs/general-usage/incremental-loading%23incremental-loading-with-a-cursor-field&sa=D&source=editors&ust=1734717286675253&usg=AOvVaw3rAF3y3p86sGt49ImCTgon) to only fetch Pokémons caught after a certain date: + ```python + cursor_date = dlt.sources.incremental("created_at", initial_value="2024-01-01") + ``` + This tells `dlt`: + - **Start date**: January 1, 2024 (`initial_value`). + - **Field to track**: `created_at` (our timestamp). + + As you run the pipeline repeatedly, `dlt` will keep track of the latest `created_at` value processed. It will skip records older than this date in future runs. + """) + return + + +@app.cell +def _(TDataItems, created_data, dlt): + @dlt.resource(name="pokemon", write_disposition="append") + def incremental_pokemon( + cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental( + "created_at", initial_value="2024-01-01" + ) + ) -> TDataItems: + yield created_data + return (incremental_pokemon,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""We use the `@dlt.resource` decorator to declare table **name** to which data will be loaded and **write disposition**, which is **append** by default.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Step 3: Running the pipeline** + Finally, we run our pipeline and load the fresh Pokémon data: + """) + return + + +@app.cell +def _(dlt, incremental_pokemon): + incremental_pipeline = dlt.pipeline( + pipeline_name="poke_pipeline_incremental", + destination="duckdb", + dataset_name="pokemon_data", + ) + _load_info = incremental_pipeline.run(incremental_pokemon) + print(_load_info) + # explore loaded data + incremental_pipeline.dataset().pokemon.df() + return (incremental_pipeline,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + This: + 1. Loads **only Charmander and Bulbasaur** (caught after 2024-01-01). + 2. Skips Pikachu because it’s old news. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Only data for 2024 year was loaded.""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_5_Write_disposition_and_incremental_loading_img2](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_5_Write_disposition_and_incremental_loading_img2.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Run the same pipeline again. The pipeline will detect that there are **no new records** based on the `created_at` field and the incremental cursor. 
As a result, **no new data will be loaded** into the destination: + >0 load package(s) were loaded + """) + return + + +@app.cell +def _(incremental_pipeline, incremental_pokemon): + _load_info = incremental_pipeline.run(incremental_pokemon) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### **Why incremental loading matters** + + * **Efficiency**. Skip redundant data, saving time and resources. + * **Scalability**. Handle growing datasets without bottlenecks. + * **Automation**. Let the tool track changes for you—no manual effort. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Update and deduplicate your data** + The script above finds new pokemons and adds them to the database. It will ignore any updates to user information. + """) + return + + +@app.cell +def _(): + # We added `updated_at` field to the data + updated_data = [ + { + "id": "1", + "name": "bulbasaur", + "size": {"weight": 6.9, "height": 0.7}, + "created_at": "2024-12-01", + "updated_at": "2024-12-01", + }, + { + "id": "4", + "name": "charmander", + "size": {"weight": 8.5, "height": 0.6}, + "created_at": "2024-09-01", + "updated_at": "2024-09-01", + }, + { + "id": "25", + "name": "pikachu", + "size": { + "weight": 9, + "height": 0.4, + }, + "created_at": "2023-06-01", + "updated_at": "2024-12-16", + }, + ] + return (updated_data,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Get always fresh content of all the pokemons: combine an **incremental load** with **merge** write disposition, like in the script below.""" + ) + return + + +@app.cell +def _(TDataItems, dlt): + @dlt.resource(name="pokemon", write_disposition="merge", primary_key="id") + def dedup_pokemon( + data: TDataItems, + cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental( + "updated_at", initial_value="2024-01-01" + ), + ) -> TDataItems: + yield data + return (dedup_pokemon,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""The incremental cursor keeps an eye on the `updated_at` field. Every time the pipeline runs, it only processes records with `updated_at` values greater than the last run.""" + ) + return + + +@app.cell +def _(dedup_pokemon, dlt, updated_data): + dedup_pipeline = dlt.pipeline( + pipeline_name="poke_pipeline_dedup", + destination="duckdb", + dataset_name="pokemon_data", + ) + _load_info = dedup_pipeline.run(dedup_pokemon(updated_data)) + print(_load_info) + # explore loaded data + dedup_pipeline.dataset().pokemon.df() + return (dedup_pipeline,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + All Pokémons are processed because this is the pipeline’s first run. + + Now, let’s say Pikachu goes to gym and sheds some weight (down to 7.5), and the `updated_at` field is set to `2024-12-23`. 
+ """) + return + + +@app.cell +def _(): + reupdated_data = [ + { + "id": "1", + "name": "bulbasaur", + "size": {"weight": 6.9, "height": 0.7}, + "created_at": "2024-12-01", + "updated_at": "2024-12-01", + }, + { + "id": "4", + "name": "charmander", + "size": {"weight": 8.5, "height": 0.6}, + "created_at": "2024-09-01", + "updated_at": "2024-09-01", + }, + { + "id": "25", + "name": "pikachu", + "size": {"weight": 7.5, "height": 0.4}, + "created_at": "2023-06-01", + "updated_at": "2024-12-23", + }, + ] + return (reupdated_data,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Run the same pipeline:""") + return + + +@app.cell +def _(dedup_pipeline, dedup_pokemon, reupdated_data): + _load_info = dedup_pipeline.run(dedup_pokemon(reupdated_data)) + print(_load_info) + # explore loaded data + dedup_pipeline.dataset().pokemon.df() + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **What happened?** + + * The pipeline detected that `updated_at` for Bulbasaur and Charmander hasn’t changed—they’re skipped. + * Pikachu’s record was updated to reflect the latest weight. + + You can see that the **`_dlt_load_id`** for Bulbasaur and Charmander remained the same, but for Pikachu it was changed since only the updated Pikachu data was loaded into the destination. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The **`dlt.sources.incremental`** instance above has the following attributes: + + * **`cursor_date.initial_value`** which is always equal to "2024-01-01" passed in the constructor; + * **`cursor_date.start_value`** a maximum `updated_at` value from the previous run or the `initial_value` on the first run; + * **`cursor_date.last_value`** a "real-time" `updated_at` value updated with each yielded item or page. Before the first yield, it equals `start_value`; + * **`cursor_date.end_value`** (not used here) marking the end of the backfill range. + + ## **Example** + You can use them in the resource code to make **more efficient requests**. Take look at the GitHub API example: + """) + return + + +@app.cell +def _(TDataItems, dlt, os): + from typing import Iterable + from dlt.extract import DltResource + from dlt.sources.helpers import requests + from dlt.sources.helpers.rest_client import RESTClient + from dlt.sources.helpers.rest_client.auth import BearerTokenAuth + from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator + + dlt.secrets["SOURCES__ACCESS_TOKEN"] = os.getenv("SECRET_KEY") + + @dlt.source + def github_source(access_token: str = dlt.secrets.value) -> Iterable[DltResource]: + client = RESTClient( + base_url="https://api.github.com", + auth=BearerTokenAuth(token=access_token), + paginator=HeaderLinkPaginator(), + ) + + @dlt.resource(name="issues", write_disposition="merge", primary_key="id") + def github_issues( + cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental( + "updated_at", initial_value="2024-12-01" + ) + ) -> TDataItems: + params = {"since": cursor_date.last_value, "status": "open"} + for page in client.paginate("repos/dlt-hub/dlt/issues", params=params): + yield page + + return github_issues + + pipeline = dlt.pipeline(pipeline_name="github_incr", destination="duckdb") + _load_info = pipeline.run(github_source()) + print(_load_info) + return github_source, pipeline + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Pay attention to how we use the **since** GitHub API parameter and `cursor_date.last_value` to tell GitHub which issues we are interested in. 
`cursor_date.last_value` holds the last `cursor_date` value from the previous run. + + Run the pipeline again and make sure that **no data is loaded**. + """) + return + + +@app.cell +def _(github_source, pipeline): + # run the pipeline with the new resource + _load_info = pipeline.run(github_source()) + print(_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Apply Hints** + + Alternatively, you can use `apply_hints` on a resource to define an incremental field: + + ```python + resource = resource() + resource.apply_hints(incremental=dlt.sources.incremental("updated_at")) + ``` + + When you apply an incremental hint using `apply_hints`, the source still performs a full extract. The incremental hint is used by `dlt` to filter the data after it has been extracted, before it is loaded into the destination. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Exercise 1: Make the GitHub API pipeline incremental** + + In the previous lessons, you built a pipeline to pull data from the GitHub API. Now, let’s level it up by making it incremental, so it fetches only new or updated data. + + + Transform your GitHub API pipeline to use incremental loading. This means: + + * Implement a new `dlt.resource` for `pulls/comments` (List comments for Pull Requests) endpoint. + * Fetch only pulls comments updated after the last pipeline run. + * Use the `updated_at` field from the GitHub API as the incremental cursor. + * [Endpoint documentation](https://docs.github.com/en/rest/pulls/comments?apiVersion=2022-11-28#list-review-comments-in-a-repository) + * Endpoint URL: `https://api.github.com/repos/OWNER/REPO/pulls/comments` + * Use the `since` parameter - only show results that were last updated after the given time - and `last_value`. + * `initial_value` is `2024-12-01`. + + + ### Question + + How many columns does the `comments` table have? + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb b/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb index cda220b7f..cd875c717 100644 --- a/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb +++ b/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb @@ -6,14 +6,14 @@ "id": "h93BcC8SX2fj" }, "source": [ - "# **Recap of [Lesson 5](https://colab.research.google.com/drive/1Zf24gIVMNNj9j-gtXFl8p0orI9ttySDn#forceEdit=true&sandboxMode=true) 👩‍💻🚀**\n", + "# **Recap of [Lesson 5](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) 👩‍💻🚀**\n", "\n", - "1. Explored 3 dlt write dispositions:\n", - " * append;\n", - " * replace;\n", - " * merge.\n", - "2. Learned how to update and depuplicate data\n", - "3. Created incremental pipeline\n" + "1. Explored 3 `dlt` write dispositions: \n", + " - append \n", + " - replace \n", + " - merge \n", + "2. Learned how to update and deduplicate data \n", + "3. 
Created an incremental pipeline\n" ] }, { @@ -22,16 +22,16 @@ "id": "26boldDvOn0R" }, "source": [ - "# **How dlt works** 🧠🧠 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)\n", + "# **How dlt works** 🧠🧠 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb)\n", "\n", "\n", "**Here, you will learn:**\n", - "- Three main steps:\n", - " - Extract;\n", - " - Normalize;\n", - " - Load. \n", - "- Some default behaviour.\n", - "- About file formats." + "- The 3 main steps of a pipeline run: \n", + " - Extract \n", + " - Normalize \n", + " - Load \n", + "- Some default behaviors \n", + "- Supported file formats" ] }, { @@ -42,7 +42,7 @@ "source": [ "## **Introduction**\n", "\n", - "The main building block of dlt is the **pipeline**, which orchestrates the loading of data from your source into your destination in three discrete steps when you call its **run** method." + "The main building block of `dlt` is the **pipeline**, which orchestrates the loading of data from your source into your destination in three discrete steps when you call its **run** method." ] }, { @@ -60,8 +60,8 @@ "id": "Xh6CKQATb63X" }, "source": [ - "# **Understing `pipeline.run()`**\n", - " The `pipeline.run()` method executes the entire pipeline, encompassing the [`extract`](#scrollTo=4C0U1dnwZxAB), [`normalize`](#scrollTo=bCeUqaW_cRSh), and [`load`](#scrollTo=Rn6cUc0OcWsk) stages." + "# **Understanding `pipeline.run()`**\n", + " The `pipeline.run()` method executes the entire pipeline, encompassing the `extract`, `normalize`, and `load` stages." ] }, { @@ -89,7 +89,7 @@ "outputs": [], "source": [ "%%capture\n", - "!pip install -U dlt" + "!pip install dlt" ] }, { @@ -136,14 +136,14 @@ "The `progress=\"log\"` argument in the `dlt.pipeline` configuration enables detailed logging of the pipeline’s progress during execution. These logs provide visibility into the pipeline’s operations, showing how data flows through the **Extract**, **Normalize**, and **Load** phases. The logs include real-time metrics such as resource or file counts, time elapsed, processing rates, memory usage, and CPU utilization.\n", "\n", "\n", - "dlt supports 4 progress monitors out of the box:\n", + "`dlt` supports 4 progress monitors out of the box:\n", "\n", "* `enlighten` - a status bar with progress bars that also allows for logging.\n", "* `tqdm` - the most popular Python progress bar lib, proven to work in Notebooks.\n", "* `alive_progress` - with the most fancy animations.\n", - "* `log` - dumps the progress information to log, console, or text stream. 
the most useful on production optionally adds memory and CPU usage stats.\n", + "* `log` — dumps progress information to a log, console, or text stream; most useful in production, and can optionally include memory and CPU usage stats.\n", "\n", - "For more information read the [official documentation](https://dlthub.com/docs/general-usage/pipeline#display-the-loading-progress)." + "For more information read the [official documentation](https://dlthub.com/docs/general-usage/pipeline#monitor-the-loading-progress)." ] }, { @@ -154,7 +154,7 @@ "source": [ "## **Extract**\n", "\n", - "Extract can be run individually with the `extract` command on the pipeline:\n", + "Extract can be run individually with the `extract` method on the pipeline:\n", "\n", "```python\n", "pipeline.extract(data)\n", @@ -712,17 +712,8 @@ "id": "NYbccmLie1zm" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)!" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rZpSep8SV1SZ" - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py b/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py new file mode 100644 index 000000000..d108f7589 --- /dev/null +++ b/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py @@ -0,0 +1,663 @@ +# /// script +# dependencies = [ +# "dlt", +# "numpy", +# "pandas", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Recap of [Lesson 5](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) 👩‍💻🚀** + + 1. Explored 3 `dlt` write dispositions: + - append + - replace + - merge + 2. Learned how to update and deduplicate data + 3. Created an incremental pipeline + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **How dlt works** 🧠🧠 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) + + + **Here, you will learn:** + - The 3 main steps of a pipeline run: + - Extract + - Normalize + - Load + - Some default behaviors + - Supported file formats + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Introduction** + + The main building block of `dlt` is the **pipeline**, which orchestrates the loading of data from your source into your destination in three discrete steps when you call its **run** method. 
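    These three steps can also be invoked one at a time on the pipeline object. The sketch below is illustrative (the pipeline name and sample data are made up, and it assumes a destination such as DuckDB is installed):

    ```python
    import dlt

    pipeline = dlt.pipeline(pipeline_name="stepwise_pipeline", destination="duckdb")

    # 1. extract: fetch the data and write it to intermediary files on disk
    pipeline.extract([{"id": 1}, {"id": 2}], table_name="items")

    # 2. normalize: infer a schema and build load packages for the destination
    pipeline.normalize()

    # 3. load: run schema migrations if needed and load the packages into DuckDB
    load_info = pipeline.load()
    print(load_info)
    ```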
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""So, let's take a step back and walk through the internal steps of `pipeline.run()`, identifying methods to optimize each one.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Understanding `pipeline.run()`** + The `pipeline.run()` method executes the entire pipeline, encompassing the `extract`, `normalize`, and `load` stages. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_6_How_dlt_works_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_6_How_dlt_works_img1.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Consider this intentionally short example:""") + return + + +@app.cell +def _(): + import dlt + + pipeline = dlt.pipeline( + pipeline_name="my_pipeline", destination="duckdb", progress="log" + ) + + load_info = pipeline.run( + [ + {"id": 1}, + {"id": 2}, + {"id": 3, "nested": [{"id": 1}, {"id": 2}]}, + ], + table_name="items", + ) + print(load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + This is what happens when the `run` method is executed: + + 1. **Extract** - Fully extracts the data from your source to your hard drive. In the example above, an implicit source with one resource with 3 items is created and extracted. + 2. **Normalize** - Inspects and normalizes your data and computes a schema compatible with your destination. For the example above, the normalizer will detect one column `id` of type `int` in one table named `items`, it will furthermore detect a nested list in table items and unnest it into a child table named `items__nested`. + 3. **Load** - Runs schema migrations if necessary on your destination and loads your data into the destination. For the example above, a new dataset on a local duckdb database is created that contains the two tables discovered in the previous steps. + + + ## **Display the loading progress** + Notice how we use `progress="log"` here. + + The `progress="log"` argument in the `dlt.pipeline` configuration enables detailed logging of the pipeline’s progress during execution. These logs provide visibility into the pipeline’s operations, showing how data flows through the **Extract**, **Normalize**, and **Load** phases. The logs include real-time metrics such as resource or file counts, time elapsed, processing rates, memory usage, and CPU utilization. + + + `dlt` supports 4 progress monitors out of the box: + + * `enlighten` - a status bar with progress bars that also allows for logging. + * `tqdm` - the most popular Python progress bar lib, proven to work in Notebooks. + * `alive_progress` - with the most fancy animations. + * `log` — dumps progress information to a log, console, or text stream; most useful in production, and can optionally include memory and CPU usage stats. + + For more information read the [official documentation](https://dlthub.com/docs/general-usage/pipeline#monitor-the-loading-progress). + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Extract** + + Extract can be run individually with the `extract` method on the pipeline: + + ```python + pipeline.extract(data) + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **What happens at the extraction stage?**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + When the `pipeline.run()` method is executed, it first performs the `extract` stage, during which the following occurs: + + 1. 
Data is fetched and stored in an in-memory buffer. + 2. When the buffer reaches its capacity, the data inside it is written to an intermediary file, and the buffer is cleared for the next set of data items. + 3. If a size is specified for intermediary files and an the intermediary file in question reaches this size, a new intermediary file is opened for further data. + + ``` + API Data + | (extract) + Buffer + (resources) / | ... | \ + extracted data in local storage + + ``` + + The **number** of intermediate **files** depends on the number of **resources** and whether **file rotation** is enabled. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **Default behaviour at the extraction stage**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + - The in-memory buffer is set to `5000` items. + - By default, **intermediary files are not rotated**. If you do not explicitly set a size for an intermediary file with `file_max_items=100000`, `dlt` will create a **single file** for a resource, regardless of the number of records it contains, even if it reaches millions. + - By default, intermediary files at the extract stage use a custom version of the JSONL format. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Normalize** + + Normalize can be run individually with the `normalize` command on the pipeline. Normalize is dependent on having a completed extract phase and will not do anything if there is no extracted data. + + ```py + pipeline.normalize() + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **What happens at the normalization stage?**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + In the `normalize` stage, `dlt` first transforms the structure of the input data. This transformed data is then converted into a relational structure that can be easily loaded into the destination. To be detailed, here's what happens during this stage: + + 1. Intermediary files are sent from the `extract` stage to the `normalize` stage. + 3. During normalization step it processes one intermediate file at a time within its own in-memory buffer. + 4. When the buffer reaches its capacity, the normalized data inside it is written to an intermediary file, and the buffer is cleared for the next set of data items. + 4. If a size is specified for intermediary files in the normalize stage and the intermediary file in question reaches this size, a new intermediary file is opened. + + ``` + (extract) + API Data --> extracted files in local storage + / | \ (normalize) + one file ... one file + / | \ / | \ + normalized files normalized files + + ``` + + + The **number** of intermediate **files** depends on the number of **resources** and whether **file rotation** is enabled. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **Default behaviour at the normalization stage**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + - The in-memory buffer is set to `5000`, just like at the extraction stage. + - By default, **intermediary files are not rotated** as well. If you do not explicitly set a size for an intermediary file with `file_max_items=100000`, dlt will create a **single file** for a resource, regardless of the number of records it contains, even if it reaches millions. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## **Load** + + Load can be run individually with the `load` command on the pipeline. 
Load is dependent on having a completed normalize phase and will not do anything if there is no normalized data. + + ```py + pipeline.load() + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **What happens at the loading stage?**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The `load` stage is responsible for taking the normalized data and loading it into your chosen destination: + + 1. All intermediary files from a single source are combined into a single load package. + 2. All load packages are then loaded into the destination. + + + ``` + (extract) (normalize) + API Data --> extracted files --> normalized files + / | ... | \ (load) + one normalized file ... one file + \ | ... | / + destination + + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **Default behaviour at the loading stage**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""- Loading happens in `20` threads, each loading a single file.""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""## **Intermediary file formats**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Intermediary files at the extract stage use a custom version of the JSONL format, while the loader files - files created at the normalize stage - can take 4 different formats.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **JSONL**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Definition**: JSON Delimited is a file format that stores several JSON documents in one file. The JSON documents are separated by a new line. + + **Compression:** enabled by default. + + **Data type handling:** + + - `datetime` and `date` are stored as ISO strings; + - `decimal` is stored as a text representation of a decimal number; + - `binary` is stored as a base64 encoded string; + - `HexBytes` is stored as a hex encoded string; + - `complex` is serialized as a string. + + **By default used by:** + + - Bigquery + - Snowflake + - Filesystem + + **Configuration**: + + - Directly in the `pipeline.run()`: + + ```py + info = pipeline.run(some_source(), loader_file_format="jsonl") + ``` + + - In `config.toml` or `secrets.toml`: + + ```py + [normalize] + loader_file_format="jsonl" + ``` + + - Via environment variables: + + ```py + export NORMALIZE__LOADER_FILE_FORMAT="jsonl" + ``` + + - Specify directly in the resource decorator: + + ```py + @dlt.resource(file_format="jsonl") + def generate_rows(): + ... + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **Parquet**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Definition**: Apache Parquet is a free and open-source column-oriented data storage format in the Apache Hadoop ecosystem. + + **Prerequisite:** To use this format, you need a pyarrow package. You can get this package as a dlt extra as well: + + ```py + pip install "dlt[parquet]" + + ``` + + **Default version**: 2.4, which coerces timestamps to microseconds and silently truncates nanoseconds for better compatibility with databases and pandas. 
+ + **Supported by:** + + - Bigquery + - DuckDB + - Snowflake + - Filesystem + - Athena + - Databricks + - Synapse + + **Configuration**: + + - Directly in the `pipeline.run()`: + + ```py + info = pipeline.run(some_source(), loader_file_format="parquet") + ``` + + - In `config.toml` or `secrets.toml`: + + ```py + [normalize] + loader_file_format="parquet" + ``` + + - Via environment variables: + + ```py + export NORMALIZE__LOADER_FILE_FORMAT="parquet" + ``` + + - Specify directly in the resource decorator: + + ```py + @dlt.resource(file_format="parquet") + def generate_rows(): + ... + ``` + + + **Destination AutoConfig**: + + `dlt` automatically configures the Parquet writer based on the destination's capabilities: + + - Selects the appropriate decimal type and sets the correct precision and scale for accurate numeric data storage, including handling very small units like Wei. + + - Adjusts the timestamp resolution (seconds, microseconds, or nanoseconds) to match what the destination supports + + + **Writer settings:** + + `dlt` uses the pyarrow Parquet writer for file creation. You can adjust the writer's behavior with the following options: + + - `flavor` adjusts schema and compatibility settings for different target systems. Defaults to None (pyarrow default). + - `version` selects Parquet logical types based on the Parquet format version. Defaults to "2.6". + - `data_page_size` sets the target size for data pages within a column chunk (in bytes). Defaults to None. + - `timestamp_timezone` specifies the timezone; defaults to UTC. + - `coerce_timestamps` sets the timestamp resolution (s, ms, us, ns). + - `allow_truncated_timestamps` raises an error if precision is lost on truncated timestamps. + + **Example configurations:** + + - In `configs.toml` or `secrets.toml`: + ```py + [normalize.data_writer] + # the default values + flavor="spark" + version="2.4" + data_page_size=1048576 + timestamp_timezone="Europe/Berlin" + ``` + + - Via environment variables: + ```py + export NORMALIZE__DATA_WRITER__FLAVOR="spark" + ``` + + + **Timestamps and timezones** + + `dlt` adds UTC adjustments to all timestamps, creating timezone-aware timestamp columns in destinations (except DuckDB). + + **Disable timezone/UTC adjustments:** + + - Set `flavor` to spark to use the deprecated `int96` timestamp type without logical adjustments. + + - Set `timestamp_timezone` to an empty string (`DATA_WRITER__TIMESTAMP_TIMEZONE=""`) to generate logical timestamps without UTC adjustment. + + By default, pyarrow converts timezone-aware DateTime objects to UTC and stores them in Parquet without timezone information. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **CSV**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Supported by:** + + - PostgreSQL + - Filesystem + - Snowflake + + **Configuration**: + + - Directly in the `pipeline.run()`: + + ```py + info = pipeline.run(some_source(), loader_file_format="csv") + ``` + + - In `config.toml` or `secrets.toml`: + + ```py + [normalize] + loader_file_format="csv" + ``` + + - Via environment variables: + + ```py + export NORMALIZE__LOADER_FILE_FORMAT="csv" + ``` + + - Specify directly in the resource decorator: + + ```py + @dlt.resource(file_format="csv") + def generate_rows(): + ... + ``` + + + **Two implementation**: + + 1. 
`pyarrow` csv writer - very fast, multithreaded writer for the arrow tables + - binary columns are supported only if they contain valid UTF-8 characters + - complex (nested, struct) types are not supported + 2. `python stdlib writer` - a csv writer included in the Python standard library for Python objects + + - binary columns are supported only if they contain valid UTF-8 characters (easy to add more encodings) + - complex columns dumped with json.dumps + - None values are always quoted + + **Default settings:** + + - separators are commas + - quotes are " and are escaped as "" + - NULL values both are empty strings and empty tokens as in the example below + - UNIX new lines are used + - dates are represented as ISO 8601 + quoting style is "when needed" + + **Adjustable setting:** + + - `delimiter`: change the delimiting character (default: ',') + - `include_header`: include the header row (default: True) + - `quoting`: `quote_all` - all values are quoted, `quote_needed` - quote only values that need quoting (default: `quote_needed`) + + ```py + [normalize.data_writer] + delimiter="|" + include_header=false + quoting="quote_all" + ``` + + or + + ```py + NORMALIZE__DATA_WRITER__DELIMITER=| + NORMALIZE__DATA_WRITER__INCLUDE_HEADER=False + NORMALIZE__DATA_WRITER__QUOTING=quote_all + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""### **SQL INSERT File Format**""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + This file format contains an INSERT...VALUES statement to be executed on the destination during the `load` stage. + + Additional data types are stored as follows: + + - `datetime` and date are stored as ISO strings; + - `decimal` is stored as a text representation of a decimal number; + - `binary` storage depends on the format accepted by the destination; + - `complex` storage also depends on the format accepted by the destination. + + This file format is compressed by default. + + **Default for:** + + 1. DuckDB + 2. PostgreSQL + 3. Redshift + + **Supported by:** + + 1. Filesystem + + **Configuration**: + + - Directly in the `pipeline.run()`: + + ```py + info = pipeline.run(some_source(), loader_file_format="insert_values") + ``` + + - In `config.toml` or `secrets.toml`: + + ```py + [normalize] + loader_file_format="insert_values" + ``` + + - Via environment variables: + + ```py + export NORMALIZE__LOADER_FILE_FORMAT="insert_values" + ``` + + - Specify directly in the resource decorator: + + ```py + @dlt.resource(file_format="insert_values") + def generate_rows(): + ... 
+ ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb b/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb index 8a37d2c2e..047a2887e 100644 --- a/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb +++ b/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb @@ -6,14 +6,14 @@ "id": "h93BcC8SX2fj" }, "source": [ - "# **Recap of [Lesson 6](https://colab.research.google.com/drive/1geSMNRkSwAelQJKd3e8vdoHCKiHMdmIo#forceEdit=true&sandboxMode=true) 👩‍💻🚀**\n", + "# **Recap of [Lesson 6](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) 👩‍💻🚀**\n", "\n", - "1. Learned how dlt works under the hood;\n", - "2. Explored 3 main steps:\n", - " * Extract;\n", - " * Normalize;\n", - " * Load.\n", - "3. Learned which file formats dlt supports." + "1. Learned how `dlt` works under the hood. \n", + "2. Explored the 3 main steps of a pipeline run: \n", + " - Extract \n", + " - Normalize \n", + " - Load \n", + "3. Learned which file formats `dlt` supports." ] }, { @@ -24,7 +24,7 @@ "source": [ "---\n", "\n", - "# **Inspecting & Adjusting Schema** 🧠🧠 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)\n", + "# **Inspecting & Adjusting Schema** 🧠🧠 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb)\n", "\n", "\n", "**Here, you will learn or refresh your knowledge on:**\n", @@ -56,7 +56,7 @@ "id": "1vRudCVb9zII" }, "source": [ - "Let's load some GitHub data to DuckDB to inspect the schema in different ways. First we need to install dlt with DuckDB:" + "Let's load some GitHub data to DuckDB to inspect the schema in different ways." 
] }, { @@ -68,7 +68,7 @@ "outputs": [], "source": [ "%%capture\n", - "!pip install -U dlt" + "!pip install dlt" ] }, { @@ -77,7 +77,7 @@ "id": "DKvf4NWW-U9V" }, "source": [ - "Define a dlt resource that fetches pull requests and wrap it in a dlt source, create a pipeline and run it:" + "Define a `dlt` resource that fetches pull requests and wrap it in a `dlt` source, create a pipeline and run it:" ] }, { @@ -100,7 +100,7 @@ "import os\n", "from google.colab import userdata\n", "\n", - "os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n", + "dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n", "\n", "\n", "@dlt.source\n", @@ -259,7 +259,7 @@ " pipeline_name=\"github_pipeline2\",\n", " destination=\"duckdb\",\n", " dataset_name=\"github_data\",\n", - " export_schema_path=\"schemas/export\", # <--- dir path for a schema export\n", + " export_schema_path=\"schemas/export\",\n", ")" ] }, @@ -308,7 +308,9 @@ }, "outputs": [], "source": [ - "!ls schemas/export && cat schemas/export/github_source.schema.yaml" + "print(os.listdir(\"schemas/export\"))\n", + "with open(\"schemas/export/github_source.schema.yaml\") as f:\n", + " print(f.read())" ] }, { @@ -957,7 +959,8 @@ }, "outputs": [], "source": [ - "!cat schemas/export/github_source.schema.yaml" + "with open(\"schemas/export/github_source.schema.yaml\") as f:\n", + " print(f.read())" ] }, { @@ -977,17 +980,8 @@ "id": "NYbccmLie1zm" }, "source": [ - "✅ ▶ Proceed to the [next lesson](https://colab.research.google.com/drive/1jp5UtydA3x9cAq-fbW2tRmAOl4LMZqM1#forceEdit=true&sandboxMode=true)!" + "✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)!" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gxU44wP9GvG6" - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.py b/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.py new file mode 100644 index 000000000..ee82fb4a8 --- /dev/null +++ b/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.py @@ -0,0 +1,882 @@ +# /// script +# dependencies = [ +# "dlt", +# "numpy", +# "pandas", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Recap of [Lesson 6](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) 👩‍💻🚀** + + 1. Learned how `dlt` works under the hood. + 2. Explored the 3 main steps of a pipeline run: + - Extract + - Normalize + - Load + 3. Learned which file formats `dlt` supports. 
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + # **Inspecting & Adjusting Schema** 🧠🧠 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) + + + **Here, you will learn or refresh your knowledge on:** + - Methods to inspect a schema + - The components of a schema + - How to modify a schema + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Methods to inspect a schema** + + - **What's a schema?** The schema describes the structure of normalized data (e.g. tables, columns, data types, etc.). `dlt` generates schemas from the data during the normalization process. + + - **How can you inspect a schema in `dlt`?** There are multiple ways: + - CLI + - Python + - Export schema directly + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Let's load some GitHub data to DuckDB to inspect the schema in different ways.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Define a `dlt` resource that fetches pull requests and wrap it in a `dlt` source, create a pipeline and run it:""" + ) + return + + +@app.cell +def _(): + from typing import Iterable + import dlt + from dlt.common.typing import TDataItems + from dlt.extract import DltResource + from dlt.sources.helpers import requests + from dlt.sources.helpers.rest_client import RESTClient + from dlt.sources.helpers.rest_client.auth import BearerTokenAuth + from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator + + import os + + dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY") + + @dlt.source + def github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]: + client = RESTClient( + base_url="https://api.github.com", + auth=BearerTokenAuth(token=secret_key), + paginator=HeaderLinkPaginator(), + ) + + @dlt.resource + def github_pulls( + cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental( + "updated_at", initial_value="2024-12-01" + ) + ) -> TDataItems: + params = {"since": cursor_date.last_value, "status": "open"} + for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params): + yield page + + return github_pulls + + # define new dlt pipeline + pipeline = dlt.pipeline( + pipeline_name="github_pipeline1", + destination="duckdb", + dataset_name="github_data", + ) + + # run the pipeline with the new resource + load_info = pipeline.run(github_source()) + print(load_info) + return dlt, github_source, load_info, os + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(0) CLI** + + Let's first try the CLI command `dlt pipeline -v load-package`, which is used to inspect a load package in verbose mode. + + > In the context of the `dlt` library, a load package is a collection of jobs with data for particular tables. The -v flag stands for verbose, which means the command will provide more detailed output. 
+ + Specifically, this command will show the schema changes introduced in the load package for the given pipeline. + """) + return + + +@app.cell +def _(): + import subprocess + + subprocess.run( + ["dlt", "pipeline", "-v", "github_pipeline1", "load-package"], check=True + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(1) Python** + + Alternatively, we can inspect the schema object from load info with: + + ```python + print(load_info.load_packages[0].schema) + ``` + + which has the following public methods and attributes: + """) + return + + +@app.cell +def _(load_info): + # This code snippet just prints out the public methods and attributes of the schema object in load info + all_attributes_methods = dir(load_info.load_packages[0].schema) + public_attributes_methods = [ + attr for attr in all_attributes_methods if not attr.startswith("_") + ] + + print(f"{'Attribute/Method':<50} {'Type':<10}") + print("-" * 40) + for attr in public_attributes_methods: + attr_value = getattr(load_info.load_packages[0].schema, attr) + if callable(attr_value): + print(f"{attr:<50} {'method':<10}") + else: + print(f"{attr:<50} {'attribute':<10}") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Let's use the `to_pretty_json` method and print the schema:""") + return + + +@app.cell +def _(load_info): + print(load_info.load_packages[0].schema.to_pretty_json()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(2) Exporting schema** + + > Exporting the data schema directly into a file might be even more straightforward than the two previous approaches. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""The instruction to export a schema should be provided at the beginning when creating a pipeline:""" + ) + return + + +@app.cell +def _(dlt): + pipeline_1 = dlt.pipeline( + pipeline_name="github_pipeline2", + destination="duckdb", + dataset_name="github_data", + export_schema_path="schemas/export", + ) + return (pipeline_1,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Run the pipeline:""") + return + + +@app.cell +def _(github_source, pipeline_1): + load_info_1 = pipeline_1.run(github_source()) + print(load_info_1) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_7_Inspecting_%26_Adjusting_Schema_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_7_Inspecting_%26_Adjusting_Schema_img1.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Check if the schema was exported.""") + return + + +@app.cell +def _(os): + print(os.listdir("schemas/export")) + with open("schemas/export/github_source.schema.yaml") as _f: + print(_f.read()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **The components of a schema** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""> Since we learned the ways we can inspect the schema, it's important to actually understand what it contains to be able to meaningfully adjust it later.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + A schema (in YAML format) looks somethng like this: + + ```yaml + version: 2 + version_hash: wdIt+pExjT8Mj1ygQEMhq3E3SXtNBuIbHg0fDz9xD9I= + engine_version: 11 + name: github_source + tables: + _dlt_version: + ... + _dlt_loads: + ... + github_pulls: + ... 
+ settings: + detections: + - iso_timestamp + default_hints: + not_null: + - _dlt_id + - _dlt_root_id + - _dlt_parent_id + - _dlt_list_idx + - _dlt_load_id + parent_key: + - _dlt_parent_id + root_key: + - _dlt_root_id + unique: + - _dlt_id + row_key: + - _dlt_id + normalizers: + names: snake_case + json: + module: dlt.common.normalizers.json.relational + previous_hashes: + - 0WLnuf3Jh1J1XsbVrV2eB824Z6heOlf5o912i1v3tho= + - 0d1z0RFV2O0OvfEWkebtSjxrCjjiyv1lOeNiF0V8Lws= + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(0) Schema version hash** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The schema hash, denoted by `version_hash`, is generated from the actual schema content, excluding the hash values and version of the schema. + + Each time the schema is changed, a new hash is produced. + + > Note that during the initial run (the first pipeline run), the version will be 2, and there will be two previous hashes because the schema is updated during both the extract and normalize stages. You can rely on the version number to determine how many times the schema has been changed, but keep in mind that it stops being reliable when parallelization is introduced. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Each version hash is then stored in the `_dlt_version` table.""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_7_Inspecting_%26_Adjusting_Schema_img2](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_7_Inspecting_%26_Adjusting_Schema_img2.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + On subsequent runs, `dlt` checks if the generated schema hash is stored in this table. If it is not, `dlt` concludes that the schema has changed and migrates the destination accordingly. + + - If multiple pipelines are sending data to the same dataset and there is a clash in table names, a single table with the union of the columns will be created. + - If columns clash and have different types or other incompatible characteristics, the load may fail if the data cannot be coerced. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(1) Naming convention** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Each schema contains a naming convention that is denoted in the following way when the schema is exported: + + ```yaml + ... + normalizers: + names: snake_case # naming convention + ... + ``` + The naming convention is particularly useful if the identifiers of the data to be loaded (e.g., keys in JSON files) need to match the namespace of the destination (such as Redshift, which accepts case-insensitive alphanumeric identifiers with a maximum of 127 characters). This convention is used by `dlt` to translate between these identifiers and namespaces. + + The standard behavior of `dlt` is to use the same naming convention for all destinations, ensuring that users always see the same tables and columns in their databases. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The default naming convention is `snake_case`: + + - Removes all ASCII characters except alphanumerics and underscores. + - Adds an underscore (`_`) if the name starts with a number. + - Multiple underscores (`_`) are reduced to a single underscore. + - The parent-child relationship is expressed as a double underscore (`__`) in names. 
+ - The identifier is shortened if it exceeds the length allowed at the destination. + + > If you provide any schema elements that contain identifiers via decorators or arguments (e.g., `table_name` or `columns`), all the names used will be converted according to the naming convention when added to the schema. For example, if you execute `dlt.run(..., table_name="CamelCaseTableName")`, the data will be loaded into `camel_case_table_name`. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + To retain the original naming convention, you can define the following in your `config.toml`: + + ```python + [schema] + naming="direct" + ``` + + or use an environment variable as: + + ``` + SCHEMA__NAMING=direct + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(2) Schema settings** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The `settings` section of the schema file allows you to define various global rules that impact how tables and columns are inferred from data. + + ```yaml + settings: + detections: + ... + default_hints: + ... + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **1. Detections** + + You can define a set of functions that will be used to infer the data type of the column from a value. These functions are executed sequentially from top to bottom on the list. + + ```yaml + settings: + detections: + - timestamp # detects int and float values that can be interpreted as timestamps within a 5-year range and converts them + - iso_timestamp # detects ISO 8601 strings and converts them to timestamp + - iso_date #detects strings representing an ISO-like date (excluding timestamps) and, if so, converts to date + - large_integer # detects integers too large for 64-bit and classifies as "wei" or converts to text if extremely large + - hexbytes_to_text # detects HexBytes objects and converts them to text + - wei_to_double # detects Wei values and converts them to double for aggregate non-financial reporting + ``` + + > `iso_timestamp` detector is enabled by default. + + Detectors can be removed or added directly in code: + + ```python + source = source() + source.schema.remove_type_detection("iso_timestamp") + source.schema.add_type_detection("timestamp") + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **2. Column hint rules** + + The `default_hints` section in the schema file is used to define global rules that apply to newly inferred columns. + + > These rules are applied **after normalization**, meaning after the naming convention is applied! + + + By default, schema adopts column hint rules from the json(relational) normalizer to support correct hinting of columns added by the normalizer: + + ```yaml + settings: + default_hints: + foreign_key: + - _dlt_parent_id + not_null: + - _dlt_id + - _dlt_root_id + - _dlt_parent_id + - _dlt_list_idx + - _dlt_load_id + unique: + - _dlt_id + root_key: + - _dlt_root_id + ``` + + + You can define column names with regular expressions as well. + + ```yaml + settings: + default_hints: + partition: + - re:_timestamp$ # add partition hint to all columns ending with _timestamp + ``` + + Column hints can be added directly in code: + + ```python + source = data_source() + # this will update existing hints with the hints passed + source.schema.merge_hints({"partition": ["re:_timestamp$"]}) + + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **3. 
Preferred data types** + + In the `preferred_types` section, you can define rules that will set the data type for newly created columns. On the left side, you specify a rule for a column name, and on the right side, you define the corresponding data type. You can use column names directly or with regular expressions to match them. + + ```yaml + settings: + preferred_types: + re:timestamp: timestamp + inserted_at: timestamp + created_at: timestamp + updated_at: timestamp + ``` + Above, we prefer `timestamp` data type for all columns containing timestamp substring and define a exact matches for certain columns. + + Preferred data types can be added directly in code as well: + + ```python + source = data_source() + source.schema.update_preferred_types( + { + "re:timestamp": "timestamp", + "inserted_at": "timestamp", + "created_at": "timestamp", + "updated_at": "timestamp", + } + ) + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **How to modify a schema** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Speaking of data types... you can directly apply data types and hints to your resources, bypassing the need for importing and adjusting schemas. This approach is ideal for rapid prototyping and handling data sources with dynamic schema requirements. + + The two main approaches are: + + - Using the `columns` argument in the `dlt.resource` decorator. + - Using the `apply_hints` method. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **`(0) @dlt.resource(columns=...)`** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + This code snippet sets up a nullable boolean column named `my_column` directly in the decorator. + + ```python + @dlt.resource(name='my_table', columns={"my_column": {"data_type": "bool", "nullable": True}}) + def my_resource(): + for i in range(10): + yield {'my_column': i % 2 == 0} + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(1) `apply_hints`** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + When dealing with dynamically generated resources or needing to programmatically set hints, `apply_hints` is your go-to tool. + + The `apply_hints` method in dlt is used to programmatically **set** or **adjust** various aspects of your data resources or pipeline. It can be used in several ways: + + * You can use `apply_hints` to **directly define data types** and their properties, such as nullability, within the `@dlt.resource` decorator. This eliminates the dependency on external schema files. + + * When **dealing with dynamically generated resources** or needing to programmatically set hints, `apply_hints` is your tool. It's especially useful for applying hints across various collections or tables at once. + + * `apply_hints` can be used to **load your data incrementally**. For example, you can load only files that have been updated since the last time dlt processed them, or load only the new or updated records by looking at a specific column. + + * You can **set or update the table name, columns, and other schema elements** when your resource is executed, and you already yield data. Such changes will be merged with the existing schema in the same way the `apply_hints` method works. + + + It’s especially useful for applying hints across multiple collections or tables at once. 
+ + For example, to apply a complex data type across all collections from a MongoDB source: + + ```python + all_collections = ["collection1", "collection2", "collection3"] # replace with your actual collection names + source_data = mongodb().with_resources(*all_collections) + + for col in all_collections: + source_data.resources[col].apply_hints(columns={"column_name": {"data_type": "complex"}}) + + pipeline = dlt.pipeline( + pipeline_name="mongodb_pipeline", + destination="duckdb", + dataset_name="mongodb_data" + ) + load_info = pipeline.run(source_data) + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(2) Adjusting schema settings** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""> Maybe you've noticed, but there several ways to adjust your schema settings directly in code were already covered. This is just a recap. You can go back directly to the Schema Settings section.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Detectors can be removed or added directly in code: + + ```python + source = source() + source.schema.remove_type_detection("iso_timestamp") + source.schema.add_type_detection("timestamp") + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Column hints can be added directly in code: + + ```python + source = data_source() + # this will update existing hints with the hints passed + source.schema.merge_hints({"partition": ["re:_timestamp$"]}) + + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Preferred data types can be added directly in code as well: + + ```python + source = data_source() + source.schema.update_preferred_types( + { + "re:timestamp": "timestamp", + "inserted_at": "timestamp", + "created_at": "timestamp", + "updated_at": "timestamp", + } + ) + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(3) Importing a schema** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""> We mentioned that you can export a schema. In a similar fashion you can import a schema. 
The usual approach to use this functionaility is to export the schema first, make the adjustments and put the adjusted schema into the corresponding import folder.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""The instruction to import a schema should be provided at the beginning when creating a pipeline:""" + ) + return + + +@app.cell +def _(dlt): + pipeline_2 = dlt.pipeline( + pipeline_name="github_pipeline3", + destination="duckdb", + dataset_name="github_data", + export_schema_path="schemas/export", + import_schema_path="schemas/import", + ) + return (pipeline_2,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Let's make an initial pipeline run to export schema into the file.""" + ) + return + + +@app.cell +def _(github_source, pipeline_2): + # run the pipeline with the new resource + load_info_2 = pipeline_2.run(github_source()) + print(load_info_2) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Look at the "Files" in the left sidebar, see the `schema` folder, and `export` and `import` folders inside.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_7_Inspecting_%26_Adjusting_Schema_img3](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_7_Inspecting_%26_Adjusting_Schema_img3.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + Now, both folders contain identic schema files. + + ### **Exercise 1: Adjust import schema** + + **Adjust the import schema** by adding a description of the **`github_pulls`** table. + + + ``` + github_pulls: + columns: + updated_at: + incremental: true + write_disposition: append + resource: github_pulls + description: Table contains all pull requests information from dlt repository + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Run the pipeline:""") + return + + +@app.cell +def _(github_source, pipeline_2): + load_info_3 = pipeline_2.run(github_source()) + print(load_info_3) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Check the exported schema file. It should now contain a description for the `github_pulls` table.""" + ) + return + + +@app.cell +def _(): + with open("schemas/export/github_source.schema.yaml") as _f: + print(_f.read()) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ### Question + + What **data type** does the column `version` in the `_dlt_version` table have? 
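+
+    If you want to check programmatically, one option (a minimal sketch, assuming the default `github_pipeline3.duckdb` file and `github_data` dataset created by the pipeline above) is to ask DuckDB directly:
+
+    ```py
+    import duckdb
+
+    # connect to the duckdb file created by this pipeline and inspect the column types
+    conn = duckdb.connect("github_pipeline3.duckdb")
+    conn.sql("SET search_path = 'github_data'")
+    print(conn.sql("DESCRIBE _dlt_version"))
+    conn.close()
+    ```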
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""✅ ▶ Proceed to the [next lesson](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)!""" + ) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb b/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb index fa3d27c72..f47626e41 100644 --- a/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb +++ b/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb @@ -6,12 +6,12 @@ "id": "h93BcC8SX2fj" }, "source": [ - "# **Recap of [Lesson 7](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r#forceEdit=true&sandboxMode=true) 👩‍💻🚀**\n", + "# **Recap of [Lesson 7](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) 👩‍💻🚀**\n", "\n", - "1. Learned what is a schema.\n", + "1. Learned what a schema is.\n", "2. Explored schema settings and components.\n", - "3. Learned how to retrieve dlt pipeline schema.\n", - "4. Learned how to adjust schema." + "3. Learned how to retrieve a dlt pipeline schema.\n", + "4. Learned how to adjust the schema." ] }, { @@ -22,13 +22,13 @@ "source": [ "---\n", "\n", - "# **Understanding Pipeline Metadata and State** 👻📄 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)\n", + "# **Understanding Pipeline Metadata and State** 👻📄 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb)\n", "\n", "\n", "**Here, you will learn or brush up on:**\n", - "- What's pipeline metadata\n", + "- What pipeline metadata is\n", "- Exploring pipeline metadata from load info\n", - "- Exploring pipeline metadate from trace\n", + "- Exploring pipeline metadata from trace\n", "- Exploring pipeline metadata from state" ] }, @@ -48,16 +48,16 @@ "id": "nFZNlDb1Y7ZH" }, "source": [ - "Metadata is basically data about data.\n", + "**Metadata** is essentially *data about data*.\n", "\n", - "Pipeline Metadata is data about your data pipeline. This can be useful if you want to know things like:\n", + "**Pipeline metadata** is data about your data pipeline. 
This is useful when you want to know things like:\n", "\n", "- When your pipeline first ran\n", "- When your pipeline last ran\n", "- Information about your source or destination\n", "- Processing time\n", - "- Or information that you yourself may want to add to the metadata\n", - "- And much more!\n" + "- Custom metadata you add yourself\n", + "- And much more!" ] }, { @@ -73,9 +73,9 @@ "id": "wY2ySVotY-JU" }, "source": [ - " `dlt` allows you to be able to view all this metadata through various options!\n", + "`dlt` allows you to view all this metadata through various options!\n", "\n", - "This notebook will walk you through those options. Namely:\n", + "This notebook will walk you through those options, namely:\n", "\n", "- Load info\n", "- Trace\n", @@ -88,7 +88,7 @@ "id": "JTR2acUYZbku" }, "source": [ - "Let's load some GitHub data to DuckDB to inspect the pipeline metadata in different ways. First we need to install dlt with DuckDB:" + "Let's load some GitHub data into DuckDB to inspect the pipeline metadata in different ways." ] }, { @@ -109,7 +109,7 @@ "id": "AhU2JVjTZn_j" }, "source": [ - "Define a dlt resource that fetches Pull Requests and wrap it in a dlt source:" + "Define a `dlt` resource that fetches Pull Requests and wrap it in a `dlt` source:" ] }, { @@ -129,10 +129,9 @@ "from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n", "from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n", "\n", - "import os\n", "from google.colab import userdata\n", "\n", - "os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n", + "dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n", "\n", "\n", "@dlt.source\n", @@ -206,13 +205,13 @@ "id": "NA2dPY3_a2Ue" }, "source": [ - "From the [`Inspecting & Adjusting Schema`](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r) Colab we've already learned that we can see which schema changes a load package has introduced with the command:\n", + "From the [`Inspecting & Adjusting Schema`](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) section, we've already learned that we can see which schema changes a load package introduced with the command:\n", "\n", "```\n", "dlt pipeline -v load-package\n", "```\n", "\n", - "The verbose flag only accounts for the schema changes, so if we run it without the flag, we will still see the most recent load package info:" + "The verbose flag only shows schema changes, so if we run it **without** the flag, we will still see the most recent load package info:" ] }, { @@ -232,9 +231,9 @@ "id": "w9ztJjzWcB3q" }, "source": [ - "The `load_id` of a particular package is added to the top data tables (parent tables) and to the special `_dlt_loads` table with a status of 0 when the load process is fully completed. The `_dlt_loads` table tracks complete loads and allows chaining transformations on top of them.\n", + "The `load_id` of a particular package is added to the top data tables (parent tables) and to the special `_dlt_loads` table with a status of `0` when the load process is fully completed. 
The `_dlt_loads` table tracks completed loads and allows chaining transformations on top of them.\n", "\n", - "We can also see load package info with a specific load id:" + "We can also view load package info for a specific `load_id` (replace the value with the one output above):\n" ] }, { @@ -264,12 +263,12 @@ "id": "Lg1lg6FVdKLl" }, "source": [ - "From the [`Inspecting & Adjusting Schema`](https://colab.research.google.com/drive/1LokUcM5YSazdq5jfbkop-Z5rmP-39y4r?usp=sharing) Colab we've also learned that a schema can be accessed with:\n", + "From the [`Inspecting & Adjusting Schema`](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) section, we've also learned that a schema can be accessed with:\n", "\n", "```python\n", "print(load_info.load_packages[0].schema)\n", "```\n", - "Similarly if we drop the schema part, we will just get the load package info:" + "Similarly, if we drop the schema part, we will get the load package info:" ] }, { @@ -351,7 +350,7 @@ "id": "P3_rFHz6elTy" }, "source": [ - "You can access pipeline trace using the command:\n", + "You can access the pipeline trace using the command:\n", "\n", "\n", "```\n", @@ -365,7 +364,7 @@ "id": "E2B3-30Yezbi" }, "source": [ - "Try on the github issues pipeline:" + "Try running it on the github issues pipeline:" ] }, { @@ -458,7 +457,7 @@ "id": "XMsVhKYHff20" }, "source": [ - "In particular how many rows of data were normalized:" + "How many rows of data were normalized:" ] }, { @@ -513,17 +512,19 @@ }, "source": [ "**When to use pipeline state**\n", - "- dlt uses the state internally to implement last value incremental loading. This use case should cover around 90% of your needs to use the pipeline state.\n", + "- `dlt` uses the state internally to implement last value incremental loading. This use case should cover around 90% of your needs to use the pipeline state.\n", "- Store a list of already requested entities if the list is not much bigger than 100k elements.\n", "- Store large dictionaries of last values if you are not able to implement it with the standard incremental construct.\n", "- Store the custom fields dictionaries, dynamic configurations and other source-scoped state.\n", "\n", "**When not to use pipeline state**\n", "\n", - "Do not use dlt state when it may grow to millions of elements. Do you plan to store modification timestamps of all of your millions of user records? This is probably a bad idea! In that case you could:\n", + "Do not use `dlt` state when it may grow to millions of elements. \n", + "For example, storing modification timestamps for millions of user records is a bad idea. \n", + "In that case, you could:\n", "\n", - "- Store the state in dynamo-db, redis etc. taking into the account that if the extract stage fails you'll end with invalid state.\n", - "- Use your loaded data as the state. dlt exposes the current pipeline via dlt.current.pipeline() from which you can obtain sqlclient and load the data of interest. In that case try at least to process your user records in batches." + "- Store the state in DynamoDB, Redis, etc., keeping in mind that if the extract stage fails, you may end up with invalid state.\n", + "- Use your loaded data as the state. `dlt` exposes the current pipeline via `dlt.current.pipeline()`, from which you can obtain a `sql_client` and load the data you need. If you choose this approach, try to process your user records in batches." 
] }, { @@ -634,10 +635,9 @@ "from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n", "from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n", "\n", - "import os\n", "from google.colab import userdata\n", "\n", - "os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n", + "dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n", "\n", "\n", "@dlt.source\n", @@ -696,7 +696,7 @@ "id": "UEBszW96bX1F" }, "source": [ - "In the state you will see the new items:" + "In the state, you will see the new items:" ] }, { @@ -748,10 +748,9 @@ "from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n", "from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n", "\n", - "import os\n", "from google.colab import userdata\n", "\n", - "os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n", + "dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n", "\n", "\n", "@dlt.source\n", @@ -826,11 +825,10 @@ "id": "im-o7K5IkoW5" }, "source": [ - "You can also access the source-scoped state with `dlt.current.source_state()` which can be shared across resources of a particular source and is also available **read-only** in the source-decorated functions. The most common use case for the source-scoped state is to store mapping of custom fields to their displayable names.\n", + "You can also access the source-scoped state with `dlt.current.source_state()` which can be shared across resources of a particular source and is also available **read-only** in the source-decorated functions. The most common use case for the source-scoped state is to store the mapping of custom fields to their displayable names.\n", "\n", - "Let's read some custom keys from the state:\n", + "Let's read some custom keys from the state with:\n", "```python\n", - "# Let's read some custom state information\n", "source_new_keys = dlt.current.source_state().get(\"resources\", {}).get(\"github_pulls\", {}).get(\"new_key\")\n", "```\n", "Full example:" @@ -850,10 +848,9 @@ "from dlt.sources.helpers.rest_client.auth import BearerTokenAuth\n", "from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator\n", "\n", - "import os\n", "from google.colab import userdata\n", "\n", - "os.environ[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n", + "dlt.secrets[\"SOURCES__SECRET_KEY\"] = userdata.get(\"SECRET_KEY\")\n", "\n", "\n", "@dlt.source\n", @@ -915,17 +912,24 @@ "id": "WIhvQCY_lEaB" }, "source": [ - "What if you run your pipeline on, for example, Airflow where every task gets a clean filesystem and pipeline working directory is always deleted?\n", + "What if you run your pipeline on, for example, Airflow, where every task gets a clean filesystem and the pipeline working directory is always deleted?\n", "\n", - "**dlt loads** your **state** into the destination **together** with all other **data** and when faced with a clean start, it will try to restore state from the destination.\n", + "**dlt loads** your **state** into the destination **together** with all other **data**, and when starting from a clean slate, it will try to restore the state from the destination.\n", "\n", - "The remote state is identified by pipeline name, the destination location (as given by the credentials) and destination dataset. 
To re-use **the same state**, use **the same pipeline name** and destination.\n", + "The remote state is identified by the pipeline name, the destination location (as defined by the credentials), and the destination dataset. \n", + "To reuse **the same state**, use **the same pipeline name** and the same destination.\n", "\n", - "The state is stored in the `_dlt_pipeline_state` table at the destination and contains information about the pipeline, pipeline run (that the state belongs to) and state blob.\n", + "The state is stored in the `_dlt_pipeline_state` table at the destination and contains information about the pipeline, the pipeline run (to which the state belongs), and the state blob.\n", "\n", - "dlt has `dlt pipeline sync` command where you can request the state back from that table.\n", + "`dlt` provides the command:\n", "\n", - "💡 If you can keep the pipeline working directory across the runs, you can disable the state sync by setting `restore_from_destination=false` i.e. in your `config.toml`." + "```\n", + "dlt pipeline sync\n", + "```\n", + "\n", + "which retrieves the state from that table.\n", + "\n", + "💡 If you can keep the pipeline working directory across runs, you can disable state sync by setting `restore_from_destination = false` in your `config.toml`." ] }, { @@ -937,11 +941,8 @@ "outputs": [], "source": [ "import duckdb\n", - "from google.colab import data_table\n", "from IPython.display import display\n", "\n", - "data_table.enable_dataframe_formatter()\n", - "\n", "# a database 'chess_pipeline.duckdb' was created in working directory so just connect to it\n", "conn = duckdb.connect(f\"{pipeline.pipeline_name}.duckdb\")\n", "conn.sql(f\"SET search_path = '{pipeline.dataset_name}'\")\n", @@ -955,7 +956,7 @@ "id": "YIy5yLOAlJ9M" }, "source": [ - "Column \"state\" is compressed json dictionary." + "The \"state\" column is a compressed json dictionary." ] }, { @@ -998,14 +999,14 @@ "source": [ "**To fully reset the state:**\n", "\n", - "Drop the destination dataset to fully reset the pipeline.\n", - "Set the `dev_mode` flag when creating pipeline.\n", - "Use the `dlt pipeline drop --drop-all` command to drop state and tables for a given schema name.\n", + "- Drop the destination dataset to fully reset the pipeline. \n", + "- Set the `dev_mode` flag when creating the pipeline. \n", + "- Use the `dlt pipeline drop --drop-all` command to drop state and tables for a given schema name.\n", "\n", "**To partially reset the state:**\n", "\n", - "Use the `dlt pipeline drop ` command to drop state and tables for a given resource.\n", - "Use the `dlt pipeline drop --state-paths` command to reset the state at given path without touching the tables and data." + "- Use the `dlt pipeline drop ` command to drop state and tables for a given resource. \n", + "- Use the `dlt pipeline drop --state-paths` command to reset the state at a given path without touching the tables or data." ] }, { @@ -1014,9 +1015,9 @@ "id": "fUuRzapCl8pC" }, "source": [ - "**Example for partial reset:**\n", + "**Example for a partial reset:**\n", "\n", - "> in an ipynb environment, when the duckdb connection we opened is not yet closed -> close the connection before attempting to edit the pipeline through the CLI" + "> In an ipynb environment, when the duckdb connection we opened is not yet closed -> close the connection before attempting to edit the pipeline through the CLI." ] }, { @@ -1058,7 +1059,7 @@ "id": "NYbccmLie1zm" }, "source": [ - "🎊🎊🎊 That is actually it! 
We hope you enjoyed this course and learned more about dlt! 🎊🎊🎊\n", + "🎊🎊🎊 That's it! We hope you enjoyed this course and learned more about `dlt`! 🎊🎊🎊\n", "\n", "Please share your feedback with us: [Feedback Google Form](https://forms.gle/1NYrGcRj5gLQ4WDt8) 🌼" ] diff --git a/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.py b/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.py new file mode 100644 index 000000000..c99ce4556 --- /dev/null +++ b/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.py @@ -0,0 +1,884 @@ +# /// script +# dependencies = [ +# "dlt[duckdb]", +# "numpy", +# "pandas", +# "sqlalchemy", +# ] +# /// + +import marimo + +__generated_with = "0.17.4" +app = marimo.App() + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # **Recap of [Lesson 7](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) 👩‍💻🚀** + + 1. Learned what a schema is. + 2. Explored schema settings and components. + 3. Learned how to retrieve a dlt pipeline schema. + 4. Learned how to adjust the schema. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + # **Understanding Pipeline Metadata and State** 👻📄 [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) + + + **Here, you will learn or brush up on:** + - What pipeline metadata is + - Exploring pipeline metadata from load info + - Exploring pipeline metadata from trace + - Exploring pipeline metadata from state + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Pipeline Metadata** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Metadata** is essentially *data about data*. + + **Pipeline metadata** is data about your data pipeline. This is useful when you want to know things like: + + - When your pipeline first ran + - When your pipeline last ran + - Information about your source or destination + - Processing time + - Custom metadata you add yourself + - And much more! + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_8_Understanding_Pipeline_Metadata_and_State_img1](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_8_Understanding_Pipeline_Metadata_and_State_img1.jpeg)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + `dlt` allows you to view all this metadata through various options! 
+ + This notebook will walk you through those options, namely: + + - Load info + - Trace + - State + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Let's load some GitHub data into DuckDB to inspect the pipeline metadata in different ways.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""Define a `dlt` resource that fetches Pull Requests and wrap it in a `dlt` source:""" + ) + return + + +@app.cell +def _(os): + from typing import Iterable + import dlt + from dlt.extract import DltResource + from dlt.common.typing import TDataItems + from dlt.sources.helpers import requests + from dlt.sources.helpers.rest_client import RESTClient + from dlt.sources.helpers.rest_client.auth import BearerTokenAuth + from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator + + dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY") + + @dlt.source + def _github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]: + client = RESTClient( + base_url="https://api.github.com", + auth=BearerTokenAuth(token=secret_key), + paginator=HeaderLinkPaginator(), + ) + + @dlt.resource + def github_pulls( + cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental( + "updated_at", initial_value="2024-12-01" + ) + ) -> TDataItems: + params = {"since": cursor_date.last_value, "status": "open"} + for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params): + yield page + + return github_pulls + + pipeline = dlt.pipeline( + pipeline_name="github_pipeline", + destination="duckdb", + dataset_name="github_data", + ) + load_info = pipeline.run(_github_source()) + # define new dlt pipeline + # run the pipeline with the new resource + print(load_info) + return ( + BearerTokenAuth, + DltResource, + HeaderLinkPaginator, + Iterable, + RESTClient, + TDataItems, + dlt, + load_info, + pipeline, + ) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Load info** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + `Load Info:` This is a collection of useful information about the recently loaded data. It includes details like the pipeline and dataset name, destination information, and a list of loaded packages with their statuses, file sizes, types, and error messages (if any). + + `Load Package:` A load package is a collection of jobs with data for specific tables, generated during each execution of the pipeline. Each package is uniquely identified by a `load_id`. 
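+
+    For illustration, a minimal sketch that lists the packages attached to the `load_info` object returned by `pipeline.run()` above, each identified by its `load_id`:
+
+    ```python
+    # every run produces one or more load packages, each with its own load_id
+    for package in load_info.load_packages:
+        print(package.load_id)
+    ```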
+ """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(0) CLI** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + From the [`Inspecting & Adjusting Schema`](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) section, we've already learned that we can see which schema changes a load package introduced with the command: + + ``` + dlt pipeline -v load-package + ``` + + The verbose flag only shows schema changes, so if we run it **without** the flag, we will still see the most recent load package info: + """) + return + + +@app.cell +def _(): + import subprocess + + subprocess.run(["dlt", "pipeline", "github_pipeline", "load-package"], check=True) + return (subprocess,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + The `load_id` of a particular package is added to the top data tables (parent tables) and to the special `_dlt_loads` table with a status of `0` when the load process is fully completed. The `_dlt_loads` table tracks completed loads and allows chaining transformations on top of them. + + We can also view load package info for a specific `load_id` (replace the value with the one output above): + """) + return + + +@app.cell +def _(subprocess): + subprocess.run( + ["dlt", "pipeline", "github_pipeline", "load-package", "1741348101.3398592"], + check=True, + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(0) Python** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + From the [`Inspecting & Adjusting Schema`](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) section, we've also learned that a schema can be accessed with: + + ```python + print(load_info.load_packages[0].schema) + ``` + Similarly, if we drop the schema part, we will get the load package info: + """) + return + + +@app.cell +def _(load_info): + print(load_info.load_packages[0]) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""which has the following public methods and attributes:""") + return + + +@app.cell +def _(load_info): + # This code snippet just prints out the public methoda and attributes of the schema object in load info + all_attributes_methods = dir(load_info.load_packages[0]) + public_attributes_methods = [ + attr for attr in all_attributes_methods if not attr.startswith("_") + ] + + print(f"{'Attribute/Method':<50} {'Type':<10}") + print("-" * 40) + for attr in public_attributes_methods: + attr_value = getattr(load_info.load_packages[0], attr) + if callable(attr_value): + print(f"{attr:<50} {'method':<10}") + else: + print(f"{attr:<50} {'attribute':<10}") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **Trace** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""`Trace`: A trace is a detailed record of the execution of a pipeline. It provides rich information on the pipeline processing steps: **extract**, **normalize**, and **load**. 
It also shows the last `load_info`.""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(0) CLI** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You can access the pipeline trace using the command: + + + ``` + dlt pipeline trace + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Try running it on the github issues pipeline:""") + return + + +@app.cell +def _(subprocess): + subprocess.run(["dlt", "pipeline", "github_pipeline", "trace"], check=True) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(0) Python** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""We can also print out the trace in code:""") + return + + +@app.cell +def _(pipeline): + # print human friendly trace information + print(pipeline.last_trace) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Separately receive the extract stage info:""") + return + + +@app.cell +def _(pipeline): + # print human friendly trace information + print(pipeline.last_trace.last_extract_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""As well as the normalization stage info with:""") + return + + +@app.cell +def _(pipeline): + # print human friendly normalization information + print(pipeline.last_trace.last_normalize_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""How many rows of data were normalized:""") + return + + +@app.cell +def _(pipeline): + # access row counts dictionary of normalize info + print(pipeline.last_trace.last_normalize_info.row_counts) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""And finally the load stage info:""") + return + + +@app.cell +def _(pipeline): + # print human friendly load information + print(pipeline.last_trace.last_load_info) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ## **State** + + [`The pipeline state`](https://dlthub.com/docs/general-usage/state) is a Python dictionary that lives alongside your data. You can store values in it during a pipeline run, and then retrieve them in the next pipeline run. It's used for tasks like preserving the "last value" or similar loading checkpoints, and it gets committed atomically with the data. The state is stored locally in the pipeline working directory and is also stored at the destination for future runs. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **When to use pipeline state** + - `dlt` uses the state internally to implement last value incremental loading. This use case should cover around 90% of your needs to use the pipeline state. + - Store a list of already requested entities if the list is not much bigger than 100k elements. + - Store large dictionaries of last values if you are not able to implement it with the standard incremental construct. + - Store the custom fields dictionaries, dynamic configurations and other source-scoped state. + + **When not to use pipeline state** + + Do not use `dlt` state when it may grow to millions of elements. + For example, storing modification timestamps for millions of user records is a bad idea. + In that case, you could: + + - Store the state in DynamoDB, Redis, etc., keeping in mind that if the extract stage fails, you may end up with invalid state. + - Use your loaded data as the state. `dlt` exposes the current pipeline via `dlt.current.pipeline()`, from which you can obtain a `sql_client` and load the data you need. 
If you choose this approach, try to process your user records in batches. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(0) CLI** + """) + return + + +@app.cell +def _(subprocess): + subprocess.run(["dlt", "pipeline", "-v", "github_pipeline", "info"], check=True) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **(1) Python** + """) + return + + +@app.cell +def _(): + import json + + def read_state(filepath: str) -> str: + with open(filepath, "r", encoding="utf-8") as file: + data = json.load(file) + pretty_json = json.dumps(data, indent=4) + return pretty_json + return (read_state,) + + +@app.cell +def _(read_state): + # stored in your default pipelines folder + print(read_state("/var/dlt/pipelines/github_pipeline/state.json")) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **Modify State** + + The pipeline state is a Python dictionary that lives alongside your data; you can store values in it and, on the next pipeline run, request them back. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + #### **(0) Resource state** + + You can **read** and **write** the state in your resources using: + + ```python + dlt.current.resource_state().get() + ``` + and + + ```python + dlt.current.resource_state().setdefault(key, value) + ``` + """) + return + + +@app.cell +def _( + BearerTokenAuth, + DltResource, + HeaderLinkPaginator, + Iterable, + RESTClient, + TDataItems, + dlt, + os, +): + dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY") + + @dlt.source + def _github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]: + client = RESTClient( + base_url="https://api.github.com", + auth=BearerTokenAuth(token=secret_key), + paginator=HeaderLinkPaginator(), + ) + + @dlt.resource + def github_pulls( + cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental( + "updated_at", initial_value="2024-12-01" + ) + ) -> TDataItems: + dlt.current.resource_state().setdefault( + "new_key", ["first_value", "second_value"] + ) + params = {"since": cursor_date.last_value, "status": "open"} + for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params): + yield page + + return github_pulls + + pipeline_1 = dlt.pipeline( + pipeline_name="github_pipeline", + destination="duckdb", + dataset_name="github_data", + ) + load_info_1 = pipeline_1.run(_github_source()) + print(load_info_1) + return + + +@app.cell +def _(read_state): + print(read_state("/var/dlt/pipelines/github_pipeline/state.json")) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""In the state, you will see the new items:""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md( + r"""![Lesson_8_Understanding_Pipeline_Metadata_and_State_img2](https://storage.googleapis.com/dlt-blog-images/dlt-fundamentals-course/Lesson_8_Understanding_Pipeline_Metadata_and_State_img2.png)""" + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You can modify any item in the state dict: + + ```python + new_keys = dlt.current.resource_state().setdefault("new_key", ["first_value", "second_value"]) + + if "something_happend": + new_keys.append("third_value") + + incremental_dict = dlt.current.resource_state().get("incremental") + incremental_dict.update({"second_new_key": "fourth_value"}) + ``` + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""Full example:""") + return + + +@app.cell +def _( + BearerTokenAuth, + DltResource, + HeaderLinkPaginator, + 
Iterable, + RESTClient, + TDataItems, + dlt, + os, +): + dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY") + + @dlt.source + def _github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]: + client = RESTClient( + base_url="https://api.github.com", + auth=BearerTokenAuth(token=secret_key), + paginator=HeaderLinkPaginator(), + ) + + @dlt.resource + def github_pulls( + cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental( + "updated_at", initial_value="2024-12-01" + ) + ) -> TDataItems: + new_keys = dlt.current.resource_state().setdefault( + "new_key", ["first_value", "second_value"] + ) + if "something_happened": + new_keys.append("third_value") + incremental_dict = dlt.current.resource_state().get("incremental") + incremental_dict.update({"second_new_key": "fourth_value"}) + params = {"since": cursor_date.last_value, "status": "open"} + for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params): + yield page + + return github_pulls + + pipeline_2 = dlt.pipeline( + pipeline_name="github_pipeline", + destination="duckdb", + dataset_name="github_data", + ) + load_info_2 = pipeline_2.run(_github_source()) + print(load_info_2) + return + + +@app.cell +def _(read_state): + print(read_state("/var/dlt/pipelines/github_pipeline/state.json")) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + #### **(1) Source state** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + You can also access the source-scoped state with `dlt.current.source_state()` which can be shared across resources of a particular source and is also available **read-only** in the source-decorated functions. The most common use case for the source-scoped state is to store the mapping of custom fields to their displayable names. + + Let's read some custom keys from the state with: + ```python + source_new_keys = dlt.current.source_state().get("resources", {}).get("github_pulls", {}).get("new_key") + ``` + Full example: + """) + return + + +@app.cell +def _( + BearerTokenAuth, + DltResource, + HeaderLinkPaginator, + Iterable, + RESTClient, + TDataItems, + dlt, + os, +): + dlt.secrets["SOURCES__SECRET_KEY"] = os.getenv("SECRET_KEY") + + @dlt.source + def _github_source(secret_key: str = dlt.secrets.value) -> Iterable[DltResource]: + client = RESTClient( + base_url="https://api.github.com", + auth=BearerTokenAuth(token=secret_key), + paginator=HeaderLinkPaginator(), + ) + + @dlt.resource + def github_pulls( + cursor_date: dlt.sources.incremental[str] = dlt.sources.incremental( + "updated_at", initial_value="2024-12-01" + ) + ) -> TDataItems: + params = {"since": cursor_date.last_value, "status": "open"} + for page in client.paginate("repos/dlt-hub/dlt/pulls", params=params): + yield page + source_new_keys = ( + dlt.current.source_state() + .get("resources", {}) + .get("github_pulls", {}) + .get("new_key") + ) + print("My custom values: ", source_new_keys) + + return github_pulls + + pipeline_3 = dlt.pipeline( + pipeline_name="github_pipeline", + destination="duckdb", + dataset_name="github_data", + ) + load_info_3 = pipeline_3.run(_github_source()) + print(load_info_3) + return (pipeline_3,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **Sync State** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + What if you run your pipeline on, for example, Airflow, where every task gets a clean filesystem and the pipeline working directory is always deleted? 
+ + **dlt loads** your **state** into the destination **together** with all other **data**, and when starting from a clean slate, it will try to restore the state from the destination. + + The remote state is identified by the pipeline name, the destination location (as defined by the credentials), and the destination dataset. + To reuse **the same state**, use **the same pipeline name** and the same destination. + + The state is stored in the `_dlt_pipeline_state` table at the destination and contains information about the pipeline, the pipeline run (to which the state belongs), and the state blob. + + `dlt` provides the command: + + ``` + dlt pipeline sync + ``` + + which retrieves the state from that table. + + 💡 If you can keep the pipeline working directory across runs, you can disable state sync by setting `restore_from_destination = false` in your `config.toml`. + """) + return + + +@app.cell +def _(pipeline_3): + import duckdb + from IPython.display import display + + conn = duckdb.connect(f"{pipeline_3.pipeline_name}.duckdb") + # a database 'chess_pipeline.duckdb' was created in working directory so just connect to it + conn.sql(f"SET search_path = '{pipeline_3.dataset_name}'") + stats_table = conn.sql("SELECT * FROM _dlt_pipeline_state").df() + display(stats_table) + return (conn,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r"""The "state" column is a compressed json dictionary.""") + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + |index|version|engine\_version|pipeline\_name|state|created\_at|version\_hash|\_dlt\_load\_id|\_dlt\_id| + |---|---|---|---|---|---|---|---|---| + |0|1|4|github\_pipeline|eNplkN....6+/m/QA7mbNc|2025-03-10 14:02:34\.340458+00:00|pnp+9AIA5jAGx5LKon6zWmPnfYVb10ROa5aIKjv9O0I=|1741615353\.5473728|FOzn5XuSZ/y/BQ| + """) + return + + +@app.cell +def _(subprocess): + subprocess.run( + ["dlt", "--non-interactive", "pipeline", "github_pipeline", "sync"], check=True + ) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + ### **Reset State** + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **To fully reset the state:** + + - Drop the destination dataset to fully reset the pipeline. + - Set the `dev_mode` flag when creating the pipeline. + - Use the `dlt pipeline drop --drop-all` command to drop state and tables for a given schema name. + + **To partially reset the state:** + + - Use the `dlt pipeline drop ` command to drop state and tables for a given resource. + - Use the `dlt pipeline drop --state-paths` command to reset the state at a given path without touching the tables or data. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + **Example for a partial reset:** + + > In an ipynb environment, when the duckdb connection we opened is not yet closed -> close the connection before attempting to edit the pipeline through the CLI. + """) + return + + +@app.cell +def _(conn): + conn.close() + return + + +@app.cell +def _(subprocess): + subprocess.run( + ["dlt", "pipeline", "github_pipeline", "drop", "github_pulls"], + input="y\n", + text=True, + check=True, + ) + return + + +@app.cell +def _(subprocess): + subprocess.run(["dlt", "pipeline", "-v", "github_pipeline", "info"], check=True) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + 🎊🎊🎊 That's it! We hope you enjoyed this course and learned more about `dlt`! 
🎊🎊🎊 + + Please share your feedback with us: [Feedback Google Form](https://forms.gle/1NYrGcRj5gLQ4WDt8) 🌼 + """) + return + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +if __name__ == "__main__": + app.run() diff --git a/docs/pyproject.toml b/docs/pyproject.toml index 0e115d151..493375ff0 100644 --- a/docs/pyproject.toml +++ b/docs/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "regex>=2025.10.23", "pytest-forked>=1.6.0", "databind>=4.5.2", + "marimo>=0.17.4", ] diff --git a/docs/uv.lock b/docs/uv.lock index f5bde2fae..a983689ea 100644 --- a/docs/uv.lock +++ b/docs/uv.lock @@ -1121,6 +1121,7 @@ dependencies = [ { name = "google-api-python-client" }, { name = "google-auth-oauthlib" }, { name = "lancedb" }, + { name = "marimo" }, { name = "modal" }, { name = "mypy" }, { name = "nbqa" }, @@ -1160,6 +1161,7 @@ requires-dist = [ { name = "google-api-python-client", specifier = ">=1.7.11" }, { name = "google-auth-oauthlib", specifier = ">=1.0.0,<2" }, { name = "lancedb", marker = "python_full_version < '3.13'", specifier = ">=0.8.2" }, + { name = "marimo", specifier = ">=0.17.4" }, { name = "modal", specifier = ">=0.64.170" }, { name = "modal", specifier = ">=1.2.1" }, { name = "mypy", specifier = ">=1.11.0,<1.13.0" }, diff --git a/docs/website/docs/tutorial/advanced-course.md b/docs/website/docs/tutorial/advanced-course.md index 54dfb8d1b..67514ebb9 100644 --- a/docs/website/docs/tutorial/advanced-course.md +++ b/docs/website/docs/tutorial/advanced-course.md @@ -10,34 +10,34 @@ In this course, you'll go far beyond the basics. You’ll build production-grade ## Lessons -### **Lesson 1: Custom Sources – REST APIs & RESTClient** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) +### **Lesson 1: Custom Sources – REST APIs & RESTClient** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_1_custom_sources_restapi_source_and_restclient.ipynb) Learn how to build flexible REST API connectors from scratch using `@dlt.resource` and the powerful `RESTClient`. 
-### **Lesson 2: Custom Sources – SQL Databases** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) +### **Lesson 2: Custom Sources – SQL Databases** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_2_custom_sources_sql_databases_.ipynb) Connect to any SQL-compatible database, reflect table schemas, write query adapters, and selectively ingest data using `sql_database`. -### **Lesson 3: Custom Sources – Filesystems & Cloud Storage** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) +### **Lesson 3: Custom Sources – Filesystems & Cloud Storage** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_3_custom_sources_filesystem_and_cloud_storage.ipynb) Build sources that read from local or remote files (S3, GCS, Azure). 
-### **Lesson 4: Custom Destinations – Reverse ETL** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) +### **Lesson 4: Custom Destinations – Reverse ETL** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_4_destinations_reverse_etl.ipynb) Use `@dlt.destination` to send data back to APIs like Notion, Slack, or Airtable. Learn batching, retries, and idempotent patterns. -### **Lesson 5: Transforming Data Before & After Load** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) +### **Lesson 5: Transforming Data Before & After Load** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_5_transform_data_before_and_after_loading.ipynb) Learn when and how to apply `add_map`, `add_filter`, `@dlt.transformer`, or even post-load transformations via SQL or Ibis. Control exactly how your data looks. 
-### **Lesson 6: Write Disposition Strategies & Advanced Tricks** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) +### **Lesson 6: Write Disposition Strategies & Advanced Tricks** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_6_write_disposition_strategies_and_advanced_tricks.ipynb) Understand how to use `replace` and `merge`, and combine them with schema hints and incremental loading. -### **Lesson 7: Data Contracts** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) +### **Lesson 7: Data Contracts** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_7_data_contracts.ipynb) Define expectations on schema, enforce data types and behaviors, and lock down your schema evolution. Ensure reliable downstream use of your data. 
-### **Lesson 8: Logging & Tracing** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) +### **Lesson 8: Logging & Tracing** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_8_logging_and_tracing.ipynb) Track every step of your pipeline: from extraction to load. Use logs, traces, and metadata to debug and analyze performance. -### **Lesson 9: Performance Optimization** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) +### **Lesson 9: Performance Optimization** [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-advanced-course/lesson_9_performance_optimisation.ipynb) Handle large datasets, tune buffer sizes, parallelize resource extraction, optimize memory usage, and reduce pipeline runtime. 
## Homework & Certification diff --git a/docs/website/docs/tutorial/fundamentals-course.md b/docs/website/docs/tutorial/fundamentals-course.md index ec10eae16..b429d0187 100644 --- a/docs/website/docs/tutorial/fundamentals-course.md +++ b/docs/website/docs/tutorial/fundamentals-course.md @@ -10,42 +10,41 @@ In this course you will learn the fundamentals of `dlt` alongside some of the mo ## Lessons -### Lesson 1: Quick Start [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) +### Lesson 1: Quick Start [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_1_quick_start.ipynb) Discover what dlt is, run your first pipeline with toy data, and explore it like a pro using DuckDB, `sql_client`, and dlt datasets! -### Lesson 2: dlt Resources and Sources [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) +### Lesson 2: dlt Resources and Sources [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline.ipynb) Learn to run pipelines with diverse data sources (dataframes, databases, and REST APIs), master `dlt.resource`, `dlt.source`, and `dlt.transformer`, and create your first REST API pipeline! 
-### Lesson 3: Pagination & Authentication & dlt Configuration [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) +### Lesson 3: Pagination & Authentication & dlt Configuration [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_3_pagination_and_authentication_and_dlt_configuration.ipynb) + Since it is never a good idea to publicly put your API keys into your code, different environments have different methods to set and access these secret keys. `dlt` is no different. Master pagination and authentication for REST APIs, explore dlt's RESTClient and manage secrets and configs. -### Lesson 4: Using dlt's pre-built Sources and Destinations [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) +### Lesson 4: Using dlt's pre-built Sources and Destinations [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_4_using_pre_build_sources_and_destinations.ipynb) Now that you took a data source and loaded it into a `duckdb` destination, it is time to look into what other possibilities `dlt` offers. In this notebook we will take a look at pre-built verified sources and destinations and how to use them. 
-### Lesson 5: Write disposition and incremental loading [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) - +### Lesson 5: Write disposition and incremental loading [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_5_write_disposition_and_incremental_loading.ipynb) Learn to control data behavior with dlt write dispositions (Append, Replace, Merge), master incremental loading, and efficiently update and deduplicate your datasets. -### Lesson 6: How dlt works [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) - +### Lesson 6: How dlt works [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_6_how_dlt_works.ipynb) Discover the magic behind `dlt`! Learn its three main steps — Extract, Normalize, Load — along with default behaviors and supported file formats. 
-### Lesson 7: Inspecting & Adjusting Schema [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) +### Lesson 7: Inspecting & Adjusting Schema [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_7_inspecting_and_adjusting_schema.ipynb) dlt creates and manages the schema automatically, but what if you want to control it yourself? Explore the schema and customize it to your needs easily with dlt! -### Lesson 8: Understanding Pipeline State & Metadata [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) +### Lesson 8: Understanding Pipeline State & Metadata [![Open in molab](https://marimo.io/molab-shield.svg)](https://molab.marimo.io/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/dlt-hub/dlt/blob/master/docs/education/dlt-fundamentals-course/lesson_8_understanding_pipeline_metadata_and_state.ipynb) After having learnt about pipelines and how to move data from one place to another. We now learn about information about the pipeline itself. Or, metadata of a pipeline that can be accessed and edited through dlt. 
diff --git a/pyproject.toml b/pyproject.toml index 7bad61e46..639e6ed95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -249,6 +249,8 @@ dev = [ "pydoclint>=0.6.5,<0.7", "types-paramiko>=3.5.0.20250708", "graphviz>=0.21", + # limits sqlglot - remove when #3489 is fixed + "sqlglot<28.1", ] # NOTE: those dependencies are used to test built in sources diff --git a/uv.lock b/uv.lock index 0e6aa28c5..757bc3655 100644 --- a/uv.lock +++ b/uv.lock @@ -2282,6 +2282,7 @@ dev = [ { name = "requests-mock" }, { name = "ruff" }, { name = "sqlfluff" }, + { name = "sqlglot" }, { name = "types-cachetools" }, { name = "types-click" }, { name = "types-deprecated" }, @@ -2487,6 +2488,7 @@ dev = [ { name = "requests-mock", specifier = ">=1.10.0,<2" }, { name = "ruff", specifier = ">=0.3.2,<0.4" }, { name = "sqlfluff", specifier = ">=2.3.2,<3" }, + { name = "sqlglot", specifier = "<28.1" }, { name = "types-cachetools", specifier = ">=4.2.9" }, { name = "types-click", specifier = ">=7.1.8,<8" }, { name = "types-deprecated", specifier = ">=1.2.9.2,<2" },