refresh docs intro (#3270)

* renames pipeline to workspace dashboard

* refreshes intro

* review changes

* sidebar, references, dataset.table( cleanup
rudolfix
2025-10-31 17:14:49 +01:00
committed by GitHub
parent 192296f4f8
commit 4a431d60ed
26 changed files with 156 additions and 134 deletions

View File

@@ -89,7 +89,7 @@ jobs:
python-version: ${{ matrix.python-version }}
activate-environment: true
# test pipeline dashboard app, does not work with python 3.13
# test workspace dashboard app, does not work with python 3.13
- name: Install dlt with duckdb and dashboard
# note: this also tests the workspace extra installation
run: uv sync ${{ matrix.uv_sync_args }} --extra duckdb --extra workspace --group sentry-sdk --group pipeline --group sources --group dashboard-tests
@@ -99,12 +99,12 @@ jobs:
run: playwright install && playwright install-deps
if: matrix.python-version != '3.14.0-beta.4'
# Run pipeline dashboard unit tests
- name: Run pipeline dashboard unit tests
# Run workspace dashboard unit tests
- name: Run workspace dashboard unit tests
run: |
pytest tests/workspace/helpers/dashboard
# Run pipeline dashboard e2e tests (does not pass with python 3.9
# Run workspace dashboard e2e tests (does not pass with python 3.9
- name: Run dashboard e2e
run: |
marimo run --headless dlt/_workspace/helpers/dashboard/dlt_dashboard.py -- -- --pipelines-dir _storage/.dlt/pipelines/ --with_test_identifiers true & pytest --browser chromium tests/e2e

View File

@@ -204,9 +204,9 @@ You can see the GitHub actions setup for remote destinations in `.github/workflo
### E2E Tests
`dlt` ships with the Pipeline Dashboard (https://dlthub.com/docs/general-usage/dashboard). To ensure that the dashboard works correctly in the Browser on all Platforms, we have e2e tests with Playwright as part of our test suite. To run the e2e tests locally, please:
`dlt` ships with the Workspace Dashboard (https://dlthub.com/docs/general-usage/dashboard). To ensure that the dashboard works correctly in the browser on all platforms, we have e2e tests with Playwright as part of our test suite. To run the e2e tests locally, please:
1. Install all dependenices with `make dev`
1. Install all dependencies with `make dev`
2. Install the dashboard testing dependencies with `uv sync --group dashboard-tests`
3. Install playwright dependencies with `playwright install`
4. Start the dashboard in silent mode from one terminal window: `make start-dlt-dashboard-e2e`

View File

@@ -1,5 +1,5 @@
<h1 align="center">
<strong>data load tool (dlt) — the open-source Python library for data loading</strong>
<strong>data load tool (dlt) — the open-source Python library that automates all your tedious data loading tasks</strong>
</h1>
<p align="center">
Be it a Google Colab notebook, AWS Lambda function, an Airflow DAG, your local laptop,<br/>or a GPT-4 assisted development playground—<strong>dlt</strong> can be dropped in anywhere.
@@ -37,9 +37,6 @@ dlt supports Python 3.9 through Python 3.14. Note that some optional extras are
pip install dlt
```
More options: [Install via Conda or Pixi](https://dlthub.com/docs/reference/installation#31-install-dlt-via-pixi-or-conda)
## Quick Start
Load chess game data from chess.com API and save it in DuckDB:
@@ -72,20 +69,16 @@ Try it out in our **[Colab Demo](https://colab.research.google.com/drive/1NfSB1D
## Features
- **Automatic Schema:** Data structure inspection and schema creation for the destination.
- **Data Normalization:** Consistent and verified data before loading.
- **Seamless Integration:** Colab, AWS Lambda, Airflow, and local environments.
- **Scalable:** Adapts to growing data needs in production.
- **Easy Maintenance:** Clear data pipeline structure for updates.
- **Rapid Exploration:** Quickly explore and gain insights from new data sources.
- **Versatile Usage:** Suitable for ad-hoc exploration to advanced loading infrastructures.
- **Start in Seconds with CLI:** Powerful CLI for managing, deploying and inspecting local pipelines.
- **Incremental Loading:** Load only new or changed data and avoid loading old records again.
- **Open Source:** Free and Apache 2.0 Licensed.
dlt is an open-source Python library that loads data from various, often messy data sources into well-structured datasets. It provides lightweight Python interfaces to extract, load, inspect, and transform data. dlt and dlt docs are built from the ground up to be used with LLMs: the [LLM-native workflow](https://dlthub.com/docs/dlt-ecosystem/llm-tooling/llm-native-workflow.md) will take you from pipeline code to data in a notebook for over [5,000 sources](https://dlthub.com/workspace).
## Ready to use Sources and Destinations
dlt is designed to be easy to use, flexible, and scalable:
Explore ready to use sources (e.g. Google Sheets) in the [Verified Sources docs](https://dlthub.com/docs/dlt-ecosystem/verified-sources) and supported destinations (e.g. DuckDB) in the [Destinations docs](https://dlthub.com/docs/dlt-ecosystem/destinations).
- dlt extracts data from [REST APIs](https://dlthub.com/docs/tutorial/rest-api), [SQL databases](https://dlthub.com/docs/tutorial/sql-database), [cloud storage](https://dlthub.com/docs/tutorial/filesystem), [Python data structures](https://dlthub.com/docs/tutorial/load-data-from-an-api), and [many more](https://dlthub.com/docs/dlt-ecosystem/verified-sources).
- dlt infers [schemas](https://dlthub.com/docs/general-usage/schema) and [data types](https://dlthub.com/docs/general-usage/schema/#data-types), [normalizes the data](https://dlthub.com/docs/general-usage/schema/#data-normalizer), and handles nested data structures.
- dlt supports a variety of [popular destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/) and has an interface to add [custom destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/destination) to create reverse ETL pipelines.
- dlt automates pipeline maintenance with [incremental loading](https://dlthub.com/docs/general-usage/incremental-loading), [schema evolution](https://dlthub.com/docs/general-usage/schema-evolution), and [schema and data contracts](https://dlthub.com/docs/general-usage/schema-contracts).
- dlt supports [Python and SQL data access](https://dlthub.com/docs/general-usage/dataset-access/), [transformations](https://dlthub.com/docs/dlt-ecosystem/transformations), [pipeline inspection](https://dlthub.com/docs/general-usage/dashboard.md), and [visualizing data in Marimo Notebooks](https://dlthub.com/docs/general-usage/dataset-access/marimo).
- dlt can be deployed anywhere Python runs, be it on [Airflow](https://dlthub.com/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer), [serverless functions](https://dlthub.com/docs/walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions), or any other cloud deployment of your choice.
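To make these bullets concrete, here is a minimal sketch of the loop they describe: load some Python data into DuckDB, then read it back as a DataFrame. It assumes the `duckdb` extra and `pandas` are installed; the pipeline, dataset, and table names are illustrative.
```py
import dlt

# create a pipeline that loads into a local DuckDB file
pipeline = dlt.pipeline(
    pipeline_name="quick_sketch",
    destination="duckdb",
    dataset_name="quick_sketch_data",
)

# run it on plain Python data; dlt infers the schema and normalizes the rows
load_info = pipeline.run(
    [{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}],
    table_name="users",
)
print(load_info)

# read the loaded table back as a Pandas DataFrame
print(pipeline.dataset().table("users").df())
```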
## Documentation

View File

@@ -189,14 +189,15 @@ pipeline state set by the resources during the extraction process.
show_cmd = pipeline_subparsers.add_parser(
"show",
help=(
"Generates and launches Streamlit app with the loading status and dataset explorer"
"Generates and launches workspace dashboard with the loading status and dataset"
" explorer"
),
description="""
Launches the pipeline dashboard app with a comprehensive interface to inspect the pipeline state, schemas, and data in the destination.
Launches the workspace dashboard with a comprehensive interface to inspect the pipeline state, schemas, and data in the destination.
This app should be executed from the same folder from which you ran the pipeline script to be able access destination credentials.
This dashboard should be executed from the same folder from which you ran the pipeline script to be able to access destination credentials.
If the --edit flag is used, will launch the editable version of the app if it exists in the current directory, or create this version and launch it in edit mode.
If the --edit flag is used, the command will launch the editable version of the dashboard if it exists in the current directory, or create this version and launch it in edit mode.
Requires `marimo` to be installed in the current environment: `pip install marimo`. Use the --streamlit flag to launch the legacy streamlit app.
""",
@@ -205,16 +206,16 @@ Requires `marimo` to be installed in the current environment: `pip install marim
"--streamlit",
default=False,
action="store_true",
help="Launch the legacy Streamlit dashboard instead of the new pipeline dashboard. ",
help="Launch the legacy Streamlit dashboard instead of the new workspace dashboard. ",
)
show_cmd.add_argument(
"--edit",
default=False,
action="store_true",
help=(
"Creates editable version of pipeline dashboard in current directory if it does not"
" exist there yet and launches it in edit mode. Will have no effect when using the"
" streamlit flag."
"Creates editable version of workspace dashboard in current directory if it does"
" not exist there yet and launches it in edit mode. Will have no effect when using"
" the streamlit flag."
),
)
pipeline_subparsers.add_parser(
@@ -498,9 +499,9 @@ The `dlt schema` command will load, validate and print out a dlt schema: `dlt sc
class DashboardCommand(SupportsCliCommand):
command = "dashboard"
help_string = "Starts the dlt pipeline dashboard"
help_string = "Starts the dlt workspace dashboard"
description = """
The `dlt dashboard` command starts the dlt pipeline dashboard. You can use the dashboard:
The `dlt dashboard` command starts the dlt workspace dashboard. You can use the dashboard:
* to list and inspect local pipelines
* browse the full pipeline schema and all hints

View File

@@ -5,7 +5,7 @@ import marimo
__generated_with = "0.13.9"
app = marimo.App(
width="medium", app_title="dlt pipeline dashboard", css_file="dlt_dashboard_styles.css"
width="medium", app_title="dlt workspace dashboard", css_file="dlt_dashboard_styles.css"
)
with app.setup:

View File

@@ -20,7 +20,7 @@ _credentials_info = (
# App general
#
app_title = """
# Welcome to the dltHub pipeline dashboard...
# Welcome to the dltHub workspace dashboard...
"""
app_intro = """
<p align="center">...the hackable data platform for `dlt` developers.</p>
@@ -56,30 +56,30 @@ home_quick_start_title = """
"""
home_basics_text = f"""
## dltHub pipeline dashboard basics
## dltHub workspace dashboard basics
We found `{{}}` pipelines in the local directory `{{}}`. When you select a pipeline to inspect, you can:
* See an overview of your pipeline
* See the current pipeline schema and incremental state
* Browse the data in the pipeline's dataset (requires credentials to be available to the dltHub pipeline dashboard)
* Browse the data in the pipeline's dataset (requires credentials to be available to the dltHub workspace dashboard)
* View the pipeline state locally and on the destination
* Browse information about past loads and traces
To inspect data in the destination dataset, ensure your destination credentials are available to the dltHub pipeline dashboard. Either provide them as environment variables, or start the dltHub pipeline dashboard from the directory that contains your `.dlt` folder, where the credentials are stored.
To inspect data in the destination dataset, ensure your destination credentials are available to the dltHub workspace dashboard. Either provide them as environment variables, or start the dltHub workspace dashboard from the directory that contains your `.dlt` folder, where the credentials are stored.
If the dltHub pipeline dashboard cannot connect to the destination, you will receive a warning and will only be able to browse the locally stored information about the pipeline.
If the dltHub workspace dashboard cannot connect to the destination, you will receive a warning and will only be able to browse the locally stored information about the pipeline.
## dltHub pipeline dashboard CLI commands
## dltHub workspace dashboard CLI commands
* `dlt pipeline <pipeline_name> show` - Start the pipeline dashboard for the selected pipeline
* `dlt pipeline <pipeline_name> show --edit` - Start a local copy of the pipeline dashboard for the selected pipeline in edit mode
* `dlt pipeline <pipeline_name> show` - Start the workspace dashboard for the selected pipeline
* `dlt pipeline <pipeline_name> show --edit` - Start a local copy of the workspace dashboard for the selected pipeline in edit mode
## Learn more
* [dlt dashboard docs]({_help_url}) - Dashboard docs
* [dlt pipeline sync]({_sync_help_url}) command - Learn how to restore a pipeline locally to be able to see it in the dashboard
* [Marimo docs](https://docs.marimo.io/) - Learn more about Marimo, the framework that powers the dltHub pipeline dashboard
* [Marimo docs](https://docs.marimo.io/) - Learn more about Marimo, the framework that powers the dltHub workspace dashboard
<small>
2025 [dltHub](https://dlthub.com)

View File

@@ -1,10 +1,10 @@
---
title: LLM-native workflow
title: Build pipelines and reports with LLMs
description: How to extract and explore data from REST API with AI editors/agents
keywords: [cursor, llm, restapi, ai]
---
# LLM-native workflow
# Build dlt pipelines and reports with LLMs
## Overview
@@ -13,7 +13,7 @@ This guide walks you through a collaborative AI-human workflow for extracting an
You will learn:
1. How to initialize a dltHub workspace for your source using dltHub's [LLM-context database](https://dlthub.com/workspace).
2. How to build a REST API source in minutes with AI assistance.
3. How to debug a pipeline and explore data using the pipeline dashboard.
3. How to debug a pipeline and explore data using the workspace dashboard.
4. How to start a new notebook and work with the pipeline's dataset in it.
## Prerequisites
@@ -76,7 +76,7 @@ pip install "dlt[workspace]"
### Initialize workspace
dltHub provides prepared contexts for 1000+ sources, available at [https://dlthub.com/workspace](https://dlthub.com/workspace). To get started, search for your API and follow the tailored instructions.
We provide LLM context for over 5,000 sources, available at [https://dlthub.com/workspace](https://dlthub.com/workspace). To get started, search for your API and follow the tailored instructions.
<div style={{textAlign: 'center'}}>
![search for your source](https://storage.googleapis.com/dlt-blog-images/llm_workflows_search.png)
@@ -154,7 +154,7 @@ Load package 1749667187.541553 is LOADED and contains no failed jobs
If the pipeline fails, pass error messages to the LLM. Restart after 4-8 failed attempts.
:::
### Validate with pipeline dashboard
### Validate with workspace dashboard
Launch the dashboard to validate your pipeline:
@@ -188,7 +188,7 @@ import dlt
my_data = dlt.pipeline("{source}_pipeline").dataset()
# get any table as Pandas frame
# my_data.{table_name}.df().head()
my_data.table("table_name").df().head()
```
For more, see the [dataset access guide](../../general-usage/dataset-access).

View File

@@ -25,15 +25,20 @@ pipeline = dlt.pipeline(
dev_mode=True
)
# get a dataframe of all reactions from the dataset
reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").df()
# get a data frame of all reactions from the dataset
github_issues = pipeline.dataset().table("issues")
reactions = github_issues.select(
"reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket"
).df()
# calculate and print out the sum of all reactions
counts = reactions.sum(0).sort_values(0, ascending=False)
print(counts)
# alternatively, you can fetch the data as an arrow table
reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").arrow()
reactions = github_issues.select(
"reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket"
).arrow()
# ... do transformations on the arrow table
```
@@ -55,7 +60,7 @@ pipeline = dlt.pipeline(
)
# get user relation with only a few columns selected, but omitting email and name
users = pipeline.dataset().users.select("age", "amount_spent", "country")
users = pipeline.dataset().table("users").select("age", "amount_spent", "country")
# load the data into a new table called users_clean in the same dataset
pipeline.run(users.iter_arrow(chunk_size=1000), table_name="users_clean")
@@ -79,7 +84,7 @@ pipeline = dlt.pipeline(
# NOTE: For selecting only users above 18, we could also use the filter method on the relation with ibis expressions
@dlt.resource(table_name="users_clean")
def users_clean():
users = pipeline.dataset().users
users = pipeline.dataset().table("users")
for arrow_table in users.iter_arrow(chunk_size=1000):
# we want to filter out users under 18

View File

@@ -60,7 +60,7 @@ We will create a simple example pipeline from a [PokeAPI spec](https://pokeapi.c
dlt pipeline pokemon_pipeline info
```
8. You can now also install marimo to see a preview of the data in the pipeline dashboard; you should have loaded 40 Pokemons and their details.
8. You can now also install marimo to see a preview of the data in the workspace dashboard; you should have loaded 40 Pokémon and their details.
```sh
pip install pandas marimo
dlt pipeline pokemon_pipeline show

View File

@@ -1,15 +1,15 @@
---
title: Inspect your pipeline with the pipeline dashboard
title: Inspect your pipeline with the workspace dashboard
description: Open a comprehensive dashboard with information about your pipeline
keywords: [pipeline, schema, data, inspect]
---
# Inspect your pipeline with the pipeline dashboard
# Inspect your pipeline with the workspace dashboard
Once you have run a pipeline locally, you can launch a web app that displays detailed information about your pipeline. This app is built with the Marimo Python notebook framework. For this to work, you will need to have the `marimo` package installed.
:::tip
The pipeline dashboard app works with all destinations that are supported by our dataset. Vector databases are generally unsupported at this point; however, you can still inspect metadata such as run traces, schemas, and pipeline state.
The workspace dashboard app works with all destinations that are supported by our dataset. Vector databases are generally unsupported at this point; however, you can still inspect metadata such as run traces, schemas, and pipeline state.
:::
## Features
@@ -113,9 +113,9 @@ This provides an overview and detailed information about loads found in the _dlt
![Pipeline loads](https://storage.googleapis.com/dlt-blog-images/dashboard-loads.png)
## Creating your own pipeline dashboard
## Creating your own workspace dashboard
You can eject the code for the pipeline dashboard into your current working directory and start editing it to create a custom version that fits your needs. To do this, run the `show` command with the `--edit` flag:
You can eject the code for the workspace dashboard into your current working directory and start editing it to create a custom version that fits your needs. To do this, run the `show` command with the `--edit` flag:
```sh
dlt pipeline {pipeline_name} show --edit

View File

@@ -16,7 +16,7 @@ Here's a full example of how to retrieve data from a pipeline and load it into a
## Getting started
Assuming you have a `Pipeline` object (let's call it `pipeline`), you can obtain a `Dataset` which is contains the crendentials and schema to your destination dataset. You can run construct a query and execute it on the dataset to retrieve a `Relation` which you may use to retrieve data from the `Dataset`.
Assuming you have a `Pipeline` object (let's call it `pipeline`), you can obtain a `Dataset` which contains the credentials and schema of your destination dataset. You can construct a query and execute it on the dataset to obtain a `Relation`, which you can use to fetch data from the `Dataset`.
**Note:** The `Dataset` and `Relation` objects are **lazy-loading**. They will only query and retrieve data when you perform an action that requires it, such as fetching data into a DataFrame or iterating over the data. This means that simply creating these objects does not load data into memory, making your code more efficient.
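A minimal sketch of this laziness, assuming a pipeline named `my_pipeline` has already been run and loaded an `items` table (both names are illustrative):
```py
import dlt

# attach to a pipeline that was run earlier in this environment
pipeline = dlt.pipeline("my_pipeline")

# nothing is queried yet: both objects below are lazy
dataset = pipeline.dataset()
items = dataset.table("items").select("id", "name").limit(100)

# data is only fetched when the relation is materialized
df = items.df()       # as a Pandas DataFrame
tbl = items.arrow()   # or as a PyArrow table
```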

View File

@@ -42,7 +42,7 @@ def quick_start_example_snippet(pipeline: dlt.Pipeline) -> None:
dataset = pipeline.dataset()
# Step 2: Access a table as a ReadableRelation
customers_relation = dataset.customers # Or dataset["customers"]
customers_relation = dataset.table("customers")
# Step 3: Fetch the entire table as a Pandas DataFrame
df = customers_relation.df() # or customers_relation.df(chunk_size=50)
@@ -64,8 +64,8 @@ def getting_started_snippet(pipeline: dlt.Pipeline) -> None:
def accessing_tables_snippet(dataset: dlt.Dataset) -> None:
# @@@DLT_SNIPPET_START accessing_tables
# Using attribute access
customers_relation = dataset.customers
# Using the `table` method
customers_relation = dataset.table("customers")
# Using item access
customers_relation = dataset["customers"]
@@ -73,7 +73,7 @@ def accessing_tables_snippet(dataset: dlt.Dataset) -> None:
def fetch_entire_table_snippet(dataset: dlt.Dataset) -> None:
customers_relation = dataset.customers
customers_relation = dataset.table("customers")
# @@@DLT_SNIPPET_START fetch_entire_table_df
df = customers_relation.df()
@@ -89,7 +89,7 @@ def fetch_entire_table_snippet(dataset: dlt.Dataset) -> None:
def iterating_chunks_snippet(dataset: dlt.Dataset) -> None:
customers_relation = dataset.customers
customers_relation = dataset.table("customers")
# @@@DLT_SNIPPET_START iterating_df_chunks
for df_chunk in customers_relation.iter_df(chunk_size=5):
# Process each DataFrame chunk
@@ -124,15 +124,15 @@ def context_manager_snippet(dataset: dlt.Dataset) -> None:
# the dataset context manager will keep the connection open
# and close it after the with block is exited
with dataset as dataset_:
print(dataset.customers.limit(50).arrow())
print(dataset.purchases.arrow())
with dataset:
print(dataset.table("customers").limit(50).arrow())
print(dataset.table("purchases").arrow())
# @@@DLT_SNIPPET_END context_manager
def limiting_records_snippet(dataset: dlt.Dataset) -> None:
customers_relation = dataset.customers
customers_relation = dataset.table("customers")
# @@@DLT_SNIPPET_START limiting_records
# Get the first 50 items as a PyArrow table
arrow_table = customers_relation.limit(50).arrow()
@@ -144,7 +144,7 @@ def limiting_records_snippet(dataset: dlt.Dataset) -> None:
def select_columns_snippet(dataset: dlt.Dataset) -> None:
customers_relation = dataset.customers
customers_relation = dataset.table("customers")
# @@@DLT_SNIPPET_START select_columns
# Select only 'id' and 'name' columns
items_list = customers_relation.select("id", "name").fetchall()
@@ -158,7 +158,7 @@ def select_columns_snippet(dataset: dlt.Dataset) -> None:
def order_by_snippet(default_dataset: dlt.Dataset) -> None:
customers_relation = default_dataset.customers
customers_relation = default_dataset.table("customers")
# @@@DLT_SNIPPET_START order_by
# Order by 'id'
ordered_list = customers_relation.order_by("id").fetchall()
@@ -166,7 +166,7 @@ def order_by_snippet(default_dataset: dlt.Dataset) -> None:
def filter_snippet(default_dataset: dlt.Dataset) -> None:
customers_relation = default_dataset.customers
customers_relation = default_dataset.table("customers")
# @@@DLT_SNIPPET_START filter
# Filter by 'id'
filtered = customers_relation.where("id", "in", [3, 1, 7]).fetchall()
@@ -186,7 +186,7 @@ def filter_snippet(default_dataset: dlt.Dataset) -> None:
def aggregate_snippet(default_dataset: dlt.Dataset) -> None:
customers_relation = default_dataset.customers
customers_relation = default_dataset.table("customers")
# @@@DLT_SNIPPET_START aggregate
# Get max 'id'
@@ -199,7 +199,7 @@ def aggregate_snippet(default_dataset: dlt.Dataset) -> None:
def chain_operations_snippet(dataset: dlt.Dataset) -> None:
customers_relation = dataset.customers
customers_relation = dataset.table("customers")
# @@@DLT_SNIPPET_START chain_operations
# Select columns and limit the number of records
@@ -267,21 +267,21 @@ def ibis_expressions_snippet(pipeline: dlt.Pipeline) -> None:
def fetch_one_snippet(dataset: dlt.Dataset) -> None:
customers_relation = dataset.customers
customers_relation = dataset.table("customers")
# @@@DLT_SNIPPET_START fetch_one
record = customers_relation.fetchone()
# @@@DLT_SNIPPET_END fetch_one
def fetch_many_snippet(dataset: dlt.Dataset) -> None:
customers_relation = dataset.customers
customers_relation = dataset.table("customers")
# @@@DLT_SNIPPET_START fetch_many
records = customers_relation.fetchmany(10)
# @@@DLT_SNIPPET_END fetch_many
def iterating_with_limit_and_select_snippet(dataset: dlt.Dataset) -> None:
customers_relation = dataset.customers
customers_relation = dataset.table("customers")
# @@@DLT_SNIPPET_START iterating_with_limit_and_select
# Dataframes
for df_chunk in customers_relation.select("id", "name").limit(100).iter_df(chunk_size=20): ...

View File

@@ -13,7 +13,7 @@ The Streamlit app does not work with all destinations supported by `dlt`. Only d
:::
:::warning
The Streamlit app is not under active development anymore and may soon be deprecated. We encourage all users to use the [pipeline dashboard](../dashboard.md)
The Streamlit app is not under active development anymore and may soon be deprecated. We encourage all users to use the [workspace dashboard](../dashboard.md).
:::
## Prerequisites

View File

@@ -126,7 +126,7 @@ dataset = dlt.hub.current.project.catalog().dataset("my_pipeline_dataset") # ty
# This function reads data in chunks from an existing table and yields each chunk
def transform_frames():
# Read the 'items' table in chunks of 1000 rows
for df in dataset.items.iter_df(chunk_size=1000):
for df in dataset.table("items").iter_df(chunk_size=1000):
# You can process the data here if needed
yield df

View File

@@ -121,8 +121,8 @@ def multiple_transformation_instructions_snippet(fruitshop_pipeline: dlt.Pipelin
# this (probably nonsensical) transformation will create a union of the customers and purchases tables
@dlt.hub.transformation(write_disposition="append")
def union_of_tables(dataset: dlt.Dataset) -> Any:
yield dataset.customers
yield dataset.purchases
yield dataset.table("purchases")
yield dataset.table("customers")
# @@@DLT_SNIPPET_END multiple_transformation_instructions
@@ -210,7 +210,7 @@ def arrow_dataframe_operations_snippet(fruitshop_pipeline: dlt.Pipeline) -> None
@dlt.hub.transformation
def copied_customers(dataset: dlt.Dataset) -> Any:
# get full customers table as arrow table
customers = dataset.customers.arrow()
customers = dataset.table("customers").arrow()
# Sort the table by 'name'
sorted_customers = customers.sort_by([("name", "ascending")])
@@ -222,8 +222,8 @@ def arrow_dataframe_operations_snippet(fruitshop_pipeline: dlt.Pipeline) -> None
@dlt.hub.transformation
def enriched_purchases(dataset: dlt.Dataset) -> Any:
# get both full tables as dataframes
purchases = dataset.purchases.df()
customers = dataset.customers.df()
purchases = dataset.table("purchases").df()
customers = dataset.table("customers").df()
# Merge (JOIN) the DataFrames
result = purchases.merge(customers, left_on="customer_id", right_on="id")

View File

@@ -17,7 +17,7 @@ dltHub is built around the open-source library [dlt](../intro.md). It uses the s
dltHub supports both local and managed cloud development. A single developer can deploy and operate pipelines, transformations, and notebooks directly from a dltHub Workspace, using a single command.
The dltHub Runtime, customizable pipeline dashboard, and validation tools make it straightforward to monitor, troubleshoot, and keep data reliable throughout the whole end-to-end data workflow:
The dltHub Runtime, customizable workspace dashboard, and validation tools make it straightforward to monitor, troubleshoot, and keep data reliable throughout the whole end-to-end data workflow:
```mermaid
flowchart LR

View File

@@ -12,22 +12,26 @@ import snippets from '!!raw-loader!./intro-snippets.py';
## What is dlt?
dlt is an open-source Python library that loads data from various, often messy data sources into well-structured, live datasets. It offers a lightweight interface for extracting data from [REST APIs](./tutorial/rest-api), [SQL databases](./tutorial/sql-database), [cloud storage](./tutorial/filesystem), [Python data structures](./tutorial/load-data-from-an-api), and [many more](./dlt-ecosystem/verified-sources).
dlt is an open-source Python library that loads data from various, often messy data sources into well-structured datasets. It provides lightweight Python interfaces to extract, load, inspect, and transform data. dlt and dlt docs are built from the ground up to be used with LLMs: the [LLM-native workflow](dlt-ecosystem/llm-tooling/llm-native-workflow.md) will take you from pipeline code to data in a notebook for over [5,000 sources](https://dlthub.com/workspace).
dlt is designed to be easy to use, flexible, and scalable:
- dlt extracts data from [REST APIs](./tutorial/rest-api), [SQL databases](./tutorial/sql-database), [cloud storage](./tutorial/filesystem), [Python data structures](./tutorial/load-data-from-an-api), and [many more](./dlt-ecosystem/verified-sources)
- dlt infers [schemas](./general-usage/schema) and [data types](./general-usage/schema/#data-types), [normalizes the data](./general-usage/schema/#data-normalizer), and handles nested data structures.
- dlt supports a variety of [popular destinations](./dlt-ecosystem/destinations/) and has an interface to add [custom destinations](./dlt-ecosystem/destinations/destination) to create reverse ETL pipelines.
- dlt can be deployed anywhere Python runs, be it on [Airflow](./walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer), [serverless functions](./walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions), or any other cloud deployment of your choice.
- dlt automates pipeline maintenance with [incremental loading](./general-usage/incremental-loading), [schema evolution](./general-usage/schema-evolution), and [schema and data contracts](./general-usage/schema-contracts) (see the sketch after this list).
- dlt supports [Python and SQL data access](general-usage/dataset-access/), [transformations](dlt-ecosystem/transformations), [pipeline inspection](general-usage/dashboard.md), and [visualizing data in Marimo Notebooks](general-usage/dataset-access/marimo).
- dlt can be deployed anywhere Python runs, be it on [Airflow](./walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer), [serverless functions](./walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions), or any other cloud deployment of your choice.
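For example, the incremental loading mentioned above boils down to declaring a cursor field on a resource. A minimal sketch, assuming the `duckdb` extra is installed; the `events` resource, the `updated_at` field, and the `fetch_events` stub are illustrative placeholders, not a real source:
```py
import dlt

def fetch_events(since: str):
    # stand-in for your own extraction logic (API call, file read, ...)
    yield {"id": 1, "updated_at": "2024-05-01T00:00:00Z", "payload": "hello"}

@dlt.resource(table_name="events", write_disposition="append")
def events(
    updated_at=dlt.sources.incremental("updated_at", initial_value="2024-01-01T00:00:00Z")
):
    # dlt keeps the highest cursor value seen so far in the pipeline state,
    # so repeated runs only load records newer than the previous run
    yield from fetch_events(since=updated_at.last_value)

pipeline = dlt.pipeline(pipeline_name="incremental_demo", destination="duckdb")
print(pipeline.run(events))
```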
To get started with dlt, install the library using pip:
To get started with dlt, install the library using pip (use a [clean virtual environment](reference/installation) for your experiments!):
```sh
pip install dlt
```
:::tip
We recommend using a clean virtual environment for your experiments! Read the [detailed instructions](./reference/installation) on how to set up one.
If you'd like to try out dlt without installing it on your machine, check out the [Google Colab demo](https://colab.research.google.com/drive/1NfSB1DpwbbHX9_t5vlalBTf13utwpMGx?usp=sharing) or
use our simple [marimo / wasm based playground](./tutorial/playground) on this docs page.
:::
## Load data with dlt from …
@@ -71,12 +75,16 @@ pipeline = dlt.pipeline(
load_info = pipeline.run(source)
# print load info and posts table as dataframe
# print load info and posts table as data frame
print(load_info)
print(pipeline.dataset().posts.df())
```
:::tip
LLMs are great at generating REST API pipelines!
* [Follow the LLM tutorial](dlt-ecosystem/llm-tooling/llm-native-workflow.md) and start with one of [5,000+ sources](https://dlthub.com/workspace)
* Follow the [REST API source tutorial](./tutorial/rest-api) to learn more about the source configuration and pagination methods.
:::
Follow the [REST API source tutorial](./tutorial/rest-api) to learn more about the source configuration and pagination methods.
</TabItem>
<TabItem value="sql-database">
@@ -97,7 +105,7 @@ pipeline = dlt.pipeline(
load_info = pipeline.run(source)
# print load info and the "family" table as dataframe
# print load info and the "family" table as data frame
print(load_info)
print(pipeline.dataset().family.df())
```
@@ -125,7 +133,7 @@ pipeline = dlt.pipeline(
load_info = pipeline.run(resource)
# print load info and the "example" table as dataframe
# print load info and the "example" table as data frame
print(load_info)
print(pipeline.dataset().example.df())
```
@@ -135,7 +143,7 @@ Follow the [filesystem source tutorial](./tutorial/filesystem) to learn more abo
</TabItem>
<TabItem value="python-data">
dlt is able to load data from Python generators or directly from Python data structures:
dlt can load data from Python generators or directly from Python data structures:
```py
import dlt
@@ -152,7 +160,7 @@ pipeline = dlt.pipeline(
load_info = pipeline.run(foo)
# print load info and the "foo_data" table as dataframe
# print load info and the "foo_data" table as data frame
print(load_info)
print(pipeline.dataset().foo_data.df())
```
@@ -163,14 +171,8 @@ Check out the [Python data structures tutorial](./tutorial/load-data-from-an-api
</Tabs>
:::tip
If you'd like to try out dlt without installing it on your machine, check out the [Google Colab demo](https://colab.research.google.com/drive/1NfSB1DpwbbHX9_t5vlalBTf13utwpMGx?usp=sharing) or
use our simple [marimo / wasm based playground](./tutorial/playground) on this docs page.
:::
## Join the dlt community
1. Give the library a ⭐ and check out the code on [GitHub](https://github.com/dlt-hub/dlt).
1. Ask questions and share how you use the library on [Slack](https://dlthub.com/community).
1. Report problems and make feature requests [here](https://github.com/dlt-hub/dlt/issues/new/choose).

View File

@@ -54,7 +54,7 @@ dlt [-h] [--version] [--disable-telemetry] [--enable-telemetry]
* [`init`](#dlt-init) - Creates a pipeline project in the current folder by adding existing verified source or creating a new one from template.
* [`render-docs`](#dlt-render-docs) - Renders markdown version of cli docs
* [`deploy`](#dlt-deploy) - Creates a deployment package for a selected pipeline script
* [`dashboard`](#dlt-dashboard) - Starts the dlt pipeline dashboard
* [`dashboard`](#dlt-dashboard) - Starts the dlt workspace dashboard
* [`ai`](#dlt-ai) - Use ai-powered development tools and utilities
</details>
@@ -145,7 +145,7 @@ Inherits arguments from [`dlt`](#dlt).
**Available subcommands**
* [`info`](#dlt-pipeline-info) - Displays state of the pipeline, use -v or -vv for more info
* [`show`](#dlt-pipeline-show) - Generates and launches streamlit app with the loading status and dataset explorer
* [`show`](#dlt-pipeline-show) - Generates and launches workspace dashboard with the loading status and dataset explorer
* [`failed-jobs`](#dlt-pipeline-failed-jobs) - Displays information on all the failed loads in all completed packages, failed jobs and associated error messages
* [`drop-pending-packages`](#dlt-pipeline-drop-pending-packages) - Deletes all extracted and normalized packages including those that are partially loaded.
* [`sync`](#dlt-pipeline-sync) - Drops the local state of the pipeline and resets all the schemas and restores it from destination. the destination state, data and schemas are left intact.
@@ -185,7 +185,7 @@ Inherits arguments from [`dlt pipeline`](#dlt-pipeline).
### `dlt pipeline show`
Generates and launches Streamlit app with the loading status and dataset explorer.
Generates and launches workspace dashboard with the loading status and dataset explorer.
**Usage**
```sh
@@ -194,11 +194,11 @@ dlt pipeline [pipeline_name] show [-h] [--streamlit] [--edit]
**Description**
Launches the pipeline dashboard app with a comprehensive interface to inspect the pipeline state, schemas, and data in the destination.
Launches the workspace dashboard with a comprehensive interface to inspect the pipeline state, schemas, and data in the destination.
This app should be executed from the same folder from which you ran the pipeline script to be able access destination credentials.
This dashboard should be executed from the same folder from which you ran the pipeline script to be able to access destination credentials.
If the --edit flag is used, will launch the editable version of the app if it exists in the current directory, or create this version and launch it in edit mode.
If the --edit flag is used, the command will launch the editable version of the dashboard if it exists in the current directory, or create this version and launch it in edit mode.
Requires `marimo` to be installed in the current environment: `pip install marimo`. Use the --streamlit flag to launch the legacy streamlit app.
@@ -210,8 +210,8 @@ Inherits arguments from [`dlt pipeline`](#dlt-pipeline).
**Options**
* `-h, --help` - Show this help message and exit
* `--streamlit` - Launch the legacy streamlit dashboard instead of the new pipeline dashboard.
* `--edit` - Creates editable version of pipeline dashboard in current directory if it does not exist there yet and launches it in edit mode. will have no effect when using the streamlit flag.
* `--streamlit` - Launch the legacy streamlit dashboard instead of the new workspace dashboard.
* `--edit` - Creates editable version of workspace dashboard in current directory if it does not exist there yet and launches it in edit mode. will have no effect when using the streamlit flag.
</details>
@@ -700,7 +700,7 @@ Inherits arguments from [`dlt deploy`](#dlt-deploy).
## `dlt dashboard`
Starts the dlt pipeline dashboard.
Starts the dlt workspace dashboard.
**Usage**
```sh
@@ -709,7 +709,7 @@ dlt dashboard [-h] [--pipelines-dir PIPELINES_DIR] [--edit]
**Description**
The `dlt dashboard` command starts the dlt pipeline dashboard. You can use the dashboard:
The `dlt dashboard` command starts the dlt workspace dashboard. You can use the dashboard:
* to list and inspect local pipelines
* browse the full pipeline schema and all hints

View File

@@ -362,9 +362,13 @@ Check out [other examples](../dlt-ecosystem/verified-sources/filesystem/advanced
Congratulations on completing the tutorial! You've learned how to set up a filesystem source in dlt and run a data pipeline to load the data into DuckDB.
With your pipeline code ready, we recommend the following next steps:
- Inspect your pipeline and data in the [workspace dashboard](../general-usage/dashboard.md)
- [Access your data](../general-usage/dataset-access/) using the `dataset` interface
- [Explore your data and create reports](../general-usage/dataset-access/marimo) in Marimo notebooks.
Interested in learning more about dlt? Here are some suggestions:
- Learn more about the filesystem source configuration in [filesystem source](../dlt-ecosystem/verified-sources/filesystem)
- Learn more about different credential types in [Built-in credentials](../general-usage/credentials/complex_types#built-in-credentials)
- Learn how to [create a custom source](./load-data-from-an-api.md) in the advanced tutorial

View File

@@ -1,6 +1,6 @@
---
title: "Build a dlt pipeline"
description: Build a data pipeline with dlt
title: Build an advanced dlt pipeline from scratch
description: Build a custom, production-grade pipeline just by writing code
keywords: [getting started, quick start, basic examples]
---

View File

@@ -321,6 +321,11 @@ Read more about [incremental loading](../dlt-ecosystem/verified-sources/rest_api
Congratulations on completing the tutorial! You've learned how to set up a REST API source in dlt and run a data pipeline to load the data into DuckDB.
With your pipeline code ready, we recommend the following next steps:
- Inspect your pipeline and data in the [workspace dashboard](../general-usage/dashboard.md)
- [Access your data](../general-usage/dataset-access/) using the `dataset` interface
- [Explore your data and create reports](../general-usage/dataset-access/marimo) in Marimo notebooks.
Interested in learning more about dlt? Here are some suggestions:
- Learn more about the REST API source configuration in the [REST API source documentation](../dlt-ecosystem/verified-sources/rest_api/)

View File

@@ -267,6 +267,11 @@ In the first run of the pipeline `python sql_database_pipeline.py`, the entire t
Congratulations on completing the tutorial! You learned how to set up a SQL Database source in dlt and run a data pipeline to load the data into DuckDB.
With your pipeline code ready, we recommend the following next steps:
- Inspect your pipeline and data in the [workspace dashboard](../general-usage/dashboard.md)
- [Access your data](../general-usage/dataset-access/) using the `dataset` interface
- [Explore your data and create reports](../general-usage/dataset-access/marimo) in Marimo notebooks.
Interested in learning more about dlt? Here are some suggestions:
- Learn more about the SQL Database source configuration in [the SQL Database source reference](../dlt-ecosystem/verified-sources/sql_database)
- Learn how to extract [single tables and use fast `arrow` and `connectorx` backends](../dlt-ecosystem/verified-sources/sql_database/configuration.md)

View File

@@ -155,7 +155,7 @@ You will need to install `pip dlt[workspace]`
dlt pipeline github_api_pipeline show
```
This will open the pipeline dashboard app that gives you an overview of the data loaded.
This will open the workspace dashboard app that gives you an overview of the data loaded.
## 5. Next steps

View File

@@ -89,7 +89,7 @@ table, do SQL queries, etc., by executing the following command from the same fo
dlt pipeline chess_pipeline show
```
This will launch the pipeline dashboard, which you can open in your browser:
This will launch the workspace dashboard, which you can open in your browser:
```text
Found pipeline chess_pipeline in /home/user-name/.dlt/pipelines

View File

@@ -37,6 +37,7 @@ const sidebars = {
},
items: [
'reference/installation',
"dlt-ecosystem/llm-tooling/llm-native-workflow",
'tutorial/rest-api',
'tutorial/sql-database',
'tutorial/filesystem',
@@ -128,6 +129,12 @@ const sidebars = {
'general-usage/http/requests',
]
},
{
type: 'link',
label: '5k+ REST APIs with LLMs',
description: 'Pick one of 5k+ REST APIs from LLM context',
href: 'https://dlthub.com/workspace',
},
]
},
{
@@ -238,7 +245,7 @@ const sidebars = {
items: [
'walkthroughs/create-a-pipeline',
'walkthroughs/run-a-pipeline',
{
/*{
type: "category",
label: "Build with LLMs",
link: {
@@ -250,7 +257,7 @@ const sidebars = {
items: [
"dlt-ecosystem/llm-tooling/llm-native-workflow",
]
},
},*/
{
type: 'category',
label: 'Load data incrementally',
@@ -477,7 +484,7 @@ const sidebars = {
items: [
'hub/intro',
'hub/getting-started/installation',
'dlt-ecosystem/llm-tooling/llm-native-workflow',
{ type: 'ref', id: 'dlt-ecosystem/llm-tooling/llm-native-workflow' },
]
},
{
@@ -501,7 +508,7 @@ const sidebars = {
type: 'category',
label: 'Ensure data quality',
items: [
'general-usage/dashboard',
{ type: 'ref', id: 'general-usage/dashboard' },
'hub/features/mcp-server',
'hub/features/quality/data-quality',
]
@@ -510,8 +517,8 @@ const sidebars = {
type: 'category',
label: 'Create reports and transformations',
items: [
'general-usage/dataset-access/marimo',
'general-usage/dataset-access/dataset',
{ type: 'ref', id: 'general-usage/dataset-access/marimo' },
{ type: 'ref', id: 'general-usage/dataset-access/dataset' },
'hub/features/transformations/index',
'hub/features/transformations/dbt-transformations',
]
@@ -553,20 +560,20 @@ const sidebars = {
],
};
// insert examples
// insert examples
for (const item of sidebars.docsSidebar) {
if (item.label === 'Code examples') {
for (let examplePath of walkSync("./docs_processed/examples")) {
examplePath = examplePath.replace("docs_processed/", "");
examplePath = examplePath.replace(".mdx", "");
examplePath = examplePath.replace(".md", "");
item.items.push(examplePath);
if (item.label === 'Code examples') {
for (let examplePath of walkSync("./docs_processed/examples")) {
examplePath = examplePath.replace("docs_processed/", "");
examplePath = examplePath.replace(".mdx", "");
examplePath = examplePath.replace(".md", "");
item.items.push(examplePath);
}
}
}
// inject api reference if it exists
// inject api reference if it exists
if (fs.existsSync('./docs_processed/api_reference/sidebar.json')) {
for (const item of sidebars.docsSidebar) {
if (item.label === 'Reference') {

View File

@@ -131,13 +131,13 @@ def test_page_overview(page: Page):
_go_home(page)
# check title
expect(page).to_have_title("dlt pipeline dashboard")
expect(page).to_have_title("dlt workspace dashboard")
# check top heading
expect(
page.get_by_role("heading", name="Welcome to the dltHub pipeline dashboard...")
page.get_by_role("heading", name="Welcome to the dltHub workspace dashboard...")
).to_contain_text(
"Welcome to the dltHub pipeline dashboard..."
"Welcome to the dltHub workspace dashboard..."
) #
#