Files
dlt/docs/docs_tools/education/preprocess_to_molab.py
anuunchin 266052eb76 Docs: Converting Jupyter notebooks in education to marimo notebooks (#3068)
* Initial commit

* lesson_1_quick_start adjusted for marimo

* lesson_2_dlt_sources_and_resources_create_first_dlt_pipeline marimo

* Fundamentals course 3 improved

* Marimo badges added

* Fundamentals: course 8

* Marimo badge link fix

* Fundamentals: course 7

* Fundamentals: course 6

* Fundamentals: course 5

* Fundamentals: course 4

* Fundamentals: course 3

* Fundamentals: course 2

* Fundamentals: course 1

* marimo links corrected

* Inline deps

* Fundamentals: fix lesson 2

* Fundamentals: fix lesson 3

* Fundamentals: fix lesson 4

* Formatting moved to build-molabs

* Fundamentals: fix lesson 5

* Removal of scrolls

* Fundamentals: fix lesson 6

* Fundamentals: fix lesson 7

* Fundamentals: fix lesson 8

* os.environ replaced with dlt.secrets where relevant

* Advanced: fix lesson 5

* Advanced fix lesson 9

* os.environ fixes

* Advanced: fix lesson 1

* Comments cleanup

* Additional comment removal, fix lesson 6 advanced

* Clean main makefile

* Get rid of constants.py

* Nicer json.loads()

* Better functions in preprocess_to_molab

* Tests for doc tooling funcs

* Validate molab command

* Marimo check added

* docs pages adjustment

* limits sqlglot in dev group until fixed

---------

Co-authored-by: Marcin Rudolf <rudolfix@rudolfix.org>
2025-12-16 16:30:32 +01:00

291 lines
9.9 KiB
Python

import json
import re
import shlex
import subprocess
from pathlib import Path
from typing import Dict, Any
# Directory scanned for notebooks: the "education" folder located two levels above
# the directory that contains this file.
EDUCATION_NOTEBOOKS_DIR = Path(__file__).parent.parent.parent / "education"
# Filename prefix of the intermediate, preprocessed notebook that is written next to
# each source notebook before conversion.
# NOTE(review): "PREIFX" is a misspelling of "PREFIX"; the name is left as-is because
# the script entry point at the bottom of this file references it.
TEMP_IPYNB_FILE_PREIFX = "tmp"
# Packages that are always merged into the generated inline dependency list,
# regardless of what the notebook itself installs.
MUST_INSTALL_PACKAGES = {"numpy", "pandas", "sqlalchemy"}
def replace_colab_imports_in_notebook(notebook_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Remove Google Colab-specific imports and replace Colab API calls with standard Python.

    Google Colab provides special APIs like `google.colab.userdata` for accessing secrets
    that don't exist outside the Colab environment. This function:
    - Removes: `from google.colab import userdata` (and similar imports)
    - Replaces: `userdata.get(...)` → `os.getenv(...)`

    The notebook format allows a cell's `source` to be either a list of lines or a
    single multi-line string. Both forms are processed, and the cell keeps whichever
    form it had on input. Non-code cells are left untouched.

    The notebook dictionary is modified in place and also returned.

    Args:
        notebook_dict: Notebook as a Python dictionary

    Returns:
        Modified notebook dictionary
    """
    colab_import = re.compile(r"^\s*from google\.colab import")

    for cell in notebook_dict.get("cells", []):
        if cell.get("cell_type") != "code":
            continue

        source = cell.get("source", [])
        source_is_str = isinstance(source, str)
        if source_is_str:
            # keepends=True so that joining the lines back reproduces the text exactly
            lines = source.splitlines(keepends=True)
        elif isinstance(source, list):
            lines = source
        else:
            # Unknown source shape: leave the cell as it is
            continue

        # Drop Colab import lines, then rewrite userdata.get(...) in the remaining ones
        rewritten = [
            line.replace("userdata.get(", "os.getenv(")
            for line in lines
            if not colab_import.match(line)
        ]
        cell["source"] = "".join(rewritten) if source_is_str else rewritten

    return notebook_dict
def _extract_pip_packages(stripped_line: str) -> list[str]:
    """Return the package specifiers named on a stripped `!pip install ...` line.

    Quotes are removed, whitespace inside extras brackets and after commas is
    collapsed (so `"dlt[duckdb, postgres]"` becomes `dlt[duckdb,postgres]`), and
    tokens starting with `-` (command-line flags) are dropped. A line with nothing
    after `!pip install` yields an empty list.
    """
    match = re.search(r"!pip install\s+(.+?)(?:\n|$)", stripped_line)
    if not match:
        return []
    cleaned = match.group(1).strip().replace('"', "").replace("'", "")
    cleaned = re.sub(r"\[\s*", "[", cleaned)  # remove space after [
    cleaned = re.sub(r"\s*\]", "]", cleaned)  # remove space before ]
    cleaned = re.sub(r",\s+", ",", cleaned)  # remove space after commas
    return [token for token in cleaned.split() if not token.startswith("-")]


def process_shell_commands_in_notebook(
    notebook_dict: Dict[str, Any]
) -> tuple[Dict[str, Any], set[str]]:
    """
    Convert Jupyter shell commands to Python subprocess calls and extract dependencies.

    Jupyter/Colab notebooks support shell commands with `!` syntax (e.g., `!pip install dlt`),
    but this is IPython-specific magic syntax that doesn't work in standard Python or Marimo.
    This function:
    - Extracts package names from `!pip install` commands for dependency tracking
      (the install lines themselves are removed)
    - Converts other `!command` shell commands to `subprocess.run()` calls
    - Removes notebook-specific magic commands (e.g., `%%capture`)

    `import subprocess` is inserted once per notebook, immediately before the first
    converted shell command.

    The notebook format allows a cell's `source` to be either a list of lines or a
    single multi-line string. A string source is split into lines for processing and
    joined back afterwards, so the cell keeps whichever form it had on input.

    The notebook dictionary is modified in place and also returned.

    Args:
        notebook_dict: Notebook as a Python dictionary

    Returns:
        Tuple of (modified notebook dict, set of package names extracted from pip install commands)
    """
    packages: set[str] = set()
    subprocess_imported = False

    for cell in notebook_dict.get("cells", []):
        if cell.get("cell_type") != "code":
            continue

        cell_code = cell.get("source", [])
        source_is_str = isinstance(cell_code, str)
        if source_is_str:
            # Iterating a str would walk it character by character; work on real
            # lines instead. keepends=True keeps the round trip lossless.
            cell_code = cell_code.splitlines(keepends=True)

        new_cell_code = []
        for line in cell_code:
            stripped = line.strip()

            # skip magic commands
            if stripped.startswith("%%capture"):
                continue

            # extract packages from pip install; the line itself is dropped
            if stripped.startswith("!pip install"):
                packages.update(_extract_pip_packages(stripped))
                continue

            # convert other shell commands
            if stripped.startswith("!"):
                if not subprocess_imported:
                    new_cell_code.append("import subprocess\n")
                    subprocess_imported = True
                new_cell_code.append(_build_subprocess_line(stripped[1:]) + "\n")
                continue

            new_cell_code.append(line)

        cell["source"] = "".join(new_cell_code) if source_is_str else new_cell_code

    return notebook_dict, packages
def add_inline_dependencies_to_content(packages: set[str], py_content: str) -> str:
    """
    Prepend a PEP 723 inline script metadata block listing the notebook's dependencies.

    Marimo/Molab can install packages automatically when they are declared through
    PEP 723 inline script metadata. The emitted list is the union of:
    - the packages passed in (extracted from !pip install commands in the original notebook)
    - MUST_INSTALL_PACKAGES (dependencies required for all notebooks)

    The entries are written in sorted order. The input set is not modified.

    Args:
        packages: Set of package names to include (merged with MUST_INSTALL_PACKAGES)
        py_content: The Python file content as a string

    Returns:
        Python content with the PEP 723 metadata block prepended, or the content
        unchanged when the merged package set is empty

    NOTE: Without this, users would need to install packages manually before running
    the notebook (Marimo will try to install missing imports, which is not exactly nice
    for a smooth experience. Also, some libraries used under the hood are not directly
    imported and are not caught by Marimo).

    Format:
        # /// script
        # dependencies = [
        # "package1",
        # "package2",
        # ]
        # ///
    """
    # Build a new set so the caller's set is left untouched
    all_packages = set(packages).union(MUST_INSTALL_PACKAGES)
    if len(all_packages) == 0:
        return py_content

    header_lines = ["# /// script", "# dependencies = ["]
    for name in sorted(all_packages):
        header_lines.append(f'# "{name}",')
    header_lines.extend(["# ]", "# ///"])

    return "\n".join(header_lines) + "\n" + py_content
def read_notebook(ipynb_path: Path) -> Dict[str, Any]:
    """
    Load a Jupyter notebook from disk.

    The file is read as UTF-8 text and parsed as JSON.

    Args:
        ipynb_path: Path to the .ipynb file

    Returns:
        Notebook data as a Python dictionary
    """
    with ipynb_path.open("r", encoding="utf-8") as notebook_file:
        parsed: Dict[str, Any] = json.load(notebook_file)
    return parsed
def write_notebook(notebook_dict: Dict[str, Any], output_path: Path) -> None:
    """
    Serialize a notebook dictionary to a file.

    The notebook is dumped as JSON with a one-space indent and non-ASCII characters
    kept verbatim, then written as UTF-8 text.

    Args:
        notebook_dict: Notebook data as a Python dictionary
        output_path: Path where the notebook should be written
    """
    # Serialize first so a failure here never touches the target file
    serialized = json.dumps(notebook_dict, ensure_ascii=False, indent=1)
    with output_path.open("w", encoding="utf-8") as notebook_file:
        notebook_file.write(serialized)
def convert_notebook_to_marimo(temp_ipynb_path: Path) -> str:
    """
    Run `marimo convert` on a notebook and return the generated Python source.

    The command's output is captured as text. Because the call uses `check=True`,
    a non-zero exit status raises `subprocess.CalledProcessError`.

    Args:
        temp_ipynb_path: Path to the temporary preprocessed notebook

    Returns:
        Marimo Python file content as a string (the command's standard output)
    """
    command = ["marimo", "convert", str(temp_ipynb_path)]
    completed = subprocess.run(command, check=True, text=True, capture_output=True)
    return completed.stdout
def write_python_file(content: str, output_path: Path) -> None:
    """
    Save Python source text to a file, encoded as UTF-8.

    Args:
        content: Python file content as a string
        output_path: Path where the file should be written
    """
    with output_path.open("w", encoding="utf-8") as py_file:
        py_file.write(content)
def _build_subprocess_line(cmd: str) -> str:
"""
Generate a subprocess.run() call string from a shell command.
This helper converts various shell command patterns to their Python subprocess
equivalents, handling special cases like piped input.
Conversion rules:
- Simple commands: `command arg` → `subprocess.run(['command', 'arg'], check=True)`
- Yes piping: `yes | command` → `subprocess.run(['command'], input='y\\n', ...)`
- No piping: `no | command` → `subprocess.run(['command'], input='n\\n', ...)`
- Complex pipes: `cmd1 | cmd2` → `subprocess.run('cmd1 | cmd2', shell=True, ...)`
Args:
cmd: The shell command string (without the leading `!`)
Returns:
A string containing Python code for subprocess.run()
"""
cmd = cmd.strip()
# No pipe → simple list argv
if "|" not in cmd:
argv = shlex.split(cmd)
return f"subprocess.run({argv!r}, check=True)"
# Split pipe
left, right = map(str.strip, cmd.split("|", 1))
left_lower = left.lower()
# yes | command → feed "y\n"
if left_lower == "yes":
argv = shlex.split(right)
return f"subprocess.run({argv!r}, input='y\\n', text=True, check=True)"
# no | command → feed "n\n"
if left_lower == "no":
argv = shlex.split(right)
return f"subprocess.run({argv!r}, input='n\\n', text=True, check=True)"
# generic pipe: shell=True fallback
return f"subprocess.run({cmd!r}, shell=True, check=True)"
if __name__ == "__main__":
    # Convert every notebook found one directory level below EDUCATION_NOTEBOOKS_DIR
    # into a Marimo .py file written next to it.
    # The listing is materialized (and sorted, for a stable processing order) before
    # any temporary notebook is created, so temp files can never enter the iteration.
    for ipynb_file in sorted(EDUCATION_NOTEBOOKS_DIR.glob("*/*.ipynb")):
        # 1. Read notebook file
        notebook_dict = read_notebook(ipynb_file)

        # 2. Replace Colab imports
        notebook_dict = replace_colab_imports_in_notebook(notebook_dict)

        # 3. Process shell commands
        notebook_dict, packages = process_shell_commands_in_notebook(notebook_dict)

        temp_ipynb_file = ipynb_file.with_name(
            f"{TEMP_IPYNB_FILE_PREIFX}_{ipynb_file.name}"
        )
        try:
            # 4. Write temporary notebook
            write_notebook(notebook_dict, temp_ipynb_file)

            # 5. Convert to Marimo format
            py_content = convert_notebook_to_marimo(temp_ipynb_file)
        finally:
            # 6. Clean up the temporary notebook even when writing or conversion
            #    fails; otherwise the leftover tmp_*.ipynb would match the glob on
            #    the next run and be converted as if it were a real notebook.
            temp_ipynb_file.unlink(missing_ok=True)

        # 7. Add inline dependencies
        py_content_with_deps = add_inline_dependencies_to_content(packages, py_content)

        # 8. Write final Python file
        output_path = ipynb_file.with_suffix(".py")
        write_python_file(py_content_with_deps, output_path)