reimplement, add tests (#3418)

Co-authored-by: ivasio <ivan@dlthub.com>
This commit is contained in:
ivasio
2025-12-02 23:02:28 +01:00
committed by GitHub
parent 3e84f7aaa9
commit af8908968e
4 changed files with 55 additions and 22 deletions

View File

@@ -1,4 +1,4 @@
from typing import Iterator, Optional, List
from typing import Iterable, Iterator, Optional, List, Tuple
from pathlib import Path
from pathspec import PathSpec
from pathspec.util import iter_tree_files
@@ -6,7 +6,16 @@ from pathspec.util import iter_tree_files
from dlt._workspace._workspace_context import WorkspaceRunContext
class WorkspaceFileSelector:
class BaseFileSelector(Iterable[Tuple[Path, Path]]):
    """Common base class for file selectors.

    Iterating a selector produces two paths for every selected file: the absolute
    path of the file in the filesystem and the relative path the file gets in the
    resulting tarball.
    """
class WorkspaceFileSelector(BaseFileSelector):
"""Iterates files in workspace respecting ignore patterns and excluding workspace internals.
Uses gitignore-style patterns from a configurable ignore file (default .gitignore). Additional
@@ -22,7 +31,7 @@ class WorkspaceFileSelector:
self.root_path: Path = Path(context.run_dir).resolve()
self.settings_dir: Path = Path(context.settings_dir).resolve()
self.ignore_file: str = ignore_file
self.spec: PathSpec = self._build_pathspec(additional_excludes or [])
self.ignore_spec: PathSpec = self._build_pathspec(additional_excludes or [])
def _build_pathspec(self, additional_excludes: List[str]) -> PathSpec:
"""Build PathSpec from ignore file + defaults + additional excludes"""
@@ -39,8 +48,25 @@ class WorkspaceFileSelector:
return PathSpec.from_lines("gitwildmatch", patterns)
def __iter__(self) -> Iterator[Path]:
def __iter__(self) -> Iterator[Tuple[Path, Path]]:
"""Yield paths of files eligible for deployment"""
root_path = Path(self.root_path)
for file_path in iter_tree_files(self.root_path):
if not self.spec.match_file(file_path):
yield Path(file_path)
if not self.ignore_spec.match_file(file_path):
yield root_path / file_path, Path(file_path)
class ConfigurationFileSelector(BaseFileSelector):
    """Iterates config and secrets files in the workspace settings directory.

    A file is selected when its name is exactly `config.toml` or `secrets.toml`, or when
    it ends with `.config.toml` / `.secrets.toml` (for example `dev.config.toml`).
    Files that merely share the trailing characters (for example `myconfig.toml`) are
    not selected.
    """

    def __init__(
        self,
        context: WorkspaceRunContext,
    ) -> None:
        """Remember the resolved settings directory of `context`."""
        self.settings_dir: Path = Path(context.settings_dir).resolve()

    def __iter__(self) -> Iterator[Tuple[Path, Path]]:
        """Yield (absolute path, path relative to settings dir) of config and secrets files"""
        base_names = ("config.toml", "secrets.toml")
        # prefixed variants must be separated by a dot, ie. `dev.config.toml`
        dotted_suffixes = tuple(f".{name}" for name in base_names)
        for file_path in iter_tree_files(self.settings_dir):
            rel_path = Path(file_path)
            # match on the file name only, never on the whole relative path string
            file_name = rel_path.name
            if file_name in base_names or file_name.endswith(dotted_suffixes):
                yield self.settings_dir / rel_path, rel_path

View File

@@ -7,7 +7,7 @@ import yaml
from dlt.common.time import precise_time
from dlt.common.utils import digest256_tar_stream
from dlt._workspace.deployment.file_selector import WorkspaceFileSelector
from dlt._workspace.deployment.file_selector import BaseFileSelector, WorkspaceFileSelector
from dlt._workspace.deployment.manifest import (
TDeploymentFileItem,
TDeploymentManifest,
@@ -22,33 +22,32 @@ DEFAULT_MANIFEST_FILE_NAME = "manifest.yaml"
DEFAULT_DEPLOYMENT_PACKAGE_LAYOUT = "deployment-{timestamp}.tar.gz"
class DeploymentPackageBuilder:
class PackageBuilder:
"""Builds gzipped deployment package from file selectors"""
def __init__(self, context: WorkspaceRunContext):
self.run_context: WorkspaceRunContext = context
def write_package_to_stream(
self, file_selector: WorkspaceFileSelector, output_stream: BinaryIO
self, file_selector: BaseFileSelector, output_stream: BinaryIO
) -> str:
"""Write deployment package to output stream, return content hash"""
manifest_files: List[TDeploymentFileItem] = []
# Add files to the archive
with tarfile.open(fileobj=output_stream, mode="w|gz") as tar:
for file_path in file_selector:
full_path = self.run_context.run_dir / file_path
for abs_path, rel_path in file_selector:
# Use POSIX paths for tar archives (cross-platform compatibility)
posix_path = file_path.as_posix()
posix_path = rel_path.as_posix()
tar.add(
full_path,
abs_path,
arcname=f"{DEFAULT_DEPLOYMENT_FILES_FOLDER}/{posix_path}",
recursive=False,
)
manifest_files.append(
{
"relative_path": posix_path,
"size_in_bytes": full_path.stat().st_size,
"size_in_bytes": abs_path.stat().st_size,
}
)
# Create and add manifest with file metadata at the end
@@ -67,7 +66,7 @@ class DeploymentPackageBuilder:
return digest256_tar_stream(output_stream)
def build_package(self, file_selector: WorkspaceFileSelector) -> Tuple[Path, str]:
def build_package(self, file_selector: BaseFileSelector) -> Tuple[Path, str]:
"""Create deployment package file, return (path, content_hash)"""
package_name = DEFAULT_DEPLOYMENT_PACKAGE_LAYOUT.format(timestamp=str(precise_time()))
package_path = Path(self.run_context.get_data_entity(package_name))

View File

@@ -1,7 +1,7 @@
import os
import pytest
from dlt._workspace.deployment.file_selector import WorkspaceFileSelector
from dlt._workspace.deployment.file_selector import ConfigurationFileSelector, WorkspaceFileSelector
from tests.workspace.utils import isolated_workspace
@@ -27,5 +27,14 @@ def test_file_selector_respects_gitignore(with_additional_exclude: bool) -> None
selector = WorkspaceFileSelector(
ctx, additional_excludes=additional_excludes, ignore_file=".ignorefile"
)
files = set([f.as_posix() for f in selector])
files = set([rel.as_posix() for _, rel in selector])
assert files == expected_files
def test_configuration_file_selector() -> None:
    """ConfigurationFileSelector must yield only config/secrets files of the settings dir."""
    with isolated_workspace("configured_workspace") as ctx:
        config_selector = ConfigurationFileSelector(ctx)
        # compare tarball-relative paths only, absolute paths depend on the test location
        relative_paths = {rel_path.as_posix() for _, rel_path in config_selector}
        # In this workspace case only .config.toml files exist
        expected_paths = {"config.toml", "dev.config.toml"}
        assert relative_paths == expected_paths

View File

@@ -2,11 +2,10 @@ import os
import tarfile
import yaml
from io import BytesIO
from pathlib import Path
import time
from dlt._workspace.deployment.package_builder import (
DeploymentPackageBuilder,
PackageBuilder,
DEFAULT_DEPLOYMENT_FILES_FOLDER,
DEFAULT_MANIFEST_FILE_NAME,
)
@@ -20,7 +19,7 @@ def test_write_package_to_stream() -> None:
"""Test building deployment package to a stream and verify structure."""
with isolated_workspace("default") as ctx:
builder = DeploymentPackageBuilder(ctx)
builder = PackageBuilder(ctx)
selector = WorkspaceFileSelector(ctx, ignore_file=".ignorefile")
stream = BytesIO()
@@ -66,7 +65,7 @@ def test_build_package() -> None:
"""Test that deployment packages are content-addressable with reproducible hashes."""
with isolated_workspace("default") as ctx:
builder = DeploymentPackageBuilder(ctx)
builder = PackageBuilder(ctx)
selector = WorkspaceFileSelector(ctx)
package_path, content_hash = builder.build_package(selector)
@@ -86,7 +85,7 @@ def test_build_package() -> None:
def test_manifest_files_are_sorted() -> None:
"""Test that hash is independent of file iteration order."""
with isolated_workspace("default") as ctx:
builder = DeploymentPackageBuilder(ctx)
builder = PackageBuilder(ctx)
selector = WorkspaceFileSelector(ctx)
hash1 = builder.write_package_to_stream(selector, BytesIO())