(fix) use sparse checkout for dlt init dlthub (#3356)

* adds option to sparse checkout repo

* use sparse checkout for llm context

* fixes sqlglot from find

* adds checkout after sparse clone

* explains unknown path tests
This commit is contained in:
rudolfix
2025-11-22 12:15:40 +01:00
committed by GitHub
parent bbc1cb81cd
commit 5242790b13
3 changed files with 115 additions and 6 deletions

View File

@@ -119,7 +119,9 @@ def vibe_source_setup(
"""Copies files from vibe sources repo into the current working folder"""
fmt.echo("Looking up in dltHub for rules, docs and snippets for %s..." % fmt.bold(source))
src_storage = git.get_fresh_repo_files(location, get_dlt_repos_dir(), branch=branch)
src_storage = git.get_fresh_repo_files(
location, get_dlt_repos_dir(), branch=branch, path=source
)
if not src_storage.has_folder(source):
fmt.warning("We have nothing for %s at dltHub yet." % fmt.bold(source))
return

View File

@@ -70,14 +70,37 @@ def get_default_branch(repo: Repo) -> str:
def ensure_remote_head(
repo_path: str, branch: Optional[str] = None, with_git_command: Optional[str] = None
repo_path: str,
branch: Optional[str] = None,
with_git_command: Optional[str] = None,
path: Optional[str] = None,
) -> None:
"""Updates repository from origin and ensures it's clean and synced.
Uses sparse checkout when path is specified, fetching only the specified
directory instead of the entire repository tree.
Args:
repo_path: Local path to the git repository.
branch: Branch to checkout. Defaults to repository's default branch.
with_git_command: Custom GIT_SSH_COMMAND for authentication.
path: Directory path for sparse checkout. When set, only this path
is checked out, reducing download size and time.
Raises:
RepositoryDirtyError: If repository has uncommitted changes or is
not synced with origin.
"""
from git import Repo, RepositoryDirtyError
# update remotes and check if heads are same. ignores locally modified files
with Repo(repo_path) as repo:
# use custom environment if specified
with repo.git.custom_environment(GIT_SSH_COMMAND=with_git_command):
# if path is set, use sparse checkout
if path is not None:
# assume that sparse checkout was enabled when cloning
repo.git.sparse_checkout("set", path)
# checkout branch before fetching
repo.git.checkout(branch or get_default_branch(repo))
# update origin
@@ -92,14 +115,35 @@ def clone_repo(
clone_path: str,
branch: Optional[str] = None,
with_git_command: Optional[str] = None,
path: Optional[str] = None,
) -> Repo:
_import_git()
from git import Repo
repo = Repo.clone_from(repository_url, clone_path, env=dict(GIT_SSH_COMMAND=with_git_command))
if path is not None:
# set of options that prevents downloading all blobs
multi_options = [
"--depth=1",
"--filter=blob:none",
"--no-checkout",
]
else:
multi_options = None
repo = Repo.clone_from(
repository_url,
clone_path,
env=dict(GIT_SSH_COMMAND=with_git_command),
multi_options=multi_options,
)
# set up sparse mode to checkout paths on demand
if path is not None:
repo.git.sparse_checkout("init", "--cone")
repo.git.sparse_checkout("set", path)
if branch:
repo.git.checkout(branch)
elif path is not None:
repo.git.checkout()
return repo
@@ -109,8 +153,23 @@ def force_clone_repo(
repo_name: str,
branch: Optional[str] = None,
with_git_command: Optional[str] = None,
path: Optional[str] = None,
) -> None:
"""Deletes the working directory repo_storage.root/repo_name and clones the `repo_url` into it. Will checkout `branch` if provided"""
"""Deletes existing repository and performs fresh clone.
Removes repo_storage.root/repo_name if it exists, then clones from
repo_url. Uses sparse checkout when path is specified to download only
the specified directory, reducing clone time and disk usage.
Args:
repo_url: Git repository URL to clone from.
repo_storage: FileStorage instance managing the clone destination.
repo_name: Directory name for the cloned repository.
branch: Branch to checkout after cloning.
with_git_command: Custom GIT_SSH_COMMAND for authentication.
path: Directory path for sparse checkout. When set, only this path
is cloned using --filter=blob:none and --depth=1.
"""
try:
# delete repo folder
if repo_storage.has_folder(repo_name):
@@ -120,6 +179,7 @@ def force_clone_repo(
repo_storage.make_full_path(repo_name),
branch=branch,
with_git_command=with_git_command,
path=path,
).close()
except Exception:
# delete folder so we start clean next time
@@ -133,8 +193,25 @@ def get_fresh_repo_files(
working_dir: str = None,
branch: Optional[str] = None,
with_git_command: Optional[str] = None,
path: Optional[str] = None,
) -> FileStorage:
"""Returns a file storage leading to the newest repository files. If `repo_location` is url, file will be checked out into `working_dir/repo_name`"""
"""Returns FileStorage with up-to-date repository files.
If repo_location is a local directory, returns storage pointing to it.
If it's a git URL, clones or updates the repository in working_dir/repo_name.
Supports sparse checkout to fetch only a specific directory path.
Args:
repo_location: Local directory path or git repository URL.
working_dir: Directory where repository will be cloned if repo_location is URL.
branch: Branch to checkout.
with_git_command: Custom GIT_SSH_COMMAND for authentication.
path: Directory path for sparse checkout. Downloads only this directory,
improving performance for large repositories.
Returns:
FileStorage instance rooted at the repository directory. With sparse checkout, only `path` is checked out inside it.
"""
from git import GitError
url = giturlparse.parse(repo_location, check_domain=False)
@@ -146,7 +223,9 @@ def get_fresh_repo_files(
repo_name = url.name
repo_path = os.path.join(working_dir, repo_name)
try:
ensure_remote_head(repo_path, branch=branch, with_git_command=with_git_command)
ensure_remote_head(
repo_path, branch=branch, with_git_command=with_git_command, path=path
)
except GitError:
force_clone_repo(
repo_location,
@@ -154,6 +233,7 @@ def get_fresh_repo_files(
repo_name,
branch=branch,
with_git_command=with_git_command,
path=path,
)
return FileStorage(repo_path)

View File

@@ -21,6 +21,7 @@ AWESOME_REPO = "https://github.com/sindresorhus/awesome.git"
JAFFLE_SHOP_REPO = "https://github.com/dbt-labs/jaffle_shop.git"
PRIVATE_REPO = "git@github.com:scale-vector/rasa_bot_experiments.git"
PRIVATE_REPO_WITH_ACCESS = "git@github.com:scale-vector/test_private_repo.git"
CONTEXT_REPO = "https://github.com/dlt-hub/vibe-hub.git"
def test_ssh_key_context() -> None:
@@ -116,6 +117,32 @@ def test_fresh_repo_files_branch_change(test_storage: FileStorage) -> None:
assert is_clean_and_synced(repo)
def test_sparse_checkout(test_storage: FileStorage) -> None:
    """Check that sparse checkout exposes only the requested top-level folder.

    Fetches the context repo three times into the same working directory, each
    time with a different `path`, and verifies what is present at the repo root
    after every fetch: a known folder, a second known folder, and finally a
    path that does not exist in the repo.
    """
    working_dir = test_storage.storage_path

    # first fetch: the repo root holds just `.git` and `abbyy`
    abbyy_storage = get_fresh_repo_files(CONTEXT_REPO, working_dir, path="abbyy")
    assert abbyy_storage.has_folder("abbyy")
    root_dirs = abbyy_storage.list_folder_dirs(".")
    assert len(root_dirs) == 2
    # the `abbyy` folder itself holds two files
    abbyy_files = abbyy_storage.list_folder_files("abbyy")
    assert len(abbyy_files) == 2

    # second fetch with another path: `stripe` takes the place of `abbyy`
    stripe_storage = get_fresh_repo_files(CONTEXT_REPO, working_dir, path="stripe")
    assert stripe_storage.has_folder("stripe")
    root_dirs = stripe_storage.list_folder_dirs(".")
    assert len(root_dirs) == 2

    # third fetch asks for a path that is not in the repo, after the repo was
    # already cloned and checked out: only `.git` is left at the root
    unknown_storage = get_fresh_repo_files(CONTEXT_REPO, working_dir, path="__unknown")
    assert not unknown_storage.has_folder("__unknown")
    root_dirs = unknown_storage.list_folder_dirs(".")
    assert len(root_dirs) == 1
def test_sparse_checkout_path_not_exist_on_clone(test_storage: FileStorage) -> None:
    """Check sparse checkout of a missing path when nothing was fetched before.

    Requests a path that is not in the repo as the very first fetch into the
    working directory and verifies that only `.git` exists at the repo root.
    """
    missing_path_storage = get_fresh_repo_files(
        CONTEXT_REPO, test_storage.storage_path, path="__unknown"
    )
    assert not missing_path_storage.has_folder("__unknown")
    # nothing besides `.git` at the repo root
    root_dirs = missing_path_storage.list_folder_dirs(".")
    assert len(root_dirs) == 1
def test_fresh_repo_files_branch_change_to_default(test_storage: FileStorage) -> None:
repo_storage = get_fresh_repo_files(AWESOME_REPO, test_storage.storage_path, branch="gh-pages")
with get_repo(repo_storage.storage_path) as repo: