mirror of
https://github.com/dlt-hub/dlt.git
synced 2025-12-17 19:31:30 +00:00
(fix) use sparse checkout for dlt init dlthub (#3356)
* adds option to sparse checkout repo
* use sparse checkout for llm context
* fixes sqlglot from find
* adds checkout after sparse clone
* explains unknown path tests
This commit is contained in:
@@ -119,7 +119,9 @@ def vibe_source_setup(
|
||||
"""Copies files from vibe sources repo into the current working folder"""
|
||||
|
||||
fmt.echo("Looking up in dltHub for rules, docs and snippets for %s..." % fmt.bold(source))
|
||||
src_storage = git.get_fresh_repo_files(location, get_dlt_repos_dir(), branch=branch)
|
||||
src_storage = git.get_fresh_repo_files(
|
||||
location, get_dlt_repos_dir(), branch=branch, path=source
|
||||
)
|
||||
if not src_storage.has_folder(source):
|
||||
fmt.warning("We have nothing for %s at dltHub yet." % fmt.bold(source))
|
||||
return
|
||||
|
||||
@@ -70,14 +70,37 @@ def get_default_branch(repo: Repo) -> str:
|
||||
|
||||
|
||||
def ensure_remote_head(
|
||||
repo_path: str, branch: Optional[str] = None, with_git_command: Optional[str] = None
|
||||
repo_path: str,
|
||||
branch: Optional[str] = None,
|
||||
with_git_command: Optional[str] = None,
|
||||
path: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Updates repository from origin and ensures it's clean and synced.
|
||||
|
||||
Uses sparse checkout when path is specified, fetching only the specified
|
||||
directory instead of the entire repository tree.
|
||||
|
||||
Args:
|
||||
repo_path: Local path to the git repository.
|
||||
branch: Branch to checkout. Defaults to repository's default branch.
|
||||
with_git_command: Custom GIT_SSH_COMMAND for authentication.
|
||||
path: Directory path for sparse checkout. When set, only this path
|
||||
is checked out, reducing download size and time.
|
||||
|
||||
Raises:
|
||||
RepositoryDirtyError: If repository has uncommitted changes or is
|
||||
not synced with origin.
|
||||
"""
|
||||
from git import Repo, RepositoryDirtyError
|
||||
|
||||
# update remotes and check if heads are same. ignores locally modified files
|
||||
with Repo(repo_path) as repo:
|
||||
# use custom environment if specified
|
||||
with repo.git.custom_environment(GIT_SSH_COMMAND=with_git_command):
|
||||
# if path is set, use sparse checkout
|
||||
if path is not None:
|
||||
# assume that sparse checkout was enabled when cloning
|
||||
repo.git.sparse_checkout("set", path)
|
||||
# checkout branch before fetching
|
||||
repo.git.checkout(branch or get_default_branch(repo))
|
||||
# update origin
|
||||
@@ -92,14 +115,35 @@ def clone_repo(
|
||||
clone_path: str,
|
||||
branch: Optional[str] = None,
|
||||
with_git_command: Optional[str] = None,
|
||||
path: Optional[str] = None,
|
||||
) -> Repo:
|
||||
_import_git()
|
||||
|
||||
from git import Repo
|
||||
|
||||
repo = Repo.clone_from(repository_url, clone_path, env=dict(GIT_SSH_COMMAND=with_git_command))
|
||||
if path is not None:
|
||||
# set of options that prevents downloading all blobs
|
||||
multi_options = [
|
||||
"--depth=1",
|
||||
"--filter=blob:none",
|
||||
"--no-checkout",
|
||||
]
|
||||
else:
|
||||
multi_options = None
|
||||
repo = Repo.clone_from(
|
||||
repository_url,
|
||||
clone_path,
|
||||
env=dict(GIT_SSH_COMMAND=with_git_command),
|
||||
multi_options=multi_options,
|
||||
)
|
||||
# set up sparse mode to checkout paths on demand
|
||||
if path is not None:
|
||||
repo.git.sparse_checkout("init", "--cone")
|
||||
repo.git.sparse_checkout("set", path)
|
||||
if branch:
|
||||
repo.git.checkout(branch)
|
||||
elif path is not None:
|
||||
repo.git.checkout()
|
||||
return repo
|
||||
|
||||
|
||||
@@ -109,8 +153,23 @@ def force_clone_repo(
|
||||
repo_name: str,
|
||||
branch: Optional[str] = None,
|
||||
with_git_command: Optional[str] = None,
|
||||
path: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Deletes the working directory repo_storage.root/repo_name and clones the `repo_url` into it. Will checkout `branch` if provided"""
|
||||
"""Deletes existing repository and performs fresh clone.
|
||||
|
||||
Removes repo_storage.root/repo_name if it exists, then clones from
|
||||
repo_url. Uses sparse checkout when path is specified to download only
|
||||
the specified directory, reducing clone time and disk usage.
|
||||
|
||||
Args:
|
||||
repo_url: Git repository URL to clone from.
|
||||
repo_storage: FileStorage instance managing the clone destination.
|
||||
repo_name: Directory name for the cloned repository.
|
||||
branch: Branch to checkout after cloning.
|
||||
with_git_command: Custom GIT_SSH_COMMAND for authentication.
|
||||
path: Directory path for sparse checkout. When set, only this path
|
||||
is cloned using --filter=blob:none and --depth=1.
|
||||
"""
|
||||
try:
|
||||
# delete repo folder
|
||||
if repo_storage.has_folder(repo_name):
|
||||
@@ -120,6 +179,7 @@ def force_clone_repo(
|
||||
repo_storage.make_full_path(repo_name),
|
||||
branch=branch,
|
||||
with_git_command=with_git_command,
|
||||
path=path,
|
||||
).close()
|
||||
except Exception:
|
||||
# delete folder so we start clean next time
|
||||
@@ -133,8 +193,25 @@ def get_fresh_repo_files(
|
||||
working_dir: str = None,
|
||||
branch: Optional[str] = None,
|
||||
with_git_command: Optional[str] = None,
|
||||
path: Optional[str] = None,
|
||||
) -> FileStorage:
|
||||
"""Returns a file storage leading to the newest repository files. If `repo_location` is url, file will be checked out into `working_dir/repo_name`"""
|
||||
"""Returns FileStorage with up-to-date repository files.
|
||||
|
||||
If repo_location is a local directory, returns storage pointing to it.
|
||||
If it's a git URL, clones or updates the repository in working_dir/repo_name.
|
||||
Supports sparse checkout to fetch only a specific directory path.
|
||||
|
||||
Args:
|
||||
repo_location: Local directory path or git repository URL.
|
||||
working_dir: Directory where repository will be cloned if repo_location is URL.
|
||||
branch: Branch to checkout.
|
||||
with_git_command: Custom GIT_SSH_COMMAND for authentication.
|
||||
path: Directory path for sparse checkout. Downloads only this directory,
|
||||
improving performance for large repositories.
|
||||
|
||||
Returns:
|
||||
FileStorage instance pointing to repository files (or specified path within).
|
||||
"""
|
||||
from git import GitError
|
||||
|
||||
url = giturlparse.parse(repo_location, check_domain=False)
|
||||
@@ -146,7 +223,9 @@ def get_fresh_repo_files(
|
||||
repo_name = url.name
|
||||
repo_path = os.path.join(working_dir, repo_name)
|
||||
try:
|
||||
ensure_remote_head(repo_path, branch=branch, with_git_command=with_git_command)
|
||||
ensure_remote_head(
|
||||
repo_path, branch=branch, with_git_command=with_git_command, path=path
|
||||
)
|
||||
except GitError:
|
||||
force_clone_repo(
|
||||
repo_location,
|
||||
@@ -154,6 +233,7 @@ def get_fresh_repo_files(
|
||||
repo_name,
|
||||
branch=branch,
|
||||
with_git_command=with_git_command,
|
||||
path=path,
|
||||
)
|
||||
return FileStorage(repo_path)
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ AWESOME_REPO = "https://github.com/sindresorhus/awesome.git"
|
||||
JAFFLE_SHOP_REPO = "https://github.com/dbt-labs/jaffle_shop.git"
|
||||
PRIVATE_REPO = "git@github.com:scale-vector/rasa_bot_experiments.git"
|
||||
PRIVATE_REPO_WITH_ACCESS = "git@github.com:scale-vector/test_private_repo.git"
|
||||
CONTEXT_REPO = "https://github.com/dlt-hub/vibe-hub.git"
|
||||
|
||||
|
||||
def test_ssh_key_context() -> None:
|
||||
@@ -116,6 +117,32 @@ def test_fresh_repo_files_branch_change(test_storage: FileStorage) -> None:
|
||||
assert is_clean_and_synced(repo)
|
||||
|
||||
|
||||
def test_sparse_checkout(test_storage: FileStorage) -> None:
    """Check that `get_fresh_repo_files` with `path` exposes only the requested folder.

    The same working directory is reused for three consecutive requests
    (`abbyy`, `stripe`, then a path that is expected not to exist) and after
    each one the top-level directories of the returned storage are counted.
    """
    workdir = test_storage.storage_path

    storage = get_fresh_repo_files(CONTEXT_REPO, workdir, path="abbyy")
    assert storage.has_folder("abbyy")
    # only abbyy present
    top_level_dirs = storage.list_folder_dirs(".")
    assert len(top_level_dirs) == 2  # .git abbyy
    # two files inside
    abbyy_files = storage.list_folder_files("abbyy")
    assert len(abbyy_files) == 2

    # checkout the other one
    storage = get_fresh_repo_files(CONTEXT_REPO, workdir, path="stripe")
    assert storage.has_folder("stripe")
    top_level_dirs = storage.list_folder_dirs(".")
    assert len(top_level_dirs) == 2  # .git stripe

    # unknown path in case repo is already cloned and checkout was done
    storage = get_fresh_repo_files(CONTEXT_REPO, workdir, path="__unknown")
    assert not storage.has_folder("__unknown")
    top_level_dirs = storage.list_folder_dirs(".")
    assert len(top_level_dirs) == 1  # .git
|
||||
|
||||
|
||||
def test_sparse_checkout_path_not_exist_on_clone(test_storage: FileStorage) -> None:
    """Check the result of requesting an unknown sparse path on the first fetch.

    The returned storage must not contain the requested folder and must list
    exactly one top-level directory (expected to be `.git`).
    """
    # unknown path before first checkout
    storage = get_fresh_repo_files(
        CONTEXT_REPO, test_storage.storage_path, path="__unknown"
    )
    assert not storage.has_folder("__unknown")
    top_level_dirs = storage.list_folder_dirs(".")
    assert len(top_level_dirs) == 1  # .git
|
||||
|
||||
|
||||
def test_fresh_repo_files_branch_change_to_default(test_storage: FileStorage) -> None:
|
||||
repo_storage = get_fresh_repo_files(AWESOME_REPO, test_storage.storage_path, branch="gh-pages")
|
||||
with get_repo(repo_storage.storage_path) as repo:
|
||||
|
||||
Reference in New Issue
Block a user