mirror of
https://github.com/dbt-labs/dbt-core
synced 2025-12-17 19:31:34 +00:00
optimize name matches
This commit is contained in:
@@ -123,8 +123,14 @@ class DepsTask(BaseTask):
|
||||
|
||||
This method is called only during `dbt deps --add-package` to check if the package
|
||||
being added already exists in packages.yml. It uses substring matching to identify
|
||||
duplicates, checking if the package name appears within package identifiers (such as
|
||||
within git URLs, hub package names, or local paths).
|
||||
duplicates, which means it will match across different package sources. For example,
|
||||
adding a hub package "dbt-labs/dbt_utils" will remove an existing git package
|
||||
"https://github.com/dbt-labs/dbt-utils.git" since both contain "dbt_utils" or "dbt-utils".
|
||||
|
||||
The matching is flexible to handle both underscore and hyphen variants of package names,
|
||||
as git repos often use hyphens (dbt-utils) while package names use underscores (dbt_utils).
|
||||
Word boundaries (/, .) are enforced to prevent false matches like "dbt-core" matching
|
||||
"dbt-core-utils".
|
||||
|
||||
Args:
|
||||
packages_yml (dict): In-memory read of `packages.yml` contents
|
||||
@@ -132,6 +138,30 @@ class DepsTask(BaseTask):
|
||||
Returns:
|
||||
dict: Updated packages_yml contents with matching packages removed
|
||||
"""
|
||||
# Extract the package name for matching
|
||||
package_name = self.args.add_package["name"]
|
||||
|
||||
# Create variants for flexible matching (handle _ vs -)
|
||||
# Check multiple variants to handle naming inconsistencies between hub and git
|
||||
package_name_parts = [
|
||||
package_name, # Original: "dbt-labs/dbt_utils"
|
||||
package_name.replace("_", "-"), # Hyphens: "dbt-labs/dbt-utils"
|
||||
package_name.replace("-", "_"), # Underscores: "dbt_labs/dbt_utils"
|
||||
]
|
||||
# Extract just the package name without org (after last /)
|
||||
if "/" in package_name:
|
||||
short_name = package_name.split("/")[-1]
|
||||
package_name_parts.extend(
|
||||
[
|
||||
short_name, # "dbt_utils"
|
||||
short_name.replace("_", "-"), # "dbt-utils"
|
||||
short_name.replace("-", "_"), # "dbt_utils" (deduplicated)
|
||||
]
|
||||
)
|
||||
|
||||
# Remove duplicates from package_name_parts
|
||||
package_name_parts = list(set(package_name_parts))
|
||||
|
||||
# Iterate backwards to safely delete items without index shifting issues
|
||||
for i in range(len(packages_yml["packages"]) - 1, -1, -1):
|
||||
pkg_entry = packages_yml["packages"][i]
|
||||
@@ -146,19 +176,40 @@ class DepsTask(BaseTask):
|
||||
or pkg_entry.get("private") # private package
|
||||
)
|
||||
|
||||
# Check if package name appears in the identifier using substring match
|
||||
if package_identifier and self.args.add_package["name"] in package_identifier:
|
||||
del packages_yml["packages"][i]
|
||||
# Filter out non-string values (like warn-unpinned boolean) before logging
|
||||
# Note: Check for bool first since bool is a subclass of int in Python
|
||||
loggable_package = {
|
||||
k: v
|
||||
for k, v in pkg_entry.items()
|
||||
if not isinstance(v, bool)
|
||||
and isinstance(v, (str, int, float))
|
||||
and k != "unrendered"
|
||||
}
|
||||
fire_event(DepsFoundDuplicatePackage(removed_package=loggable_package))
|
||||
# Check if any variant of the package name appears in the identifier
|
||||
# Use word boundaries to avoid false matches (e.g., "dbt-core" shouldn't match "dbt-core-utils")
|
||||
# Word boundaries are: start/end of string, /, or .
|
||||
# Note: - and _ are NOT boundaries since they're used within compound package names
|
||||
if package_identifier:
|
||||
is_duplicate = False
|
||||
for name_variant in package_name_parts:
|
||||
if name_variant in package_identifier:
|
||||
# Found a match, now verify it's not a substring of a larger word
|
||||
# Check characters before and after the match
|
||||
idx = package_identifier.find(name_variant)
|
||||
start_ok = idx == 0 or package_identifier[idx - 1] in "/."
|
||||
end_idx = idx + len(name_variant)
|
||||
end_ok = (
|
||||
end_idx == len(package_identifier)
|
||||
or package_identifier[end_idx] in "/."
|
||||
)
|
||||
|
||||
if start_ok and end_ok:
|
||||
is_duplicate = True
|
||||
break
|
||||
|
||||
if is_duplicate:
|
||||
del packages_yml["packages"][i]
|
||||
# Filter out non-string values (like warn-unpinned boolean) before logging
|
||||
# Note: Check for bool first since bool is a subclass of int in Python
|
||||
loggable_package = {
|
||||
k: v
|
||||
for k, v in pkg_entry.items()
|
||||
if not isinstance(v, bool)
|
||||
and isinstance(v, (str, int, float))
|
||||
and k != "unrendered"
|
||||
}
|
||||
fire_event(DepsFoundDuplicatePackage(removed_package=loggable_package))
|
||||
|
||||
return packages_yml
|
||||
|
||||
|
||||
@@ -1133,3 +1133,96 @@ class TestCheckForDuplicatePackagesWithBooleans(unittest.TestCase):
|
||||
self.assertIsNotNone(result)
|
||||
self.assertEqual(len(result["packages"]), 1)
|
||||
self.assertIn("dbt_amplitude", result["packages"][0]["git"])
|
||||
|
||||
def test_check_duplicate_underscore_hyphen_matching(self):
|
||||
"""Test that underscore and hyphen variants match (dbt_utils matches dbt-utils)"""
|
||||
# Adding hub package with underscore should match git package with hyphen
|
||||
mock_args = Namespace(
|
||||
add_package={"name": "dbt-labs/dbt_utils", "version": "1.0.0"}, source="hub"
|
||||
)
|
||||
|
||||
with mock.patch("dbt.task.deps.BaseTask.__init__"):
|
||||
task = DepsTask.__new__(DepsTask)
|
||||
task.args = mock_args
|
||||
|
||||
packages_yml = {
|
||||
"packages": [
|
||||
{
|
||||
"git": "https://github.com/dbt-labs/dbt-utils.git", # hyphen in URL
|
||||
"revision": "1.0.0",
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
# Should match because "dbt-utils" variant matches the git URL
|
||||
with mock.patch("dbt_common.events.functions.fire_event"):
|
||||
result = task.check_for_duplicate_packages(packages_yml)
|
||||
|
||||
self.assertIsNotNone(result)
|
||||
self.assertEqual(len(result["packages"]), 0) # Git package removed
|
||||
|
||||
def test_check_duplicate_no_partial_word_match(self):
|
||||
"""Test that partial word matches are rejected (dbt-core shouldn't match dbt-core-utils)"""
|
||||
mock_args = Namespace(
|
||||
add_package={"name": "dbt-labs/dbt-core", "version": "1.0.0"}, source="hub"
|
||||
)
|
||||
|
||||
with mock.patch("dbt.task.deps.BaseTask.__init__"):
|
||||
task = DepsTask.__new__(DepsTask)
|
||||
task.args = mock_args
|
||||
|
||||
packages_yml = {
|
||||
"packages": [
|
||||
{
|
||||
"git": "https://github.com/dbt-labs/dbt-core-utils.git",
|
||||
"revision": "1.0.0",
|
||||
},
|
||||
{
|
||||
"package": "other-org/my-dbt-core-fork",
|
||||
"version": "2.0.0",
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
# Should NOT match because "dbt-core" is part of a larger word
|
||||
with mock.patch("dbt_common.events.functions.fire_event"):
|
||||
result = task.check_for_duplicate_packages(packages_yml)
|
||||
|
||||
# Both packages should remain (no matches)
|
||||
self.assertIsNotNone(result)
|
||||
self.assertEqual(len(result["packages"]), 2)
|
||||
|
||||
def test_check_duplicate_exact_word_boundary_match(self):
|
||||
"""Test that exact matches with word boundaries work correctly"""
|
||||
mock_args = Namespace(
|
||||
add_package={"name": "dbt-labs/dbt-utils", "version": "1.0.0"}, source="hub"
|
||||
)
|
||||
|
||||
with mock.patch("dbt.task.deps.BaseTask.__init__"):
|
||||
task = DepsTask.__new__(DepsTask)
|
||||
task.args = mock_args
|
||||
|
||||
packages_yml = {
|
||||
"packages": [
|
||||
{
|
||||
"git": "https://github.com/dbt-labs/dbt-utils.git", # Should match
|
||||
"revision": "1.0.0",
|
||||
},
|
||||
{
|
||||
"git": "https://github.com/other/dbt-utils-extra.git", # Should NOT match
|
||||
"revision": "2.0.0",
|
||||
},
|
||||
{
|
||||
"package": "dbt-labs/dbt_utils", # Should match (underscore variant)
|
||||
"version": "0.9.0",
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
with mock.patch("dbt_common.events.functions.fire_event"):
|
||||
result = task.check_for_duplicate_packages(packages_yml)
|
||||
|
||||
# Only dbt-utils-extra should remain
|
||||
self.assertIsNotNone(result)
|
||||
self.assertEqual(len(result["packages"]), 1)
|
||||
self.assertIn("dbt-utils-extra", result["packages"][0]["git"])
|
||||
|
||||
Reference in New Issue
Block a user