Revert "optimize name matches"

This reverts commit 087f8167ec.
This commit is contained in:
Emily Rockman
2025-12-01 13:27:02 -05:00
parent d096a6776e
commit 148b9b41a5
2 changed files with 15 additions and 159 deletions

View File

@@ -123,14 +123,8 @@ class DepsTask(BaseTask):
This method is called only during `dbt deps --add-package` to check if the package
being added already exists in packages.yml. It uses substring matching to identify
duplicates, which means it will match across different package sources. For example,
adding a hub package "dbt-labs/dbt_utils" will remove an existing git package
"https://github.com/dbt-labs/dbt-utils.git" since both contain "dbt_utils" or "dbt-utils".
The matching is flexible to handle both underscore and hyphen variants of package names,
as git repos often use hyphens (dbt-utils) while package names use underscores (dbt_utils).
Word boundaries (/, .) are enforced to prevent false matches like "dbt-core" matching
"dbt-core-utils".
duplicates, checking if the package name appears within package identifiers (such as
within git URLs, hub package names, or local paths).
Args:
packages_yml (dict): In-memory read of `packages.yml` contents
@@ -138,30 +132,6 @@ class DepsTask(BaseTask):
Returns:
dict: Updated packages_yml contents with matching packages removed
"""
# Extract the package name for matching
package_name = self.args.add_package["name"]
# Create variants for flexible matching (handle _ vs -)
# Check multiple variants to handle naming inconsistencies between hub and git
package_name_parts = [
package_name, # Original: "dbt-labs/dbt_utils"
package_name.replace("_", "-"), # Hyphens: "dbt-labs/dbt-utils"
package_name.replace("-", "_"), # Underscores: "dbt_labs/dbt_utils"
]
# Extract just the package name without org (after last /)
if "/" in package_name:
short_name = package_name.split("/")[-1]
package_name_parts.extend(
[
short_name, # "dbt_utils"
short_name.replace("_", "-"), # "dbt-utils"
short_name.replace("-", "_"), # "dbt_utils" (deduplicated)
]
)
# Remove duplicates from package_name_parts
package_name_parts = list(set(package_name_parts))
# Iterate backwards to safely delete items without index shifting issues
for i in range(len(packages_yml["packages"]) - 1, -1, -1):
pkg_entry = packages_yml["packages"][i]
@@ -176,40 +146,19 @@ class DepsTask(BaseTask):
or pkg_entry.get("private") # private package
)
# Check if any variant of the package name appears in the identifier
# Use word boundaries to avoid false matches (e.g., "dbt-core" shouldn't match "dbt-core-utils")
# Word boundaries are: start/end of string, /, or .
# Note: - and _ are NOT boundaries since they're used within compound package names
if package_identifier:
is_duplicate = False
for name_variant in package_name_parts:
if name_variant in package_identifier:
# Found a match, now verify it's not a substring of a larger word
# Check characters before and after the match
idx = package_identifier.find(name_variant)
start_ok = idx == 0 or package_identifier[idx - 1] in "/."
end_idx = idx + len(name_variant)
end_ok = (
end_idx == len(package_identifier)
or package_identifier[end_idx] in "/."
)
if start_ok and end_ok:
is_duplicate = True
break
if is_duplicate:
del packages_yml["packages"][i]
# Filter out non-string values (like warn-unpinned boolean) before logging
# Note: Check for bool first since bool is a subclass of int in Python
loggable_package = {
k: v
for k, v in pkg_entry.items()
if not isinstance(v, bool)
and isinstance(v, (str, int, float))
and k != "unrendered"
}
fire_event(DepsFoundDuplicatePackage(removed_package=loggable_package))
# Check if package name appears in the identifier using substring match
if package_identifier and self.args.add_package["name"] in package_identifier:
del packages_yml["packages"][i]
# Filter out non-string values (like warn-unpinned boolean) before logging
# Note: Check for bool first since bool is a subclass of int in Python
loggable_package = {
k: v
for k, v in pkg_entry.items()
if not isinstance(v, bool)
and isinstance(v, (str, int, float))
and k != "unrendered"
}
fire_event(DepsFoundDuplicatePackage(removed_package=loggable_package))
return packages_yml

View File

@@ -1133,96 +1133,3 @@ class TestCheckForDuplicatePackagesWithBooleans(unittest.TestCase):
self.assertIsNotNone(result)
self.assertEqual(len(result["packages"]), 1)
self.assertIn("dbt_amplitude", result["packages"][0]["git"])
def test_check_duplicate_underscore_hyphen_matching(self):
"""Test that underscore and hyphen variants match (dbt_utils matches dbt-utils)"""
# Adding hub package with underscore should match git package with hyphen
mock_args = Namespace(
add_package={"name": "dbt-labs/dbt_utils", "version": "1.0.0"}, source="hub"
)
with mock.patch("dbt.task.deps.BaseTask.__init__"):
task = DepsTask.__new__(DepsTask)
task.args = mock_args
packages_yml = {
"packages": [
{
"git": "https://github.com/dbt-labs/dbt-utils.git", # hyphen in URL
"revision": "1.0.0",
},
]
}
# Should match because "dbt-utils" variant matches the git URL
with mock.patch("dbt_common.events.functions.fire_event"):
result = task.check_for_duplicate_packages(packages_yml)
self.assertIsNotNone(result)
self.assertEqual(len(result["packages"]), 0) # Git package removed
def test_check_duplicate_no_partial_word_match(self):
"""Test that partial word matches are rejected (dbt-core shouldn't match dbt-core-utils)"""
mock_args = Namespace(
add_package={"name": "dbt-labs/dbt-core", "version": "1.0.0"}, source="hub"
)
with mock.patch("dbt.task.deps.BaseTask.__init__"):
task = DepsTask.__new__(DepsTask)
task.args = mock_args
packages_yml = {
"packages": [
{
"git": "https://github.com/dbt-labs/dbt-core-utils.git",
"revision": "1.0.0",
},
{
"package": "other-org/my-dbt-core-fork",
"version": "2.0.0",
},
]
}
# Should NOT match because "dbt-core" is part of a larger word
with mock.patch("dbt_common.events.functions.fire_event"):
result = task.check_for_duplicate_packages(packages_yml)
# Both packages should remain (no matches)
self.assertIsNotNone(result)
self.assertEqual(len(result["packages"]), 2)
def test_check_duplicate_exact_word_boundary_match(self):
"""Test that exact matches with word boundaries work correctly"""
mock_args = Namespace(
add_package={"name": "dbt-labs/dbt-utils", "version": "1.0.0"}, source="hub"
)
with mock.patch("dbt.task.deps.BaseTask.__init__"):
task = DepsTask.__new__(DepsTask)
task.args = mock_args
packages_yml = {
"packages": [
{
"git": "https://github.com/dbt-labs/dbt-utils.git", # Should match
"revision": "1.0.0",
},
{
"git": "https://github.com/other/dbt-utils-extra.git", # Should NOT match
"revision": "2.0.0",
},
{
"package": "dbt-labs/dbt_utils", # Should match (underscore variant)
"version": "0.9.0",
},
]
}
with mock.patch("dbt_common.events.functions.fire_event"):
result = task.check_for_duplicate_packages(packages_yml)
# Only dbt-utils-extra should remain
self.assertIsNotNone(result)
self.assertEqual(len(result["packages"]), 1)
self.assertIn("dbt-utils-extra", result["packages"][0]["git"])