Compare commits

...

2 Commits

Author SHA1 Message Date
Jeremy Cohen
20be92525a Update MB -> MiB in functional test 2024-12-24 17:44:51 +01:00
Noah Holm
704120cf3f Reapply changes from #7125
Co-authored by: Noah Holm <32292420+noppaz@users.noreply.github.com>
Co-authored by: Jeremy Cohen <jeremy@dbtlabs.com>
2024-12-24 17:13:17 +01:00
12 changed files with 63 additions and 21 deletions

View File

@@ -0,0 +1,6 @@
kind: Features
body: Make MAXIMUM_SEED_SIZE_MIB configurable
time: 2023-03-07T13:48:38.792321024Z
custom:
Author: noppaz acurtis-evi
Issue: 7117 7124

View File

@@ -3,6 +3,7 @@ from dataclasses import dataclass
from typing import List, Optional
from dbt.artifacts.resources.types import NodeType
from dbt_common.clients.system import convert_path
from dbt_common.dataclass_schema import dbtClassMixin
@@ -60,6 +61,27 @@ class FileHash(dbtClassMixin):
checksum = hashlib.new(name, data).hexdigest()
return cls(name=name, checksum=checksum)
@classmethod
def from_path(cls, path: str, name="sha256") -> "FileHash":
    """Create a file hash by streaming the file at the given path.

    The file is decoded as UTF-8 and read in fixed-size chunks so the whole
    file never has to be held in memory at once. Leading and trailing
    whitespace of the file as a whole is excluded from the hash, so the
    checksum equals the hash of the UTF-8 encoding of the stripped
    contents, which is what `FileHash.from_contents` is given for seeds.

    :param path: Location of the file to hash; passed through
        `convert_path` before opening.
    :param name: Name of the `hashlib` algorithm to use.
    :return: A `FileHash` carrying the algorithm name and hex digest.
    """
    path = convert_path(path)
    chunk_size = 1 * 1024 * 1024
    file_hash = hashlib.new(name)
    # Whitespace seen since the last non-whitespace character. It is only
    # fed to the hash once more content follows it, so whitespace at the
    # very end of the file is dropped even when it straddles (or spans
    # several) chunk boundaries.
    pending_whitespace = ""
    seen_content = False
    # Decode explicitly as UTF-8 rather than the platform default so the
    # checksum is the same on every machine.
    # NOTE(review): text mode translates "\r\n" to "\n" while reading;
    # confirm the in-memory hashing path reads seed files the same way.
    with open(path, "r", encoding="utf-8") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), ""):
            if not seen_content:
                # Keep skipping until real content shows up: the leading
                # whitespace may be longer than a single chunk.
                chunk = chunk.lstrip()
                if not chunk:
                    continue
                seen_content = True
            body = chunk.rstrip()
            if body:
                file_hash.update((pending_whitespace + body).encode("utf-8"))
                pending_whitespace = chunk[len(body):]
            else:
                # Whitespace-only chunk: hold it back until we know whether
                # more content follows.
                pending_whitespace += chunk
    return cls(name=name, checksum=file_hash.hexdigest())
@dataclass
class Docs(dbtClassMixin):

View File

@@ -118,6 +118,7 @@ def global_flags(func):
@p.log_level_file
@p.log_path
@p.macro_debugging
@p.maximum_seed_size_mib
@p.partial_parse
@p.partial_parse_file_path
@p.partial_parse_file_diff

View File

@@ -167,6 +167,14 @@ indirect_selection = click.option(
default="eager",
)
# CLI option for the seed-size threshold, expressed in MiB, that decides
# whether a seed file is hashed for state comparison. The value can also be
# supplied through the DBT_MAXIMUM_SEED_SIZE_MIB environment variable and
# defaults to 1.
# NOTE: the seed loading code in this change treats a value of 0 as
# "no limit".
maximum_seed_size_mib = click.option(
    "--maximum-seed-size-mib",
    envvar="DBT_MAXIMUM_SEED_SIZE_MIB",
    help="Specify max size (MiB) for seed files that will be hashed for state comparison.",
    type=click.INT,
    default=1,
)
lock = click.option(
"--lock",
envvar=None,

View File

@@ -4,9 +4,6 @@ DEFAULT_ENV_PLACEHOLDER = "DBT_DEFAULT_PLACEHOLDER"
SECRET_PLACEHOLDER = "$$$DBT_SECRET_START$$${}$$$DBT_SECRET_END$$$"
MAXIMUM_SEED_SIZE = 1 * 1024 * 1024
MAXIMUM_SEED_SIZE_NAME = "1MB"
PIN_PACKAGE_URL = (
"https://docs.getdbt.com/docs/package-management#section-specifying-package-versions"
)

View File

@@ -5,7 +5,6 @@ from typing import Any, Dict, List, Optional, Union
from mashumaro.types import SerializableType
from dbt.artifacts.resources.base import FileHash
from dbt.constants import MAXIMUM_SEED_SIZE
from dbt_common.dataclass_schema import StrEnum, dbtClassMixin
from .util import SourceKey
@@ -65,9 +64,8 @@ class FilePath(dbtClassMixin):
def original_file_path(self) -> str:
return os.path.join(self.searched_path, self.relative_path)
def seed_too_large(self) -> bool:
"""Return whether the file this represents is over the seed size limit"""
return os.stat(self.full_path).st_size > MAXIMUM_SEED_SIZE
def file_size(self) -> int:
    """Return the size, in bytes, of the file at `self.full_path`."""
    size_in_bytes = os.path.getsize(self.full_path)
    return size_in_bytes
@dataclass

View File

@@ -324,6 +324,7 @@ class ProjectFlags(ExtensibleDbtClassMixin):
log_format_file: Optional[str] = None
log_level: Optional[str] = None
log_level_file: Optional[str] = None
maximum_seed_size_mib: Optional[int] = None
partial_parse: Optional[bool] = None
populate_cache: Optional[bool] = None
printer_width: Optional[int] = None

View File

@@ -1,6 +1,6 @@
import json
from dbt.constants import MAXIMUM_SEED_SIZE_NAME, PIN_PACKAGE_URL
from dbt.constants import PIN_PACKAGE_URL
from dbt.events.base_types import (
DebugLevel,
DynamicLevel,
@@ -8,6 +8,7 @@ from dbt.events.base_types import (
InfoLevel,
WarnLevel,
)
from dbt.flags import get_flags
from dbt_common.events.base_types import EventLevel
from dbt_common.events.format import (
format_fancy_output_line,
@@ -675,10 +676,11 @@ class SeedIncreased(WarnLevel):
return "I052"
def message(self) -> str:
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was "
f"<={MAXIMUM_SEED_SIZE_NAME}, so it has changed"
f">{maximum_seed_size_name} in size. The previous file was "
f"<={maximum_seed_size_name}, so it has changed"
)
return msg
@@ -688,9 +690,10 @@ class SeedExceedsLimitSamePath(WarnLevel):
return "I053"
def message(self) -> str:
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size at the same path, dbt "
f">{maximum_seed_size_name} in size at the same path, dbt "
f"cannot tell if it has changed: assuming they are the same"
)
return msg
@@ -701,9 +704,10 @@ class SeedExceedsLimitAndPathChanged(WarnLevel):
return "I054"
def message(self) -> str:
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was in "
f">{maximum_seed_size_name} in size. The previous file was in "
f"a different location, assuming it has changed"
)
return msg
@@ -714,9 +718,10 @@ class SeedExceedsLimitChecksumChanged(WarnLevel):
return "I055"
def message(self) -> str:
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file had a "
f">{maximum_seed_size_name} in size. The previous file had a "
f"checksum type of {self.checksum_name}, so it has changed"
)
return msg

View File

@@ -69,6 +69,7 @@ def get_flag_dict():
"log_path",
"invocation_command",
"empty",
"maximum_seed_size_mib",
}
return {key: getattr(GLOBAL_FLAGS, key.upper(), None) for key in flag_attr}

View File

@@ -17,6 +17,7 @@ from dbt.contracts.files import (
)
from dbt.events.types import InputFileDiffError
from dbt.exceptions import ParsingError
from dbt.flags import get_flags
from dbt.parser.common import schema_file_keys
from dbt.parser.schemas import yaml_from_file
from dbt.parser.search import filesystem_search
@@ -123,12 +124,14 @@ def validate_yaml(file_path, dct):
# Special processing for big seed files
def load_seed_source_file(match: FilePath, project_name) -> SourceFile:
if match.seed_too_large():
# Users can configure the maximum seed size (MiB) that will be hashed for state comparison
maximum_seed_size = get_flags().MAXIMUM_SEED_SIZE_MIB * 1024 * 1024
# maximum_seed_size = 0 means no limit
if match.file_size() > maximum_seed_size and maximum_seed_size != 0:
# We don't want to calculate a hash of this file. Use the path.
source_file = SourceFile.big_seed(match)
else:
file_contents = load_file_contents(match.absolute_path, strip=True)
checksum = FileHash.from_contents(file_contents)
checksum = FileHash.from_path(match.absolute_path)
source_file = SourceFile(path=match, checksum=checksum)
source_file.contents = ""
source_file.parse_file_type = ParseFileType.Seed

View File

@@ -228,7 +228,7 @@ class TestChangedSeedContents(BaseModifiedState):
"./state",
]
)
assert ">1MB" in str(exc.value)
assert ">1MiB" in str(exc.value)
# now check if unmodified returns none
results = run_dbt(

View File

@@ -780,7 +780,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedExceedsLimitSamePath"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
assert not search_manifest_using_method(manifest, method, "new")
warn_or_error_patch.assert_not_called()
@@ -793,7 +793,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedExceedsLimitSamePath"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state, seed):
@@ -807,7 +807,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedIncreased"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
assert not search_manifest_using_method(manifest, method, "new")
warn_or_error_patch.assert_not_called()
@@ -820,7 +820,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedIncreased"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
def test_select_state_changed_seed_checksum_path_to_sha(manifest, previous_state, seed):