mirror of
https://github.com/dbt-labs/dbt-core
synced 2025-12-20 18:01:27 +00:00
Compare commits
2 Commits
enable-pos
...
jerco/redo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
20be92525a | ||
|
|
704120cf3f |
6
.changes/unreleased/Features-20230307-134838.yaml
Normal file
6
.changes/unreleased/Features-20230307-134838.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
kind: Features
|
||||
body: Make MAXIMUM_SEED_SIZE_MIB configurable
|
||||
time: 2023-03-07T13:48:38.792321024Z
|
||||
custom:
|
||||
Author: noppaz acurtis-evi
|
||||
Issue: 7117 7124
|
||||
@@ -3,6 +3,7 @@ from dataclasses import dataclass
|
||||
from typing import List, Optional
|
||||
|
||||
from dbt.artifacts.resources.types import NodeType
|
||||
from dbt_common.clients.system import convert_path
|
||||
from dbt_common.dataclass_schema import dbtClassMixin
|
||||
|
||||
|
||||
@@ -60,6 +61,27 @@ class FileHash(dbtClassMixin):
|
||||
checksum = hashlib.new(name, data).hexdigest()
|
||||
return cls(name=name, checksum=checksum)
|
||||
|
||||
@classmethod
def from_path(cls, path: str, name="sha256") -> "FileHash":
    """Create a file hash by streaming the file at the given path.

    The file is decoded as UTF-8, leading and trailing whitespace of the
    *whole* contents is dropped, and the UTF-8 encoding of what remains is
    hashed. The intent is to give the same checksum as hashing the stripped
    contents in one go (see `FileHash.from_contents`) without holding the
    entire file in memory.

    Args:
        path: Location of the file to hash.
        name: Hash algorithm name passed to `hashlib.new`.

    Returns:
        A `FileHash` carrying `name` and the hex digest.
    """
    # presumably normalizes the path for the host OS; verify against dbt_common
    path = convert_path(path)
    chunk_size = 1 * 1024 * 1024
    file_hash = hashlib.new(name)
    # Decode explicitly as UTF-8 so the digest does not depend on the
    # platform's default encoding.
    # NOTE(review): text mode still applies universal-newline translation;
    # confirm that this matches how the contents are read for from_contents.
    with open(path, "r", encoding="utf-8") as handle:
        seen_content = False
        # Whitespace at the end of what has been read so far. It is only
        # hashed once later non-whitespace proves it is not trailing.
        held_back = ""
        for chunk in iter(lambda: handle.read(chunk_size), ""):
            if not seen_content:
                # Leading whitespace may span several chunks; keep skipping
                # until the first chunk with real content.
                chunk = chunk.lstrip()
                if not chunk:
                    continue
                seen_content = True
            text = held_back + chunk
            body = text.rstrip()
            held_back = text[len(body):]
            file_hash.update(body.encode("utf-8"))
    # Whatever is still held back is trailing whitespace and is dropped.
    return cls(name=name, checksum=file_hash.hexdigest())
|
||||
|
||||
|
||||
@dataclass
|
||||
class Docs(dbtClassMixin):
|
||||
|
||||
@@ -118,6 +118,7 @@ def global_flags(func):
|
||||
@p.log_level_file
|
||||
@p.log_path
|
||||
@p.macro_debugging
|
||||
@p.maximum_seed_size_mib
|
||||
@p.partial_parse
|
||||
@p.partial_parse_file_path
|
||||
@p.partial_parse_file_diff
|
||||
|
||||
@@ -167,6 +167,14 @@ indirect_selection = click.option(
|
||||
default="eager",
|
||||
)
|
||||
|
||||
# Size threshold (in MiB) above which a seed file is not content-hashed for
# state comparison. Negative values are rejected because they have no meaning
# as a size limit; 0 is kept as the "no limit" sentinel.
maximum_seed_size_mib = click.option(
    "--maximum-seed-size-mib",
    envvar="DBT_MAXIMUM_SEED_SIZE_MIB",
    help=(
        "Specify max size (MiB) for seed files that will be hashed for state comparison. "
        "Use 0 for no limit."
    ),
    type=click.IntRange(min=0),
    default=1,
)
|
||||
|
||||
lock = click.option(
|
||||
"--lock",
|
||||
envvar=None,
|
||||
|
||||
@@ -4,9 +4,6 @@ DEFAULT_ENV_PLACEHOLDER = "DBT_DEFAULT_PLACEHOLDER"
|
||||
|
||||
SECRET_PLACEHOLDER = "$$$DBT_SECRET_START$$${}$$$DBT_SECRET_END$$$"
|
||||
|
||||
MAXIMUM_SEED_SIZE = 1 * 1024 * 1024
|
||||
MAXIMUM_SEED_SIZE_NAME = "1MB"
|
||||
|
||||
PIN_PACKAGE_URL = (
|
||||
"https://docs.getdbt.com/docs/package-management#section-specifying-package-versions"
|
||||
)
|
||||
|
||||
@@ -5,7 +5,6 @@ from typing import Any, Dict, List, Optional, Union
|
||||
from mashumaro.types import SerializableType
|
||||
|
||||
from dbt.artifacts.resources.base import FileHash
|
||||
from dbt.constants import MAXIMUM_SEED_SIZE
|
||||
from dbt_common.dataclass_schema import StrEnum, dbtClassMixin
|
||||
|
||||
from .util import SourceKey
|
||||
@@ -65,9 +64,8 @@ class FilePath(dbtClassMixin):
|
||||
def original_file_path(self) -> str:
|
||||
return os.path.join(self.searched_path, self.relative_path)
|
||||
|
||||
def seed_too_large(self) -> bool:
|
||||
"""Return whether the file this represents is over the seed size limit"""
|
||||
return os.stat(self.full_path).st_size > MAXIMUM_SEED_SIZE
|
||||
def file_size(self) -> int:
|
||||
return os.stat(self.full_path).st_size
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -324,6 +324,7 @@ class ProjectFlags(ExtensibleDbtClassMixin):
|
||||
log_format_file: Optional[str] = None
|
||||
log_level: Optional[str] = None
|
||||
log_level_file: Optional[str] = None
|
||||
maximum_seed_size_mib: Optional[int] = None
|
||||
partial_parse: Optional[bool] = None
|
||||
populate_cache: Optional[bool] = None
|
||||
printer_width: Optional[int] = None
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import json
|
||||
|
||||
from dbt.constants import MAXIMUM_SEED_SIZE_NAME, PIN_PACKAGE_URL
|
||||
from dbt.constants import PIN_PACKAGE_URL
|
||||
from dbt.events.base_types import (
|
||||
DebugLevel,
|
||||
DynamicLevel,
|
||||
@@ -8,6 +8,7 @@ from dbt.events.base_types import (
|
||||
InfoLevel,
|
||||
WarnLevel,
|
||||
)
|
||||
from dbt.flags import get_flags
|
||||
from dbt_common.events.base_types import EventLevel
|
||||
from dbt_common.events.format import (
|
||||
format_fancy_output_line,
|
||||
@@ -675,10 +676,11 @@ class SeedIncreased(WarnLevel):
|
||||
return "I052"
|
||||
|
||||
def message(self) -> str:
|
||||
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
|
||||
msg = (
|
||||
f"Found a seed ({self.package_name}.{self.name}) "
|
||||
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was "
|
||||
f"<={MAXIMUM_SEED_SIZE_NAME}, so it has changed"
|
||||
f">{maximum_seed_size_name} in size. The previous file was "
|
||||
f"<={maximum_seed_size_name}, so it has changed"
|
||||
)
|
||||
return msg
|
||||
|
||||
@@ -688,9 +690,10 @@ class SeedExceedsLimitSamePath(WarnLevel):
|
||||
return "I053"
|
||||
|
||||
def message(self) -> str:
|
||||
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
|
||||
msg = (
|
||||
f"Found a seed ({self.package_name}.{self.name}) "
|
||||
f">{MAXIMUM_SEED_SIZE_NAME} in size at the same path, dbt "
|
||||
f">{maximum_seed_size_name} in size at the same path, dbt "
|
||||
f"cannot tell if it has changed: assuming they are the same"
|
||||
)
|
||||
return msg
|
||||
@@ -701,9 +704,10 @@ class SeedExceedsLimitAndPathChanged(WarnLevel):
|
||||
return "I054"
|
||||
|
||||
def message(self) -> str:
|
||||
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
|
||||
msg = (
|
||||
f"Found a seed ({self.package_name}.{self.name}) "
|
||||
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was in "
|
||||
f">{maximum_seed_size_name} in size. The previous file was in "
|
||||
f"a different location, assuming it has changed"
|
||||
)
|
||||
return msg
|
||||
@@ -714,9 +718,10 @@ class SeedExceedsLimitChecksumChanged(WarnLevel):
|
||||
return "I055"
|
||||
|
||||
def message(self) -> str:
|
||||
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
|
||||
msg = (
|
||||
f"Found a seed ({self.package_name}.{self.name}) "
|
||||
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file had a "
|
||||
f">{maximum_seed_size_name} in size. The previous file had a "
|
||||
f"checksum type of {self.checksum_name}, so it has changed"
|
||||
)
|
||||
return msg
|
||||
|
||||
@@ -69,6 +69,7 @@ def get_flag_dict():
|
||||
"log_path",
|
||||
"invocation_command",
|
||||
"empty",
|
||||
"maximum_seed_size_mib",
|
||||
}
|
||||
return {key: getattr(GLOBAL_FLAGS, key.upper(), None) for key in flag_attr}
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ from dbt.contracts.files import (
|
||||
)
|
||||
from dbt.events.types import InputFileDiffError
|
||||
from dbt.exceptions import ParsingError
|
||||
from dbt.flags import get_flags
|
||||
from dbt.parser.common import schema_file_keys
|
||||
from dbt.parser.schemas import yaml_from_file
|
||||
from dbt.parser.search import filesystem_search
|
||||
@@ -123,12 +124,14 @@ def validate_yaml(file_path, dct):
|
||||
|
||||
# Special processing for big seed files
|
||||
def load_seed_source_file(match: FilePath, project_name) -> SourceFile:
|
||||
if match.seed_too_large():
|
||||
# Users can configure the maximum seed size (MiB) that will be hashed for state comparison
|
||||
maximum_seed_size = get_flags().MAXIMUM_SEED_SIZE_MIB * 1024 * 1024
|
||||
# maximum_seed_size = 0 means no limit
|
||||
if match.file_size() > maximum_seed_size and maximum_seed_size != 0:
|
||||
# We don't want to calculate a hash of this file. Use the path.
|
||||
source_file = SourceFile.big_seed(match)
|
||||
else:
|
||||
file_contents = load_file_contents(match.absolute_path, strip=True)
|
||||
checksum = FileHash.from_contents(file_contents)
|
||||
checksum = FileHash.from_path(match.absolute_path)
|
||||
source_file = SourceFile(path=match, checksum=checksum)
|
||||
source_file.contents = ""
|
||||
source_file.parse_file_type = ParseFileType.Seed
|
||||
|
||||
@@ -228,7 +228,7 @@ class TestChangedSeedContents(BaseModifiedState):
|
||||
"./state",
|
||||
]
|
||||
)
|
||||
assert ">1MB" in str(exc.value)
|
||||
assert ">1MiB" in str(exc.value)
|
||||
|
||||
# now check if unmodified returns none
|
||||
results = run_dbt(
|
||||
|
||||
@@ -780,7 +780,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat
|
||||
event = warn_or_error_patch.call_args[0][0]
|
||||
assert type(event).__name__ == "SeedExceedsLimitSamePath"
|
||||
msg = event.message()
|
||||
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
|
||||
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
|
||||
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
|
||||
assert not search_manifest_using_method(manifest, method, "new")
|
||||
warn_or_error_patch.assert_not_called()
|
||||
@@ -793,7 +793,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat
|
||||
event = warn_or_error_patch.call_args[0][0]
|
||||
assert type(event).__name__ == "SeedExceedsLimitSamePath"
|
||||
msg = event.message()
|
||||
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
|
||||
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
|
||||
|
||||
|
||||
def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state, seed):
|
||||
@@ -807,7 +807,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state
|
||||
event = warn_or_error_patch.call_args[0][0]
|
||||
assert type(event).__name__ == "SeedIncreased"
|
||||
msg = event.message()
|
||||
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
|
||||
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
|
||||
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
|
||||
assert not search_manifest_using_method(manifest, method, "new")
|
||||
warn_or_error_patch.assert_not_called()
|
||||
@@ -820,7 +820,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state
|
||||
event = warn_or_error_patch.call_args[0][0]
|
||||
assert type(event).__name__ == "SeedIncreased"
|
||||
msg = event.message()
|
||||
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
|
||||
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
|
||||
|
||||
|
||||
def test_select_state_changed_seed_checksum_path_to_sha(manifest, previous_state, seed):
|
||||
|
||||
Reference in New Issue
Block a user