mirror of
https://github.com/dbt-labs/dbt-core
synced 2025-12-21 07:41:27 +00:00
Compare commits
2 Commits
enable-pos
...
jerco/redo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
20be92525a | ||
|
|
704120cf3f |
6
.changes/unreleased/Features-20230307-134838.yaml
Normal file
6
.changes/unreleased/Features-20230307-134838.yaml
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
kind: Features
|
||||||
|
body: Make MAXIMUM_SEED_SIZE_MIB configurable
|
||||||
|
time: 2023-03-07T13:48:38.792321024Z
|
||||||
|
custom:
|
||||||
|
Author: noppaz acurtis-evi
|
||||||
|
Issue: 7117 7124
|
||||||
@@ -3,6 +3,7 @@ from dataclasses import dataclass
|
|||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from dbt.artifacts.resources.types import NodeType
|
from dbt.artifacts.resources.types import NodeType
|
||||||
|
from dbt_common.clients.system import convert_path
|
||||||
from dbt_common.dataclass_schema import dbtClassMixin
|
from dbt_common.dataclass_schema import dbtClassMixin
|
||||||
|
|
||||||
|
|
||||||
@@ -60,6 +61,27 @@ class FileHash(dbtClassMixin):
|
|||||||
checksum = hashlib.new(name, data).hexdigest()
|
checksum = hashlib.new(name, data).hexdigest()
|
||||||
return cls(name=name, checksum=checksum)
|
return cls(name=name, checksum=checksum)
|
||||||
|
|
||||||
|
@classmethod
def from_path(cls, path: str, name="sha256") -> "FileHash":
    """Create a file hash by streaming the file at the given path.

    The file is decoded as UTF-8 text and read in fixed-size chunks so the
    whole file never has to be held in memory. Whitespace at the very start
    and very end of the contents is left out of the digest, and the
    remaining text is hashed as UTF-8 bytes, i.e. the checksum equals the
    hash of `contents.strip().encode("utf-8")`. This is meant to give the
    same result as `FileHash.from_contents` on stripped contents.

    Args:
        path: Location of the file to hash; passed through `convert_path`.
        name: Any algorithm name accepted by `hashlib.new`.

    Returns:
        A `FileHash` carrying the algorithm name and hex digest.
    """
    path = convert_path(path)
    chunk_size = 1 * 1024 * 1024
    file_hash = hashlib.new(name)

    # NOTE(review): text mode applies universal-newline translation; confirm
    # that whatever feeds `from_contents` sees the same newline handling.
    with open(path, "r", encoding="utf-8") as handle:
        seen_content = False
        # Whitespace that ended the data hashed so far. It is only known to
        # be interior (and therefore hashed) once more content follows it.
        held_back = ""
        while True:
            chunk = handle.read(chunk_size)
            if not chunk:
                break

            if not seen_content:
                # Leading whitespace can span several chunks: keep skipping
                # until a chunk with real content shows up.
                chunk = chunk.lstrip()
                if not chunk:
                    continue
                seen_content = True

            content = chunk.rstrip()
            if content:
                file_hash.update((held_back + content).encode("utf-8"))
                held_back = chunk[len(content):]
            else:
                # Whitespace-only chunk: still undecided whether it is
                # interior or trailing, so keep holding it back.
                held_back += chunk

    # Anything left in `held_back` is trailing whitespace and is dropped.
    return cls(name=name, checksum=file_hash.hexdigest())
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Docs(dbtClassMixin):
|
class Docs(dbtClassMixin):
|
||||||
|
|||||||
@@ -118,6 +118,7 @@ def global_flags(func):
|
|||||||
@p.log_level_file
|
@p.log_level_file
|
||||||
@p.log_path
|
@p.log_path
|
||||||
@p.macro_debugging
|
@p.macro_debugging
|
||||||
|
@p.maximum_seed_size_mib
|
||||||
@p.partial_parse
|
@p.partial_parse
|
||||||
@p.partial_parse_file_path
|
@p.partial_parse_file_path
|
||||||
@p.partial_parse_file_diff
|
@p.partial_parse_file_diff
|
||||||
|
|||||||
@@ -167,6 +167,14 @@ indirect_selection = click.option(
|
|||||||
default="eager",
|
default="eager",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Global flag controlling how large (in MiB) a seed file may be and still have
# its contents hashed for state comparison.
maximum_seed_size_mib = click.option(
    "--maximum-seed-size-mib",
    envvar="DBT_MAXIMUM_SEED_SIZE_MIB",
    help=(
        "Specify max size (MiB) for seed files that will be hashed for state comparison. "
        "Use 0 for no limit."
    ),
    # A negative limit has no meaning: every non-empty seed would exceed it.
    # Reject it at the CLI boundary; 0 stays valid and means "no limit".
    type=click.IntRange(min=0),
    default=1,
)
|
||||||
|
|
||||||
lock = click.option(
|
lock = click.option(
|
||||||
"--lock",
|
"--lock",
|
||||||
envvar=None,
|
envvar=None,
|
||||||
|
|||||||
@@ -4,9 +4,6 @@ DEFAULT_ENV_PLACEHOLDER = "DBT_DEFAULT_PLACEHOLDER"
|
|||||||
|
|
||||||
SECRET_PLACEHOLDER = "$$$DBT_SECRET_START$$${}$$$DBT_SECRET_END$$$"
|
SECRET_PLACEHOLDER = "$$$DBT_SECRET_START$$${}$$$DBT_SECRET_END$$$"
|
||||||
|
|
||||||
MAXIMUM_SEED_SIZE = 1 * 1024 * 1024
|
|
||||||
MAXIMUM_SEED_SIZE_NAME = "1MB"
|
|
||||||
|
|
||||||
PIN_PACKAGE_URL = (
|
PIN_PACKAGE_URL = (
|
||||||
"https://docs.getdbt.com/docs/package-management#section-specifying-package-versions"
|
"https://docs.getdbt.com/docs/package-management#section-specifying-package-versions"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ from typing import Any, Dict, List, Optional, Union
|
|||||||
from mashumaro.types import SerializableType
|
from mashumaro.types import SerializableType
|
||||||
|
|
||||||
from dbt.artifacts.resources.base import FileHash
|
from dbt.artifacts.resources.base import FileHash
|
||||||
from dbt.constants import MAXIMUM_SEED_SIZE
|
|
||||||
from dbt_common.dataclass_schema import StrEnum, dbtClassMixin
|
from dbt_common.dataclass_schema import StrEnum, dbtClassMixin
|
||||||
|
|
||||||
from .util import SourceKey
|
from .util import SourceKey
|
||||||
@@ -65,9 +64,8 @@ class FilePath(dbtClassMixin):
|
|||||||
def original_file_path(self) -> str:
|
def original_file_path(self) -> str:
|
||||||
return os.path.join(self.searched_path, self.relative_path)
|
return os.path.join(self.searched_path, self.relative_path)
|
||||||
|
|
||||||
def seed_too_large(self) -> bool:
|
def file_size(self) -> int:
|
||||||
"""Return whether the file this represents is over the seed size limit"""
|
return os.stat(self.full_path).st_size
|
||||||
return os.stat(self.full_path).st_size > MAXIMUM_SEED_SIZE
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -324,6 +324,7 @@ class ProjectFlags(ExtensibleDbtClassMixin):
|
|||||||
log_format_file: Optional[str] = None
|
log_format_file: Optional[str] = None
|
||||||
log_level: Optional[str] = None
|
log_level: Optional[str] = None
|
||||||
log_level_file: Optional[str] = None
|
log_level_file: Optional[str] = None
|
||||||
|
maximum_seed_size_mib: Optional[int] = None
|
||||||
partial_parse: Optional[bool] = None
|
partial_parse: Optional[bool] = None
|
||||||
populate_cache: Optional[bool] = None
|
populate_cache: Optional[bool] = None
|
||||||
printer_width: Optional[int] = None
|
printer_width: Optional[int] = None
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
|
|
||||||
from dbt.constants import MAXIMUM_SEED_SIZE_NAME, PIN_PACKAGE_URL
|
from dbt.constants import PIN_PACKAGE_URL
|
||||||
from dbt.events.base_types import (
|
from dbt.events.base_types import (
|
||||||
DebugLevel,
|
DebugLevel,
|
||||||
DynamicLevel,
|
DynamicLevel,
|
||||||
@@ -8,6 +8,7 @@ from dbt.events.base_types import (
|
|||||||
InfoLevel,
|
InfoLevel,
|
||||||
WarnLevel,
|
WarnLevel,
|
||||||
)
|
)
|
||||||
|
from dbt.flags import get_flags
|
||||||
from dbt_common.events.base_types import EventLevel
|
from dbt_common.events.base_types import EventLevel
|
||||||
from dbt_common.events.format import (
|
from dbt_common.events.format import (
|
||||||
format_fancy_output_line,
|
format_fancy_output_line,
|
||||||
@@ -675,10 +676,11 @@ class SeedIncreased(WarnLevel):
|
|||||||
return "I052"
|
return "I052"
|
||||||
|
|
||||||
def message(self) -> str:
|
def message(self) -> str:
|
||||||
|
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
|
||||||
msg = (
|
msg = (
|
||||||
f"Found a seed ({self.package_name}.{self.name}) "
|
f"Found a seed ({self.package_name}.{self.name}) "
|
||||||
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was "
|
f">{maximum_seed_size_name} in size. The previous file was "
|
||||||
f"<={MAXIMUM_SEED_SIZE_NAME}, so it has changed"
|
f"<={maximum_seed_size_name}, so it has changed"
|
||||||
)
|
)
|
||||||
return msg
|
return msg
|
||||||
|
|
||||||
@@ -688,9 +690,10 @@ class SeedExceedsLimitSamePath(WarnLevel):
|
|||||||
return "I053"
|
return "I053"
|
||||||
|
|
||||||
def message(self) -> str:
|
def message(self) -> str:
|
||||||
|
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
|
||||||
msg = (
|
msg = (
|
||||||
f"Found a seed ({self.package_name}.{self.name}) "
|
f"Found a seed ({self.package_name}.{self.name}) "
|
||||||
f">{MAXIMUM_SEED_SIZE_NAME} in size at the same path, dbt "
|
f">{maximum_seed_size_name} in size at the same path, dbt "
|
||||||
f"cannot tell if it has changed: assuming they are the same"
|
f"cannot tell if it has changed: assuming they are the same"
|
||||||
)
|
)
|
||||||
return msg
|
return msg
|
||||||
@@ -701,9 +704,10 @@ class SeedExceedsLimitAndPathChanged(WarnLevel):
|
|||||||
return "I054"
|
return "I054"
|
||||||
|
|
||||||
def message(self) -> str:
|
def message(self) -> str:
|
||||||
|
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
|
||||||
msg = (
|
msg = (
|
||||||
f"Found a seed ({self.package_name}.{self.name}) "
|
f"Found a seed ({self.package_name}.{self.name}) "
|
||||||
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was in "
|
f">{maximum_seed_size_name} in size. The previous file was in "
|
||||||
f"a different location, assuming it has changed"
|
f"a different location, assuming it has changed"
|
||||||
)
|
)
|
||||||
return msg
|
return msg
|
||||||
@@ -714,9 +718,10 @@ class SeedExceedsLimitChecksumChanged(WarnLevel):
|
|||||||
return "I055"
|
return "I055"
|
||||||
|
|
||||||
def message(self) -> str:
|
def message(self) -> str:
|
||||||
|
maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB"
|
||||||
msg = (
|
msg = (
|
||||||
f"Found a seed ({self.package_name}.{self.name}) "
|
f"Found a seed ({self.package_name}.{self.name}) "
|
||||||
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file had a "
|
f">{maximum_seed_size_name} in size. The previous file had a "
|
||||||
f"checksum type of {self.checksum_name}, so it has changed"
|
f"checksum type of {self.checksum_name}, so it has changed"
|
||||||
)
|
)
|
||||||
return msg
|
return msg
|
||||||
|
|||||||
@@ -69,6 +69,7 @@ def get_flag_dict():
|
|||||||
"log_path",
|
"log_path",
|
||||||
"invocation_command",
|
"invocation_command",
|
||||||
"empty",
|
"empty",
|
||||||
|
"maximum_seed_size_mib",
|
||||||
}
|
}
|
||||||
return {key: getattr(GLOBAL_FLAGS, key.upper(), None) for key in flag_attr}
|
return {key: getattr(GLOBAL_FLAGS, key.upper(), None) for key in flag_attr}
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ from dbt.contracts.files import (
|
|||||||
)
|
)
|
||||||
from dbt.events.types import InputFileDiffError
|
from dbt.events.types import InputFileDiffError
|
||||||
from dbt.exceptions import ParsingError
|
from dbt.exceptions import ParsingError
|
||||||
|
from dbt.flags import get_flags
|
||||||
from dbt.parser.common import schema_file_keys
|
from dbt.parser.common import schema_file_keys
|
||||||
from dbt.parser.schemas import yaml_from_file
|
from dbt.parser.schemas import yaml_from_file
|
||||||
from dbt.parser.search import filesystem_search
|
from dbt.parser.search import filesystem_search
|
||||||
@@ -123,12 +124,14 @@ def validate_yaml(file_path, dct):
|
|||||||
|
|
||||||
# Special processing for big seed files
|
# Special processing for big seed files
|
||||||
def load_seed_source_file(match: FilePath, project_name) -> SourceFile:
|
def load_seed_source_file(match: FilePath, project_name) -> SourceFile:
|
||||||
if match.seed_too_large():
|
# Users can configure the maximum seed size (MiB) that will be hashed for state comparison
|
||||||
|
maximum_seed_size = get_flags().MAXIMUM_SEED_SIZE_MIB * 1024 * 1024
|
||||||
|
# maximum_seed_size = 0 means no limit
|
||||||
|
if match.file_size() > maximum_seed_size and maximum_seed_size != 0:
|
||||||
# We don't want to calculate a hash of this file. Use the path.
|
# We don't want to calculate a hash of this file. Use the path.
|
||||||
source_file = SourceFile.big_seed(match)
|
source_file = SourceFile.big_seed(match)
|
||||||
else:
|
else:
|
||||||
file_contents = load_file_contents(match.absolute_path, strip=True)
|
checksum = FileHash.from_path(match.absolute_path)
|
||||||
checksum = FileHash.from_contents(file_contents)
|
|
||||||
source_file = SourceFile(path=match, checksum=checksum)
|
source_file = SourceFile(path=match, checksum=checksum)
|
||||||
source_file.contents = ""
|
source_file.contents = ""
|
||||||
source_file.parse_file_type = ParseFileType.Seed
|
source_file.parse_file_type = ParseFileType.Seed
|
||||||
|
|||||||
@@ -228,7 +228,7 @@ class TestChangedSeedContents(BaseModifiedState):
|
|||||||
"./state",
|
"./state",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
assert ">1MB" in str(exc.value)
|
assert ">1MiB" in str(exc.value)
|
||||||
|
|
||||||
# now check if unmodified returns none
|
# now check if unmodified returns none
|
||||||
results = run_dbt(
|
results = run_dbt(
|
||||||
|
|||||||
@@ -780,7 +780,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat
|
|||||||
event = warn_or_error_patch.call_args[0][0]
|
event = warn_or_error_patch.call_args[0][0]
|
||||||
assert type(event).__name__ == "SeedExceedsLimitSamePath"
|
assert type(event).__name__ == "SeedExceedsLimitSamePath"
|
||||||
msg = event.message()
|
msg = event.message()
|
||||||
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
|
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
|
||||||
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
|
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
|
||||||
assert not search_manifest_using_method(manifest, method, "new")
|
assert not search_manifest_using_method(manifest, method, "new")
|
||||||
warn_or_error_patch.assert_not_called()
|
warn_or_error_patch.assert_not_called()
|
||||||
@@ -793,7 +793,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat
|
|||||||
event = warn_or_error_patch.call_args[0][0]
|
event = warn_or_error_patch.call_args[0][0]
|
||||||
assert type(event).__name__ == "SeedExceedsLimitSamePath"
|
assert type(event).__name__ == "SeedExceedsLimitSamePath"
|
||||||
msg = event.message()
|
msg = event.message()
|
||||||
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
|
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
|
||||||
|
|
||||||
|
|
||||||
def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state, seed):
|
def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state, seed):
|
||||||
@@ -807,7 +807,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state
|
|||||||
event = warn_or_error_patch.call_args[0][0]
|
event = warn_or_error_patch.call_args[0][0]
|
||||||
assert type(event).__name__ == "SeedIncreased"
|
assert type(event).__name__ == "SeedIncreased"
|
||||||
msg = event.message()
|
msg = event.message()
|
||||||
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
|
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
|
||||||
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
|
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
|
||||||
assert not search_manifest_using_method(manifest, method, "new")
|
assert not search_manifest_using_method(manifest, method, "new")
|
||||||
warn_or_error_patch.assert_not_called()
|
warn_or_error_patch.assert_not_called()
|
||||||
@@ -820,7 +820,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state
|
|||||||
event = warn_or_error_patch.call_args[0][0]
|
event = warn_or_error_patch.call_args[0][0]
|
||||||
assert type(event).__name__ == "SeedIncreased"
|
assert type(event).__name__ == "SeedIncreased"
|
||||||
msg = event.message()
|
msg = event.message()
|
||||||
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
|
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
|
||||||
|
|
||||||
|
|
||||||
def test_select_state_changed_seed_checksum_path_to_sha(manifest, previous_state, seed):
|
def test_select_state_changed_seed_checksum_path_to_sha(manifest, previous_state, seed):
|
||||||
|
|||||||
Reference in New Issue
Block a user