Compare commits

...

29 Commits

Author SHA1 Message Date
Nathaniel May
11543c99eb fixed typo 2021-07-09 12:04:59 -04:00
Nathaniel May
ae7b0a935c remove result from load_all_timing event test 2021-07-09 11:49:15 -04:00
Nathaniel May
7b510c4803 add temp debug lines 2021-07-09 10:20:35 -04:00
Nathaniel May
fc7faffb19 Merge branch 'experimental-parser-correctness-checks' of github.com:fishtown-analytics/dbt into experimental-parser-correctness-checks 2021-07-08 21:29:26 -04:00
Nathaniel May
88c9506e04 use new test values 2021-07-08 21:28:57 -04:00
Nathaniel May
27cdeb6468 hook up tracking for real 2021-07-08 21:06:17 -04:00
Nathaniel May
7f583053c2 fix tracking 2021-07-08 21:06:17 -04:00
Nathaniel May
7893ab6e30 changelog 2021-07-08 21:06:17 -04:00
Nathaniel May
42a27933a9 add type annotations 2021-07-08 21:06:17 -04:00
Nathaniel May
515383320b fix config checks 2021-07-08 21:06:17 -04:00
Nathaniel May
dbf613a1eb fix set typing 2021-07-08 21:06:17 -04:00
Nathaniel May
69f7efe028 add comment to tracking result 2021-07-08 21:06:17 -04:00
Nathaniel May
16501781c5 add result set to tracking 2021-07-08 21:06:17 -04:00
Nathaniel May
761951681d address PR feedback 2021-07-08 21:06:17 -04:00
Nathaniel May
8f065db04d update comments 2021-07-08 21:06:17 -04:00
Nathaniel May
31adfe14ab comment fixes 2021-07-08 21:06:17 -04:00
Nathaniel May
16841d3886 add correctness tracking for experimental parser 2021-07-08 21:06:17 -04:00
Nathaniel May
b3af8f1a0c hook up tracking for real 2021-07-08 15:14:44 -04:00
Nathaniel May
f826d795e4 fix tracking 2021-07-07 17:28:21 -04:00
Nathaniel May
05e7f97a9f changelog 2021-07-06 17:14:14 -04:00
Nathaniel May
686cf6b836 add type annotations 2021-07-06 17:08:40 -04:00
Nathaniel May
7aba21ee47 fix config checks 2021-07-06 16:59:01 -04:00
Nathaniel May
e9ec739b24 fix set typing 2021-07-06 14:54:59 -04:00
Nathaniel May
a894fe540e add comment to tracking result 2021-07-06 14:17:21 -04:00
Nathaniel May
b20ca48442 add result set to tracking 2021-07-06 14:11:48 -04:00
Nathaniel May
ed0e689c5a address PR feedback 2021-07-06 13:43:39 -04:00
Nathaniel May
e8301461d3 update comments 2021-07-06 11:45:16 -04:00
Nathaniel May
89d3d9af88 comment fixes 2021-07-01 18:10:08 -04:00
Nathaniel May
1a428e5bc3 add correctness tracking for experimental parser 2021-07-01 18:03:52 -04:00
4 changed files with 307 additions and 65 deletions

View File

@@ -38,6 +38,7 @@ Contributors:
### Under the hood
- Add reporting for experimental parser correctness checks [#3528](https://github.com/dbt-labs/dbt/pull/3528)
- Swap experimental parser implementation to use Rust [#3497](https://github.com/fishtown-analytics/dbt/pull/3497)
- Dispatch the core SQL statement of the new test materialization, to benefit adapter maintainers ([#3465](https://github.com/fishtown-analytics/dbt/pull/3465), [#3461](https://github.com/fishtown-analytics/dbt/pull/3461))
- Minimal validation of yaml dictionaries prior to partial parsing ([#3246](https://github.com/fishtown-analytics/dbt/issues/3246), [#3460](https://github.com/fishtown-analytics/dbt/pull/3460))

View File

@@ -4,7 +4,12 @@ import dbt.flags as flags
from dbt.node_types import NodeType
from dbt.parser.base import SimpleSQLParser
from dbt.parser.search import FileBlock
import dbt.tracking as tracking
from dbt import utils
from dbt_extractor import ExtractionError, py_extract_from_source # type: ignore
import itertools
import random
from typing import Any, Dict, List, Tuple
class ModelParser(SimpleSQLParser[ParsedModelNode]):
@@ -26,46 +31,122 @@ class ModelParser(SimpleSQLParser[ParsedModelNode]):
) -> None:
self.manifest._parsing_info.static_analysis_path_count += 1
# normal dbt run
if not flags.USE_EXPERIMENTAL_PARSER:
super().render_update(node, config)
# `True` roughly 1/100 times this function is called
sample: bool = random.randint(1, 101) == 100
# if the --use-experimental-parser flag was set
else:
# run the experimental parser if the flag is on or if we're sampling
if flags.USE_EXPERIMENTAL_PARSER or sample:
try:
# run dbt jinja extractor (powered by tree-sitter + rust)
# throws an exception if it can't parse the source
res = py_extract_from_source(node.raw_sql)
experimentally_parsed: Dict[str, List[Any]] = py_extract_from_source(node.raw_sql)
# since it doesn't need python jinja, fit the refs, sources, and configs
# into the node. Down the line the rest of the node will be updated with
# this information. (e.g. depends_on etc.)
config_calls = []
for c in res['configs']:
# second config format
config_calls: List[Dict[str, str]] = []
for c in experimentally_parsed['configs']:
config_calls.append({c[0]: c[1]})
config._config_calls = config_calls
# format sources TODO change extractor to match this type
source_calls: List[List[str]] = []
for s in experimentally_parsed['sources']:
source_calls.append([s[0], s[1]])
experimentally_parsed['sources'] = source_calls
# this uses the updated config to set all the right things in the node
# if there are hooks present, it WILL render jinja. Will need to change
# when we support hooks
self.update_parsed_node(node, config)
except ExtractionError as e:
experimentally_parsed = e
# udpate the unrendered config with values from the file
# values from yaml files are in there already
node.unrendered_config.update(dict(res['configs']))
# normal dbt run
if not flags.USE_EXPERIMENTAL_PARSER:
# normal rendering
super().render_update(node, config)
# if we're sampling, compare for correctness
if sample:
result: List[str] = []
# experimental parser couldn't parse
if isinstance(experimentally_parsed, Exception):
result += ["01_experimental_parser_cannot_parse"]
else:
# rearrange existing configs to match:
real_configs: List[Tuple[str, Any]] = list(
itertools.chain.from_iterable(
map(lambda x: x.items(), config._config_calls)
)
)
# set refs, sources, and configs on the node object
node.refs = node.refs + res['refs']
for sourcev in res['sources']:
# TODO change extractor to match type here
node.sources.append([sourcev[0], sourcev[1]])
for configv in res['configs']:
node.config[configv[0]] = configv[1]
# look for false positive configs
for c in experimentally_parsed['configs']:
if c not in real_configs:
result += ["02_false_positive_config_value"]
break
self.manifest._parsing_info.static_analysis_parsed_path_count += 1
# look for missed configs
for c in real_configs:
if c not in experimentally_parsed['configs']:
result += ["03_missed_config_value"]
break
# exception was thrown by dbt jinja extractor meaning it can't
# handle this source. fall back to python jinja rendering.
except ExtractionError:
super().render_update(node, config)
# look for false positive sources
for s in experimentally_parsed['sources']:
if s not in node.sources:
result += ["04_false_positive_source_value"]
break
# look for missed sources
for s in node.sources:
if s not in experimentally_parsed['sources']:
result += ["05_missed_source_value"]
break
# look for false positive refs
for r in experimentally_parsed['refs']:
if r not in node.refs:
result += ["06_false_positive_ref_value"]
break
# look for missed refs
for r in node.refs:
if r not in experimentally_parsed['refs']:
result += ["07_missed_ref_value"]
break
# if there are no errors, return a success value
if not result:
result = ["00_exact_match"]
# fire a tracking event. this fires one event for every sample
# so that we have data on a per file basis. Not only can we expect
# no false positives or misses, we can expect the number model
# files parseable by the experimental parser to match our internal
# testing.
tracking.track_experimental_parser_sample({
"project_id": self.root_project.hashed_name(),
"file_id": utils.get_hash(node),
"status": result
})
# if the --use-experimental-parser flag was set, and the experimental parser succeeded
elif not isinstance(experimentally_parsed, Exception):
# since it doesn't need python jinja, fit the refs, sources, and configs
# into the node. Down the line the rest of the node will be updated with
# this information. (e.g. depends_on etc.)
config._config_calls = config_calls
# this uses the updated config to set all the right things in the node.
# if there are hooks present, it WILL render jinja. Will need to change
# when the experimental parser supports hooks
self.update_parsed_node(node, config)
# update the unrendered config with values from the file.
# values from yaml files are in there already
node.unrendered_config.update(dict(experimentally_parsed['configs']))
# set refs, sources, and configs on the node object
node.refs += experimentally_parsed['refs']
node.sources += experimentally_parsed['sources']
for configv in experimentally_parsed['configs']:
node.config[configv[0]] = configv[1]
self.manifest._parsing_info.static_analysis_parsed_path_count += 1
# the experimental parser tried and failed on this model.
# fall back to python jinja rendering.
else:
super().render_update(node, config)

View File

@@ -30,7 +30,7 @@ RPC_REQUEST_SPEC = 'iglu:com.dbt/rpc_request/jsonschema/1-0-1'
DEPRECATION_WARN_SPEC = 'iglu:com.dbt/deprecation_warn/jsonschema/1-0-0'
LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-3'
RESOURCE_COUNTS = 'iglu:com.dbt/resource_counts/jsonschema/1-0-0'
EXPERIMENTAL_PARSER = 'iglu:com.dbt/experimental_parser/jsonschema/1-0-0'
DBT_INVOCATION_ENV = 'DBT_INVOCATION_ENV'
@@ -423,6 +423,20 @@ def track_invalid_invocation(
)
def track_experimental_parser_sample(options):
context = [SelfDescribingJson(EXPERIMENTAL_PARSER, options)]
assert active_user is not None, \
'Cannot track project loading time when active user is None'
track(
active_user,
category='dbt',
action='experimental_parser',
label=active_user.invocation_id,
context=context
)
def flush():
logger.debug("Flushing usage events")
tracker.flush()

View File

@@ -10,6 +10,17 @@ import dbt.tracking
import dbt.utils
# immutably creates a new array with the value inserted at the index
def inserted(value, index, arr):
x = []
for i in range(0, len(arr)):
if i == index:
x.append(value)
x.append(arr[i])
else:
x.append(arr[i])
return x
class TestEventTracking(DBTIntegrationTest):
maxDiff = None
@@ -240,7 +251,7 @@ class TestEventTrackingSuccess(TestEventTracking):
@use_profile("postgres")
def test__postgres_event_tracking_compile(self):
expected_calls = [
expected_calls_A = [
call(
category='dbt',
action='invocation',
@@ -267,6 +278,17 @@ class TestEventTrackingSuccess(TestEventTracking):
),
]
expected_calls_B = inserted(
call(
category='dbt',
action='experimental_parser',
label=ANY,
context=ANY
),
3,
expected_calls_A
)
expected_contexts = [
self.build_context('compile', 'start'),
self.load_context(),
@@ -274,13 +296,19 @@ class TestEventTrackingSuccess(TestEventTracking):
self.build_context('compile', 'end', result_type='ok')
]
test_result = self.run_event_test(
test_result_A = self.run_event_test(
["compile", "--vars", "sensitive_thing: abc"],
expected_calls,
expected_calls_A,
expected_contexts
)
self.assertTrue(test_result)
test_result_B = self.run_event_test(
["compile", "--vars", "sensitive_thing: abc"],
expected_calls_B,
expected_contexts
)
self.assertTrue(test_result_A or test_result_B)
@use_profile("postgres")
def test__postgres_event_tracking_deps(self):
@@ -364,7 +392,7 @@ class TestEventTrackingSuccess(TestEventTracking):
},
}]
expected_calls = [
expected_calls_A = [
call(
category='dbt',
action='invocation',
@@ -397,6 +425,17 @@ class TestEventTrackingSuccess(TestEventTracking):
),
]
expected_calls_B = inserted(
call(
category='dbt',
action='experimental_parser',
label=ANY,
context=ANY
),
3,
expected_calls_A
)
expected_contexts = [
self.build_context('seed', 'start'),
self.load_context(),
@@ -405,12 +444,14 @@ class TestEventTrackingSuccess(TestEventTracking):
self.build_context('seed', 'end', result_type='ok')
]
test_result = self.run_event_test(["seed"], expected_calls, expected_contexts)
self.assertTrue(test_result)
test_result_A = self.run_event_test(["seed"], expected_calls_A, expected_contexts)
test_result_A = self.run_event_test(["seed"], expected_calls_B, expected_contexts)
self.assertTrue(test_result_A or test_result_B)
@use_profile("postgres")
def test__postgres_event_tracking_models(self):
expected_calls = [
expected_calls_A = [
call(
category='dbt',
action='invocation',
@@ -449,6 +490,17 @@ class TestEventTrackingSuccess(TestEventTracking):
),
]
expected_calls_B = inserted(
call(
category='dbt',
action='experimental_parser',
label=ANY,
context=ANY
),
3,
expected_calls_A
)
hashed = '20ff78afb16c8b3b8f83861b1d3b99bd'
# this hashed contents field changes on azure postgres tests, I believe
# due to newlines again
@@ -478,20 +530,26 @@ class TestEventTrackingSuccess(TestEventTracking):
self.build_context('run', 'end', result_type='ok')
]
test_result = self.run_event_test(
test_result_A = self.run_event_test(
["run", "--model", "example", "example_2"],
expected_calls,
expected_calls_A,
expected_contexts
)
self.assertTrue(test_result)
test_result_B = self.run_event_test(
["run", "--model", "example", "example_2"],
expected_calls_A,
expected_contexts
)
self.assertTrue(test_result_A or test_result_B)
@use_profile("postgres")
def test__postgres_event_tracking_model_error(self):
# cmd = ["run", "--model", "model_error"]
# self.run_event_test(cmd, event_run_model_error, expect_pass=False)
expected_calls = [
expected_calls_A = [
call(
category='dbt',
action='invocation',
@@ -524,6 +582,17 @@ class TestEventTrackingSuccess(TestEventTracking):
),
]
expected_calls_B = inserted(
call(
category='dbt',
action='experimental_parser',
label=ANY,
context=ANY
),
3,
expected_calls_A
)
expected_contexts = [
self.build_context('run', 'start'),
self.load_context(),
@@ -539,14 +608,21 @@ class TestEventTrackingSuccess(TestEventTracking):
self.build_context('run', 'end', result_type='ok')
]
test_result = self.run_event_test(
test_result_A = self.run_event_test(
["run", "--model", "model_error"],
expected_calls,
expected_calls_A,
expected_contexts,
expect_pass=False
)
self.assertTrue(test_result)
test_result_B = self.run_event_test(
["run", "--model", "model_error"],
expected_calls_B,
expected_contexts,
expect_pass=False
)
self.assertTrue(test_result_A or test_result_B)
@use_profile("postgres")
def test__postgres_event_tracking_tests(self):
@@ -554,7 +630,7 @@ class TestEventTrackingSuccess(TestEventTracking):
self.run_dbt(["deps"])
self.run_dbt(["run", "--model", "example", "example_2"])
expected_calls = [
expected_calls_A = [
call(
category='dbt',
action='invocation',
@@ -581,6 +657,17 @@ class TestEventTrackingSuccess(TestEventTracking):
),
]
expected_calls_B = inserted(
call(
category='dbt',
action='experimental_parser',
label=ANY,
context=ANY
),
3,
expected_calls_A
)
expected_contexts = [
self.build_context('test', 'start'),
self.load_context(),
@@ -588,14 +675,21 @@ class TestEventTrackingSuccess(TestEventTracking):
self.build_context('test', 'end', result_type='ok')
]
test_result = self.run_event_test(
test_result_A = self.run_event_test(
["test"],
expected_calls,
expected_calls_A,
expected_contexts,
expect_pass=False
)
self.assertTrue(test_result)
test_result_B = self.run_event_test(
["test"],
expected_calls_A,
expected_contexts,
expect_pass=False
)
self.assertTrue(test_result_A or test_result_B)
class TestEventTrackingCompilationError(TestEventTracking):
@@ -676,7 +770,7 @@ class TestEventTrackingUnableToConnect(TestEventTracking):
@use_profile("postgres")
def test__postgres_event_tracking_unable_to_connect(self):
expected_calls = [
expected_calls_A = [
call(
category='dbt',
action='invocation',
@@ -703,6 +797,17 @@ class TestEventTrackingUnableToConnect(TestEventTracking):
),
]
expected_calls_B = inserted(
call(
category='dbt',
action='experimental_parser',
label=ANY,
context=ANY
),
3,
expected_calls_A
)
expected_contexts = [
self.build_context('run', 'start'),
self.load_context(),
@@ -710,14 +815,21 @@ class TestEventTrackingUnableToConnect(TestEventTracking):
self.build_context('run', 'end', result_type='error')
]
test_result = self.run_event_test(
test_result_A = self.run_event_test(
["run", "--target", "noaccess", "--models", "example"],
expected_calls,
expected_calls_A,
expected_contexts,
expect_pass=False
)
self.assertTrue(test_result)
test_result_B = self.run_event_test(
["run", "--target", "noaccess", "--models", "example"],
expected_calls_B,
expected_contexts,
expect_pass=False
)
self.assertTrue(test_result_A or test_result_B)
class TestEventTrackingSnapshot(TestEventTracking):
@@ -732,7 +844,7 @@ class TestEventTrackingSnapshot(TestEventTracking):
def test__postgres_event_tracking_snapshot(self):
self.run_dbt(["run", "--models", "snapshottable"])
expected_calls = [
expected_calls_A = [
call(
category='dbt',
action='invocation',
@@ -765,6 +877,17 @@ class TestEventTrackingSnapshot(TestEventTracking):
),
]
expected_calls_B = inserted(
call(
category='dbt',
action='experimental_parser',
label=ANY,
context=ANY
),
3,
expected_calls_A
)
# the model here has a raw_sql that contains the schema, which changes
expected_contexts = [
self.build_context('snapshot', 'start'),
@@ -781,13 +904,19 @@ class TestEventTrackingSnapshot(TestEventTracking):
self.build_context('snapshot', 'end', result_type='ok')
]
test_result = self.run_event_test(
test_result_A = self.run_event_test(
["snapshot"],
expected_calls,
expected_calls_A,
expected_contexts
)
self.assertTrue(test_result)
test_result_B = self.run_event_test(
["snapshot"],
expected_calls_B,
expected_contexts
)
self.assertTrue(test_result_A or test_result_B)
class TestEventTrackingCatalogGenerate(TestEventTracking):
@@ -796,7 +925,7 @@ class TestEventTrackingCatalogGenerate(TestEventTracking):
# create a model for the catalog
self.run_dbt(["run", "--models", "example"])
expected_calls = [
expected_calls_A = [
call(
category='dbt',
action='invocation',
@@ -823,6 +952,17 @@ class TestEventTrackingCatalogGenerate(TestEventTracking):
),
]
expected_calls_B = inserted(
call(
category='dbt',
action='experimental_parser',
label=ANY,
context=ANY
),
3,
expected_calls_A
)
expected_contexts = [
self.build_context('generate', 'start'),
self.load_context(),
@@ -830,10 +970,16 @@ class TestEventTrackingCatalogGenerate(TestEventTracking):
self.build_context('generate', 'end', result_type='ok')
]
test_result = self.run_event_test(
test_result_A = self.run_event_test(
["docs", "generate"],
expected_calls,
expected_calls_A,
expected_contexts
)
self.assertTrue(test_result)
test_result_B = self.run_event_test(
["docs", "generate"],
expected_calls_B,
expected_contexts
)
self.assertTrue(test_result_A or test_result_B)