fixed typo

remove result from load_all_timing event test
add temp debug lines
2025-12-19 18:01:26 +00:00 · 2021-07-09 12:04:59 -04:00 · 2021-07-09 11:49:15 -04:00 · 2021-07-09 10:20:35 -04:00 · 2021-07-08 21:29:26 -04:00 · 2021-07-08 21:28:57 -04:00
4 changed files with 307 additions and 65 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,7 @@ Contributors:

 ### Under the hood

+- Add reporting for experimental parser correctness checks [#3528](https://github.com/dbt-labs/dbt/pull/3528)
 - Swap experimental parser implementation to use Rust [#3497](https://github.com/fishtown-analytics/dbt/pull/3497)
 - Dispatch the core SQL statement of the new test materialization, to benefit adapter maintainers ([#3465](https://github.com/fishtown-analytics/dbt/pull/3465), [#3461](https://github.com/fishtown-analytics/dbt/pull/3461))
 - Minimal validation of yaml dictionaries prior to partial parsing ([#3246](https://github.com/fishtown-analytics/dbt/issues/3246), [#3460](https://github.com/fishtown-analytics/dbt/pull/3460))
--- a/core/dbt/parser/models.py
+++ b/core/dbt/parser/models.py
@@ -4,7 +4,12 @@ import dbt.flags as flags
 from dbt.node_types import NodeType
 from dbt.parser.base import SimpleSQLParser
 from dbt.parser.search import FileBlock
+import dbt.tracking as tracking
+from dbt import utils
 from dbt_extractor import ExtractionError, py_extract_from_source  # type: ignore
+import itertools
+import random
+from typing import Any, Dict, List, Tuple


 class ModelParser(SimpleSQLParser[ParsedModelNode]):
@@ -26,46 +31,122 @@ class ModelParser(SimpleSQLParser[ParsedModelNode]):
    ) -> None:
        self.manifest._parsing_info.static_analysis_path_count += 1

-        # normal dbt run
-        if not flags.USE_EXPERIMENTAL_PARSER:
-            super().render_update(node, config)
+        # `True` roughly 1/100 times this function is called
+        sample: bool = random.randint(1, 101) == 100

-        # if the --use-experimental-parser flag was set
-        else:
+        # run the experimental parser if the flag is on or if we're sampling
+        if flags.USE_EXPERIMENTAL_PARSER or sample:
            try:
-                # run dbt jinja extractor (powered by tree-sitter + rust)
-                # throws an exception if it can't parse the source
-                res = py_extract_from_source(node.raw_sql)
+                experimentally_parsed: Dict[str, List[Any]] = py_extract_from_source(node.raw_sql)

-                # since it doesn't need python jinja, fit the refs, sources, and configs
-                # into the node. Down the line the rest of the node will be updated with
-                # this information. (e.g. depends_on etc.)
-                config_calls = []
-                for c in res['configs']:
+                # second config format
+                config_calls: List[Dict[str, str]] = []
+                for c in experimentally_parsed['configs']:
                    config_calls.append({c[0]: c[1]})

-                config._config_calls = config_calls
+                # format sources TODO change extractor to match this type
+                source_calls: List[List[str]] = []
+                for s in experimentally_parsed['sources']:
+                    source_calls.append([s[0], s[1]])
+                experimentally_parsed['sources'] = source_calls

-                # this uses the updated config to set all the right things in the node
-                # if there are hooks present, it WILL render jinja. Will need to change
-                # when we support hooks
-                self.update_parsed_node(node, config)
+            except ExtractionError as e:
+                experimentally_parsed = e

-                # udpate the unrendered config with values from the file
-                # values from yaml files are in there already
-                node.unrendered_config.update(dict(res['configs']))
+        # normal dbt run
+        if not flags.USE_EXPERIMENTAL_PARSER:
+            # normal rendering
+            super().render_update(node, config)
+            # if we're sampling, compare for correctness
+            if sample:
+                result: List[str] = []
+                # experimental parser couldn't parse
+                if isinstance(experimentally_parsed, Exception):
+                    result += ["01_experimental_parser_cannot_parse"]
+                else:
+                    # rearrange existing configs to match:
+                    real_configs: List[Tuple[str, Any]] = list(
+                        itertools.chain.from_iterable(
+                            map(lambda x: x.items(), config._config_calls)
+                        )
+                    )

-                # set refs, sources, and configs on the node object
-                node.refs = node.refs + res['refs']
-                for sourcev in res['sources']:
-                    # TODO change extractor to match type here
-                    node.sources.append([sourcev[0], sourcev[1]])
-                for configv in res['configs']:
-                    node.config[configv[0]] = configv[1]
+                    # look for false positive configs
+                    for c in experimentally_parsed['configs']:
+                        if c not in real_configs:
+                            result += ["02_false_positive_config_value"]
+                            break

-                self.manifest._parsing_info.static_analysis_parsed_path_count += 1
+                    # look for missed configs
+                    for c in real_configs:
+                        if c not in experimentally_parsed['configs']:
+                            result += ["03_missed_config_value"]
+                            break

-            # exception was thrown by dbt jinja extractor meaning it can't
-            # handle this source. fall back to python jinja rendering.
-            except ExtractionError:
-                super().render_update(node, config)
+                    # look for false positive sources
+                    for s in experimentally_parsed['sources']:
+                        if s not in node.sources:
+                            result += ["04_false_positive_source_value"]
+                            break
+
+                    # look for missed sources
+                    for s in node.sources:
+                        if s not in experimentally_parsed['sources']:
+                            result += ["05_missed_source_value"]
+                            break
+
+                    # look for false positive refs
+                    for r in experimentally_parsed['refs']:
+                        if r not in node.refs:
+                            result += ["06_false_positive_ref_value"]
+                            break
+
+                    # look for missed refs
+                    for r in node.refs:
+                        if r not in experimentally_parsed['refs']:
+                            result += ["07_missed_ref_value"]
+                            break
+
+                    # if there are no errors, return a success value
+                    if not result:
+                        result = ["00_exact_match"]
+
+                    # fire a tracking event. this fires one event for every sample
+                    # so that we have data on a per file basis. Not only can we expect
+                    # no false positives or misses, we can expect the number model
+                    # files parseable by the experimental parser to match our internal
+                    # testing.
+                    tracking.track_experimental_parser_sample({
+                        "project_id": self.root_project.hashed_name(),
+                        "file_id": utils.get_hash(node),
+                        "status": result
+                    })
+
+        # if the --use-experimental-parser flag was set, and the experimental parser succeeded
+        elif not isinstance(experimentally_parsed, Exception):
+            # since it doesn't need python jinja, fit the refs, sources, and configs
+            # into the node. Down the line the rest of the node will be updated with
+            # this information. (e.g. depends_on etc.)
+            config._config_calls = config_calls
+
+            # this uses the updated config to set all the right things in the node.
+            # if there are hooks present, it WILL render jinja. Will need to change
+            # when the experimental parser supports hooks
+            self.update_parsed_node(node, config)
+
+            # update the unrendered config with values from the file.
+            # values from yaml files are in there already
+            node.unrendered_config.update(dict(experimentally_parsed['configs']))
+
+            # set refs, sources, and configs on the node object
+            node.refs += experimentally_parsed['refs']
+            node.sources += experimentally_parsed['sources']
+            for configv in experimentally_parsed['configs']:
+                node.config[configv[0]] = configv[1]
+
+            self.manifest._parsing_info.static_analysis_parsed_path_count += 1
+
+        # the experimental parser tried and failed on this model.
+        # fall back to python jinja rendering.
+        else:
+            super().render_update(node, config)
--- a/core/dbt/tracking.py
+++ b/core/dbt/tracking.py
@@ -30,7 +30,7 @@ RPC_REQUEST_SPEC = 'iglu:com.dbt/rpc_request/jsonschema/1-0-1'
 DEPRECATION_WARN_SPEC = 'iglu:com.dbt/deprecation_warn/jsonschema/1-0-0'
 LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-3'
 RESOURCE_COUNTS = 'iglu:com.dbt/resource_counts/jsonschema/1-0-0'
-
+EXPERIMENTAL_PARSER = 'iglu:com.dbt/experimental_parser/jsonschema/1-0-0'
 DBT_INVOCATION_ENV = 'DBT_INVOCATION_ENV'


@@ -423,6 +423,20 @@ def track_invalid_invocation(
    )


+def track_experimental_parser_sample(options):
+    context = [SelfDescribingJson(EXPERIMENTAL_PARSER, options)]
+    assert active_user is not None, \
+        'Cannot track project loading time when active user is None'
+
+    track(
+        active_user,
+        category='dbt',
+        action='experimental_parser',
+        label=active_user.invocation_id,
+        context=context
+    )
+
+
 def flush():
    logger.debug("Flushing usage events")
    tracker.flush()
--- a/test/integration/033_event_tracking_test/test_events.py
+++ b/test/integration/033_event_tracking_test/test_events.py
@@ -10,6 +10,17 @@ import dbt.tracking
 import dbt.utils


+# immutably creates a new array with the value inserted at the index
+def inserted(value, index, arr):
+    x = []
+    for i in range(0, len(arr)):
+        if i == index:
+            x.append(value)
+            x.append(arr[i])
+        else:
+            x.append(arr[i])
+        return x
+
 class TestEventTracking(DBTIntegrationTest):
    maxDiff = None

@@ -240,7 +251,7 @@ class TestEventTrackingSuccess(TestEventTracking):

    @use_profile("postgres")
    def test__postgres_event_tracking_compile(self):
-        expected_calls = [
+        expected_calls_A = [
            call(
                category='dbt',
                action='invocation',
@@ -267,6 +278,17 @@ class TestEventTrackingSuccess(TestEventTracking):
            ),
        ]

+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
        expected_contexts = [
            self.build_context('compile', 'start'),
            self.load_context(),
@@ -274,13 +296,19 @@ class TestEventTrackingSuccess(TestEventTracking):
            self.build_context('compile', 'end', result_type='ok')
        ]

-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
            ["compile", "--vars", "sensitive_thing: abc"],
-            expected_calls,
+            expected_calls_A,
            expected_contexts
        )

-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["compile", "--vars", "sensitive_thing: abc"],
+            expected_calls_B,
+            expected_contexts
+        )
+
+        self.assertTrue(test_result_A or test_result_B)

    @use_profile("postgres")
    def test__postgres_event_tracking_deps(self):
@@ -364,7 +392,7 @@ class TestEventTrackingSuccess(TestEventTracking):
                },
            }]

-        expected_calls = [
+        expected_calls_A = [
            call(
                category='dbt',
                action='invocation',
@@ -397,6 +425,17 @@ class TestEventTrackingSuccess(TestEventTracking):
            ),
        ]

+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
        expected_contexts = [
            self.build_context('seed', 'start'),
            self.load_context(),
@@ -405,12 +444,14 @@ class TestEventTrackingSuccess(TestEventTracking):
            self.build_context('seed', 'end', result_type='ok')
        ]

-        test_result = self.run_event_test(["seed"], expected_calls, expected_contexts)
-        self.assertTrue(test_result)
+        test_result_A = self.run_event_test(["seed"], expected_calls_A, expected_contexts)
+        test_result_A = self.run_event_test(["seed"], expected_calls_B, expected_contexts)
+        
+        self.assertTrue(test_result_A or test_result_B)

    @use_profile("postgres")
    def test__postgres_event_tracking_models(self):
-        expected_calls = [
+        expected_calls_A = [
            call(
                category='dbt',
                action='invocation',
@@ -449,6 +490,17 @@ class TestEventTrackingSuccess(TestEventTracking):
            ),
        ]

+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
        hashed = '20ff78afb16c8b3b8f83861b1d3b99bd'
        # this hashed contents field changes on azure postgres tests, I believe
        # due to newlines again
@@ -478,20 +530,26 @@ class TestEventTrackingSuccess(TestEventTracking):
            self.build_context('run', 'end', result_type='ok')
        ]

-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
            ["run", "--model", "example", "example_2"],
-            expected_calls,
+            expected_calls_A,
            expected_contexts
        )

-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["run", "--model", "example", "example_2"],
+            expected_calls_A,
+            expected_contexts
+        )
+
+        self.assertTrue(test_result_A or test_result_B)

    @use_profile("postgres")
    def test__postgres_event_tracking_model_error(self):
        # cmd = ["run", "--model", "model_error"]
        # self.run_event_test(cmd, event_run_model_error, expect_pass=False)

-        expected_calls = [
+        expected_calls_A = [
            call(
                category='dbt',
                action='invocation',
@@ -524,6 +582,17 @@ class TestEventTrackingSuccess(TestEventTracking):
            ),
        ]

+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
        expected_contexts = [
            self.build_context('run', 'start'),
            self.load_context(),
@@ -539,14 +608,21 @@ class TestEventTrackingSuccess(TestEventTracking):
            self.build_context('run', 'end', result_type='ok')
        ]

-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
            ["run", "--model", "model_error"],
-            expected_calls,
+            expected_calls_A,
            expected_contexts,
            expect_pass=False
        )

-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["run", "--model", "model_error"],
+            expected_calls_B,
+            expected_contexts,
+            expect_pass=False
+        )
+
+        self.assertTrue(test_result_A or test_result_B)

    @use_profile("postgres")
    def test__postgres_event_tracking_tests(self):
@@ -554,7 +630,7 @@ class TestEventTrackingSuccess(TestEventTracking):
        self.run_dbt(["deps"])
        self.run_dbt(["run", "--model", "example", "example_2"])

-        expected_calls = [
+        expected_calls_A = [
            call(
                category='dbt',
                action='invocation',
@@ -581,6 +657,17 @@ class TestEventTrackingSuccess(TestEventTracking):
            ),
        ]

+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
        expected_contexts = [
            self.build_context('test', 'start'),
            self.load_context(),
@@ -588,14 +675,21 @@ class TestEventTrackingSuccess(TestEventTracking):
            self.build_context('test', 'end', result_type='ok')
        ]

-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
            ["test"],
-            expected_calls,
+            expected_calls_A,
            expected_contexts,
            expect_pass=False
        )

-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["test"],
+            expected_calls_A,
+            expected_contexts,
+            expect_pass=False
+        )
+
+        self.assertTrue(test_result_A or test_result_B)


 class TestEventTrackingCompilationError(TestEventTracking):
@@ -676,7 +770,7 @@ class TestEventTrackingUnableToConnect(TestEventTracking):

    @use_profile("postgres")
    def test__postgres_event_tracking_unable_to_connect(self):
-        expected_calls = [
+        expected_calls_A = [
            call(
                category='dbt',
                action='invocation',
@@ -703,6 +797,17 @@ class TestEventTrackingUnableToConnect(TestEventTracking):
            ),
        ]

+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
        expected_contexts = [
            self.build_context('run', 'start'),
            self.load_context(),
@@ -710,14 +815,21 @@ class TestEventTrackingUnableToConnect(TestEventTracking):
            self.build_context('run', 'end', result_type='error')
        ]

-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
            ["run", "--target", "noaccess", "--models", "example"],
-            expected_calls,
+            expected_calls_A,
            expected_contexts,
            expect_pass=False
        )

-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["run", "--target", "noaccess", "--models", "example"],
+            expected_calls_B,
+            expected_contexts,
+            expect_pass=False
+        )
+
+        self.assertTrue(test_result_A or test_result_B)


 class TestEventTrackingSnapshot(TestEventTracking):
@@ -732,7 +844,7 @@ class TestEventTrackingSnapshot(TestEventTracking):
    def test__postgres_event_tracking_snapshot(self):
        self.run_dbt(["run", "--models", "snapshottable"])

-        expected_calls = [
+        expected_calls_A = [
            call(
                category='dbt',
                action='invocation',
@@ -765,6 +877,17 @@ class TestEventTrackingSnapshot(TestEventTracking):
            ),
        ]

+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
        # the model here has a raw_sql that contains the schema, which changes
        expected_contexts = [
            self.build_context('snapshot', 'start'),
@@ -781,13 +904,19 @@ class TestEventTrackingSnapshot(TestEventTracking):
            self.build_context('snapshot', 'end', result_type='ok')
        ]

-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
            ["snapshot"],
-            expected_calls,
+            expected_calls_A,
            expected_contexts
        )

-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["snapshot"],
+            expected_calls_B,
+            expected_contexts
+        )
+
+        self.assertTrue(test_result_A or test_result_B)


 class TestEventTrackingCatalogGenerate(TestEventTracking):
@@ -796,7 +925,7 @@ class TestEventTrackingCatalogGenerate(TestEventTracking):
        # create a model for the catalog
        self.run_dbt(["run", "--models", "example"])

-        expected_calls = [
+        expected_calls_A = [
            call(
                category='dbt',
                action='invocation',
@@ -823,6 +952,17 @@ class TestEventTrackingCatalogGenerate(TestEventTracking):
            ),
        ]

+        expected_calls_B = inserted(
+            call(
+                category='dbt',
+                action='experimental_parser',
+                label=ANY,
+                context=ANY
+            ),
+            3,
+            expected_calls_A
+        )
+
        expected_contexts = [
            self.build_context('generate', 'start'),
            self.load_context(),
@@ -830,10 +970,16 @@ class TestEventTrackingCatalogGenerate(TestEventTracking):
            self.build_context('generate', 'end', result_type='ok')
        ]

-        test_result = self.run_event_test(
+        test_result_A = self.run_event_test(
            ["docs", "generate"],
-            expected_calls,
+            expected_calls_A,
            expected_contexts
        )

-        self.assertTrue(test_result)
+        test_result_B = self.run_event_test(
+            ["docs", "generate"],
+            expected_calls_B,
+            expected_contexts
+        )
+
+        self.assertTrue(test_result_A or test_result_B)
Author	SHA1	Message	Date
Nathaniel May	11543c99eb	fixed typo	2021-07-09 12:04:59 -04:00
Nathaniel May	ae7b0a935c	remove result from load_all_timing event test	2021-07-09 11:49:15 -04:00
Nathaniel May	7b510c4803	add temp debug lines	2021-07-09 10:20:35 -04:00
Nathaniel May	fc7faffb19	Merge branch 'experimental-parser-correctness-checks' of github.com:fishtown-analytics/dbt into experimental-parser-correctness-checks	2021-07-08 21:29:26 -04:00
Nathaniel May	88c9506e04	use new test values	2021-07-08 21:28:57 -04:00
Nathaniel May	27cdeb6468	hook up tracking for real	2021-07-08 21:06:17 -04:00
Nathaniel May	7f583053c2	fix tracking	2021-07-08 21:06:17 -04:00
Nathaniel May	7893ab6e30	changelog	2021-07-08 21:06:17 -04:00
Nathaniel May	42a27933a9	add type annotations	2021-07-08 21:06:17 -04:00
Nathaniel May	515383320b	fix config checks	2021-07-08 21:06:17 -04:00
Nathaniel May	dbf613a1eb	fix set typing	2021-07-08 21:06:17 -04:00
Nathaniel May	69f7efe028	add comment to tracking result	2021-07-08 21:06:17 -04:00
Nathaniel May	16501781c5	add result set to tracking	2021-07-08 21:06:17 -04:00
Nathaniel May	761951681d	address PR feedback	2021-07-08 21:06:17 -04:00
Nathaniel May	8f065db04d	update comments	2021-07-08 21:06:17 -04:00
Nathaniel May	31adfe14ab	comment fixes	2021-07-08 21:06:17 -04:00
Nathaniel May	16841d3886	add correctness tracking for experimental parser	2021-07-08 21:06:17 -04:00
Nathaniel May	b3af8f1a0c	hook up tracking for real	2021-07-08 15:14:44 -04:00
Nathaniel May	f826d795e4	fix tracking	2021-07-07 17:28:21 -04:00
Nathaniel May	05e7f97a9f	changelog	2021-07-06 17:14:14 -04:00
Nathaniel May	686cf6b836	add type annotations	2021-07-06 17:08:40 -04:00
Nathaniel May	7aba21ee47	fix config checks	2021-07-06 16:59:01 -04:00
Nathaniel May	e9ec739b24	fix set typing	2021-07-06 14:54:59 -04:00
Nathaniel May	a894fe540e	add comment to tracking result	2021-07-06 14:17:21 -04:00
Nathaniel May	b20ca48442	add result set to tracking	2021-07-06 14:11:48 -04:00
Nathaniel May	ed0e689c5a	address PR feedback	2021-07-06 13:43:39 -04:00
Nathaniel May	e8301461d3	update comments	2021-07-06 11:45:16 -04:00
Nathaniel May	89d3d9af88	comment fixes	2021-07-01 18:10:08 -04:00
Nathaniel May	1a428e5bc3	add correctness tracking for experimental parser	2021-07-01 18:03:52 -04:00