feat: inspect columns

This commit is contained in:
Jacek Wojna
2023-10-07 20:05:49 +01:00
parent df37f575af
commit 1c645e270b
8 changed files with 151 additions and 3 deletions

View File

@@ -34,6 +34,9 @@ models:
int_all_dag_relationships:
# required for BigQuery, Redshift, and Databricks for performance/memory reasons
+materialized: "{{ 'table' if target.type in ['bigquery', 'redshift', 'databricks'] else 'view' }}"
int_all_columns:
# required for BigQuery, Redshift, and Databricks for performance/memory reasons
+materialized: "{{ 'table' if target.type in ['bigquery', 'redshift', 'databricks'] else 'view' }}"
dag:
+materialized: table
staging:

View File

@@ -1,5 +1,5 @@
{% macro insert_resources_from_graph(relation, resource_type='nodes', relationships=False, batch_size=var('insert_batch_size') | int) %}
{%- set values = get_resource_values(resource_type, relationships) -%}
{% macro insert_resources_from_graph(relation, resource_type='nodes', relationships=False, columns=False, batch_size=var('insert_batch_size') | int) %}
{%- set values = get_resource_values(resource_type, relationships, columns) -%}
{%- set values_length = values | length -%}
{%- set loop_count = (values_length / batch_size) | round(0, 'ceil') | int -%}

View File

@@ -0,0 +1,46 @@
{%- macro get_column_values(node_type) -%}
{# Dispatch wrapper: lets adapters override via dbt's adapter.dispatch mechanism. #}
{{ return(adapter.dispatch('get_column_values', 'dbt_project_evaluator')(node_type)) }}
{%- endmacro -%}
{# Default implementation: builds one row of SQL literal fragments per column
   defined on each node/source in the manifest graph. Returns a list of lists,
   consumed by insert_resources_from_graph when columns=True. #}
{%- macro default__get_column_values(node_type) -%}
{%- if execute -%}
{# Select which manifest collection to iterate; anything other than
   'nodes' or 'sources' is a caller error, so fail compilation loudly. #}
{%- if node_type == 'nodes' %}
{% set nodes_list = graph.nodes.values() %}
{%- elif node_type == 'sources' -%}
{% set nodes_list = graph.sources.values() %}
{%- else -%}
{{ exceptions.raise_compiler_error("node_type needs to be either nodes or sources, got " ~ node_type) }}
{% endif -%}
{%- set values = [] -%}
{%- for node in nodes_list -%}
{# One output row per (node, column) pair. #}
{%- for column in node.columns.values() -%}
{# Windows path backslashes are doubled so they survive inside SQL string literals. #}
{# NOTE(review): data_type and quote render as quoted strings when truthy
   (so quote becomes 'True'/'False' text, not a SQL boolean) or the literal
   null — the base_* shell tables must type these columns accordingly; confirm. #}
{%- set values_line =
[
wrap_string_with_quotes(node.unique_id),
wrap_string_with_quotes(node.name),
wrap_string_with_quotes(node.resource_type),
wrap_string_with_quotes(node.original_file_path | replace("\\","\\\\")),
wrap_string_with_quotes(node.database),
wrap_string_with_quotes(node.schema),
wrap_string_with_quotes(node.package_name),
wrap_string_with_quotes(node.alias),
wrap_string_with_quotes(dbt.escape_single_quotes(column.name)),
wrap_string_with_quotes(dbt.escape_single_quotes(column.description)),
'null' if not column.data_type else wrap_string_with_quotes(dbt.escape_single_quotes(column.data_type)),
'null' if not column.quote else wrap_string_with_quotes(dbt.escape_single_quotes(column.quote))
]
%}
{%- do values.append(values_line) -%}
{%- endfor -%}
{%- endfor -%}
{{ return(values) }}
{%- endif -%}
{%- endmacro -%}

View File

@@ -1,6 +1,8 @@
{% macro get_resource_values(resource=None, relationships=None) %}
{% macro get_resource_values(resource=None, relationships=None, columns=None) %}
{% if relationships %}
{{ return(adapter.dispatch('get_relationship_values', 'dbt_project_evaluator')(node_type=resource)) }}
{% elif columns %}
{{ return(adapter.dispatch('get_column_values', 'dbt_project_evaluator')(node_type=resource)) }}
{% elif resource == 'exposures' %}
{{ return(adapter.dispatch('get_exposure_values', 'dbt_project_evaluator')()) }}
{% elif resource == 'sources' %}

View File

@@ -0,0 +1,16 @@
-- Flattened column-level metadata: one row per column attached to a node or
-- source in the project graph, passed through unchanged from stg_columns.
select
    cols.unique_id,
    cols.node_name,
    cols.resource_type,
    cols.file_path,
    cols.database,
    cols.schema,
    cols.package_name,
    cols.alias,
    cols.name,
    cols.description,
    cols.data_type,
    cols.quote
from {{ ref('stg_columns') }} as cols

View File

@@ -0,0 +1,31 @@
{{
    config(
        materialized='table',
        post_hook="{{ insert_resources_from_graph(this, resource_type='nodes', columns=True) }}"
    )
}}

{% if execute %}
    {{ check_model_is_table(model) }}
{% endif %}

-- Empty shell table that fixes the schema for node column metadata; the
-- post-hook above repopulates it from the manifest graph after each run.
/* Bigquery won't let us `where` without `from` so we use this workaround */
with dummy_cte as (
    select 1 as foo
)

select
    cast(null as {{ dbt.type_string() }}) as unique_id,
    cast(null as {{ dbt.type_string() }}) as node_name,
    cast(null as {{ dbt.type_string() }}) as resource_type,
    cast(null as {{ dbt.type_string() }}) as file_path,
    cast(null as {{ dbt.type_string() }}) as database,
    cast(null as {{ dbt.type_string() }}) as schema,
    cast(null as {{ dbt.type_string() }}) as package_name,
    cast(null as {{ dbt.type_string() }}) as alias,
    cast(null as {{ dbt.type_string() }}) as name,
    cast(null as {{ dbt.type_string() }}) as description,
    cast(null as {{ dbt.type_string() }}) as data_type,
    cast(null as {{ dbt.type_string() }}) as quote
from dummy_cte
where false

View File

@@ -0,0 +1,31 @@
{{
    config(
        materialized='table',
        post_hook="{{ insert_resources_from_graph(this, resource_type='sources', columns=True) }}"
    )
}}

{% if execute %}
    {{ check_model_is_table(model) }}
{% endif %}

-- Empty shell table that fixes the schema for source column metadata; the
-- post-hook above repopulates it from the manifest graph after each run.
/* Bigquery won't let us `where` without `from` so we use this workaround */
with dummy_cte as (
    select 1 as foo
)

select
    cast(null as {{ dbt.type_string() }}) as unique_id,
    cast(null as {{ dbt.type_string() }}) as node_name,
    cast(null as {{ dbt.type_string() }}) as resource_type,
    cast(null as {{ dbt.type_string() }}) as file_path,
    cast(null as {{ dbt.type_string() }}) as database,
    cast(null as {{ dbt.type_string() }}) as schema,
    cast(null as {{ dbt.type_string() }}) as package_name,
    cast(null as {{ dbt.type_string() }}) as alias,
    cast(null as {{ dbt.type_string() }}) as name,
    cast(null as {{ dbt.type_string() }}) as description,
    cast(null as {{ dbt.type_string() }}) as data_type,
    -- Fix: was `cast(False as boolean)`, which disagreed with base_node_columns
    -- (string) and with get_column_values, which inserts quote as a quoted
    -- string literal ('True'/'False') or null. A boolean column here fails the
    -- post-hook insert on type-strict warehouses and misaligns types in
    -- dbt_utils.union_relations downstream.
    cast(null as {{ dbt.type_string() }}) as quote
from dummy_cte
where false

View File

@@ -0,0 +1,19 @@
{{
    config(
        materialized='table',
    )
}}

{% if execute %}
    {{ check_model_is_table(model) }}
{% endif %}

-- Column metadata for every node and source, combined and deduplicated.
select distinct *
from (
    {{ dbt_utils.union_relations([
        ref('base_node_columns'),
        ref('base_source_columns')
    ]) }}
) as unioned