feat: inspect columns

This commit is contained in:
Jacek Wojna
2023-10-07 20:05:49 +01:00
parent df37f575af
commit 1c645e270b
8 changed files with 151 additions and 3 deletions

View File

@@ -34,6 +34,9 @@ models:
int_all_dag_relationships:
# required for BigQuery, Redshift, and Databricks for performance/memory reasons
+materialized: "{{ 'table' if target.type in ['bigquery', 'redshift', 'databricks'] else 'view' }}"
int_all_columns:
# required for BigQuery, Redshift, and Databricks for performance/memory reasons
+materialized: "{{ 'table' if target.type in ['bigquery', 'redshift', 'databricks'] else 'view' }}"
dag:
+materialized: table
staging:

View File

@@ -1,5 +1,5 @@
{% macro insert_resources_from_graph(relation, resource_type='nodes', relationships=False, batch_size=var('insert_batch_size') | int) %}
{%- set values = get_resource_values(resource_type, relationships) -%}
{% macro insert_resources_from_graph(relation, resource_type='nodes', relationships=False, columns=False, batch_size=var('insert_batch_size') | int) %}
{%- set values = get_resource_values(resource_type, relationships, columns) -%}
{%- set values_length = values | length -%}
{%- set loop_count = (values_length / batch_size) | round(0, 'ceil') | int -%}

View File

@@ -0,0 +1,46 @@
{%- macro get_column_values(node_type) -%}
{# Dispatch wrapper: lets adapters override via dbt's adapter.dispatch mechanism. #}
{{ return(adapter.dispatch('get_column_values', 'dbt_project_evaluator')(node_type)) }}
{%- endmacro -%}
{# Default implementation: builds one row of SQL literal fragments per column
   defined on each node/source in the manifest graph. Returns a list of lists,
   consumed by insert_resources_from_graph when columns=True. #}
{%- macro default__get_column_values(node_type) -%}
{%- if execute -%}
{# Select which manifest collection to iterate; anything other than
   'nodes' or 'sources' is a caller error, so fail compilation loudly. #}
{%- if node_type == 'nodes' %}
{% set nodes_list = graph.nodes.values() %}
{%- elif node_type == 'sources' -%}
{% set nodes_list = graph.sources.values() %}
{%- else -%}
{{ exceptions.raise_compiler_error("node_type needs to be either nodes or sources, got " ~ node_type) }}
{% endif -%}
{%- set values = [] -%}
{%- for node in nodes_list -%}
{# One output row per (node, column) pair. #}
{%- for column in node.columns.values() -%}
{# Windows path backslashes are doubled so they survive inside SQL string literals. #}
{# NOTE(review): data_type and quote render as quoted strings when truthy
   (so quote becomes 'True'/'False' text, not a SQL boolean) or the literal
   null — the base_* shell tables must type these columns accordingly; confirm. #}
{%- set values_line =
[
wrap_string_with_quotes(node.unique_id),
wrap_string_with_quotes(node.name),
wrap_string_with_quotes(node.resource_type),
wrap_string_with_quotes(node.original_file_path | replace("\\","\\\\")),
wrap_string_with_quotes(node.database),
wrap_string_with_quotes(node.schema),
wrap_string_with_quotes(node.package_name),
wrap_string_with_quotes(node.alias),
wrap_string_with_quotes(dbt.escape_single_quotes(column.name)),
wrap_string_with_quotes(dbt.escape_single_quotes(column.description)),
'null' if not column.data_type else wrap_string_with_quotes(dbt.escape_single_quotes(column.data_type)),
'null' if not column.quote else wrap_string_with_quotes(dbt.escape_single_quotes(column.quote))
]
%}
{%- do values.append(values_line) -%}
{%- endfor -%}
{%- endfor -%}
{{ return(values) }}
{%- endif -%}
{%- endmacro -%}

View File

@@ -1,6 +1,8 @@
{% macro get_resource_values(resource=None, relationships=None) %}
{% macro get_resource_values(resource=None, relationships=None, columns=None) %}
{% if relationships %}
{{ return(adapter.dispatch('get_relationship_values', 'dbt_project_evaluator')(node_type=resource)) }}
{% elif columns %}
{{ return(adapter.dispatch('get_column_values', 'dbt_project_evaluator')(node_type=resource)) }}
{% elif resource == 'exposures' %}
{{ return(adapter.dispatch('get_exposure_values', 'dbt_project_evaluator')()) }}
{% elif resource == 'sources' %}

View File

@@ -0,0 +1,16 @@
-- Flattened column-level metadata: one row per column attached to a node or
-- source in the project graph, passed through unchanged from stg_columns.
select
    cols.unique_id,
    cols.node_name,
    cols.resource_type,
    cols.file_path,
    cols.database,
    cols.schema,
    cols.package_name,
    cols.alias,
    cols.name,
    cols.description,
    cols.data_type,
    cols.quote
from {{ ref('stg_columns') }} as cols

View File

@@ -0,0 +1,31 @@
{{
    config(
        materialized='table',
        post_hook="{{ insert_resources_from_graph(this, resource_type='nodes', columns=True) }}"
    )
}}

{% if execute %}
    {{ check_model_is_table(model) }}
{% endif %}

-- Empty shell table that fixes the schema for node column metadata; the
-- post-hook above repopulates it from the manifest graph after each run.
/* Bigquery won't let us `where` without `from` so we use this workaround */
with dummy_cte as (
    select 1 as foo
)

select
    cast(null as {{ dbt.type_string() }}) as unique_id,
    cast(null as {{ dbt.type_string() }}) as node_name,
    cast(null as {{ dbt.type_string() }}) as resource_type,
    cast(null as {{ dbt.type_string() }}) as file_path,
    cast(null as {{ dbt.type_string() }}) as database,
    cast(null as {{ dbt.type_string() }}) as schema,
    cast(null as {{ dbt.type_string() }}) as package_name,
    cast(null as {{ dbt.type_string() }}) as alias,
    cast(null as {{ dbt.type_string() }}) as name,
    cast(null as {{ dbt.type_string() }}) as description,
    cast(null as {{ dbt.type_string() }}) as data_type,
    cast(null as {{ dbt.type_string() }}) as quote
from dummy_cte
where false

View File

@@ -0,0 +1,31 @@
{{
    config(
        materialized='table',
        post_hook="{{ insert_resources_from_graph(this, resource_type='sources', columns=True) }}"
    )
}}

{% if execute %}
    {{ check_model_is_table(model) }}
{% endif %}

-- Empty shell table that fixes the schema for source column metadata; the
-- post-hook above repopulates it from the manifest graph after each run.
/* Bigquery won't let us `where` without `from` so we use this workaround */
with dummy_cte as (
    select 1 as foo
)

select
    cast(null as {{ dbt.type_string() }}) as unique_id,
    cast(null as {{ dbt.type_string() }}) as node_name,
    cast(null as {{ dbt.type_string() }}) as resource_type,
    cast(null as {{ dbt.type_string() }}) as file_path,
    cast(null as {{ dbt.type_string() }}) as database,
    cast(null as {{ dbt.type_string() }}) as schema,
    cast(null as {{ dbt.type_string() }}) as package_name,
    cast(null as {{ dbt.type_string() }}) as alias,
    cast(null as {{ dbt.type_string() }}) as name,
    cast(null as {{ dbt.type_string() }}) as description,
    cast(null as {{ dbt.type_string() }}) as data_type,
    -- Fix: was `cast(False as boolean)`, which disagreed with base_node_columns
    -- (string) and with get_column_values, which inserts quote as a quoted
    -- string literal ('True'/'False') or null. A boolean column here fails the
    -- post-hook insert on type-strict warehouses and misaligns types in
    -- dbt_utils.union_relations downstream.
    cast(null as {{ dbt.type_string() }}) as quote
from dummy_cte
where false

View File

@@ -0,0 +1,19 @@
{{
    config(
        materialized='table',
    )
}}

{% if execute %}
    {{ check_model_is_table(model) }}
{% endif %}

-- Column metadata for every node and source, combined and deduplicated.
select distinct *
from (
    {{ dbt_utils.union_relations([
        ref('base_node_columns'),
        ref('base_source_columns')
    ]) }}
) as unioned