Postgres: Implement UTF-8 identifier support (#7242)

This commit is contained in:
adnandaut
2025-11-07 13:52:30 +01:00
committed by GitHub
parent 3b2f728a74
commit 1002ce7de5
9 changed files with 251 additions and 9 deletions

2
sqlfluffrs/Cargo.lock generated
View File

@@ -504,7 +504,7 @@ dependencies = [
[[package]]
name = "sqlfluffrs"
version = "0.1.0"
version = "4.0.0-alpha.1"
dependencies = [
"bincode",
"env_logger",

View File

@@ -1317,7 +1317,7 @@ pub static DUCKDB_LEXERS: Lazy<Vec<LexMatcher>> = Lazy::new(|| { vec![
LexMatcher::regex_lexer(
Dialect::Duckdb,
"word",
r#"[a-zA-Z_][0-9a-zA-Z_$]*"#,
r#"[\p{L}_][\p{L}\p{N}_$]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::word_token(raw, pos_marker, TokenConfig {

View File

@@ -1294,7 +1294,7 @@ pub static GREENPLUM_LEXERS: Lazy<Vec<LexMatcher>> = Lazy::new(|| { vec![
LexMatcher::regex_lexer(
Dialect::Greenplum,
"word",
r#"[a-zA-Z_][0-9a-zA-Z_$]*"#,
r#"[\p{L}_][\p{L}\p{N}_$]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::word_token(raw, pos_marker, TokenConfig {

View File

@@ -1249,7 +1249,7 @@ pub static MATERIALIZE_LEXERS: Lazy<Vec<LexMatcher>> = Lazy::new(|| { vec![
LexMatcher::regex_lexer(
Dialect::Materialize,
"word",
r#"[a-zA-Z_][0-9a-zA-Z_$]*"#,
r#"[\p{L}_][\p{L}\p{N}_$]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::word_token(raw, pos_marker, TokenConfig {

View File

@@ -1290,7 +1290,7 @@ pub static POSTGRES_LEXERS: Lazy<Vec<LexMatcher>> = Lazy::new(|| { vec![
LexMatcher::regex_lexer(
Dialect::Postgres,
"word",
r#"[a-zA-Z_][0-9a-zA-Z_$]*"#,
r#"[\p{L}_][\p{L}\p{N}_$]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::word_token(raw, pos_marker, TokenConfig {

View File

@@ -293,7 +293,7 @@ postgres_dialect.patch_lexer_matchers(
WhitespaceSegment,
),
),
RegexLexer("word", r"[a-zA-Z_][0-9a-zA-Z_$]*", WordSegment),
RegexLexer("word", r"[\p{L}_][\p{L}\p{N}_$]*", WordSegment),
]
)
@@ -493,8 +493,8 @@ postgres_dialect.replace(
# Generate the anti template from the set of reserved keywords
lambda dialect: RegexParser(
# Can't begin with $ or digits,
# must only contain digits, letters, underscore or $
r"[A-Z_][A-Z0-9_$]*",
# must only contain digits, letters (including Unicode), underscore or $
r"[\p{L}_][\p{L}\p{N}_$]*",
IdentifierSegment,
type="naked_identifier",
anti_template=r"^(" + r"|".join(dialect.sets("reserved_keywords")) + r")$",

View File

@@ -0,0 +1,32 @@
-- Postgres should work with Unicode identifiers in various places
SELECT größe, länge FROM measurements;
SELECT
width AS größe,
height AS höhe
FROM dimensions;
SELECT 'Não' reativação FROM table1;
SELECT αλφα, βητα, γαμμα FROM greek_letters;
SELECT москва, санкт_петербург FROM cities;
SELECT field1 AS 名前 FROM users;
SELECT
id,
größe,
description
FROM products;
WITH größen AS (
SELECT * FROM base_table
)
SELECT * FROM größen;
CREATE TABLE größen (
länge INTEGER,
breite INTEGER
);

View File

@@ -0,0 +1,210 @@
# YML test files are auto-generated from SQL files and should not be edited by
# hand. To help enforce this, the "hash" field in the file must match a hash
# computed by SQLFluff when running the tests. Please run
# `python test/generate_parse_fixture_yml.py` to generate them after adding or
# altering SQL files.
_hash: de152e05441c785e19af9f4e5142cee92514b0fd949340f450e052e1e25598fd
file:
- statement:
select_statement:
select_clause:
- keyword: SELECT
- select_clause_element:
column_reference:
naked_identifier: größe
- comma: ','
- select_clause_element:
column_reference:
naked_identifier: länge
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
naked_identifier: measurements
- statement_terminator: ;
- statement:
select_statement:
select_clause:
- keyword: SELECT
- select_clause_element:
column_reference:
naked_identifier: width
alias_expression:
alias_operator:
keyword: AS
naked_identifier: größe
- comma: ','
- select_clause_element:
column_reference:
naked_identifier: height
alias_expression:
alias_operator:
keyword: AS
naked_identifier: höhe
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
naked_identifier: dimensions
- statement_terminator: ;
- statement:
select_statement:
select_clause:
keyword: SELECT
select_clause_element:
quoted_literal: "'Não'"
alias_expression:
naked_identifier: reativação
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
naked_identifier: table1
- statement_terminator: ;
- statement:
select_statement:
select_clause:
- keyword: SELECT
- select_clause_element:
column_reference:
naked_identifier: αλφα
- comma: ','
- select_clause_element:
column_reference:
naked_identifier: βητα
- comma: ','
- select_clause_element:
column_reference:
naked_identifier: γαμμα
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
naked_identifier: greek_letters
- statement_terminator: ;
- statement:
select_statement:
select_clause:
- keyword: SELECT
- select_clause_element:
column_reference:
naked_identifier: москва
- comma: ','
- select_clause_element:
column_reference:
naked_identifier: санкт_петербург
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
naked_identifier: cities
- statement_terminator: ;
- statement:
select_statement:
select_clause:
keyword: SELECT
select_clause_element:
column_reference:
naked_identifier: field1
alias_expression:
alias_operator:
keyword: AS
naked_identifier: 名前
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
naked_identifier: users
- statement_terminator: ;
- statement:
select_statement:
select_clause:
- keyword: SELECT
- select_clause_element:
column_reference:
naked_identifier: id
- comma: ','
- select_clause_element:
column_reference:
naked_identifier: größe
- comma: ','
- select_clause_element:
column_reference:
naked_identifier: description
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
naked_identifier: products
- statement_terminator: ;
- statement:
with_compound_statement:
keyword: WITH
common_table_expression:
naked_identifier: größen
keyword: AS
bracketed:
start_bracket: (
select_statement:
select_clause:
keyword: SELECT
select_clause_element:
wildcard_expression:
wildcard_identifier:
star: '*'
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
naked_identifier: base_table
end_bracket: )
select_statement:
select_clause:
keyword: SELECT
select_clause_element:
wildcard_expression:
wildcard_identifier:
star: '*'
from_clause:
keyword: FROM
from_expression:
from_expression_element:
table_expression:
table_reference:
naked_identifier: größen
- statement_terminator: ;
- statement:
create_table_statement:
- keyword: CREATE
- keyword: TABLE
- table_reference:
naked_identifier: größen
- bracketed:
- start_bracket: (
- column_definition:
naked_identifier: länge
data_type:
keyword: INTEGER
- comma: ','
- column_definition:
naked_identifier: breite
data_type:
keyword: INTEGER
- end_bracket: )
- statement_terminator: ;

View File

@@ -39,7 +39,7 @@ commands =
# environment is invoked. Leaving the trailing comma ensures that this
# environment still installs the relevant plugins.
{py,winpy}{39,310,311,312,313,}: python -m pip install "{toxinidir}/plugins/sqlfluff-plugin-example"
rust: python -m pip install --force-reinstall --no-deps --find-links="{toxinidir}/dist" sqlfluffrs
rust: python -m pip install --force-reinstall --no-deps --no-index --find-links="{toxinidir}/dist" sqlfluffrs
# Clean up from previous tests
python "{toxinidir}/util.py" clean-tests
# Run tests