Write an implementation of the lexer in Rust (#7132)

Co-authored-by: Alan Cruickshank <alanmcruickshank@gmail.com>
Authored by: Cameron
Date: 2025-10-23 17:49:35 -04:00
Committed by: GitHub
parent b1a9d8a436
commit 6124c61a7c
108 changed files with 41538 additions and 43 deletions

View File

@@ -25,12 +25,15 @@ on:
required: false
type: boolean
default: false
with-rust:
required: true
type: string
secrets:
gh_token:
required: true
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ inputs.python-version }}-${{ inputs.marks }}-${{ inputs.coverage }}
group: ${{ github.workflow }}-${{ github.ref }}-${{ inputs.python-version }}-${{ inputs.marks }}-${{ inputs.coverage }}-${{ inputs.with-rust }}
cancel-in-progress: true
jobs:
@@ -49,6 +52,14 @@ jobs:
setup.cfg
requirements_dev.txt
- name: Download built wheels
if: ${{ inputs.with-rust == '-rust' }}
uses: actions/download-artifact@v4
with:
path: ./dist
pattern: wheels-*
merge-multiple: true
- name: Install dependencies
run: pip install tox
@@ -69,10 +80,10 @@ jobs:
# NOTE: We have a separate job for coverage reporting because
# it impacts performance and slows the test suite significantly.
if: ${{ inputs.coverage }}
run: tox -e py${{ steps.py_version.outputs.PYVERSION }} -- --cov=sqlfluff -n 2 test -m "${{ inputs.marks }}" --durations=16 --verbosity=0
run: tox -e py${{ steps.py_version.outputs.PYVERSION }}${{ inputs.with-rust }} -- --cov=sqlfluff -n 2 test -m "${{ inputs.marks }}" --durations=16 --verbosity=0
- name: Run the tests (without coverage)
if: ${{ !inputs.coverage }}
run: tox -e py${{ steps.py_version.outputs.PYVERSION }} -- -n 2 test -m "${{ inputs.marks }}" --durations=16 --verbosity=0
run: tox -e py${{ steps.py_version.outputs.PYVERSION }}${{ inputs.with-rust }} -- -n 2 test -m "${{ inputs.marks }}" --durations=16 --verbosity=0
- name: Rename coverage files with suffix
# NOTE: We do this because we're using the same tox environment for multiple
@@ -88,7 +99,7 @@ jobs:
uses: actions/upload-artifact@v4
if: ${{ inputs.coverage }}
with:
name: coverage-data-py${{ inputs.python-version }}-${{ inputs.marks }}
name: coverage-data-py${{ inputs.python-version }}-${{ inputs.marks }}${{ inputs.with-rust }}
path: ".coverage.*"
if-no-files-found: ignore
include-hidden-files: true

View File

@@ -45,6 +45,7 @@ jobs:
"mypy",
"mypyc",
"doctests",
"check-rs",
]
include:
# Default to most recent python version
@@ -64,9 +65,144 @@ jobs:
- name: Run the tests
run: tox -e ${{ matrix.job }}
rs-build-linux:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: ubuntu-latest
target: x86_64
- runner: ubuntu-latest
target: x86
- runner: ubuntu-latest
target: aarch64
- runner: ubuntu-latest
target: armv7
- runner: ubuntu-latest
target: s390x
- runner: ubuntu-latest
target: ppc64le
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter --manifest-path sqlfluffrs/Cargo.toml
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
manylinux: auto
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-linux-${{ matrix.platform.target }}
path: dist
rs-build-musllinux:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: ubuntu-latest
target: x86_64
- runner: ubuntu-latest
target: x86
- runner: ubuntu-latest
target: aarch64
- runner: ubuntu-latest
target: armv7
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter --manifest-path sqlfluffrs/Cargo.toml
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
manylinux: musllinux_1_2
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-musllinux-${{ matrix.platform.target }}
path: dist
rs-build-windows:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: windows-latest
target: x64
- runner: windows-latest
target: x86
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
architecture: ${{ matrix.platform.target }}
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter --manifest-path sqlfluffrs/Cargo.toml
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-windows-${{ matrix.platform.target }}
path: dist
rs-build-macos:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: macos-13
target: x86_64
- runner: macos-14
target: aarch64
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter --manifest-path sqlfluffrs/Cargo.toml
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-macos-${{ matrix.platform.target }}
path: dist
rs-build-sdist:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build sdist
uses: PyO3/maturin-action@v1
with:
command: sdist
args: --out dist --manifest-path sqlfluffrs/Cargo.toml
- name: Upload sdist
uses: actions/upload-artifact@v4
with:
name: wheels-sdist
path: dist
# Test with coverage tracking on most recent python (py313).
python-version-tests:
name: Python Tests
needs: rs-build-linux
strategy:
matrix:
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
@@ -77,6 +213,7 @@ jobs:
# Override coverage to be true for most recent python version.
- python-version: "3.13"
coverage: true
with-rust: [ "-rust", "" ]
permissions:
contents: read
pull-requests: write
@@ -84,6 +221,7 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
coverage: ${{ matrix.coverage }}
with-rust: ${{ matrix.with-rust }}
secrets:
gh_token: ${{ secrets.github_token }}
@@ -114,9 +252,12 @@ jobs:
gh_token: ${{ secrets.github_token }}
dialect-tests:
name: Dialect ${{ matrix.marks }}
name: Dialect ${{ matrix.marks }}${{ matrix.with-rust }}
needs: rs-build-linux
strategy:
matrix:
marks: [ "parse_suite", "fix_suite", "rules_suite" ]
with-rust: [ "-rust", "" ]
include:
# This runs the bulk of the dialect _parsing_ tests.
#
@@ -149,6 +290,7 @@ jobs:
python-version: "3.13"
marks: ${{ matrix.marks }}
coverage: ${{ matrix.coverage }}
with-rust: ${{ matrix.with-rust }}
secrets:
gh_token: ${{ secrets.github_token }}

View File

@@ -0,0 +1,172 @@
name: Publish SQLFluff-rs PyPI Version
on:
release:
types:
- published
workflow_dispatch:
permissions:
contents: read
jobs:
linux:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: ubuntu-22.04
target: x86_64
- runner: ubuntu-22.04
target: x86
- runner: ubuntu-22.04
target: aarch64
- runner: ubuntu-22.04
target: armv7
- runner: ubuntu-22.04
target: s390x
- runner: ubuntu-22.04
target: ppc64le
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter --manifest-path sqlfluffrs/Cargo.toml
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
manylinux: auto
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-linux-${{ matrix.platform.target }}
path: dist
musllinux:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: ubuntu-22.04
target: x86_64
- runner: ubuntu-22.04
target: x86
- runner: ubuntu-22.04
target: aarch64
- runner: ubuntu-22.04
target: armv7
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter --manifest-path sqlfluffrs/Cargo.toml
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
manylinux: musllinux_1_2
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-musllinux-${{ matrix.platform.target }}
path: dist
windows:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: windows-latest
target: x64
- runner: windows-latest
target: x86
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
architecture: ${{ matrix.platform.target }}
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter --manifest-path sqlfluffrs/Cargo.toml
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-windows-${{ matrix.platform.target }}
path: dist
macos:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: macos-13
target: x86_64
- runner: macos-14
target: aarch64
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter --manifest-path sqlfluffrs/Cargo.toml
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-macos-${{ matrix.platform.target }}
path: dist
sdist:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build sdist
uses: PyO3/maturin-action@v1
with:
command: sdist
args: --out dist --manifest-path sqlfluffrs/Cargo.toml
- name: Upload sdist
uses: actions/upload-artifact@v4
with:
name: wheels-sdist
path: dist
release:
name: Release
runs-on: ubuntu-latest
if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }}
needs: [linux, musllinux, windows, macos, sdist]
permissions:
# Use to sign the release artifacts
id-token: write
# Used to upload release artifacts
contents: write
# Used to generate artifact attestation
attestations: write
steps:
- uses: actions/download-artifact@v4
- name: Generate artifact attestation
uses: actions/attest-build-provenance@v2
with:
subject-path: 'wheels-*/*'
- name: Publish to PyPI
if: ${{ startsWith(github.ref, 'refs/tags/') }}
uses: PyO3/maturin-action@v1
env:
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
with:
command: upload
args: --non-interactive --skip-existing wheels-*/*

View File

@@ -36,7 +36,7 @@ repos:
hooks:
- id: black
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.14.1
rev: v1.18.1
hooks:
- id: mypy
additional_dependencies:
@@ -58,7 +58,7 @@ repos:
pathspec,
pytest, # and by extension... pluggy
click,
platformdirs
platformdirs,
]
files: ^src/sqlfluff/.*
# The mypy pre-commit hook by default sets a few arguments that we don't normally

View File

@@ -224,6 +224,10 @@ for development, and which parts of the test suite you may find most useful.
runs to specific dialects to further improve iteration speed. e.g.
- `tox -e generate-fixture-yml -- -d mysql` will run just the mysql tests.
- `python test/generate_parse_fixture_yml.py -d mysql` will do the same.
As you make changes to a dialect, you will also need to regenerate the Rust
dialects to keep them in sync. To do this, run `tox -e generate-rs` (if using
tox) or, with sqlfluff installed in a virtual environment, run
`utils/rustify.py build` to resync the dialects.
2. Developing for the dbt templater should only require running the dbt test
suite (see below).
3. Developing rules and rule plugins there are a couple of scenarios.

View File

@@ -7,12 +7,10 @@ build-backend = "setuptools.build_meta"
name = "sqlfluff"
version = "3.5.0"
description = "The SQL Linter for Humans"
readme = {file = "README.md", content-type = "text/markdown"}
readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.9"
authors = [
{name = "Alan Cruickshank", email = "alan@designingoverload.com"},
]
license = {file = "LICENSE.md"}
authors = [{ name = "Alan Cruickshank", email = "alan@designingoverload.com" }]
license = { file = "LICENSE.md" }
classifiers = [
"Development Status :: 5 - Production/Stable",
"Environment :: Console",
@@ -99,6 +97,9 @@ dependencies = [
"tqdm",
]
[project.optional-dependencies]
rs = ["sqlfluffrs~=0.1.0"]
[project.urls]
Homepage = "https://www.sqlfluff.com"
Documentation = "https://docs.sqlfluff.com"
@@ -148,9 +149,7 @@ root_package = "sqlfluff"
[[tool.importlinter.contracts]]
name = "Forbid dependencies outside core"
type = "forbidden"
source_modules = [
"sqlfluff.core",
]
source_modules = ["sqlfluff.core"]
forbidden_modules = [
"sqlfluff.api",
"sqlfluff.cli",
@@ -162,12 +161,8 @@ forbidden_modules = [
[[tool.importlinter.contracts]]
name = "API may not depend on CLI"
type = "forbidden"
source_modules = [
"sqlfluff.api",
]
forbidden_modules = [
"sqlfluff.cli",
]
source_modules = ["sqlfluff.api"]
forbidden_modules = ["sqlfluff.cli"]
[[tool.importlinter.contracts]]
name = "Helper methods must be internally independent"
@@ -222,6 +217,7 @@ warn_unused_ignores = true
strict_equality = true
extra_checks = true
no_implicit_reexport = true
mypy_path = "$MYPY_CONFIG_FILE_DIR/sqlfluffrs"
# skip type checking for 3rd party packages for which stubs are not available
[[tool.mypy.overrides]]
@@ -232,7 +228,6 @@ ignore_missing_imports = true
module = "tblib.*"
ignore_missing_imports = true
[tool.ruff.lint]
extend-select = ["I", "D"]
@@ -280,7 +275,7 @@ ignore-path = "docs/source/_partials/"
skip = "*/test/fixtures/*,*/.*,*/pyproject.toml"
check-hidden = true
quiet-level=2
quiet-level = 2
# ignore-regex = '\\[fnrstv]'
builtin = "clear,rare,informal,names"
@@ -288,7 +283,7 @@ ignore-words-list = "fo,ws,falsy,coo,inout,deque,crate,trough,ro,mange,identifer
# ignore-words = "dev/tools/codespell/codespell-ignore.txt"
# exclude-file = "dev/tools/codespell/codespell-lines-ignore.txt"
uri-ignore-words-list="crate"
uri-ignore-words-list = "crate"
# For future reference: it is not currently possible to specify
# the standard dictionary and the custom dictionary in the configuration

72
sqlfluffrs/.gitignore vendored Normal file
View File

@@ -0,0 +1,72 @@
/target
# Byte-compiled / optimized / DLL files
__pycache__/
.pytest_cache/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
.venv/
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
include/
man/
venv/
*.egg-info/
.installed.cfg
*.egg
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
pip-selfcheck.json
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml
# Translations
*.mo
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# Rope
.ropeproject
# Django stuff:
*.log
*.pot
.DS_Store
# Sphinx documentation
docs/_build/
# PyCharm
.idea/
# VSCode
.vscode/
# Pyenv
.python-version

731
sqlfluffrs/Cargo.lock generated Normal file
View File

@@ -0,0 +1,731 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "allocator-api2"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
[[package]]
name = "anstream"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
[[package]]
name = "anstyle-parse"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
"windows-sys",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e"
dependencies = [
"anstyle",
"once_cell",
"windows-sys",
]
[[package]]
name = "arc-swap"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457"
[[package]]
name = "autocfg"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "bincode"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
dependencies = [
"serde",
]
[[package]]
name = "bit-set"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
dependencies = [
"bit-vec",
]
[[package]]
name = "bit-vec"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
[[package]]
name = "bumpalo"
version = "3.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "colorchoice"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "either"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
[[package]]
name = "env_filter"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0"
dependencies = [
"log",
"regex",
]
[[package]]
name = "env_logger"
version = "0.11.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f"
dependencies = [
"anstream",
"anstyle",
"env_filter",
"jiff",
"log",
]
[[package]]
name = "equivalent"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
[[package]]
name = "fancy-regex"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f"
dependencies = [
"bit-set",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "foldhash"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "getrandom"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasi",
]
[[package]]
name = "hashbrown"
version = "0.15.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash",
]
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "indoc"
version = "2.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itertools"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "jiff"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49"
dependencies = [
"jiff-static",
"log",
"portable-atomic",
"portable-atomic-util",
"serde",
]
[[package]]
name = "jiff-static"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "js-sys"
version = "0.3.80"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "852f13bec5eba4ba9afbeb93fd7c13fe56147f055939ae21c43a29a0ecb2702e"
dependencies = [
"once_cell",
"wasm-bindgen",
]
[[package]]
name = "libc"
version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "log"
version = "0.4.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "memoffset"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "portable-atomic"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
[[package]]
name = "portable-atomic-util"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
dependencies = [
"portable-atomic",
]
[[package]]
name = "proc-macro2"
version = "1.0.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99"
dependencies = [
"unicode-ident",
]
[[package]]
name = "pyo3"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383"
dependencies = [
"hashbrown",
"indoc",
"libc",
"memoffset",
"once_cell",
"portable-atomic",
"pyo3-build-config",
"pyo3-ffi",
"pyo3-macros",
"unindent",
"uuid",
]
[[package]]
name = "pyo3-build-config"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f"
dependencies = [
"target-lexicon",
]
[[package]]
name = "pyo3-ffi"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105"
dependencies = [
"libc",
"pyo3-build-config",
]
[[package]]
name = "pyo3-log"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "833e6fdc21553e9938d9443050ed3c7787ac3c1a1aefccbd03dfae0c7a4be529"
dependencies = [
"arc-swap",
"log",
"pyo3",
]
[[package]]
name = "pyo3-macros"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded"
dependencies = [
"proc-macro2",
"pyo3-macros-backend",
"quote",
"syn",
]
[[package]]
name = "pyo3-macros-backend"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf"
dependencies = [
"heck",
"proc-macro2",
"pyo3-build-config",
"quote",
"syn",
]
[[package]]
name = "quote"
version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
dependencies = [
"proc-macro2",
]
[[package]]
name = "r-efi"
version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
[[package]]
name = "regex"
version = "1.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "rustversion"
version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
[[package]]
name = "ryu"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
[[package]]
name = "serde"
version = "1.0.225"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd6c24dee235d0da097043389623fb913daddf92c76e9f5a1db88607a0bcbd1d"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.225"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "659356f9a0cb1e529b24c01e43ad2bdf520ec4ceaf83047b83ddcc2251f96383"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.225"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ea936adf78b1f766949a4977b91d2f5595825bd6ec079aa9543ad2685fc4516"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.145"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
"serde_core",
]
[[package]]
name = "slotmap"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbff4acf519f630b3a3ddcfaea6c06b42174d9a44bc70c620e9ed1649d58b82a"
dependencies = [
"version_check",
]
[[package]]
name = "sqlfluffrs"
version = "0.1.0"
dependencies = [
"bincode",
"env_logger",
"fancy-regex",
"hashbrown",
"itertools",
"log",
"once_cell",
"pyo3",
"pyo3-log",
"regex",
"serde",
"serde_json",
"slotmap",
"uuid",
]
[[package]]
name = "syn"
version = "2.0.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "target-lexicon"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a"
[[package]]
name = "unicode-ident"
version = "1.0.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034"
[[package]]
name = "unindent"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce"
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "uuid"
version = "1.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2"
dependencies = [
"getrandom",
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "version_check"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "wasi"
version = "0.14.7+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c"
dependencies = [
"wasip2",
]
[[package]]
name = "wasip2"
version = "1.0.1+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7"
dependencies = [
"wit-bindgen",
]
[[package]]
name = "wasm-bindgen"
version = "0.2.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab10a69fbd0a177f5f649ad4d8d3305499c42bab9aef2f7ff592d0ec8f833819"
dependencies = [
"cfg-if",
"once_cell",
"rustversion",
"wasm-bindgen-macro",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0bb702423545a6007bbc368fde243ba47ca275e549c8a28617f56f6ba53b1d1c"
dependencies = [
"bumpalo",
"log",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc65f4f411d91494355917b605e1480033152658d71f722a90647f56a70c88a0"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffc003a991398a8ee604a401e194b6b3a39677b3173d6e74495eb51b82e99a32"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "293c37f4efa430ca14db3721dfbe48d8c33308096bd44d80ebaa775ab71ba1cf"
dependencies = [
"unicode-ident",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "wit-bindgen"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"

32
sqlfluffrs/Cargo.toml Normal file
View File

@@ -0,0 +1,32 @@
[package]
name = "sqlfluffrs"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "sqlfluffrs"
crate-type = ["cdylib"]
[features]
unicode = []
python = ["unicode", "pyo3"]
[dependencies]
env_logger = "0.11.8"
fancy-regex = "0.16.2"
hashbrown = "0.15.5"
itertools = "0.14.0"
log = "0.4.28"
once_cell = "1.21.3"
pyo3 = { version = "0.26.0", optional = true, features = ["hashbrown", "extension-module", "uuid"] }
pyo3-log = { version = "0.13.0", optional = true }
regex = { version = "1.11.2", features = ["perf"] }
slotmap = "1.0.7"
uuid = { version = "1.18.1", features = ["v4"] }
serde = { version = "1.0.225", features = ["derive"] }
serde_json = "1.0.145"
bincode = "1.3.3"
[dev-dependencies]
env_logger = "0.11.6"

21
sqlfluffrs/LICENSE.md Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 Alan Cruickshank
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

20
sqlfluffrs/README.md Normal file
View File

@@ -0,0 +1,20 @@
# SQLFluff-rs
This package is an optional installation for [SQLFluff](https://github.com/sqlfluff/sqlfluff) and is **not** intended to be used as a standalone linting solution.
## Purpose
SQLFluff-rs serves as a Rust-based component that can be integrated with the main SQLFluff package. It is currently in development and should be considered experimental.
## Installation
This package is installed automatically when SQLFluff is installed with the appropriate optional dependencies. Direct installation or standalone usage is not supported.
To install from pip:
```sh
pip install sqlfluff[rs]
```
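
As a rough sanity check that the extra installed the compiled extension (assuming the build succeeded for your platform), the module should be importable directly:

```python
# Minimal check, assuming `pip install sqlfluff[rs]` succeeded:
# the compiled extension is exposed as the `sqlfluffrs` module.
import sqlfluffrs

print(sqlfluffrs.__file__)  # path of the installed extension module
```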
## Development Status
This is a supplementary component and is not meant to replace or function independently of the main SQLFluff package. For SQL linting, please use the main [SQLFluff](https://github.com/sqlfluff/sqlfluff) package.

0
sqlfluffrs/py.typed Normal file
View File

43
sqlfluffrs/pyproject.toml Normal file
View File

@@ -0,0 +1,43 @@
[build-system]
requires = ["maturin>=1.8,<2.0"]
build-backend = "maturin"
[project]
name = "sqlfluffrs"
readme = { file = "README.md", content-type = "text/markdown" }
license = { file = "LICENSE.md" }
description = "The SQL Linter for Humans"
requires-python = ">=3.9"
classifiers = [
"Development Status :: 3 - Alpha",
"Environment :: Console",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Operating System :: Unix",
"Operating System :: POSIX",
"Operating System :: MacOS",
"Operating System :: Microsoft :: Windows",
"Programming Language :: Rust",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"Programming Language :: SQL",
"Topic :: Utilities",
"Topic :: Software Development :: Quality Assurance",
]
dynamic = ["version"]
[project.urls]
Homepage = "https://www.sqlfluff.com"
Documentation = "https://docs.sqlfluff.com"
Source = "https://github.com/sqlfluff/sqlfluff"
"Issue Tracker" = "https://github.com/sqlfluff/sqlfluff/issues"
[tool.maturin]
features = ["pyo3/extension-module", "python"]

143
sqlfluffrs/sqlfluffrs.pyi Normal file
View File

@@ -0,0 +1,143 @@
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
from uuid import UUID
if TYPE_CHECKING:
from sqlfluff.core.config import FluffConfig
from sqlfluff.core.parser.lexer import StringLexer
from sqlfluff.core.parser.segments import SourceFix
from sqlfluff.core.templaters import TemplatedFile
SerializedObject = dict[str, Union[str, int, bool, list["SerializedObject"]]]
TupleSerialisedSegment = tuple[str, Union[str, tuple["TupleSerialisedSegment", ...]]]
class Slice: ...
class RsRawFileSlice:
raw: str
slice_type: str
source_idx: int
block_idx: int
tag: Optional[str]
class RsTemplatedFileSlice:
slice_type: str
source_slice: Slice
templated_slice: Slice
class RsTemplatedFile:
source_str: str
fname: str
templated_str: str
sliced_file: List[RsTemplatedFileSlice]
raw_sliced: List[RsRawFileSlice]
class RsPositionMarker:
source_slice: slice
templated_slice: slice
templated_file: RsTemplatedFile
working_line_no: int
working_line_pos: int
class RsToken:
raw: str
pos_marker: RsPositionMarker
type: str
uuid: Optional[int]
source_fixes: Optional[list["SourceFix"]]
def raw_trimmed(self) -> str: ...
@property
def is_templated(self) -> bool: ...
@property
def is_code(self) -> bool: ...
@property
def is_meta(self) -> bool: ...
@property
def source_str(self) -> str: ...
@property
def block_type(self) -> str: ...
@property
def block_uuid(self) -> Optional[UUID]: ...
@property
def cache_key(self) -> str: ...
@property
def trim_start(self) -> Optional[tuple[str]]: ...
@property
def trim_chars(self) -> Optional[tuple[str]]: ...
@property
def quoted_value(self) -> Optional[tuple[str, int | str]]: ...
@property
def escape_replacements(self) -> Optional[list[tuple[str, str]]]: ...
def count_segments(self, raw_only: bool = False) -> int: ...
def get_type(self) -> str: ...
def recursive_crawl(
self,
seg_type: Tuple[str, ...],
recurse_into: bool,
no_recursive_seg_type: Optional[Union[str, List[str]]] = None,
allow_self: bool = True,
) -> List["RsToken"]: ...
def recursive_crawl_all(self, reverse: bool) -> List["RsToken"]: ...
@property
def segments(self) -> List["RsToken"]: ...
def path_to(self, other: "RsToken") -> List[Any]: ...
def get_start_loc(self) -> Tuple[int, int]: ...
def get_end_loc(self) -> Tuple[int, int]: ...
@property
def raw_segments(self) -> List["RsToken"]: ...
def copy(
self,
segments: Optional[List["RsToken"]] = None,
parent: Optional[Any] = None,
parent_idx: Optional[int] = None,
) -> "RsToken": ...
def edit(
self,
raw: Optional[str] = None,
source_fixes: Optional[List[Any]] = None,
) -> "RsToken": ...
def to_tuple(
self,
code_only: Optional[bool] = None,
show_raw: Optional[bool] = None,
include_meta: Optional[bool] = None,
) -> TupleSerialisedSegment: ...
def __repr__(self) -> str: ...
@property
def instance_types(self) -> List[str]: ...
class RsSQLLexerError:
desc: str
line_no: int
line_pos: int
ignore: bool
warning: bool
fatal: bool
def __init__(
self,
msg: Optional[str] = None,
pos: Optional[RsPositionMarker] = None,
line_no: int = 0,
line_pos: int = 0,
ignore: bool = False,
warning: bool = False,
fatal: bool = False,
) -> None: ...
def rule_code(self) -> str: ...
def rule_name(self) -> str: ...
def source_signature(self) -> Tuple[Tuple[str, int, int], str]: ...
def to_dict(self) -> SerializedObject: ...
def ignore_if_in(self, ignore_iterable: list[str]) -> None: ...
def warning_if_in(self, ignore_iterable: list[str]) -> None: ...
class RsLexer:
def __init__(
self,
config: Optional["FluffConfig"] = None,
last_resort_lexer: Optional["StringLexer"] = None,
dialect: Optional[str] = None,
): ...
def _lex(
self, lex_input: Union[str, "TemplatedFile"]
) -> Tuple[List[RsToken], List[Any]]: ...
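
For orientation, here is a minimal, hypothetical sketch of driving these bindings directly, based only on the stub signatures above. In practice the Rust lexer is wired in through SQLFluff's own lexer machinery and `_lex` is an internal method, so treat this as illustrative rather than a supported API:

```python
# Hypothetical usage sketch based on the sqlfluffrs.pyi stub above.
# Assumes the sqlfluffrs extension is installed (e.g. via `pip install sqlfluff[rs]`).
from sqlfluffrs import RsLexer

lexer = RsLexer(dialect="ansi")           # dialect selected by name, per __init__ above
tokens, errors = lexer._lex("SELECT 1;")  # (List[RsToken], List[...]) per the stub

for token in tokens:
    print(token.get_type(), repr(token.raw))
```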

View File

@@ -0,0 +1,44 @@
#[derive(Clone)]
pub struct FluffConfig {
pub dialect: Option<String>,
pub template_blocks_indent: bool,
}
impl FluffConfig {
pub fn new(dialect: Option<String>, template_blocks_indent: bool) -> Self {
Self {
dialect,
template_blocks_indent,
}
}
}
#[cfg(feature = "python")]
pub mod python {
use pyo3::{
prelude::*,
types::{PyDict, PyDictMethods},
};
use super::FluffConfig;
#[derive(Clone)]
pub struct PyFluffConfig(pub FluffConfig);
impl<'py> FromPyObject<'py> for PyFluffConfig {
fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
let configs = ob.getattr("_configs")?;
let configs_dict = configs.downcast::<PyDict>()?;
let core = configs_dict.get_item("core").ok().flatten().unwrap();
let core_dict = core.downcast::<PyDict>()?;
let dialect = core_dict
.get_item("dialect")
.ok()
.flatten()
.and_then(|x| x.extract::<String>().ok());
Ok(Self(FluffConfig::new(dialect, true)))
}
}
}

View File

@@ -0,0 +1 @@
pub mod fluffconfig;

View File

@@ -0,0 +1,895 @@
/* This is a generated file! */
use once_cell::sync::Lazy;
use crate::matcher::{LexMatcher, extract_nested_block_comment};
use crate::token::Token;
use crate::token::config::TokenConfig;
use crate::regex::RegexModeGroup;
use crate::dialect::Dialect;
use hashbrown::HashSet;
pub static ANSI_KEYWORDS: Lazy<Vec<String>> = Lazy::new(|| { vec![
"CASE".to_string(),
"CROSS".to_string(),
"FULL".to_string(),
"IGNORE".to_string(),
"INNER".to_string(),
"INTERVAL".to_string(),
"JOIN".to_string(),
"LEFT".to_string(),
"NATURAL".to_string(),
"NOT".to_string(),
"NULL".to_string(),
"ON".to_string(),
"ORDER".to_string(),
"OUTER".to_string(),
"PARTITION".to_string(),
"RESPECT".to_string(),
"RIGHT".to_string(),
"ROWS".to_string(),
"SELECT".to_string(),
"SET".to_string(),
"UNION".to_string(),
"USING".to_string(),
]});
pub static ANSI_LEXERS: Lazy<Vec<LexMatcher>> = Lazy::new(|| { vec![
LexMatcher::regex_lexer(
Dialect::Ansi,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Ansi,
"inline_comment",
r#"(--|#)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
Some(vec![String::from("--"), String::from("#")]),
None,
None,
None,
None,
None,
|input| input.starts_with(['#','-','/']),
None,
),
LexMatcher::regex_lexer(
Dialect::Ansi,
"block_comment",
r#"\/\*([^\*]|\*(?!\/))*\*\/"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Ansi,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Ansi,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
None,
None,
None,
None,
None,
Some(extract_nested_block_comment),
|input| input.starts_with("/"),
None,
),
LexMatcher::regex_lexer(
Dialect::Ansi,
"single_quote",
r#"'([^'\\]|\\.|'')*'"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"'((?:[^'\\]|\\.|'')*)'"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\'|''"#.to_string(), r#"'"#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'\'', ..] => true, // Single quote case
[b'R' | b'r', b'\'', ..] => true, // r' or R'
[b'B' | b'b', b'\'', ..] => true, // b' or B'
[b'R' | b'r', b'B' | b'b', b'\'', ..] => true, // rb', RB', etc.
[b'B' | b'b', b'R' | b'r', b'\'', ..] => true, // br', Br', etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Ansi,
"double_quote",
r#""(""|[^"\\]|\\.)*""#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#""((?:[^"\\]|\\.)*)""#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\"|"""#.to_string(), r#"""#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'"', ..] => true, // Just a double quote
[b'R' | b'r', b'"', ..] => true, // r" or R"
[b'B' | b'b', b'"', ..] => true, // b" or B"
[b'R' | b'r', b'B' | b'b', b'"', ..] => true, // rb", RB", etc.
[b'B' | b'b', b'R' | b'r', b'"', ..] => true, // br", Br", etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Ansi,
"back_quote",
r#"`(?:[^`\\]|\\.)*`"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"`((?:[^`\\]|\\.)*)`"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\`"#.to_string(), r#"`"#.to_string())),
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Ansi,
"dollar_quote",
r#"\$(\w*)\$(.*?)\$\1\$"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"\$(\w*)\$(.*?)\$\1\$"#.to_string(), RegexModeGroup::Index(2))),
None,
None,
None,
|input| input.starts_with("$"),
None,
),
LexMatcher::regex_lexer(
Dialect::Ansi,
"numeric_literal",
r#"(?>\d+\.\d+|\d+\.(?![\.\w])|\.\d+|\d+)(\.?[eE][+-]?\d+)?((?<=\.)|(?=\b))"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::literal_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|input| input.starts_with(['x','X','.','0','1','2','3','4','5','6','7','8','9']),
None,
),
LexMatcher::regex_lexer(
Dialect::Ansi,
"obevo_annotation",
r#"////\s*(CHANGE|BODY|METADATA)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"glob_operator",
"~~~",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Ansi,
"like_operator",
r#"!?~~?\*?"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Ansi,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"casting_operator",
"::",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"equals",
"=",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"greater_than",
">",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"less_than",
"<",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"not",
"!",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"dot",
".",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"comma",
",",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"plus",
"+",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"minus",
"-",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"divide",
"/",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"percent",
"%",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"question",
"?",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"ampersand",
"&",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"vertical_bar",
"|",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"caret",
"^",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"star",
"*",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"start_bracket",
"(",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"end_bracket",
")",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"start_square_bracket",
"[",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"end_square_bracket",
"]",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"start_curly_bracket",
"{",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"end_curly_bracket",
"}",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"colon",
":",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Ansi,
"semicolon",
";",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Ansi,
"word",
r#"[0-9a-zA-Z_]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::word_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
]});

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

View File

@@ -0,0 +1,916 @@
/* This is a generated file! */
use once_cell::sync::Lazy;
use crate::matcher::{LexMatcher, extract_nested_block_comment};
use crate::token::Token;
use crate::token::config::TokenConfig;
use crate::regex::RegexModeGroup;
use crate::dialect::Dialect;
use hashbrown::HashSet;
pub static CLICKHOUSE_KEYWORDS: Lazy<Vec<String>> = Lazy::new(|| { vec![
"CASE".to_string(),
"CROSS".to_string(),
"FULL".to_string(),
"IGNORE".to_string(),
"INNER".to_string(),
"INTERVAL".to_string(),
"JOIN".to_string(),
"LEFT".to_string(),
"NATURAL".to_string(),
"NOT".to_string(),
"NULL".to_string(),
"ON".to_string(),
"ORDER".to_string(),
"OUTER".to_string(),
"PARTITION".to_string(),
"RESPECT".to_string(),
"RIGHT".to_string(),
"ROWS".to_string(),
"SELECT".to_string(),
"SET".to_string(),
"UNION".to_string(),
"USING".to_string(),
]});
pub static CLICKHOUSE_LEXERS: Lazy<Vec<LexMatcher>> = Lazy::new(|| { vec![
LexMatcher::regex_lexer(
Dialect::Clickhouse,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Clickhouse,
"inline_comment",
r#"(--|#)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
Some(vec![String::from("--"), String::from("#")]),
None,
None,
None,
None,
None,
|input| input.starts_with(['#','-','/']),
None,
),
LexMatcher::regex_lexer(
Dialect::Clickhouse,
"block_comment",
r#"\/\*([^\*]|\*(?!\/))*\*\/"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Clickhouse,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Clickhouse,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
None,
None,
None,
None,
None,
Some(extract_nested_block_comment),
|input| input.starts_with("/"),
None,
),
LexMatcher::regex_lexer(
Dialect::Clickhouse,
"single_quote",
r#"'([^'\\]|\\.|'')*'"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"'((?:[^'\\]|\\.|'')*)'"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\'|''"#.to_string(), r#"'"#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'\'', ..] => true, // Single quote case
[b'R' | b'r', b'\'', ..] => true, // r' or R'
[b'B' | b'b', b'\'', ..] => true, // b' or B'
[b'R' | b'r', b'B' | b'b', b'\'', ..] => true, // rb', RB', etc.
[b'B' | b'b', b'R' | b'r', b'\'', ..] => true, // br', Br', etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Clickhouse,
"double_quote",
r#""([^"\\]|""|\\.)*""#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#""((?:[^"\\]|""|\\.)*)""#.to_string(), RegexModeGroup::Index(1))),
Some((r#"(""|\\")"#.to_string(), r#"""#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'"', ..] => true, // Just a double quote
[b'R' | b'r', b'"', ..] => true, // r" or R"
[b'B' | b'b', b'"', ..] => true, // b" or B"
[b'R' | b'r', b'B' | b'b', b'"', ..] => true, // rb", RB", etc.
[b'B' | b'b', b'R' | b'r', b'"', ..] => true, // br", Br", etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Clickhouse,
"back_quote",
r#"`(?:[^`\\]|``|\\.)*`"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"`((?:[^`\\]|``|\\.)*)`"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"(``|\\`)"#.to_string(), r#"`"#.to_string())),
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Clickhouse,
"dollar_quote",
r#"\$(\w*)\$(.*?)\$\1\$"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"\$(\w*)\$(.*?)\$\1\$"#.to_string(), RegexModeGroup::Index(2))),
None,
None,
None,
|input| input.starts_with("$"),
None,
),
LexMatcher::regex_lexer(
Dialect::Clickhouse,
"numeric_literal",
r#"(?>\d+\.\d+|\d+\.(?![\.\w])|\.\d+|\d+)(\.?[eE][+-]?\d+)?((?<=\.)|(?=\b))"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::literal_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|input| input.starts_with(['x','X','.','0','1','2','3','4','5','6','7','8','9']),
None,
),
LexMatcher::regex_lexer(
Dialect::Clickhouse,
"obevo_annotation",
r#"////\s*(CHANGE|BODY|METADATA)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"glob_operator",
"~~~",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Clickhouse,
"like_operator",
r#"!?~~?\*?"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"lambda",
"->",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::symbol_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Clickhouse,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"casting_operator",
"::",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"equals",
"=",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"greater_than",
">",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"less_than",
"<",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"not",
"!",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"dot",
".",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"comma",
",",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"plus",
"+",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"minus",
"-",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"divide",
"/",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"percent",
"%",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"question",
"?",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"ampersand",
"&",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"vertical_bar",
"|",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"caret",
"^",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"star",
"*",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"start_bracket",
"(",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"end_bracket",
")",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"start_square_bracket",
"[",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"end_square_bracket",
"]",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"start_curly_bracket",
"{",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"end_curly_bracket",
"}",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"colon",
":",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Clickhouse,
"semicolon",
";",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Clickhouse,
"word",
r#"[0-9a-zA-Z_]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::word_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
]});
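A minimal, hypothetical smoke test for the generated tables above (not part of the generated file): forcing the lazily initialised statics once confirms that the Clickhouse keyword list and matcher table construct cleanly. The module and test names are illustrative assumptions.

#[cfg(test)]
mod clickhouse_tables_smoke {
    use super::{CLICKHOUSE_KEYWORDS, CLICKHOUSE_LEXERS};

    #[test]
    fn generated_tables_are_populated() {
        // Dereferencing the Lazy statics forces construction of both tables.
        assert!(CLICKHOUSE_KEYWORDS.iter().any(|kw| kw.as_str() == "SELECT"));
        assert!(!CLICKHOUSE_LEXERS.is_empty());
    }
}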

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

View File

@@ -0,0 +1,915 @@
/* This is a generated file! */
use once_cell::sync::Lazy;
use crate::matcher::{LexMatcher, extract_nested_block_comment};
use crate::token::Token;
use crate::token::config::TokenConfig;
use crate::regex::RegexModeGroup;
use crate::dialect::Dialect;
use hashbrown::HashSet;
pub static DB2_KEYWORDS: Lazy<Vec<String>> = Lazy::new(|| { vec![
"CASE".to_string(),
"CROSS".to_string(),
"FULL".to_string(),
"IGNORE".to_string(),
"INNER".to_string(),
"INTERVAL".to_string(),
"JOIN".to_string(),
"LEFT".to_string(),
"NOT".to_string(),
"NULL".to_string(),
"ON".to_string(),
"ORDER".to_string(),
"OUTER".to_string(),
"PARTITION".to_string(),
"RESPECT".to_string(),
"RIGHT".to_string(),
"ROWS".to_string(),
"SELECT".to_string(),
"SET".to_string(),
"UNION".to_string(),
"USING".to_string(),
]});
pub static DB2_LEXERS: Lazy<Vec<LexMatcher>> = Lazy::new(|| { vec![
LexMatcher::regex_lexer(
Dialect::Db2,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Db2,
"inline_comment",
r#"(--)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
Some(vec![String::from("-"), String::from("-")]),
None,
None,
None,
None,
None,
|input| input.starts_with(['#','-','/']),
None,
),
LexMatcher::regex_lexer(
Dialect::Db2,
"block_comment",
r#"\/\*([^\*]|\*(?!\/))*\*\/"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Db2,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Db2,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
None,
None,
None,
None,
None,
Some(extract_nested_block_comment),
|input| input.starts_with("/"),
None,
),
LexMatcher::regex_lexer(
Dialect::Db2,
"single_quote",
r#"'((?:[^']|'')*)'"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"'((?:[^']|'')*)'"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"''"#.to_string(), r#"'"#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'\'', ..] => true, // Single quote case
[b'R' | b'r', b'\'', ..] => true, // r' or R'
[b'B' | b'b', b'\'', ..] => true, // b' or B'
[b'R' | b'r', b'B' | b'b', b'\'', ..] => true, // rb', RB', etc.
[b'B' | b'b', b'R' | b'r', b'\'', ..] => true, // br', Br', etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Db2,
"double_quote",
r#""((?:[^"]|"")*)""#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#""((?:[^"]|"")*)""#.to_string(), RegexModeGroup::Index(1))),
Some((r#""""#.to_string(), r#"""#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'"', ..] => true, // Just a double quote
[b'R' | b'r', b'"', ..] => true, // r" or R"
[b'B' | b'b', b'"', ..] => true, // b" or B"
[b'R' | b'r', b'B' | b'b', b'"', ..] => true, // rb", RB", etc.
[b'B' | b'b', b'R' | b'r', b'"', ..] => true, // br", Br", etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Db2,
"back_quote",
r#"`(?:[^`\\]|\\.)*`"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"`((?:[^`\\]|\\.)*)`"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\`"#.to_string(), r#"`"#.to_string())),
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Db2,
"dollar_quote",
r#"\$(\w*)\$(.*?)\$\1\$"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"\$(\w*)\$(.*?)\$\1\$"#.to_string(), RegexModeGroup::Index(2))),
None,
None,
None,
|input| input.starts_with("$"),
None,
),
LexMatcher::regex_lexer(
Dialect::Db2,
"numeric_literal",
r#"(?>\d+\.\d+|\d+\.(?![\.\w])|\.\d+|\d+)(\.?[eE][+-]?\d+)?((?<=\.)|(?=\b))"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::literal_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|input| input.starts_with(['x','X','.','0','1','2','3','4','5','6','7','8','9']),
None,
),
LexMatcher::regex_lexer(
Dialect::Db2,
"obevo_annotation",
r#"////\s*(CHANGE|BODY|METADATA)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"glob_operator",
"~~~",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Db2,
"like_operator",
r#"!?~~?\*?"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Db2,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"casting_operator",
"::",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"right_arrow",
"=>",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"equals",
"=",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"greater_than",
">",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"less_than",
"<",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"not",
"!",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"dot",
".",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"comma",
",",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"plus",
"+",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"minus",
"-",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"divide",
"/",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"percent",
"%",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"question",
"?",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"ampersand",
"&",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"vertical_bar",
"|",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"caret",
"^",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"star",
"*",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"start_bracket",
"(",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"end_bracket",
")",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"start_square_bracket",
"[",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"end_square_bracket",
"]",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"start_curly_bracket",
"{",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"end_curly_bracket",
"}",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"colon",
":",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Db2,
"semicolon",
";",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Db2,
"word",
r#"[0-9a-zA-Z_#]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::word_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
]});
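A hypothetical usage sketch (not part of the generated file): a caller would typically select a matcher table by dialect before lexing. The helper name and the empty-slice fallback below are assumptions for illustration; the real crate may expose this dispatch differently.

fn db2_matchers(dialect: Dialect) -> &'static [LexMatcher] {
    // DB2_LEXERS is the Lazy static defined above; dereferencing a static
    // Lazy yields a 'static slice of the generated matchers.
    match dialect {
        Dialect::Db2 => DB2_LEXERS.as_slice(),
        _ => &[],
    }
}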

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

View File

@@ -0,0 +1,996 @@
/* This is a generated file! */
use once_cell::sync::Lazy;
use crate::matcher::{LexMatcher, extract_nested_block_comment};
use crate::token::Token;
use crate::token::config::TokenConfig;
use crate::regex::RegexModeGroup;
use crate::dialect::Dialect;
use hashbrown::HashSet;
pub static FLINK_KEYWORDS: Lazy<Vec<String>> = Lazy::new(|| { vec![
"ALL".to_string(),
"AND".to_string(),
"ANY".to_string(),
"AS".to_string(),
"AUTHORIZATION".to_string(),
"BETWEEN".to_string(),
"BIGINT".to_string(),
"BINARY".to_string(),
"BOOLEAN".to_string(),
"BOTH".to_string(),
"BY".to_string(),
"CASE".to_string(),
"CAST".to_string(),
"CHAR".to_string(),
"CHARACTER".to_string(),
"CHECK".to_string(),
"COLLATE".to_string(),
"COLUMN".to_string(),
"CONSTRAINT".to_string(),
"CREATE".to_string(),
"CROSS".to_string(),
"CURRENT_DATE".to_string(),
"CURRENT_TIME".to_string(),
"CURRENT_TIMESTAMP".to_string(),
"CURRENT_USER".to_string(),
"CURSOR".to_string(),
"DATE".to_string(),
"DAY".to_string(),
"DECIMAL".to_string(),
"DECLARE".to_string(),
"DELETE".to_string(),
"DESC".to_string(),
"DISTINCT".to_string(),
"DOUBLE".to_string(),
"DROP".to_string(),
"ELSE".to_string(),
"END".to_string(),
"ESCAPE".to_string(),
"EXCEPT".to_string(),
"EXISTS".to_string(),
"EXTRACT".to_string(),
"FALSE".to_string(),
"FETCH".to_string(),
"FILTER".to_string(),
"FLOAT".to_string(),
"FOR".to_string(),
"FOREIGN".to_string(),
"FROM".to_string(),
"FULL".to_string(),
"FUNCTION".to_string(),
"GRANT".to_string(),
"GROUP".to_string(),
"HAVING".to_string(),
"HOUR".to_string(),
"IF".to_string(),
"IGNORE".to_string(),
"IN".to_string(),
"INNER".to_string(),
"INSERT".to_string(),
"INT".to_string(),
"INTEGER".to_string(),
"INTERSECT".to_string(),
"INTERVAL".to_string(),
"INTO".to_string(),
"IS".to_string(),
"JOIN".to_string(),
"LEADING".to_string(),
"LEFT".to_string(),
"LIKE".to_string(),
"LIMIT".to_string(),
"LOCAL".to_string(),
"MINUTE".to_string(),
"MONTH".to_string(),
"NATURAL".to_string(),
"NOT".to_string(),
"NULL".to_string(),
"NUMERIC".to_string(),
"OF".to_string(),
"ON".to_string(),
"ONLY".to_string(),
"OR".to_string(),
"ORDER".to_string(),
"OUTER".to_string(),
"OVERLAPS".to_string(),
"OVERLAY".to_string(),
"PARTITION".to_string(),
"POSITION".to_string(),
"PRIMARY".to_string(),
"REAL".to_string(),
"REFERENCES".to_string(),
"RESPECT".to_string(),
"RIGHT".to_string(),
"ROW".to_string(),
"ROWS".to_string(),
"SECOND".to_string(),
"SELECT".to_string(),
"SESSION_USER".to_string(),
"SET".to_string(),
"SMALLINT".to_string(),
"SOME".to_string(),
"SUBSTRING".to_string(),
"TABLE".to_string(),
"THEN".to_string(),
"TIME".to_string(),
"TIMESTAMP".to_string(),
"TINYINT".to_string(),
"TO".to_string(),
"TRAILING".to_string(),
"TRUE".to_string(),
"UNION".to_string(),
"UNIQUE".to_string(),
"UNKNOWN".to_string(),
"UPDATE".to_string(),
"USER".to_string(),
"USING".to_string(),
"VALUES".to_string(),
"VARCHAR".to_string(),
"WHEN".to_string(),
"WHERE".to_string(),
"WITH".to_string(),
"YEAR".to_string(),
]});
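// Hypothetical sketch (not part of the generated file): the keyword list above
// is usually consulted as a set. `Lazy` and `hashbrown::HashSet` are already
// imported at the top of this file; the helper name and the uppercase
// normalisation are assumptions for illustration only.
fn is_flink_keyword(word: &str) -> bool {
    // Build the lookup set once, on first use.
    static KEYWORD_SET: Lazy<HashSet<String>> = Lazy::new(|| {
        FLINK_KEYWORDS.iter().map(|kw| kw.to_uppercase()).collect()
    });
    KEYWORD_SET.contains(&word.to_uppercase())
}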
pub static FLINK_LEXERS: Lazy<Vec<LexMatcher>> = Lazy::new(|| { vec![
LexMatcher::regex_lexer(
Dialect::Flink,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Flink,
"inline_comment",
r#"(--)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
Some(vec![String::from("-"), String::from("-")]),
None,
None,
None,
None,
None,
|input| input.starts_with(['#','-','/']),
None,
),
LexMatcher::regex_lexer(
Dialect::Flink,
"block_comment",
r#"\/\*([^\*]|\*(?!\/))*\*\/"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Flink,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Flink,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
None,
None,
None,
None,
None,
Some(extract_nested_block_comment),
|input| input.starts_with("/"),
None,
),
LexMatcher::regex_lexer(
Dialect::Flink,
"single_quote",
r#"'([^'\\]|\\.)*'"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"'((?:[^'\\]|\\.)*)'"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"''"#.to_string(), r#"'"#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'\'', ..] => true, // Single quote case
[b'R' | b'r', b'\'', ..] => true, // r' or R'
[b'B' | b'b', b'\'', ..] => true, // b' or B'
[b'R' | b'r', b'B' | b'b', b'\'', ..] => true, // rb', RB', etc.
[b'B' | b'b', b'R' | b'r', b'\'', ..] => true, // br', Br', etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Flink,
"double_quote",
r#""(""|[^"\\]|\\.)*""#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#""((?:[^"\\]|\\.)*)""#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\"|"""#.to_string(), r#"""#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'"', ..] => true, // Just a double quote
[b'R' | b'r', b'"', ..] => true, // r" or R"
[b'B' | b'b', b'"', ..] => true, // b" or B"
[b'R' | b'r', b'B' | b'b', b'"', ..] => true, // rb", RB", etc.
[b'B' | b'b', b'R' | b'r', b'"', ..] => true, // br", Br", etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Flink,
"back_quote",
r#"`([^`]|``)*`"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"`((?:[^`]|``)*)`"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"``"#.to_string(), r#"`"#.to_string())),
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Flink,
"dollar_quote",
r#"\$(\w*)\$(.*?)\$\1\$"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"\$(\w*)\$(.*?)\$\1\$"#.to_string(), RegexModeGroup::Index(2))),
None,
None,
None,
|input| input.starts_with("$"),
None,
),
LexMatcher::regex_lexer(
Dialect::Flink,
"numeric_literal",
r#"(?>(?>\d+\.\d+|\d+\.|\.\d+)([eE][+-]?\d+)?([dDfF]|BD|bd)?|\d+[eE][+-]?\d+([dDfF]|BD|bd)?|\d+([dDfFlLsSyY]|BD|bd)?)((?<=\.)|(?=\b))"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|input| input.starts_with(['x','X','.','0','1','2','3','4','5','6','7','8','9']),
None,
),
LexMatcher::regex_lexer(
Dialect::Flink,
"obevo_annotation",
r#"////\s*(CHANGE|BODY|METADATA)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"glob_operator",
"~~~",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Flink,
"like_operator",
r#"!?~~?\*?"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Flink,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"casting_operator",
"::",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Flink,
"equals",
r#"==|="#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"greater_than",
">",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"less_than",
"<",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"not",
"!",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"dot",
".",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"comma",
",",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"plus",
"+",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"minus",
"-",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"divide",
"/",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"percent",
"%",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"question",
"?",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"ampersand",
"&",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"vertical_bar",
"|",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"caret",
"^",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"star",
"*",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"start_bracket",
"(",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"end_bracket",
")",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"start_square_bracket",
"[",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"end_square_bracket",
"]",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"start_curly_bracket",
"{",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"end_curly_bracket",
"}",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"colon",
":",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Flink,
"semicolon",
";",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Flink,
"word",
r#"[0-9a-zA-Z_]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::word_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
]});
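(Editor's note: illustration only, not part of the generated file.) The matchers in the table above are tried in order, which is why the more specific entries come first and the catch-all "word" matcher comes last; the same convention elsewhere in this commit lists multi-character literals such as "::" ahead of their single-character prefixes like ":". A minimal first-match-wins sketch of that idea, using plain string literals rather than the crate's LexMatcher API:

fn main() {
    // Ordered (name, literal) pairs: the longer "::" must be tried before ":".
    let matchers = [("casting_operator", "::"), ("colon", ":"), ("dot", ".")];
    let input = "::INT";
    let hit = matchers.iter().find(|(_, lit)| input.starts_with(*lit));
    assert_eq!(hit.map(|(name, _)| *name), Some("casting_operator"));
}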

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

View File

@@ -0,0 +1,200 @@
/* This is a generated file! */
/* dialect mods */
pub mod ansi;
use crate::dialect::ansi::matcher::{ANSI_KEYWORDS, ANSI_LEXERS};
pub mod athena;
use crate::dialect::athena::matcher::{ATHENA_KEYWORDS, ATHENA_LEXERS};
pub mod bigquery;
use crate::dialect::bigquery::matcher::{BIGQUERY_KEYWORDS, BIGQUERY_LEXERS};
pub mod clickhouse;
use crate::dialect::clickhouse::matcher::{CLICKHOUSE_KEYWORDS, CLICKHOUSE_LEXERS};
pub mod databricks;
use crate::dialect::databricks::matcher::{DATABRICKS_KEYWORDS, DATABRICKS_LEXERS};
pub mod db2;
use crate::dialect::db2::matcher::{DB2_KEYWORDS, DB2_LEXERS};
pub mod doris;
use crate::dialect::doris::matcher::{DORIS_KEYWORDS, DORIS_LEXERS};
pub mod duckdb;
use crate::dialect::duckdb::matcher::{DUCKDB_KEYWORDS, DUCKDB_LEXERS};
pub mod exasol;
use crate::dialect::exasol::matcher::{EXASOL_KEYWORDS, EXASOL_LEXERS};
pub mod flink;
use crate::dialect::flink::matcher::{FLINK_KEYWORDS, FLINK_LEXERS};
pub mod greenplum;
use crate::dialect::greenplum::matcher::{GREENPLUM_KEYWORDS, GREENPLUM_LEXERS};
pub mod hive;
use crate::dialect::hive::matcher::{HIVE_KEYWORDS, HIVE_LEXERS};
pub mod impala;
use crate::dialect::impala::matcher::{IMPALA_KEYWORDS, IMPALA_LEXERS};
pub mod mariadb;
use crate::dialect::mariadb::matcher::{MARIADB_KEYWORDS, MARIADB_LEXERS};
pub mod materialize;
use crate::dialect::materialize::matcher::{MATERIALIZE_KEYWORDS, MATERIALIZE_LEXERS};
pub mod mysql;
use crate::dialect::mysql::matcher::{MYSQL_KEYWORDS, MYSQL_LEXERS};
pub mod oracle;
use crate::dialect::oracle::matcher::{ORACLE_KEYWORDS, ORACLE_LEXERS};
pub mod postgres;
use crate::dialect::postgres::matcher::{POSTGRES_KEYWORDS, POSTGRES_LEXERS};
pub mod redshift;
use crate::dialect::redshift::matcher::{REDSHIFT_KEYWORDS, REDSHIFT_LEXERS};
pub mod snowflake;
use crate::dialect::snowflake::matcher::{SNOWFLAKE_KEYWORDS, SNOWFLAKE_LEXERS};
pub mod soql;
use crate::dialect::soql::matcher::{SOQL_KEYWORDS, SOQL_LEXERS};
pub mod sparksql;
use crate::dialect::sparksql::matcher::{SPARKSQL_KEYWORDS, SPARKSQL_LEXERS};
pub mod sqlite;
use crate::dialect::sqlite::matcher::{SQLITE_KEYWORDS, SQLITE_LEXERS};
pub mod starrocks;
use crate::dialect::starrocks::matcher::{STARROCKS_KEYWORDS, STARROCKS_LEXERS};
pub mod teradata;
use crate::dialect::teradata::matcher::{TERADATA_KEYWORDS, TERADATA_LEXERS};
pub mod trino;
use crate::dialect::trino::matcher::{TRINO_KEYWORDS, TRINO_LEXERS};
pub mod tsql;
use crate::dialect::tsql::matcher::{TSQL_KEYWORDS, TSQL_LEXERS};
pub mod vertica;
use crate::dialect::vertica::matcher::{VERTICA_KEYWORDS, VERTICA_LEXERS};
use crate::matcher::LexMatcher;
use std::str::FromStr;
#[derive(Debug, Eq, PartialEq, Hash, Copy, Clone)]
pub enum Dialect {
Ansi,
Athena,
Bigquery,
Clickhouse,
Databricks,
Db2,
Doris,
Duckdb,
Exasol,
Flink,
Greenplum,
Hive,
Impala,
Mariadb,
Materialize,
Mysql,
Oracle,
Postgres,
Redshift,
Snowflake,
Soql,
Sparksql,
Sqlite,
Starrocks,
Teradata,
Trino,
Tsql,
Vertica,
}
impl Dialect {
pub(crate) fn get_reserved_keywords(&self) -> &'static Vec<String> {
match self {
Dialect::Ansi => &ANSI_KEYWORDS,
Dialect::Athena => &ATHENA_KEYWORDS,
Dialect::Bigquery => &BIGQUERY_KEYWORDS,
Dialect::Clickhouse => &CLICKHOUSE_KEYWORDS,
Dialect::Databricks => &DATABRICKS_KEYWORDS,
Dialect::Db2 => &DB2_KEYWORDS,
Dialect::Doris => &DORIS_KEYWORDS,
Dialect::Duckdb => &DUCKDB_KEYWORDS,
Dialect::Exasol => &EXASOL_KEYWORDS,
Dialect::Flink => &FLINK_KEYWORDS,
Dialect::Greenplum => &GREENPLUM_KEYWORDS,
Dialect::Hive => &HIVE_KEYWORDS,
Dialect::Impala => &IMPALA_KEYWORDS,
Dialect::Mariadb => &MARIADB_KEYWORDS,
Dialect::Materialize => &MATERIALIZE_KEYWORDS,
Dialect::Mysql => &MYSQL_KEYWORDS,
Dialect::Oracle => &ORACLE_KEYWORDS,
Dialect::Postgres => &POSTGRES_KEYWORDS,
Dialect::Redshift => &REDSHIFT_KEYWORDS,
Dialect::Snowflake => &SNOWFLAKE_KEYWORDS,
Dialect::Soql => &SOQL_KEYWORDS,
Dialect::Sparksql => &SPARKSQL_KEYWORDS,
Dialect::Sqlite => &SQLITE_KEYWORDS,
Dialect::Starrocks => &STARROCKS_KEYWORDS,
Dialect::Teradata => &TERADATA_KEYWORDS,
Dialect::Trino => &TRINO_KEYWORDS,
Dialect::Tsql => &TSQL_KEYWORDS,
Dialect::Vertica => &VERTICA_KEYWORDS,
}
}
pub fn get_lexers(&self) -> &'static Vec<LexMatcher> {
match self {
Dialect::Ansi => &ANSI_LEXERS,
Dialect::Athena => &ATHENA_LEXERS,
Dialect::Bigquery => &BIGQUERY_LEXERS,
Dialect::Clickhouse => &CLICKHOUSE_LEXERS,
Dialect::Databricks => &DATABRICKS_LEXERS,
Dialect::Db2 => &DB2_LEXERS,
Dialect::Doris => &DORIS_LEXERS,
Dialect::Duckdb => &DUCKDB_LEXERS,
Dialect::Exasol => &EXASOL_LEXERS,
Dialect::Flink => &FLINK_LEXERS,
Dialect::Greenplum => &GREENPLUM_LEXERS,
Dialect::Hive => &HIVE_LEXERS,
Dialect::Impala => &IMPALA_LEXERS,
Dialect::Mariadb => &MARIADB_LEXERS,
Dialect::Materialize => &MATERIALIZE_LEXERS,
Dialect::Mysql => &MYSQL_LEXERS,
Dialect::Oracle => &ORACLE_LEXERS,
Dialect::Postgres => &POSTGRES_LEXERS,
Dialect::Redshift => &REDSHIFT_LEXERS,
Dialect::Snowflake => &SNOWFLAKE_LEXERS,
Dialect::Soql => &SOQL_LEXERS,
Dialect::Sparksql => &SPARKSQL_LEXERS,
Dialect::Sqlite => &SQLITE_LEXERS,
Dialect::Starrocks => &STARROCKS_LEXERS,
Dialect::Teradata => &TERADATA_LEXERS,
Dialect::Trino => &TRINO_LEXERS,
Dialect::Tsql => &TSQL_LEXERS,
Dialect::Vertica => &VERTICA_LEXERS,
}
}
}
impl FromStr for Dialect {
type Err = ();
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"ansi" => Ok(Dialect::Ansi),
"athena" => Ok(Dialect::Athena),
"bigquery" => Ok(Dialect::Bigquery),
"clickhouse" => Ok(Dialect::Clickhouse),
"databricks" => Ok(Dialect::Databricks),
"db2" => Ok(Dialect::Db2),
"doris" => Ok(Dialect::Doris),
"duckdb" => Ok(Dialect::Duckdb),
"exasol" => Ok(Dialect::Exasol),
"flink" => Ok(Dialect::Flink),
"greenplum" => Ok(Dialect::Greenplum),
"hive" => Ok(Dialect::Hive),
"impala" => Ok(Dialect::Impala),
"mariadb" => Ok(Dialect::Mariadb),
"materialize" => Ok(Dialect::Materialize),
"mysql" => Ok(Dialect::Mysql),
"oracle" => Ok(Dialect::Oracle),
"postgres" => Ok(Dialect::Postgres),
"redshift" => Ok(Dialect::Redshift),
"snowflake" => Ok(Dialect::Snowflake),
"soql" => Ok(Dialect::Soql),
"sparksql" => Ok(Dialect::Sparksql),
"sqlite" => Ok(Dialect::Sqlite),
"starrocks" => Ok(Dialect::Starrocks),
"teradata" => Ok(Dialect::Teradata),
"trino" => Ok(Dialect::Trino),
"tsql" => Ok(Dialect::Tsql),
"vertica" => Ok(Dialect::Vertica),
_ => Err(()),
}
}
}
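(Editor's note: illustration only, not part of the generated file.) The registry above is the lookup point from a dialect label to its generated matcher table and keyword list. A minimal usage sketch relying only on items defined in this module (FromStr is already imported above):

fn lexer_count(name: &str) -> Option<usize> {
    // "flink", "soql", etc. are the labels accepted by Dialect::from_str.
    let dialect = Dialect::from_str(name).ok()?;
    // get_lexers returns the dialect's ordered table of LexMatchers.
    Some(dialect.get_lexers().len())
}
// e.g. lexer_count("flink") returns the size of the Flink table,
// while lexer_count("not-a-dialect") returns None.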

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

View File

@@ -0,0 +1,978 @@
/* This is a generated file! */
use once_cell::sync::Lazy;
use crate::matcher::{LexMatcher, extract_nested_block_comment};
use crate::token::Token;
use crate::token::config::TokenConfig;
use crate::regex::RegexModeGroup;
use crate::dialect::Dialect;
use hashbrown::HashSet;
pub static SOQL_KEYWORDS: Lazy<Vec<String>> = Lazy::new(|| { vec![
"CASE".to_string(),
"CROSS".to_string(),
"FULL".to_string(),
"IGNORE".to_string(),
"INNER".to_string(),
"INTERVAL".to_string(),
"JOIN".to_string(),
"LAST_90_DAYS".to_string(),
"LAST_FISCAL_QUARTER".to_string(),
"LAST_FISCAL_YEAR".to_string(),
"LAST_MONTH".to_string(),
"LAST_N_DAYS".to_string(),
"LAST_N_FISCAL_QUARTERS".to_string(),
"LAST_N_FISCAL_YEARS".to_string(),
"LAST_N_MONTHS".to_string(),
"LAST_N_QUARTERS".to_string(),
"LAST_N_WEEKS".to_string(),
"LAST_N_YEARS".to_string(),
"LAST_QUARTER".to_string(),
"LAST_WEEK".to_string(),
"LAST_YEAR".to_string(),
"LEFT".to_string(),
"NATURAL".to_string(),
"NEXT_90_DAYS".to_string(),
"NEXT_FISCAL_QUARTER".to_string(),
"NEXT_FISCAL_YEAR".to_string(),
"NEXT_MONTH".to_string(),
"NEXT_N_DAYS".to_string(),
"NEXT_N_FISCAL_QUARTERS".to_string(),
"NEXT_N_FISCAL_YEARS".to_string(),
"NEXT_N_MONTHS".to_string(),
"NEXT_N_QUARTERS".to_string(),
"NEXT_N_WEEKS".to_string(),
"NEXT_N_YEARS".to_string(),
"NEXT_QUARTER".to_string(),
"NEXT_WEEK".to_string(),
"NEXT_YEAR".to_string(),
"NOT".to_string(),
"NULL".to_string(),
"ON".to_string(),
"ORDER".to_string(),
"OUTER".to_string(),
"PARTITION".to_string(),
"RESPECT".to_string(),
"RIGHT".to_string(),
"ROWS".to_string(),
"SELECT".to_string(),
"SET".to_string(),
"THIS_FISCAL_QUARTER".to_string(),
"THIS_FISCAL_YEAR".to_string(),
"THIS_MONTH".to_string(),
"THIS_QUARTER".to_string(),
"THIS_WEEK".to_string(),
"THIS_YEAR".to_string(),
"TODAY".to_string(),
"TOMORROW".to_string(),
"UNION".to_string(),
"USING".to_string(),
"YESTERDAY".to_string(),
]});
pub static SOQL_LEXERS: Lazy<Vec<LexMatcher>> = Lazy::new(|| { vec![
LexMatcher::regex_lexer(
Dialect::Soql,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"inline_comment",
r#"(--|#)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
Some(vec![String::from("--"), String::from("#")]),
None,
None,
None,
None,
None,
|input| input.starts_with(['#','-','/']),
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"block_comment",
r#"\/\*([^\*]|\*(?!\/))*\*\/"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Soql,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Soql,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
None,
None,
None,
None,
None,
Some(extract_nested_block_comment),
|input| input.starts_with("/"),
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"single_quote",
r#"'([^'\\]|\\.|'')*'"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"'((?:[^'\\]|\\.|'')*)'"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\'|''"#.to_string(), r#"'"#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'\'', ..] => true, // Single quote case
[b'R' | b'r', b'\'', ..] => true, // r' or R'
[b'B' | b'b', b'\'', ..] => true, // b' or B'
[b'R' | b'r', b'B' | b'b', b'\'', ..] => true, // rb', RB', etc.
[b'B' | b'b', b'R' | b'r', b'\'', ..] => true, // br', Br', etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"double_quote",
r#""(""|[^"\\]|\\.)*""#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#""((?:[^"\\]|\\.)*)""#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\"|"""#.to_string(), r#"""#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'"', ..] => true, // Just a double quote
[b'R' | b'r', b'"', ..] => true, // r" or R"
[b'B' | b'b', b'"', ..] => true, // b" or B"
[b'R' | b'r', b'B' | b'b', b'"', ..] => true, // rb", RB", etc.
[b'B' | b'b', b'R' | b'r', b'"', ..] => true, // br", Br", etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"back_quote",
r#"`(?:[^`\\]|\\.)*`"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"`((?:[^`\\]|\\.)*)`"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\`"#.to_string(), r#"`"#.to_string())),
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"dollar_quote",
r#"\$(\w*)\$(.*?)\$\1\$"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"\$(\w*)\$(.*?)\$\1\$"#.to_string(), RegexModeGroup::Index(2))),
None,
None,
None,
|input| input.starts_with("$"),
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"datetime_literal",
r#"[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(Z|(\+|\-)[0-9]{2}:[0-9]{2})"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"date_literal",
r#"[0-9]{4}-[0-9]{2}-[0-9]{2}"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"numeric_literal",
r#"(?>\d+\.\d+|\d+\.(?![\.\w])|\.\d+|\d+)(\.?[eE][+-]?\d+)?((?<=\.)|(?=\b))"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::literal_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|input| input.starts_with(['x','X','.','0','1','2','3','4','5','6','7','8','9']),
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"obevo_annotation",
r#"////\s*(CHANGE|BODY|METADATA)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"glob_operator",
"~~~",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"like_operator",
r#"!?~~?\*?"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"casting_operator",
"::",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"equals",
"=",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"greater_than",
">",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"less_than",
"<",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"not",
"!",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"dot",
".",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"comma",
",",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"plus",
"+",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"minus",
"-",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"divide",
"/",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"percent",
"%",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"question",
"?",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"ampersand",
"&",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"vertical_bar",
"|",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"caret",
"^",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"star",
"*",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"start_bracket",
"(",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"end_bracket",
")",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"start_square_bracket",
"[",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"end_square_bracket",
"]",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"start_curly_bracket",
"{",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"end_curly_bracket",
"}",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"colon",
":",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Soql,
"semicolon",
";",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Soql,
"word",
r#"[0-9a-zA-Z_]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::word_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
]});
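(Editor's note: illustration only, not part of the generated file.) The quoted_value / escape_replacement pairs on the quote matchers above appear to describe, first, a capture group isolating the text between the quotes and, second, a pattern and replacement that collapse the escaped quote forms. A rough sketch of that behaviour using the external regex crate (assumed here purely for illustration; the crate's own regex wrapper and RegexModeGroup handling may differ):

fn unquote_single(raw: &str) -> Option<String> {
    // Group 1 of the quoted_value pattern holds the text between the quotes.
    let value_re = regex::Regex::new(r"'((?:[^'\\]|\\.|'')*)'").ok()?;
    let inner = value_re.captures(raw)?.get(1)?.as_str();
    // The escape_replacement pair maps \' and '' back to a literal quote.
    let escape_re = regex::Regex::new(r"\\'|''").ok()?;
    Some(escape_re.replace_all(inner, "'").into_owned())
}
// e.g. unquote_single(r"'it''s'") yields Some(String::from("it's")).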

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

View File

@@ -0,0 +1,898 @@
/* This is a generated file! */
use once_cell::sync::Lazy;
use crate::matcher::{LexMatcher, extract_nested_block_comment};
use crate::token::Token;
use crate::token::config::TokenConfig;
use crate::regex::RegexModeGroup;
use crate::dialect::Dialect;
use hashbrown::HashSet;
pub static TERADATA_KEYWORDS: Lazy<Vec<String>> = Lazy::new(|| { vec![
"CASE".to_string(),
"CROSS".to_string(),
"FULL".to_string(),
"IGNORE".to_string(),
"INNER".to_string(),
"INTERVAL".to_string(),
"JOIN".to_string(),
"LEFT".to_string(),
"LOCKING".to_string(),
"NATURAL".to_string(),
"NOT".to_string(),
"NULL".to_string(),
"ON".to_string(),
"ORDER".to_string(),
"OUTER".to_string(),
"PARTITION".to_string(),
"REPLACE".to_string(),
"RESPECT".to_string(),
"RIGHT".to_string(),
"ROWS".to_string(),
"SELECT".to_string(),
"SET".to_string(),
"TIMESTAMP".to_string(),
"UNION".to_string(),
"USING".to_string(),
]});
pub static TERADATA_LEXERS: Lazy<Vec<LexMatcher>> = Lazy::new(|| { vec![
LexMatcher::regex_lexer(
Dialect::Teradata,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Teradata,
"inline_comment",
r#"(--|#)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
Some(vec![String::from("--"), String::from("#")]),
None,
None,
None,
None,
None,
|input| input.starts_with(['#','-','/']),
None,
),
LexMatcher::regex_lexer(
Dialect::Teradata,
"block_comment",
r#"\/\*([^\*]|\*(?!\/))*\*\/"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Teradata,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Teradata,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
None,
None,
None,
None,
None,
Some(extract_nested_block_comment),
|input| input.starts_with("/"),
None,
),
LexMatcher::regex_lexer(
Dialect::Teradata,
"single_quote",
r#"'([^'\\]|\\.|'')*'"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"'((?:[^'\\]|\\.|'')*)'"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\'|''"#.to_string(), r#"'"#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'\'', ..] => true, // Single quote case
[b'R' | b'r', b'\'', ..] => true, // r' or R'
[b'B' | b'b', b'\'', ..] => true, // b' or B'
[b'R' | b'r', b'B' | b'b', b'\'', ..] => true, // rb', RB', etc.
[b'B' | b'b', b'R' | b'r', b'\'', ..] => true, // br', Br', etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Teradata,
"double_quote",
r#""(""|[^"\\]|\\.)*""#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#""((?:[^"\\]|\\.)*)""#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\"|"""#.to_string(), r#"""#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'"', ..] => true, // Just a double quote
[b'R' | b'r', b'"', ..] => true, // r" or R"
[b'B' | b'b', b'"', ..] => true, // b" or B"
[b'R' | b'r', b'B' | b'b', b'"', ..] => true, // rb", RB", etc.
[b'B' | b'b', b'R' | b'r', b'"', ..] => true, // br", Br", etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Teradata,
"back_quote",
r#"`(?:[^`\\]|\\.)*`"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"`((?:[^`\\]|\\.)*)`"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\`"#.to_string(), r#"`"#.to_string())),
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Teradata,
"dollar_quote",
r#"\$(\w*)\$(.*?)\$\1\$"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"\$(\w*)\$(.*?)\$\1\$"#.to_string(), RegexModeGroup::Index(2))),
None,
None,
None,
|input| input.starts_with("$"),
None,
),
LexMatcher::regex_lexer(
Dialect::Teradata,
"numeric_literal",
r#"([0-9]+(\.[0-9]*)?)"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|input| input.starts_with(['x','X','.','0','1','2','3','4','5','6','7','8','9']),
None,
),
LexMatcher::regex_lexer(
Dialect::Teradata,
"obevo_annotation",
r#"////\s*(CHANGE|BODY|METADATA)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"glob_operator",
"~~~",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Teradata,
"like_operator",
r#"!?~~?\*?"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Teradata,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"casting_operator",
"::",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"equals",
"=",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"greater_than",
">",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"less_than",
"<",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"not",
"!",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"dot",
".",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"comma",
",",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"plus",
"+",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"minus",
"-",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"divide",
"/",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"percent",
"%",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"question",
"?",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"ampersand",
"&",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"vertical_bar",
"|",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"caret",
"^",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"star",
"*",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"start_bracket",
"(",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"end_bracket",
")",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"start_square_bracket",
"[",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"end_square_bracket",
"]",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"start_curly_bracket",
"{",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"end_curly_bracket",
"}",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"colon",
":",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Teradata,
"semicolon",
";",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Teradata,
"word",
r#"[0-9a-zA-Z_]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::word_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
]});
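(Editor's note: illustration only, not part of the generated file.) The closures such as |input| input.starts_with(['#','-','/']) attached to the regex matchers above act as cheap first-character guards, so the comparatively expensive regex only runs when the remaining input could plausibly match. The same pattern in isolation:

fn main() {
    // Guard mirroring the numeric_literal matcher above: only run the full
    // regex when the input starts with a digit, a dot, or x/X.
    let could_be_number = |input: &str| {
        input.starts_with(['x', 'X', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])
    };
    assert!(could_be_number("42 * price"));
    assert!(!could_be_number("SELECT 1"));
}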

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

View File

@@ -0,0 +1,998 @@
/* This is a generated file! */
use once_cell::sync::Lazy;
use crate::matcher::{LexMatcher, extract_nested_block_comment};
use crate::token::Token;
use crate::token::config::TokenConfig;
use crate::regex::RegexModeGroup;
use crate::dialect::Dialect;
use hashbrown::HashSet;
pub static TRINO_KEYWORDS: Lazy<Vec<String>> = Lazy::new(|| { vec![
"ALTER".to_string(),
"AND".to_string(),
"AS".to_string(),
"BETWEEN".to_string(),
"BY".to_string(),
"CASE".to_string(),
"CAST".to_string(),
"CONSTRAINT".to_string(),
"CREATE".to_string(),
"CROSS".to_string(),
"CUBE".to_string(),
"CURRENT_CATALOG".to_string(),
"CURRENT_DATE".to_string(),
"CURRENT_PATH".to_string(),
"CURRENT_ROLE".to_string(),
"CURRENT_SCHEMA".to_string(),
"CURRENT_TIME".to_string(),
"CURRENT_TIMESTAMP".to_string(),
"CURRENT_USER".to_string(),
"DEALLOCATE".to_string(),
"DELETE".to_string(),
"DESCRIBE".to_string(),
"DISTINCT".to_string(),
"DROP".to_string(),
"ELSE".to_string(),
"END".to_string(),
"ESCAPE".to_string(),
"EXCEPT".to_string(),
"EXECUTE".to_string(),
"EXISTS".to_string(),
"EXTRACT".to_string(),
"FALSE".to_string(),
"FOR".to_string(),
"FROM".to_string(),
"FULL".to_string(),
"FUNCTION".to_string(),
"GROUP".to_string(),
"GROUPING".to_string(),
"HAVING".to_string(),
"IN".to_string(),
"INNER".to_string(),
"INSERT".to_string(),
"INTERSECT".to_string(),
"INTO".to_string(),
"IS".to_string(),
"JOIN".to_string(),
"JSON_ARRAY".to_string(),
"JSON_EXISTS".to_string(),
"JSON_OBJECT".to_string(),
"JSON_QUERY".to_string(),
"JSON_TABLE".to_string(),
"JSON_VALUE".to_string(),
"LEFT".to_string(),
"LIKE".to_string(),
"LISTAGG".to_string(),
"LOCALTIME".to_string(),
"LOCALTIMESTAMP".to_string(),
"NATURAL".to_string(),
"NORMALIZE".to_string(),
"NOT".to_string(),
"NULL".to_string(),
"ON".to_string(),
"OR".to_string(),
"ORDER".to_string(),
"OUTER".to_string(),
"PREPARE".to_string(),
"RECURSIVE".to_string(),
"RIGHT".to_string(),
"ROLLUP".to_string(),
"SELECT".to_string(),
"SKIP".to_string(),
"TABLE".to_string(),
"THEN".to_string(),
"TRIM".to_string(),
"TRUE".to_string(),
"UESCAPE".to_string(),
"UNION".to_string(),
"UNNEST".to_string(),
"USING".to_string(),
"VALUES".to_string(),
"WHEN".to_string(),
"WHERE".to_string(),
"WITH".to_string(),
]});
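(Editor's note: illustration only, not part of the generated file.) The keyword vectors in these generated files store each dialect's reserved words in upper case, so a membership check on lexed identifiers presumably needs to normalise case first. A minimal sketch of such a check (the lookup strategy is an assumption, not the crate's actual code):

fn is_reserved(keywords: &[String], ident: &str) -> bool {
    // Keywords are stored upper-case; compare case-insensitively.
    let upper = ident.to_uppercase();
    keywords.iter().any(|kw| kw == &upper)
}
// e.g. is_reserved(&TRINO_KEYWORDS, "select") is true, while
// is_reserved(&TRINO_KEYWORDS, "customer_id") is false.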
pub static TRINO_LEXERS: Lazy<Vec<LexMatcher>> = Lazy::new(|| { vec![
LexMatcher::regex_lexer(
Dialect::Trino,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Trino,
"inline_comment",
r#"(--|#)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
Some(vec![String::from("--"), String::from("#")]),
None,
None,
None,
None,
None,
|input| input.starts_with(['#','-','/']),
None,
),
LexMatcher::regex_lexer(
Dialect::Trino,
"block_comment",
r#"\/\*([^\*]|\*(?!\/))*\*\/"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Trino,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
Some(Box::new(
LexMatcher::regex_subdivider(
Dialect::Trino,
"whitespace",
r#"[^\S\r\n]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::whitespace_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
None,
None,
None,
None,
None,
Some(extract_nested_block_comment),
|input| input.starts_with("/"),
None,
),
LexMatcher::regex_lexer(
Dialect::Trino,
"single_quote",
r#"'([^'\\]|\\.|'')*'"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"'((?:[^'\\]|\\.|'')*)'"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\'|''"#.to_string(), r#"'"#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'\'', ..] => true, // Single quote case
[b'R' | b'r', b'\'', ..] => true, // r' or R'
[b'B' | b'b', b'\'', ..] => true, // b' or B'
[b'R' | b'r', b'B' | b'b', b'\'', ..] => true, // rb', RB', etc.
[b'B' | b'b', b'R' | b'r', b'\'', ..] => true, // br', Br', etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Trino,
"double_quote",
r#""([^"]|"")*""#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#""((?:[^"]|"")*)""#.to_string(), RegexModeGroup::Index(1))),
Some((r#""""#.to_string(), r#"""#.to_string())),
None,
None,
|input| match input.as_bytes() {
[b'"', ..] => true, // Just a double quote
[b'R' | b'r', b'"', ..] => true, // r" or R"
[b'B' | b'b', b'"', ..] => true, // b" or B"
[b'R' | b'r', b'B' | b'b', b'"', ..] => true, // rb", RB", etc.
[b'B' | b'b', b'R' | b'r', b'"', ..] => true, // br", Br", etc.
_ => false,
},
None,
),
LexMatcher::regex_lexer(
Dialect::Trino,
"back_quote",
r#"`(?:[^`\\]|\\.)*`"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"`((?:[^`\\]|\\.)*)`"#.to_string(), RegexModeGroup::Index(1))),
Some((r#"\\`"#.to_string(), r#"`"#.to_string())),
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Trino,
"dollar_quote",
r#"\$(\w*)\$(.*?)\$\1\$"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
Some((r#"\$(\w*)\$(.*?)\$\1\$"#.to_string(), RegexModeGroup::Index(2))),
None,
None,
None,
|input| input.starts_with("$"),
None,
),
LexMatcher::regex_lexer(
Dialect::Trino,
"numeric_literal",
r#"(?>\d+\.\d+|\d+\.(?![\.\w])|\.\d+|\d+)(\.?[eE][+-]?\d+)?((?<=\.)|(?=\b))"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::literal_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|input| input.starts_with(['x','X','.','0','1','2','3','4','5','6','7','8','9']),
None,
),
LexMatcher::regex_lexer(
Dialect::Trino,
"obevo_annotation",
r#"////\s*(CHANGE|BODY|METADATA)[^\n]*"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comment_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"glob_operator",
"~~~",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"right_arrow",
"->",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"fat_right_arrow",
"=>",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Trino,
"like_operator",
r#"!?~~?\*?"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::comparison_operator_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::regex_lexer(
Dialect::Trino,
"newline",
r#"\r\n|\n"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::newline_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"casting_operator",
"::",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"equals",
"=",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"greater_than",
">",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"less_than",
"<",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"not",
"!",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"dot",
".",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"comma",
",",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"plus",
"+",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"minus",
"-",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"divide",
"/",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"percent",
"%",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"question",
"?",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"ampersand",
"&",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"vertical_bar",
"|",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"caret",
"^",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"star",
"*",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"start_bracket",
"(",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"end_bracket",
")",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"start_square_bracket",
"[",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"end_square_bracket",
"]",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"start_curly_bracket",
"{",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"end_curly_bracket",
"}",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"colon",
":",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::string_lexer(
Dialect::Trino,
"semicolon",
";",
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::code_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
),
LexMatcher::regex_lexer(
Dialect::Trino,
"word",
r#"[0-9a-zA-Z_]+"#,
|raw, pos_marker, class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold| {
Token::word_token(raw, pos_marker, TokenConfig {
class_types, instance_types, trim_start, trim_chars,
quoted_value, escape_replacement, casefold,
})
},
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
),
]});
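
The generated matcher lists such as the Trino one above are consumed in priority order: the lexer tries each LexMatcher against the remaining input, takes the first scan_match hit, and advances by the matched length. A minimal sketch of that driver loop follows (the function name and the silent fall-through on unmatched input are illustrative only; the actual loop presumably lives in lexer.rs, whose diff is suppressed below):

use sqlfluffrs::matcher::{LexMatcher, LexedElement};

// Hypothetical driver: walk the matcher list, take the first match each
// time, and advance the input by the matched byte length. Any unmatched
// remainder is handed back to the caller instead of raising, to keep the
// sketch short.
fn lex_raw<'a>(
    matchers: &'a [LexMatcher],
    mut input: &'a str,
) -> (Vec<LexedElement<'a>>, &'a str) {
    let mut elements = Vec::new();
    'outer: while !input.is_empty() {
        for matcher in matchers {
            if let Some((elems, consumed)) = matcher.scan_match(input) {
                elements.extend(elems);
                input = &input[consumed..];
                continue 'outer;
            }
        }
        break; // nothing matched: leave the remainder for error handling
    }
    (elements, input)
}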

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
/* This is a generated file! */
#[allow(clippy::needless_raw_string_hashes)]
pub mod matcher;
// pub mod parser;

1335
sqlfluffrs/src/lexer.rs Normal file

File diff suppressed because it is too large

14
sqlfluffrs/src/lib.rs Normal file
View File

@@ -0,0 +1,14 @@
pub mod config;
pub mod dialect;
pub mod lexer;
pub mod marker;
pub mod matcher;
#[cfg(feature = "python")]
pub mod python;
pub mod regex;
pub mod slice;
pub mod templater;
pub mod token;
// include!(concat!(env!("OUT_DIR"), "/dialect_matcher.rs"));
use crate::dialect::Dialect;

491
sqlfluffrs/src/marker.rs Normal file
View File

@@ -0,0 +1,491 @@
use hashbrown::HashMap;
use std::cmp::Ordering;
use std::fmt::Display;
use std::sync::Arc;
use crate::slice::Slice;
use crate::templater::templatefile::TemplatedFile;
#[derive(Debug, Clone)]
pub struct PositionMarker {
pub source_slice: Slice,
pub templated_slice: Slice,
pub templated_file: Arc<TemplatedFile>,
pub working_line_no: usize,
pub working_line_pos: usize,
}
impl PositionMarker {
#[must_use]
pub fn new(
source_slice: Slice,
templated_slice: Slice,
templated_file: &Arc<TemplatedFile>,
working_line_no: Option<usize>,
working_line_pos: Option<usize>,
) -> Self {
let (working_line_no, working_line_pos) = match (working_line_no, working_line_pos) {
(Some(working_line_no), Some(working_line_pos)) => (working_line_no, working_line_pos),
_ => templated_file.get_line_pos_of_char_pos(source_slice.start, false),
};
Self {
source_slice,
templated_slice,
templated_file: Arc::clone(templated_file),
working_line_no,
working_line_pos,
}
}
#[must_use]
pub fn working_loc(&self) -> (usize, usize) {
(self.working_line_no, self.working_line_pos)
}
#[must_use]
pub fn working_loc_after(&self, raw: &str) -> (usize, usize) {
// Infer next position based on the raw string
self.infer_next_position(raw, self.working_line_no, self.working_line_pos)
}
#[must_use]
pub fn infer_next_position(
&self,
raw: &str,
line_no: usize,
line_pos: usize,
) -> (usize, usize) {
// Infer the next position from the raw string: multi-line input moves
// the location to the end of its last line, while single-line input
// just advances the column by the string length.
let lines: Vec<&str> = raw.split('\n').collect();
if lines.len() > 1 {
let num_lines: usize = lines.len();
let last_line_len: usize = lines.last().unwrap().len();
(line_no + num_lines - 1, last_line_len + 1)
} else {
let first_line_len: usize = raw.len();
(line_no, line_pos + first_line_len)
}
}
#[must_use]
pub fn source_position(&self) -> (usize, usize) {
self.templated_file
.get_line_pos_of_char_pos(self.source_slice.start, true)
}
#[must_use]
pub fn templated_position(&self) -> (usize, usize) {
self.templated_file
.get_line_pos_of_char_pos(self.source_slice.start, false)
}
#[must_use]
pub fn line_no(&self) -> usize {
self.source_position().0
}
#[must_use]
pub fn line_pos(&self) -> usize {
self.source_position().1
}
#[must_use]
pub fn to_source_string(&self) -> String {
let (line, pos) = self.source_position();
format!("[L:{line:3}, P:{pos:3}]")
}
#[must_use]
pub fn start_point_marker(&self) -> Self {
PositionMarker::from_point(
self.source_slice.start,
self.templated_slice.start,
&self.templated_file,
Some(self.working_line_no),
Some(self.working_line_pos),
)
}
#[must_use]
pub fn end_point_marker(&self) -> Self {
PositionMarker::from_point(
self.source_slice.stop,
self.templated_slice.stop,
&self.templated_file,
None,
None,
)
}
#[must_use]
pub fn is_point(&self) -> bool {
slice_is_point(&self.source_slice) && slice_is_point(&self.templated_slice)
}
#[must_use]
pub fn with_working_position(&self, line_no: usize, line_pos: usize) -> Self {
PositionMarker {
working_line_no: line_no,
working_line_pos: line_pos,
..self.clone()
}
}
#[must_use]
pub fn is_literal(&self) -> bool {
self.templated_file
.is_source_slice_literal(&self.source_slice)
}
#[must_use]
pub fn source_str(&self) -> String {
self.templated_file
.source_str
.chars()
.skip(self.source_slice.start)
.take(self.source_slice.len())
.collect::<String>()
}
#[must_use]
pub fn to_source_dict(&self) -> HashMap<String, usize> {
self.templated_file
.source_position_dict_from_slice(&self.source_slice)
}
#[must_use]
pub fn from_point(
source_point: usize,
templated_point: usize,
templated_file: &Arc<TemplatedFile>,
working_line_no: Option<usize>,
working_line_pos: Option<usize>,
) -> Self {
let source_slice = Slice::from(source_point..source_point);
let templated_slice = Slice::from(templated_point..templated_point);
PositionMarker::new(
source_slice,
templated_slice,
templated_file,
working_line_no,
working_line_pos,
)
}
#[must_use]
pub fn from_points(start_marker: &PositionMarker, end_marker: &PositionMarker) -> Self {
if start_marker.templated_file != end_marker.templated_file {
panic!("Markers must refer to the same templated file.");
}
PositionMarker::new(
start_marker.source_slice,
start_marker.templated_slice,
&start_marker.templated_file,
Some(start_marker.working_line_no),
Some(start_marker.working_line_pos),
)
}
#[must_use]
pub fn from_child_markers(markers: &[Option<PositionMarker>]) -> Self {
let mut source_start = usize::MAX;
let mut source_stop = usize::MIN;
let mut templated_start = usize::MAX;
let mut templated_stop = usize::MIN;
let mut templated_file = None;
for marker in markers.iter().filter_map(|m| m.as_ref()) {
source_start = source_start.min(marker.source_slice.start);
source_stop = source_stop.max(marker.source_slice.stop);
templated_start = templated_start.min(marker.templated_slice.start);
templated_stop = templated_stop.max(marker.templated_slice.stop);
if templated_file.is_none() {
templated_file = Some(marker.templated_file.clone());
}
if templated_file.as_ref() != Some(&marker.templated_file) {
panic!("Markers must refer to the same templated file.");
}
}
let source_slice = Slice::from(source_start..source_stop);
let templated_slice = Slice::from(templated_start..templated_stop);
PositionMarker::new(
source_slice,
templated_slice,
&templated_file.unwrap(),
None,
None,
)
}
}
impl Eq for PositionMarker {}
impl PartialEq for PositionMarker {
fn eq(&self, other: &Self) -> bool {
self.working_loc() == other.working_loc()
}
}
impl PartialOrd for PositionMarker {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for PositionMarker {
fn cmp(&self, other: &Self) -> Ordering {
self.working_loc().cmp(&other.working_loc())
}
}
impl Display for PositionMarker {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.to_source_string())
}
}
#[must_use]
pub fn slice_is_point(test_slice: &Slice) -> bool {
test_slice.start == test_slice.stop
}
#[cfg(feature = "python")]
pub mod python {
use std::{fmt::Display, sync::Arc};
use hashbrown::HashMap;
use pyo3::{prelude::*, types::PyType};
use crate::{
slice::Slice,
templater::templatefile::{
python::{PySqlFluffTemplatedFile, PyTemplatedFile},
TemplatedFile,
},
};
use super::PositionMarker;
#[pyclass(name = "RsPositionMarker", str, eq, ord, frozen, module = "sqlfluffrs")]
#[repr(transparent)]
#[derive(Debug, Clone)]
pub struct PyPositionMarker(pub PositionMarker);
#[pymethods]
impl PyPositionMarker {
#[getter]
pub fn source_slice(&self) -> Slice {
self.0.source_slice
}
#[getter]
pub fn templated_slice(&self) -> Slice {
self.0.templated_slice
}
// #[getter]
// pub fn templated_file(&self) -> PySqlFluffTemplatedFile {
// dbg!("templated file from PositionMarker");
// PySqlFluffTemplatedFile(PyTemplatedFile::from(self.0.templated_file.clone()))
// }
#[getter]
pub fn templated_file(&self) -> PyTemplatedFile {
PyTemplatedFile(self.0.templated_file.clone())
}
#[getter]
pub fn working_line_no(&self) -> usize {
self.0.working_line_no
}
#[getter]
pub fn working_line_pos(&self) -> usize {
self.0.working_line_pos
}
#[getter]
pub fn working_loc(&self) -> (usize, usize) {
(self.0.working_line_no, self.0.working_line_pos)
}
pub fn start_point_marker(&self) -> Self {
Self(self.0.start_point_marker())
}
pub fn end_point_marker(&self) -> Self {
Self(self.0.end_point_marker())
}
pub fn source_position(&self) -> (usize, usize) {
self.0.source_position()
}
pub fn templated_position(&self) -> (usize, usize) {
self.0.templated_position()
}
pub fn is_literal(&self) -> bool {
self.0.is_literal()
}
pub fn with_working_position(&self, line_no: usize, line_pos: usize) -> Self {
Self(self.0.with_working_position(line_no, line_pos))
}
pub fn infer_next_position(
&self,
raw: &str,
line_no: usize,
line_pos: usize,
) -> (usize, usize) {
self.0.infer_next_position(raw, line_no, line_pos)
}
pub fn line_no(&self) -> usize {
self.0.line_no()
}
pub fn line_pos(&self) -> usize {
self.0.line_pos()
}
pub fn source_str(&self) -> String {
self.0.source_str()
}
pub fn to_source_dict(&self) -> HashMap<String, usize> {
self.0.to_source_dict()
}
#[classmethod]
#[pyo3(signature = (markers))]
pub fn from_child_markers(
_cls: &Bound<'_, PyType>,
markers: Vec<Option<PyPositionMarker>>,
) -> PyResult<Self> {
let rust_markers: Vec<Option<PositionMarker>> =
markers.into_iter().map(|m| m.map(Into::into)).collect();
Ok(Self(PositionMarker::from_child_markers(&rust_markers)))
}
#[classmethod]
pub fn from_point(
_cls: &Bound<'_, PyType>,
source_point: usize,
templated_point: usize,
templated_file: PySqlFluffTemplatedFile,
working_line_no: Option<usize>,
working_line_pos: Option<usize>,
) -> Self {
let templated_file = templated_file.0 .0;
Self(PositionMarker::from_point(
source_point,
templated_point,
&templated_file,
working_line_no,
working_line_pos,
))
}
#[classmethod]
pub fn from_points(
_cls: &Bound<'_, PyType>,
start_marker: &PyPositionMarker,
end_marker: &PyPositionMarker,
) -> Self {
Self(PositionMarker::from_points(&start_marker.0, &end_marker.0))
}
pub fn is_point(&self) -> bool {
self.0.is_point()
}
pub fn to_source_string(&self) -> String {
self.0.to_source_string()
}
}
impl Display for PyPositionMarker {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0.to_source_string())
}
}
impl From<PyPositionMarker> for PySqlFluffTemplatedFile {
fn from(value: PyPositionMarker) -> Self {
PySqlFluffTemplatedFile(PyTemplatedFile::from(value.0.templated_file.clone()))
}
}
impl From<PyPositionMarker> for PositionMarker {
fn from(value: PyPositionMarker) -> Self {
value.0
}
}
impl From<PositionMarker> for PyPositionMarker {
fn from(value: PositionMarker) -> Self {
Self(value)
}
}
impl PartialEq for PyPositionMarker {
fn eq(&self, other: &Self) -> bool {
self.0.eq(&other.0)
}
}
impl PartialOrd for PyPositionMarker {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
self.0.partial_cmp(&other.0)
}
}
#[derive(Clone, IntoPyObject, Debug)]
pub struct PySqlFluffPositionMarker(pub PyPositionMarker);
impl<'py> FromPyObject<'py> for PySqlFluffPositionMarker {
fn extract_bound(obj: &pyo3::Bound<'py, pyo3::PyAny>) -> PyResult<Self> {
let source_slice = obj.getattr("source_slice")?.extract::<Slice>()?;
// dbg!(source_slice);
let templated_slice = obj.getattr("templated_slice")?.extract::<Slice>()?;
// dbg!(templated_slice);
let templated_file: Arc<TemplatedFile> = obj
.getattr("templated_file")?
.extract::<PySqlFluffTemplatedFile>()?
.into();
// dbg!(templated_file.clone());
// let working_line_no = obj.getattr("working_line_no")?.extract::<usize>()?;
// let working_line_pos = obj.getattr("working_line_pos")?.extract::<usize>()?;
Ok(Self(PyPositionMarker(PositionMarker::new(
source_slice,
templated_slice,
&templated_file,
None,
None,
))))
}
}
impl From<PySqlFluffPositionMarker> for PyPositionMarker {
fn from(value: PySqlFluffPositionMarker) -> Self {
value.0
}
}
impl From<PySqlFluffPositionMarker> for PositionMarker {
fn from(value: PySqlFluffPositionMarker) -> Self {
value.0 .0
}
}
}
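
As a quick worked example of the position arithmetic above (a sketch, not part of the commit): consuming a multi-line raw string moves the working location to the last line, while a single-line string only advances the column.

use sqlfluffrs::marker::PositionMarker;

// Assumes an existing marker; only infer_next_position is exercised here.
fn demo_infer_next_position(marker: &PositionMarker) {
    // "foo\nbar" spans two lines, so from (line 3, pos 5) we land on
    // line 4, position 4 (three chars on the new line, 1-indexed).
    assert_eq!(marker.infer_next_position("foo\nbar", 3, 5), (4, 4));
    // A single-line string just advances the column by its length.
    assert_eq!(marker.infer_next_position("baz", 3, 5), (3, 8));
}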

482
sqlfluffrs/src/matcher.rs Normal file
View File

@@ -0,0 +1,482 @@
use std::fmt::Display;
use fancy_regex::{Regex as FancyRegex, RegexBuilder as FancyRegexBuilder};
use hashbrown::HashSet;
use regex::{Regex, RegexBuilder};
use crate::{
dialect::Dialect,
marker::PositionMarker,
regex::RegexModeGroup,
token::Token,
};
/// Legacy function pointer type for token generation (maintains backward compatibility)
/// This signature accepts individual parameters and constructs a TokenConfig internally
pub type TokenGenerator = fn(
String, // raw
PositionMarker, // pos_marker
HashSet<String>, // class_types
Vec<String>, // instance_types
Option<Vec<String>>, // trim_start
Option<Vec<String>>, // trim_chars
Option<(String, RegexModeGroup)>, // quoted_value
Option<(String, String)>, // escape_replacement
Option<fn(&str) -> String>, // casefold
) -> Token;
#[derive(Debug, Clone)]
pub enum LexerMode {
String(String), // Match a literal string
Regex(Regex, fn(&str) -> bool), // Match using a regex
FancyRegex(FancyRegex, fn(&str) -> bool), // Match using a regex
Function(fn(&str, Dialect) -> Option<&str>),
}
impl Display for LexerMode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match *self {
LexerMode::Regex(_, _) => write!(f, "RegexMatcher"),
LexerMode::FancyRegex(_, _) => write!(f, "FancyRegexMatcher"),
LexerMode::String(_) => write!(f, "StringMatcher"),
LexerMode::Function(_) => write!(f, "FunctionMatcher"),
}
}
}
pub struct LexedElement<'a> {
pub raw: &'a str,
pub matcher: &'a LexMatcher,
}
impl<'a> LexedElement<'a> {
pub fn new(raw: &'a str, matcher: &'a LexMatcher) -> Self {
Self { raw, matcher }
}
}
#[derive(Debug, Clone)]
pub struct LexMatcher {
pub dialect: Dialect,
pub name: String,
pub mode: LexerMode,
pub token_class_func: TokenGenerator,
pub subdivider: Option<Box<LexMatcher>>,
pub trim_post_subdivide: Option<Box<LexMatcher>>,
pub trim_start: Option<Vec<String>>,
pub trim_chars: Option<Vec<String>>,
pub quoted_value: Option<(String, RegexModeGroup)>,
pub escape_replacements: Option<(String, String)>,
pub casefold: Option<fn(&str) -> String>,
pub kwarg_type: Option<String>,
}
impl Display for LexMatcher {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "<{}: {}>", self.mode, self.name)
}
}
impl LexMatcher {
pub fn string_lexer(
dialect: Dialect,
name: &str,
template: &str,
token_class_func: TokenGenerator,
subdivider: Option<Box<LexMatcher>>,
trim_post_subdivide: Option<Box<LexMatcher>>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacements: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
kwarg_type: Option<String>,
) -> Self {
Self {
dialect,
name: name.to_string(),
mode: LexerMode::String(template.to_string()),
token_class_func,
subdivider,
trim_post_subdivide,
trim_start,
trim_chars,
quoted_value,
escape_replacements,
casefold,
kwarg_type,
}
}
fn base_regex_lexer(
dialect: Dialect,
name: &str,
pattern: &str,
token_class_func: TokenGenerator,
subdivider: Option<Box<LexMatcher>>,
trim_post_subdivide: Option<Box<LexMatcher>>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacements: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
fallback_lexer: Option<fn(&str, Dialect) -> Option<&str>>,
precheck: fn(&str) -> bool,
kwarg_type: Option<String>,
) -> Self {
let mode = match RegexBuilder::new(&pattern).build() {
Ok(regex) => LexerMode::Regex(regex, precheck),
Err(_) => match FancyRegexBuilder::new(&pattern).build() {
Ok(regex) => LexerMode::FancyRegex(regex, precheck),
Err(_) => {
if let Some(fallback) = fallback_lexer {
LexerMode::Function(fallback)
} else {
panic!(
"Unable to compile regex {} and no fallback function provided",
pattern
)
}
}
},
};
Self {
dialect,
name: name.to_string(),
mode,
token_class_func,
subdivider,
trim_post_subdivide,
trim_start,
trim_chars,
quoted_value,
escape_replacements,
casefold,
kwarg_type,
}
}
pub fn regex_lexer(
dialect: Dialect,
name: &str,
template: &str,
token_class_func: TokenGenerator,
subdivider: Option<Box<LexMatcher>>,
trim_post_subdivide: Option<Box<LexMatcher>>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacements: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
fallback_lexer: Option<fn(&str, Dialect) -> Option<&str>>,
precheck: fn(&str) -> bool,
kwarg_type: Option<String>,
) -> Self {
let pattern = format!(r"(?s)\A(?:{})", template);
Self::base_regex_lexer(
dialect,
name,
&pattern,
token_class_func,
subdivider,
trim_post_subdivide,
trim_start,
trim_chars,
quoted_value,
escape_replacements,
casefold,
fallback_lexer,
precheck,
kwarg_type,
)
}
pub fn regex_subdivider(
dialect: Dialect,
name: &str,
template: &str,
token_class_func: TokenGenerator,
subdivider: Option<Box<LexMatcher>>,
trim_post_subdivide: Option<Box<LexMatcher>>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacements: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
fallback_lexer: Option<fn(&str, Dialect) -> Option<&str>>,
precheck: fn(&str) -> bool,
kwarg_type: Option<String>,
) -> Self {
let pattern = format!(r"(?:{})", template);
Self::base_regex_lexer(
dialect,
name,
&pattern,
token_class_func,
subdivider,
trim_post_subdivide,
trim_start,
trim_chars,
quoted_value,
escape_replacements,
casefold,
fallback_lexer,
precheck,
kwarg_type,
)
}
pub fn scan_match<'a>(&'a self, input: &'a str) -> Option<(Vec<LexedElement<'a>>, usize)> {
// let t = Instant::now();
if input.is_empty() {
panic!("Unexpected empty string!");
}
// Match based on the mode
let matched = match &self.mode {
LexerMode::String(template) => input
.starts_with(template)
.then(|| LexedElement::new(template, self)),
LexerMode::Regex(regex, is_match_valid) => {
if !(is_match_valid)(input) {
// println!("{},{}", self.name, t.elapsed().as_nanos());
return None;
}
regex
.find(input)
.map(|mat| LexedElement::new(mat.as_str(), self))
}
LexerMode::FancyRegex(regex, is_match_valid) => {
if !(is_match_valid)(input) {
// println!("{},{}", self.name, t.elapsed().as_nanos());
return None;
}
regex
.find(input)
.ok()
.flatten()
.map(|mat| LexedElement::new(mat.as_str(), self))
}
LexerMode::Function(function) => {
(function)(input, self.dialect).map(|s| LexedElement::new(s, self))
}
};
// println!("{},{}", self.name, t.elapsed().as_nanos());
// Handle subdivision and trimming
if let Some(matched) = matched {
let len = matched.raw.len();
let elements = self.subdivide(matched);
Some((elements, len))
} else {
None
}
}
fn search(&self, input: &str) -> Option<(usize, usize)> {
match &self.mode {
LexerMode::String(template) => input.find(template).map(|start| {
let end = start + template.len();
(start, end)
}),
LexerMode::Regex(regex, _) => regex.find(input).map(|mat| (mat.start(), mat.end())),
LexerMode::FancyRegex(regex, _) => regex
.find(input)
.ok()
.flatten()
.map(|mat| (mat.start(), mat.end())),
_ => todo!(),
}
}
fn subdivide<'a>(&'a self, matched: LexedElement<'a>) -> Vec<LexedElement<'a>> {
if let Some(subdivider) = &self.subdivider {
let mut elements = Vec::new();
let mut buffer = matched.raw;
while !buffer.is_empty() {
if let Some((start, end)) = subdivider.search(buffer) {
let trimmed_elems = self.trim_match(&buffer[..start]);
elements.extend(trimmed_elems);
elements.push(LexedElement {
raw: &buffer[start..end],
matcher: subdivider,
});
buffer = &buffer[end..];
} else {
let trimmed_elems = self.trim_match(buffer);
elements.extend(trimmed_elems);
break;
}
}
elements
} else {
vec![matched]
}
}
fn trim_match<'a>(&'a self, raw: &'a str) -> Vec<LexedElement<'a>> {
let mut elements = Vec::new();
let mut buffer = raw;
let mut content_buffer = 0..0;
if let Some(trim_post_subdivide) = &self.trim_post_subdivide {
while !buffer.is_empty() {
if let Some((start, end)) = trim_post_subdivide.search(buffer) {
if start == 0 {
// Starting match
elements.push(LexedElement {
raw: &buffer[..end],
matcher: trim_post_subdivide,
});
buffer = &buffer[end..];
content_buffer = end..end;
} else if end == buffer.len() {
elements.push(LexedElement {
raw: &raw[content_buffer.start..content_buffer.end + start],
matcher: self,
});
elements.push(LexedElement {
raw: &buffer[start..end],
matcher: trim_post_subdivide,
});
return elements;
} else {
content_buffer.end += end;
buffer = &buffer[end..];
}
} else {
break;
}
}
}
if !content_buffer.is_empty() || !buffer.is_empty() {
elements.push(LexedElement::new(&raw[content_buffer.start..], self));
}
elements
}
pub fn construct_token(&self, raw: &str, pos_marker: PositionMarker) -> Token {
let instance_types = match self.kwarg_type.clone() {
Some(t) => vec![t],
None => vec![self.name.clone()],
};
(self.token_class_func)(
raw.to_string(),
pos_marker,
HashSet::new(),
instance_types,
self.trim_start.clone(),
self.trim_chars.clone(),
self.quoted_value.clone(),
self.escape_replacements.clone(),
self.casefold,
)
}
}
pub fn extract_nested_block_comment(input: &str, dialect: Dialect) -> Option<&str> {
let mut chars = input.chars().peekable();
let mut comment = String::new();
// Ensure the input starts with "/*"
if chars.next() != Some('/') || chars.next() != Some('*') {
return None;
}
comment.push_str("/*"); // Add the opening delimiter
let mut depth = 1; // Track nesting level
while let Some(c) = chars.next() {
comment.push(c);
if c == '/' && chars.peek() == Some(&'*') {
chars.next(); // Consume '*'
comment.push('*');
depth += 1;
} else if c == '*' && chars.peek() == Some(&'/') {
chars.next(); // Consume '/'
comment.push('/');
depth -= 1;
if depth == 0 {
return Some(&input[..comment.len()]);
}
}
}
// If we reach here, the comment wasn't properly closed
match &dialect {
Dialect::Sqlite => Some(&input[..comment.len()]),
_ => None,
}
}
// TODO: implement python passthroughs
#[cfg(feature = "python")]
pub mod python {}
#[cfg(test)]
mod test {
use crate::{dialect::Dialect, token::Token};
use super::{extract_nested_block_comment, LexMatcher};
#[test]
fn test_subdivide() {
let block_comment_matcher = LexMatcher::regex_lexer(
Dialect::Ansi,
"block_comment",
r#"\/\*([^\*]|\*(?!\/))*\*\/"#,
Token::comment_token_compat,
Some(Box::new(LexMatcher::regex_subdivider(
Dialect::Ansi,
"newline",
r#"\r\n|\n"#,
Token::newline_token_compat,
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
Some(Box::new(LexMatcher::regex_subdivider(
Dialect::Ansi,
"whitespace",
r#"[^\S\r\n]+"#,
Token::whitespace_token_compat,
None,
None,
None,
None,
None,
None,
None,
None,
|_| true,
None,
))),
None,
None,
None,
None,
None,
Some(extract_nested_block_comment),
|input| input.starts_with("/"),
None,
);
let (elems, _) = block_comment_matcher
.scan_match("/*\n)\n*/")
.expect("should match");
for elem in elems {
println!("{}: {}", elem.matcher.name, elem.raw);
}
}
}
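
A short illustration of the nested block comment fallback above, in the same spirit as the test module (the literal strings are only examples): nesting bumps the depth counter until the outermost */ closes it, and unterminated comments only match for SQLite, which tolerates a trailing open comment.

use sqlfluffrs::dialect::Dialect;
use sqlfluffrs::matcher::extract_nested_block_comment;

fn demo_nested_block_comment() {
    let nested = "/* outer /* inner */ still outer */ trailing";
    assert_eq!(
        extract_nested_block_comment(nested, Dialect::Ansi),
        Some("/* outer /* inner */ still outer */"),
    );
    // Unterminated comments are rejected for most dialects...
    assert_eq!(extract_nested_block_comment("/* open", Dialect::Ansi), None);
    // ...but accepted as-is for SQLite.
    assert_eq!(
        extract_nested_block_comment("/* open", Dialect::Sqlite),
        Some("/* open"),
    );
}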

25
sqlfluffrs/src/python.rs Normal file
View File

@@ -0,0 +1,25 @@
use crate::lexer::python::{PyLexer, PySQLLexError};
use crate::marker::python::PyPositionMarker;
use crate::templater::{
fileslice::python::{PyRawFileSlice, PyTemplatedFileSlice},
templatefile::python::PyTemplatedFile,
};
use crate::token::python::PyToken;
use pyo3::prelude::*;
/// A Python module implemented in Rust.
#[pymodule(name = "sqlfluffrs", module = "sqlfluffrs")]
fn sqlfluffrs(m: &Bound<'_, PyModule>) -> PyResult<()> {
let env = env_logger::Env::default().filter_or("RUST_LOG", "warn");
env_logger::Builder::from_env(env)
.try_init()
.unwrap_or_else(|_| log::warn!("env_logger already initialized!"));
m.add_class::<PyToken>()?;
m.add_class::<PyTemplatedFile>()?;
m.add_class::<PyTemplatedFileSlice>()?;
m.add_class::<PyRawFileSlice>()?;
m.add_class::<PySQLLexError>()?;
m.add_class::<PyLexer>()?;
m.add_class::<PyPositionMarker>()?;
Ok(())
}

85
sqlfluffrs/src/regex.rs Normal file
View File

@@ -0,0 +1,85 @@
use std::fmt::Display;
use fancy_regex::{Regex as FancyRegex, RegexBuilder as FancyRegexBuilder};
#[cfg(feature = "python")]
use pyo3::pyclass;
use regex::{Regex, RegexBuilder};
#[cfg_attr(feature = "python", pyclass)]
#[derive(Debug, Clone)]
pub enum RegexModeGroup {
Index(usize),
Name(String),
}
impl From<usize> for RegexModeGroup {
fn from(idx: usize) -> Self {
RegexModeGroup::Index(idx)
}
}
impl From<&str> for RegexModeGroup {
fn from(name: &str) -> Self {
RegexModeGroup::Name(name.to_string())
}
}
impl From<String> for RegexModeGroup {
fn from(name: String) -> Self {
RegexModeGroup::Name(name)
}
}
#[derive(Debug, Clone)]
pub enum RegexMode {
Regex(Regex), // Match using a regex
FancyRegex(FancyRegex), // Match using a regex
}
impl RegexMode {
pub fn new(pattern: &str) -> Self {
// Try to compile with the standard regex first
if let Ok(re) = RegexBuilder::new(pattern).build() {
RegexMode::Regex(re)
} else if let Ok(re) = FancyRegexBuilder::new(pattern).build() {
RegexMode::FancyRegex(re)
} else {
panic!("Invalid regex pattern: {}", pattern);
}
}
pub fn capture(&self, group: impl Into<RegexModeGroup>, text: &str) -> Option<String> {
match self {
RegexMode::Regex(re) => {
let caps = re.captures(text)?;
match group.into() {
RegexModeGroup::Index(idx) => caps.get(idx).map(|m| m.as_str().to_string()),
RegexModeGroup::Name(name) => caps.name(&name).map(|m| m.as_str().to_string()),
}
}
RegexMode::FancyRegex(re) => {
let caps = re.captures(text).ok()??;
match group.into() {
RegexModeGroup::Index(idx) => caps.get(idx).map(|m| m.as_str().to_string()),
RegexModeGroup::Name(name) => caps.name(&name).map(|m| m.as_str().to_string()),
}
}
}
}
pub fn replace_all(&self, text: &str, replacement: &str) -> String {
match self {
RegexMode::Regex(re) => re.replace_all(text, replacement).to_string(),
RegexMode::FancyRegex(re) => re.replace_all(text, replacement).to_string(),
}
}
}
impl Display for RegexMode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match *self {
RegexMode::Regex(_) => write!(f, "Regex"),
RegexMode::FancyRegex(_) => write!(f, "FancyRegex"),
}
}
}
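
Tying this back to the generated matchers: the quoted_value pattern extracts the inner value of a quoted literal and the escape_replacements pair collapses doubled or escaped quotes, values which are later consumed when tokens normalise their raw value (see the Token::normalize call in the token constructors further down). A sketch using the Trino single_quote values from earlier; the demo function itself is illustrative:

use sqlfluffrs::regex::{RegexMode, RegexModeGroup};

fn demo_quote_normalisation() {
    // Pattern and replacement pair taken from the Trino single_quote matcher.
    let quoted = RegexMode::new(r#"'((?:[^'\\]|\\.|'')*)'"#);
    let inner = quoted.capture(RegexModeGroup::Index(1), "'it''s'").unwrap();
    assert_eq!(inner, "it''s");

    let escapes = RegexMode::new(r#"\\'|''"#);
    assert_eq!(escapes.replace_all(&inner, "'"), "it's");
}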

67
sqlfluffrs/src/slice.rs Normal file
View File

@@ -0,0 +1,67 @@
use std::{fmt::Display, ops::Range};
use serde::{Deserialize, Serialize};
#[derive(Debug, PartialEq, Hash, Eq, Clone, Copy, Serialize, Deserialize)]
pub struct Slice {
pub start: usize,
pub stop: usize,
}
impl From<Range<usize>> for Slice {
fn from(value: Range<usize>) -> Self {
Self {
start: value.start,
stop: value.end,
}
}
}
impl Slice {
pub fn slice_is_point(test_slice: &Range<usize>) -> bool {
test_slice.start == test_slice.end
}
pub fn len(&self) -> usize {
self.stop - self.start
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
}
impl Display for Slice {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "slice({}, {}, None)", self.start, self.stop)
}
}
#[cfg(feature = "python")]
pub mod python {
use super::Slice;
use pyo3::{prelude::*, types::PySlice};
impl<'py> FromPyObject<'py> for Slice {
fn extract_bound(obj: &pyo3::Bound<'py, pyo3::PyAny>) -> PyResult<Self> {
let start = obj.getattr("start")?.extract::<usize>()?;
let stop = obj.getattr("stop")?.extract::<usize>()?;
Ok(Slice { start, stop })
}
}
impl<'py> IntoPyObject<'py> for Slice {
type Target = PySlice; // the Python type
type Output = Bound<'py, Self::Target>; // in most cases this will be `Bound`
type Error = PyErr; // the conversion error type, has to be convertible to `PyErr`
fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
Ok(PySlice::new(
py,
self.start.try_into()?,
self.stop.try_into()?,
1,
))
}
}
}
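
For reference, the Range conversion and the point/length helpers behave as follows (illustrative values):

use sqlfluffrs::slice::Slice;

fn demo_slice() {
    let s = Slice::from(3..7);
    assert_eq!((s.start, s.stop), (3, 7));
    assert_eq!(s.len(), 4);
    assert!(!s.is_empty());
    // A zero-width range is a "point", mirroring slice_is_point in marker.rs.
    assert!(Slice::slice_is_point(&(5..5)));
}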

View File

@@ -0,0 +1,323 @@
use serde::{Deserialize, Serialize};
use crate::slice::Slice;
#[derive(Debug, PartialEq, Clone, Hash, Serialize, Deserialize)]
pub struct RawFileSlice {
pub raw: String, // Source string
pub slice_type: String,
pub source_idx: usize, // Offset from beginning of source string
// Block index, incremented on start or end block tags, e.g. "if", "for".
// This is used in `BaseRule.discard_unsafe_fixes()` to reject any fixes
// which span multiple templated blocks.
pub block_idx: usize,
// The command of a templated tag, e.g. "if", "for"
// This is used in template tracing as a kind of cache to identify the kind
// of template element this is without having to re-extract it each time.
pub tag: Option<String>,
}
impl RawFileSlice {
pub fn new(
raw: String,
slice_type: String,
source_idx: usize,
block_idx: Option<usize>,
tag: Option<String>,
) -> Self {
RawFileSlice {
raw,
slice_type,
source_idx,
block_idx: block_idx.unwrap_or_default(),
tag,
}
}
pub fn end_source_idx(&self) -> usize {
// Return the closing index of this slice.
let len: usize = self.raw.chars().count();
self.source_idx + len
}
pub fn source_slice(&self) -> Slice {
Slice::from(self.source_idx..self.end_source_idx())
}
pub fn is_source_only_slice(&self) -> bool {
// Based on its slice_type, does it only appear in the *source*?
// There are some slice types which are automatically source only.
// There are *also* some which are source only because they render
// to an empty string.
// TODO: should any new logic go here?
matches!(
self.slice_type.as_str(),
"comment" | "block_end" | "block_start" | "block_mid"
)
}
}
#[derive(Debug, PartialEq, Clone, Hash, Serialize, Deserialize)]
pub struct TemplatedFileSlice {
pub slice_type: String,
pub source_codepoint_slice: Slice,
pub templated_codepoint_slice: Slice,
}
impl TemplatedFileSlice {
pub fn new(
slice_type: String,
source_codepoint_slice: Slice,
templated_codepoint_slice: Slice,
) -> Self {
TemplatedFileSlice {
slice_type,
source_codepoint_slice,
templated_codepoint_slice,
}
}
}
#[cfg(feature = "python")]
pub mod python {
use bincode;
use pyo3::{prelude::*, types::PyBytes};
use serde::{Deserialize, Serialize};
use crate::slice::Slice;
use super::{RawFileSlice, TemplatedFileSlice};
#[pyclass(name = "RsRawFileSlice", module = "sqlfluffrs")]
#[repr(transparent)]
#[derive(Clone, Debug, PartialEq, Hash, Serialize, Deserialize)]
pub struct PyRawFileSlice(pub(crate) RawFileSlice);
#[pymethods]
impl PyRawFileSlice {
#[new]
#[pyo3(signature = (raw, slice_type, source_idx, block_idx=0, tag=None))]
pub fn new(
raw: String,
slice_type: String,
source_idx: usize,
block_idx: Option<usize>,
tag: Option<String>,
) -> Self {
Self(RawFileSlice::new(
raw, slice_type, source_idx, block_idx, tag,
))
}
pub fn __setstate__(&mut self, state: Bound<'_, PyBytes>) -> PyResult<()> {
*self = bincode::deserialize(state.as_bytes()).map_err(|e| {
PyErr::new::<pyo3::exceptions::PyException, _>(format!(
"Deserialization error: {}",
e
))
})?;
Ok(())
}
pub fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyBytes>> {
let bytes = bincode::serialize(&self.0).map_err(|e| {
PyErr::new::<pyo3::exceptions::PyException, _>(format!(
"Serialization error: {}",
e
))
})?;
Ok(PyBytes::new(py, &bytes))
}
pub fn __getnewargs__(&self) -> PyResult<(String, String, usize, usize, Option<String>)> {
Ok((
self.raw(),
self.slice_type(),
self.source_idx(),
self.block_idx(),
self.tag(),
))
}
#[getter]
pub fn raw(&self) -> String {
self.0.raw.clone()
}
#[getter]
pub fn slice_type(&self) -> String {
self.0.slice_type.clone()
}
#[getter]
pub fn source_idx(&self) -> usize {
self.0.source_idx
}
#[getter]
pub fn block_idx(&self) -> usize {
self.0.block_idx
}
#[getter]
pub fn tag(&self) -> Option<String> {
self.0.tag.clone()
}
}
impl From<PyRawFileSlice> for RawFileSlice {
fn from(value: PyRawFileSlice) -> Self {
value.0
}
}
impl From<RawFileSlice> for PyRawFileSlice {
fn from(value: RawFileSlice) -> Self {
Self(value)
}
}
#[pyclass(name = "RsTemplatedFileSlice", module = "sqlfluffrs")]
#[repr(transparent)]
#[derive(Clone, Debug, PartialEq, Hash, Serialize, Deserialize)]
pub struct PyTemplatedFileSlice(pub(crate) TemplatedFileSlice);
#[pymethods]
impl PyTemplatedFileSlice {
#[new]
fn new(
slice_type: String,
source_codepoint_slice: Slice,
templated_codepoint_slice: Slice,
) -> Self {
Self(TemplatedFileSlice::new(
slice_type,
source_codepoint_slice,
templated_codepoint_slice,
))
}
pub fn __setstate__(&mut self, state: Bound<'_, PyBytes>) -> PyResult<()> {
*self = bincode::deserialize(state.as_bytes()).map_err(|e| {
PyErr::new::<pyo3::exceptions::PyException, _>(format!(
"Deserialization error: {}",
e
))
})?;
Ok(())
}
pub fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyBytes>> {
let bytes = bincode::serialize(&self.0).map_err(|e| {
PyErr::new::<pyo3::exceptions::PyException, _>(format!(
"Serialization error: {}",
e
))
})?;
Ok(PyBytes::new(py, &bytes))
}
pub fn __getnewargs__(&self) -> PyResult<(String, Slice, Slice)> {
Ok((
self.0.slice_type.clone(),
self.0.source_codepoint_slice,
self.0.templated_codepoint_slice,
))
}
#[getter]
fn slice_type(&self) -> PyResult<String> {
Ok(self.0.slice_type.clone())
}
#[getter]
fn source_slice(&self) -> PyResult<Slice> {
Ok(self.0.source_codepoint_slice)
}
#[getter]
fn templated_slice(&self) -> PyResult<Slice> {
Ok(self.0.templated_codepoint_slice)
}
}
impl From<PyTemplatedFileSlice> for TemplatedFileSlice {
fn from(value: PyTemplatedFileSlice) -> Self {
value.0
}
}
impl From<TemplatedFileSlice> for PyTemplatedFileSlice {
fn from(value: TemplatedFileSlice) -> Self {
Self(value)
}
}
pub mod sqlfluff {
use pyo3::prelude::*;
use crate::{
slice::Slice,
templater::fileslice::{RawFileSlice, TemplatedFileSlice},
};
use super::{PyRawFileSlice, PyTemplatedFileSlice};
#[derive(Clone, IntoPyObject)]
pub struct PySqlFluffTemplatedFileSlice(pub PyTemplatedFileSlice);
impl<'py> FromPyObject<'py> for PySqlFluffTemplatedFileSlice {
fn extract_bound(obj: &pyo3::Bound<'py, pyo3::PyAny>) -> PyResult<Self> {
let slice_type = obj.getattr("slice_type")?.extract::<String>()?;
let source_slice = obj.getattr("source_slice")?.extract::<Slice>()?;
let templated_slice = obj.getattr("templated_slice")?.extract::<Slice>()?;
Ok(Self(PyTemplatedFileSlice(TemplatedFileSlice::new(
slice_type,
source_slice,
templated_slice,
))))
}
}
impl From<PySqlFluffTemplatedFileSlice> for PyTemplatedFileSlice {
fn from(value: PySqlFluffTemplatedFileSlice) -> Self {
value.0
}
}
impl From<PySqlFluffTemplatedFileSlice> for TemplatedFileSlice {
fn from(value: PySqlFluffTemplatedFileSlice) -> Self {
value.0 .0
}
}
#[derive(Clone)]
pub struct PySqlFluffRawFileSlice(pub PyRawFileSlice);
impl<'py> FromPyObject<'py> for PySqlFluffRawFileSlice {
fn extract_bound(obj: &pyo3::Bound<'py, pyo3::PyAny>) -> PyResult<Self> {
let raw = obj.getattr("raw")?.extract::<String>()?;
let slice_type = obj.getattr("slice_type")?.extract::<String>()?;
let source_idx = obj.getattr("source_idx")?.extract::<usize>().ok();
let block_idx = obj.getattr("block_idx")?.extract::<usize>().ok();
let tag = obj.getattr("tag")?.extract::<Option<String>>()?;
Ok(Self(PyRawFileSlice(RawFileSlice::new(
raw.clone(),
slice_type,
source_idx.unwrap_or(raw.len()),
block_idx,
tag,
))))
}
}
impl From<PySqlFluffRawFileSlice> for PyRawFileSlice {
fn from(value: PySqlFluffRawFileSlice) -> Self {
value.0
}
}
impl From<PySqlFluffRawFileSlice> for RawFileSlice {
fn from(value: PySqlFluffRawFileSlice) -> Self {
value.0 .0
}
}
}
}
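
A worked example of the slice arithmetic above (the template text is illustrative): end_source_idx counts characters forward from source_idx, source_slice wraps that range, and "block_start" is one of the slice types treated as source-only.

use sqlfluffrs::slice::Slice;
use sqlfluffrs::templater::fileslice::RawFileSlice;

fn demo_raw_file_slice() {
    let rfs = RawFileSlice::new(
        "{% if flag %}".to_string(), // 13 characters
        "block_start".to_string(),
        10,
        Some(0),
        Some("if".to_string()),
    );
    assert_eq!(rfs.end_source_idx(), 23);
    assert_eq!(rfs.source_slice(), Slice::from(10..23));
    assert!(rfs.is_source_only_slice());
}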

View File

@@ -0,0 +1,2 @@
pub mod fileslice;
pub mod templatefile;

File diff suppressed because it is too large

View File

@@ -0,0 +1,296 @@
// Wrapper functions that maintain the old TokenGenerator signature for backward compatibility
// These are used by the generated dialect matcher code
use super::{config::TokenConfig, Token};
use crate::{marker::PositionMarker, regex::RegexModeGroup};
use hashbrown::HashSet;
impl Token {
// Wrapper functions that convert from the old 9-parameter signature to TokenConfig
pub fn whitespace_token_compat(
raw: String,
pos_marker: PositionMarker,
class_types: HashSet<String>,
instance_types: Vec<String>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
) -> Self {
Self::whitespace_token(
raw,
pos_marker,
TokenConfig {
class_types,
instance_types,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
},
)
}
pub fn newline_token_compat(
raw: String,
pos_marker: PositionMarker,
class_types: HashSet<String>,
instance_types: Vec<String>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
) -> Self {
Self::newline_token(
raw,
pos_marker,
TokenConfig {
class_types,
instance_types,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
},
)
}
pub fn comment_token_compat(
raw: String,
pos_marker: PositionMarker,
class_types: HashSet<String>,
instance_types: Vec<String>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
) -> Self {
Self::comment_token(
raw,
pos_marker,
TokenConfig {
class_types,
instance_types,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
},
)
}
pub fn code_token_compat(
raw: String,
pos_marker: PositionMarker,
class_types: HashSet<String>,
instance_types: Vec<String>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
) -> Self {
Self::code_token(
raw,
pos_marker,
TokenConfig {
class_types,
instance_types,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
},
)
}
pub fn symbol_token_compat(
raw: String,
pos_marker: PositionMarker,
class_types: HashSet<String>,
instance_types: Vec<String>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
) -> Self {
Self::symbol_token(
raw,
pos_marker,
TokenConfig {
class_types,
instance_types,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
},
)
}
pub fn identifier_token_compat(
raw: String,
pos_marker: PositionMarker,
class_types: HashSet<String>,
instance_types: Vec<String>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
) -> Self {
Self::identifier_token(
raw,
pos_marker,
TokenConfig {
class_types,
instance_types,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
},
)
}
pub fn literal_token_compat(
raw: String,
pos_marker: PositionMarker,
class_types: HashSet<String>,
instance_types: Vec<String>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
) -> Self {
Self::literal_token(
raw,
pos_marker,
TokenConfig {
class_types,
instance_types,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
},
)
}
pub fn binary_operator_token_compat(
raw: String,
pos_marker: PositionMarker,
class_types: HashSet<String>,
instance_types: Vec<String>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
) -> Self {
Self::binary_operator_token(
raw,
pos_marker,
TokenConfig {
class_types,
instance_types,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
},
)
}
pub fn comparison_operator_token_compat(
raw: String,
pos_marker: PositionMarker,
class_types: HashSet<String>,
instance_types: Vec<String>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
) -> Self {
Self::comparison_operator_token(
raw,
pos_marker,
TokenConfig {
class_types,
instance_types,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
},
)
}
pub fn word_token_compat(
raw: String,
pos_marker: PositionMarker,
class_types: HashSet<String>,
instance_types: Vec<String>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
) -> Self {
Self::word_token(
raw,
pos_marker,
TokenConfig {
class_types,
instance_types,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
},
)
}
pub fn unlexable_token_compat(
raw: String,
pos_marker: PositionMarker,
class_types: HashSet<String>,
instance_types: Vec<String>,
trim_start: Option<Vec<String>>,
trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
) -> Self {
Self::unlexable_token(
raw,
pos_marker,
TokenConfig {
class_types,
instance_types,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
},
)
}
}
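
These wrappers exist so that generated code and tests can hand a plain fn pointer to LexMatcher instead of writing a closure, as the test in matcher.rs does. A sketch of the equivalent string matcher built with a compat wrapper (the Ansi dialect and the matcher name are illustrative; the eight trailing None arguments mirror the generated calls):

use sqlfluffrs::dialect::Dialect;
use sqlfluffrs::matcher::LexMatcher;
use sqlfluffrs::token::Token;

// Equivalent to the closure form used in the generated dialect files.
fn comma_matcher() -> LexMatcher {
    LexMatcher::string_lexer(
        Dialect::Ansi,
        "comma",
        ",",
        Token::code_token_compat,
        None, None, None, None, None, None, None, None,
    )
}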

View File

@@ -0,0 +1,93 @@
use crate::regex::RegexModeGroup;
use hashbrown::HashSet;
/// Configuration for token construction, grouping optional parameters
#[derive(Debug, Clone, Default)]
pub struct TokenConfig {
pub class_types: HashSet<String>,
pub instance_types: Vec<String>,
pub trim_start: Option<Vec<String>>,
pub trim_chars: Option<Vec<String>>,
pub quoted_value: Option<(String, RegexModeGroup)>,
pub escape_replacement: Option<(String, String)>,
pub casefold: Option<fn(&str) -> String>,
}
impl TokenConfig {
/// Create a new TokenConfig with default values (all empty/None)
pub fn new() -> Self {
Self::default()
}
/// Create TokenConfig with only instance_types set
pub fn with_instance_types(instance_types: Vec<String>) -> Self {
Self {
instance_types,
..Default::default()
}
}
/// Create TokenConfig with class_types and instance_types
pub fn with_types(class_types: HashSet<String>, instance_types: Vec<String>) -> Self {
Self {
class_types,
instance_types,
..Default::default()
}
}
/// Builder method to add trim_start
pub fn trim_start(mut self, chars: Vec<String>) -> Self {
self.trim_start = Some(chars);
self
}
/// Builder method to add trim_chars
pub fn trim_chars(mut self, chars: Vec<String>) -> Self {
self.trim_chars = Some(chars);
self
}
/// Builder method to add quoted_value
pub fn quoted_value(mut self, value: String, mode: RegexModeGroup) -> Self {
self.quoted_value = Some((value, mode));
self
}
/// Builder method to add escape_replacement
pub fn escape_replacement(mut self, pattern: String, replacement: String) -> Self {
self.escape_replacement = Some((pattern, replacement));
self
}
/// Builder method to add casefold function
pub fn casefold(mut self, func: fn(&str) -> String) -> Self {
self.casefold = Some(func);
self
}
}
/// Helper to extract individual fields for backward compatibility
impl TokenConfig {
pub fn into_parts(
self,
) -> (
HashSet<String>,
Vec<String>,
Option<Vec<String>>,
Option<Vec<String>>,
Option<(String, RegexModeGroup)>,
Option<(String, String)>,
Option<fn(&str) -> String>,
) {
(
self.class_types,
self.instance_types,
self.trim_start,
self.trim_chars,
self.quoted_value,
self.escape_replacement,
self.casefold,
)
}
}
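Illustrative only, not part of the commit: a rough use of the builder above to describe a single-quoted literal. `RegexModeGroup::Index` is assumed to take a capture-group index, as suggested by the Python bindings later in this commit; the module and test names are hypothetical.
#[cfg(test)]
mod config_sketch {
    use crate::regex::RegexModeGroup;
    use crate::token::config::TokenConfig;

    #[test]
    fn builds_a_quoted_literal_config() {
        // Strip surrounding single quotes, capture the inner value with the
        // first regex group, and collapse doubled quotes on normalisation.
        let config = TokenConfig::with_instance_types(vec!["quoted_literal".to_string()])
            .trim_chars(vec!["'".to_string()])
            .quoted_value(r"'(.*)'".to_string(), RegexModeGroup::Index(1))
            .escape_replacement(r"''".to_string(), "'".to_string());
        assert!(config.quoted_value.is_some());
        assert!(config.escape_replacement.is_some());
    }
}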

View File

@@ -0,0 +1,442 @@
use super::{config::TokenConfig, Token};
use crate::{marker::PositionMarker, slice::Slice, templater::templatefile::TemplatedFile};
use std::sync::Arc;
use hashbrown::HashSet;
use uuid::Uuid;
impl Token {
pub fn base_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
segments: Vec<Token>,
) -> Self {
let TokenConfig {
class_types,
instance_types,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
} = config;
let (token_types, class_types) = iter_base_types("base", class_types.clone());
let raw_value = Token::normalize(&raw, quoted_value.clone(), escape_replacement.clone());
Self {
token_type: token_types,
instance_types,
class_types,
comment_separate: false,
is_meta: false,
allow_empty: false,
pos_marker: Some(pos_marker),
raw,
is_whitespace: false,
is_code: true,
is_comment: false,
_default_raw: "".to_string(),
indent_value: 0,
is_templated: false,
block_uuid: None,
source_str: None,
block_type: None,
parent: None,
parent_idx: None,
segments,
preface_modifier: "".to_string(),
suffix: "".to_string(),
uuid: Uuid::new_v4().as_u128(),
source_fixes: None,
trim_start,
trim_chars,
quoted_value,
escape_replacement,
casefold,
raw_value,
}
}
pub fn raw_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
) -> Self {
let (token_type, class_types) = iter_base_types("raw", config.class_types.clone());
let suffix = format!("'{}'", raw.escape_debug().to_string().trim_matches('"'));
let mut token = Token::base_token(
raw,
pos_marker,
TokenConfig {
class_types,
..config
},
vec![],
);
token.suffix = suffix;
token.token_type = token_type;
token
}
pub fn code_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
) -> Self {
Self::raw_token(raw, pos_marker, config)
}
pub fn symbol_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
) -> Self {
let (token_type, class_types) = iter_base_types("symbol", config.class_types.clone());
let mut token = Self::code_token(
raw,
pos_marker,
TokenConfig {
class_types,
..config
},
);
token.token_type = token_type;
token
}
pub fn identifier_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
) -> Self {
let (token_type, class_types) = iter_base_types("identifier", config.class_types.clone());
let mut token = Self::code_token(
raw,
pos_marker,
TokenConfig {
class_types,
..config
},
);
token.token_type = token_type;
token
}
pub fn literal_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
) -> Self {
let (token_type, class_types) = iter_base_types("literal", config.class_types.clone());
let mut token = Self::code_token(
raw,
pos_marker,
TokenConfig {
class_types,
..config
},
);
token.token_type = token_type;
token
}
pub fn binary_operator_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
) -> Self {
let (token_type, class_types) = iter_base_types("binary_operator", config.class_types.clone());
let mut token = Self::code_token(
raw,
pos_marker,
TokenConfig {
class_types,
..config
},
);
token.token_type = token_type;
token
}
pub fn comparison_operator_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
) -> Self {
let (token_type, class_types) = iter_base_types("comparison_operator", config.class_types.clone());
let mut token = Self::code_token(
raw,
pos_marker,
TokenConfig {
class_types,
..config
},
);
token.token_type = token_type;
token
}
pub fn word_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
) -> Self {
let (token_type, class_types) = iter_base_types("word", config.class_types.clone());
let mut token = Self::code_token(
raw,
pos_marker,
TokenConfig {
class_types,
..config
},
);
token.token_type = token_type;
token
}
pub fn unlexable_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
) -> Self {
let (token_type, class_types) = iter_base_types("unlexable", config.class_types.clone());
let mut token = Self::code_token(
raw,
pos_marker,
TokenConfig {
class_types,
..config
},
);
token.token_type = token_type;
token
}
pub fn whitespace_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
) -> Self {
let (token_type, class_types) = iter_base_types("whitespace", config.class_types.clone());
let mut token = Self::raw_token(
raw,
pos_marker,
TokenConfig {
class_types,
..config
},
);
token.token_type = token_type;
token.is_whitespace = true;
token.is_code = false;
token.is_comment = false;
token._default_raw = " ".to_string();
token
}
pub fn newline_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
) -> Self {
let (token_type, class_types) = iter_base_types("newline", config.class_types.clone());
let mut token = Self::raw_token(
raw,
pos_marker,
TokenConfig {
class_types,
..config
},
);
token.token_type = token_type;
token.is_whitespace = true;
token.is_code = false;
token.is_comment = false;
token._default_raw = "\n".to_string();
token
}
pub fn comment_token(
raw: String,
pos_marker: PositionMarker,
config: TokenConfig,
) -> Self {
let (token_type, class_types) = iter_base_types("comment", config.class_types.clone());
let mut token = Self::raw_token(
raw,
pos_marker,
TokenConfig {
class_types,
..config
},
);
token.token_type = token_type;
token.is_code = false;
token.is_comment = true;
token
}
pub fn meta_token(
pos_marker: PositionMarker,
is_templated: bool,
block_uuid: Option<Uuid>,
class_types: HashSet<String>,
) -> Self {
let (token_type, class_types) = iter_base_types("meta", class_types.clone());
let mut token = Self::raw_token(
"".to_string(),
pos_marker,
TokenConfig {
class_types,
instance_types: vec![],
..TokenConfig::default()
},
);
token.token_type = token_type;
token.is_code = false;
token.is_meta = true;
token.is_templated = is_templated;
token.block_uuid = block_uuid;
token.preface_modifier = "[META] ".to_string();
token.suffix = String::new();
token
}
pub fn end_of_file_token(
pos_marker: PositionMarker,
is_templated: bool,
block_uuid: Option<Uuid>,
class_types: HashSet<String>,
) -> Self {
let (token_type, class_types) = iter_base_types("end_of_file", class_types);
Self {
token_type,
..Self::meta_token(
pos_marker,
is_templated,
block_uuid,
class_types,
)
}
}
pub fn indent_token(
pos_marker: PositionMarker,
is_templated: bool,
block_uuid: Option<Uuid>,
class_types: HashSet<String>,
) -> Self {
let (token_type, class_types) = iter_base_types("indent", class_types);
Self {
token_type,
indent_value: 1,
suffix: block_uuid
.map(|u| u.as_hyphenated().to_string())
.unwrap_or_default(),
..Self::meta_token(
pos_marker,
is_templated,
block_uuid,
class_types,
)
}
}
pub fn dedent_token(
pos_marker: PositionMarker,
is_templated: bool,
block_uuid: Option<Uuid>,
class_types: HashSet<String>,
) -> Self {
let (token_type, class_types) = iter_base_types("dedent", class_types);
Self {
token_type,
indent_value: -1,
..Self::indent_token(
pos_marker,
is_templated,
block_uuid,
class_types,
)
}
}
pub fn template_loop_token(
pos_marker: PositionMarker,
block_uuid: Option<Uuid>,
class_types: HashSet<String>,
) -> Self {
let (token_type, class_types) = iter_base_types("template_loop", class_types);
Self {
token_type,
..Self::meta_token(
pos_marker,
false,
block_uuid,
class_types,
)
}
}
pub fn template_placeholder_token(
pos_marker: PositionMarker,
source_string: String,
block_type: String,
block_uuid: Option<Uuid>,
class_types: HashSet<String>,
) -> Self {
let (token_type, class_types) = iter_base_types("placeholder", class_types);
Self {
token_type,
block_type: Some(block_type),
source_str: Some(source_string),
..Self::meta_token(
pos_marker,
false,
block_uuid,
class_types,
)
}
}
pub fn template_placeholder_token_from_slice(
source_slice: Slice,
templated_slice: Slice,
block_type: String,
templated_file: &Arc<TemplatedFile>,
block_uuid: Option<Uuid>,
class_types: HashSet<String>,
) -> Self {
let pos_marker = PositionMarker::new(
source_slice,
templated_slice,
templated_file,
None,
None,
);
Self {
..Self::template_placeholder_token(
pos_marker,
templated_file
.source_str
.chars()
.skip(source_slice.start)
.take(source_slice.len())
.collect::<String>(),
block_type,
block_uuid,
class_types,
)
}
}
}
fn iter_base_types(token_type: &str, class_types: HashSet<String>) -> (String, HashSet<String>) {
let mut class_types = class_types;
let token_type = token_type.to_string();
class_types.insert(token_type.clone());
(token_type, class_types)
}
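Illustrative only, not part of the commit: a sketch of how the constructors above compose. Each specialised constructor registers its own type via iter_base_types before delegating, so a whitespace token ends up carrying the "whitespace", "raw" and "base" class types. Helper signatures (`TemplatedFile::from`, `PositionMarker::new`, `Slice`) are assumed from the test module in mod.rs; the module name is hypothetical.
#[cfg(test)]
mod construction_sketch {
    use std::sync::Arc;
    use crate::marker::PositionMarker;
    use crate::slice::Slice;
    use crate::templater::templatefile::TemplatedFile;
    use crate::token::{config::TokenConfig, Token};

    #[test]
    fn whitespace_token_accumulates_class_types() {
        let tf = Arc::new(TemplatedFile::from("  ".to_string()));
        let tok = Token::whitespace_token(
            "  ".to_string(),
            PositionMarker::new(
                Slice { start: 0, stop: 2 },
                Slice { start: 0, stop: 2 },
                &tf,
                None,
                None,
            ),
            TokenConfig::default(),
        );
        assert_eq!(tok.get_type(), "whitespace");
        assert!(tok.is_whitespace());
        // Each constructor in the chain registers its own type before
        // delegating: whitespace -> raw -> base.
        for t in ["whitespace", "raw", "base"] {
            assert!(tok.class_types().contains(t));
        }
    }
}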

View File

@@ -0,0 +1,24 @@
use std::hash::Hash;
use super::Token;
impl PartialEq for Token {
fn eq(&self, other: &Self) -> bool {
self.uuid == other.uuid
|| (self.token_type == other.token_type
&& self.raw == other.raw
&& self.pos_marker.is_some()
&& other.pos_marker.is_some()
&& self.pos_marker == other.pos_marker)
}
}
impl Hash for Token {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.token_type.hash(state);
self.raw.hash(state);
if let Some(p) = self.pos_marker.as_ref() {
p.working_loc().hash(state);
}
}
}
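Illustrative only, not part of the commit: a small check of the contract above. Equality and hashing ignore the per-construction uuid, so two separately built but otherwise identical tokens compare equal and hash the same. Constructor helpers are assumed as in the mod.rs tests; the module and function names are hypothetical.
#[cfg(test)]
mod eq_sketch {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    use std::sync::Arc;
    use crate::marker::PositionMarker;
    use crate::slice::Slice;
    use crate::templater::templatefile::TemplatedFile;
    use crate::token::{config::TokenConfig, Token};

    fn hash_of(token: &Token) -> u64 {
        let mut hasher = DefaultHasher::new();
        token.hash(&mut hasher);
        hasher.finish()
    }

    #[test]
    fn uuid_is_ignored_by_eq_and_hash() {
        let tf = Arc::new(TemplatedFile::from("foo".to_string()));
        let make = || {
            Token::word_token(
                "foo".to_string(),
                PositionMarker::new(
                    Slice { start: 0, stop: 3 },
                    Slice { start: 0, stop: 3 },
                    &tf,
                    None,
                    None,
                ),
                TokenConfig::default(),
            )
        };
        let (a, b) = (make(), make());
        assert_ne!(a.uuid, b.uuid); // every construction draws a fresh uuid
        assert_eq!(a, b); // equal via the (type, raw, pos_marker) branch
        assert_eq!(hash_of(&a), hash_of(&b)); // hash uses type, raw and working location
    }
}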

View File

@@ -0,0 +1,21 @@
use crate::slice::Slice;
#[derive(Debug, Clone)]
pub struct SourceFix {
edit: String,
source_slice: Slice,
templated_slice: Slice,
}
impl PartialEq for SourceFix {
fn eq(&self, other: &Self) -> bool {
self.edit == other.edit && self.source_slice == other.source_slice
}
}
impl std::hash::Hash for SourceFix {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.edit.hash(state);
self.source_slice.hash(state);
}
}

View File

@@ -0,0 +1,14 @@
use super::Token;
use std::fmt::Display;
impl Display for Token {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"<{}: ({}) '{}'>",
self.token_type.clone(),
self.pos_marker.clone().expect("PositionMarker unset"),
self.raw.escape_debug(),
)
}
}

830
sqlfluffrs/src/token/mod.rs Normal file
View File

@@ -0,0 +1,830 @@
pub mod compat;
pub mod config;
pub mod construction;
mod eq;
pub mod fix;
mod fmt;
pub mod path;
#[cfg(feature = "python")]
pub mod python;
use std::{
fmt::Write,
sync::{Arc, Weak},
};
use fix::SourceFix;
use hashbrown::HashSet;
use path::PathStep;
use uuid::Uuid;
use crate::{
marker::PositionMarker,
regex::{RegexMode, RegexModeGroup},
};
#[derive(Debug, Clone, PartialEq)]
pub enum TupleSerialisedSegment {
Str(String, String),
Nested(String, Vec<TupleSerialisedSegment>),
}
#[derive(Debug, Clone)]
pub struct Token {
pub token_type: String,
pub instance_types: Vec<String>,
pub class_types: HashSet<String>,
pub comment_separate: bool,
pub is_meta: bool,
pub allow_empty: bool,
pub pos_marker: Option<PositionMarker>,
pub raw: String,
is_whitespace: bool,
is_code: bool,
is_comment: bool,
_default_raw: String,
pub indent_value: i32,
pub is_templated: bool,
pub block_uuid: Option<Uuid>,
pub source_str: Option<String>,
pub block_type: Option<String>,
parent: Option<Weak<Token>>,
parent_idx: Option<usize>,
pub segments: Vec<Token>,
preface_modifier: String,
suffix: String,
pub uuid: u128,
pub source_fixes: Option<Vec<SourceFix>>,
pub trim_start: Option<Vec<String>>,
pub trim_chars: Option<Vec<String>>,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
casefold: Option<fn(&str) -> String>,
raw_value: String,
}
impl Token {
fn comments(&self) -> Vec<Token> {
self.segments
.clone()
.into_iter()
.filter(|s| s.is_type(&["comment"]))
.collect::<Vec<_>>()
}
fn non_comments(&self) -> Vec<Token> {
self.segments
.clone()
.into_iter()
.filter(|s| !s.is_type(&["comment"]))
.collect::<Vec<_>>()
}
/// Returns True if this segment is code.
pub fn is_code(&self) -> bool {
match self.is_raw() {
true => self.is_code,
false => self.segments.iter().any(|s| s.is_code()),
}
}
fn code_indices(&self) -> Vec<usize> {
self.segments
.iter()
.enumerate()
.filter(|(_i, s)| s.is_code())
.map(|(i, _s)| i)
.collect()
}
pub fn is_comment(&self) -> bool {
match self.is_raw() {
true => self.is_comment,
false => self.segments.iter().all(|s| s.is_comment()),
}
}
pub fn is_whitespace(&self) -> bool {
match self.is_raw() {
true => self.is_whitespace,
false => self.segments.iter().all(|s| s.is_whitespace()),
}
}
pub fn raw(&self) -> String {
self.raw.clone()
}
pub fn raw_upper(&self) -> String {
self.raw.to_uppercase()
}
pub fn normalize(
value: &str,
quoted_value: Option<(String, RegexModeGroup)>,
escape_replacement: Option<(String, String)>,
) -> String {
let mut str_buffer = value.to_string();
if let Some((ref regex_str, idx)) = quoted_value {
if let Some(captured) = RegexMode::new(regex_str).capture(idx, value) {
str_buffer = captured
}
}
if let Some((ref regex_str, ref replacement)) = escape_replacement {
str_buffer = RegexMode::new(regex_str).replace_all(&str_buffer, replacement.as_str());
}
str_buffer
}
pub fn raw_segments(&self) -> Vec<Token> {
match self.is_raw() {
true => vec![self.clone()],
false => self
.segments
.iter()
.flat_map(|s| s.raw_segments())
.collect::<Vec<_>>(),
}
}
/// The set of full types for this token, including inherited.
/// Adds the surrogate type for raw segments.
pub fn class_types(&self) -> HashSet<String> {
let mut full_types = self.instance_types.iter().cloned().collect::<HashSet<_>>();
full_types.extend(self.class_types.clone());
full_types
}
pub fn descendant_type_set(&self) -> HashSet<String> {
self.segments
.iter()
.flat_map(|seg| {
seg.descendant_type_set()
.union(&seg.class_types())
.cloned()
.collect::<HashSet<String>>()
})
.collect::<HashSet<String>>()
}
pub fn direct_descendant_type_set(&self) -> HashSet<String> {
self.segments
.iter()
.flat_map(|seg| seg.class_types())
.collect::<HashSet<String>>()
}
pub fn raw_segments_with_ancestors(&self) -> Vec<(Token, Vec<PathStep>)> {
todo!()
}
pub fn source_fixes(&self) -> Vec<SourceFix> {
match self.is_raw() {
true => self.source_fixes.clone().unwrap_or_default(),
false => self
.segments
.iter()
.flat_map(|s| s.source_fixes())
.collect(),
}
}
pub fn first_non_whitespace_segment_raw_upper(&self) -> Option<String> {
self.raw_segments().iter().find_map(|seg| {
if !seg.raw_upper().trim().is_empty() {
Some(seg.raw_upper().clone())
} else {
None
}
})
}
pub fn is_templated(&self) -> bool {
let pos_marker = self.pos_marker.clone().expect("PositionMarker must be set");
pos_marker.source_slice.start != pos_marker.source_slice.stop && !pos_marker.is_literal()
}
pub fn get_type(&self) -> String {
self.token_type.clone()
}
pub fn is_type(&self, seg_types: &[&str]) -> bool {
if self
.instance_types
.iter()
.any(|s| seg_types.contains(&s.as_str()))
{
return true;
}
self.class_is_type(seg_types)
}
pub fn get_raw_segments(&self) -> Vec<Token> {
todo!()
}
pub fn raw_trimmed(&self) -> String {
let mut raw_buff = self.raw.clone();
// Trim start sequences
if let Some(trim_start) = &self.trim_start {
for seq in trim_start {
raw_buff = raw_buff.strip_prefix(seq).unwrap_or(&raw_buff).to_string();
}
}
// Trim specified characters from both ends
if let Some(trim_chars) = &self.trim_chars {
raw_buff = self.raw.clone(); // Reset raw_buff before trimming chars
for seq in trim_chars {
while raw_buff.starts_with(seq) {
raw_buff = raw_buff.strip_prefix(seq).unwrap_or(&raw_buff).to_string();
}
while raw_buff.ends_with(seq) {
raw_buff = raw_buff.strip_suffix(seq).unwrap_or(&raw_buff).to_string();
}
}
}
raw_buff
}
fn _raw_normalized(&self) -> String {
todo!()
}
pub fn raw_normalized(&self) -> String {
todo!()
}
pub fn stringify(&self, ident: usize, tabsize: usize, code_only: bool) -> String {
let mut buff = String::new();
let preface = self.preface(ident, tabsize);
writeln!(buff, "{}", preface).unwrap();
if !code_only && self.comment_separate && !self.comments().is_empty() {
if !self.comments().is_empty() {
writeln!(buff, "{}Comments:", " ".repeat((ident + 1) * tabsize)).unwrap();
for seg in &self.comments() {
let segment_string = seg.stringify(ident + 2, tabsize, code_only);
buff.push_str(&segment_string);
}
}
if !self.non_comments().is_empty() {
writeln!(buff, "{}Code:", " ".repeat((ident + 1) * tabsize)).unwrap();
for seg in &self.non_comments() {
let segment_string = seg.stringify(ident + 2, tabsize, code_only);
buff.push_str(&segment_string);
}
}
} else {
for seg in &self.segments {
if !code_only || seg.is_code {
let segment_string = seg.stringify(ident + 1, tabsize, code_only);
buff.push_str(&segment_string);
}
}
}
buff
}
pub fn edit(&self, raw: Option<String>, source_fixes: Option<Vec<SourceFix>>) -> Self {
Self {
raw: raw.unwrap_or(self.raw.clone()),
source_fixes: Some(source_fixes.unwrap_or(self.source_fixes())),
uuid: Uuid::new_v4().as_u128(),
..self.clone()
}
}
// pub fn _get_raw_segment_kwargs(&self) -> HashMap<String, _> {
// let kwargs = HashMap::new();
// kwargs.insert("quoted_value", self.quoted_value);
// kwargs.insert("escape_replacements", vec![self.escape_replacement]);
// kwargs
// }
pub fn iter_unparseables(&self) -> Vec<Token> {
self.segments
.iter()
.flat_map(|s| s.iter_unparseables())
.collect()
}
pub fn set_parent(&mut self, parent: Arc<Token>, idx: usize) {
self.parent = Some(Arc::downgrade(&parent));
self.parent_idx = Some(idx);
}
pub fn class_is_type(&self, seg_types: &[&str]) -> bool {
let seg_hash: HashSet<&str> = seg_types.iter().cloned().collect();
!self
.class_types
.iter()
.filter(|s| seg_hash.contains(s.as_str()))
.collect::<Vec<_>>()
.is_empty()
}
pub fn count_segments(&self, raw_only: bool) -> usize {
if self.is_raw() {
1
} else {
let self_count = if raw_only { 0 } else { 1 };
self.segments
.iter()
.fold(0, |acc, s| acc + s.count_segments(raw_only) + self_count)
}
}
pub fn is_raw(&self) -> bool {
self.segments.is_empty()
}
pub fn block_type(&self) -> Option<String> {
self.block_type.clone()
}
pub fn recursive_crawl(
&self,
seg_types: &[&str],
recurse_into: bool,
no_recursive_seg_type: Option<&[&str]>,
allow_self: bool,
) -> Vec<Token> {
let mut results = Vec::new();
// If recurse_into is False and this matches, don't recurse
if !recurse_into && self.is_type(seg_types) {
if allow_self {
results.push(self.clone());
}
return results;
}
// Check if self matches the given segment types
if allow_self && self.is_type(seg_types) {
results.push(self.clone());
}
// Convert no_recursive_seg_type to HashSet for efficient lookups
let no_recursive_set: HashSet<&str> = no_recursive_seg_type
.unwrap_or(&[])
.iter()
.cloned()
.collect();
// Recursively process child segments
for seg in &self.segments {
if no_recursive_set.contains(seg.token_type.as_str()) {
continue;
}
results.extend(seg.recursive_crawl(
seg_types,
recurse_into,
no_recursive_seg_type,
true,
));
}
results
}
pub fn path_to(self, other: Self) -> Vec<PathStep> {
// Return empty if they are the same segment.
if self == other {
return vec![];
}
// If there are no child segments, return empty.
if self.segments.is_empty() {
return vec![];
}
// Identifying the highest parent we can using any preset parent values.
let mut midpoint = other.clone();
let mut lower_path = Vec::new();
while let Some(weak_parent) = &midpoint.parent.clone().as_ref() {
if let Some(parent) = weak_parent.upgrade() {
let parent_idx = midpoint.parent_idx.expect("Parent index must be set.");
lower_path.push(PathStep {
segment: Arc::clone(&parent),
idx: parent_idx,
len: parent.segments.len(),
code_idxs: parent.code_indices().clone(),
});
midpoint = Arc::unwrap_or_clone(parent);
if midpoint == self {
break;
}
} else {
break;
}
}
// Reverse the path so far
lower_path.reverse();
// If we have already found the parent, return.
if midpoint == self {
return lower_path;
}
// If we've gone all the way up to the file segment, return empty.
if midpoint.class_is_type(&["file"]) {
return vec![];
}
// Check if midpoint is within self's range.
if !(self.get_start_loc() <= midpoint.get_start_loc()
&& midpoint.get_start_loc() <= self.get_end_loc())
{
return vec![];
}
// Now, work downward from `self` toward `midpoint`.
for (idx, seg) in self.segments.clone().iter().enumerate() {
// Set the parent if it's not already set.
let mut seg = seg.clone();
seg.set_parent(Arc::new(self.clone()), idx);
let step = PathStep {
segment: Arc::new(self.clone()),
idx,
len: self.segments.clone().len(),
code_idxs: self.code_indices().clone(),
};
// If we found the target
if seg == midpoint {
let mut result = vec![step];
result.extend(lower_path);
return result;
}
// Check recursively if a path exists
let res = seg.path_to(midpoint.clone());
if !res.is_empty() {
let mut result = vec![step];
result.extend(res);
result.extend(lower_path);
return result;
}
}
// Not found.
vec![]
}
pub fn get_start_loc(&self) -> (usize, usize) {
self.pos_marker
.clone()
.expect("PositionMarker unset")
.working_loc()
}
pub fn get_end_loc(&self) -> (usize, usize) {
self.pos_marker
.clone()
.expect("PositionMarker unset")
.working_loc_after(&self.raw)
}
pub fn recursive_crawl_all(&self, reverse: bool) -> Box<dyn Iterator<Item = &Token> + '_> {
if reverse {
Box::new(
self.segments
.iter()
.rev()
.flat_map(move |seg| seg.recursive_crawl_all(reverse))
.chain(std::iter::once(self)),
)
} else {
Box::new(
std::iter::once(self).chain(
self.segments
.iter()
.flat_map(move |seg| seg.recursive_crawl_all(reverse)),
),
)
}
}
fn preface(&self, ident: usize, tabsize: usize) -> String {
let padding = " ".repeat(ident * tabsize);
let padded_type = format!("{}{}{}:", padding, self.preface_modifier, self.get_type());
let pos = self.pos_marker.clone();
let suffix = self.suffix.clone();
let preface = format!(
"{:<20}|{:<60} {}",
pos.clone()
.expect("PositionMarker unset")
.to_source_string(),
padded_type,
suffix
);
preface.trim_end().to_string()
}
pub fn to_tuple(
&self,
code_only: Option<bool>,
show_raw: Option<bool>,
include_meta: Option<bool>,
) -> TupleSerialisedSegment {
let code_only = code_only.unwrap_or_default();
let show_raw = show_raw.unwrap_or_default();
let include_meta = include_meta.unwrap_or_default();
// If `show_raw` is true and there are no child segments, return (type, raw)
if show_raw && self.segments.is_empty() {
return TupleSerialisedSegment::Str(self.get_type(), self.raw.clone());
}
// Determine filtering criteria for child segments
let filtered_segments: Vec<TupleSerialisedSegment> = self
.segments
.iter()
.filter(|seg| {
if code_only {
seg.is_code && !seg.is_meta
} else {
include_meta || !seg.is_meta
}
})
.map(|seg| seg.to_tuple(Some(code_only), Some(show_raw), Some(include_meta)))
.collect();
TupleSerialisedSegment::Nested(self.get_type(), filtered_segments)
}
pub fn copy(
&self,
segments: Option<Vec<Token>>,
parent: Option<Arc<Token>>,
parent_idx: Option<usize>,
) -> Token {
let mut new_segment = self.clone();
new_segment.parent = parent.as_ref().map(Arc::downgrade);
new_segment.parent_idx = parent_idx;
if let Some(ref segs) = segments {
new_segment.segments = segs.clone();
} else {
new_segment.segments = self
.segments
.iter()
.enumerate()
.map(|(idx, seg)| {
seg.copy(
None,
Some(Arc::new(new_segment.clone())),
Some(idx),
)
})
.collect();
}
new_segment
}
pub fn position_segments(segments: &[Token], parent_pos: PositionMarker) -> Vec<Token> {
assert!(
!segments.is_empty(),
"position_segments called on empty sequence."
);
let mut line_no = parent_pos.working_line_no;
let mut line_pos = parent_pos.working_line_pos;
let mut segment_buffer = Vec::new();
for (idx, segment) in segments.iter().enumerate() {
let old_position = segment.pos_marker.clone();
let mut new_position = segment.pos_marker.clone();
// If position is missing, try to infer it
if new_position.is_none() {
let mut start_point = None;
if idx > 0 {
let prev_seg: &Token = &segment_buffer[idx - 1];
if let Some(ref pos_marker) = prev_seg.pos_marker {
start_point = Some(pos_marker.end_point_marker());
}
} else {
start_point = Some(parent_pos.start_point_marker());
}
// Search forward for the end point
let mut end_point = None;
for fwd_seg in &segments[idx + 1..] {
if let Some(ref pos_marker) = fwd_seg.pos_marker {
end_point = Some(pos_marker.start_point_marker());
break;
}
}
new_position = match (start_point, end_point) {
(Some(start), Some(end)) if start != end => {
Some(PositionMarker::from_points(&start, &end))
}
(Some(start), _) => Some(start),
(_, Some(end)) => Some(end),
_ => panic!("Unable to position new segment"),
};
}
let new_position = new_position.expect("Position should be assigned");
let new_position = new_position.with_working_position(line_no, line_pos);
let (new_line_no, new_line_pos) =
new_position.infer_next_position(&segment.raw, line_no, line_pos);
line_no = new_line_no;
line_pos = new_line_pos;
// If position changed, recursively process child segments before copying
let new_segment =
if !segment.segments.is_empty() && old_position != Some(new_position.clone()) {
let child_segments =
Token::position_segments(&segment.segments, new_position.clone());
segment.copy(Some(child_segments), None, None)
} else {
segment.copy(None, None, None)
};
segment_buffer.push(new_segment);
}
segment_buffer
}
// /// Simplifies the structure of the token recursively for serialization.
// pub fn structural_simplify(&self) -> HashMap<String, Option<serde_json::Value>> {
// let mut result = HashMap::new();
// let key = self.get_type();
// if self.segments.is_empty() {
// // If there are no child segments, return the raw value.
// result.insert(key, Some(serde_json::Value::String(self.raw.clone())));
// } else {
// // Simplify all child segments recursively.
// let mut child_results = Vec::new();
// for segment in &self.segments {
// child_results.push(serde_json::Value::Object(
// segment.structural_simplify(),
// ));
// }
// // Check for duplicate keys in child results.
// let mut subkeys = Vec::new();
// for child in &child_results {
// if let serde_json::Value::Object(map) = child {
// subkeys.extend(map.keys().cloned());
// }
// }
// if subkeys.len() != subkeys.iter().collect::<std::collections::HashSet<_>>().len() {
// // If there are duplicate keys, use a list of child objects.
// result.insert(key, Some(serde_json::Value::Array(child_results)));
// } else {
// // Otherwise, merge child objects into a single map.
// let mut merged_map = HashMap::new();
// for child in child_results {
// if let serde_json::Value::Object(map) = child {
// for (k, v) in map {
// merged_map.insert(k, v);
// }
// }
// }
// result.insert(key, Some(serde_json::Value::Object(merged_map)));
// }
// }
// result
// }
}
#[cfg(test)]
mod tests {
use crate::matcher::TokenGenerator;
use crate::slice::Slice;
use crate::templater::templatefile::TemplatedFile;
use super::*;
/// Roughly generate test segments.
///
/// This function isn't totally robust, but good enough
/// for testing. Use with caution.
fn generate_test_segments(elems: &[&str]) -> Vec<Token> {
let mut buff = vec![];
let templated_file = Arc::new(TemplatedFile::from(
elems.iter().cloned().collect::<String>(),
));
let mut idx = 0;
for elem in elems {
let elem = &**elem;
if elem == "<indent>" {
buff.push(Token::indent_token(
PositionMarker::from_point(idx, idx, &templated_file, None, None),
false,
None,
HashSet::new(),
));
continue;
} else if elem == "<dedent>" {
buff.push(Token::dedent_token(
PositionMarker::from_point(idx, idx, &templated_file, None, None),
false,
None,
HashSet::new(),
));
continue;
}
let (token_fn, instance_types): (TokenGenerator, Vec<String>) =
match elem {
" " | "\t" => (
Token::whitespace_token_compat,
Vec::new(),
),
"\n" => (Token::newline_token_compat, Vec::new()),
"(" => (
Token::symbol_token_compat,
Vec::from_iter(["start_bracket".to_string()]),
),
")" => (
Token::symbol_token_compat,
Vec::from_iter(["end_bracket".to_string()]),
),
"[" => (
Token::symbol_token_compat,
Vec::from_iter(["start_square_bracket".to_string()]),
),
"]" => (
Token::symbol_token_compat,
Vec::from_iter(["end_square_bracket".to_string()]),
),
s if s.starts_with("--") => (
Token::comment_token_compat,
Vec::from_iter(["inline_comment".to_string()]),
),
s if s.starts_with("\"") => (
Token::code_token_compat,
Vec::from_iter(["double_quote".to_string()]),
),
s if s.starts_with("'") => (
Token::code_token_compat,
Vec::from_iter(["single_quote".to_string()]),
),
_ => (Token::code_token_compat, Vec::new()),
};
buff.push(token_fn(
elem.into(),
PositionMarker::new(
Slice {
start: idx,
stop: idx + elem.len(),
},
Slice {
start: idx,
stop: idx + elem.len(),
},
&templated_file,
None,
None,
),
HashSet::new(),
instance_types,
None,
None,
None,
None,
None,
));
idx += elem.len();
}
buff
}
fn raw_segments() -> Vec<Token> {
generate_test_segments(&["foobar", ".barfoo"])
}
#[test]
/// Test niche case of calling get_raw_segments on a raw segment.
fn test_parser_raw_get_raw_segments() {
for s in raw_segments() {
assert_eq!(s.raw_segments(), [s]);
}
}
}
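Illustrative only, not part of the commit: a sketch of to_tuple on a leaf token, reusing the same helpers assumed by the test module above. With show_raw set, a childless token serialises to a (type, raw) pair; tokens with children would produce the Nested variant consumed by the Python bindings. The module name is hypothetical.
#[cfg(test)]
mod serialisation_sketch {
    use std::sync::Arc;
    use crate::marker::PositionMarker;
    use crate::slice::Slice;
    use crate::templater::templatefile::TemplatedFile;
    use crate::token::{config::TokenConfig, Token, TupleSerialisedSegment};

    #[test]
    fn leaf_tokens_serialise_to_type_and_raw() {
        let tf = Arc::new(TemplatedFile::from("foo".to_string()));
        let tok = Token::code_token(
            "foo".to_string(),
            PositionMarker::new(
                Slice { start: 0, stop: 3 },
                Slice { start: 0, stop: 3 },
                &tf,
                None,
                None,
            ),
            TokenConfig::default(),
        );
        // With show_raw = true a childless token becomes a (type, raw) pair;
        // code_token delegates to raw_token, so its reported type is "raw".
        match tok.to_tuple(None, Some(true), None) {
            TupleSerialisedSegment::Str(t, raw) => {
                assert_eq!(t, "raw");
                assert_eq!(raw, "foo");
            }
            other => panic!("expected a leaf serialisation, got {:?}", other),
        }
    }
}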

View File

@@ -0,0 +1,10 @@
use super::Token;
use std::sync::Arc;
#[derive(Debug, Clone)]
pub struct PathStep {
pub segment: Arc<Token>,
pub idx: usize,
pub len: usize,
pub code_idxs: Vec<usize>,
}

View File

@@ -0,0 +1,567 @@
use std::{
fmt::{Debug, Display},
sync::Arc,
};
use hashbrown::HashSet;
use pyo3::{
prelude::*,
types::{PyDict, PyString, PyTuple, PyType},
};
use uuid::Uuid;
use crate::{
marker::python::{PyPositionMarker, PySqlFluffPositionMarker},
regex::RegexModeGroup,
};
use super::{path::PathStep, SourceFix, Token, TupleSerialisedSegment};
#[pyclass(name = "RsSourceFix")]
#[repr(transparent)]
#[derive(Clone)]
pub struct PySourceFix(pub SourceFix);
impl From<PySourceFix> for SourceFix {
fn from(value: PySourceFix) -> SourceFix {
value.0
}
}
impl From<SourceFix> for PySourceFix {
fn from(value: SourceFix) -> Self {
Self(value)
}
}
#[pyclass(name = "RsPathStep")]
#[repr(transparent)]
#[derive(Clone)]
pub struct PyPathStep(pub PathStep);
impl From<PyPathStep> for PathStep {
fn from(value: PyPathStep) -> Self {
value.0
}
}
impl From<PathStep> for PyPathStep {
fn from(value: PathStep) -> Self {
Self(value)
}
}
#[pyclass(name = "RsTupleSerialisedSegment")]
#[repr(transparent)]
#[derive(Clone)]
pub struct PyTupleSerialisedSegment(pub TupleSerialisedSegment);
impl PyTupleSerialisedSegment {
pub fn to_py_tuple<'py>(&self, py: Python<'py>) -> Result<Bound<'py, PyTuple>, PyErr> {
match &self.0 {
TupleSerialisedSegment::Str(segment_type, raw_value) => {
PyTuple::new(py, [segment_type, raw_value])
}
TupleSerialisedSegment::Nested(segment_type, segments) => {
let py_segment_type = PyString::new(py, segment_type);
let py_segments: Vec<_> = segments
.iter()
.map(|s| {
PyTupleSerialisedSegment::to_py_tuple(
&PyTupleSerialisedSegment(s.clone()),
py,
)
})
.collect::<Result<Vec<_>, _>>()?;
let pt_segments_tuple = PyTuple::new(py, &py_segments)?;
PyTuple::new(
py,
&[py_segment_type.into_any(), pt_segments_tuple.into_any()],
)
}
}
}
}
impl From<PyTupleSerialisedSegment> for TupleSerialisedSegment {
fn from(value: PyTupleSerialisedSegment) -> Self {
value.0
}
}
impl From<TupleSerialisedSegment> for PyTupleSerialisedSegment {
fn from(value: TupleSerialisedSegment) -> Self {
Self(value)
}
}
#[pyclass(name = "RsToken", weakref, module = "sqlfluffrs")]
#[repr(transparent)]
#[derive(Debug, Clone)]
pub struct PyToken(pub Token);
#[pymethods]
impl PyToken {
#[getter]
pub fn raw(&self) -> String {
self.0.raw.to_string()
}
pub fn raw_trimmed(&self) -> String {
self.0.raw_trimmed()
}
#[getter]
pub fn pos_marker(&self) -> Option<PyPositionMarker> {
self.0.pos_marker.clone().map(PyPositionMarker)
}
#[setter]
pub fn set_pos_marker(&mut self, value: Option<PySqlFluffPositionMarker>) {
self.0.pos_marker = value.map(Into::into);
}
pub fn get_type(&self) -> String {
self.0.get_type()
}
#[getter(r#type)]
pub fn type_(&self) -> String {
self.0.get_type()
}
#[getter]
pub fn is_templated(&self) -> bool {
self.0.is_templated()
}
#[getter]
pub fn is_code(&self) -> bool {
self.0.is_code
}
#[getter]
pub fn is_meta(&self) -> bool {
self.0.is_meta
}
#[getter]
pub fn source_str(&self) -> Option<String> {
self.0.source_str.clone()
}
#[getter]
pub fn block_type(&self) -> Option<String> {
self.0.block_type()
}
#[getter]
pub fn block_uuid(&self) -> Option<Uuid> {
self.0.block_uuid
}
#[getter]
pub fn cache_key(&self) -> String {
use std::hash::{Hash, Hasher};
use std::collections::hash_map::DefaultHasher;
let mut hasher = DefaultHasher::new();
self.0.token_type.hash(&mut hasher);
for t in &self.0.instance_types {
t.hash(&mut hasher);
}
format!("{:016x}", hasher.finish())
}
#[getter]
pub fn trim_start(&self) -> Option<Vec<String>> {
self.0.trim_start.clone()
}
#[getter]
pub fn trim_chars(&self) -> Option<Vec<String>> {
self.0.trim_chars.clone()
}
#[pyo3(signature = (raw_only = false))]
pub fn count_segments(&self, raw_only: Option<bool>) -> usize {
self.0.count_segments(raw_only.unwrap_or_default())
}
#[pyo3(signature = (*seg_type))]
pub fn is_type(&self, seg_type: &Bound<'_, PyTuple>) -> bool {
let seg_strs = seg_type
.extract::<Vec<String>>()
.expect("args should be all strings");
self.0
.is_type(&seg_strs.iter().map(String::as_str).collect::<Vec<&str>>())
}
#[getter]
pub fn indent_val(&self) -> i32 {
self.0.indent_value
}
#[getter]
pub fn is_whitespace(&self) -> bool {
self.0.is_whitespace
}
pub fn is_raw(&self) -> bool {
self.0.is_raw()
}
#[getter]
pub fn is_comment(&self) -> bool {
self.0.is_comment
}
#[getter]
pub fn class_types(&self) -> HashSet<String> {
self.0.class_types()
}
#[getter]
pub fn instance_types(&self) -> Vec<String> {
self.0.instance_types.clone()
}
#[getter]
pub fn preface_modifier(&self) -> String {
self.0.preface_modifier.clone()
}
#[getter]
pub fn source_fixes(&self) -> Vec<PySourceFix> {
self.0.source_fixes().into_iter().map(Into::into).collect()
}
#[getter]
pub fn _source_fixes(&self) -> Option<Vec<PySourceFix>> {
self.0
.source_fixes
.clone()
.map(|sf| sf.into_iter().map(Into::into).collect())
}
#[pyo3(signature = (*seg_type))]
pub fn class_is_type(&self, seg_type: &Bound<'_, PyTuple>) -> bool {
let seg_strs = seg_type
.extract::<Vec<String>>()
.expect("args should be all strings");
self.0
.class_is_type(&seg_strs.iter().map(String::as_str).collect::<Vec<&str>>())
}
#[getter]
pub fn first_non_whitespace_segment_raw_upper(&self) -> Option<String> {
self.0.first_non_whitespace_segment_raw_upper()
}
#[getter]
pub fn raw_upper(&self) -> String {
self.0.raw_upper()
}
pub fn invalidate_caches(&self) {}
#[getter]
pub fn uuid(&self) -> u128 {
self.0.uuid
}
#[getter]
pub fn descendant_type_set(&self) -> HashSet<String> {
self.0.descendant_type_set()
}
#[pyo3(signature = (*seg_type, recurse_into = true, no_recursive_seg_type = None, allow_self = true))]
pub fn recursive_crawl(
&self,
seg_type: &Bound<'_, PyTuple>,
recurse_into: bool,
no_recursive_seg_type: Option<Bound<'_, PyAny>>,
allow_self: bool,
) -> Vec<PyToken> {
let seg_type = seg_type
.extract::<Vec<String>>()
.expect("args should be all strings");
let temp: Option<Vec<String>> = match no_recursive_seg_type {
Some(py_any) => {
if let Ok(single_str) = py_any.extract::<String>() {
Some(vec![single_str]) // Convert single string into a Vec<String>
} else if let Ok(list_of_str) = py_any.extract::<Vec<String>>() {
Some(list_of_str) // Already a Vec<String>, return as is
} else {
Some(vec![]) // If it's neither, return an empty vector
}
}
None => None, // If None, apply no type filter
};
let no_recursive_seg_type: Option<Vec<&str>> = temp
.as_ref()
.map(|vec| vec.iter().map(String::as_str).collect());
self.0
.recursive_crawl(
&seg_type.iter().map(String::as_str).collect::<Vec<&str>>(),
recurse_into,
no_recursive_seg_type.as_deref(),
allow_self,
)
.into_iter()
.map(Into::into)
.collect()
}
pub fn recursive_crawl_all(&self, reverse: bool) -> Vec<PyToken> {
self.0
.recursive_crawl_all(reverse)
.map(|t| t.clone().into())
.collect()
}
#[getter]
pub fn segments(&self) -> Vec<PyToken> {
self.0
.segments
.clone()
.into_iter()
.map(Into::into)
.collect()
}
pub fn path_to(&self, other: PyToken) -> Vec<PyPathStep> {
self.0
.clone()
.path_to(other.into())
.into_iter()
.map(Into::into)
.collect()
}
pub fn get_start_loc(&self) -> (usize, usize) {
self.0.get_start_loc()
}
pub fn get_end_loc(&self) -> (usize, usize) {
self.0.get_end_loc()
}
#[getter]
pub fn raw_segments(&self) -> Vec<PyToken> {
self.0.raw_segments().into_iter().map(Into::into).collect()
}
pub fn _get_raw_segment_kwargs<'py>(&self, py: Python<'py>) -> Bound<'py, PyDict> {
let dict = PyDict::new(py);
if let Some(ref quoted_value) = self.0.quoted_value {
dict.set_item("quoted_value", quoted_value.clone()).unwrap();
} else {
dict.set_item("quoted_value", py.None()).unwrap();
}
if let Some(ref escape_replacement) = self.0.escape_replacement {
dict.set_item("escape_replacements", vec![escape_replacement])
.unwrap();
} else {
dict.set_item("escape_replacements", py.None()).unwrap();
}
dict
}
#[getter]
pub fn quoted_value(&self, py: Python<'_>) -> Option<(String, Py<PyAny>)> {
self.0.quoted_value.clone().map(|(s, g)| {
let py_group: Py<PyAny> = match g {
RegexModeGroup::Index(idx) => idx.into_pyobject(py).unwrap().into(),
RegexModeGroup::Name(name) => name.into_pyobject(py).unwrap().into(),
};
(s, py_group)
})
}
#[getter]
pub fn escape_replacements(&self) -> Option<Vec<(String, String)>> {
if self.0.escape_replacement.is_none() {
None
} else {
Some(vec![self.0.escape_replacement.clone().unwrap()])
}
}
pub fn set_parent(&self, parent: &Bound<'_, PyAny>, idx: usize) -> PyResult<()> {
let parent: Arc<Token> = parent
.extract()
.map(|t: PySqlFluffToken| Arc::new(t.0 .0))?;
let mut inner = self.0.clone();
inner.set_parent(parent, idx);
Ok(())
}
pub fn get_parent(&self) -> Option<(PyToken, i32)> {
None
}
pub fn iter_unparsables(&self) -> Vec<PyToken> {
self.0
.iter_unparseables()
.into_iter()
.map(Into::into)
.collect()
}
#[pyo3(signature = (ident=0, tabsize=4, code_only=false))]
pub fn stringify(
&self,
ident: Option<usize>,
tabsize: Option<usize>,
code_only: Option<bool>,
) -> String {
self.0.stringify(
ident.unwrap_or(0),
tabsize.unwrap_or(4),
code_only.unwrap_or_default(),
)
}
#[pyo3(signature = (code_only=None, show_raw=None, include_meta=None))]
pub fn to_tuple<'py>(
&self,
py: Python<'py>,
code_only: Option<bool>,
show_raw: Option<bool>,
include_meta: Option<bool>,
) -> Result<Bound<'py, PyTuple>, PyErr> {
PyTupleSerialisedSegment(self.0.to_tuple(code_only, show_raw, include_meta)).to_py_tuple(py)
}
// pub fn structural_simplify(&self) -> HashMap<String, Option<serde_json::Value>> {
// self.0
// .structural_simplify()
// .into_iter()
// .map(|(k, v)| (k, v.map(|v| serde_json::to_value(v).unwrap())))
// .collect()
// }
#[pyo3(signature = (segments=None, parent=None, parent_idx=None))]
pub fn copy(
&self,
segments: Option<Vec<PySqlFluffToken>>,
parent: Option<PySqlFluffToken>,
parent_idx: Option<usize>,
) -> PyToken {
PyToken(
self.0.copy(
segments.map(|s| s.into_iter().map(Into::into).collect()),
parent
.as_ref()
.map(|parent_token| Arc::clone(&parent_token.0 .0.clone().into())),
parent_idx,
),
)
}
#[pyo3(signature = (raw=None, source_fixes=None))]
pub fn edit(&self, raw: Option<String>, source_fixes: Option<Vec<PySourceFix>>) -> Self {
Self(self.0.edit(
raw,
source_fixes.map(|sf| sf.into_iter().map(Into::into).collect()),
))
}
#[classmethod]
pub fn position_segments<'py>(
_cls: &Bound<'py, PyType>,
py: Python<'py>,
segments: Vec<PySqlFluffToken>,
parent_pos: PySqlFluffPositionMarker,
) -> Result<Bound<'py, PyTuple>, PyErr> {
let tokens = Token::position_segments(
&segments
.into_iter()
.map(|s| s.into())
.collect::<Vec<Token>>(),
parent_pos.into(),
);
PyTuple::new(
py,
tokens.into_iter().map(Into::into).collect::<Vec<PyToken>>(),
)
}
pub fn __repr__(&self) -> String {
format!("{}", self)
}
}
impl Display for PyToken {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl From<PyToken> for Token {
fn from(value: PyToken) -> Token {
value.0
}
}
impl From<Token> for PyToken {
fn from(value: Token) -> Self {
Self(value)
}
}
#[derive(IntoPyObject)]
pub struct PySqlFluffToken(pub PyToken);
impl<'py> FromPyObject<'py> for PySqlFluffToken {
fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
let raw = ob.getattr("raw")?.extract::<String>()?;
let class_types = ob
.getattr("_class_types")
.unwrap_or(ob.getattr("class_types")?)
.extract::<HashSet<String>>()?
.into_iter()
.map(|s| s.to_string())
.collect::<HashSet<String>>();
let instance_types = ob
.getattr("instance_types")?
.extract::<Vec<String>>()?
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<String>>();
let segments = ob
.getattr("segments")?
.extract::<Vec<PySqlFluffToken>>()
.map(|s| s.into_iter().map(Into::into).collect::<Vec<Token>>())?;
let pos_marker = ob
.getattr("pos_marker")?
.extract::<PySqlFluffPositionMarker>()?;
use crate::token::config::TokenConfig;
Ok(Self(PyToken(Token::base_token(
raw,
pos_marker.into(),
TokenConfig {
class_types,
instance_types,
..TokenConfig::default()
},
segments,
))))
}
}
impl From<PySqlFluffToken> for Token {
fn from(value: PySqlFluffToken) -> Token {
value.0 .0
}
}
impl From<Token> for PySqlFluffToken {
fn from(value: Token) -> Self {
Self(PyToken(value))
}
}

View File

@@ -17,6 +17,12 @@ if TYPE_CHECKING: # pragma: no cover
from sqlfluff.core.parser import BaseSegment, PositionMarker
from sqlfluff.core.rules import BaseRule, LintFix
try:
from sqlfluffrs import RsSQLLexerError
except ImportError:
...
CheckTuple = tuple[str, int, int]
SerializedObject = dict[str, Union[str, int, bool, list["SerializedObject"]]]
@@ -181,6 +187,18 @@ class SQLLexError(SQLBaseError):
_code = "LXR"
_identifier = "lexing"
@classmethod
def from_rs_error(cls, rs_error: "RsSQLLexerError") -> "SQLLexError":
"""Create a SQLLexError from a RsSQLLexerError."""
return cls(
description=rs_error.desc,
line_no=rs_error.line_no,
line_pos=rs_error.line_pos,
ignore=rs_error.ignore,
fatal=rs_error.fatal,
warning=rs_error.warning,
)
class SQLParseError(SQLBaseError):
"""An error which occurred during parsing.

View File

@@ -51,7 +51,7 @@ class ParsedVariant(NamedTuple):
lexing_violations (:obj:`list` of :obj:`SQLLexError`): Any violations
raised during the lexing phase.
parsing_violations (:obj:`list` of :obj:`SQLParseError`): Any violations
raised during the lexing phase.
raised during the parsing phase.
"""
templated_file: TemplatedFile

View File

@@ -14,7 +14,13 @@ from sqlfluff.core.parser.grammar import (
Ref,
Sequence,
)
from sqlfluff.core.parser.lexer import Lexer, RegexLexer, StringLexer
from sqlfluff.core.parser.lexer import (
LexerType,
PyLexer,
RegexLexer,
StringLexer,
get_lexer_class,
)
from sqlfluff.core.parser.markers import PositionMarker
from sqlfluff.core.parser.matchable import Matchable
from sqlfluff.core.parser.parser import Parser
@@ -52,6 +58,9 @@ from sqlfluff.core.parser.segments import (
)
from sqlfluff.core.parser.types import ParseMode
# Get the appropriate lexer class (PyRsLexer if available, otherwise PyLexer)
Lexer = get_lexer_class()
__all__ = (
"BaseSegment",
"SourceFix",
@@ -95,6 +104,8 @@ __all__ = (
"RegexParser",
"PositionMarker",
"Lexer",
"PyLexer",
"LexerType",
"StringLexer",
"RegexLexer",
"Parser",

View File

@@ -15,7 +15,9 @@ from sqlfluff.core.parser.segments import (
BaseSegment,
Dedent,
EndOfFile,
ImplicitIndent,
Indent,
LiteralKeywordSegment,
MetaSegment,
RawSegment,
TemplateLoop,
@@ -723,7 +725,7 @@ def _iter_segments(
)
class Lexer:
class PyLexer:
"""The Lexer class actually does the lexing step."""
def __init__(
@@ -825,7 +827,9 @@ class Lexer:
return tuple(segment_buffer)
@staticmethod
def violations_from_segments(segments: tuple[RawSegment, ...]) -> list[SQLLexError]:
def violations_from_segments(
segments: tuple[RawSegment, ...],
) -> list[SQLLexError]:
"""Generate any lexing errors for any unlexables."""
violations = []
for segment in segments:
@@ -887,3 +891,87 @@ class Lexer:
f"{template.templated_str[template_slice]!r}"
)
return templated_buff
try:
from sqlfluffrs import RsLexer, RsToken
def get_segment_type_map(base_class: type) -> dict[str, type[RawSegment]]:
"""Dynamically create a map of segment types to their subclasses."""
segment_map = {}
for subclass in base_class.__subclasses__():
if subclass is LiteralKeywordSegment or subclass is ImplicitIndent:
continue
if (
hasattr(subclass, "type") and subclass.type
): # Ensure the subclass has a type
segment_map[subclass.type] = subclass
# Recursively add subclasses of subclasses
segment_map.update(get_segment_type_map(subclass))
return segment_map
# Dynamically generate the segment_types map
segment_types = get_segment_type_map(RawSegment)
class PyRsLexer(RsLexer):
"""A wrapper around the sqlfluffrs lexer."""
@staticmethod
def _tokens_to_segments(
tokens: list["RsToken"], py_template: TemplatedFile
) -> tuple[BaseSegment, ...]:
"""Convert tokens to segments."""
return tuple(
segment_types.get(token.type, RawSegment).from_rstoken(
token, py_template
)
for token in tokens
)
def lex(
self, raw: Union[str, TemplatedFile]
) -> tuple[tuple[BaseSegment, ...], list[SQLLexError]]:
"""Take a string or TemplatedFile and return segments."""
tokens, errors = self._lex(raw)
first_token = tokens[0]
assert first_token
template = first_token.pos_marker.templated_file
py_template = TemplatedFile(
template.source_str,
template.fname,
template.templated_str,
template.sliced_file, # type: ignore
template.raw_sliced, # type: ignore
)
return (
self._tokens_to_segments(tokens, py_template),
[SQLLexError.from_rs_error(error) for error in errors],
)
_HAS_RUST_LEXER = True
lexer_logger.info("Using sqlfluffrs lexer.")
except ImportError:
PyRsLexer = None # type: ignore[assignment, misc]
_HAS_RUST_LEXER = False
lexer_logger.info("sqlfluffrs lexer not present or failed to load.")
def get_lexer_class() -> type[Union[PyLexer, "PyRsLexer"]]:
"""Get the appropriate lexer class based on availability.
Returns PyRsLexer if the Rust extension is available,
otherwise returns PyLexer.
This function provides a single point of lexer selection,
making it easy to instantiate the correct lexer:
Lexer = get_lexer_class()
lexer = Lexer(config=config)
Returns:
The lexer class to use (PyRsLexer or PyLexer).
"""
if _HAS_RUST_LEXER:
return PyRsLexer
return PyLexer

View File

@@ -3,13 +3,15 @@
This class is a construct to keep track of positions within a file.
"""
from collections.abc import Sequence
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional
from sqlfluff.core.helpers.slice import zero_slice
if TYPE_CHECKING:
from sqlfluff.core.templaters import TemplatedFile # pragma: no cover
if TYPE_CHECKING: # pragma: no cover
from sqlfluff.core.templaters import TemplatedFile
from sqlfluffrs import RsPositionMarker
@dataclass(frozen=True)
@@ -124,7 +126,7 @@ class PositionMarker:
@classmethod
def from_child_markers(
cls, *markers: Optional["PositionMarker"]
cls, markers: Sequence[Optional["PositionMarker"]]
) -> "PositionMarker":
"""Create a parent marker from it's children."""
source_slice = slice(
@@ -249,3 +251,16 @@ class PositionMarker:
def to_source_dict(self) -> dict[str, int]:
"""Serialise the source position."""
return self.templated_file.source_position_dict_from_slice(self.source_slice)
@classmethod
def from_rs_position_marker(
cls,
rs_position_marker: "RsPositionMarker",
templated_file: "TemplatedFile",
) -> "PositionMarker":
"""Create a PositionMarker from an RsPositionMarker."""
return cls(
source_slice=rs_position_marker.source_slice,
templated_slice=rs_position_marker.templated_slice,
templated_file=templated_file,
)

View File

@@ -195,7 +195,7 @@ class BaseSegment(metaclass=SegmentMetaclass):
# If no pos given, work it out from the children.
if all(seg.pos_marker for seg in segments):
pos_marker = PositionMarker.from_child_markers(
*(seg.pos_marker for seg in segments)
[seg.pos_marker for seg in segments]
)
assert not hasattr(self, "parse_grammar"), "parse_grammar is deprecated."

View File

@@ -1,7 +1,7 @@
"""Indent and Dedent classes."""
from collections.abc import Sequence
from typing import Optional
from typing import TYPE_CHECKING, Optional
from uuid import UUID
from sqlfluff.core.parser.context import ParseContext
@@ -11,6 +11,9 @@ from sqlfluff.core.parser.segments.base import BaseSegment
from sqlfluff.core.parser.segments.raw import RawSegment, SourceFix
from sqlfluff.core.templaters.base import TemplatedFile
if TYPE_CHECKING: # pragma: no cover
from sqlfluffrs import RsToken
class MetaSegment(RawSegment):
"""A segment which is empty but indicates where something should be."""
@@ -80,6 +83,19 @@ class MetaSegment(RawSegment):
"""
return None
@classmethod
def from_rstoken(
cls,
token: "RsToken",
tf: "TemplatedFile",
) -> "MetaSegment":
"""Create a RawSegment from an RSQL token."""
segment = cls(
pos_marker=PositionMarker.from_rs_position_marker(token.pos_marker, tf),
block_uuid=token.block_uuid,
)
return segment
class EndOfFile(MetaSegment):
"""A meta segment to indicate the end of the file."""
@@ -270,3 +286,14 @@ class TemplateSegment(MetaSegment):
source_fixes=sf,
block_uuid=self.block_uuid,
)
@classmethod
def from_rstoken(cls, token: "RsToken", tf: TemplatedFile) -> "TemplateSegment":
"""Create a TemplateSegment from a token."""
segment = cls(
pos_marker=PositionMarker.from_rs_position_marker(token.pos_marker, tf),
source_str=token.source_str,
block_type=token.block_type,
block_uuid=token.block_uuid,
)
return segment

View File

@@ -4,7 +4,7 @@ This is designed to be the root segment, without
any children, and the output of the lexer.
"""
from typing import Any, Callable, Optional, Union, cast
from typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast
from uuid import uuid4
import regex as re
@@ -12,6 +12,10 @@ import regex as re
from sqlfluff.core.parser.markers import PositionMarker
from sqlfluff.core.parser.segments.base import BaseSegment, SourceFix
if TYPE_CHECKING: # pragma: no cover
from sqlfluff.core.templaters import TemplatedFile
from sqlfluffrs import RsToken
class RawSegment(BaseSegment):
"""This is a segment without any subsegments."""
@@ -299,6 +303,26 @@ class RawSegment(BaseSegment):
**new_segment_kwargs,
)
@classmethod
def from_rstoken(
cls,
token: "RsToken",
tf: "TemplatedFile",
) -> "RawSegment":
"""Create a RawSegment from an RSQL token."""
segment = cls(
raw=token.raw,
pos_marker=PositionMarker.from_rs_position_marker(token.pos_marker, tf),
instance_types=tuple(token.instance_types),
trim_start=token.trim_start,
trim_chars=token.trim_chars,
source_fixes=token.source_fixes,
uuid=token.uuid,
quoted_value=token.quoted_value,
escape_replacements=token.escape_replacements,
)
return segment
__all__ = [
"PositionMarker",

Some files were not shown because too many files have changed in this diff.