├── benches ├── csv │ ├── src │ │ ├── .gitignore │ │ └── lib.rs │ ├── build.rs │ ├── benches │ │ ├── csv.pest │ │ └── benchmarks.rs │ ├── csv.pag │ └── Cargo.toml └── json │ ├── src │ ├── .gitignore │ └── lib.rs │ ├── build.rs │ ├── Cargo.toml │ ├── benches │ ├── json.pest │ ├── json.lalrpop │ ├── json_logos.lalrpop │ ├── lalr_def.rs │ └── benchmarks.rs │ └── json.pag ├── tests ├── arith-expr │ ├── src │ │ ├── .gitignore │ │ └── lib.rs │ ├── build.rs │ ├── Cargo.toml │ └── arith.pag ├── tokenizer │ ├── src │ │ ├── .gitignore │ │ ├── lib.rs │ │ ├── generated.rs │ │ ├── length_differential.rs │ │ ├── common_prefix.rs │ │ ├── tail_differential.rs │ │ └── comment_and_string.rs │ ├── Cargo.toml │ └── build.rs └── sexpr-calculator │ ├── src │ ├── .gitignore │ └── lib.rs │ ├── build.rs │ ├── Cargo.toml │ └── sexpr.pag ├── rust-toolchain.toml ├── .gitignore ├── .github ├── images │ └── hermit-crab.png └── workflows │ └── build.yaml ├── pag-parser ├── src │ ├── tests │ │ ├── failure │ │ │ ├── err_cyclic_token.pag │ │ │ ├── err_nullable_token.pag │ │ │ ├── err_sequence_ambiguity.pag │ │ │ ├── err_undefined_token_in_lexer.pag │ │ │ ├── err_null_sequence_ambiguity.pag │ │ │ ├── err_undefined_grammar_rule.pag │ │ │ ├── err_undefined_token_in_parser.pag │ │ │ ├── err_unguarded_fixpoint.pag │ │ │ ├── err_alternation_ambiguity.pag │ │ │ ├── err_multiple_skips.pag │ │ │ ├── err_multiple_definitions_in_lexer.pag │ │ │ ├── err_multiple_definitions_in_parser.pag │ │ │ └── mod.rs │ │ └── mod.rs │ ├── type_system │ │ ├── mod.rs │ │ ├── binding_proxy.rs │ │ ├── context.rs │ │ ├── fixpoint.rs │ │ └── type_check.rs │ ├── frontend │ │ ├── example.pag │ │ ├── grammar.pest │ │ ├── syntax.rs │ │ ├── lexical.rs │ │ └── unicode.rs │ ├── core_syntax.rs │ ├── utilities.rs │ ├── nf.rs │ └── lib.rs └── Cargo.toml ├── rustfmt.toml ├── pag-lexer ├── src │ ├── utilities.rs │ ├── derivative.rs │ ├── congruence.rs │ ├── regex_tree.rs │ ├── lib.rs │ ├── lookahead.rs │ ├── normalization.rs │ ├── intervals.rs │ └── vector.rs └── Cargo.toml ├── pag-compiler ├── Cargo.toml └── src │ └── lib.rs ├── Cargo.toml ├── LICENSE-MIT ├── shell.nix ├── README.md └── LICENSE-APACHE /benches/csv/src/.gitignore: -------------------------------------------------------------------------------- 1 | parser.rs -------------------------------------------------------------------------------- /benches/json/src/.gitignore: -------------------------------------------------------------------------------- 1 | parser.rs 2 | -------------------------------------------------------------------------------- /tests/arith-expr/src/.gitignore: -------------------------------------------------------------------------------- 1 | parser.rs -------------------------------------------------------------------------------- /tests/tokenizer/src/.gitignore: -------------------------------------------------------------------------------- 1 | generated -------------------------------------------------------------------------------- /tests/sexpr-calculator/src/.gitignore: -------------------------------------------------------------------------------- 1 | parser.rs -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "nightly" 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 
/Cargo.lock 3 | .idea/ 4 | .vscode/ 5 | flamegraph.svg 6 | -------------------------------------------------------------------------------- /.github/images/hermit-crab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SchrodingerZhu/paguroidea/HEAD/.github/images/hermit-crab.png -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_cyclic_token.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a' ~ A; 3 | } 4 | 5 | parser test { 6 | active test = _; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_nullable_token.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'*; 3 | } 4 | 5 | parser test { 6 | active test = A; 7 | } 8 | -------------------------------------------------------------------------------- /benches/csv/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | pag_compiler::compile("csv.pag", "src/parser.rs"); 3 | println!("cargo:rerun-if-changed=csv.pag"); 4 | } 5 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_sequence_ambiguity.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'; 3 | } 4 | 5 | parser test { 6 | active test = A+ ~ A; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_undefined_token_in_lexer.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = C; 3 | } 4 | 5 | parser test { 6 | active test = _; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_null_sequence_ambiguity.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'; 3 | } 4 | 5 | parser test { 6 | active test = _ ~ A; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_undefined_grammar_rule.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'; 3 | } 4 | 5 | parser test { 6 | active test = test2; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_undefined_token_in_parser.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'; 3 | } 4 | 5 | parser test { 6 | active test = AA; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_unguarded_fixpoint.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'; 3 | } 4 | 5 | parser test { 6 | active test = test ~ A; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_alternation_ambiguity.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'; 3 | } 4 | 5 | parser test { 6 | active test = A+ | A ~ test; 7 | } 8 | -------------------------------------------------------------------------------- 
/tests/arith-expr/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | pag_compiler::compile("arith.pag", "src/parser.rs"); 3 | println!("cargo:rerun-if-changed=arith.pag"); 4 | } 5 | -------------------------------------------------------------------------------- /tests/sexpr-calculator/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | pag_compiler::compile("sexpr.pag", "src/parser.rs"); 3 | println!("cargo:rerun-if-changed=sexpr.pag"); 4 | } 5 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_multiple_skips.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | skip = "SKIP"; 3 | skip = "ANOTHER_SKIP"; 4 | } 5 | 6 | parser test { 7 | active test = _; 8 | } 9 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_multiple_definitions_in_lexer.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = '0'; 3 | A = '1'; 4 | } 5 | 6 | parser test { 7 | active test = _; 8 | } 9 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_multiple_definitions_in_parser.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = '0'; 3 | } 4 | 5 | parser test { 6 | active test = A; 7 | active test = A; 8 | } 9 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | ignore = [ 2 | "/tests/arith-expr/src/parser.rs", 3 | "/tests/sexpr-calculator/src/parser.rs", 4 | "/tests/tokenizer/src/generated/*.rs", 5 | "/benches/csv/src/parser.rs", 6 | "/benches/json/src/parser.rs", 7 | ] 8 | -------------------------------------------------------------------------------- /tests/tokenizer/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(portable_simd)] 2 | #![feature(core_intrinsics)] 3 | #![feature(array_chunks)] 4 | mod comment_and_string; 5 | mod common_prefix; 6 | mod generated; 7 | mod length_differential; 8 | mod tail_differential; 9 | -------------------------------------------------------------------------------- /benches/csv/benches/csv.pest: -------------------------------------------------------------------------------- 1 | text = _{ (!("," | "\"" | "\r" | "\n") ~ ANY)+ } 2 | string = _{ "\"" ~ ( "\"\"" | !"\"" ~ ANY)* ~ "\"" } 3 | crlf = _{ "\r"? 
~ "\n" } 4 | 5 | field = { text | string } 6 | record = { field ~ ("," ~ field)* ~ crlf } 7 | csv = { record+ } -------------------------------------------------------------------------------- /benches/json/build.rs: -------------------------------------------------------------------------------- 1 | extern crate lalrpop; 2 | 3 | fn main() { 4 | lalrpop::Configuration::new() 5 | .process_dir("benches/") 6 | .unwrap(); 7 | pag_compiler::compile("json.pag", "src/parser.rs"); 8 | println!("cargo:rerun-if-changed=json.pag"); 9 | } 10 | -------------------------------------------------------------------------------- /tests/arith-expr/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "arith-expr" 3 | version = "0.1.0" 4 | edition = "2021" 5 | build = "build.rs" 6 | publish = false 7 | 8 | [dependencies] 9 | rand = { version = "0.8" } 10 | 11 | [build-dependencies] 12 | pag-compiler = { path = "../../pag-compiler" } 13 | -------------------------------------------------------------------------------- /tests/sexpr-calculator/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sexpr-calculator" 3 | version = "0.1.0" 4 | edition = "2021" 5 | build = "build.rs" 6 | publish = false 7 | 8 | [dependencies] 9 | rand = { version = "0.8" } 10 | 11 | [build-dependencies] 12 | pag-compiler = { path = "../../pag-compiler" } 13 | -------------------------------------------------------------------------------- /tests/tokenizer/src/generated.rs: -------------------------------------------------------------------------------- 1 | #[path = "generated/comment_and_string.rs"] 2 | pub(crate) mod comment_and_string; 3 | #[path = "generated/common_prefix.rs"] 4 | pub(crate) mod common_prefix; 5 | #[path = "generated/length_differential.rs"] 6 | pub(crate) mod length_differential; 7 | #[path = "generated/tail_differential.rs"] 8 | pub(crate) mod tail_differential; 9 | -------------------------------------------------------------------------------- /tests/tokenizer/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tokenizer" 3 | version = "0.1.0" 4 | edition = "2021" 5 | build = "build.rs" 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | rand = { version = "0.8" } 10 | 11 | [build-dependencies] 12 | tempfile = "3.6.0" 13 | pag-compiler = { path = "../../pag-compiler" } 14 | -------------------------------------------------------------------------------- /benches/csv/csv.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | TEXT = (!('"' | '\r' | '\n' | ','))+; 3 | STRING = '"' ~ ('"' ~ '"' | !'"')* ~ '"'; 4 | CRLF = '\r'? 
~ '\n'; 5 | COMMA = ','; 6 | } 7 | 8 | parser csv { 9 | active csv 10 | = record+; 11 | 12 | active field 13 | = TEXT | STRING; 14 | 15 | active record 16 | = field ~ (COMMA ~ field)* ~ CRLF; 17 | } 18 | -------------------------------------------------------------------------------- /pag-lexer/src/utilities.rs: -------------------------------------------------------------------------------- 1 | pub fn dbg_sort<T, U, F, K>(data: T, _f: F) -> impl Iterator<Item = U> 2 | where 3 | T: IntoIterator<Item = U>, 4 | F: FnMut(&U) -> K, 5 | K: Ord, 6 | { 7 | #[cfg(not(debug_assertions))] 8 | { 9 | data.into_iter() 10 | } 11 | #[cfg(debug_assertions)] 12 | { 13 | let mut vec = Vec::from_iter(data.into_iter()); 14 | vec.sort_unstable_by_key(_f); 15 | vec.into_iter() 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /tests/sexpr-calculator/sexpr.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | DIGIT = '0' .. '9'; 3 | 4 | LPAREN = '('; 5 | RPAREN = ')'; 6 | PLUS = '+' | '加'; 7 | MULT = '*' | '乘'; 8 | INT = DIGIT+; 9 | 10 | skip = (' ' | '\t' | '\n' | '\r')+; 11 | } 12 | 13 | parser sexpr { 14 | active compound 15 | = LPAREN ~ op ~ (compound | int)* ~ RPAREN; 16 | 17 | active op 18 | = PLUS | MULT; 19 | 20 | active int 21 | = INT; 22 | 23 | active sexpr 24 | = compound | int; 25 | } 26 | -------------------------------------------------------------------------------- /pag-parser/src/type_system/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | mod binding_proxy; 10 | mod context; 11 | mod fixpoint; 12 | mod type_check; 13 | 14 | pub use fixpoint::infer_fixpoints; 15 | pub use type_check::{type_check, Type, TypeError}; 16 | -------------------------------------------------------------------------------- /tests/arith-expr/arith.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | DIGIT = '0' .. '9'; 3 | 4 | LPAREN = '('; 5 | RPAREN = ')'; 6 | PLUS = '+'; 7 | MULT = '*'; 8 | INT = DIGIT+; 9 | SPECIAL = '\u{FF}' ..
'\u{D7FF}'; 10 | 11 | skip = (' ' | '\t' | '\n' | '\r')+; 12 | } 13 | 14 | parser expr { 15 | active expr 16 | = mult ~ (PLUS ~ mult)*; 17 | 18 | active mult 19 | = primary ~ (MULT ~ primary)*; 20 | 21 | silent primary 22 | = special | int | LPAREN ~ expr ~ RPAREN; 23 | 24 | active int 25 | = INT; 26 | 27 | active special 28 | = SPECIAL; 29 | } 30 | -------------------------------------------------------------------------------- /benches/csv/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pag-csv" 3 | version = "0.1.0" 4 | edition = "2021" 5 | build = "build.rs" 6 | publish = false 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | [dependencies] 10 | rand = { version = "0.8" } 11 | snmalloc-rs = { version = "0.3", features = ["build_cc"] } 12 | 13 | [build-dependencies] 14 | pag-compiler = { path = "../../pag-compiler" } 15 | 16 | [dev-dependencies] 17 | csv = { version = "1" } 18 | criterion = { version = "0.4", features = ["html_reports"] } 19 | pest = { version = "2.5.7", features = [ "std", "memchr" ] } 20 | pest_derive = "2.5.7" 21 | 22 | [[bench]] 23 | name = "benchmarks" 24 | harness = false 25 | -------------------------------------------------------------------------------- /benches/json/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pag-json" 3 | version = "0.1.0" 4 | edition = "2021" 5 | build = "build.rs" 6 | publish = false 7 | autobenches = false 8 | 9 | [dependencies] 10 | rand = { version = "0.8" } 11 | serde_json = "1.0" 12 | 13 | [build-dependencies] 14 | pag-compiler = { path = "../../pag-compiler" } 15 | lalrpop = "0.20.0" 16 | 17 | [dev-dependencies] 18 | criterion = { version = "0.4", features = ["html_reports"] } 19 | snmalloc-rs = { version = "0.3", features = ["build_cc"] } 20 | pest = { version = "2.5.7", features = [ "std", "memchr" ] } 21 | pest_derive = "2.5.7" 22 | lalrpop-util = { version = "0.20.0", features = ["lexer", "unicode"] } 23 | logos = "0.13.0" 24 | 25 | [[bench]] 26 | name = "benchmarks" 27 | harness = false 28 | -------------------------------------------------------------------------------- /benches/json/benches/json.pest: -------------------------------------------------------------------------------- 1 | WHITESPACE = _{ (" " | "\t" | "\r" | "\n")+ } 2 | escape = _{ "\"" | "\\" | "/" | "b" | "f" | "n" | "r" | "t" } 3 | non_zero = _{'1' .. '9'} 4 | digit = _{'0' .. '9'} 5 | hex_digit = _{ '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' } 6 | string = @{ "\"" ~ ( (!("\"" | "\\") ~ ANY) | "\\" ~ (escape | ("u" ~ hex_digit ~ hex_digit ~ hex_digit ~ hex_digit )) )* ~ "\"" } 7 | number = @{ "-"? ~ ("0" | non_zero ~ digit*) ~ ("." ~ digit+)? ~ (("e" | "E") ~ ("+" | "-")? ~ digit+)? } 8 | lit_true = @{ "true" } 9 | lit_false = @{ "false" } 10 | lit_null = @{ "null" } 11 | attribute = { string ~ ":" ~ value } 12 | object = { "{" ~ (attribute ~ ("," ~ attribute)*)? ~ "}" } 13 | array = { "[" ~ (value ~ ("," ~ value)*)? 
~ "]" } 14 | value = _{string | number | array | object | lit_true | lit_false | lit_null} 15 | json = { SOI ~ value ~ EOI } -------------------------------------------------------------------------------- /pag-parser/src/frontend/example.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | // definition in lexer is not a real token, it is just a way to define a set of characters 3 | 4 | BLANK = ' '; 5 | DIGIT = '0' .. '9'; 6 | ALPHA = 'a' .. 'z' | 'A' .. 'Z'; 7 | 8 | LPAREN = '('; 9 | RPAREN = ')'; 10 | ATOM = ALPHA ~ (ALPHA | DIGIT)*; 11 | 12 | skip = (BLANK | '\t' | '\n' | '\r')+; 13 | } 14 | 15 | // parser must have a entry point 16 | parser sexpr { 17 | // definition in parser can be a real grammer rule. 18 | 19 | active compound 20 | = LPAREN ~ sexprs ~ RPAREN; 21 | 22 | // just for testing 23 | active atom 24 | = real_atom; 25 | 26 | silent real_atom 27 | = ATOM; 28 | 29 | silent sexprs 30 | = (compound | atom) *; 31 | 32 | active sexpr 33 | = compound | atom; 34 | 35 | active unreachable = unreachable; 36 | } 37 | -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | push: 4 | branches: [ "main" ] 5 | pull_request: 6 | branches: [ "main" ] 7 | env: 8 | CARGO_TERM_COLOR: always 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Initialize Rustup 15 | run: | 16 | rustup toolchain install nightly --component rustfmt clippy --profile minimal --force 17 | rustup override set nightly 18 | - name: Run build 19 | run: cargo build --verbose 20 | - name: Run rustfmt 21 | run: cargo fmt --all -- --check 22 | - name: Run clippy 23 | run: cargo clippy --all 24 | - name: Build 25 | run: cargo build --release --verbose 26 | - name: Run debug tests 27 | run: cargo test --verbose 28 | - name: Run release tests 29 | run: cargo test --verbose --release 30 | -------------------------------------------------------------------------------- /pag-parser/src/tests/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use ariadne::Source; 10 | use strip_ansi_escapes::Writer; 11 | mod failure; 12 | 13 | fn write_error, N: AsRef>(input: S, name: N) -> String { 14 | let mut buffer = Vec::::new(); 15 | { 16 | let result = crate::generate_parser(input.as_ref()).unwrap_err(); 17 | let reports = result.to_reports(name.as_ref()); 18 | let mut cache = (name.as_ref(), Source::from(input.as_ref())); 19 | let mut writer = Writer::new(&mut buffer); 20 | for i in reports { 21 | i.write(&mut cache, &mut writer).unwrap(); 22 | } 23 | } 24 | String::from_utf8(buffer).unwrap() 25 | } 26 | -------------------------------------------------------------------------------- /pag-lexer/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Paguroidea Developers 2 | # 3 | # Licensed under the Apache License, Version 2.0 4 | # or the MIT 5 | # license , at your 6 | # option. 
All files in the project carrying such notice may not be copied, 7 | # modified, or distributed except according to those terms. 8 | 9 | [package] 10 | name = "pag-lexer" 11 | keywords = ["lexer", "cfg", "grammar", "regex"] 12 | description = "Parser-lexer fusion generator (derivative lexer)" 13 | documentation = "https://docs.rs/pag-lexer/" 14 | 15 | version.workspace = true 16 | edition.workspace = true 17 | license.workspace = true 18 | exclude.workspace = true 19 | categories.workspace = true 20 | repository.workspace = true 21 | rust-version.workspace = true 22 | authors.workspace = true 23 | readme.workspace = true 24 | 25 | [dependencies] 26 | quote = "1.0.26" 27 | proc-macro2 = "1.0" 28 | smallvec = { version = "1", features = ["union"] } 29 | 30 | [dev-dependencies] 31 | syn = { version = "2.0", features = ["full"] } 32 | -------------------------------------------------------------------------------- /pag-compiler/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Paguroidea Developers 2 | # 3 | # Licensed under the Apache License, Version 2.0 4 | # or the MIT 5 | # license , at your 6 | # option. All files in the project carrying such notice may not be copied, 7 | # modified, or distributed except according to those terms. 8 | 9 | [package] 10 | name = "pag-compiler" 11 | keywords = ["parser", "cfg", "grammar"] 12 | description = "Parser-lexer fusion generator (compiler interface)" 13 | documentation = "https://docs.rs/pag-compiler/" 14 | 15 | version.workspace = true 16 | edition.workspace = true 17 | license.workspace = true 18 | exclude.workspace = true 19 | categories.workspace = true 20 | repository.workspace = true 21 | rust-version.workspace = true 22 | authors.workspace = true 23 | readme.workspace = true 24 | 25 | [dependencies] 26 | pag-parser = { version = "0.1.0-alpha.1", path = "../pag-parser" } 27 | syn = { version = "2.0", features = ["full"] } 28 | prettyplease = { version = "0.2", features = ["verbatim"] } 29 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Paguroidea Developers 2 | # 3 | # Licensed under the Apache License, Version 2.0 4 | # or the MIT 5 | # license , at your 6 | # option. All files in the project carrying such notice may not be copied, 7 | # modified, or distributed except according to those terms. 
8 | 9 | [workspace] 10 | members = [ 11 | "pag-lexer", 12 | "pag-parser", 13 | "pag-compiler", 14 | "tests/sexpr-calculator", 15 | "tests/arith-expr", 16 | "tests/tokenizer", 17 | "benches/csv", 18 | "benches/json", 19 | ] 20 | resolver = "2" 21 | 22 | [workspace.package] 23 | version = "0.1.0-alpha.1" 24 | edition = "2021" 25 | license = "MIT OR Apache-2.0" 26 | exclude = [".github/*"] 27 | categories = ["parsing"] 28 | repository = "https://github.com/SchrodingerZhu/paguroidea" 29 | rust-version = "1.71.0" 30 | authors = [ 31 | "Schrodinger ZHU Yifan ", 32 | "QuarticCat ", 33 | ] 34 | readme = "README.md" 35 | 36 | [profile.release] 37 | debug = true 38 | lto = true 39 | -------------------------------------------------------------------------------- /benches/json/benches/json.lalrpop: -------------------------------------------------------------------------------- 1 | use crate::Pvalue; 2 | 3 | grammar; 4 | 5 | Comma: Vec = { 6 | > "," => { es.push(e); es }, 7 | => vec![e], 8 | } 9 | 10 | pub Json = Value; 11 | 12 | Value: Pvalue<'input> = { 13 | "true" => Pvalue::<'input>::Bool(true), 14 | "false" => Pvalue::<'input>::Bool(false), 15 | "null" => Pvalue::<'input>::Null, 16 | => Pvalue::<'input>::String(&s[1..s.len() - 1]), 17 | => Pvalue::<'input>::Number(n), 18 | => Pvalue::<'input>::Array(a), 19 | => Pvalue::<'input>::Object(o), 20 | } 21 | 22 | Attribute: (&'input str, Pvalue<'input>) = { 23 | ":" => (s, v), 24 | } 25 | 26 | Object: Vec<(&'input str, Pvalue<'input>)> = { 27 | "{" > "}" => attr, 28 | "{" "}" => vec![], 29 | } 30 | 31 | Array: Vec> = { 32 | "[" > "]" => a, 33 | "[" "]" => vec![], 34 | } 35 | 36 | Number = r"-?(0|[1-9][0-9]*)((\.[0-9]+)?)([eE][+-]?[0-9]+)?"; 37 | 38 | String = r#""([^\\"]|\\(["\\/bfnrt]|u[0-9a-fA-F]{4}))*""#; 39 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Paguroidea Developers 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /benches/csv/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(portable_simd)] 2 | #![feature(core_intrinsics)] 3 | #![feature(array_chunks)] 4 | mod parser; 5 | 6 | pub use parser::parse; 7 | use rand::prelude::StdRng; 8 | use rand::{Rng, SeedableRng}; 9 | 10 | pub fn generate_csv(line: usize, width: usize) -> String { 11 | let mut random = std::env::var("PAG_RANDOM_SEED") 12 | .ok() 13 | .and_then(|x| x.parse().ok()) 14 | .map_or_else(StdRng::from_entropy, StdRng::seed_from_u64); 15 | let mut buffer = String::new(); 16 | for _ in 0..line { 17 | for i in 0..width { 18 | if random.gen::() < 0.5 { 19 | buffer.push_str(&format!("\"{}\"", random.gen::())); 20 | } else { 21 | buffer.push_str(&format!("{}", random.gen::())); 22 | } 23 | 24 | if i != width - 1 { 25 | buffer.push(','); 26 | } 27 | } 28 | buffer.push_str("\r\n"); 29 | } 30 | buffer 31 | } 32 | 33 | #[test] 34 | fn test_csv() { 35 | let data = generate_csv(500, 500); 36 | let parsed = parser::parse(&data).unwrap(); 37 | assert_eq!(parsed.len(), data.len()); 38 | } 39 | -------------------------------------------------------------------------------- /shell.nix: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2023 Paguroidea Developers 2 | * 3 | * Licensed under the Apache License, Version 2.0 4 | * or the MIT 5 | * license , at your 6 | * option. All files in the project carrying such notice may not be copied, 7 | * modified, or distributed except according to those terms. 8 | */ 9 | 10 | { pkgs ? import {} }: 11 | pkgs.gcc.stdenv.mkDerivation { 12 | name = "paguroidea"; 13 | buildInputs = with pkgs; [ 14 | openssl 15 | pkg-config 16 | cmake 17 | gcc 18 | autoconf 19 | automake 20 | ninja 21 | gnumake 22 | zlib 23 | llvmPackages_latest.clang 24 | llvmPackages_latest.libclang 25 | llvmPackages_latest.libclang.lib 26 | llvmPackages_latest.llvm 27 | llvmPackages_latest.lld 28 | ]; 29 | shellHook = '' 30 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${pkgs.llvmPackages_latest.libclang.lib}/lib 31 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${pkgs.stdenv.cc.cc.lib}/lib 32 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${pkgs.zlib}/lib 33 | ''; 34 | } 35 | -------------------------------------------------------------------------------- /benches/json/json.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | DIGIT = '0'..'9'; 3 | NONZERO = '1'..'9'; 4 | HEX_DIGIT = '0' .. '9' | 'a' .. 'f' | 'A' .. 'F'; 5 | ESCAPED = '"' | '\\' | '/' | 'b' | 'f' | 'n' | 'r' | 't'; 6 | 7 | LBRACKET = '{'; 8 | RBRACKET = '}'; 9 | COMMA = ','; 10 | COLON = ':'; 11 | LSQUARE = '['; 12 | RSQUARE = ']'; 13 | TRUE = "true"; 14 | FALSE = "false"; 15 | NULL = "null"; 16 | STRING = '"' ~ ( !('\\' | '"') | '\\' ~ (ESCAPED | 'u' ~ HEX_DIGIT ~ HEX_DIGIT ~ HEX_DIGIT ~ HEX_DIGIT) )* ~ '"'; 17 | NUMBER = '-'? ~ ('0' | NONZERO ~ DIGIT*) ~ ('.' ~ DIGIT+)? ~ (('e' | 'E') ~ ('+' | '-')? ~ DIGIT+)?; 18 | 19 | skip = ('\n' | '\r' | '\t' | ' ')+; 20 | } 21 | 22 | parser json { 23 | active attribute = 24 | string ~ COLON ~ value; 25 | 26 | active string = STRING; 27 | active number = NUMBER; 28 | active lit_true = TRUE; 29 | active lit_false = FALSE; 30 | active lit_null = NULL; 31 | 32 | active object = 33 | LBRACKET ~ (attribute ~ (COMMA ~ attribute)*)? ~ RBRACKET; 34 | 35 | active array = 36 | LSQUARE ~ (value ~ (COMMA ~ value)*)? 
~ RSQUARE; 37 | 38 | silent value = string | number | array | object | lit_true | lit_false | lit_null; 39 | 40 | active json = value; 41 | } 42 | -------------------------------------------------------------------------------- /pag-parser/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Paguroidea Developers 2 | # 3 | # Licensed under the Apache License, Version 2.0 4 | # or the MIT 5 | # license , at your 6 | # option. All files in the project carrying such notice may not be copied, 7 | # modified, or distributed except according to those terms. 8 | 9 | [package] 10 | name = "pag-parser" 11 | keywords = ["parser", "cfg", "grammar"] 12 | description = "Parser-lexer fusion generator (parser generator)" 13 | documentation = "https://docs.rs/pag-parser/" 14 | 15 | version.workspace = true 16 | edition.workspace = true 17 | license.workspace = true 18 | exclude.workspace = true 19 | categories.workspace = true 20 | repository.workspace = true 21 | rust-version.workspace = true 22 | authors.workspace = true 23 | readme.workspace = true 24 | 25 | [dependencies] 26 | pest = { version = "2.5.7", features = ["std", "memchr"] } 27 | pest_derive = "2.5.7" 28 | smallvec = { version = "1", features = ["union"] } 29 | lazy_static = "1" 30 | pag-lexer = { version = "0.1.0-alpha.1", path = "../pag-lexer" } 31 | typed-arena = "2.0.2" 32 | quote = "1.0.26" 33 | proc-macro2 = "1.0" 34 | ariadne = { version = "0.3", features = ["auto-color"] } 35 | 36 | [dev-dependencies] 37 | strip-ansi-escapes = "0.1.1" 38 | -------------------------------------------------------------------------------- /benches/json/benches/json_logos.lalrpop: -------------------------------------------------------------------------------- 1 | use crate::Token; 2 | use crate::Pvalue; 3 | 4 | grammar<'a>; 5 | 6 | extern { 7 | type Location = usize; 8 | enum Token<'a> { 9 | "true" => Token::True, 10 | "false" => Token::False, 11 | "null" => Token::Null, 12 | "," => Token::Comma, 13 | ":" => Token::Colon, 14 | "{" => Token::LBrace, 15 | "}" => Token::RBrace, 16 | "[" => Token::LBracket, 17 | "]" => Token::RBracket, 18 | "number" => Token::Number(<&'a str>), 19 | "string" => Token::String(<&'a str>), 20 | } 21 | } 22 | 23 | Comma: Vec = { 24 | > "," => { es.push(e); es }, 25 | => vec![e], 26 | } 27 | 28 | pub Json = Value; 29 | 30 | Value: Pvalue<'a> = { 31 | "true" => Pvalue::<'a>::Bool(true), 32 | "false" => Pvalue::<'a>::Bool(false), 33 | "null" => Pvalue::<'a>::Null, 34 | => Pvalue::<'a>::String(&s[1..s.len() - 1]), 35 | => Pvalue::<'a>::Number(n), 36 | => Pvalue::<'a>::Array(a), 37 | => Pvalue::<'a>::Object(o), 38 | } 39 | 40 | Attribute: (&'a str, Pvalue<'a>) = { 41 | ":" => (s, v), 42 | } 43 | 44 | Object: Vec<(&'a str, Pvalue<'a>)> = { 45 | "{" > "}" => attr, 46 | "{" "}" => vec![], 47 | } 48 | 49 | Array: Vec> = { 50 | "[" > "]" => a, 51 | "[" "]" => vec![], 52 | } 53 | -------------------------------------------------------------------------------- /pag-parser/src/type_system/binding_proxy.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
8 | use std::collections::HashSet; 9 | 10 | use crate::{ 11 | core_syntax::{BindingContext, TermPtr}, 12 | utilities::Symbol, 13 | }; 14 | 15 | pub struct BindingProxy<'src, 'a> { 16 | binding: &'a BindingContext<'src, 'a>, 17 | hiding: HashSet<Symbol<'src>>, 18 | } 19 | 20 | impl<'src, 'a> BindingProxy<'src, 'a> { 21 | pub fn proxy(binding: &'a BindingContext<'src, 'a>) -> Self { 22 | BindingProxy { 23 | binding, 24 | hiding: HashSet::new(), 25 | } 26 | } 27 | pub fn lookup(&self, sym: &Symbol<'src>) -> Option<TermPtr<'src, 'a>> { 28 | if self.hiding.contains(sym) { 29 | return None; 30 | } 31 | self.binding.get(sym).map(|x| x.term) 32 | } 33 | pub fn with_hiding<F, R>(&mut self, sym: Symbol<'src>, f: F) -> R 34 | where 35 | F: FnOnce(&mut Self) -> R, 36 | { 37 | let hidden_at_this_layer = self.hiding.insert(sym); 38 | let result = f(self); 39 | if hidden_at_this_layer { 40 | self.hiding.remove(&sym); 41 | } 42 | result 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /benches/csv/benches/benchmarks.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use csv::StringRecord; 3 | use pag_csv::{generate_csv, parse}; 4 | 5 | mod pest_csv { 6 | use pest_derive::Parser; 7 | 8 | #[derive(Parser)] 9 | #[grammar = "benches/csv.pest"] 10 | pub struct CSVParser; 11 | } 12 | 13 | #[global_allocator] 14 | static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; 15 | 16 | fn csv_read_all(input: &str) -> Vec<StringRecord> { 17 | let mut records = Vec::new(); 18 | csv::Reader::from_reader(input.as_bytes()) 19 | .into_records() 20 | .for_each(|r| records.push(r.unwrap())); 21 | records 22 | } 23 | 24 | fn criterion_benchmark(c: &mut Criterion) { 25 | let mut g = c.benchmark_group("throughput"); 26 | let data = generate_csv(1000, 20); 27 | g.throughput(criterion::Throughput::Bytes(data.bytes().len() as u64)); 28 | g.bench_function("pag", |b| { 29 | b.iter(|| { 30 | assert_eq!(parse(&data).unwrap().children().len(), 1000); 31 | }) 32 | }); 33 | g.bench_function("csv", |b| { 34 | b.iter(|| { 35 | assert_eq!(csv_read_all(&data).len(), 999); 36 | }) 37 | }); 38 | g.bench_function("pest", |b| { 39 | b.iter(|| { 40 | use pest::Parser; 41 | let pairs = pest_csv::CSVParser::parse(pest_csv::Rule::csv, &data).unwrap(); 42 | assert_eq!(pairs.into_iter().next().unwrap().into_inner().len(), 1000); 43 | }) 44 | }); 45 | g.finish(); 46 | } 47 | 48 | criterion_group!(benches, criterion_benchmark); 49 | criterion_main!(benches); 50 | -------------------------------------------------------------------------------- /pag-compiler/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | //! The compiler of Paguroidea. Designed for build scripts. 9 | use std::path::Path; 10 | 11 | use syn::File; 12 | 13 | /// Compile the grammar file at `input` to the parser source code 14 | /// at `output`. 15 | /// This function is designed to be used in `build.rs`. It will panic and 16 | /// output the reasons if any error occurs.
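///
/// For illustration, a minimal `build.rs` sketch in the same style as this repository's
/// own build scripts (the `grammar.pag` file name here is only a placeholder):
///
/// ```no_run
/// fn main() {
///     // Regenerate the parser whenever the grammar file changes.
///     pag_compiler::compile("grammar.pag", "src/parser.rs");
///     println!("cargo:rerun-if-changed=grammar.pag");
/// }
/// ```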
17 | pub fn compile<I: AsRef<Path>, O: AsRef<Path>>(input: I, output: O) { 18 | use std::io::Write; 19 | let data = std::fs::read_to_string(input.as_ref()).unwrap(); 20 | match pag_parser::generate_parser(&data) { 21 | Ok(tokens) => { 22 | #[cfg(pag_print_tokens)] 23 | println!("{tokens}"); 24 | let tree: File = syn::parse2(tokens).unwrap(); 25 | let prettified = prettyplease::unparse(&tree); 26 | let mut file = std::fs::File::create(output.as_ref()).unwrap(); 27 | write!( 28 | file, 29 | "// This file is @generated by Paguroidea.\n\n{}", 30 | prettified 31 | ) 32 | .unwrap(); 33 | file.flush().unwrap(); 34 | } 35 | Err(errs) => { 36 | errs.report_stderr(&format!("{}", input.as_ref().display()), &data) 37 | .unwrap(); 38 | panic!("failed to compile parser") 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /tests/tokenizer/src/length_differential.rs: -------------------------------------------------------------------------------- 1 | use crate::generated::length_differential::Tag; 2 | #[allow(unused_imports)] 3 | use rand::{Rng, RngCore}; 4 | 5 | #[allow(dead_code)] 6 | fn random_generate<G: RngCore>(gen: &mut G, length: usize) -> (Vec<Tag>, String) { 7 | let mut buffer = String::new(); 8 | let mut tags = Vec::new(); 9 | for _ in 0..length { 10 | match gen.next_u64() % 6 { 11 | 0 => { 12 | buffer.push_str("a "); 13 | tags.push(Tag::a); 14 | } 15 | 1 => { 16 | buffer.push_str("aa "); 17 | tags.push(Tag::aa); 18 | } 19 | 2 => { 20 | buffer.push_str("aaa "); 21 | tags.push(Tag::aaa); 22 | } 23 | 3 => { 24 | buffer.push_str("aaaa "); 25 | tags.push(Tag::aaaa); 26 | } 27 | 4 => { 28 | buffer.push_str("aaaaa "); 29 | tags.push(Tag::aaaaa); 30 | } 31 | _ => { 32 | buffer.push_str("a".repeat(6 + gen.next_u64() as usize % 128).as_str()); 33 | buffer.push(' '); 34 | tags.push(Tag::more); 35 | } 36 | } 37 | } 38 | (tags, buffer) 39 | } 40 | 41 | #[test] 42 | fn random_length_differential_test() { 43 | let mut gen = rand::thread_rng(); 44 | for _ in 0..1000 { 45 | let length = gen.next_u64() as usize % 1000 + 100; 46 | let (tags, buffer) = random_generate(&mut gen, length); 47 | let trimmed = buffer.trim(); 48 | let tree = crate::generated::length_differential::parse(trimmed).unwrap(); 49 | assert_eq!(tree.len(), trimmed.len()); 50 | let tokens = tree.children().iter().map(|x| x.tag()).collect::<Vec<_>>(); 51 | assert_eq!(tokens, tags); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /pag-parser/src/core_syntax.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms.
8 | 9 | use std::collections::HashMap; 10 | use std::fmt::Display; 11 | 12 | use typed_arena::Arena; 13 | 14 | use crate::frontend::WithSpan; 15 | use crate::utilities::Symbol; 16 | 17 | #[derive(Debug, Clone)] 18 | pub enum Term<'src, 'arena> { 19 | Epsilon, 20 | Sequence(TermPtr<'src, 'arena>, TermPtr<'src, 'arena>), 21 | LexerRef(Symbol<'src>), 22 | Bottom, 23 | Alternative(TermPtr<'src, 'arena>, TermPtr<'src, 'arena>), 24 | Fix(Symbol<'src>, TermPtr<'src, 'arena>), 25 | ParserRef(Symbol<'src>), 26 | } 27 | 28 | pub type TermPtr<'src, 'arena> = &'arena WithSpan<'src, Term<'src, 'arena>>; 29 | pub type TermArena<'src, 'arena> = Arena<WithSpan<'src, Term<'src, 'arena>>>; 30 | 31 | pub struct ParserRule<'src, 'arena> { 32 | pub active: bool, 33 | pub term: TermPtr<'src, 'arena>, 34 | } 35 | 36 | pub type BindingContext<'src, 'arena> = HashMap<Symbol<'src>, ParserRule<'src, 'arena>>; 37 | 38 | impl Display for Term<'_, '_> { 39 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 40 | match self { 41 | Term::Epsilon => write!(f, "ε"), 42 | Term::Sequence(x, y) => write!(f, "({x} ~ {y})"), 43 | Term::LexerRef(x) => write!(f, "{x}"), 44 | Term::Bottom => write!(f, "⊥"), 45 | Term::Alternative(x, y) => write!(f, "({x} | {y})"), 46 | Term::Fix(x, y) => write!(f, "(μ {x} . {y})"), 47 | Term::ParserRef(x) => write!(f, "{x}"), 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /pag-parser/src/type_system/context.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use super::type_check::Type; 10 | use crate::utilities::Symbol; 11 | use std::borrow::Cow; 12 | use std::collections::HashMap; 13 | 14 | pub(super) struct TypeContext<'src> { 15 | guarded: bool, 16 | gamma: HashMap<Symbol<'src>, Type<'src>>, 17 | } 18 | 19 | impl<'src> TypeContext<'src> { 20 | pub fn new() -> Self { 21 | Self { 22 | guarded: false, 23 | gamma: HashMap::new(), 24 | } 25 | } 26 | pub fn lookup(&self, sym: Symbol<'src>) -> Option<Cow<'_, Type<'src>>> { 27 | let target = self.gamma.get(&sym)?; 28 | Some(if self.guarded { 29 | Cow::Owned(Type { 30 | guarded: true, 31 | ..target.clone() 32 | }) 33 | } else { 34 | Cow::Borrowed(target) 35 | }) 36 | } 37 | pub fn guarded<F, R>(&mut self, f: F) -> R 38 | where 39 | F: FnOnce(&mut Self) -> R, 40 | { 41 | let backup = self.guarded; 42 | self.guarded = true; 43 | let result = f(self); 44 | self.guarded = backup; 45 | result 46 | } 47 | pub fn with<F, R>(&mut self, sym: Symbol<'src>, r#type: Type<'src>, f: F) -> R 48 | where 49 | F: FnOnce(&mut Self) -> R, 50 | { 51 | let backup = self.gamma.insert(sym, r#type); 52 | let result = f(self); 53 | if let Some(backup) = backup { 54 | self.gamma.insert(sym, backup); 55 | } else { 56 | self.gamma.remove(&sym); 57 | } 58 | result 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pag-lexer/src/derivative.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms.
8 | 9 | use crate::{normalization::normalize, regex_tree::RegexTree}; 10 | use smallvec::smallvec; 11 | use std::rc::Rc; 12 | 13 | pub fn derivative(tree: Rc, x: u8) -> Rc { 14 | use RegexTree::*; 15 | match tree.as_ref() { 16 | Set(set) => { 17 | if set.contains(x) { 18 | RegexTree::epsilon() 19 | } else { 20 | RegexTree::bottom() 21 | } 22 | } 23 | Concat(children) => { 24 | let head = children[0].clone(); 25 | let tail = normalize(Rc::new(Concat(children[1..].iter().cloned().collect()))); 26 | let lhs = Rc::new(Concat(smallvec![derivative(head.clone(), x), tail.clone()])); 27 | if head.is_nullable() { 28 | Rc::new(Union(smallvec![lhs, derivative(tail, x)])) 29 | } else { 30 | lhs 31 | } 32 | } 33 | KleeneClosure(r) => Rc::new(Concat(smallvec![derivative(r.clone(), x), tree.clone()])), 34 | Union(children) => Rc::new(Union( 35 | children 36 | .iter() 37 | .map(|tree| derivative(tree.clone(), x)) 38 | .collect(), 39 | )), 40 | Intersection(children) => Rc::new(Intersection( 41 | children 42 | .iter() 43 | .map(|tree| derivative(tree.clone(), x)) 44 | .collect(), 45 | )), 46 | Complement(r) => Rc::new(Complement(derivative(r.clone(), x))), 47 | Bottom | Epsilon => RegexTree::bottom(), 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /benches/json/benches/lalr_def.rs: -------------------------------------------------------------------------------- 1 | use logos::Logos; 2 | use std::fmt; 3 | 4 | #[derive(Logos, Debug, PartialEq, Copy, Clone)] 5 | #[logos(skip r"[ \r\n\t]+")] 6 | pub enum Token<'a> { 7 | #[token("true")] 8 | True, 9 | 10 | #[token("false")] 11 | False, 12 | 13 | #[token("null")] 14 | Null, 15 | 16 | #[token(":")] 17 | Colon, 18 | 19 | #[token(",")] 20 | Comma, 21 | 22 | #[token("{")] 23 | LBrace, 24 | 25 | #[token("}")] 26 | RBrace, 27 | 28 | #[token("[")] 29 | LBracket, 30 | 31 | #[token("]")] 32 | RBracket, 33 | 34 | #[regex(r"-?(0|[1-9][0-9]*)((\.[0-9]+)?)([eE][+-]?[0-9]+)?")] 35 | Number(&'a str), 36 | 37 | #[regex(r#""([^\\"]|\\(["\\/bfnrt]|u[0-9a-fA-F]{4}))*""#)] 38 | String(&'a str), 39 | } 40 | 41 | impl<'a> Token<'a> { 42 | pub fn lalrpop_lexer( 43 | source: &'a str, 44 | ) -> impl Iterator, usize), &'static str>> { 45 | Self::lexer(source) 46 | .spanned() 47 | .map(|(t, r)| Ok((r.start, t.unwrap(), r.end))) 48 | } 49 | } 50 | 51 | impl<'a> fmt::Display for Token<'a> { 52 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 53 | write!(f, "{self:?}") 54 | } 55 | } 56 | 57 | #[derive(Debug, Clone, PartialEq)] 58 | pub enum Pvalue<'a> { 59 | Number(&'a str), 60 | String(&'a str), 61 | Object(Vec<(&'a str, Pvalue<'a>)>), 62 | Bool(bool), 63 | Null, 64 | Array(Vec>), 65 | } 66 | 67 | impl<'a> fmt::Display for Pvalue<'a> { 68 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 69 | match self { 70 | Pvalue::Number(number) => write!(f, "{number}"), 71 | Pvalue::String(string) => write!(f, "\"{string}\""), 72 | Pvalue::Object(object) => { 73 | let iter = object.iter().map(|(k, v)| format!("\"{k}\": {v}")); 74 | write!(f, "{{{}}}", iter.collect::>().join(", ")) 75 | } 76 | Pvalue::Bool(flag) => write!(f, "{flag}"), 77 | Pvalue::Null => write!(f, "null"), 78 | Pvalue::Array(array) => { 79 | let iter = array.iter().map(|v| v.to_string()); 80 | write!(f, "[{}]", iter.collect::>().join(", ")) 81 | } 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /tests/tokenizer/src/common_prefix.rs: 
-------------------------------------------------------------------------------- 1 | use crate::generated::common_prefix::Tag; 2 | 3 | #[allow(unused_imports)] 4 | use rand::{Rng, RngCore}; 5 | 6 | #[allow(dead_code)] 7 | const TABLE: &[(Tag, &str)] = &[ 8 | (Tag::a, "A"), 9 | (Tag::ab, "AB"), 10 | (Tag::abc, "ABC"), 11 | (Tag::abcd, "ABCD"), 12 | (Tag::abcde, "ABCDE"), 13 | (Tag::abcdef, "ABCDEF"), 14 | (Tag::abcdefg, "ABCDEFG"), 15 | (Tag::abcdefgh, "ABCDEFGH"), 16 | (Tag::abcdefghi, "ABCDEFGHI"), 17 | (Tag::abcdefghij, "ABCDEFGHIJ"), 18 | (Tag::abcdefghijk, "ABCDEFGHIJK"), 19 | (Tag::abcdefghijkl, "ABCDEFGHIJKL"), 20 | (Tag::abcdefghijklm, "ABCDEFGHIJKLM"), 21 | (Tag::abcdefghijklmn, "ABCDEFGHIJKLMN"), 22 | (Tag::abcdefghijklmno, "ABCDEFGHIJKLMNO"), 23 | (Tag::abcdefghijklmnop, "ABCDEFGHIJKLMNOP"), 24 | (Tag::abcdefghijklmnopq, "ABCDEFGHIJKLMNOPQ"), 25 | (Tag::abcdefghijklmnopqr, "ABCDEFGHIJKLMNOPQR"), 26 | (Tag::abcdefghijklmnopqrs, "ABCDEFGHIJKLMNOPQRS"), 27 | (Tag::abcdefghijklmnopqrst, "ABCDEFGHIJKLMNOPQRST"), 28 | (Tag::abcdefghijklmnopqrstu, "ABCDEFGHIJKLMNOPQRSTU"), 29 | (Tag::abcdefghijklmnopqrstuv, "ABCDEFGHIJKLMNOPQRSTUV"), 30 | (Tag::abcdefghijklmnopqrstuvw, "ABCDEFGHIJKLMNOPQRSTUVW"), 31 | (Tag::abcdefghijklmnopqrstuvwx, "ABCDEFGHIJKLMNOPQRSTUVWX"), 32 | (Tag::abcdefghijklmnopqrstuvwxy, "ABCDEFGHIJKLMNOPQRSTUVWXY"), 33 | ( 34 | Tag::abcdefghijklmnopqrstuvwxyz, 35 | "ABCDEFGHIJKLMNOPQRSTUVWXYZ", 36 | ), 37 | ]; 38 | 39 | #[allow(dead_code)] 40 | fn random_generate(gen: &mut G, length: usize) -> (Vec, String) { 41 | let mut buffer = String::new(); 42 | let mut tags = Vec::new(); 43 | for _ in 0..length { 44 | let (tag, s) = TABLE[gen.next_u64() as usize % TABLE.len()]; 45 | buffer.push_str(s); 46 | buffer.push(' '); 47 | tags.push(tag); 48 | } 49 | (tags, buffer) 50 | } 51 | 52 | #[test] 53 | fn random_common_prefix_test() { 54 | let mut gen = rand::thread_rng(); 55 | for _ in 0..1000 { 56 | let length = gen.next_u64() as usize % 1000 + 100; 57 | let (tags, buffer) = random_generate(&mut gen, length); 58 | let trimmed = buffer.trim(); 59 | let tree = crate::generated::common_prefix::parse(trimmed).unwrap(); 60 | assert_eq!(tree.len(), trimmed.len()); 61 | let tokens = tree.children().iter().map(|x| x.tag()).collect::>(); 62 | assert_eq!(tokens, tags); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tests/tokenizer/src/tail_differential.rs: -------------------------------------------------------------------------------- 1 | use crate::generated::tail_differential::Tag; 2 | #[allow(unused_imports)] 3 | use rand::{Rng, RngCore}; 4 | 5 | #[allow(dead_code)] 6 | fn random_generate(gen: &mut G, length: usize) -> (Vec, String) { 7 | let mut buffer = String::new(); 8 | let mut tags = Vec::new(); 9 | for _ in 0..length { 10 | match gen.next_u64() % 4 { 11 | 0 => { 12 | //ab(c*d)? 
13 | buffer.push_str("ab"); 14 | if gen.next_u64() % 2 == 0 { 15 | for _ in 0..gen.next_u64() % 129 { 16 | buffer.push('c'); 17 | } 18 | buffer.push('d'); 19 | } 20 | tags.push(Tag::abcd); 21 | } 22 | 1 => { 23 | // abc*e 24 | buffer.push_str("ab"); 25 | for _ in 0..gen.next_u64() % 129 { 26 | buffer.push('c'); 27 | } 28 | buffer.push('e'); 29 | tags.push(Tag::abce); 30 | } 31 | 2 => { 32 | //abc*dd+ 33 | buffer.push_str("ab"); 34 | for _ in 0..gen.next_u64() % 129 { 35 | buffer.push('c'); 36 | } 37 | for _ in 0..gen.next_u64() % 129 + 2 { 38 | buffer.push('d'); 39 | } 40 | tags.push(Tag::abcdm); 41 | } 42 | _ => { 43 | // c+ 44 | for _ in 0..gen.next_u64() % 129 + 1 { 45 | buffer.push('c'); 46 | } 47 | tags.push(Tag::cs); 48 | } 49 | } 50 | } 51 | (tags, buffer) 52 | } 53 | 54 | #[test] 55 | fn random_tail_differential_test() { 56 | let mut gen = rand::thread_rng(); 57 | for _ in 0..1000 { 58 | let length = gen.next_u64() as usize % 1000 + 100; 59 | let (mut tags, buffer) = random_generate(&mut gen, length); 60 | // deduplicate only for cs 61 | tags.dedup_by(|a, b| *a == Tag::cs && *b == Tag::cs); 62 | let trimmed = buffer.trim(); 63 | let tree = crate::generated::tail_differential::parse(trimmed).unwrap(); 64 | assert_eq!(tree.len(), trimmed.len()); 65 | let tokens = tree.children().iter().map(|x| x.tag()).collect::>(); 66 | assert_eq!(tokens, tags); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /tests/tokenizer/src/comment_and_string.rs: -------------------------------------------------------------------------------- 1 | use crate::generated::comment_and_string::Tag; 2 | use rand::distributions::Uniform; 3 | 4 | #[allow(unused_imports)] 5 | use rand::{Rng, RngCore}; 6 | 7 | #[allow(dead_code)] 8 | fn generate_random_string(gen: &mut G, length: usize, buffer: &mut String) { 9 | buffer.push('"'); 10 | let dist = Uniform::::new_inclusive(u8::MIN, u8::MAX); 11 | for _ in 0..length { 12 | let target = gen.sample(dist); 13 | buffer.push(target as char); 14 | if target == b'"' { 15 | buffer.push('"'); 16 | } 17 | } 18 | buffer.push('"'); 19 | } 20 | 21 | #[allow(dead_code)] 22 | fn generate_random_comment(gen: &mut G, length: usize, buffer: &mut String) { 23 | buffer.push_str("/*"); 24 | let dist = Uniform::::new_inclusive(u8::MIN, u8::MAX); 25 | let mut last_is_star = false; 26 | for _ in 0..length { 27 | let mut target = gen.sample(dist); 28 | while last_is_star && target == b'/' { 29 | target = gen.sample(dist); 30 | } 31 | last_is_star = target == b'*'; 32 | buffer.push(target as char); 33 | } 34 | buffer.push_str("*/"); 35 | } 36 | 37 | #[allow(dead_code)] 38 | fn random_generate(gen: &mut G, length: usize) -> (Vec, String) { 39 | let mut buffer = String::new(); 40 | let mut tags = Vec::new(); 41 | for _ in 0..length { 42 | let inner_length = gen.next_u64() as usize % 64 + 1; 43 | if gen.next_u64() % 2 == 0 { 44 | generate_random_comment(gen, inner_length, &mut buffer); 45 | buffer.push('\n'); 46 | tags.push(Tag::comment); 47 | } else { 48 | generate_random_string(gen, inner_length, &mut buffer); 49 | buffer.push('\n'); 50 | tags.push(Tag::string); 51 | } 52 | } 53 | (tags, buffer) 54 | } 55 | 56 | #[test] 57 | fn random_comment_and_string_test() { 58 | let mut gen = rand::thread_rng(); 59 | for _ in 0..100 { 60 | let length = gen.next_u64() as usize % 64 + 1; 61 | let (tags, buffer) = random_generate(&mut gen, length); 62 | let trimmed = buffer.trim(); 63 | let tree = crate::generated::comment_and_string::parse(trimmed).unwrap(); 64 | 
assert_eq!(tree.len(), trimmed.len(), "{}", buffer.escape_default()); 65 | let tokens = tree.children().iter().map(|x| x.tag()).collect::<Vec<_>>(); 66 | assert_eq!(tokens, tags, "{buffer}"); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /pag-parser/src/utilities.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | #[derive(Debug, Clone, Copy, PartialOrd, Ord)] 10 | pub struct Symbol<'a>(&'a str); 11 | 12 | impl<'a> Symbol<'a> { 13 | pub fn new(data: &'a str) -> Self { 14 | Self(data) 15 | } 16 | 17 | pub fn name(&self) -> &'a str { 18 | self.0 19 | } 20 | } 21 | 22 | impl<'a> std::hash::Hash for Symbol<'a> { 23 | fn hash<H: std::hash::Hasher>(&self, state: &mut H) { 24 | self.0.as_ptr().hash(state); 25 | self.0.len().hash(state); 26 | } 27 | } 28 | 29 | impl<'a, 'b> PartialEq<Symbol<'b>> for Symbol<'a> { 30 | fn eq(&self, other: &Symbol<'b>) -> bool { 31 | self.0.as_ptr() == other.0.as_ptr() && self.0.len() == other.0.len() 32 | } 33 | } 34 | 35 | impl<'a> Eq for Symbol<'a> {} 36 | 37 | fn is_ascii_ident_body(x: &u8) -> bool { 38 | x.is_ascii_alphanumeric() || *x == b'_' 39 | } 40 | 41 | fn is_ascii_ident_head(x: &u8) -> bool { 42 | x.is_ascii_alphabetic() || *x == b'_' 43 | } 44 | 45 | fn is_ascii_ident(s: &str) -> bool { 46 | let [x, xs@..] = s.as_bytes() else { return false }; 47 | is_ascii_ident_head(x) && xs.iter().all(is_ascii_ident_body) 48 | } 49 | 50 | impl<'a> std::fmt::Display for Symbol<'a> { 51 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 52 | if is_ascii_ident(self.0) { 53 | write!(f, "{}", self.0) 54 | } else { 55 | write!(f, "s{:x}_{}", self.0.as_ptr() as usize, self.0.len()) 56 | } 57 | } 58 | } 59 | 60 | pub fn merge_results<T, U, E>( 61 | a: Result<T, Vec<E>>, 62 | b: Result<T, Vec<E>>, 63 | f: impl FnOnce(T, T) -> U, 64 | ) -> Result<U, Vec<E>> { 65 | match (a, b) { 66 | (Ok(a), Ok(b)) => Ok(f(a, b)), 67 | (Ok(_), Err(b)) => Err(b), 68 | (Err(a), Ok(_)) => Err(a), 69 | (Err(mut a), Err(b)) => { 70 | a.extend(b); 71 | Err(a) 72 | } 73 | } 74 | } 75 | 76 | macro_rules! unreachable_branch { 77 | ($($arg:tt)*) => { 78 | if cfg!(debug_assertions) { 79 | unreachable!($($arg)*) 80 | } else { 81 | unsafe { std::hint::unreachable_unchecked() } 82 | } 83 | }; 84 | } 85 | 86 | pub(crate) use unreachable_branch; -------------------------------------------------------------------------------- /pag-lexer/src/congruence.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms.
8 | 9 | use std::ops::ControlFlow; 10 | 11 | use crate::intervals; 12 | use crate::intervals::Intervals; 13 | use crate::regex_tree::RegexTree; 14 | 15 | pub fn meet(a: &[Intervals], b: &[Intervals]) -> Vec { 16 | let mut result = Vec::new(); 17 | for x in a { 18 | for y in b { 19 | if let Some(z) = x.intersection(y) { 20 | result.push(z); 21 | } 22 | } 23 | } 24 | result.sort(); 25 | result.dedup(); 26 | result 27 | } 28 | 29 | // TODO: this part can be optimized 30 | pub fn approximate_congruence_class(tree: &RegexTree) -> Vec { 31 | use RegexTree::*; 32 | match tree { 33 | Epsilon | Bottom => vec![intervals!((0, u8::MAX))], 34 | Set(x) => { 35 | let x = x.clone(); 36 | match x.complement() { 37 | Some(y) => { 38 | if x < y { 39 | vec![x, y] 40 | } else { 41 | vec![y, x] 42 | } 43 | } 44 | None => vec![x], 45 | } 46 | } 47 | Concat(children) => { 48 | match children[1..] 49 | .iter() 50 | .zip(children.iter().map(|x| x.is_nullable())) 51 | .try_fold( 52 | approximate_congruence_class(&children[0]), 53 | |acc, (tree, prev_nullable)| { 54 | if !prev_nullable { 55 | ControlFlow::Break(acc) 56 | } else { 57 | ControlFlow::Continue(meet(&acc, &approximate_congruence_class(tree))) 58 | } 59 | }, 60 | ) { 61 | ControlFlow::Break(v) | ControlFlow::Continue(v) => v, 62 | } 63 | } 64 | KleeneClosure(r) | Complement(r) => approximate_congruence_class(r), 65 | Union(children) | Intersection(children) => children[1..] 66 | .iter() 67 | .fold(approximate_congruence_class(&children[0]), |acc, x| { 68 | meet(&acc, &approximate_congruence_class(x)) 69 | }), 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /benches/json/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(portable_simd)] 2 | #![feature(core_intrinsics)] 3 | #![feature(array_chunks)] 4 | mod parser; 5 | 6 | pub use parser::parse; 7 | use rand::rngs::StdRng; 8 | use rand::{Rng, SeedableRng}; 9 | use serde_json::Value; 10 | 11 | fn generate_json_value(depth: usize, gen: &mut G) -> Value { 12 | if depth == 0 { 13 | match gen.gen_range(0..4) { 14 | 0 => Value::Null, 15 | 1 => Value::Bool(gen.gen()), 16 | 2 => Value::Number(serde_json::Number::from_f64(gen.gen()).unwrap()), 17 | _ => Value::String(gen.gen::().to_string()), 18 | } 19 | } else { 20 | match gen.gen_range(0..7) { 21 | 0 => Value::Null, 22 | 1 => Value::Bool(gen.gen()), 23 | 2 => Value::String(gen.gen::().to_string()), 24 | 3 | 4 => { 25 | let mut array = Vec::new(); 26 | for _ in 0..gen.gen_range(0..10) { 27 | array.push(generate_json_value(depth - 1, gen)); 28 | } 29 | Value::Array(array) 30 | } 31 | _ => { 32 | let mut object = serde_json::Map::new(); 33 | for _ in 0..gen.gen_range(0..10) { 34 | object.insert( 35 | gen.gen::().to_string(), 36 | generate_json_value(depth - 1, gen), 37 | ); 38 | } 39 | Value::Object(object) 40 | } 41 | } 42 | } 43 | } 44 | 45 | pub fn generate_random_json(depth: usize) -> String { 46 | let mut random = std::env::var("PAG_RANDOM_SEED") 47 | .ok() 48 | .and_then(|x| x.parse().ok()) 49 | .map_or_else(StdRng::from_entropy, StdRng::seed_from_u64); 50 | let mut buffer = Vec::new(); 51 | let value = generate_json_value(depth, &mut random); 52 | serde_json::to_writer(&mut buffer, &value).unwrap(); 53 | unsafe { String::from_utf8_unchecked(buffer) } 54 | } 55 | 56 | #[cfg(test)] 57 | mod test { 58 | use super::*; 59 | 60 | #[test] 61 | fn test_json() { 62 | let json = r#"{ "hello": { "values": [{}, [], [1, 1e3, -0.5, 9.99]] }, "age" : 13 }"#; 63 | let 
tree = parser::parse(json).unwrap(); 64 | println!("{:#?}", tree); 65 | } 66 | #[test] 67 | fn test_random() { 68 | for _ in 0..10 { 69 | let json = generate_random_json(10); 70 | let parsed = parser::parse(&json).unwrap(); 71 | assert_eq!(json.len(), parsed.len()) 72 | } 73 | } 74 | 75 | #[test] 76 | fn test_twitter() { 77 | let json = include_str!("../benches/twitter.json"); 78 | let parsed = parser::parse(json).unwrap(); 79 | assert_eq!(json.len(), parsed.len()) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /tests/arith-expr/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(portable_simd)] 2 | #![feature(core_intrinsics)] 3 | #![feature(array_chunks)] 4 | use std::num::Wrapping; 5 | 6 | mod parser; 7 | 8 | #[allow(dead_code)] 9 | fn eval(tree: &parser::ParserTree) -> Wrapping { 10 | match tree.tag() { 11 | parser::Tag::expr => tree.children()[..].iter().map(eval).sum(), 12 | parser::Tag::mult => tree.children()[..].iter().map(eval).product(), 13 | parser::Tag::int => Wrapping(tree.as_slice().parse::().unwrap()), 14 | parser::Tag::special => { 15 | assert_eq!(tree.as_slice().chars().count(), 1); 16 | Wrapping(tree.as_slice().chars().next().unwrap() as usize) 17 | } 18 | } 19 | } 20 | 21 | #[allow(dead_code)] 22 | fn generate_random_expr(rng: &mut G, depth: usize) -> (Wrapping, String) { 23 | if depth == 0 { 24 | let x = rng.gen_range(0..100); 25 | return (Wrapping(x), format!("{}", x)); 26 | } 27 | match rng.gen_range(0..4) { 28 | 0 => { 29 | let x = rng.gen_range(0..100); 30 | (Wrapping(x), format!("{}", x)) 31 | } 32 | 1 => { 33 | let x = rng.gen_range(0xFF..=0xD7FF); 34 | ( 35 | Wrapping(x), 36 | format!("{}", char::from_u32(x as u32).unwrap()), 37 | ) 38 | } 39 | 2 => { 40 | let (a, s1) = generate_random_expr(rng, depth - 1); 41 | let (b, s2) = generate_random_expr(rng, depth - 1); 42 | (a + b, format!("({} + {})", s1, s2)) 43 | } 44 | _ => { 45 | let (a, s1) = generate_random_expr(rng, depth - 1); 46 | let (b, s2) = generate_random_expr(rng, depth - 1); 47 | (a * b, format!("({} * {})", s1, s2)) 48 | } 49 | } 50 | } 51 | 52 | #[test] 53 | fn simple_test() { 54 | let expr = "55 * (14 + 15) + 66 * 13"; 55 | let tree = parser::parse(expr).unwrap(); 56 | assert_eq!(eval(&tree), Wrapping(55 * (14 + 15) + 66 * 13)); 57 | // (8 * 1 + 3) * 6 + ((37 + 7) * 2) 58 | let expr = "(8 * 1 + 3) * 6 + ((37 + 7) * 2)"; 59 | let tree = parser::parse(expr).unwrap(); 60 | assert_eq!(eval(&tree), Wrapping((8 + 3) * 6 + ((37 + 7) * 2))); 61 | 62 | // ((((8 + 13) + 3) * 6) * ((3 + 7) * 22)) * 91 63 | let expr = "((((8 + 13) + 3) * 6) * ((3 + 7) * 22)) * 91 + 1 + 2 + 3"; 64 | let tree = parser::parse(expr).unwrap(); 65 | assert_eq!( 66 | eval(&tree), 67 | Wrapping(((((8 + 13) + 3) * 6) * ((3 + 7) * 22)) * 91 + 1 + 2 + 3) 68 | ); 69 | } 70 | #[test] 71 | fn random_test() { 72 | for _ in 0..1000 { 73 | let (value, expr) = generate_random_expr(&mut rand::thread_rng(), 15); 74 | let tree = parser::parse(&expr).unwrap(); 75 | assert_eq!(eval(&tree), value); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /tests/tokenizer/build.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | use tempfile::NamedTempFile; 3 | 4 | fn generate_tokenizer(name: &str, rules: I, skip: Option<&str>) 5 | where 6 | I: AsRef<[(A, B)]>, 7 | A: AsRef, 8 | B: AsRef, 9 | { 10 | let mut file = 
NamedTempFile::new().unwrap(); 11 | writeln!(file.as_file(), "lexer {{").unwrap(); 12 | for (name, rule) in rules.as_ref() { 13 | writeln!(file.as_file(), "{} = {};", name.as_ref(), rule.as_ref()).unwrap() 14 | } 15 | if let Some(skip) = skip { 16 | writeln!(file.as_file(), "skip = {skip};").unwrap() 17 | } 18 | writeln!(file.as_file(), "}}").unwrap(); 19 | writeln!(file.as_file(), "parser tokens {{").unwrap(); 20 | for (name, _) in rules.as_ref() { 21 | let lowercase = name.as_ref().to_lowercase(); 22 | writeln!(file.as_file(), "active {lowercase} = {};", name.as_ref()).unwrap() 23 | } 24 | writeln!( 25 | file.as_file(), 26 | "active tokens = ({})*;", 27 | rules 28 | .as_ref() 29 | .iter() 30 | .map(|(name, _)| name.as_ref().to_lowercase()) 31 | .collect::>() 32 | .join("|") 33 | ) 34 | .unwrap(); 35 | writeln!(file.as_file(), "}}").unwrap(); 36 | file.as_file_mut().flush().unwrap(); 37 | pag_compiler::compile(file.path(), format!("src/generated/{name}.rs")); 38 | } 39 | 40 | fn main() { 41 | std::fs::create_dir_all("src/generated").unwrap(); 42 | generate_tokenizer( 43 | "length_differential", 44 | [ 45 | ("A", r#"'a'"#), 46 | ("AA", r#""aa""#), 47 | ("AAA", r#""aaa""#), 48 | ("AAAA", r#""aaaa""#), 49 | ("AAAAA", r#""aaaaa""#), 50 | ("MORE", r"AAAAA~ 'a'+"), 51 | ], 52 | Some(r"'\n' | '\r' | '\t' | ' '"), 53 | ); 54 | generate_tokenizer( 55 | "common_prefix", 56 | { 57 | let mut rules = Vec::new(); 58 | let mut current = String::new(); 59 | for i in 'A'..='Z' { 60 | current.push(i); 61 | rules.push((current.clone(), format!("{:?}", current))); 62 | } 63 | rules 64 | }, 65 | Some(r"'\n' | '\r' | '\t' | ' '"), 66 | ); 67 | generate_tokenizer( 68 | "comment_and_string", 69 | [ 70 | ("STRING", r#"'\"' ~ ( (!'\"') | '"' ~ '"')* ~ '\"'"#), 71 | ("COMMENT", r#""/*" ~ !(.* ~ "*/" ~ .*) ~ "*/""#), 72 | ], 73 | Some(r"'\n' | '\r' | '\t' | ' '"), 74 | ); 75 | generate_tokenizer( 76 | "tail_differential", 77 | [ 78 | ("ABCD", r#"'a' ~ 'b' ~ ('c'* ~ 'd')?"#), 79 | ("ABCE", r#"'a' ~ 'b' ~ ('c'* ~ 'e')"#), 80 | ("ABCDM", r#"'a' ~ 'b' ~ 'c'* ~ 'd' ~ 'd'+"#), 81 | ("CS", r#"'c'+"#), 82 | ], 83 | None, 84 | ); 85 | } 86 | -------------------------------------------------------------------------------- /benches/json/benches/benchmarks.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use lalrpop_util::lalrpop_mod; 3 | use pag_json::{generate_random_json, parse}; 4 | use pest::Parser; 5 | use pest_json::Rule; 6 | use serde_json::Value; 7 | 8 | mod lalr_def; 9 | pub use lalr_def::{Pvalue, Token}; 10 | 11 | #[global_allocator] 12 | static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; 13 | 14 | lalrpop_mod!(lalrpop_json, "/benches/json.rs"); 15 | lalrpop_mod!(lalrpop_logos_json, "/benches/json_logos.rs"); 16 | 17 | mod pest_json { 18 | use pest_derive::Parser; 19 | 20 | #[derive(Parser)] 21 | #[grammar = "benches/json.pest"] 22 | pub struct JSONParser; 23 | } 24 | 25 | fn criterion_benchmark(c: &mut Criterion) { 26 | let mut g = c.benchmark_group("random-json"); 27 | let data = generate_random_json(10); 28 | g.throughput(criterion::Throughput::Bytes(data.bytes().len() as u64)); 29 | g.bench_function("pag", |b| { 30 | b.iter(|| { 31 | parse(&data).unwrap(); 32 | }) 33 | }); 34 | g.bench_function("serde", |b| { 35 | b.iter(|| { 36 | serde_json::from_str::(&data).unwrap(); 37 | }) 38 | }); 39 | g.bench_function("pest", |b| { 40 | b.iter(|| { 41 | pest_json::JSONParser::parse(Rule::json, &data).unwrap(); 
42 | }) 43 | }); 44 | g.bench_function("lalrpop", |b| { 45 | b.iter(|| { 46 | lalrpop_json::JsonParser::new().parse(&data).unwrap(); 47 | }) 48 | }); 49 | g.bench_function("lalrpop+logos", |b| { 50 | b.iter(|| { 51 | let lexer = Token::lalrpop_lexer(&data); 52 | lalrpop_logos_json::JsonParser::new().parse(lexer).unwrap(); 53 | }) 54 | }); 55 | g.finish(); 56 | 57 | let mut g = c.benchmark_group("twitter-json"); 58 | let data = include_str!("twitter.json"); 59 | g.throughput(criterion::Throughput::Bytes(data.bytes().len() as u64)); 60 | g.bench_function("pag", |b| { 61 | b.iter(|| { 62 | parse(data).unwrap(); 63 | }) 64 | }); 65 | g.bench_function("serde", |b| { 66 | b.iter(|| { 67 | serde_json::from_str::(data).unwrap(); 68 | }) 69 | }); 70 | g.bench_function("pest", |b| { 71 | b.iter(|| { 72 | pest_json::JSONParser::parse(Rule::json, data).unwrap(); 73 | }) 74 | }); 75 | g.bench_function("lalrpop", |b| { 76 | b.iter(|| { 77 | lalrpop_json::JsonParser::new().parse(data).unwrap(); 78 | }) 79 | }); 80 | g.bench_function("lalrpop+logos", |b| { 81 | b.iter(|| { 82 | let lexer = Token::lalrpop_lexer(data); 83 | lalrpop_logos_json::JsonParser::new().parse(lexer).unwrap(); 84 | }) 85 | }); 86 | g.finish(); 87 | } 88 | 89 | criterion_group!(benches, criterion_benchmark); 90 | criterion_main!(benches); 91 | -------------------------------------------------------------------------------- /pag-parser/src/frontend/grammar.pest: -------------------------------------------------------------------------------- 1 | any = { "." } 2 | bottom = { "" | "⊥" } 3 | silent = _{ "silent" } 4 | active = _{ "active" } 5 | skip = _{ "skip" } 6 | lexer = _{ "lexer" } 7 | parser = _{ "parser" } 8 | empty = { "_" } 9 | 10 | KEYWORD = { any | empty | bottom | silent | active | skip | lexer | parser } 11 | 12 | /// A newline character. 13 | newline = _{ "\n" | "\r\n" } 14 | /// A whitespace character. 15 | WHITESPACE = _{ " " | "\t" | newline } 16 | /// A single line comment. 17 | line_comment = _{ ("//" ~ !("/" | "!") ~ (!newline ~ ANY)*) } 18 | /// A multi-line comment. 19 | block_comment = _{ "/*" ~ (block_comment | !"*/" ~ ANY)* ~ "*/" } 20 | /// A grammar comment. 21 | COMMENT = _{ block_comment | line_comment } 22 | 23 | inner_chr = @{ escape | ANY } 24 | inner_str = @{ (!("\"" | "\\") ~ ANY)* ~ (escape ~ inner_str)? } 25 | hex_digit = @{ '0'..'9' | 'a'..'f' | 'A'..'F' } 26 | code = @{ "x" ~ hex_digit{2} } 27 | uppercase = @{ 'A'..'Z' } 28 | lowercase = @{ 'a'..'z' } 29 | digit = @{ '0'..'9' } 30 | unicode = @{ "u" ~ "{" ~ hex_digit{2, 6} ~ "}" } 31 | escape = @{ "\\" ~ ("\"" | "\\" | "r" | "n" | "t" | "0" | "'" | code | unicode) } 32 | 33 | character = ${ "'" ~ inner_chr ~ "'" } 34 | string = ${ "\"" ~ inner_str ~ "\"" } 35 | range = { character ~ ".." ~ character } 36 | token_id = ${ !KEYWORD ~ uppercase ~ (uppercase | digit | "_")* } 37 | parser_id = ${ !KEYWORD ~ lowercase ~ (lowercase | digit | "_")* } 38 | 39 | // pratt parser for lexical expressions 40 | lexical_primary = _{ any | bottom | empty | range | character | string | token_id | "(" ~ lexical_expr ~ ")" } 41 | lexical_expr = { lexical_prefix* ~ lexical_primary ~ lexical_postfix* ~ (lexical_infix ~ lexical_prefix* ~ lexical_primary ~ lexical_postfix* )* } 42 | lexical_postfix = _{ lexical_optional | lexical_star | lexical_plus } 43 | lexical_optional = { "?" } 44 | lexical_star = { "*" } 45 | lexical_plus = { "+" } 46 | lexical_not = { "!" 
} 47 | lexical_prefix = _{ lexical_not } 48 | lexical_infix = _{ lexical_sequence | lexical_alternative | lexical_and } 49 | lexical_sequence = { "~" } 50 | lexical_alternative = { "|" } 51 | lexical_and = { "&" } 52 | 53 | // pratt parser for parser expressions 54 | parser_primary = _{ bottom | empty | parser_id | token_id | "(" ~ parser_expr ~ ")" } 55 | parser_expr = { parser_primary ~ parser_postfix* ~ (parser_infix ~ parser_primary ~ parser_postfix* )* } 56 | parser_postfix = _{ parser_optional | parser_star | parser_plus } 57 | parser_optional = { "?" } 58 | parser_star = { "*" } 59 | parser_plus = { "+" } 60 | parser_infix = _{ parser_sequence | parser_alternative } 61 | parser_sequence = { "~" } 62 | parser_alternative = { "|" } 63 | 64 | // lexer definition 65 | lexer_def = { lexer ~ "{" ~ lexer_rules ~ "}" } 66 | lexer_rules = { ((lexical_rule | lexical_skip) ~ ";")+ } 67 | lexical_rule = { token_id ~ "=" ~ lexical_expr } 68 | lexical_skip = { skip ~ "=" ~ lexical_expr } 69 | 70 | // parser definition 71 | parser_def = { parser ~ parser_id ~ "{" ~ parser_rules ~ "}" } 72 | parser_rules = { ((active_parser_rule | silent_parser_rule) ~ ";")+ } 73 | active_parser_rule = { active ~ parser_id ~ "=" ~ parser_expr } 74 | silent_parser_rule = { silent ~ parser_id ~ "=" ~ parser_expr } 75 | 76 | grammar = { SOI ~ lexer_def ~ parser_def ~ EOI } 77 | -------------------------------------------------------------------------------- /tests/sexpr-calculator/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(portable_simd)] 2 | #![feature(core_intrinsics)] 3 | #![feature(array_chunks)] 4 | use std::num::Wrapping; 5 | 6 | mod parser; 7 | 8 | #[allow(dead_code)] 9 | fn eval(tree: &parser::ParserTree) -> Wrapping { 10 | match tree.tag() { 11 | parser::Tag::sexpr => eval(&tree.children()[0]), 12 | parser::Tag::int => Wrapping(tree.as_slice().parse::().unwrap()), 13 | parser::Tag::op => { 14 | unreachable!("op should be handled by sexpr") 15 | } 16 | parser::Tag::compound => match tree.children()[0].as_slice() { 17 | "+" | "加" => tree.children()[1..].iter().map(eval).sum(), 18 | "*" | "乘" => tree.children()[1..].iter().map(eval).product(), 19 | other => unreachable!("only '+' and '*' are supported, found '{other}'"), 20 | }, 21 | } 22 | } 23 | 24 | #[allow(dead_code)] 25 | fn generate_sexpr(mut limit: usize, gen: &mut G) -> (usize, Wrapping, String) { 26 | if limit <= 1 { 27 | let x = Wrapping(gen.next_u64() as usize % 100); 28 | return (1, x, format!("{}", x)); 29 | } 30 | match gen.next_u64() % 20 { 31 | 0 => { 32 | let x = Wrapping(gen.next_u64() as usize % 100); 33 | (1, x, format!("{}", x)) 34 | } 35 | 1..=15 => { 36 | let width = 2 + gen.next_u64() % (limit as u64).min(10); 37 | let mut buffer = if gen.gen_bool(0.5) { 38 | "(+".to_string() 39 | } else { 40 | "(加".to_string() 41 | }; 42 | let mut cnt = 0; 43 | let mut sum = Wrapping(0); 44 | for _ in 0..width { 45 | let (w, v, s) = generate_sexpr(limit, gen); 46 | cnt += w; 47 | limit = limit.saturating_sub(w); 48 | sum += v; 49 | buffer.push_str(&format!(" {}", s)); 50 | } 51 | buffer.push(')'); 52 | (cnt, sum, buffer) 53 | } 54 | _ => { 55 | let width = 2 + gen.next_u64() % (limit as u64).min(10); 56 | let mut buffer = if gen.gen_bool(0.5) { 57 | "(*".to_string() 58 | } else { 59 | "(乘".to_string() 60 | }; 61 | 62 | let mut cnt = 0; 63 | let mut prod = Wrapping(1); 64 | for _ in 0..width { 65 | let (w, v, s) = generate_sexpr(limit, gen); 66 | cnt += w; 67 | limit = 
limit.saturating_sub(w); 68 | prod *= v; 69 | buffer.push_str(&format!(" {}", s)); 70 | } 71 | buffer.push(')'); 72 | (cnt, prod, buffer) 73 | } 74 | } 75 | } 76 | 77 | #[test] 78 | fn simple_test() { 79 | let test = "(加 1 (* 5 55))"; 80 | let tree = parser::parse(test).unwrap(); 81 | assert_eq!(276, eval(&tree).0); 82 | let test = "(+ 1 (# 5 5))"; 83 | let err = parser::parse(test).unwrap_err().to_string(); 84 | assert_eq!(err, "expecting MULT or PLUS for compound at offset 6"); 85 | } 86 | 87 | #[test] 88 | fn randomized_test() { 89 | for _ in 0..1000 { 90 | let (_, value, expr) = generate_sexpr(20, &mut rand::thread_rng()); 91 | 92 | let tree = parser::parse(&expr).unwrap(); 93 | assert_eq!(value, eval(&tree)) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /pag-parser/src/type_system/fixpoint.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | // Modified from Tarjan's strongly connected components algorithm 10 | 11 | use std::cell::Cell; 12 | use std::collections::HashMap; 13 | 14 | use crate::core_syntax::{BindingContext, Term, TermArena, TermPtr}; 15 | use crate::frontend::WithSpan; 16 | use crate::utilities::Symbol; 17 | 18 | type NodeId = u32; 19 | 20 | #[derive(Default)] 21 | struct Node { 22 | neighbors: Vec, 23 | in_stack: Cell, 24 | low: Cell, 25 | dfn: Cell, 26 | in_cycle: Cell, // scc size > 1 or self reference 27 | } 28 | 29 | fn find_neighbors( 30 | term: TermPtr, 31 | neighbors: &mut Vec, 32 | sym_to_id: &HashMap, NodeId>, 33 | ) { 34 | match &term.node { 35 | Term::Sequence(lhs, rhs) | Term::Alternative(lhs, rhs) => { 36 | find_neighbors(lhs, neighbors, sym_to_id); 37 | find_neighbors(rhs, neighbors, sym_to_id); 38 | } 39 | Term::Fix(_, expr) => find_neighbors(expr, neighbors, sym_to_id), 40 | Term::ParserRef(symbol) => { 41 | // unexisted IDs refer to implicit fixpoints 42 | let Some(&id) = sym_to_id.get(symbol) else { return }; 43 | neighbors.push(id); 44 | } 45 | _ => {} 46 | } 47 | } 48 | 49 | fn tarjan(node_id: NodeId, dfn_cnt: &mut u32, stack: &mut Vec, nodes: &Vec) { 50 | let node = &nodes[node_id as usize]; 51 | 52 | *dfn_cnt += 1; 53 | node.low.set(*dfn_cnt); 54 | node.dfn.set(*dfn_cnt); 55 | stack.push(node_id); 56 | node.in_stack.set(true); 57 | 58 | for &next_id in &node.neighbors { 59 | // self reference 60 | if next_id == node_id { 61 | node.in_cycle.set(true); 62 | continue; 63 | } 64 | let next = &nodes[next_id as usize]; 65 | if next.dfn.get() == 0 { 66 | tarjan(next_id, dfn_cnt, stack, nodes); 67 | node.low.set(node.low.get().min(next.low.get())); // u.low = min(u.low, v.low) 68 | } else if next.in_stack.get() { 69 | node.low.set(node.low.get().min(next.dfn.get())); // u.low = min(u.low, v.dfn) 70 | } 71 | } 72 | 73 | if node.low.get() == node.dfn.get() { 74 | // scc size == 1 75 | if stack.last() == Some(&node_id) { 76 | node.in_stack.set(false); 77 | stack.pop(); 78 | return; 79 | } 80 | // scc size > 1 81 | while let Some(top_id) = stack.pop() { 82 | let top = &nodes[top_id as usize]; 83 | top.in_stack.set(false); 84 | top.in_cycle.set(true); 85 | if top_id == node_id { 86 | break; 87 | } 88 | } 89 | } 90 | } 91 | 92 | pub fn infer_fixpoints<'src, 'arena>( 93 | 
entrypoint: Symbol<'src>, 94 | arena: &'arena TermArena<'src, 'arena>, 95 | binding_ctx: &mut BindingContext<'src, 'arena>, 96 | ) { 97 | let mut sym_to_id = HashMap::new(); 98 | let mut id_to_sym = Vec::new(); 99 | for (idx, (symbol, _)) in binding_ctx.iter().enumerate() { 100 | sym_to_id.insert(*symbol, idx as NodeId); 101 | id_to_sym.push(*symbol); 102 | } 103 | 104 | let mut nodes = Vec::new(); 105 | for (_, rule) in binding_ctx.iter() { 106 | let mut neighbors = Vec::new(); 107 | find_neighbors(rule.term, &mut neighbors, &sym_to_id); 108 | nodes.push(Node { 109 | neighbors, 110 | ..Node::default() 111 | }) 112 | } 113 | 114 | let begin = sym_to_id[&entrypoint] as NodeId; 115 | let mut dfn_cnt = 0; 116 | let mut stack = Vec::new(); 117 | tarjan(begin, &mut dfn_cnt, &mut stack, &nodes); 118 | 119 | for (id, node) in nodes.iter().enumerate() { 120 | // unreachable rules 121 | if node.dfn.get() == 0 { 122 | let symbol = id_to_sym[id]; 123 | binding_ctx.remove(&symbol); 124 | continue; 125 | } 126 | // fixpoints 127 | if node.in_cycle.get() { 128 | let symbol = id_to_sym[id]; 129 | let rule = binding_ctx.get_mut(&symbol).unwrap(); 130 | rule.term = arena.alloc(WithSpan { 131 | span: rule.term.span, 132 | node: Term::Fix(symbol, rule.term), 133 | }) 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /pag-lexer/src/regex_tree.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use crate::intervals; 10 | use crate::intervals::Intervals; 11 | use smallvec::SmallVec; 12 | use std::fmt::{Display, Formatter}; 13 | use std::ops::RangeInclusive; 14 | use std::rc::Rc; 15 | 16 | #[derive(Ord, PartialOrd, Eq, PartialEq, Debug, Clone, Hash)] 17 | pub enum RegexTree { 18 | Bottom, // no character 19 | Set(Intervals), 20 | Epsilon, 21 | Concat(SmallVec<[Rc; 2]>), 22 | KleeneClosure(Rc), 23 | Union(SmallVec<[Rc; 2]>), 24 | Intersection(SmallVec<[Rc; 2]>), 25 | Complement(Rc), 26 | } 27 | 28 | use RegexTree::*; 29 | 30 | impl Display for RegexTree { 31 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 32 | match self { 33 | Bottom => write!(f, "⊥"), 34 | Concat(x) | Intersection(x) | Union(x) if x.is_empty() => write!(f, "⊥"), 35 | Set(x) => write!(f, "{x}"), 36 | Epsilon => write!(f, "ε"), 37 | Concat(children) => { 38 | write!(f, "({}", children[0])?; 39 | for i in &children[1..] { 40 | write!(f, " ~ {i}")?; 41 | } 42 | write!(f, ")") 43 | } 44 | KleeneClosure(x) => write!(f, "{x}*"), 45 | Union(children) => { 46 | write!(f, "({}", children[0])?; 47 | for i in &children[1..] { 48 | write!(f, " ∪ {i}")?; 49 | } 50 | write!(f, ")") 51 | } 52 | Intersection(children) => { 53 | write!(f, "({}", children[0])?; 54 | for i in &children[1..] { 55 | write!(f, " ∩ {i}")?; 56 | } 57 | write!(f, ")") 58 | } 59 | Complement(x) => write!(f, "¬{x}"), 60 | } 61 | } 62 | } 63 | 64 | thread_local! 
{ 65 | static EPSILON: Rc = Rc::new(RegexTree::Epsilon); 66 | static BOTTOM: Rc = Rc::new(RegexTree::Bottom); 67 | static TOP: Rc = BOTTOM.with(|x| Rc::new(RegexTree::Complement(x.clone()))); 68 | } 69 | 70 | impl RegexTree { 71 | pub fn epsilon() -> Rc { 72 | EPSILON.with(Rc::clone) 73 | } 74 | pub fn bottom() -> Rc { 75 | BOTTOM.with(Rc::clone) 76 | } 77 | pub fn top() -> Rc { 78 | TOP.with(Rc::clone) 79 | } 80 | pub fn is_byte_sequence(&self) -> bool { 81 | match self { 82 | Set(intervals) => intervals.is_single_byte(), 83 | Concat(children) => children.iter().all(|x| x.is_byte_sequence()), 84 | Epsilon => true, 85 | _ => false, 86 | } 87 | } 88 | pub fn as_byte_sequence(&self) -> Option> { 89 | match self { 90 | Set(intervals) if intervals.is_single_byte() => Some(vec![intervals.representative()]), 91 | Concat(children) => { 92 | let init = if let Some(x) = children.get(0) { 93 | x.as_byte_sequence() 94 | } else { 95 | return Some(Vec::new()); 96 | }; 97 | 98 | children[1..].iter().fold(init, |acc, x| { 99 | acc.and_then(|mut acc| { 100 | Some({ 101 | acc.extend(x.as_byte_sequence()?); 102 | acc 103 | }) 104 | }) 105 | }) 106 | } 107 | Epsilon => Some(Vec::new()), 108 | _ => None, 109 | } 110 | } 111 | pub fn single(x: u8) -> Self { 112 | Set(intervals!((x, x))) 113 | } 114 | pub fn range(x: RangeInclusive) -> Self { 115 | if x.is_empty() { 116 | return Bottom; 117 | } 118 | Set(intervals!((*x.start(), *x.end()))) 119 | } 120 | pub fn is_nullable(&self) -> bool { 121 | match self { 122 | Bottom => false, 123 | Set(_) => false, 124 | Epsilon => true, 125 | Concat(children) | Intersection(children) => children.iter().all(|x| x.is_nullable()), 126 | KleeneClosure(_) => true, 127 | Union(children) => children.iter().any(|x| x.is_nullable()), 128 | Complement(r) => !r.is_nullable(), 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /pag-lexer/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
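// NOTE (added summary, not from the upstream sources): rough data flow through
// this crate, as exercised by the tests below -- a `RegexTree` (regex_tree) is
// simplified by `normalize` (normalization), differentiated byte by byte with
// `derivative` (derivative), and the per-state branching is kept small by
// `approximate_congruence_class` (congruence); `Vector::generate_dfa` (vector)
// then emits the lexer as a Rust token stream, with `LoopOptimizer`
// (lookahead) turning hot self-loops into SIMD or lookup-table skip loops.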
8 | #![feature(portable_simd)] 9 | #![feature(core_intrinsics)] 10 | #![feature(array_chunks)] 11 | 12 | pub mod congruence; 13 | pub mod derivative; 14 | pub mod intervals; 15 | pub mod lookahead; 16 | pub mod normalization; 17 | pub mod regex_tree; 18 | pub mod utilities; 19 | pub mod vector; 20 | 21 | #[cfg(test)] 22 | mod tests { 23 | use crate::congruence::approximate_congruence_class; 24 | use crate::derivative::derivative; 25 | use crate::lookahead::LoopOptimizer; 26 | use crate::normalization::normalize; 27 | use crate::regex_tree::*; 28 | use crate::vector::Vector; 29 | use quote::quote; 30 | use smallvec::smallvec; 31 | use std::rc::Rc; 32 | use RegexTree::*; 33 | 34 | #[test] 35 | fn it_prints_basic() { 36 | let a = Rc::new(RegexTree::single(b'a')); 37 | let b = Rc::new(RegexTree::single(b'b')); 38 | let ab = Rc::new(Concat(smallvec![a, b])); 39 | let alt = Rc::new(Union(smallvec![ab.clone(), ab])); 40 | println!("{}", alt); 41 | let derivative = derivative(alt, b'a'); 42 | println!("{}", derivative); 43 | let normalized = normalize(derivative); 44 | println!("{}", normalized); 45 | println!("{:?}", approximate_congruence_class(&normalized)); 46 | } 47 | 48 | #[test] 49 | fn renormalize_tests() { 50 | // concat 51 | let a = Rc::new(RegexTree::single(b'a')); 52 | let b = Rc::new(RegexTree::single(b'b')); 53 | let concat = Rc::new(Concat(smallvec![a.clone(), b])); 54 | let normalized = normalize(concat.clone()); 55 | assert!(Rc::ptr_eq(&concat, &normalized)); 56 | // kleene closure 57 | let kleene = Rc::new(KleeneClosure(a)); 58 | let normalized = normalize(kleene.clone()); 59 | assert!(Rc::ptr_eq(&kleene, &normalized)); 60 | } 61 | 62 | #[test] 63 | fn beautify_mangle_tests() { 64 | // generate huge test for me 65 | let a = Rc::new(RegexTree::single(b'a')); 66 | let b = Rc::new(RegexTree::single(b'b')); 67 | let c = Rc::new(RegexTree::single(b'c')); 68 | let d = Rc::new(RegexTree::single(b'd')); 69 | let ba = Rc::new(Concat(smallvec![b, a.clone()])); 70 | let a_or_ba = Rc::new(Union(smallvec![a, ba])); 71 | let a_or_ba_or_c = Rc::new(Union(smallvec![a_or_ba, c])); 72 | let a_or_ba_or_c_con_d = 73 | Rc::new(KleeneClosure(Rc::new(Concat(smallvec![a_or_ba_or_c, d])))); 74 | let normalized = normalize(a_or_ba_or_c_con_d); 75 | let congruence = approximate_congruence_class(&normalized); 76 | println!("{:?}", congruence); 77 | let vectorized = Vector::new([normalized]); 78 | let mut optimizer = LoopOptimizer::new(); 79 | println!( 80 | "{}", 81 | vectorized.generate_dfa( 82 | "e!(0), 83 | &mut optimizer, 84 | &[quote!({ 85 | return Some(idx); 86 | })], 87 | "e!({ 88 | return None; 89 | }) 90 | ) 91 | ); 92 | } 93 | 94 | #[test] 95 | fn approximate_congruence_class_test() { 96 | let a = Rc::new(RegexTree::single(b'a')); 97 | let b = Rc::new(RegexTree::single(b'b')); 98 | let c = Rc::new(RegexTree::single(b'c')); 99 | let ba = Rc::new(Concat(smallvec![b, a.clone()])); 100 | let a_or_ba = Rc::new(Union(smallvec![a, ba])); 101 | let a_or_ba_or_c = Rc::new(Union(smallvec![a_or_ba, c])); 102 | println!("normalized: {}", normalize(a_or_ba_or_c.clone())); 103 | let star = Rc::new(KleeneClosure(a_or_ba_or_c.clone())); 104 | let a_or_ba_or_c = Rc::new(Concat(smallvec![a_or_ba_or_c, star])); 105 | println!("{}", a_or_ba_or_c); 106 | let normalized = normalize(a_or_ba_or_c); 107 | println!("normalized: {}", normalized); 108 | let congruence = approximate_congruence_class(&normalized); 109 | println!("{:?}", congruence); 110 | println!(); 111 | let vectorized = Vector::new([normalized]); 112 | let 
mut optimizer = LoopOptimizer::new(); 113 | println!( 114 | "{}", 115 | vectorized.generate_dfa( 116 | "e!(0), 117 | &mut optimizer, 118 | &[quote!({ 119 | return Some(idx); 120 | })], 121 | "e!({ 122 | return None; 123 | }) 124 | ) 125 | ); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /pag-lexer/src/lookahead.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use crate::intervals::{byte_char, Interval, Intervals}; 10 | use crate::vector::{DfaState, DfaTable}; 11 | use proc_macro2::TokenStream; 12 | use quote::quote; 13 | use std::collections::hash_map::Entry; 14 | use std::collections::HashMap; 15 | 16 | enum Kind { 17 | Positive, 18 | Negative, 19 | } 20 | 21 | fn generate_lut_routine(index: usize) -> TokenStream { 22 | let table = index / 8; 23 | let shift = index % 8; 24 | let bit = 1u8 << shift; 25 | quote! { 26 | idx = idx 27 | + input[idx..] 28 | .iter() 29 | .position(|x| GLOBAL_LUT[#table][*x as usize] & #bit > 0) 30 | .unwrap_or(input.len() - idx); 31 | } 32 | } 33 | 34 | fn byte_simd(byte: u8) -> TokenStream { 35 | let byte = byte_char(byte); 36 | quote! { 37 | data.simd_eq(u8x16::splat(#byte)) 38 | } 39 | } 40 | 41 | fn range_simd(min: u8, max: u8) -> TokenStream { 42 | let min = byte_char(min); 43 | let max = byte_char(max); 44 | quote! { 45 | data.simd_ge(u8x16::splat(#min)) & data.simd_le(u8x16::splat(#max)) 46 | } 47 | } 48 | 49 | fn generate_lookahead_routine(intervals: &Intervals, kind: Kind) -> TokenStream { 50 | let count_act = match kind { 51 | Kind::Positive => quote! { trailing_ones }, 52 | Kind::Negative => quote! { trailing_zeros }, 53 | }; 54 | let idx_offset = intervals 55 | .iter() 56 | .map(|&Interval(l, r)| match l == r { 57 | true => byte_simd(l), 58 | false => range_simd(l, r), 59 | }) 60 | .reduce(|acc, x| quote! { #acc | #x }) 61 | .map(|x| { 62 | if cfg!(target_arch = "aarch64") { 63 | quote! {{ 64 | let mask : u128 = unsafe { core::mem::transmute(#x) }; 65 | mask.#count_act() / 8 66 | }} 67 | } else { 68 | quote! { 69 | (#x).to_bitmask().#count_act() 70 | } 71 | } 72 | }); 73 | quote! 
{ 74 | for i in input[idx..].array_chunks::<16>() { 75 | use core::simd::*; 76 | let data = u8x16::from_slice(i); 77 | let idx_offset = #idx_offset; 78 | idx += idx_offset as usize; 79 | if core::intrinsics::unlikely(idx_offset != 16) { 80 | break; 81 | } 82 | } 83 | } 84 | } 85 | 86 | fn estimated_cost(intervals: &Intervals) -> u32 { 87 | intervals 88 | .iter() 89 | .map(|Interval(l, r)| if l == r { 1 } else { 2 }) 90 | .sum() 91 | } 92 | 93 | #[derive(Default)] 94 | pub struct LoopOptimizer { 95 | global_lut: Vec<[u8; 256]>, 96 | assigned: HashMap, 97 | } 98 | 99 | impl LoopOptimizer { 100 | pub fn new() -> Self { 101 | Self { 102 | global_lut: Vec::new(), 103 | assigned: HashMap::new(), 104 | } 105 | } 106 | 107 | fn assign_table(&mut self, negatives: &Intervals) -> usize { 108 | let assigned_table = self.assigned.len(); 109 | match self.assigned.entry(negatives.clone()) { 110 | Entry::Occupied(x) => { 111 | return *x.get(); 112 | } 113 | Entry::Vacant(x) => { 114 | x.insert(assigned_table); 115 | } 116 | }; 117 | let table = assigned_table / 8; 118 | let offset = assigned_table % 8; 119 | if self.global_lut.len() <= table { 120 | self.global_lut.push([0; 256]); 121 | } 122 | for &Interval(l, r) in negatives.iter() { 123 | for i in l..=r { 124 | self.global_lut[table][i as usize] |= 1u8 << offset; 125 | } 126 | } 127 | assigned_table 128 | } 129 | 130 | pub fn generate_lut(&self) -> Option { 131 | if self.assigned.is_empty() { 132 | return None; 133 | } 134 | let table_size = self.global_lut.len(); 135 | let table = self.global_lut.iter().map(|x| quote!([#(#x,)*])); 136 | Some(quote! { 137 | const GLOBAL_LUT : [[u8; 256]; #table_size] = [ #(#table,)* ]; 138 | }) 139 | } 140 | 141 | pub fn generate_lookahead(&mut self, dfa: &DfaTable, state: &DfaState) -> Option { 142 | let limit = 4; 143 | 144 | let positives = direct_self_loops(dfa, state)?; 145 | if estimated_cost(&positives) <= limit { 146 | return Some(generate_lookahead_routine(&positives, Kind::Positive)); 147 | } 148 | 149 | let negatives = positives.complement()?; 150 | if estimated_cost(&negatives) <= limit { 151 | return Some(generate_lookahead_routine(&negatives, Kind::Negative)); 152 | } 153 | 154 | let index = self.assign_table(&negatives); 155 | Some(generate_lut_routine(index)) 156 | } 157 | } 158 | 159 | fn direct_self_loops(dfa: &DfaTable, state: &DfaState) -> Option { 160 | let mut intervals: Option = None; 161 | for (edge, target) in &dfa.get(state)?.transitions { 162 | if target == state { 163 | intervals = Some(intervals.map_or_else(|| edge.clone(), |x| x.union(edge))); 164 | } 165 | } 166 | intervals 167 | } 168 | 169 | #[cfg(test)] 170 | mod test { 171 | use super::*; 172 | 173 | #[test] 174 | fn test_lookahead_codegen() { 175 | use crate::intervals; 176 | let positives = intervals!((b'0', b'9'), (b'0', b'9'), (b'A', b'F')); 177 | syn::parse2::(generate_lookahead_routine(&positives, Kind::Positive)).unwrap(); 178 | syn::parse2::(generate_lookahead_routine(&positives, Kind::Negative)).unwrap(); 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
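// NOTE (added sketch, not from the upstream sources): every case below feeds a
// deliberately ill-formed grammar from this directory through `write_error`
// and compares the trimmed diagnostic lines against the expected report.
// Adding a coverage case amounts to dropping another `.pag` file next to this
// module and writing (file name hypothetical):
//
//     expect_error!(
//         "err_some_new_case.pag",
//         err_some_new_case,
//         r#"
//     Error: ...expected diagnostic text...
//     "#
//     );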
8 | 9 | use crate::tests::write_error; 10 | 11 | macro_rules! expect_error { 12 | ( $path:expr, $name:ident, $expect:expr ) => { 13 | #[test] 14 | fn $name() { 15 | let input = include_str!($path); 16 | let errors = write_error(input, $path); 17 | let lines1 = errors.trim().lines().map(str::trim_end).collect::>(); 18 | let lines2 = $expect 19 | .trim() 20 | .lines() 21 | .map(str::trim_end) 22 | .collect::>(); 23 | assert_eq!(lines1, lines2, "\n\n{errors}"); 24 | } 25 | }; 26 | } 27 | 28 | expect_error!( 29 | "err_nullable_token.pag", 30 | err_nullable_token, 31 | r#" 32 | Error: Nullable token detected 33 | ╭─[err_nullable_token.pag:2:5] 34 | │ 35 | 2 │ A = 'a'*; 36 | │ ┬ 37 | │ ╰── token A is nullable 38 | ───╯ 39 | "# 40 | ); 41 | 42 | expect_error!( 43 | "err_unguarded_fixpoint.pag", 44 | err_unguarded_fixpoint, 45 | r#" 46 | Error: Unguarded fixpoint 47 | ╭─[err_unguarded_fixpoint.pag:6:19] 48 | │ 49 | 6 │ active test = test ~ A; 50 | │ ────┬─── 51 | │ ╰───── fixpoint rule test is not guarded -- your grammar is left-recursive 52 | ───╯ 53 | "# 54 | ); 55 | 56 | expect_error!( 57 | "err_alternation_ambiguity.pag", 58 | err_alternation_ambiguity, 59 | r#" 60 | Error: When type checking an alternation of rules, the following rules are ambiguous 61 | ╭─[err_alternation_ambiguity.pag:6:19] 62 | │ 63 | 6 │ active test = A+ | A ~ test; 64 | │ ─┬ ────┬─── 65 | │ ╰───────────── type info for left-hand side: nullable: false, first set: {A}, follow set: {A} 66 | │ │ 67 | │ ╰───── type info for right-hand side: nullable: false, first set: {A}, follow set: {} 68 | ───╯ 69 | "# 70 | ); 71 | 72 | expect_error!( 73 | "err_sequence_ambiguity.pag", 74 | err_sequence_ambiguity, 75 | r#" 76 | Error: When type checking a sequence of rules, the following rules are ambiguous 77 | ╭─[err_sequence_ambiguity.pag:6:19] 78 | │ 79 | 6 │ active test = A+ ~ A; 80 | │ ─┬ ┬ 81 | │ ╰────── type info for left-hand side: nullable: false, first set: {A}, follow set: {A} 82 | │ │ 83 | │ ╰── type info for right-hand side: nullable: false, first set: {A}, follow set: {} 84 | ───╯ 85 | "# 86 | ); 87 | 88 | expect_error!( 89 | "err_null_sequence_ambiguity.pag", 90 | err_null_sequence_ambiguity, 91 | r#" 92 | Error: When type checking a sequence of rules, the following rules are ambiguous 93 | ╭─[err_null_sequence_ambiguity.pag:6:19] 94 | │ 95 | 6 │ active test = _ ~ A; 96 | │ ┬ ┬ 97 | │ ╰────── type info for left-hand side: nullable: true, first set: {}, follow set: {} 98 | │ │ 99 | │ ╰── type info for right-hand side: nullable: false, first set: {A}, follow set: {} 100 | ───╯ 101 | "# 102 | ); 103 | 104 | expect_error!( 105 | "err_multiple_skips.pag", 106 | err_multiple_skips, 107 | r#" 108 | Error: Skipping lexical rule is already defined 109 | ╭─[err_multiple_skips.pag:3:5] 110 | │ 111 | 2 │ skip = "SKIP"; 112 | │ ──────┬────── 113 | │ ╰──────── first definition 114 | 3 │ skip = "ANOTHER_SKIP"; 115 | │ ──────────┬────────── 116 | │ ╰──────────── second definition 117 | ───╯ 118 | "# 119 | ); 120 | 121 | expect_error!( 122 | "err_cyclic_token.pag", 123 | err_cyclic_token, 124 | r#" 125 | Error: Cyclic lexical rule reference 126 | ╭─[err_cyclic_token.pag:2:15] 127 | │ 128 | 2 │ A = 'a' ~ A; 129 | │ ┬ 130 | │ ╰── this reference causes cyclic dependency 131 | ───╯ 132 | "# 133 | ); 134 | 135 | expect_error!( 136 | "err_undefined_token_in_lexer.pag", 137 | err_undefined_token_in_lexer, 138 | r#" 139 | Error: Undefined lexical rule reference 140 | ╭─[err_undefined_token_in_lexer.pag:2:13] 141 | │ 142 | 2 │ A = C; 143 | │ ┬ 144 | 
│ ╰── lexcical rule C is undefined 145 | ───╯ 146 | "# 147 | ); 148 | 149 | expect_error!( 150 | "err_undefined_token_in_parser.pag", 151 | err_undefined_token_in_parser, 152 | r#" 153 | Error: Undefined lexical rule reference 154 | ╭─[err_undefined_token_in_parser.pag:6:19] 155 | │ 156 | 6 │ active test = AA; 157 | │ ─┬ 158 | │ ╰── lexcical rule AA is undefined 159 | ───╯ 160 | "# 161 | ); 162 | 163 | expect_error!( 164 | "err_undefined_grammar_rule.pag", 165 | err_undefined_grammar_rule, 166 | r#" 167 | Error: Undefined parser rule reference 168 | ╭─[err_undefined_grammar_rule.pag:6:19] 169 | │ 170 | 6 │ active test = test2; 171 | │ ──┬── 172 | │ ╰──── parser rule test2 is undefined 173 | ───╯ 174 | "# 175 | ); 176 | 177 | expect_error!( 178 | "err_multiple_definitions_in_lexer.pag", 179 | err_multiple_definitions_in_lexer, 180 | r#" 181 | Error: Multiple definition of A 182 | ╭─[err_multiple_definitions_in_lexer.pag:3:5] 183 | │ 184 | 2 │ A = '0'; 185 | │ ┬ 186 | │ ╰── first definition 187 | 3 │ A = '1'; 188 | │ ┬ 189 | │ ╰── second definition 190 | ───╯ 191 | "# 192 | ); 193 | 194 | expect_error!( 195 | "err_multiple_definitions_in_parser.pag", 196 | err_multiple_definitions_in_parser, 197 | r#" 198 | Error: Multiple definition of test 199 | ╭─[err_multiple_definitions_in_parser.pag:7:12] 200 | │ 201 | 6 │ active test = A; 202 | │ ──┬─ 203 | │ ╰─── first definition 204 | 7 │ active test = A; 205 | │ ──┬─ 206 | │ ╰─── second definition 207 | ───╯ 208 | "# 209 | ); 210 | -------------------------------------------------------------------------------- /pag-lexer/src/normalization.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use crate::{intervals::Intervals, regex_tree::RegexTree}; 10 | use smallvec::SmallVec; 11 | use std::rc::Rc; 12 | use RegexTree::*; 13 | 14 | type RcVec = SmallVec<[Rc; 2]>; 15 | 16 | fn sequence_unchanged(x: &[Rc], y: &[Rc]) -> bool { 17 | x.iter() 18 | .map(|x| x.as_ref() as *const RegexTree) 19 | .eq(y.iter().map(|y| y.as_ref() as *const RegexTree)) 20 | } 21 | 22 | pub fn normalize(node: Rc) -> Rc { 23 | match node.as_ref() { 24 | Bottom | Epsilon | Set(..) 
=> node, 25 | Concat(old) => { 26 | let mut new = RcVec::new(); 27 | for x in old { 28 | let x = normalize(x.clone()); 29 | match x.as_ref() { 30 | Bottom => return RegexTree::bottom(), // x ~ bot == bot 31 | Epsilon => continue, // x ~ eps == x 32 | Concat(subvec) => new.extend(subvec.iter().cloned()), // flatten 33 | _ => new.push(x.clone()), 34 | } 35 | } 36 | if new.is_empty() { 37 | RegexTree::epsilon() 38 | } else if new.len() == 1 { 39 | new.pop().unwrap() 40 | } else if sequence_unchanged(&new, old) { 41 | node 42 | } else { 43 | Rc::new(Concat(new)) 44 | } 45 | } 46 | Union(old) => { 47 | let mut new = RcVec::new(); 48 | let mut set = None; 49 | for x in old { 50 | let x = normalize(x.clone()); 51 | match x.as_ref() { 52 | _ if x == RegexTree::top() => return RegexTree::top(), // x | top == top 53 | Bottom => continue, // x | bot == x 54 | Union(subvec) => { 55 | for y in subvec { 56 | match y.as_ref() { 57 | Set(subset) => { 58 | set = match set { 59 | None => Some(subset.clone()), 60 | Some(set) => Some(set.union(subset)), 61 | } 62 | } 63 | _ => new.push(y.clone()), 64 | } 65 | } 66 | } 67 | Set(subset) => { 68 | set = match set { 69 | None => Some(subset.clone()), 70 | Some(set) => Some(set.union(subset)), 71 | } 72 | } 73 | _ => new.push(x.clone()), 74 | } 75 | } 76 | if let Some(set) = set { 77 | new.push(Rc::new(Set(set))); 78 | } 79 | 80 | new.sort_unstable(); 81 | new.dedup_by(|x, y| Rc::ptr_eq(x, y) || x == y); 82 | 83 | if new 84 | .iter() 85 | .any(|x| !matches!(x.as_ref(), Epsilon) && x.is_nullable()) 86 | { 87 | new.retain(|x| !matches!(x.as_ref(), Epsilon)); 88 | } 89 | 90 | if new.is_empty() { 91 | RegexTree::bottom() 92 | } else if new.len() == 1 { 93 | new.pop().unwrap() 94 | } else if sequence_unchanged(&new, old) { 95 | node 96 | } else { 97 | Rc::new(Union(new)) 98 | } 99 | } 100 | Intersection(old) => { 101 | let mut new = RcVec::new(); 102 | let mut set = Intervals::full_set(); 103 | for x in old { 104 | let x = normalize(x.clone()); 105 | match x.as_ref() { 106 | Bottom => return RegexTree::bottom(), // x & bot == bot 107 | _ if x == RegexTree::top() => continue, // x & top == x 108 | Intersection(subvec) => { 109 | for y in subvec { 110 | match y.as_ref() { 111 | Set(subset) => match set.intersection(subset) { 112 | Some(new_set) => set = new_set, 113 | None => return RegexTree::bottom(), 114 | }, 115 | _ => new.push(y.clone()), 116 | } 117 | } 118 | } 119 | Set(subset) => match set.intersection(subset) { 120 | Some(new_set) => set = new_set, 121 | None => return RegexTree::bottom(), 122 | }, 123 | _ => new.push(x.clone()), 124 | } 125 | } 126 | new.push(Rc::new(Set(set))); 127 | 128 | new.sort_unstable(); 129 | new.dedup_by(|x, y| Rc::ptr_eq(x, y) || x == y); 130 | 131 | if new.is_empty() { 132 | RegexTree::top() 133 | } else if new.len() == 1 { 134 | new.pop().unwrap() 135 | } else if sequence_unchanged(&new, old) { 136 | node 137 | } else { 138 | Rc::new(Intersection(new)) 139 | } 140 | } 141 | KleeneClosure(old) => { 142 | let new = normalize(old.clone()); 143 | match new.as_ref() { 144 | KleeneClosure(_) => new, 145 | Bottom | Epsilon => RegexTree::epsilon(), 146 | _ => { 147 | if Rc::ptr_eq(old, &new) { 148 | node 149 | } else { 150 | Rc::new(KleeneClosure(new)) 151 | } 152 | } 153 | } 154 | } 155 | Complement(old) => { 156 | let new = normalize(old.clone()); 157 | match new.as_ref() { 158 | Set(x) => match x.complement() { 159 | Some(y) => Rc::new(Set(y)), 160 | None => RegexTree::bottom(), 161 | }, 162 | Complement(r) => r.clone(), 163 | // capture 
renormalization cases (no need to do allocations) 164 | _ if Rc::ptr_eq(old, &new) => node, 165 | _ => Rc::new(Complement(new)), 166 | } 167 | } 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /pag-parser/src/frontend/syntax.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use std::collections::{HashMap, HashSet}; 10 | 11 | use crate::{ 12 | core_syntax::BindingContext, 13 | core_syntax::{ParserRule, Term, TermArena, TermPtr}, 14 | nf::Tag, 15 | type_system::{infer_fixpoints, type_check, TypeError}, 16 | utilities::{merge_results, unreachable_branch, Symbol}, 17 | }; 18 | 19 | use super::{ 20 | lexical::LexerDatabase, 21 | FrontendError::*, 22 | FrontendResult, 23 | SurfaceSyntaxTree::{self, *}, 24 | WithSpan, 25 | }; 26 | 27 | pub struct Parser<'src, 'arena> { 28 | pub entrypoint: Symbol<'src>, 29 | pub arena: &'arena TermArena<'src, 'arena>, 30 | pub bindings: BindingContext<'src, 'arena>, 31 | pub symbol_set: HashSet<&'src str>, 32 | pub lexer_database: LexerDatabase<'src>, 33 | } 34 | 35 | impl<'src, 'arena> Parser<'src, 'arena> { 36 | pub fn infer_fixpoints(&mut self) { 37 | infer_fixpoints(self.entrypoint, self.arena, &mut self.bindings); 38 | } 39 | 40 | pub fn type_check(&self) -> Vec> { 41 | let target = &self.bindings[&self.entrypoint]; 42 | type_check(&self.bindings, target.term, self.entrypoint) 43 | } 44 | 45 | pub fn is_active(&self, tag: &Tag<'src>) -> bool { 46 | tag.is_original() && self.bindings.get(&tag.symbol()).map_or(false, |x| x.active) 47 | } 48 | } 49 | 50 | pub fn construct_parser<'src, 'arena>( 51 | arena: &'arena TermArena<'src, 'arena>, 52 | lexer_database: LexerDatabase<'src>, 53 | sst: &WithSpan<'src, SurfaceSyntaxTree<'src>>, 54 | ) -> FrontendResult<'src, Parser<'src, 'arena>> { 55 | let ParserDef { entrypoint, rules } = &sst.node else { 56 | unreachable_branch!("sst should be a parser definition") 57 | }; 58 | let symbol_set = construct_symbol_set(sst)?; 59 | let entrypoint = match symbol_set.get(entrypoint.span.as_str()) { 60 | Some(name) => Symbol::new(name), 61 | None => { 62 | return Err(vec![UndefinedParserRuleReference(entrypoint.span)]); 63 | } 64 | }; 65 | let mut parser = Parser { 66 | entrypoint, 67 | arena, 68 | bindings: HashMap::new(), 69 | lexer_database, 70 | symbol_set, 71 | }; 72 | let mut errs = Vec::new(); 73 | for rule in rules { 74 | let ParserRuleDef { active, name, expr, } = &rule.node else { 75 | unreachable_branch!("parser should only contain rule definitions") 76 | }; 77 | match construct_core_syntax_tree(&parser, expr) { 78 | Ok(term) => { 79 | let symbol = Symbol::new(name.span.as_str()); 80 | parser.bindings.insert( 81 | symbol, 82 | ParserRule { 83 | active: *active, 84 | term, 85 | }, 86 | ); 87 | } 88 | Err(e) => errs.extend(e), 89 | } 90 | } 91 | if !errs.is_empty() { 92 | return Err(errs); 93 | } 94 | Ok(parser) 95 | } 96 | 97 | fn construct_symbol_set<'src>( 98 | sst: &WithSpan<'src, SurfaceSyntaxTree<'src>>, 99 | ) -> FrontendResult<'src, HashSet<&'src str>> { 100 | let ParserDef { rules, .. 
} = &sst.node else { 101 | unreachable_branch!("sst should be a parser definition") 102 | }; 103 | let mut symbol_table = HashMap::with_capacity(rules.len()); 104 | for rule in rules { 105 | let ParserRuleDef { name, .. } = &rule.node else { 106 | unreachable_branch!("parser should only contain rule definitions") 107 | }; 108 | if let Some(previous) = symbol_table.get(name.span.as_str()) { 109 | return Err(vec![MultipleDefinition(*previous, name.span)]); 110 | } else { 111 | symbol_table.insert(name.span.as_str(), name.span); 112 | } 113 | } 114 | Ok(symbol_table.keys().copied().collect()) 115 | } 116 | 117 | fn construct_core_syntax_tree<'src, 'arena>( 118 | context: &Parser<'src, 'arena>, 119 | sst: &WithSpan<'src, SurfaceSyntaxTree<'src>>, 120 | ) -> FrontendResult<'src, TermPtr<'src, 'arena>> { 121 | let spanned = |node: Term<'src, 'arena>| { 122 | context.arena.alloc(WithSpan { 123 | span: sst.span, 124 | node, 125 | }) 126 | }; 127 | match &sst.node { 128 | ParserAlternative { lhs, rhs } => { 129 | let lhs = construct_core_syntax_tree(context, lhs); 130 | let rhs = construct_core_syntax_tree(context, rhs); 131 | merge_results(lhs, rhs, |l, r| &*spanned(Term::Alternative(l, r))) 132 | } 133 | ParserSequence { lhs, rhs } => { 134 | let lhs = construct_core_syntax_tree(context, lhs); 135 | let rhs = construct_core_syntax_tree(context, rhs); 136 | merge_results(lhs, rhs, |l, r| &*spanned(Term::Sequence(l, r))) 137 | } 138 | ParserStar { inner } => { 139 | let symbol = Symbol::new(sst.span.as_str()); 140 | let inner = construct_core_syntax_tree(context, inner)?; 141 | // \x . (i ~ x) | epsilon 142 | let sequence = spanned(Term::Sequence(inner, spanned(Term::ParserRef(symbol)))); 143 | let alternative = spanned(Term::Alternative(sequence, spanned(Term::Epsilon))); 144 | Ok(spanned(Term::Fix(symbol, alternative))) 145 | } 146 | ParserPlus { inner } => { 147 | let symbol = Symbol::new(sst.span.as_str()); 148 | let inner = construct_core_syntax_tree(context, inner)?; 149 | // i ~ (\x . 
(i ~ x) | epsilon) 150 | let sequence = spanned(Term::Sequence(inner, spanned(Term::ParserRef(symbol)))); 151 | let alternative = spanned(Term::Alternative(sequence, spanned(Term::Epsilon))); 152 | let fixpoint = spanned(Term::Fix(symbol, alternative)); 153 | Ok(spanned(Term::Sequence(inner, fixpoint))) 154 | } 155 | ParserOptional { inner } => { 156 | let inner = construct_core_syntax_tree(context, inner)?; 157 | Ok(spanned(Term::Alternative(inner, spanned(Term::Epsilon)))) 158 | } 159 | Bottom => Ok(spanned(Term::Bottom)), 160 | Empty => Ok(spanned(Term::Epsilon)), 161 | ParserRuleRef { name } => { 162 | match context.symbol_set.get(name.span.as_str()) { 163 | // Symbol::hash depends on the address so use the original &str 164 | Some(target) => Ok(spanned(Term::ParserRef(Symbol::new(target)))), 165 | None => Err(vec![UndefinedParserRuleReference(name.span)]), 166 | } 167 | } 168 | LexicalRuleRef { name } => { 169 | match context.lexer_database.symbol_set.get(name.span.as_str()) { 170 | // Symbol::hash depends on the address so use the original &str 171 | Some(target) => Ok(spanned(Term::LexerRef(Symbol::new(target)))), 172 | None => Err(vec![UndefinedLexicalRuleReference(name.span)]), 173 | } 174 | } 175 | _ => unreachable_branch!("called with unsupported node: {}", sst.span.as_str()), 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /pag-parser/src/frontend/lexical.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
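// NOTE (added summary, not from the upstream sources): lexical rules are
// resolved lazily through the three-state cells below -- `Unresolved` holds
// the surface syntax, `Pending` marks a rule whose body is currently being
// expanded, and `Resolved` caches the normalized `RegexTree`. Re-entering a
// `Pending` rule means the reference chain loops back on itself, which is
// reported as `CyclicLexicalRuleReference` (the err_cyclic_token failure
// test). Because a referenced rule is inlined into the referencing regex,
// tokens may build on other tokens but can never be recursive.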
8 | 9 | use smallvec::smallvec; 10 | use std::cell::Cell; 11 | use std::collections::{HashMap, HashSet}; 12 | use std::rc::Rc; 13 | 14 | use pag_lexer::{normalization::normalize, regex_tree::RegexTree}; 15 | use pest::Span; 16 | 17 | use crate::utilities::{merge_results, unreachable_branch, Symbol}; 18 | 19 | use super::{ 20 | unicode::{encode_char, encode_range}, 21 | FrontendError::*, 22 | FrontendResult, 23 | SurfaceSyntaxTree::{self, *}, 24 | WithSpan, 25 | }; 26 | 27 | type SpanRegexTree<'src> = WithSpan<'src, Rc>; 28 | 29 | pub struct LexerDatabase<'src> { 30 | pub symbol_set: HashSet<&'src str>, 31 | pub entries: HashMap, SpanRegexTree<'src>>, 32 | pub skip: Option>, 33 | } 34 | 35 | impl<'src> LexerDatabase<'src> { 36 | pub fn nullability_check(&self) -> FrontendResult<'src, ()> { 37 | let mut errs = Vec::new(); 38 | for (sym, rule) in &self.entries { 39 | if rule.node.is_nullable() { 40 | errs.push(NullableToken(sym.name(), rule.span)); 41 | } 42 | } 43 | if let Some(skip) = &self.skip { 44 | if skip.node.is_nullable() { 45 | errs.push(NullableToken("", skip.span)); 46 | } 47 | } 48 | if !errs.is_empty() { 49 | return Err(errs); 50 | } 51 | Ok(()) 52 | } 53 | } 54 | 55 | enum State<'src, 'local> { 56 | Unresolved(&'local WithSpan<'src, SurfaceSyntaxTree<'src>>), 57 | Pending, 58 | Resolved(Rc), 59 | } 60 | 61 | pub fn construct_lexer_database<'src>( 62 | sst: &WithSpan<'src, SurfaceSyntaxTree<'src>>, 63 | ) -> FrontendResult<'src, LexerDatabase<'src>> { 64 | let LexerDef { rules } = &sst.node else { 65 | unreachable_branch!("sst should be a lexical definition") 66 | }; 67 | 68 | let mut rule_defs = HashMap::new(); 69 | let mut skip_def = None; 70 | let mut errs = Vec::new(); 71 | for rule in rules { 72 | match &rule.node { 73 | LexicalRuleDef { name, expr } => { 74 | let value = (name.span, Cell::new(State::Unresolved(expr))); 75 | if let Some((previous, _)) = rule_defs.insert(name.span.as_str(), value) { 76 | errs.push(MultipleDefinition(previous, name.span)); 77 | } 78 | } 79 | LexicalSkipDef { expr } => { 80 | if let Some((previous, _)) = skip_def.replace((rule.span, expr)) { 81 | errs.push(MultipleSkippingRule(previous, rule.span)); 82 | } 83 | } 84 | _ => {} 85 | } 86 | } 87 | if !errs.is_empty() { 88 | return Err(errs); 89 | } 90 | 91 | let mut entries = HashMap::new(); 92 | for (name, (span, state)) in &rule_defs { 93 | let node = match state.replace(State::Pending) { 94 | State::Unresolved(expr_sst) => { 95 | let expr_regex = construct_regex_tree(expr_sst, &rule_defs)?; 96 | let expr_regex = normalize(expr_regex); 97 | state.set(State::Resolved(expr_regex.clone())); 98 | expr_regex 99 | } 100 | State::Pending => unreachable!(), 101 | State::Resolved(expr_regex) => { 102 | state.set(State::Resolved(expr_regex.clone())); 103 | expr_regex 104 | } 105 | }; 106 | entries.insert(Symbol::new(name), WithSpan { span: *span, node }); 107 | } 108 | 109 | let mut skip = None; 110 | if let Some((span, skip_sst)) = skip_def { 111 | let node = construct_regex_tree(skip_sst, &rule_defs)?; 112 | let node = normalize(node); 113 | skip = Some(WithSpan { span, node }); 114 | } 115 | 116 | Ok(LexerDatabase { 117 | entries, 118 | symbol_set: rule_defs.keys().copied().collect(), 119 | skip, 120 | }) 121 | } 122 | 123 | // 3-color DFS algorithm to detect cycle 124 | fn construct_regex_tree<'src>( 125 | sst: &WithSpan<'src, SurfaceSyntaxTree<'src>>, 126 | rule_defs: &HashMap<&'src str, (Span<'src>, Cell>)>, 127 | ) -> FrontendResult<'src, Rc> { 128 | match &sst.node { 129 | LexicalAlternative { 
lhs, rhs } => { 130 | let lhs = construct_regex_tree(lhs, rule_defs); 131 | let rhs = construct_regex_tree(rhs, rule_defs); 132 | merge_results(lhs, rhs, |l, r| Rc::new(RegexTree::Union(smallvec![l, r]))) 133 | } 134 | LexicalSequence { lhs, rhs } => { 135 | let lhs = construct_regex_tree(lhs, rule_defs); 136 | let rhs = construct_regex_tree(rhs, rule_defs); 137 | merge_results(lhs, rhs, |l, r| Rc::new(RegexTree::Concat(smallvec![l, r]))) 138 | } 139 | LexicalAnd { lhs, rhs } => { 140 | let lhs = construct_regex_tree(lhs, rule_defs); 141 | let rhs = construct_regex_tree(rhs, rule_defs); 142 | merge_results(lhs, rhs, |l, r| { 143 | Rc::new(RegexTree::Intersection(smallvec![l, r])) 144 | }) 145 | } 146 | LexicalStar { inner } => { 147 | let inner = construct_regex_tree(inner, rule_defs)?; 148 | Ok(Rc::new(RegexTree::KleeneClosure(inner))) 149 | } 150 | LexicalPlus { inner } => { 151 | let inner = construct_regex_tree(inner, rule_defs)?; 152 | Ok(Rc::new(RegexTree::Concat(smallvec![ 153 | inner.clone(), 154 | Rc::new(RegexTree::KleeneClosure(inner)) 155 | ]))) 156 | } 157 | LexicalOptional { inner } => { 158 | let inner = construct_regex_tree(inner, rule_defs)?; 159 | Ok(Rc::new(RegexTree::Union(smallvec![ 160 | inner, 161 | RegexTree::epsilon() 162 | ]))) 163 | } 164 | LexicalNot { inner } => { 165 | let inner = construct_regex_tree(inner, rule_defs)?; 166 | Ok(Rc::new(RegexTree::Complement(inner))) 167 | } 168 | RangeLit { start, end } => Ok(encode_range(*start, *end)), 169 | StringLit(x) => Ok(x 170 | .bytes() 171 | .map(|b| Rc::new(RegexTree::single(b))) 172 | .reduce(|acc, b| Rc::new(RegexTree::Concat(smallvec![acc, b]))) 173 | .unwrap_or_else(RegexTree::epsilon)), 174 | Bottom => Ok(RegexTree::bottom()), 175 | Empty => Ok(RegexTree::epsilon()), 176 | CharLit { value } => Ok(encode_char(value.node)), 177 | LexicalRuleRef { name } => match rule_defs.get(name.span.as_str()) { 178 | Some((_, state)) => match state.replace(State::Pending) { 179 | State::Unresolved(expr_sst) => { 180 | let expr_regex = construct_regex_tree(expr_sst, rule_defs)?; 181 | let expr_regex = normalize(expr_regex); 182 | state.set(State::Resolved(expr_regex.clone())); 183 | Ok(expr_regex) 184 | } 185 | State::Pending => Err(vec![CyclicLexicalRuleReference(name.span)]), 186 | State::Resolved(expr_regex) => { 187 | state.set(State::Resolved(expr_regex.clone())); 188 | Ok(expr_regex) 189 | } 190 | }, 191 | None => Err(vec![UndefinedLexicalRuleReference(name.span)]), 192 | }, 193 | _ => unreachable_branch!("called with unsupported node: {}", sst.span.as_str()), 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /pag-parser/src/type_system/type_check.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
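// NOTE (added summary, not from the upstream sources): each term is typed by
// four facts -- its FIRST set, its FOLLOW set, whether it is nullable, and
// whether it is guarded (a recursive reference must sit behind at least one
// consumed token, otherwise the fixpoint is rejected as left recursion). A
// sequence `x ~ y` type-checks only when `x` is not nullable and
// follow(x) ∩ first(y) = ∅; an alternation `x | y` only when at most one side
// is nullable and first(x) ∩ first(y) = ∅. For instance, in the failing
// grammar `active test = A+ ~ A;` the left-hand side has follow = {A} while
// the right-hand side has first = {A}, so the overlap is reported as the
// sequence ambiguity shown in src/tests/failure. Fixpoint types are computed
// by iterating from `Type::minimum()` until the result stabilises.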
8 | 9 | use pest::Span; 10 | use std::collections::HashSet; 11 | 12 | use crate::core_syntax::{BindingContext, Term, TermPtr}; 13 | use crate::utilities::Symbol; 14 | 15 | use super::{binding_proxy::BindingProxy, context::TypeContext}; 16 | 17 | #[derive(Debug)] 18 | pub enum TypeError<'a> { 19 | SequentialUniquenessViolation { 20 | lhs: (Span<'a>, Type<'a>), 21 | rhs: (Span<'a>, Type<'a>), 22 | total: Span<'a>, 23 | }, 24 | DisjunctiveUniquenessViolation { 25 | lhs: (Span<'a>, Type<'a>), 26 | rhs: (Span<'a>, Type<'a>), 27 | total: Span<'a>, 28 | }, 29 | UnguardedFixpoint(Symbol<'a>, Span<'a>), 30 | UnresolvedReference(Symbol<'a>, Span<'a>), 31 | } 32 | 33 | #[derive(PartialEq, Eq, Debug, Clone)] 34 | pub struct Type<'src> { 35 | pub first: HashSet>, 36 | pub follow: HashSet>, 37 | pub nullable: bool, 38 | pub guarded: bool, 39 | } 40 | 41 | impl<'src> Type<'src> { 42 | fn sequential_uniqueness(&self, other: &Self) -> bool { 43 | !self.nullable && self.follow.is_disjoint(&other.first) 44 | } 45 | 46 | fn disjunctive_uniqueness(&self, other: &Self) -> bool { 47 | !(self.nullable && other.nullable) && self.first.is_disjoint(&other.first) 48 | } 49 | 50 | fn epsilon() -> Self { 51 | Self { 52 | first: HashSet::new(), 53 | follow: HashSet::new(), 54 | nullable: true, 55 | guarded: true, 56 | } 57 | } 58 | fn token(token: Symbol<'src>) -> Self { 59 | Self { 60 | first: HashSet::from([token]), 61 | follow: HashSet::new(), 62 | nullable: false, 63 | guarded: true, 64 | } 65 | } 66 | 67 | fn sequence( 68 | t1: &Self, 69 | t2: &Self, 70 | lhs: Span<'src>, 71 | rhs: Span<'src>, 72 | total: Span<'src>, 73 | ) -> Result>> { 74 | if t1.sequential_uniqueness(t2) { 75 | Ok(Self { 76 | first: t1.first.clone(), 77 | follow: if t2.nullable { 78 | t2.follow 79 | .union(&t2.first) 80 | .chain(t1.follow.iter()) 81 | .cloned() 82 | .collect() 83 | } else { 84 | t2.follow.clone() 85 | }, 86 | nullable: false, 87 | guarded: t1.guarded, 88 | }) 89 | } else { 90 | Err(Box::new(TypeError::SequentialUniquenessViolation { 91 | lhs: (lhs, t1.clone()), 92 | rhs: (rhs, t2.clone()), 93 | total, 94 | })) 95 | } 96 | } 97 | 98 | fn bottom() -> Self { 99 | Self { 100 | first: HashSet::new(), 101 | follow: HashSet::new(), 102 | nullable: false, 103 | guarded: true, 104 | } 105 | } 106 | 107 | fn alternative( 108 | t1: &Self, 109 | t2: &Self, 110 | lhs: Span<'src>, 111 | rhs: Span<'src>, 112 | total: Span<'src>, 113 | ) -> Result>> { 114 | if t1.disjunctive_uniqueness(t2) { 115 | Ok(Self { 116 | first: t1.first.union(&t2.first).cloned().collect(), 117 | follow: t1.follow.union(&t2.follow).cloned().collect(), 118 | nullable: t1.nullable || t2.nullable, 119 | guarded: t1.guarded && t2.guarded, 120 | }) 121 | } else { 122 | Err(Box::new(TypeError::DisjunctiveUniquenessViolation { 123 | lhs: (lhs, t1.clone()), 124 | rhs: (rhs, t2.clone()), 125 | total, 126 | })) 127 | } 128 | } 129 | 130 | fn minimum() -> Self { 131 | Self { 132 | first: HashSet::new(), 133 | follow: HashSet::new(), 134 | nullable: false, 135 | guarded: false, 136 | } 137 | } 138 | 139 | fn fixpoint(mut f: F) -> (Self, Vec>) 140 | where 141 | F: FnMut(&Self) -> (Self, Vec>), 142 | { 143 | let mut last = Self::minimum(); 144 | loop { 145 | let (next, errs) = f(&last); 146 | if !errs.is_empty() || next == last { 147 | return (next, errs); 148 | } 149 | last = next; 150 | } 151 | } 152 | } 153 | 154 | fn type_check_impl<'src, 'a>( 155 | typing_ctx: &mut TypeContext<'src>, 156 | binding_ctx: &mut BindingProxy<'src, 'a>, 157 | term: TermPtr<'src, 'a>, 158 | ) -> 
(Type<'src>, Vec>) { 159 | match &term.node { 160 | Term::Epsilon => (Type::epsilon(), vec![]), 161 | Term::Sequence(x, y) => { 162 | let (x_type, x_errors) = type_check_impl(typing_ctx, binding_ctx, x); 163 | let (y_type, y_errors) = typing_ctx.guarded(|ctx| type_check_impl(ctx, binding_ctx, y)); 164 | let (r#type, err) = match Type::sequence(&x_type, &y_type, x.span, y.span, term.span) { 165 | Ok(r#type) => (r#type, None), 166 | Err(err) => (Type::bottom(), Some(err)), 167 | }; 168 | ( 169 | r#type, 170 | x_errors 171 | .into_iter() 172 | .chain(y_errors) 173 | .chain(err.map(|e| *e)) 174 | .collect(), 175 | ) 176 | } 177 | Term::LexerRef(name) => (Type::token(*name), vec![]), 178 | Term::Bottom => (Type::bottom(), vec![]), 179 | Term::Alternative(x, y) => { 180 | let (x_type, x_errors) = type_check_impl(typing_ctx, binding_ctx, x); 181 | let (y_type, y_errors) = type_check_impl(typing_ctx, binding_ctx, y); 182 | let (r#type, err) = match Type::alternative(&x_type, &y_type, x.span, y.span, term.span) 183 | { 184 | Ok(r#type) => (r#type, None), 185 | Err(err) => (Type::bottom(), Some(err)), 186 | }; 187 | ( 188 | r#type, 189 | x_errors 190 | .into_iter() 191 | .chain(y_errors) 192 | .chain(err.map(|e| *e)) 193 | .collect(), 194 | ) 195 | } 196 | Term::ParserRef(name) => { 197 | // first check if name is already typed in the context. 198 | // if so return that type directly. 199 | if let Some(ty) = typing_ctx.lookup(*name) { 200 | return (ty.as_ref().clone(), vec![]); 201 | } 202 | // otherwise, we need to type check the parser definition. 203 | if let Some(target) = binding_ctx.lookup(name) { 204 | // we should not cache the result, since it can be recursive and changed during the calculation of the fixpoint. 205 | let (r#type, errors) = binding_ctx.with_hiding(*name, |binding_ctx| { 206 | type_check_impl(typing_ctx, binding_ctx, target) 207 | }); 208 | (r#type, errors) 209 | } else { 210 | ( 211 | Type::bottom(), 212 | vec![TypeError::UnresolvedReference(*name, term.span)], 213 | ) 214 | } 215 | } 216 | Term::Fix(var, body) => { 217 | if let Some(ty) = typing_ctx.lookup(*var) { 218 | return (ty.as_ref().clone(), vec![]); 219 | } 220 | let (r#type, errs) = Type::fixpoint(|x| { 221 | typing_ctx.with(*var, x.clone(), |ctx| { 222 | type_check_impl(ctx, binding_ctx, body) 223 | }) 224 | }); 225 | if !errs.is_empty() { 226 | return (r#type, errs); 227 | } 228 | if r#type.guarded { 229 | typing_ctx.with(*var, r#type.clone(), |ctx| { 230 | type_check_impl(ctx, binding_ctx, body) 231 | }) 232 | } else { 233 | ( 234 | Type::bottom(), 235 | vec![TypeError::UnguardedFixpoint(*var, term.span)], 236 | ) 237 | } 238 | } 239 | } 240 | } 241 | 242 | pub fn type_check<'src, 'a>( 243 | binding_ctx: &BindingContext<'src, 'a>, 244 | term: TermPtr<'src, 'a>, 245 | name: Symbol<'src>, 246 | ) -> Vec> { 247 | let mut typing_ctx = TypeContext::new(); 248 | let mut proxy = BindingProxy::proxy(binding_ctx); 249 | proxy.with_hiding(name, |binding_ctx| { 250 | type_check_impl(&mut typing_ctx, binding_ctx, term).1 251 | }) 252 | } 253 | -------------------------------------------------------------------------------- /pag-lexer/src/intervals.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
8 | 9 | use proc_macro2::{Literal, TokenStream}; 10 | use quote::{quote, ToTokens}; 11 | use smallvec::{smallvec, SmallVec}; 12 | use std::fmt::{Display, Formatter}; 13 | 14 | #[macro_export] 15 | macro_rules! interval { 16 | ($start:expr, $end:expr) => { 17 | $crate::intervals::Interval($start as u8, $end as u8) 18 | }; 19 | } 20 | 21 | #[macro_export] 22 | macro_rules! intervals { 23 | ($(($start:expr, $end:expr)),+ $(,)?) => { 24 | unsafe { 25 | $crate::intervals::Intervals::new( 26 | [$($crate::interval!($start, $end)),+] 27 | ).unwrap_unchecked() 28 | } 29 | }; 30 | } 31 | 32 | // A closed interval of u8s. 33 | #[derive(Debug, Ord, PartialOrd, Eq, PartialEq, Hash, Copy, Clone)] 34 | pub struct Interval(pub u8, pub u8); 35 | 36 | impl Display for Interval { 37 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 38 | let start = self.0.escape_ascii(); 39 | let end = self.1.escape_ascii(); 40 | if self.0 == self.1 { 41 | write!(f, "{start}") 42 | } else { 43 | write!(f, "[{start}, {end}]") 44 | } 45 | } 46 | } 47 | 48 | impl Interval { 49 | // Check if two intervals overlap. 50 | pub fn overlaps(&self, other: &Self) -> bool { 51 | self.0 <= other.1 && other.0 <= self.1 52 | } 53 | 54 | pub fn intersection(&self, other: &Self) -> Self { 55 | debug_assert!(self.overlaps(other)); 56 | Self(self.0.max(other.0), self.1.min(other.1)) 57 | } 58 | 59 | pub fn contains(&self, other: &Self) -> bool { 60 | self.0 <= other.0 && other.1 <= self.1 61 | } 62 | } 63 | 64 | // Invariants: 65 | // - Ordered 66 | // - Non-empty 67 | // - Non-consecutive 68 | #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] 69 | pub struct Intervals(SmallVec<[Interval; 8]>); 70 | 71 | impl Intervals { 72 | pub fn new(data: I) -> Option 73 | where 74 | I: IntoIterator, 75 | { 76 | data.into_iter() 77 | .map(|x| Self(smallvec![x])) 78 | .reduce(|acc, x| acc.union(&x)) 79 | } 80 | 81 | pub fn full_set() -> Self { 82 | Self(smallvec![Interval(u8::MIN, u8::MAX)]) 83 | } 84 | 85 | pub fn iter(&self) -> impl Iterator { 86 | self.0.iter() 87 | } 88 | 89 | pub fn is_single_byte(&self) -> bool { 90 | self.0.len() == 1 && self.0[0].0 == self.0[0].1 91 | } 92 | 93 | pub fn representative(&self) -> u8 { 94 | self.0[0].0 95 | } 96 | 97 | pub fn is_full_set(&self) -> bool { 98 | self.0.len() == 1 && self.0[0] == Interval(u8::MIN, u8::MAX) 99 | } 100 | 101 | // it is okay it contains non-unicode code points; they will never be read anyway. 
102 | pub fn complement(&self) -> Option { 103 | let mut current = Some(0); 104 | let mut result = SmallVec::new(); 105 | for i in self.0.iter() { 106 | if let Some(c) = current { 107 | if c < i.0 { 108 | result.push(Interval(c, i.0 - 1)); 109 | } 110 | } 111 | current = i.1.checked_add(1); 112 | } 113 | if let Some(current) = current { 114 | result.push(Interval(current, u8::MAX)); 115 | } 116 | if result.is_empty() { 117 | None 118 | } else { 119 | Some(Self(result)) 120 | } 121 | } 122 | 123 | pub fn contains(&self, target: u8) -> bool { 124 | match self.0.binary_search_by_key(&target, |x| x.0) { 125 | Ok(_) => true, 126 | Err(0) => false, 127 | Err(idx) => self.0[idx - 1].1 >= target, 128 | } 129 | } 130 | 131 | pub fn intersection(&self, other: &Self) -> Option { 132 | let mut result: Option = None; 133 | for i in self.0.iter().copied() { 134 | for j in other.0.iter().copied() { 135 | if i.overlaps(&j) { 136 | let temp = Self(smallvec![i.intersection(&j)]); 137 | result = match result { 138 | None => Some(temp), 139 | Some(x) => Some(x.union(&temp)), 140 | }; 141 | } else if j.0 > i.1 { 142 | break; 143 | } 144 | } 145 | } 146 | result 147 | } 148 | 149 | pub fn union(&self, other: &Self) -> Self { 150 | let mut result = SmallVec::new(); 151 | let mut i = self.0.iter().copied().peekable(); 152 | let mut j = other.0.iter().copied().peekable(); 153 | loop { 154 | let mut current = match (i.peek(), j.peek()) { 155 | (Some(&x), Some(&y)) if x.0 < y.0 => i.next().unwrap(), 156 | (_, Some(_)) => j.next().unwrap(), 157 | (Some(_), _) => i.next().unwrap(), 158 | _ => break, 159 | }; 160 | loop { 161 | match (i.peek(), j.peek()) { 162 | (Some(x), _) if current.1.wrapping_add(1) >= x.0 => { 163 | current.1 = current.1.max(i.next().unwrap().1); 164 | } 165 | (_, Some(y)) if current.1.wrapping_add(1) >= y.0 => { 166 | current.1 = current.1.max(j.next().unwrap().1); 167 | } 168 | _ => break, 169 | } 170 | } 171 | result.push(current); 172 | } 173 | Self(result) 174 | } 175 | } 176 | 177 | impl Display for Intervals { 178 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 179 | match self.0.as_slice() { 180 | [] => Ok(()), 181 | [single] => write!(f, "{single}"), 182 | multiple => { 183 | let iter = multiple.iter().map(|i| i.to_string()); 184 | write!(f, "({})", iter.collect::>().join(" | ")) 185 | } 186 | } 187 | } 188 | } 189 | 190 | pub fn byte_char(c: u8) -> Literal { 191 | format!("b'{}'", c.escape_ascii()).parse().unwrap() 192 | } 193 | 194 | impl ToTokens for Intervals { 195 | fn to_tokens(&self, tokens: &mut TokenStream) { 196 | debug_assert!(!self.0.is_empty()); 197 | let iter = self.0.iter().map(|Interval(start, end)| { 198 | let start_lit = byte_char(*start); 199 | let end_lit = byte_char(*end); 200 | if start == end { 201 | quote! { #start_lit } 202 | } else { 203 | quote! { #start_lit ..= #end_lit } 204 | } 205 | }); 206 | tokens.extend(quote! 
{ #(#iter)|* }); 207 | } 208 | } 209 | 210 | #[cfg(test)] 211 | mod test { 212 | #[test] 213 | fn basic_format() { 214 | let interval = interval!(0x41, 0x5A); 215 | assert_eq!(format!("{interval}"), "[A, Z]"); 216 | let interval = interval!(0x41, 0x7A); 217 | assert_eq!(format!("{interval}"), "[A, z]"); 218 | let interval = interval!(0x41, 0x7B); 219 | assert_eq!(format!("{interval}"), "[A, {]"); 220 | // whitespace 221 | let interval = interval!(b'\t', b'\t'); 222 | assert_eq!(format!("{interval}"), r"\t"); 223 | } 224 | 225 | #[test] 226 | fn intervals_format() { 227 | let intervals = intervals!(('a', 'z'), ('A', 'Z'), ('0', '9')); 228 | assert_eq!(format!("{intervals}"), "([0, 9] | [A, Z] | [a, z])"); 229 | } 230 | 231 | #[test] 232 | fn union() { 233 | let x = intervals!(('a', 'z'), ('A', 'Z'), ('0', '9')); 234 | assert_eq!(x.union(&x), x); 235 | let y = intervals!(('!', '7')); 236 | assert_eq!(x.union(&y), intervals!(('!', '9'), ('A', 'Z'), ('a', 'z'))); 237 | let z = intervals!(('!', '7'), ('C', 'e')); 238 | assert_eq!(x.union(&z), intervals!(('!', '9'), ('A', 'z'))); 239 | } 240 | 241 | #[test] 242 | fn complement() { 243 | let x = intervals!(('a', 'z'), ('A', 'Z'), ('0', '9')); 244 | let y = intervals!((0, 47), (58, 64), (91, 96), (123, u8::MAX)); 245 | assert_eq!(x.complement(), Some(y)); 246 | let z = intervals!(('\0', '7')); 247 | assert_eq!(z.complement().unwrap(), intervals!(('8', u8::MAX))); 248 | assert_eq!(x.complement().unwrap().complement().unwrap(), x); 249 | assert_eq!(x.union(&x.complement().unwrap()), intervals!((0, u8::MAX))); 250 | } 251 | 252 | #[test] 253 | fn intersection() { 254 | let x = intervals!(('a', 'z'), ('A', 'Z'), ('0', '9')); 255 | let z = intervals!(('\0', '7')); 256 | assert_eq!(x.intersection(&z), Some(intervals!(('0', '7')))); 257 | assert!(x.intersection(&x.complement().unwrap()).is_none()); 258 | assert_eq!(x.intersection(&intervals!((0, u8::MAX))).unwrap(), x); 259 | let a = intervals!(('E', 'c')); 260 | assert_eq!(x.intersection(&a), Some(intervals!(('E', 'Z'), ('a', 'c')))); 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 4 |

5 | 6 | hermit-crab 7 |
8 | Paguroidea 9 |

10 | 11 | ![GITHUB-BADGE](https://github.com/SchrodingerZhu/paguroidea/workflows/Build/badge.svg) 12 | 13 | | Crate | Status | 14 | |----------------|----------------------------------------------------------------| 15 | | `pag-lexer` | ![crates.io](https://img.shields.io/crates/v/pag-lexer.svg) | 16 | | `pag-parser` | ![crates.io](https://img.shields.io/crates/v/pag-parser.svg) | 17 | | `pag-compiler` | ![crates.io](https://img.shields.io/crates/v/pag-compiler.svg) | 18 | 19 | 20 | 21 | A reimplementation of the Flap parser in Rust (with our own modifications applied)! 22 | 23 | ## 🚧 Under Construction 🚧 24 | This project is still in early-stage development. The grammar for Paguroidea is subject to change 25 | (see [Issue #22](https://github.com/SchrodingerZhu/paguroidea/issues/22)). The parser generation is not yet thoroughly tested, 26 | so bugs may still shake out from time to time. Work is also ongoing to improve the quality of the generated 27 | code. 28 | 29 | ## Introduction 30 | Paguroidea is a parser generator (a.k.a. a compiler of compilers). The theoretical foundation of Paguroidea is built 31 | on a few papers: 32 | 33 | - [Regular-expression derivatives reexamined](https://www.ccs.neu.edu/home/turon/re-deriv.pdf) introduced a way to generate 34 | lexer DFAs directly from language derivatives. The number of states in DFAs created by this approach is close to 35 | minimal. 36 | - [A Typed, Algebraic Approach to Parsing](https://www.cl.cam.ac.uk/~nk480/parsing.pdf) provides a method to "type check" 37 | context-free grammars such that a checked grammar is guaranteed to parse in linear time with single-token 38 | lookahead. This is especially useful in Flap/Paguroidea for ensuring the total correctness of normalization. 39 | - [flap: A Deterministic Parser with Fused Lexing](https://arxiv.org/abs/2304.05276v2) invented a novel approach to normalize 40 | context-free grammars into the so-called Deterministic Greibach Normal Form (DGNF), where each individual parser routine 41 | can use a small, localized lexer rather than lexing the whole input with a single lexer containing the regular expressions of 42 | all tokens. 43 | 44 | We modified the work of flap by extending DGNF with tree-generation actions, which are similar to the "reduce" operation in a 45 | traditional shift-reduce parser. 46 | 47 | ## How to use 48 | > **Notice**: the grammar syntax for parser definitions used in this section will change in the near future. 49 | 50 | It is simple: just define your grammar and pass it to our compiler; Paguroidea will then output a standalone parser file. 51 | 52 | For example, a simple S-expression parser can be defined as follows: 53 | ```text 54 | lexer { 55 | definition BLANK = ' '; 56 | definition DIGIT = '0' .. '9'; 57 | definition ALPHA = 'a' .. 'z' | 'A' .. 'Z'; 58 | active token LPAREN = '('; 59 | active token RPAREN = ')'; 60 | active token ATOM = ALPHA ~ (ALPHA | DIGIT)*; 61 | silent token WHITESPACE = (BLANK | '\t' | '\n' | '\r')+; 62 | } 63 | parser sexpr { 64 | active fixpoint compound 65 | = LPAREN ~ (compound | atom) * ~ RPAREN; 66 | 67 | active definition atom 68 | = ATOM; 69 | 70 | active definition sexpr 71 | = compound | atom; 72 | } 73 | ``` 74 | 75 |
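A build script for this grammar would look like the sketch below (the file names `sexpr.pag` and `src/parser.rs` are illustrative; the full workflow is described under "How to compile and use a grammar file"):

```rust
// build.rs -- an illustrative sketch: compile the grammar above into a parser module.
// Adjust the input grammar path and the generated output path to your crate layout.
fn main() {
    pag_compiler::compile("sexpr.pag", "src/parser.rs");
    // Re-run the build script whenever the grammar changes.
    println!("cargo:rerun-if-changed=sexpr.pag");
}
```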
76 | How to write a grammar file 77 | 78 | You can write your own grammar file by following these rules: 79 | 80 | - A grammar file must contain both lexer and parser parts. 81 | - A `definition` in the lexer part is a `macro` representing some common lexical rules. A definition 82 | itself does not count as a token, similar to `fragment` in ANTLR. 83 | - A lexer can have at most one `silent` token. `silent` tokens are automatically skipped during 84 | parsing. 85 | - All rules defined in the lexer part must be fully uppercase. 86 | - You can use 87 | - empty (`'_'`) 88 | - characters (`'a', '\x12', '😊'`) 89 | - strings (`"你好", "Rust"`) 90 | - ranges (`'A' .. 'Z'`) 91 | - sequences (`'a' ~ 'b'`) 92 | - alternatives (`'a' | 'b'`) 93 | - optionals (`'a'?`) 94 | - zero-or-mores (`'a'*`) 95 | - one-or-mores (`'a'+`) 96 | - complements (`!'a'`) 97 | 98 | to build your regular expressions (a small example follows after this list). Notice that complement is not negative lookahead in the usual sense. Rather, 99 | it denotes the complement of the negated characters or languages. Active tokens must not be nullable. 100 | - The parser part must have an entrypoint specified in its header. 101 | - Strings/characters/ranges cannot be used directly in the parser part, but parser rules can refer to tokens defined in the lexer. 102 | - Parser rules are all in lowercase. 103 | - Most combinators from the lexer part are also supported in the parser part, except for complement. 104 | 105 | For more complicated examples, see [json.pag](benches/json/json.pag). 106 |
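As a small illustration of the constructs listed above, here is a lexer fragment (token names are made up; a complete grammar file would also need a parser part):

```text
lexer {
    definition HEXDIG = '0' .. '9' | 'a' .. 'f' | 'A' .. 'F';
    active token INT    = '-'? ~ ('0' .. '9')+;
    active token HEX    = "0x" ~ HEXDIG+;
    silent token BLANKS = (' ' | '\t' | '\n' | '\r')+;
}
```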
107 | 108 |
109 | How to compile and use a grammar file 110 | 111 | The recommended way to compile your grammar file is to add `pag-compiler` as a build dependency. With `pag-compiler`, 112 | the parser file can be generated in a build script as follows: 113 | ```rust 114 | fn main() { 115 | pag_compiler::compile("csv.pag", "src/parser.rs"); 116 | println!("cargo:rerun-if-changed=csv.pag"); 117 | } 118 | ``` 119 | 120 | For performance reasons, only nightly Rust (1.71+) is supported for now. The crate containing the generated parser file 121 | must also be annotated with (see the sketch below for how the pieces fit together) 122 | ```rust 123 | #![feature(portable_simd)] 124 | #![feature(core_intrinsics)] 125 | #![feature(array_chunks)] 126 | ``` 127 |
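Putting the pieces together, the crate root might look like the following minimal sketch; the `parser` module name simply matches the `src/parser.rs` output path used in the build script above, so adjust both together if you pick a different path:

```rust
// src/lib.rs -- a minimal sketch of a crate embedding a generated parser.
#![feature(portable_simd)]
#![feature(core_intrinsics)]
#![feature(array_chunks)]

// src/parser.rs is produced by pag-compiler from the build script at compile time.
mod parser;
```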
128 | 129 | ## Performance 130 | 131 | We are continuously working on improving the quality of the generated parsers. For now, on CSV/JSON workloads, 132 | the performance is close to, or even better than, that of specialized parsers. 133 | ``` 134 | === Random Generated CSV === 135 | throughput/pag time: [635.88 µs 637.64 µs 639.46 µs] 136 | thrpt: [622.63 MiB/s 624.41 MiB/s 626.14 MiB/s] 137 | throughput/csv time: [528.36 µs 541.72 µs 559.54 µs] 138 | thrpt: [711.56 MiB/s 734.97 MiB/s 753.55 MiB/s] 139 | throughput/pest time: [3.7278 ms 3.7364 ms 3.7460 ms] 140 | thrpt: [106.29 MiB/s 106.56 MiB/s 106.80 MiB/s] 141 | === Random Generated JSON === 142 | random-json/pag-json time: [22.634 ns 22.650 ns 22.666 ns] 143 | thrpt: [84.149 MiB/s 84.209 MiB/s 84.271 MiB/s] 144 | random-json/serde-json time: [12.493 ns 12.587 ns 12.694 ns] 145 | thrpt: [150.26 MiB/s 151.54 MiB/s 152.67 MiB/s] 146 | random-json/pest-json time: [177.38 ns 178.17 ns 179.17 ns] 147 | thrpt: [10.645 MiB/s 10.705 MiB/s 10.753 MiB/s] 148 | === twitter.json === 149 | twitter-json/pag-json time: [1.0923 ms 1.0941 ms 1.0961 ms] 150 | thrpt: [667.24 MiB/s 668.46 MiB/s 669.59 MiB/s] 151 | twitter-json/serde-json time: [1.2281 ms 1.2295 ms 1.2312 ms] 152 | thrpt: [594.02 MiB/s 594.88 MiB/s 595.54 MiB/s] 153 | twitter-json/pest-json time: [5.2977 ms 5.3055 ms 5.3148 ms] 154 | thrpt: [137.61 MiB/s 137.85 MiB/s 138.06 MiB/s] 155 | ``` 156 | 157 |
158 | Why is it fast and how can I make my grammar faster 159 | 160 | - Thanks to the work of the Flap parser, we can fuse the lexer and parser together so that lexers can be localized. 161 | - We apply tail-call optimizations explicitly. To take advantage of this, define more grammar rules using `*` or `+`, or 162 | mark rules as silent where possible (see the sketch after this list). 163 | - We apply a batched lookahead strategy using SIMD or lookup tables. This optimization applies when you repeat simple character sets 164 | (for instance, `(BLANK | '\t' | '\n' | '\r')+`). 165 | - We are working on inlining/reducing more operations involving state transitions and lexer-parser communication. 166 |
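For example, repetition-style rules such as the following (a sketch with assumed token names `FIELD`, `COMMA`, and `NEWLINE`) compile into loops that benefit from the explicit tail-call optimization:

```text
parser csv {
    active definition csv  = line ~ (NEWLINE ~ line)*;
    active definition line = FIELD ~ (COMMA ~ FIELD)*;
}
```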
167 | 168 | ## Diagnostic Grammar Error Check 169 | We provide diagnostic information for "type errors" in your grammar definitions. Here are some examples: 170 | 171 | **Left-recursion** 172 | ``` 173 | Error: Unguarded fixpoint 174 | ╭─[json.pag:39:5] 175 | │ 176 | 39 │ active fixpoint json = json ~ value; 177 | │ ─────────────────┬───────────────── 178 | │ ╰─────────────────── fixpoint rule json is not guarded -- your grammar is left-recursive 179 | ────╯ 180 | ``` 181 | **Sequence Ambiguity** 182 | 183 | > **Explanation**: there may be ambiguity when separating a sequence into two parts according to the grammar definition 184 | 185 | ``` 186 | Error: When type checking a sequence of rules, the following rules are ambiguous 187 | ╭─[json.pag:39:28] 188 | │ 189 | 39 │ active fixpoint test = NUMBER+ ~ NUMBER+; 190 | │ ───┬─── ───┬─── 191 | │ ╰─────────────── type info for left-hand side: nullable: false, first set: {NUMBER}, follow set: {NUMBER} 192 | │ │ 193 | │ ╰───── type info for right-hand side: nullable: false, first set: {NUMBER}, follow set: {NUMBER} 194 | ────╯ 195 | ``` 196 | 197 | **Alternation Ambiguity** 198 | > **Explanation**: there may be ambiguity when selecting a match in an alternation of two rules. 199 | ``` 200 | Error: When type checking an alternation of rules, the following rules are ambiguous 201 | ╭─[json.pag:39:28] 202 | │ 203 | 39 │ active fixpoint test = NUMBER+ | NUMBER; 204 | │ ───┬─── ───┬── 205 | │ ╰────────────── type info for left-hand side: nullable false, first set: NUMBER, follow set: NUMBER 206 | │ │ 207 | │ ╰──── type info for right-hand side: nullable false, first set: NUMBER, follow set: 208 | ────╯ 209 | ``` 210 | 211 | There is also diagnostic information for undefined references, nullable tokens in the lexer, character format errors, etc. 212 | -------------------------------------------------------------------------------- /pag-lexer/src/vector.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms.
8 | 9 | use crate::congruence::{approximate_congruence_class, meet}; 10 | use crate::derivative::derivative; 11 | use crate::intervals::Intervals; 12 | use crate::normalization::normalize; 13 | use crate::regex_tree::RegexTree; 14 | use crate::utilities::dbg_sort; 15 | 16 | use crate::lookahead::LoopOptimizer; 17 | use proc_macro2::{Literal, TokenStream}; 18 | use quote::{format_ident, quote}; 19 | use std::collections::{HashMap, HashSet}; 20 | use std::fmt::{Display, Formatter}; 21 | use std::rc::Rc; 22 | 23 | #[derive(Hash, PartialEq, Eq, Debug, Clone, Ord, PartialOrd)] 24 | pub struct Vector { 25 | regex_trees: Vec>, 26 | } 27 | 28 | impl Display for Vector { 29 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 30 | write!(f, "(")?; 31 | for (i, regex_tree) in self.regex_trees.iter().enumerate() { 32 | if i != 0 { 33 | write!(f, ", ")?; 34 | } 35 | write!(f, "{}", regex_tree)?; 36 | } 37 | write!(f, ")") 38 | } 39 | } 40 | 41 | impl Vector { 42 | pub fn new(iter: I) -> Self 43 | where 44 | I: IntoIterator>, 45 | { 46 | let regex_trees = iter.into_iter().collect(); 47 | Self { regex_trees } 48 | } 49 | 50 | pub fn is_byte_sequence(&self) -> bool { 51 | let mut iter = self 52 | .regex_trees 53 | .iter() 54 | .filter(|x| !matches!(x.as_ref(), RegexTree::Bottom)) 55 | .map(|x| x.is_byte_sequence()); 56 | matches!(iter.next(), Some(true)) && iter.next().is_none() 57 | } 58 | 59 | pub fn as_byte_sequence(&self) -> Option<(usize, Vec)> { 60 | let failing = self 61 | .regex_trees 62 | .iter() 63 | .filter(|x| matches!(x.as_ref(), RegexTree::Bottom)) 64 | .count(); 65 | if failing == self.regex_trees.len() - 1 { 66 | self.regex_trees 67 | .iter() 68 | .enumerate() 69 | .find_map(|(idx, x)| x.as_byte_sequence().map(|x| (idx, x))) 70 | } else { 71 | None 72 | } 73 | } 74 | 75 | pub fn derivative(&self, x: u8) -> Self { 76 | Vector { 77 | regex_trees: self 78 | .regex_trees 79 | .iter() 80 | .map(|t| derivative(t.clone(), x)) 81 | .collect(), 82 | } 83 | } 84 | 85 | pub fn accepting_state(&self) -> Option { 86 | self.regex_trees.iter().enumerate().find_map(|t| { 87 | if t.1.is_nullable() { 88 | Some(t.0) 89 | } else { 90 | None 91 | } 92 | }) 93 | } 94 | 95 | pub fn is_rejecting_state(&self) -> bool { 96 | self.regex_trees 97 | .iter() 98 | .all(|t| matches!(t.as_ref(), RegexTree::Bottom)) 99 | } 100 | 101 | pub fn approximate_congruence_class(&self) -> Vec { 102 | // meet all congruence classes for each regex tree 103 | self.regex_trees 104 | .iter() 105 | .map(|x| approximate_congruence_class(x)) 106 | .reduce(|acc, x| meet(acc.as_slice(), x.as_slice())) 107 | .unwrap_or_default() 108 | } 109 | 110 | pub fn normalize(&self) -> Self { 111 | let regex_trees = self 112 | .regex_trees 113 | .iter() 114 | .map(|x| normalize(x.clone())) 115 | .collect(); 116 | Self { regex_trees } 117 | } 118 | 119 | pub fn generate_dfa( 120 | &self, 121 | initial_idx: &TokenStream, 122 | optimizer: &mut LoopOptimizer, 123 | success_actions: &[TokenStream], 124 | failure_action: &TokenStream, 125 | ) -> TokenStream { 126 | let initial_state = { 127 | let initial_state = self.normalize(); 128 | let last_success = initial_state.accepting_state(); 129 | DfaState { 130 | state_vec: initial_state, 131 | last_success, 132 | } 133 | }; 134 | let mut dfa = build_dfa(initial_state.state_vec.clone()); 135 | let leaf_states = extract_leaf_states(&mut dfa); 136 | let initial_label = format_ident!("S{}", dfa[&initial_state].state_id); 137 | let actions = dbg_sort(&dfa, |(_, info)| info.state_id).map(|(state, info)| { 138 
| let label = format_ident!("S{}", info.state_id); 139 | if let Some((rule_idx, seq)) = state.state_vec.as_byte_sequence() { 140 | let literal = Literal::byte_string(&seq); 141 | let length = seq.len(); 142 | let on_success = &success_actions[rule_idx]; 143 | return quote! { 144 | State::#label => { 145 | if input[idx..].starts_with(#literal) { 146 | cursor = idx + #length; 147 | #on_success 148 | } else { 149 | #failure_action 150 | } 151 | }, 152 | }; 153 | } 154 | let transitions = info.transitions.iter().map(|(interval, target)| { 155 | if leaf_states.contains(target) { 156 | let rule_idx = target.last_success.unwrap(); 157 | let on_success = &success_actions[rule_idx]; 158 | return quote! { Some(#interval) => { cursor = idx + 1; #on_success }, }; 159 | } 160 | let target_label = format_ident!("S{}", dfa[target].state_id); 161 | quote! { Some(#interval) => state = State::#target_label, } 162 | }); 163 | let lookahead = optimizer.generate_lookahead(&dfa, state); 164 | let otherwise = state 165 | .last_success 166 | .and_then(|x| success_actions.get(x)) 167 | .unwrap_or(failure_action); 168 | let advance_cursor = if state.state_vec.accepting_state().is_some() { 169 | Some(quote!(cursor = idx;)) 170 | } else { 171 | None 172 | }; 173 | quote! { 174 | State::#label => { 175 | #lookahead 176 | #advance_cursor 177 | match input.get(idx) { 178 | #(#transitions)* 179 | _ => { #otherwise } 180 | } 181 | }, 182 | } 183 | }); 184 | 185 | let labels = dbg_sort(dfa.values(), |info| info.state_id) 186 | .map(|info| format_ident!("S{}", info.state_id)); 187 | 188 | quote! { 189 | enum State { 190 | #(#labels,)* 191 | } 192 | let mut idx = #initial_idx; 193 | let mut state = State::#initial_label; 194 | loop { 195 | match state { 196 | #(#actions)* 197 | } 198 | idx += 1; 199 | } 200 | } 201 | } 202 | } 203 | 204 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] 205 | pub struct DfaState { 206 | state_vec: Vector, 207 | last_success: Option, 208 | } 209 | 210 | #[derive(Debug, Clone)] 211 | pub struct DfaInfo { 212 | state_id: usize, 213 | pub(crate) transitions: Vec<(Intervals, DfaState)>, 214 | } 215 | 216 | pub type DfaTable = HashMap; 217 | 218 | fn explore_dfa_node(dfa: &mut DfaTable, state: DfaState, state_id: &mut usize) { 219 | dfa.insert( 220 | state.clone(), 221 | DfaInfo { 222 | state_id: *state_id, 223 | transitions: vec![], 224 | }, 225 | ); 226 | *state_id += 1; 227 | 228 | if state.state_vec.is_byte_sequence() { 229 | return; 230 | } 231 | 232 | let classes = state.state_vec.approximate_congruence_class(); 233 | let mut transitions = Vec::with_capacity(classes.len()); 234 | 235 | for intervals in classes { 236 | let char = intervals.representative(); 237 | let target = state.state_vec.derivative(char).normalize(); 238 | let last_success = target.accepting_state().or(state.last_success); 239 | let next = DfaState { 240 | state_vec: target, 241 | last_success, 242 | }; 243 | if !next.state_vec.is_rejecting_state() { 244 | transitions.push((intervals, next.clone())); 245 | if !dfa.contains_key(&next) { 246 | explore_dfa_node(dfa, next, state_id) 247 | } 248 | } 249 | } 250 | 251 | dfa.get_mut(&state).unwrap().transitions = transitions; 252 | } 253 | 254 | pub fn build_dfa(state: Vector) -> DfaTable { 255 | let mut state_id = 0; 256 | let mut dfa = HashMap::new(); 257 | let last_success = state.accepting_state(); 258 | let state = DfaState { 259 | state_vec: state, 260 | last_success, 261 | }; 262 | explore_dfa_node(&mut dfa, state, &mut state_id); 263 | #[cfg(pag_print_dfa)] 264 | 
print_dfa(&dfa); 265 | dfa 266 | } 267 | 268 | fn extract_leaf_states(dfa: &mut DfaTable) -> HashSet { 269 | // TODO: switch to `drain_filter` (nightly) / `extract_if` (hashbrown) 270 | let leaf_states = dfa 271 | .iter() 272 | .filter_map(|(state, info)| { 273 | if info.transitions.is_empty() && state.last_success.is_some() { 274 | Some(state.clone()) 275 | } else { 276 | None 277 | } 278 | }) 279 | .collect(); 280 | for s in &leaf_states { 281 | dfa.remove(s); 282 | } 283 | leaf_states 284 | } 285 | 286 | #[cfg(pag_print_dfa)] 287 | fn print_dfa(dfa: &DfaTable) { 288 | for (state, info) in dfa { 289 | println!( 290 | "S{}({:?}): {}", 291 | info.state_id, state.last_success, state.state_vec 292 | ); 293 | for (intervals, target) in &info.transitions { 294 | println!(" {} -> S{}", intervals, dfa[target].state_id); 295 | } 296 | } 297 | } 298 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /pag-parser/src/frontend/unicode.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
8 | 9 | use pag_lexer::normalization::normalize; 10 | use pag_lexer::regex_tree::RegexTree; 11 | use smallvec::smallvec; 12 | use std::rc::Rc; 13 | 14 | pub fn encode_char(x: char) -> Rc { 15 | let mut buf = [0; 4]; 16 | normalize(Rc::new(RegexTree::Concat( 17 | x.encode_utf8(&mut buf) 18 | .bytes() 19 | .map(|b| Rc::new(RegexTree::single(b))) 20 | .collect(), 21 | ))) 22 | } 23 | 24 | fn full_range_2() -> Rc { 25 | Rc::new(RegexTree::Concat(smallvec![ 26 | Rc::new(RegexTree::range(0xc0..=0xdf)), 27 | Rc::new(RegexTree::range(0x80..=0xbf)) 28 | ])) 29 | } 30 | 31 | fn full_range_3() -> Rc { 32 | Rc::new(RegexTree::Concat(smallvec![ 33 | Rc::new(RegexTree::range(0xe0..=0xef)), 34 | Rc::new(RegexTree::range(0x80..=0xbf)), 35 | Rc::new(RegexTree::range(0x80..=0xbf)) 36 | ])) 37 | } 38 | 39 | fn encode_same_level1(x: char, y: char) -> Rc { 40 | encode_same_level_expanded(1, &[x as u8], &[y as u8]) 41 | } 42 | 43 | fn encode_same_level2(x: char, y: char) -> Rc { 44 | let x_fst = (0xc0 | (x as u32 >> 6)) as u8; 45 | let x_snd = (0x80 | (x as u32 & 0x3f)) as u8; 46 | let y_fst = (0xc0 | (y as u32 >> 6)) as u8; 47 | let y_snd = (0x80 | (y as u32 & 0x3f)) as u8; 48 | encode_same_level_expanded(2, &[x_fst, x_snd], &[y_fst, y_snd]) 49 | } 50 | 51 | fn encode_same_level3(x: char, y: char) -> Rc { 52 | let x_fst = (0xe0 | (x as u32 >> 12)) as u8; 53 | let x_snd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8; 54 | let x_trd = (0x80 | (x as u32 & 0x3f)) as u8; 55 | let y_fst = (0xe0 | (y as u32 >> 12)) as u8; 56 | let y_snd = (0x80 | ((y as u32 >> 6) & 0x3f)) as u8; 57 | let y_trd = (0x80 | (y as u32 & 0x3f)) as u8; 58 | encode_same_level_expanded(3, &[x_fst, x_snd, x_trd], &[y_fst, y_snd, y_trd]) 59 | } 60 | 61 | fn encode_same_level4(x: char, y: char) -> Rc { 62 | let x_fst = (0xf0 | (x as u32 >> 18)) as u8; 63 | let x_snd = (0x80 | ((x as u32 >> 12) & 0x3f)) as u8; 64 | let x_trd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8; 65 | let x_fth = (0x80 | (x as u32 & 0x3f)) as u8; 66 | let y_fst = (0xf0 | (y as u32 >> 18)) as u8; 67 | let y_snd = (0x80 | ((y as u32 >> 12) & 0x3f)) as u8; 68 | let y_trd = (0x80 | ((y as u32 >> 6) & 0x3f)) as u8; 69 | let y_fth = (0x80 | (y as u32 & 0x3f)) as u8; 70 | encode_same_level_expanded( 71 | 4, 72 | &[x_fst, x_snd, x_trd, x_fth], 73 | &[y_fst, y_snd, y_trd, y_fth], 74 | ) 75 | } 76 | 77 | const ALL_BF: [u8; 4] = [0xbf, 0xbf, 0xbf, 0xbf]; 78 | const ALL_80: [u8; 4] = [0x80, 0x80, 0x80, 0x80]; 79 | 80 | fn encode_same_level_expanded(level: usize, tuple_x: &[u8], tuple_y: &[u8]) -> Rc { 81 | if level == 1 { 82 | return Rc::new(RegexTree::range(tuple_x[0]..=tuple_y[0])); 83 | } 84 | if tuple_x[0] == tuple_y[0] { 85 | Rc::new(RegexTree::Concat(smallvec![ 86 | Rc::new(RegexTree::single(tuple_x[0])), 87 | encode_same_level_expanded(level - 1, &tuple_x[1..], &tuple_y[1..]), 88 | ])) 89 | } else { 90 | Rc::new(RegexTree::Union(smallvec![ 91 | Rc::new(RegexTree::Concat(smallvec![ 92 | Rc::new(RegexTree::single(tuple_x[0])), 93 | encode_same_level_expanded(level - 1, &tuple_x[1..], &ALL_BF), 94 | ])), 95 | Rc::new(RegexTree::Concat(smallvec![ 96 | Rc::new(RegexTree::range(tuple_x[0] + 1..=tuple_y[0] - 1)), 97 | encode_same_level_expanded(level - 1, &ALL_80, &ALL_BF), 98 | ])), 99 | Rc::new(RegexTree::Concat(smallvec![ 100 | Rc::new(RegexTree::single(tuple_y[0])), 101 | encode_same_level_expanded(level - 1, &ALL_80, &tuple_y[1..]), 102 | ])), 103 | ])) 104 | } 105 | } 106 | 107 | fn encode_le_expanded(level: usize, fst_bound: u8, tuple: &[u8]) -> Rc { 108 | if level == 1 { 109 | return 
Rc::new(RegexTree::range(fst_bound..=tuple[0])); 110 | } 111 | Rc::new(RegexTree::Union(smallvec![ 112 | Rc::new(RegexTree::Concat(smallvec![ 113 | Rc::new(RegexTree::single(tuple[0])), 114 | encode_le_expanded(level - 1, 0x80, &tuple[1..]), 115 | ])), 116 | Rc::new(RegexTree::Concat(smallvec![ 117 | Rc::new(RegexTree::range(fst_bound..=tuple[0] - 1)), 118 | encode_le_expanded(level - 1, 0x80, &ALL_BF), 119 | ])), 120 | ])) 121 | } 122 | 123 | fn encode_ge_expanded(level: usize, fst_bound: u8, tuple: &[u8]) -> Rc { 124 | if level == 1 { 125 | return Rc::new(RegexTree::range(tuple[0]..=fst_bound)); 126 | } 127 | Rc::new(RegexTree::Union(smallvec![ 128 | Rc::new(RegexTree::Concat(smallvec![ 129 | Rc::new(RegexTree::single(tuple[0])), 130 | encode_ge_expanded(level - 1, 0xBF, &tuple[1..]), 131 | ])), 132 | Rc::new(RegexTree::Concat(smallvec![ 133 | Rc::new(RegexTree::range(tuple[0] + 1..=fst_bound)), 134 | encode_ge_expanded(level - 1, 0xBF, &ALL_80), 135 | ])), 136 | ])) 137 | } 138 | 139 | fn encode_ge1(x: char) -> Rc { 140 | encode_ge_expanded(1, 0x7F, &[x as u8]) 141 | } 142 | 143 | fn encode_ge2(x: char) -> Rc { 144 | let x_fst = (0xc0 | (x as u32 >> 6)) as u8; 145 | let x_snd = (0x80 | (x as u32 & 0x3f)) as u8; 146 | encode_ge_expanded(2, 0xDF, &[x_fst, x_snd]) 147 | } 148 | 149 | fn encode_ge3(x: char) -> Rc { 150 | let x_fst = (0xe0 | (x as u32 >> 12)) as u8; 151 | let x_snd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8; 152 | let x_trd = (0x80 | (x as u32 & 0x3f)) as u8; 153 | encode_ge_expanded(3, 0xEF, &[x_fst, x_snd, x_trd]) 154 | } 155 | 156 | fn encode_le2(x: char) -> Rc { 157 | let x_fst = (0xc0 | (x as u32 >> 6)) as u8; 158 | let x_snd = (0x80 | (x as u32 & 0x3f)) as u8; 159 | encode_le_expanded(2, 0xC0, &[x_fst, x_snd]) 160 | } 161 | 162 | fn encode_le3(x: char) -> Rc { 163 | let x_fst = (0xe0 | (x as u32 >> 12)) as u8; 164 | let x_snd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8; 165 | let x_trd = (0x80 | (x as u32 & 0x3f)) as u8; 166 | encode_le_expanded(3, 0xE0, &[x_fst, x_snd, x_trd]) 167 | } 168 | 169 | fn encode_le4(x: char) -> Rc { 170 | let x_fst = (0xf0 | (x as u32 >> 18)) as u8; 171 | let x_snd = (0x80 | ((x as u32 >> 12) & 0x3f)) as u8; 172 | let x_trd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8; 173 | let x_fth = (0x80 | (x as u32 & 0x3f)) as u8; 174 | encode_le_expanded(4, 0xF0, &[x_fst, x_snd, x_trd, x_fth]) 175 | } 176 | 177 | fn try_encode_same_level(x: char, y: char) -> Option> { 178 | match (x as u32, y as u32) { 179 | (0x00..=0x7F, 0x00..=0x7F) => Some(encode_same_level1(x, y)), 180 | (0x80..=0x7FF, 0x80..=0x7FF) => Some(encode_same_level2(x, y)), 181 | (0x800..=0xFFFF, 0x800..=0xFFFF) => Some(encode_same_level3(x, y)), 182 | (0x10000..=0x10FFFF, 0x10000..=0x10FFFF) => Some(encode_same_level4(x, y)), 183 | _ => None, 184 | } 185 | } 186 | 187 | pub fn encode_range(x: char, y: char) -> Rc { 188 | if let Some(tree) = try_encode_same_level(x, y) { 189 | return normalize(tree); 190 | } 191 | let ranges = match (x as u32, y as u32) { 192 | (0x00..=0x7F, 0x80..=0x7FF) => vec![encode_ge1(x), encode_le2(y)], 193 | (0x00..=0x7F, 0x800..=0xFFFF) => vec![encode_ge1(x), full_range_2(), encode_le3(y)], 194 | (0x00..=0x7F, 0x10000..=0x10FFFF) => { 195 | vec![encode_ge1(x), full_range_2(), full_range_3(), encode_le4(y)] 196 | } 197 | (0x80..=0x7FF, 0x800..=0xFFFF) => vec![encode_ge2(x), encode_le3(y)], 198 | (0x80..=0x7FF, 0x10000..=0x10FFFF) => vec![encode_ge2(x), full_range_3(), encode_le4(y)], 199 | (0x800..=0xFFFF, 0x10000..=0x10FFFF) => vec![encode_ge3(x), encode_le4(y)], 200 
| _ => unreachable!(), 201 | }; 202 | // fold union 203 | normalize(Rc::new(RegexTree::Union(ranges.into_iter().collect()))) 204 | } 205 | 206 | #[cfg(test)] 207 | mod test { 208 | use super::*; 209 | 210 | #[test] 211 | fn test_encode_char() { 212 | assert_eq!(encode_char('a').to_string(), "a"); 213 | assert_eq!(encode_char('b').to_string(), "b"); 214 | assert_eq!(encode_char('æ').to_string(), r"(\xc3 ~ \xa6)"); 215 | assert_eq!(encode_char('我').to_string(), r"(\xe6 ~ \x88 ~ \x91)"); 216 | } 217 | 218 | #[test] 219 | fn test_encode_range() { 220 | assert_eq!(encode_range('a', 'a').to_string(), "a"); 221 | assert_eq!(encode_range('a', 'b').to_string(), "[a, b]"); 222 | assert_eq!( 223 | encode_range('\u{80}', '\u{88}').to_string(), 224 | r"(\xc2 ~ [\x80, \x88])" 225 | ); 226 | assert_eq!( 227 | encode_range('\u{81}', '\u{7FA}').to_string(), 228 | r"((\xc2 ~ [\x81, \xbf]) ∪ ([\xc3, \xde] ~ [\x80, \xbf]) ∪ (\xdf ~ [\x80, \xba]))" 229 | ); 230 | assert_eq!( 231 | encode_range('\u{800}', '\u{808}').to_string(), 232 | r"(\xe0 ~ \xa0 ~ [\x80, \x88])" 233 | ); 234 | assert_eq!( 235 | encode_range('\u{881}', '\u{FFA}').to_string(), 236 | r"(\xe0 ~ ((\xa2 ~ [\x81, \xbf]) ∪ ([\xa3, \xbe] ~ [\x80, \xbf]) ∪ (\xbf ~ [\x80, \xba])))" 237 | ); 238 | assert_eq!( 239 | encode_range('\u{901}', '\u{FF00}').to_string(), 240 | "((\\xe0 ~ ((\\xa4 ~ [\\x81, \\xbf]) ∪ ([\\xa5, \\xbe] ~ [\\x80, \\xbf]) ∪ (\\xbf ~ [\\x80, \\xbf]))) ∪ ([\\xe1, \\xee] ~ ((\\x80 ~ [\\x80, \\xbf]) ∪ ([\\x81, \\xbe] ~ [\\x80, \\xbf]) ∪ (\\xbf ~ [\\x80, \\xbf]))) ∪ (\\xef ~ ((\\x80 ~ [\\x80, \\xbf]) ∪ ([\\x81, \\xbb] ~ [\\x80, \\xbf]) ∪ (\\xbc ~ \\x80))))" 241 | ); 242 | assert_eq!( 243 | encode_range('a', '\u{90}').to_string(), 244 | r"([a, \x7f] ∪ ([\xc0, \xc1] ~ [\x80, \xbf]) ∪ (\xc2 ~ [\x80, \x90]))" 245 | ); 246 | assert_eq!( 247 | encode_range('a', '\u{801}').to_string(), 248 | r"([a, \x7f] ∪ ([\xc0, \xdf] ~ [\x80, \xbf]) ∪ (\xe0 ~ (([\x80, \x9f] ~ [\x80, \xbf]) ∪ (\xa0 ~ [\x80, \x81]))))" 249 | ); 250 | assert_eq!( 251 | encode_range('\u{99}', '\u{2771}').to_string(), 252 | "((\\xc2 ~ [\\x99, \\xbf]) ∪ ([\\xc3, \\xdf] ~ [\\x80, \\xbf]) ∪ ([\\xe0, \\xe1] ~ (([\\x80, \\xbe] ~ [\\x80, \\xbf]) ∪ (\\xbf ~ [\\x80, \\xbf]))) ∪ (\\xe2 ~ (([\\x80, \\x9c] ~ [\\x80, \\xbf]) ∪ (\\x9d ~ [\\x80, \\xb1]))))" 253 | ) 254 | } 255 | } 256 | -------------------------------------------------------------------------------- /pag-parser/src/nf.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use std::collections::{HashMap, HashSet}; 10 | use std::fmt::Display; 11 | 12 | use smallvec::{smallvec, SmallVec}; 13 | use typed_arena::Arena; 14 | 15 | use crate::{core_syntax::Term, frontend::syntax::Parser, utilities::Symbol}; 16 | 17 | // thinking a while... 
18 | 19 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] 20 | pub struct Tag<'src> { 21 | symbol: Symbol<'src>, 22 | version: u32, 23 | } 24 | 25 | impl<'src> Tag<'src> { 26 | pub fn new(symbol: Symbol<'src>) -> Self { 27 | Self { symbol, version: 0 } 28 | } 29 | 30 | pub fn is_original(&self) -> bool { 31 | self.version == 0 32 | } 33 | 34 | pub fn symbol(&self) -> Symbol<'src> { 35 | self.symbol 36 | } 37 | } 38 | 39 | impl<'src> Display for Tag<'src> { 40 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 41 | self.symbol.fmt(f)?; 42 | if self.version > 0 { 43 | write!(f, "_{}", self.version)?; 44 | } 45 | Ok(()) 46 | } 47 | } 48 | 49 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 50 | pub enum Action<'src> { 51 | Subroutine(Tag<'src>), 52 | Summarize(Symbol<'src>), 53 | } 54 | 55 | impl<'src> Display for Action<'src> { 56 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 57 | match self { 58 | Action::Subroutine(tag) => write!(f, "{}", tag), 59 | Action::Summarize(tag) => write!(f, "[{}]", tag), 60 | } 61 | } 62 | } 63 | 64 | impl<'src> Action<'src> { 65 | fn symbol(&self) -> Symbol<'src> { 66 | match self { 67 | Action::Subroutine(tag) => tag.symbol, 68 | Action::Summarize(sym) => *sym, 69 | } 70 | } 71 | } 72 | 73 | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] 74 | pub enum NormalForm<'src> { 75 | Empty(SmallVec<[Symbol<'src>; 1]>), 76 | Unexpanded(SmallVec<[Action<'src>; 1]>), 77 | Sequence { 78 | terminal: Symbol<'src>, 79 | nonterminals: SmallVec<[Action<'src>; 1]>, 80 | }, 81 | } 82 | 83 | impl<'src> Display for NormalForm<'src> { 84 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 85 | match self { 86 | NormalForm::Empty(trees) => trees.iter().fold(write!(f, "ε"), |acc, x| { 87 | acc.and_then(|_| write!(f, " [{x}]")) 88 | }), 89 | NormalForm::Sequence { 90 | terminal, 91 | nonterminals, 92 | } => { 93 | write!(f, "{terminal}")?; 94 | for i in nonterminals { 95 | write!(f, " {i}")?; 96 | } 97 | Ok(()) 98 | } 99 | NormalForm::Unexpanded(tags) => { 100 | write!(f, "{}", tags[0])?; 101 | for i in &tags[1..] 
{ 102 | write!(f, " {i}")?; 103 | } 104 | Ok(()) 105 | } 106 | } 107 | } 108 | } 109 | 110 | pub struct NormalForms<'src, 'a> { 111 | pub entries: HashMap<Tag<'src>, SmallVec<[&'a NormalForm<'src>; 4]>>, 112 | } 113 | 114 | impl<'src, 'a> NormalForms<'src, 'a> { 115 | pub fn new() -> Self { 116 | Self { 117 | entries: HashMap::new(), 118 | } 119 | } 120 | } 121 | 122 | impl<'src, 'a> Display for NormalForms<'src, 'a> { 123 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 124 | for (tag, nf) in self.entries.iter() { 125 | for i in nf { 126 | writeln!(f, "{tag} -> {i}")?; 127 | } 128 | } 129 | Ok(()) 130 | } 131 | } 132 | 133 | pub fn semi_normalize<'src, 'p, 'nf>( 134 | target: &Term<'src, 'p>, 135 | symbol: Symbol<'src>, 136 | arena: &'nf Arena<NormalForm<'src>>, 137 | nfs: &mut NormalForms<'src, 'nf>, 138 | tag_cnt: &mut u32, 139 | parser: &Parser<'src, 'p>, 140 | ) -> Tag<'src> { 141 | let version = *tag_cnt; 142 | *tag_cnt += 1; 143 | let tag = Tag { symbol, version }; 144 | 145 | match target { 146 | Term::Epsilon => { 147 | let nf = smallvec![&*arena.alloc(NormalForm::Empty(Default::default()))]; 148 | nfs.entries.insert(tag, nf); 149 | tag 150 | } 151 | Term::Sequence(x, y) => { 152 | let x_tag = semi_normalize(&x.node, symbol, arena, nfs, tag_cnt, parser); 153 | let y_tag = semi_normalize(&y.node, symbol, arena, nfs, tag_cnt, parser); 154 | let acts = smallvec![Action::Subroutine(x_tag), Action::Subroutine(y_tag)]; 155 | let nf = smallvec![&*arena.alloc(NormalForm::Unexpanded(acts))]; 156 | nfs.entries.insert(tag, nf); 157 | tag 158 | } 159 | Term::LexerRef(lexer) => { 160 | let nf = smallvec![&*arena.alloc(NormalForm::Sequence { 161 | terminal: *lexer, 162 | nonterminals: SmallVec::new(), 163 | })]; 164 | nfs.entries.insert(tag, nf); 165 | tag 166 | } 167 | Term::Bottom => { 168 | let nf = SmallVec::new(); 169 | nfs.entries.insert(tag, nf); 170 | tag 171 | } 172 | Term::Alternative(x, y) => { 173 | let x_tag = semi_normalize(&x.node, symbol, arena, nfs, tag_cnt, parser); 174 | let y_tag = semi_normalize(&y.node, symbol, arena, nfs, tag_cnt, parser); 175 | let nf = smallvec![ 176 | &*arena.alloc(NormalForm::Unexpanded(smallvec![Action::Subroutine(x_tag)])), 177 | &*arena.alloc(NormalForm::Unexpanded(smallvec![Action::Subroutine(y_tag)])), 178 | ]; 179 | nfs.entries.insert(tag, nf); 180 | tag 181 | } 182 | Term::Fix(var, body) => { 183 | let body_tag = semi_normalize(&body.node, *var, arena, nfs, &mut 0, parser); 184 | if symbol != *var { 185 | nfs.entries.insert(tag, nfs.entries[&body_tag].clone()); 186 | } 187 | body_tag 188 | } 189 | Term::ParserRef(x) => { 190 | let ref_tag = Tag::new(*x); 191 | if parser.is_active(&ref_tag) { 192 | let acts = smallvec![Action::Subroutine(ref_tag), Action::Summarize(*x)]; 193 | let nf = smallvec![&*arena.alloc(NormalForm::Unexpanded(acts))]; 194 | nfs.entries.insert(tag, nf); 195 | tag 196 | } else if tag.version == 0 { 197 | let acts = smallvec![Action::Subroutine(ref_tag)]; 198 | let nf = smallvec![&*arena.alloc(NormalForm::Unexpanded(acts))]; 199 | nfs.entries.insert(tag, nf); 200 | tag 201 | } else { 202 | ref_tag 203 | } 204 | } 205 | } 206 | } 207 | 208 | pub fn fully_normalize<'src, 'nf>( 209 | arena: &'nf Arena<NormalForm<'src>>, 210 | nfs: &mut NormalForms<'src, 'nf>, 211 | ) { 212 | let mut updates = Vec::new(); 213 | loop { 214 | for (tag, i) in nfs.entries.iter() { 215 | if !i.iter().any(|x| matches!(x, NormalForm::Unexpanded(..))) { 216 | continue; 217 | } 218 | let mut result = SmallVec::new(); 219 | for j in i.iter() { 220 | let
NormalForm::Unexpanded(actions) = j else { 221 | result.push(*j); 222 | continue; 223 | }; 224 | let first_subroutine = actions.iter().enumerate().find_map(|(index, act)| { 225 | if let Action::Subroutine(x) = act { 226 | Some((index, x)) 227 | } else { 228 | None 229 | } 230 | }); 231 | match first_subroutine { 232 | None => { 233 | let nf = NormalForm::Empty(actions.iter().map(|x| x.symbol()).collect()); 234 | result.push(&*arena.alloc(nf)); 235 | } 236 | Some((index, x)) => { 237 | let variable_nf = &nfs.entries[x]; 238 | for k in variable_nf.iter().copied() { 239 | let head = actions[..index].iter().cloned(); 240 | let tail = actions[index + 1..].iter().cloned(); 241 | match k { 242 | NormalForm::Empty(trees) => { 243 | let insert = trees.iter().map(|x| Action::Summarize(*x)); 244 | let acts = head.chain(insert).chain(tail).collect(); 245 | result.push(&*arena.alloc(NormalForm::Unexpanded(acts))); 246 | } 247 | NormalForm::Unexpanded(subacts) => { 248 | let insert = subacts.iter().cloned(); 249 | let acts = head.chain(insert).chain(tail).collect(); 250 | result.push(&*arena.alloc(NormalForm::Unexpanded(acts))); 251 | } 252 | NormalForm::Sequence { 253 | terminal, 254 | nonterminals, 255 | } => { 256 | let insert = nonterminals.iter().cloned(); 257 | let acts = head.chain(insert).chain(tail).collect(); 258 | result.push(&*arena.alloc(NormalForm::Sequence { 259 | terminal: *terminal, 260 | nonterminals: acts, 261 | })); 262 | } 263 | } 264 | } 265 | } 266 | } 267 | } 268 | updates.push((*tag, result)); 269 | } 270 | if updates.is_empty() { 271 | break; 272 | } 273 | nfs.entries.extend(updates.drain(..)); 274 | } 275 | } 276 | 277 | pub fn merge_inactive_rules<'src, 'nf>( 278 | nfs: &mut NormalForms<'src, 'nf>, 279 | parser: &Parser<'src, '_>, 280 | arena: &'nf Arena<NormalForm<'src>>, 281 | ) { 282 | // sort all rules 283 | for i in nfs.entries.values_mut() { 284 | i.sort_unstable(); 285 | } 286 | let mut table: HashMap<&[&NormalForm], Tag<'src>> = HashMap::new(); 287 | let mut rename = Vec::new(); 288 | for (tag, nf) in nfs.entries.iter() { 289 | if parser.is_active(tag) { 290 | continue; 291 | } 292 | table 293 | .entry(nf.as_slice()) 294 | .and_modify(|new_tag| rename.push((*tag, *new_tag))) 295 | .or_insert(*tag); 296 | } 297 | for (tag, new_tag) in rename { 298 | nfs.entries.remove(&tag); 299 | for i in nfs.entries.values_mut() { 300 | for j in i.iter_mut() { 301 | let NormalForm::Sequence { 302 | terminal, 303 | nonterminals, 304 | } = j else { continue }; 305 | if nonterminals.contains(&Action::Subroutine(tag)) { 306 | *j = &*arena.alloc(NormalForm::Sequence { 307 | terminal: *terminal, 308 | nonterminals: nonterminals 309 | .iter() 310 | .map(|x| { 311 | if *x == Action::Subroutine(tag) { 312 | Action::Subroutine(new_tag) 313 | } else { 314 | *x 315 | } 316 | }) 317 | .collect(), 318 | }); 319 | } 320 | } 321 | } 322 | } 323 | } 324 | 325 | pub fn remove_unreachable_rules<'src>(nfs: &mut NormalForms<'src, '_>, parser: &Parser<'src, '_>) { 326 | fn dfs<'src>( 327 | nfs: &NormalForms<'src, '_>, 328 | current: Tag<'src>, 329 | visited: &mut HashSet<Tag<'src>>, 330 | ) { 331 | if visited.contains(&current) { 332 | return; 333 | } 334 | visited.insert(current); 335 | let Some(tag) = nfs.entries.get(&current) else { return }; 336 | for i in tag { 337 | let NormalForm::Sequence { nonterminals, ..
} = i else { continue }; 338 | for i in nonterminals { 339 | let Action::Subroutine(x) = i else { continue }; 340 | dfs(nfs, *x, visited); 341 | } 342 | } 343 | } 344 | 345 | let mut visited = HashSet::new(); 346 | dfs(nfs, Tag::new(parser.entrypoint), &mut visited); 347 | nfs.entries.retain(|k, _| visited.contains(k)); 348 | } 349 | -------------------------------------------------------------------------------- /pag-parser/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // <LICENSE-APACHE or https://www.apache.org/licenses/LICENSE-2.0> or the MIT 5 | // license <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | mod core_syntax; 10 | mod frontend; 11 | mod fusion; 12 | mod nf; 13 | mod type_system; 14 | mod utilities; 15 | 16 | use ariadne::{Color, Label, Report, ReportKind, Source}; 17 | use proc_macro2::TokenStream; 18 | use quote::format_ident; 19 | use typed_arena::Arena; 20 | 21 | use std::ops::Range; 22 | 23 | use core_syntax::TermArena; 24 | use frontend::{ 25 | lexical::construct_lexer_database, syntax::construct_parser, FrontendError, 26 | GrammarDefinitionError, 27 | }; 28 | use fusion::fusion_parser; 29 | use nf::{ 30 | fully_normalize, merge_inactive_rules, remove_unreachable_rules, semi_normalize, NormalForms, 31 | }; 32 | use type_system::TypeError; 33 | use utilities::unreachable_branch; 34 | 35 | pub enum Error<'src> { 36 | GrammarDefinitionError(GrammarDefinitionError<'src>), 37 | FrontendErrors(Vec<FrontendError<'src>>), 38 | TypeErrors(Vec<TypeError<'src>>), 39 | } 40 | 41 | impl<'src> From<Vec<FrontendError<'src>>> for Error<'src> { 42 | fn from(errors: Vec<FrontendError<'src>>) -> Self { 43 | Error::FrontendErrors(errors) 44 | } 45 | } 46 | 47 | impl<'src> From<Vec<TypeError<'src>>> for Error<'src> { 48 | fn from(errors: Vec<TypeError<'src>>) -> Self { 49 | Error::TypeErrors(errors) 50 | } 51 | } 52 | 53 | impl<'src> Error<'src> { 54 | pub fn report_stderr(&self, input_name: &str, input: &'src str) -> Result<(), std::io::Error> { 55 | let mut cache = (input_name, Source::from(input)); 56 | for i in self.to_reports(input_name) { 57 | i.eprint(&mut cache)?; 58 | } 59 | Ok(()) 60 | } 61 | 62 | pub fn report_stdout(&self, input_name: &str, input: &'src str) -> Result<(), std::io::Error> { 63 | let mut cache = (input_name, Source::from(input)); 64 | for i in self.to_reports(input_name) { 65 | i.print(&mut cache)?; 66 | } 67 | Ok(()) 68 | } 69 | 70 | pub fn to_reports<'a>(&self, input_name: &'a str) -> Vec<Report<'a, (&'a str, Range<usize>)>> { 71 | match self { 72 | Error::GrammarDefinitionError(e) => { 73 | use GrammarDefinitionError::*; 74 | vec![match e { 75 | SyntaxError(x) => { 76 | let span = match x.location { 77 | pest::error::InputLocation::Pos(x) => x..x+1, 78 | pest::error::InputLocation::Span((x, y)) => x..y, 79 | }; 80 | Report::build(ReportKind::Error, input_name, span.start) 81 | .with_message("Syntax error in grammar definition") 82 | .with_label(Label::new((input_name, span)) 83 | .with_message(format!("{}", x.variant.message())) 84 | .with_color(Color::Red)) 85 | .finish() 86 | }, 87 | FormatError { span, message } => { 88 | Report::build(ReportKind::Error, input_name, span.start()) 89 | .with_message("Format error in grammar definition") 90 | .with_label(Label::new((input_name, span.start()..span.end())) 91 | .with_message(format!("{}", message)) 92 | .with_color(Color::Red)) 93 | .finish() 94 | }, 95 | ParserLogicError(e) => { 96 | Report::build(ReportKind::Error, input_name, 0) 97 |
.with_message(format!("Internal logical error when parsing grammar definition {}", e)) 98 | .finish() 99 | }, 100 | UnexpectedEOI(e) => { 101 | Report::build(ReportKind::Error, input_name, 0) 102 | .with_message(format!("Internal logical error when parsing grammar definition, pest parser failed to give {}", e)) 103 | .finish() 104 | }, 105 | }] 106 | }, 107 | Error::FrontendErrors(errors) => errors 108 | .iter() 109 | .map(|e| { 110 | use FrontendError::*; 111 | match &e { 112 | // InternalLogicalError(span, msg) => { 113 | // Report::build(ReportKind::Error, input_name, span.start()) 114 | // .with_message("Internal logical error encountered") 115 | // .with_label(Label::new((input_name, span.start()..span.end())) 116 | // .with_message(msg) 117 | // .with_color(Color::Red)) 118 | // .finish() 119 | // }, 120 | MultipleDefinition(fst, snd) => { 121 | Report::build(ReportKind::Error, input_name, snd.start()) 122 | .with_message(format!("Multiple definition of {}", fst.as_str())) 123 | .with_label(Label::new((input_name, fst.start()..fst.end())) 124 | .with_message("first definition") 125 | .with_color(Color::Green)) 126 | .with_label(Label::new((input_name, snd.start()..snd.end())) 127 | .with_message("second definition") 128 | .with_color(Color::Blue)) 129 | .finish() 130 | }, 131 | UndefinedLexicalRuleReference(span) => { 132 | Report::build(ReportKind::Error, input_name, span.start()) 133 | .with_message("Undefined lexical rule reference") 134 | .with_label(Label::new((input_name, span.start()..span.end())) 135 | .with_message(format!("lexcical rule {} is undefined", span.as_str())) 136 | .with_color(Color::Red)) 137 | .finish() 138 | }, 139 | CyclicLexicalRuleReference(span) => { 140 | Report::build(ReportKind::Error, input_name, span.start()) 141 | .with_message("Cyclic lexical rule reference") 142 | .with_label(Label::new((input_name, span.start()..span.end())) 143 | .with_message("this reference causes cyclic dependency") 144 | .with_color(Color::Red)) 145 | .finish() 146 | }, 147 | UndefinedParserRuleReference(span) => { 148 | Report::build(ReportKind::Error, input_name, span.start()) 149 | .with_message("Undefined parser rule reference") 150 | .with_label(Label::new((input_name, span.start()..span.end())) 151 | .with_message(format!("parser rule {} is undefined", span.as_str())) 152 | .with_color(Color::Red)) 153 | .finish() 154 | }, 155 | MultipleSkippingRule(fst, snd) => { 156 | Report::build(ReportKind::Error, input_name, snd.start()) 157 | .with_message("Skipping lexical rule is already defined") 158 | .with_label(Label::new((input_name, fst.start()..fst.end())) 159 | .with_message("first definition") 160 | .with_color(Color::Green)) 161 | .with_label(Label::new((input_name, snd.start()..snd.end())) 162 | .with_message("second definition") 163 | .with_color(Color::Blue)) 164 | .finish() 165 | }, 166 | NullableToken(name, span) => { 167 | Report::build(ReportKind::Error, input_name, span.start()) 168 | .with_message("Nullable token detected") 169 | .with_label(Label::new((input_name, span.start()..span.end())) 170 | .with_message(format!("token {name} is nullable")) 171 | .with_color(Color::Red)) 172 | .finish() 173 | }, 174 | } 175 | }) 176 | .collect::>(), 177 | Error::TypeErrors(errors) => errors 178 | .iter() 179 | .map(|e| { 180 | use TypeError::*; 181 | match e { 182 | SequentialUniquenessViolation { lhs, rhs, total } => { 183 | Report::build(ReportKind::Error, input_name, total.start()) 184 | .with_message("When type checking a sequence of rules, the following rules are 
ambiguous") 185 | .with_label(Label::new((input_name, lhs.0.start()..lhs.0.end())) 186 | .with_message(format!("type info for left-hand side: nullable: {}, first set: {{{}}}, follow set: {{{}}}", 187 | lhs.1.nullable, lhs.1.first.iter().map(|x|x.name()).collect::>().join(", "), 188 | lhs.1.follow.iter().map(|x|x.name()).collect::>().join(", ") 189 | )) 190 | .with_color(Color::Green)) 191 | .with_label(Label::new((input_name, rhs.0.start()..rhs.0.end())) 192 | .with_message(format!("type info for right-hand side: nullable: {}, first set: {{{}}}, follow set: {{{}}}", 193 | rhs.1.nullable, rhs.1.first.iter().map(|x|x.name()).collect::>().join(", "), 194 | rhs.1.follow.iter().map(|x|x.name()).collect::>().join(", ") 195 | )) 196 | .with_color(Color::Blue)) 197 | .finish() 198 | }, 199 | DisjunctiveUniquenessViolation { lhs, rhs, total } => { 200 | Report::build(ReportKind::Error, input_name, total.start()) 201 | .with_message("When type checking an alternation of rules, the following rules are ambiguous") 202 | .with_label(Label::new((input_name, lhs.0.start()..lhs.0.end())) 203 | .with_message(format!("type info for left-hand side: nullable: {}, first set: {{{}}}, follow set: {{{}}}", 204 | lhs.1.nullable, lhs.1.first.iter().map(|x|x.name()).collect::>().join(", "), 205 | lhs.1.follow.iter().map(|x|x.name()).collect::>().join(", ") 206 | )) 207 | .with_color(Color::Green)) 208 | .with_label(Label::new((input_name, rhs.0.start()..rhs.0.end())) 209 | .with_message(format!("type info for right-hand side: nullable: {}, first set: {{{}}}, follow set: {{{}}}", 210 | rhs.1.nullable, rhs.1.first.iter().map(|x|x.name()).collect::>().join(", "), 211 | rhs.1.follow.iter().map(|x|x.name()).collect::>().join(", ") 212 | )) 213 | .with_color(Color::Blue)) 214 | .finish() 215 | }, 216 | UnguardedFixpoint(sym, span) => { 217 | Report::build(ReportKind::Error, input_name, span.start()) 218 | .with_message("Unguarded fixpoint") 219 | .with_label(Label::new((input_name, span.start()..span.end())) 220 | .with_message(format!("fixpoint rule {} is not guarded -- your grammar is left-recursive", sym)) 221 | .with_color(Color::Red)) 222 | .finish() 223 | }, 224 | UnresolvedReference(sym, span) => { 225 | Report::build(ReportKind::Error, input_name, span.start()) 226 | .with_message("Unresolved reference") 227 | .with_label(Label::new((input_name, span.start()..span.end())) 228 | .with_message(format!("cannot resolve parser rule {} within context -- did you forget to put recursive rule into fixpoint", sym)) 229 | .with_color(Color::Red)) 230 | .finish() 231 | }, 232 | } 233 | }) 234 | .collect::>(), 235 | } 236 | } 237 | } 238 | 239 | pub fn generate_parser(input: &str) -> Result { 240 | use frontend::SurfaceSyntaxTree::Grammar; 241 | 242 | let sst = frontend::parse(input)?; 243 | let Grammar { lexer, parser } = &sst.node else { 244 | unreachable_branch!("the entrypoint of sst can only be Grammar") 245 | }; 246 | let lexer_database = construct_lexer_database(lexer)?; 247 | lexer_database.nullability_check()?; 248 | let term_arena = TermArena::new(); 249 | let mut parser = construct_parser(&term_arena, lexer_database, parser)?; 250 | parser.infer_fixpoints(); 251 | let type_errs = parser.type_check(); 252 | if !type_errs.is_empty() { 253 | return Err(Error::TypeErrors(type_errs)); 254 | } 255 | let nf_arena = Arena::new(); 256 | let mut nfs = NormalForms::new(); 257 | for (symbol, rule) in parser.bindings.iter() { 258 | semi_normalize( 259 | &rule.term.node, 260 | *symbol, 261 | &nf_arena, 262 | &mut nfs, 263 | &mut 
0, 264 | &parser, 265 | ); 266 | } 267 | fully_normalize(&nf_arena, &mut nfs); 268 | merge_inactive_rules(&mut nfs, &parser, &nf_arena); 269 | remove_unreachable_rules(&mut nfs, &parser); 270 | let parser_routines = fusion_parser(&nfs, &parser); 271 | let entrypoint = format_ident!("parse_{}", parser.entrypoint.name()); 272 | Ok(quote::quote! { 273 | #![allow( 274 | dead_code, 275 | non_camel_case_types, 276 | unused_variables, 277 | unused_mut, 278 | unreachable_code, 279 | unused_assignments, 280 | clippy::single_match, 281 | clippy::never_loop, 282 | clippy::match_single_binding, 283 | )] 284 | #parser_routines 285 | pub fn parse(input: &str) -> Result { 286 | #entrypoint(input, 0) 287 | } 288 | }) 289 | } 290 | 291 | #[cfg(test)] 292 | mod tests; 293 | --------------------------------------------------------------------------------
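
For orientation, here is a minimal sketch of how a build script could drive the `generate_parser` entry point from `pag-parser/src/lib.rs`, with failures rendered through `Error::report_stderr` (the ariadne-based diagnostics defined above). This is an illustrative assumption, not the actual pag-compiler implementation: the helper name `compile_grammar` and the step that writes the `TokenStream` out verbatim are invented for the example.

// Sketch only: a hypothetical build-script driver for pag_parser::generate_parser.
// The name `compile_grammar` and the verbatim TokenStream dump are assumptions;
// the real pag-compiler crate may differ.
fn compile_grammar(grammar_path: &str, output_path: &str) {
    let input = std::fs::read_to_string(grammar_path).expect("cannot read grammar file");
    match pag_parser::generate_parser(&input) {
        Ok(tokens) => {
            // proc_macro2::TokenStream implements Display, so its text form is valid
            // Rust source; a real driver would likely pretty-print it before writing.
            std::fs::write(output_path, tokens.to_string()).expect("cannot write generated parser");
        }
        Err(err) => {
            // Render ariadne diagnostics whose spans point into the .pag source.
            err.report_stderr(grammar_path, &input)
                .expect("failed to render diagnostics");
            panic!("failed to compile grammar {grammar_path}");
        }
    }
}

The same `Error` value also exposes `report_stdout` and `to_reports`, so a caller that wants to collect or redirect the diagnostics instead of printing them to stderr can do so without changing the pipeline above.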