├── benches ├── csv │ ├── src │ │ ├── .gitignore │ │ └── lib.rs │ ├── build.rs │ ├── benches │ │ ├── csv.pest │ │ └── benchmarks.rs │ ├── csv.pag │ └── Cargo.toml └── json │ ├── src │ ├── .gitignore │ └── lib.rs │ ├── build.rs │ ├── Cargo.toml │ ├── benches │ ├── json.pest │ ├── json.lalrpop │ ├── json_logos.lalrpop │ ├── lalr_def.rs │ └── benchmarks.rs │ └── json.pag ├── tests ├── arith-expr │ ├── src │ │ ├── .gitignore │ │ └── lib.rs │ ├── build.rs │ ├── Cargo.toml │ └── arith.pag ├── tokenizer │ ├── src │ │ ├── .gitignore │ │ ├── lib.rs │ │ ├── generated.rs │ │ ├── length_differential.rs │ │ ├── common_prefix.rs │ │ ├── tail_differential.rs │ │ └── comment_and_string.rs │ ├── Cargo.toml │ └── build.rs └── sexpr-calculator │ ├── src │ ├── .gitignore │ └── lib.rs │ ├── build.rs │ ├── Cargo.toml │ └── sexpr.pag ├── rust-toolchain.toml ├── .gitignore ├── .github ├── images │ └── hermit-crab.png └── workflows │ └── build.yaml ├── pag-parser ├── src │ ├── tests │ │ ├── failure │ │ │ ├── err_cyclic_token.pag │ │ │ ├── err_nullable_token.pag │ │ │ ├── err_sequence_ambiguity.pag │ │ │ ├── err_undefined_token_in_lexer.pag │ │ │ ├── err_null_sequence_ambiguity.pag │ │ │ ├── err_undefined_grammar_rule.pag │ │ │ ├── err_undefined_token_in_parser.pag │ │ │ ├── err_unguarded_fixpoint.pag │ │ │ ├── err_alternation_ambiguity.pag │ │ │ ├── err_multiple_skips.pag │ │ │ ├── err_multiple_definitions_in_lexer.pag │ │ │ ├── err_multiple_definitions_in_parser.pag │ │ │ └── mod.rs │ │ └── mod.rs │ ├── type_system │ │ ├── mod.rs │ │ ├── binding_proxy.rs │ │ ├── context.rs │ │ ├── fixpoint.rs │ │ └── type_check.rs │ ├── frontend │ │ ├── example.pag │ │ ├── grammar.pest │ │ ├── syntax.rs │ │ ├── lexical.rs │ │ └── unicode.rs │ ├── core_syntax.rs │ ├── utilities.rs │ ├── nf.rs │ └── lib.rs └── Cargo.toml ├── rustfmt.toml ├── pag-lexer ├── src │ ├── utilities.rs │ ├── derivative.rs │ ├── congruence.rs │ ├── regex_tree.rs │ ├── lib.rs │ ├── lookahead.rs │ ├── normalization.rs │ ├── intervals.rs │ └── vector.rs └── Cargo.toml ├── pag-compiler ├── Cargo.toml └── src │ └── lib.rs ├── Cargo.toml ├── LICENSE-MIT ├── shell.nix ├── README.md └── LICENSE-APACHE /benches/csv/src/.gitignore: -------------------------------------------------------------------------------- 1 | parser.rs -------------------------------------------------------------------------------- /benches/json/src/.gitignore: -------------------------------------------------------------------------------- 1 | parser.rs 2 | -------------------------------------------------------------------------------- /tests/arith-expr/src/.gitignore: -------------------------------------------------------------------------------- 1 | parser.rs -------------------------------------------------------------------------------- /tests/tokenizer/src/.gitignore: -------------------------------------------------------------------------------- 1 | generated -------------------------------------------------------------------------------- /tests/sexpr-calculator/src/.gitignore: -------------------------------------------------------------------------------- 1 | parser.rs -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "nightly" 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 
/Cargo.lock 3 | .idea/ 4 | .vscode/ 5 | flamegraph.svg 6 | -------------------------------------------------------------------------------- /.github/images/hermit-crab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SchrodingerZhu/paguroidea/HEAD/.github/images/hermit-crab.png -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_cyclic_token.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a' ~ A; 3 | } 4 | 5 | parser test { 6 | active test = _; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_nullable_token.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'*; 3 | } 4 | 5 | parser test { 6 | active test = A; 7 | } 8 | -------------------------------------------------------------------------------- /benches/csv/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | pag_compiler::compile("csv.pag", "src/parser.rs"); 3 | println!("cargo:rerun-if-changed=csv.pag"); 4 | } 5 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_sequence_ambiguity.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'; 3 | } 4 | 5 | parser test { 6 | active test = A+ ~ A; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_undefined_token_in_lexer.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = C; 3 | } 4 | 5 | parser test { 6 | active test = _; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_null_sequence_ambiguity.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'; 3 | } 4 | 5 | parser test { 6 | active test = _ ~ A; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_undefined_grammar_rule.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'; 3 | } 4 | 5 | parser test { 6 | active test = test2; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_undefined_token_in_parser.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'; 3 | } 4 | 5 | parser test { 6 | active test = AA; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_unguarded_fixpoint.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'; 3 | } 4 | 5 | parser test { 6 | active test = test ~ A; 7 | } 8 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_alternation_ambiguity.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = 'a'; 3 | } 4 | 5 | parser test { 6 | active test = A+ | A ~ test; 7 | } 8 | -------------------------------------------------------------------------------- 
/tests/arith-expr/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | pag_compiler::compile("arith.pag", "src/parser.rs"); 3 | println!("cargo:rerun-if-changed=arith.pag"); 4 | } 5 | -------------------------------------------------------------------------------- /tests/sexpr-calculator/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | pag_compiler::compile("sexpr.pag", "src/parser.rs"); 3 | println!("cargo:rerun-if-changed=sexpr.pag"); 4 | } 5 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_multiple_skips.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | skip = "SKIP"; 3 | skip = "ANOTHER_SKIP"; 4 | } 5 | 6 | parser test { 7 | active test = _; 8 | } 9 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_multiple_definitions_in_lexer.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = '0'; 3 | A = '1'; 4 | } 5 | 6 | parser test { 7 | active test = _; 8 | } 9 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/err_multiple_definitions_in_parser.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | A = '0'; 3 | } 4 | 5 | parser test { 6 | active test = A; 7 | active test = A; 8 | } 9 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | ignore = [ 2 | "/tests/arith-expr/src/parser.rs", 3 | "/tests/sexpr-calculator/src/parser.rs", 4 | "/tests/tokenizer/src/generated/*.rs", 5 | "/benches/csv/src/parser.rs", 6 | "/benches/json/src/parser.rs", 7 | ] 8 | -------------------------------------------------------------------------------- /tests/tokenizer/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(portable_simd)] 2 | #![feature(core_intrinsics)] 3 | #![feature(array_chunks)] 4 | mod comment_and_string; 5 | mod common_prefix; 6 | mod generated; 7 | mod length_differential; 8 | mod tail_differential; 9 | -------------------------------------------------------------------------------- /benches/csv/benches/csv.pest: -------------------------------------------------------------------------------- 1 | text = _{ (!("," | "\"" | "\r" | "\n") ~ ANY)+ } 2 | string = _{ "\"" ~ ( "\"\"" | !"\"" ~ ANY)* ~ "\"" } 3 | crlf = _{ "\r"? 
~ "\n" } 4 | 5 | field = { text | string } 6 | record = { field ~ ("," ~ field)* ~ crlf } 7 | csv = { record+ } -------------------------------------------------------------------------------- /benches/json/build.rs: -------------------------------------------------------------------------------- 1 | extern crate lalrpop; 2 | 3 | fn main() { 4 | lalrpop::Configuration::new() 5 | .process_dir("benches/") 6 | .unwrap(); 7 | pag_compiler::compile("json.pag", "src/parser.rs"); 8 | println!("cargo:rerun-if-changed=json.pag"); 9 | } 10 | -------------------------------------------------------------------------------- /tests/arith-expr/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "arith-expr" 3 | version = "0.1.0" 4 | edition = "2021" 5 | build = "build.rs" 6 | publish = false 7 | 8 | [dependencies] 9 | rand = { version = "0.8" } 10 | 11 | [build-dependencies] 12 | pag-compiler = { path = "../../pag-compiler" } 13 | -------------------------------------------------------------------------------- /tests/sexpr-calculator/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sexpr-calculator" 3 | version = "0.1.0" 4 | edition = "2021" 5 | build = "build.rs" 6 | publish = false 7 | 8 | [dependencies] 9 | rand = { version = "0.8" } 10 | 11 | [build-dependencies] 12 | pag-compiler = { path = "../../pag-compiler" } 13 | -------------------------------------------------------------------------------- /tests/tokenizer/src/generated.rs: -------------------------------------------------------------------------------- 1 | #[path = "generated/comment_and_string.rs"] 2 | pub(crate) mod comment_and_string; 3 | #[path = "generated/common_prefix.rs"] 4 | pub(crate) mod common_prefix; 5 | #[path = "generated/length_differential.rs"] 6 | pub(crate) mod length_differential; 7 | #[path = "generated/tail_differential.rs"] 8 | pub(crate) mod tail_differential; 9 | -------------------------------------------------------------------------------- /tests/tokenizer/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tokenizer" 3 | version = "0.1.0" 4 | edition = "2021" 5 | build = "build.rs" 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | rand = { version = "0.8" } 10 | 11 | [build-dependencies] 12 | tempfile = "3.6.0" 13 | pag-compiler = { path = "../../pag-compiler" } 14 | -------------------------------------------------------------------------------- /benches/csv/csv.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | TEXT = (!('"' | '\r' | '\n' | ','))+; 3 | STRING = '"' ~ ('"' ~ '"' | !'"')* ~ '"'; 4 | CRLF = '\r'? 
~ '\n'; 5 | COMMA = ','; 6 | } 7 | 8 | parser csv { 9 | active csv 10 | = record+; 11 | 12 | active field 13 | = TEXT | STRING; 14 | 15 | active record 16 | = field ~ (COMMA ~ field)* ~ CRLF; 17 | } 18 | -------------------------------------------------------------------------------- /pag-lexer/src/utilities.rs: -------------------------------------------------------------------------------- 1 | pub fn dbg_sort<T, U, F, K>(data: T, _f: F) -> impl Iterator<Item = U> 2 | where 3 | T: IntoIterator<Item = U>, 4 | F: FnMut(&U) -> K, 5 | K: Ord, 6 | { 7 | #[cfg(not(debug_assertions))] 8 | { 9 | data.into_iter() 10 | } 11 | #[cfg(debug_assertions)] 12 | { 13 | let mut vec = Vec::from_iter(data.into_iter()); 14 | vec.sort_unstable_by_key(_f); 15 | vec.into_iter() 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /tests/sexpr-calculator/sexpr.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | DIGIT = '0' .. '9'; 3 | 4 | LPAREN = '('; 5 | RPAREN = ')'; 6 | PLUS = '+' | '加'; 7 | MULT = '*' | '乘'; 8 | INT = DIGIT+; 9 | 10 | skip = (' ' | '\t' | '\n' | '\r')+; 11 | } 12 | 13 | parser sexpr { 14 | active compound 15 | = LPAREN ~ op ~ (compound | int)* ~ RPAREN; 16 | 17 | active op 18 | = PLUS | MULT; 19 | 20 | active int 21 | = INT; 22 | 23 | active sexpr 24 | = compound | int; 25 | } 26 | -------------------------------------------------------------------------------- /pag-parser/src/type_system/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | mod binding_proxy; 10 | mod context; 11 | mod fixpoint; 12 | mod type_check; 13 | 14 | pub use fixpoint::infer_fixpoints; 15 | pub use type_check::{type_check, Type, TypeError}; 16 | -------------------------------------------------------------------------------- /tests/arith-expr/arith.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | DIGIT = '0' .. '9'; 3 | 4 | LPAREN = '('; 5 | RPAREN = ')'; 6 | PLUS = '+'; 7 | MULT = '*'; 8 | INT = DIGIT+; 9 | SPECIAL = '\u{FF}' ..
'\u{D7FF}'; 10 | 11 | skip = (' ' | '\t' | '\n' | '\r')+; 12 | } 13 | 14 | parser expr { 15 | active expr 16 | = mult ~ (PLUS ~ mult)*; 17 | 18 | active mult 19 | = primary ~ (MULT ~ primary)*; 20 | 21 | silent primary 22 | = special | int | LPAREN ~ expr ~ RPAREN; 23 | 24 | active int 25 | = INT; 26 | 27 | active special 28 | = SPECIAL; 29 | } 30 | -------------------------------------------------------------------------------- /benches/csv/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pag-csv" 3 | version = "0.1.0" 4 | edition = "2021" 5 | build = "build.rs" 6 | publish = false 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | [dependencies] 10 | rand = { version = "0.8" } 11 | snmalloc-rs = { version = "0.3", features = ["build_cc"] } 12 | 13 | [build-dependencies] 14 | pag-compiler = { path = "../../pag-compiler" } 15 | 16 | [dev-dependencies] 17 | csv = { version = "1" } 18 | criterion = { version = "0.4", features = ["html_reports"] } 19 | pest = { version = "2.5.7", features = [ "std", "memchr" ] } 20 | pest_derive = "2.5.7" 21 | 22 | [[bench]] 23 | name = "benchmarks" 24 | harness = false 25 | -------------------------------------------------------------------------------- /benches/json/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pag-json" 3 | version = "0.1.0" 4 | edition = "2021" 5 | build = "build.rs" 6 | publish = false 7 | autobenches = false 8 | 9 | [dependencies] 10 | rand = { version = "0.8" } 11 | serde_json = "1.0" 12 | 13 | [build-dependencies] 14 | pag-compiler = { path = "../../pag-compiler" } 15 | lalrpop = "0.20.0" 16 | 17 | [dev-dependencies] 18 | criterion = { version = "0.4", features = ["html_reports"] } 19 | snmalloc-rs = { version = "0.3", features = ["build_cc"] } 20 | pest = { version = "2.5.7", features = [ "std", "memchr" ] } 21 | pest_derive = "2.5.7" 22 | lalrpop-util = { version = "0.20.0", features = ["lexer", "unicode"] } 23 | logos = "0.13.0" 24 | 25 | [[bench]] 26 | name = "benchmarks" 27 | harness = false 28 | -------------------------------------------------------------------------------- /benches/json/benches/json.pest: -------------------------------------------------------------------------------- 1 | WHITESPACE = _{ (" " | "\t" | "\r" | "\n")+ } 2 | escape = _{ "\"" | "\\" | "/" | "b" | "f" | "n" | "r" | "t" } 3 | non_zero = _{'1' .. '9'} 4 | digit = _{'0' .. '9'} 5 | hex_digit = _{ '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' } 6 | string = @{ "\"" ~ ( (!("\"" | "\\") ~ ANY) | "\\" ~ (escape | ("u" ~ hex_digit ~ hex_digit ~ hex_digit ~ hex_digit )) )* ~ "\"" } 7 | number = @{ "-"? ~ ("0" | non_zero ~ digit*) ~ ("." ~ digit+)? ~ (("e" | "E") ~ ("+" | "-")? ~ digit+)? } 8 | lit_true = @{ "true" } 9 | lit_false = @{ "false" } 10 | lit_null = @{ "null" } 11 | attribute = { string ~ ":" ~ value } 12 | object = { "{" ~ (attribute ~ ("," ~ attribute)*)? ~ "}" } 13 | array = { "[" ~ (value ~ ("," ~ value)*)? 
~ "]" } 14 | value = _{string | number | array | object | lit_true | lit_false | lit_null} 15 | json = { SOI ~ value ~ EOI } -------------------------------------------------------------------------------- /pag-parser/src/frontend/example.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | // definition in lexer is not a real token, it is just a way to define a set of characters 3 | 4 | BLANK = ' '; 5 | DIGIT = '0' .. '9'; 6 | ALPHA = 'a' .. 'z' | 'A' .. 'Z'; 7 | 8 | LPAREN = '('; 9 | RPAREN = ')'; 10 | ATOM = ALPHA ~ (ALPHA | DIGIT)*; 11 | 12 | skip = (BLANK | '\t' | '\n' | '\r')+; 13 | } 14 | 15 | // parser must have a entry point 16 | parser sexpr { 17 | // definition in parser can be a real grammer rule. 18 | 19 | active compound 20 | = LPAREN ~ sexprs ~ RPAREN; 21 | 22 | // just for testing 23 | active atom 24 | = real_atom; 25 | 26 | silent real_atom 27 | = ATOM; 28 | 29 | silent sexprs 30 | = (compound | atom) *; 31 | 32 | active sexpr 33 | = compound | atom; 34 | 35 | active unreachable = unreachable; 36 | } 37 | -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | push: 4 | branches: [ "main" ] 5 | pull_request: 6 | branches: [ "main" ] 7 | env: 8 | CARGO_TERM_COLOR: always 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Initialize Rustup 15 | run: | 16 | rustup toolchain install nightly --component rustfmt clippy --profile minimal --force 17 | rustup override set nightly 18 | - name: Run build 19 | run: cargo build --verbose 20 | - name: Run rustfmt 21 | run: cargo fmt --all -- --check 22 | - name: Run clippy 23 | run: cargo clippy --all 24 | - name: Build 25 | run: cargo build --release --verbose 26 | - name: Run debug tests 27 | run: cargo test --verbose 28 | - name: Run release tests 29 | run: cargo test --verbose --release 30 | -------------------------------------------------------------------------------- /pag-parser/src/tests/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use ariadne::Source; 10 | use strip_ansi_escapes::Writer; 11 | mod failure; 12 | 13 | fn write_error, N: AsRef>(input: S, name: N) -> String { 14 | let mut buffer = Vec::::new(); 15 | { 16 | let result = crate::generate_parser(input.as_ref()).unwrap_err(); 17 | let reports = result.to_reports(name.as_ref()); 18 | let mut cache = (name.as_ref(), Source::from(input.as_ref())); 19 | let mut writer = Writer::new(&mut buffer); 20 | for i in reports { 21 | i.write(&mut cache, &mut writer).unwrap(); 22 | } 23 | } 24 | String::from_utf8(buffer).unwrap() 25 | } 26 | -------------------------------------------------------------------------------- /pag-lexer/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Paguroidea Developers 2 | # 3 | # Licensed under the Apache License, Version 2.0 4 | # or the MIT 5 | # license , at your 6 | # option. 
All files in the project carrying such notice may not be copied, 7 | # modified, or distributed except according to those terms. 8 | 9 | [package] 10 | name = "pag-lexer" 11 | keywords = ["lexer", "cfg", "grammar", "regex"] 12 | description = "Parser-lexer fusion generator (derivative lexer)" 13 | documentation = "https://docs.rs/pag-lexer/" 14 | 15 | version.workspace = true 16 | edition.workspace = true 17 | license.workspace = true 18 | exclude.workspace = true 19 | categories.workspace = true 20 | repository.workspace = true 21 | rust-version.workspace = true 22 | authors.workspace = true 23 | readme.workspace = true 24 | 25 | [dependencies] 26 | quote = "1.0.26" 27 | proc-macro2 = "1.0" 28 | smallvec = { version = "1", features = ["union"] } 29 | 30 | [dev-dependencies] 31 | syn = { version = "2.0", features = ["full"] } 32 | -------------------------------------------------------------------------------- /pag-compiler/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Paguroidea Developers 2 | # 3 | # Licensed under the Apache License, Version 2.0 4 | # or the MIT 5 | # license , at your 6 | # option. All files in the project carrying such notice may not be copied, 7 | # modified, or distributed except according to those terms. 8 | 9 | [package] 10 | name = "pag-compiler" 11 | keywords = ["parser", "cfg", "grammar"] 12 | description = "Parser-lexer fusion generator (compiler interface)" 13 | documentation = "https://docs.rs/pag-compiler/" 14 | 15 | version.workspace = true 16 | edition.workspace = true 17 | license.workspace = true 18 | exclude.workspace = true 19 | categories.workspace = true 20 | repository.workspace = true 21 | rust-version.workspace = true 22 | authors.workspace = true 23 | readme.workspace = true 24 | 25 | [dependencies] 26 | pag-parser = { version = "0.1.0-alpha.1", path = "../pag-parser" } 27 | syn = { version = "2.0", features = ["full"] } 28 | prettyplease = { version = "0.2", features = ["verbatim"] } 29 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Paguroidea Developers 2 | # 3 | # Licensed under the Apache License, Version 2.0 4 | # or the MIT 5 | # license , at your 6 | # option. All files in the project carrying such notice may not be copied, 7 | # modified, or distributed except according to those terms. 
8 | 9 | [workspace] 10 | members = [ 11 | "pag-lexer", 12 | "pag-parser", 13 | "pag-compiler", 14 | "tests/sexpr-calculator", 15 | "tests/arith-expr", 16 | "tests/tokenizer", 17 | "benches/csv", 18 | "benches/json", 19 | ] 20 | resolver = "2" 21 | 22 | [workspace.package] 23 | version = "0.1.0-alpha.1" 24 | edition = "2021" 25 | license = "MIT OR Apache-2.0" 26 | exclude = [".github/*"] 27 | categories = ["parsing"] 28 | repository = "https://github.com/SchrodingerZhu/paguroidea" 29 | rust-version = "1.71.0" 30 | authors = [ 31 | "Schrodinger ZHU Yifan ", 32 | "QuarticCat ", 33 | ] 34 | readme = "README.md" 35 | 36 | [profile.release] 37 | debug = true 38 | lto = true 39 | -------------------------------------------------------------------------------- /benches/json/benches/json.lalrpop: -------------------------------------------------------------------------------- 1 | use crate::Pvalue; 2 | 3 | grammar; 4 | 5 | Comma: Vec = { 6 | > "," => { es.push(e); es }, 7 | => vec![e], 8 | } 9 | 10 | pub Json = Value; 11 | 12 | Value: Pvalue<'input> = { 13 | "true" => Pvalue::<'input>::Bool(true), 14 | "false" => Pvalue::<'input>::Bool(false), 15 | "null" => Pvalue::<'input>::Null, 16 | => Pvalue::<'input>::String(&s[1..s.len() - 1]), 17 | => Pvalue::<'input>::Number(n), 18 | => Pvalue::<'input>::Array(a), 19 | => Pvalue::<'input>::Object(o), 20 | } 21 | 22 | Attribute: (&'input str, Pvalue<'input>) = { 23 | ":" => (s, v), 24 | } 25 | 26 | Object: Vec<(&'input str, Pvalue<'input>)> = { 27 | "{" > "}" => attr, 28 | "{" "}" => vec![], 29 | } 30 | 31 | Array: Vec> = { 32 | "[" > "]" => a, 33 | "[" "]" => vec![], 34 | } 35 | 36 | Number = r"-?(0|[1-9][0-9]*)((\.[0-9]+)?)([eE][+-]?[0-9]+)?"; 37 | 38 | String = r#""([^\\"]|\\(["\\/bfnrt]|u[0-9a-fA-F]{4}))*""#; 39 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Paguroidea Developers 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /benches/csv/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(portable_simd)] 2 | #![feature(core_intrinsics)] 3 | #![feature(array_chunks)] 4 | mod parser; 5 | 6 | pub use parser::parse; 7 | use rand::prelude::StdRng; 8 | use rand::{Rng, SeedableRng}; 9 | 10 | pub fn generate_csv(line: usize, width: usize) -> String { 11 | let mut random = std::env::var("PAG_RANDOM_SEED") 12 | .ok() 13 | .and_then(|x| x.parse().ok()) 14 | .map_or_else(StdRng::from_entropy, StdRng::seed_from_u64); 15 | let mut buffer = String::new(); 16 | for _ in 0..line { 17 | for i in 0..width { 18 | if random.gen::() < 0.5 { 19 | buffer.push_str(&format!("\"{}\"", random.gen::())); 20 | } else { 21 | buffer.push_str(&format!("{}", random.gen::())); 22 | } 23 | 24 | if i != width - 1 { 25 | buffer.push(','); 26 | } 27 | } 28 | buffer.push_str("\r\n"); 29 | } 30 | buffer 31 | } 32 | 33 | #[test] 34 | fn test_csv() { 35 | let data = generate_csv(500, 500); 36 | let parsed = parser::parse(&data).unwrap(); 37 | assert_eq!(parsed.len(), data.len()); 38 | } 39 | -------------------------------------------------------------------------------- /shell.nix: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2023 Paguroidea Developers 2 | * 3 | * Licensed under the Apache License, Version 2.0 4 | * or the MIT 5 | * license , at your 6 | * option. All files in the project carrying such notice may not be copied, 7 | * modified, or distributed except according to those terms. 8 | */ 9 | 10 | { pkgs ? import {} }: 11 | pkgs.gcc.stdenv.mkDerivation { 12 | name = "paguroidea"; 13 | buildInputs = with pkgs; [ 14 | openssl 15 | pkg-config 16 | cmake 17 | gcc 18 | autoconf 19 | automake 20 | ninja 21 | gnumake 22 | zlib 23 | llvmPackages_latest.clang 24 | llvmPackages_latest.libclang 25 | llvmPackages_latest.libclang.lib 26 | llvmPackages_latest.llvm 27 | llvmPackages_latest.lld 28 | ]; 29 | shellHook = '' 30 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${pkgs.llvmPackages_latest.libclang.lib}/lib 31 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${pkgs.stdenv.cc.cc.lib}/lib 32 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${pkgs.zlib}/lib 33 | ''; 34 | } 35 | -------------------------------------------------------------------------------- /benches/json/json.pag: -------------------------------------------------------------------------------- 1 | lexer { 2 | DIGIT = '0'..'9'; 3 | NONZERO = '1'..'9'; 4 | HEX_DIGIT = '0' .. '9' | 'a' .. 'f' | 'A' .. 'F'; 5 | ESCAPED = '"' | '\\' | '/' | 'b' | 'f' | 'n' | 'r' | 't'; 6 | 7 | LBRACKET = '{'; 8 | RBRACKET = '}'; 9 | COMMA = ','; 10 | COLON = ':'; 11 | LSQUARE = '['; 12 | RSQUARE = ']'; 13 | TRUE = "true"; 14 | FALSE = "false"; 15 | NULL = "null"; 16 | STRING = '"' ~ ( !('\\' | '"') | '\\' ~ (ESCAPED | 'u' ~ HEX_DIGIT ~ HEX_DIGIT ~ HEX_DIGIT ~ HEX_DIGIT) )* ~ '"'; 17 | NUMBER = '-'? ~ ('0' | NONZERO ~ DIGIT*) ~ ('.' ~ DIGIT+)? ~ (('e' | 'E') ~ ('+' | '-')? ~ DIGIT+)?; 18 | 19 | skip = ('\n' | '\r' | '\t' | ' ')+; 20 | } 21 | 22 | parser json { 23 | active attribute = 24 | string ~ COLON ~ value; 25 | 26 | active string = STRING; 27 | active number = NUMBER; 28 | active lit_true = TRUE; 29 | active lit_false = FALSE; 30 | active lit_null = NULL; 31 | 32 | active object = 33 | LBRACKET ~ (attribute ~ (COMMA ~ attribute)*)? ~ RBRACKET; 34 | 35 | active array = 36 | LSQUARE ~ (value ~ (COMMA ~ value)*)? 
~ RSQUARE; 37 | 38 | silent value = string | number | array | object | lit_true | lit_false | lit_null; 39 | 40 | active json = value; 41 | } 42 | -------------------------------------------------------------------------------- /pag-parser/Cargo.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Paguroidea Developers 2 | # 3 | # Licensed under the Apache License, Version 2.0 4 | # or the MIT 5 | # license , at your 6 | # option. All files in the project carrying such notice may not be copied, 7 | # modified, or distributed except according to those terms. 8 | 9 | [package] 10 | name = "pag-parser" 11 | keywords = ["parser", "cfg", "grammar"] 12 | description = "Parser-lexer fusion generator (parser generator)" 13 | documentation = "https://docs.rs/pag-parser/" 14 | 15 | version.workspace = true 16 | edition.workspace = true 17 | license.workspace = true 18 | exclude.workspace = true 19 | categories.workspace = true 20 | repository.workspace = true 21 | rust-version.workspace = true 22 | authors.workspace = true 23 | readme.workspace = true 24 | 25 | [dependencies] 26 | pest = { version = "2.5.7", features = ["std", "memchr"] } 27 | pest_derive = "2.5.7" 28 | smallvec = { version = "1", features = ["union"] } 29 | lazy_static = "1" 30 | pag-lexer = { version = "0.1.0-alpha.1", path = "../pag-lexer" } 31 | typed-arena = "2.0.2" 32 | quote = "1.0.26" 33 | proc-macro2 = "1.0" 34 | ariadne = { version = "0.3", features = ["auto-color"] } 35 | 36 | [dev-dependencies] 37 | strip-ansi-escapes = "0.1.1" 38 | -------------------------------------------------------------------------------- /benches/json/benches/json_logos.lalrpop: -------------------------------------------------------------------------------- 1 | use crate::Token; 2 | use crate::Pvalue; 3 | 4 | grammar<'a>; 5 | 6 | extern { 7 | type Location = usize; 8 | enum Token<'a> { 9 | "true" => Token::True, 10 | "false" => Token::False, 11 | "null" => Token::Null, 12 | "," => Token::Comma, 13 | ":" => Token::Colon, 14 | "{" => Token::LBrace, 15 | "}" => Token::RBrace, 16 | "[" => Token::LBracket, 17 | "]" => Token::RBracket, 18 | "number" => Token::Number(<&'a str>), 19 | "string" => Token::String(<&'a str>), 20 | } 21 | } 22 | 23 | Comma: Vec = { 24 | > "," => { es.push(e); es }, 25 | => vec![e], 26 | } 27 | 28 | pub Json = Value; 29 | 30 | Value: Pvalue<'a> = { 31 | "true" => Pvalue::<'a>::Bool(true), 32 | "false" => Pvalue::<'a>::Bool(false), 33 | "null" => Pvalue::<'a>::Null, 34 | => Pvalue::<'a>::String(&s[1..s.len() - 1]), 35 | => Pvalue::<'a>::Number(n), 36 | => Pvalue::<'a>::Array(a), 37 | => Pvalue::<'a>::Object(o), 38 | } 39 | 40 | Attribute: (&'a str, Pvalue<'a>) = { 41 | ":" => (s, v), 42 | } 43 | 44 | Object: Vec<(&'a str, Pvalue<'a>)> = { 45 | "{" > "}" => attr, 46 | "{" "}" => vec![], 47 | } 48 | 49 | Array: Vec> = { 50 | "[" > "]" => a, 51 | "[" "]" => vec![], 52 | } 53 | -------------------------------------------------------------------------------- /pag-parser/src/type_system/binding_proxy.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
8 | use std::collections::HashSet; 9 | 10 | use crate::{ 11 | core_syntax::{BindingContext, TermPtr}, 12 | utilities::Symbol, 13 | }; 14 | 15 | pub struct BindingProxy<'src, 'a> { 16 | binding: &'a BindingContext<'src, 'a>, 17 | hiding: HashSet<Symbol<'src>>, 18 | } 19 | 20 | impl<'src, 'a> BindingProxy<'src, 'a> { 21 | pub fn proxy(binding: &'a BindingContext<'src, 'a>) -> Self { 22 | BindingProxy { 23 | binding, 24 | hiding: HashSet::new(), 25 | } 26 | } 27 | pub fn lookup(&self, sym: &Symbol<'src>) -> Option<TermPtr<'src, 'a>> { 28 | if self.hiding.contains(sym) { 29 | return None; 30 | } 31 | self.binding.get(sym).map(|x| x.term) 32 | } 33 | pub fn with_hiding<F, R>(&mut self, sym: Symbol<'src>, f: F) -> R 34 | where 35 | F: FnOnce(&mut Self) -> R, 36 | { 37 | let hidden_at_this_layer = self.hiding.insert(sym); 38 | let result = f(self); 39 | if hidden_at_this_layer { 40 | self.hiding.remove(&sym); 41 | } 42 | result 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /benches/csv/benches/benchmarks.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use csv::StringRecord; 3 | use pag_csv::{generate_csv, parse}; 4 | 5 | mod pest_csv { 6 | use pest_derive::Parser; 7 | 8 | #[derive(Parser)] 9 | #[grammar = "benches/csv.pest"] 10 | pub struct CSVParser; 11 | } 12 | 13 | #[global_allocator] 14 | static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; 15 | 16 | fn csv_read_all(input: &str) -> Vec<StringRecord> { 17 | let mut records = Vec::new(); 18 | csv::Reader::from_reader(input.as_bytes()) 19 | .into_records() 20 | .for_each(|r| records.push(r.unwrap())); 21 | records 22 | } 23 | 24 | fn criterion_benchmark(c: &mut Criterion) { 25 | let mut g = c.benchmark_group("throughput"); 26 | let data = generate_csv(1000, 20); 27 | g.throughput(criterion::Throughput::Bytes(data.bytes().len() as u64)); 28 | g.bench_function("pag", |b| { 29 | b.iter(|| { 30 | assert_eq!(parse(&data).unwrap().children().len(), 1000); 31 | }) 32 | }); 33 | g.bench_function("csv", |b| { 34 | b.iter(|| { 35 | assert_eq!(csv_read_all(&data).len(), 999); 36 | }) 37 | }); 38 | g.bench_function("pest", |b| { 39 | b.iter(|| { 40 | use pest::Parser; 41 | let pairs = pest_csv::CSVParser::parse(pest_csv::Rule::csv, &data).unwrap(); 42 | assert_eq!(pairs.into_iter().next().unwrap().into_inner().len(), 1000); 43 | }) 44 | }); 45 | g.finish(); 46 | } 47 | 48 | criterion_group!(benches, criterion_benchmark); 49 | criterion_main!(benches); 50 | -------------------------------------------------------------------------------- /pag-compiler/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | //! The compiler of Paguroidea. Designed for build scripts. 9 | use std::path::Path; 10 | 11 | use syn::File; 12 | 13 | /// Compile the grammar file at `input` to the parser source code 14 | /// at `output`. 15 | /// This function is designed to be used in `build.rs`. It will panic and 16 | /// output the reasons if any error occurs.
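///
/// For illustration, a minimal `build.rs` sketch in the same style as this repository's
/// own build scripts (the `grammar.pag` file name here is only a placeholder):
///
/// ```no_run
/// fn main() {
///     // Regenerate the parser whenever the grammar file changes.
///     pag_compiler::compile("grammar.pag", "src/parser.rs");
///     println!("cargo:rerun-if-changed=grammar.pag");
/// }
/// ```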
17 | pub fn compile<I: AsRef<Path>, O: AsRef<Path>>(input: I, output: O) { 18 | use std::io::Write; 19 | let data = std::fs::read_to_string(input.as_ref()).unwrap(); 20 | match pag_parser::generate_parser(&data) { 21 | Ok(tokens) => { 22 | #[cfg(pag_print_tokens)] 23 | println!("{tokens}"); 24 | let tree: File = syn::parse2(tokens).unwrap(); 25 | let prettified = prettyplease::unparse(&tree); 26 | let mut file = std::fs::File::create(output.as_ref()).unwrap(); 27 | write!( 28 | file, 29 | "// This file is @generated by Paguroidea.\n\n{}", 30 | prettified 31 | ) 32 | .unwrap(); 33 | file.flush().unwrap(); 34 | } 35 | Err(errs) => { 36 | errs.report_stderr(&format!("{}", input.as_ref().display()), &data) 37 | .unwrap(); 38 | panic!("failed to compile parser") 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /tests/tokenizer/src/length_differential.rs: -------------------------------------------------------------------------------- 1 | use crate::generated::length_differential::Tag; 2 | #[allow(unused_imports)] 3 | use rand::{Rng, RngCore}; 4 | 5 | #[allow(dead_code)] 6 | fn random_generate<G: RngCore>(gen: &mut G, length: usize) -> (Vec<Tag>, String) { 7 | let mut buffer = String::new(); 8 | let mut tags = Vec::new(); 9 | for _ in 0..length { 10 | match gen.next_u64() % 6 { 11 | 0 => { 12 | buffer.push_str("a "); 13 | tags.push(Tag::a); 14 | } 15 | 1 => { 16 | buffer.push_str("aa "); 17 | tags.push(Tag::aa); 18 | } 19 | 2 => { 20 | buffer.push_str("aaa "); 21 | tags.push(Tag::aaa); 22 | } 23 | 3 => { 24 | buffer.push_str("aaaa "); 25 | tags.push(Tag::aaaa); 26 | } 27 | 4 => { 28 | buffer.push_str("aaaaa "); 29 | tags.push(Tag::aaaaa); 30 | } 31 | _ => { 32 | buffer.push_str("a".repeat(6 + gen.next_u64() as usize % 128).as_str()); 33 | buffer.push(' '); 34 | tags.push(Tag::more); 35 | } 36 | } 37 | } 38 | (tags, buffer) 39 | } 40 | 41 | #[test] 42 | fn random_length_differential_test() { 43 | let mut gen = rand::thread_rng(); 44 | for _ in 0..1000 { 45 | let length = gen.next_u64() as usize % 1000 + 100; 46 | let (tags, buffer) = random_generate(&mut gen, length); 47 | let trimmed = buffer.trim(); 48 | let tree = crate::generated::length_differential::parse(trimmed).unwrap(); 49 | assert_eq!(tree.len(), trimmed.len()); 50 | let tokens = tree.children().iter().map(|x| x.tag()).collect::<Vec<_>>(); 51 | assert_eq!(tokens, tags); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /pag-parser/src/core_syntax.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms.
8 | 9 | use std::collections::HashMap; 10 | use std::fmt::Display; 11 | 12 | use typed_arena::Arena; 13 | 14 | use crate::frontend::WithSpan; 15 | use crate::utilities::Symbol; 16 | 17 | #[derive(Debug, Clone)] 18 | pub enum Term<'src, 'arena> { 19 | Epsilon, 20 | Sequence(TermPtr<'src, 'arena>, TermPtr<'src, 'arena>), 21 | LexerRef(Symbol<'src>), 22 | Bottom, 23 | Alternative(TermPtr<'src, 'arena>, TermPtr<'src, 'arena>), 24 | Fix(Symbol<'src>, TermPtr<'src, 'arena>), 25 | ParserRef(Symbol<'src>), 26 | } 27 | 28 | pub type TermPtr<'src, 'arena> = &'arena WithSpan<'src, Term<'src, 'arena>>; 29 | pub type TermArena<'src, 'arena> = Arena<WithSpan<'src, Term<'src, 'arena>>>; 30 | 31 | pub struct ParserRule<'src, 'arena> { 32 | pub active: bool, 33 | pub term: TermPtr<'src, 'arena>, 34 | } 35 | 36 | pub type BindingContext<'src, 'arena> = HashMap<Symbol<'src>, ParserRule<'src, 'arena>>; 37 | 38 | impl Display for Term<'_, '_> { 39 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 40 | match self { 41 | Term::Epsilon => write!(f, "ε"), 42 | Term::Sequence(x, y) => write!(f, "({x} ~ {y})"), 43 | Term::LexerRef(x) => write!(f, "{x}"), 44 | Term::Bottom => write!(f, "⊥"), 45 | Term::Alternative(x, y) => write!(f, "({x} | {y})"), 46 | Term::Fix(x, y) => write!(f, "(μ {x} . {y})"), 47 | Term::ParserRef(x) => write!(f, "{x}"), 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /pag-parser/src/type_system/context.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use super::type_check::Type; 10 | use crate::utilities::Symbol; 11 | use std::borrow::Cow; 12 | use std::collections::HashMap; 13 | 14 | pub(super) struct TypeContext<'src> { 15 | guarded: bool, 16 | gamma: HashMap<Symbol<'src>, Type<'src>>, 17 | } 18 | 19 | impl<'src> TypeContext<'src> { 20 | pub fn new() -> Self { 21 | Self { 22 | guarded: false, 23 | gamma: HashMap::new(), 24 | } 25 | } 26 | pub fn lookup(&self, sym: Symbol<'src>) -> Option<Cow<'_, Type<'src>>> { 27 | let target = self.gamma.get(&sym)?; 28 | Some(if self.guarded { 29 | Cow::Owned(Type { 30 | guarded: true, 31 | ..target.clone() 32 | }) 33 | } else { 34 | Cow::Borrowed(target) 35 | }) 36 | } 37 | pub fn guarded<F, R>(&mut self, f: F) -> R 38 | where 39 | F: FnOnce(&mut Self) -> R, 40 | { 41 | let backup = self.guarded; 42 | self.guarded = true; 43 | let result = f(self); 44 | self.guarded = backup; 45 | result 46 | } 47 | pub fn with<F, R>(&mut self, sym: Symbol<'src>, r#type: Type<'src>, f: F) -> R 48 | where 49 | F: FnOnce(&mut Self) -> R, 50 | { 51 | let backup = self.gamma.insert(sym, r#type); 52 | let result = f(self); 53 | if let Some(backup) = backup { 54 | self.gamma.insert(sym, backup); 55 | } else { 56 | self.gamma.remove(&sym); 57 | } 58 | result 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /pag-lexer/src/derivative.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms.
8 | 9 | use crate::{normalization::normalize, regex_tree::RegexTree}; 10 | use smallvec::smallvec; 11 | use std::rc::Rc; 12 | 13 | pub fn derivative(tree: Rc, x: u8) -> Rc { 14 | use RegexTree::*; 15 | match tree.as_ref() { 16 | Set(set) => { 17 | if set.contains(x) { 18 | RegexTree::epsilon() 19 | } else { 20 | RegexTree::bottom() 21 | } 22 | } 23 | Concat(children) => { 24 | let head = children[0].clone(); 25 | let tail = normalize(Rc::new(Concat(children[1..].iter().cloned().collect()))); 26 | let lhs = Rc::new(Concat(smallvec![derivative(head.clone(), x), tail.clone()])); 27 | if head.is_nullable() { 28 | Rc::new(Union(smallvec![lhs, derivative(tail, x)])) 29 | } else { 30 | lhs 31 | } 32 | } 33 | KleeneClosure(r) => Rc::new(Concat(smallvec![derivative(r.clone(), x), tree.clone()])), 34 | Union(children) => Rc::new(Union( 35 | children 36 | .iter() 37 | .map(|tree| derivative(tree.clone(), x)) 38 | .collect(), 39 | )), 40 | Intersection(children) => Rc::new(Intersection( 41 | children 42 | .iter() 43 | .map(|tree| derivative(tree.clone(), x)) 44 | .collect(), 45 | )), 46 | Complement(r) => Rc::new(Complement(derivative(r.clone(), x))), 47 | Bottom | Epsilon => RegexTree::bottom(), 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /benches/json/benches/lalr_def.rs: -------------------------------------------------------------------------------- 1 | use logos::Logos; 2 | use std::fmt; 3 | 4 | #[derive(Logos, Debug, PartialEq, Copy, Clone)] 5 | #[logos(skip r"[ \r\n\t]+")] 6 | pub enum Token<'a> { 7 | #[token("true")] 8 | True, 9 | 10 | #[token("false")] 11 | False, 12 | 13 | #[token("null")] 14 | Null, 15 | 16 | #[token(":")] 17 | Colon, 18 | 19 | #[token(",")] 20 | Comma, 21 | 22 | #[token("{")] 23 | LBrace, 24 | 25 | #[token("}")] 26 | RBrace, 27 | 28 | #[token("[")] 29 | LBracket, 30 | 31 | #[token("]")] 32 | RBracket, 33 | 34 | #[regex(r"-?(0|[1-9][0-9]*)((\.[0-9]+)?)([eE][+-]?[0-9]+)?")] 35 | Number(&'a str), 36 | 37 | #[regex(r#""([^\\"]|\\(["\\/bfnrt]|u[0-9a-fA-F]{4}))*""#)] 38 | String(&'a str), 39 | } 40 | 41 | impl<'a> Token<'a> { 42 | pub fn lalrpop_lexer( 43 | source: &'a str, 44 | ) -> impl Iterator, usize), &'static str>> { 45 | Self::lexer(source) 46 | .spanned() 47 | .map(|(t, r)| Ok((r.start, t.unwrap(), r.end))) 48 | } 49 | } 50 | 51 | impl<'a> fmt::Display for Token<'a> { 52 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 53 | write!(f, "{self:?}") 54 | } 55 | } 56 | 57 | #[derive(Debug, Clone, PartialEq)] 58 | pub enum Pvalue<'a> { 59 | Number(&'a str), 60 | String(&'a str), 61 | Object(Vec<(&'a str, Pvalue<'a>)>), 62 | Bool(bool), 63 | Null, 64 | Array(Vec>), 65 | } 66 | 67 | impl<'a> fmt::Display for Pvalue<'a> { 68 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 69 | match self { 70 | Pvalue::Number(number) => write!(f, "{number}"), 71 | Pvalue::String(string) => write!(f, "\"{string}\""), 72 | Pvalue::Object(object) => { 73 | let iter = object.iter().map(|(k, v)| format!("\"{k}\": {v}")); 74 | write!(f, "{{{}}}", iter.collect::>().join(", ")) 75 | } 76 | Pvalue::Bool(flag) => write!(f, "{flag}"), 77 | Pvalue::Null => write!(f, "null"), 78 | Pvalue::Array(array) => { 79 | let iter = array.iter().map(|v| v.to_string()); 80 | write!(f, "[{}]", iter.collect::>().join(", ")) 81 | } 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /tests/tokenizer/src/common_prefix.rs: 
-------------------------------------------------------------------------------- 1 | use crate::generated::common_prefix::Tag; 2 | 3 | #[allow(unused_imports)] 4 | use rand::{Rng, RngCore}; 5 | 6 | #[allow(dead_code)] 7 | const TABLE: &[(Tag, &str)] = &[ 8 | (Tag::a, "A"), 9 | (Tag::ab, "AB"), 10 | (Tag::abc, "ABC"), 11 | (Tag::abcd, "ABCD"), 12 | (Tag::abcde, "ABCDE"), 13 | (Tag::abcdef, "ABCDEF"), 14 | (Tag::abcdefg, "ABCDEFG"), 15 | (Tag::abcdefgh, "ABCDEFGH"), 16 | (Tag::abcdefghi, "ABCDEFGHI"), 17 | (Tag::abcdefghij, "ABCDEFGHIJ"), 18 | (Tag::abcdefghijk, "ABCDEFGHIJK"), 19 | (Tag::abcdefghijkl, "ABCDEFGHIJKL"), 20 | (Tag::abcdefghijklm, "ABCDEFGHIJKLM"), 21 | (Tag::abcdefghijklmn, "ABCDEFGHIJKLMN"), 22 | (Tag::abcdefghijklmno, "ABCDEFGHIJKLMNO"), 23 | (Tag::abcdefghijklmnop, "ABCDEFGHIJKLMNOP"), 24 | (Tag::abcdefghijklmnopq, "ABCDEFGHIJKLMNOPQ"), 25 | (Tag::abcdefghijklmnopqr, "ABCDEFGHIJKLMNOPQR"), 26 | (Tag::abcdefghijklmnopqrs, "ABCDEFGHIJKLMNOPQRS"), 27 | (Tag::abcdefghijklmnopqrst, "ABCDEFGHIJKLMNOPQRST"), 28 | (Tag::abcdefghijklmnopqrstu, "ABCDEFGHIJKLMNOPQRSTU"), 29 | (Tag::abcdefghijklmnopqrstuv, "ABCDEFGHIJKLMNOPQRSTUV"), 30 | (Tag::abcdefghijklmnopqrstuvw, "ABCDEFGHIJKLMNOPQRSTUVW"), 31 | (Tag::abcdefghijklmnopqrstuvwx, "ABCDEFGHIJKLMNOPQRSTUVWX"), 32 | (Tag::abcdefghijklmnopqrstuvwxy, "ABCDEFGHIJKLMNOPQRSTUVWXY"), 33 | ( 34 | Tag::abcdefghijklmnopqrstuvwxyz, 35 | "ABCDEFGHIJKLMNOPQRSTUVWXYZ", 36 | ), 37 | ]; 38 | 39 | #[allow(dead_code)] 40 | fn random_generate(gen: &mut G, length: usize) -> (Vec, String) { 41 | let mut buffer = String::new(); 42 | let mut tags = Vec::new(); 43 | for _ in 0..length { 44 | let (tag, s) = TABLE[gen.next_u64() as usize % TABLE.len()]; 45 | buffer.push_str(s); 46 | buffer.push(' '); 47 | tags.push(tag); 48 | } 49 | (tags, buffer) 50 | } 51 | 52 | #[test] 53 | fn random_common_prefix_test() { 54 | let mut gen = rand::thread_rng(); 55 | for _ in 0..1000 { 56 | let length = gen.next_u64() as usize % 1000 + 100; 57 | let (tags, buffer) = random_generate(&mut gen, length); 58 | let trimmed = buffer.trim(); 59 | let tree = crate::generated::common_prefix::parse(trimmed).unwrap(); 60 | assert_eq!(tree.len(), trimmed.len()); 61 | let tokens = tree.children().iter().map(|x| x.tag()).collect::>(); 62 | assert_eq!(tokens, tags); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tests/tokenizer/src/tail_differential.rs: -------------------------------------------------------------------------------- 1 | use crate::generated::tail_differential::Tag; 2 | #[allow(unused_imports)] 3 | use rand::{Rng, RngCore}; 4 | 5 | #[allow(dead_code)] 6 | fn random_generate(gen: &mut G, length: usize) -> (Vec, String) { 7 | let mut buffer = String::new(); 8 | let mut tags = Vec::new(); 9 | for _ in 0..length { 10 | match gen.next_u64() % 4 { 11 | 0 => { 12 | //ab(c*d)? 
13 | buffer.push_str("ab"); 14 | if gen.next_u64() % 2 == 0 { 15 | for _ in 0..gen.next_u64() % 129 { 16 | buffer.push('c'); 17 | } 18 | buffer.push('d'); 19 | } 20 | tags.push(Tag::abcd); 21 | } 22 | 1 => { 23 | // abc*e 24 | buffer.push_str("ab"); 25 | for _ in 0..gen.next_u64() % 129 { 26 | buffer.push('c'); 27 | } 28 | buffer.push('e'); 29 | tags.push(Tag::abce); 30 | } 31 | 2 => { 32 | //abc*dd+ 33 | buffer.push_str("ab"); 34 | for _ in 0..gen.next_u64() % 129 { 35 | buffer.push('c'); 36 | } 37 | for _ in 0..gen.next_u64() % 129 + 2 { 38 | buffer.push('d'); 39 | } 40 | tags.push(Tag::abcdm); 41 | } 42 | _ => { 43 | // c+ 44 | for _ in 0..gen.next_u64() % 129 + 1 { 45 | buffer.push('c'); 46 | } 47 | tags.push(Tag::cs); 48 | } 49 | } 50 | } 51 | (tags, buffer) 52 | } 53 | 54 | #[test] 55 | fn random_tail_differential_test() { 56 | let mut gen = rand::thread_rng(); 57 | for _ in 0..1000 { 58 | let length = gen.next_u64() as usize % 1000 + 100; 59 | let (mut tags, buffer) = random_generate(&mut gen, length); 60 | // deduplicate only for cs 61 | tags.dedup_by(|a, b| *a == Tag::cs && *b == Tag::cs); 62 | let trimmed = buffer.trim(); 63 | let tree = crate::generated::tail_differential::parse(trimmed).unwrap(); 64 | assert_eq!(tree.len(), trimmed.len()); 65 | let tokens = tree.children().iter().map(|x| x.tag()).collect::>(); 66 | assert_eq!(tokens, tags); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /tests/tokenizer/src/comment_and_string.rs: -------------------------------------------------------------------------------- 1 | use crate::generated::comment_and_string::Tag; 2 | use rand::distributions::Uniform; 3 | 4 | #[allow(unused_imports)] 5 | use rand::{Rng, RngCore}; 6 | 7 | #[allow(dead_code)] 8 | fn generate_random_string(gen: &mut G, length: usize, buffer: &mut String) { 9 | buffer.push('"'); 10 | let dist = Uniform::::new_inclusive(u8::MIN, u8::MAX); 11 | for _ in 0..length { 12 | let target = gen.sample(dist); 13 | buffer.push(target as char); 14 | if target == b'"' { 15 | buffer.push('"'); 16 | } 17 | } 18 | buffer.push('"'); 19 | } 20 | 21 | #[allow(dead_code)] 22 | fn generate_random_comment(gen: &mut G, length: usize, buffer: &mut String) { 23 | buffer.push_str("/*"); 24 | let dist = Uniform::::new_inclusive(u8::MIN, u8::MAX); 25 | let mut last_is_star = false; 26 | for _ in 0..length { 27 | let mut target = gen.sample(dist); 28 | while last_is_star && target == b'/' { 29 | target = gen.sample(dist); 30 | } 31 | last_is_star = target == b'*'; 32 | buffer.push(target as char); 33 | } 34 | buffer.push_str("*/"); 35 | } 36 | 37 | #[allow(dead_code)] 38 | fn random_generate(gen: &mut G, length: usize) -> (Vec, String) { 39 | let mut buffer = String::new(); 40 | let mut tags = Vec::new(); 41 | for _ in 0..length { 42 | let inner_length = gen.next_u64() as usize % 64 + 1; 43 | if gen.next_u64() % 2 == 0 { 44 | generate_random_comment(gen, inner_length, &mut buffer); 45 | buffer.push('\n'); 46 | tags.push(Tag::comment); 47 | } else { 48 | generate_random_string(gen, inner_length, &mut buffer); 49 | buffer.push('\n'); 50 | tags.push(Tag::string); 51 | } 52 | } 53 | (tags, buffer) 54 | } 55 | 56 | #[test] 57 | fn random_comment_and_string_test() { 58 | let mut gen = rand::thread_rng(); 59 | for _ in 0..100 { 60 | let length = gen.next_u64() as usize % 64 + 1; 61 | let (tags, buffer) = random_generate(&mut gen, length); 62 | let trimmed = buffer.trim(); 63 | let tree = crate::generated::comment_and_string::parse(trimmed).unwrap(); 64 | 
assert_eq!(tree.len(), trimmed.len(), "{}", buffer.escape_default()); 65 | let tokens = tree.children().iter().map(|x| x.tag()).collect::<Vec<_>>(); 66 | assert_eq!(tokens, tags, "{buffer}"); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /pag-parser/src/utilities.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | #[derive(Debug, Clone, Copy, PartialOrd, Ord)] 10 | pub struct Symbol<'a>(&'a str); 11 | 12 | impl<'a> Symbol<'a> { 13 | pub fn new(data: &'a str) -> Self { 14 | Self(data) 15 | } 16 | 17 | pub fn name(&self) -> &'a str { 18 | self.0 19 | } 20 | } 21 | 22 | impl<'a> std::hash::Hash for Symbol<'a> { 23 | fn hash<H: std::hash::Hasher>(&self, state: &mut H) { 24 | self.0.as_ptr().hash(state); 25 | self.0.len().hash(state); 26 | } 27 | } 28 | 29 | impl<'a, 'b> PartialEq<Symbol<'b>> for Symbol<'a> { 30 | fn eq(&self, other: &Symbol<'b>) -> bool { 31 | self.0.as_ptr() == other.0.as_ptr() && self.0.len() == other.0.len() 32 | } 33 | } 34 | 35 | impl<'a> Eq for Symbol<'a> {} 36 | 37 | fn is_ascii_ident_body(x: &u8) -> bool { 38 | x.is_ascii_alphanumeric() || *x == b'_' 39 | } 40 | 41 | fn is_ascii_ident_head(x: &u8) -> bool { 42 | x.is_ascii_alphabetic() || *x == b'_' 43 | } 44 | 45 | fn is_ascii_ident(s: &str) -> bool { 46 | let [x, xs@..] = s.as_bytes() else { return false }; 47 | is_ascii_ident_head(x) && xs.iter().all(is_ascii_ident_body) 48 | } 49 | 50 | impl<'a> std::fmt::Display for Symbol<'a> { 51 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 52 | if is_ascii_ident(self.0) { 53 | write!(f, "{}", self.0) 54 | } else { 55 | write!(f, "s{:x}_{}", self.0.as_ptr() as usize, self.0.len()) 56 | } 57 | } 58 | } 59 | 60 | pub fn merge_results<T, U, E>( 61 | a: Result<T, Vec<E>>, 62 | b: Result<T, Vec<E>>, 63 | f: impl FnOnce(T, T) -> U, 64 | ) -> Result<U, Vec<E>> { 65 | match (a, b) { 66 | (Ok(a), Ok(b)) => Ok(f(a, b)), 67 | (Ok(_), Err(b)) => Err(b), 68 | (Err(a), Ok(_)) => Err(a), 69 | (Err(mut a), Err(b)) => { 70 | a.extend(b); 71 | Err(a) 72 | } 73 | } 74 | } 75 | 76 | macro_rules! unreachable_branch { 77 | ($($arg:tt)*) => { 78 | if cfg!(debug_assertions) { 79 | unreachable!($($arg)*) 80 | } else { 81 | unsafe { std::hint::unreachable_unchecked() } 82 | } 83 | }; 84 | } 85 | 86 | pub(crate) use unreachable_branch; -------------------------------------------------------------------------------- /pag-lexer/src/congruence.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms.
8 | 9 | use std::ops::ControlFlow; 10 | 11 | use crate::intervals; 12 | use crate::intervals::Intervals; 13 | use crate::regex_tree::RegexTree; 14 | 15 | pub fn meet(a: &[Intervals], b: &[Intervals]) -> Vec { 16 | let mut result = Vec::new(); 17 | for x in a { 18 | for y in b { 19 | if let Some(z) = x.intersection(y) { 20 | result.push(z); 21 | } 22 | } 23 | } 24 | result.sort(); 25 | result.dedup(); 26 | result 27 | } 28 | 29 | // TODO: this part can be optimized 30 | pub fn approximate_congruence_class(tree: &RegexTree) -> Vec { 31 | use RegexTree::*; 32 | match tree { 33 | Epsilon | Bottom => vec![intervals!((0, u8::MAX))], 34 | Set(x) => { 35 | let x = x.clone(); 36 | match x.complement() { 37 | Some(y) => { 38 | if x < y { 39 | vec![x, y] 40 | } else { 41 | vec![y, x] 42 | } 43 | } 44 | None => vec![x], 45 | } 46 | } 47 | Concat(children) => { 48 | match children[1..] 49 | .iter() 50 | .zip(children.iter().map(|x| x.is_nullable())) 51 | .try_fold( 52 | approximate_congruence_class(&children[0]), 53 | |acc, (tree, prev_nullable)| { 54 | if !prev_nullable { 55 | ControlFlow::Break(acc) 56 | } else { 57 | ControlFlow::Continue(meet(&acc, &approximate_congruence_class(tree))) 58 | } 59 | }, 60 | ) { 61 | ControlFlow::Break(v) | ControlFlow::Continue(v) => v, 62 | } 63 | } 64 | KleeneClosure(r) | Complement(r) => approximate_congruence_class(r), 65 | Union(children) | Intersection(children) => children[1..] 66 | .iter() 67 | .fold(approximate_congruence_class(&children[0]), |acc, x| { 68 | meet(&acc, &approximate_congruence_class(x)) 69 | }), 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /benches/json/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(portable_simd)] 2 | #![feature(core_intrinsics)] 3 | #![feature(array_chunks)] 4 | mod parser; 5 | 6 | pub use parser::parse; 7 | use rand::rngs::StdRng; 8 | use rand::{Rng, SeedableRng}; 9 | use serde_json::Value; 10 | 11 | fn generate_json_value(depth: usize, gen: &mut G) -> Value { 12 | if depth == 0 { 13 | match gen.gen_range(0..4) { 14 | 0 => Value::Null, 15 | 1 => Value::Bool(gen.gen()), 16 | 2 => Value::Number(serde_json::Number::from_f64(gen.gen()).unwrap()), 17 | _ => Value::String(gen.gen::().to_string()), 18 | } 19 | } else { 20 | match gen.gen_range(0..7) { 21 | 0 => Value::Null, 22 | 1 => Value::Bool(gen.gen()), 23 | 2 => Value::String(gen.gen::().to_string()), 24 | 3 | 4 => { 25 | let mut array = Vec::new(); 26 | for _ in 0..gen.gen_range(0..10) { 27 | array.push(generate_json_value(depth - 1, gen)); 28 | } 29 | Value::Array(array) 30 | } 31 | _ => { 32 | let mut object = serde_json::Map::new(); 33 | for _ in 0..gen.gen_range(0..10) { 34 | object.insert( 35 | gen.gen::().to_string(), 36 | generate_json_value(depth - 1, gen), 37 | ); 38 | } 39 | Value::Object(object) 40 | } 41 | } 42 | } 43 | } 44 | 45 | pub fn generate_random_json(depth: usize) -> String { 46 | let mut random = std::env::var("PAG_RANDOM_SEED") 47 | .ok() 48 | .and_then(|x| x.parse().ok()) 49 | .map_or_else(StdRng::from_entropy, StdRng::seed_from_u64); 50 | let mut buffer = Vec::new(); 51 | let value = generate_json_value(depth, &mut random); 52 | serde_json::to_writer(&mut buffer, &value).unwrap(); 53 | unsafe { String::from_utf8_unchecked(buffer) } 54 | } 55 | 56 | #[cfg(test)] 57 | mod test { 58 | use super::*; 59 | 60 | #[test] 61 | fn test_json() { 62 | let json = r#"{ "hello": { "values": [{}, [], [1, 1e3, -0.5, 9.99]] }, "age" : 13 }"#; 63 | let 
tree = parser::parse(json).unwrap(); 64 | println!("{:#?}", tree); 65 | } 66 | #[test] 67 | fn test_random() { 68 | for _ in 0..10 { 69 | let json = generate_random_json(10); 70 | let parsed = parser::parse(&json).unwrap(); 71 | assert_eq!(json.len(), parsed.len()) 72 | } 73 | } 74 | 75 | #[test] 76 | fn test_twitter() { 77 | let json = include_str!("../benches/twitter.json"); 78 | let parsed = parser::parse(json).unwrap(); 79 | assert_eq!(json.len(), parsed.len()) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /tests/arith-expr/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(portable_simd)] 2 | #![feature(core_intrinsics)] 3 | #![feature(array_chunks)] 4 | use std::num::Wrapping; 5 | 6 | mod parser; 7 | 8 | #[allow(dead_code)] 9 | fn eval(tree: &parser::ParserTree) -> Wrapping { 10 | match tree.tag() { 11 | parser::Tag::expr => tree.children()[..].iter().map(eval).sum(), 12 | parser::Tag::mult => tree.children()[..].iter().map(eval).product(), 13 | parser::Tag::int => Wrapping(tree.as_slice().parse::().unwrap()), 14 | parser::Tag::special => { 15 | assert_eq!(tree.as_slice().chars().count(), 1); 16 | Wrapping(tree.as_slice().chars().next().unwrap() as usize) 17 | } 18 | } 19 | } 20 | 21 | #[allow(dead_code)] 22 | fn generate_random_expr(rng: &mut G, depth: usize) -> (Wrapping, String) { 23 | if depth == 0 { 24 | let x = rng.gen_range(0..100); 25 | return (Wrapping(x), format!("{}", x)); 26 | } 27 | match rng.gen_range(0..4) { 28 | 0 => { 29 | let x = rng.gen_range(0..100); 30 | (Wrapping(x), format!("{}", x)) 31 | } 32 | 1 => { 33 | let x = rng.gen_range(0xFF..=0xD7FF); 34 | ( 35 | Wrapping(x), 36 | format!("{}", char::from_u32(x as u32).unwrap()), 37 | ) 38 | } 39 | 2 => { 40 | let (a, s1) = generate_random_expr(rng, depth - 1); 41 | let (b, s2) = generate_random_expr(rng, depth - 1); 42 | (a + b, format!("({} + {})", s1, s2)) 43 | } 44 | _ => { 45 | let (a, s1) = generate_random_expr(rng, depth - 1); 46 | let (b, s2) = generate_random_expr(rng, depth - 1); 47 | (a * b, format!("({} * {})", s1, s2)) 48 | } 49 | } 50 | } 51 | 52 | #[test] 53 | fn simple_test() { 54 | let expr = "55 * (14 + 15) + 66 * 13"; 55 | let tree = parser::parse(expr).unwrap(); 56 | assert_eq!(eval(&tree), Wrapping(55 * (14 + 15) + 66 * 13)); 57 | // (8 * 1 + 3) * 6 + ((37 + 7) * 2) 58 | let expr = "(8 * 1 + 3) * 6 + ((37 + 7) * 2)"; 59 | let tree = parser::parse(expr).unwrap(); 60 | assert_eq!(eval(&tree), Wrapping((8 + 3) * 6 + ((37 + 7) * 2))); 61 | 62 | // ((((8 + 13) + 3) * 6) * ((3 + 7) * 22)) * 91 63 | let expr = "((((8 + 13) + 3) * 6) * ((3 + 7) * 22)) * 91 + 1 + 2 + 3"; 64 | let tree = parser::parse(expr).unwrap(); 65 | assert_eq!( 66 | eval(&tree), 67 | Wrapping(((((8 + 13) + 3) * 6) * ((3 + 7) * 22)) * 91 + 1 + 2 + 3) 68 | ); 69 | } 70 | #[test] 71 | fn random_test() { 72 | for _ in 0..1000 { 73 | let (value, expr) = generate_random_expr(&mut rand::thread_rng(), 15); 74 | let tree = parser::parse(&expr).unwrap(); 75 | assert_eq!(eval(&tree), value); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /tests/tokenizer/build.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | use tempfile::NamedTempFile; 3 | 4 | fn generate_tokenizer(name: &str, rules: I, skip: Option<&str>) 5 | where 6 | I: AsRef<[(A, B)]>, 7 | A: AsRef, 8 | B: AsRef, 9 | { 10 | let mut file = 
NamedTempFile::new().unwrap(); 11 | writeln!(file.as_file(), "lexer {{").unwrap(); 12 | for (name, rule) in rules.as_ref() { 13 | writeln!(file.as_file(), "{} = {};", name.as_ref(), rule.as_ref()).unwrap() 14 | } 15 | if let Some(skip) = skip { 16 | writeln!(file.as_file(), "skip = {skip};").unwrap() 17 | } 18 | writeln!(file.as_file(), "}}").unwrap(); 19 | writeln!(file.as_file(), "parser tokens {{").unwrap(); 20 | for (name, _) in rules.as_ref() { 21 | let lowercase = name.as_ref().to_lowercase(); 22 | writeln!(file.as_file(), "active {lowercase} = {};", name.as_ref()).unwrap() 23 | } 24 | writeln!( 25 | file.as_file(), 26 | "active tokens = ({})*;", 27 | rules 28 | .as_ref() 29 | .iter() 30 | .map(|(name, _)| name.as_ref().to_lowercase()) 31 | .collect::>() 32 | .join("|") 33 | ) 34 | .unwrap(); 35 | writeln!(file.as_file(), "}}").unwrap(); 36 | file.as_file_mut().flush().unwrap(); 37 | pag_compiler::compile(file.path(), format!("src/generated/{name}.rs")); 38 | } 39 | 40 | fn main() { 41 | std::fs::create_dir_all("src/generated").unwrap(); 42 | generate_tokenizer( 43 | "length_differential", 44 | [ 45 | ("A", r#"'a'"#), 46 | ("AA", r#""aa""#), 47 | ("AAA", r#""aaa""#), 48 | ("AAAA", r#""aaaa""#), 49 | ("AAAAA", r#""aaaaa""#), 50 | ("MORE", r"AAAAA~ 'a'+"), 51 | ], 52 | Some(r"'\n' | '\r' | '\t' | ' '"), 53 | ); 54 | generate_tokenizer( 55 | "common_prefix", 56 | { 57 | let mut rules = Vec::new(); 58 | let mut current = String::new(); 59 | for i in 'A'..='Z' { 60 | current.push(i); 61 | rules.push((current.clone(), format!("{:?}", current))); 62 | } 63 | rules 64 | }, 65 | Some(r"'\n' | '\r' | '\t' | ' '"), 66 | ); 67 | generate_tokenizer( 68 | "comment_and_string", 69 | [ 70 | ("STRING", r#"'\"' ~ ( (!'\"') | '"' ~ '"')* ~ '\"'"#), 71 | ("COMMENT", r#""/*" ~ !(.* ~ "*/" ~ .*) ~ "*/""#), 72 | ], 73 | Some(r"'\n' | '\r' | '\t' | ' '"), 74 | ); 75 | generate_tokenizer( 76 | "tail_differential", 77 | [ 78 | ("ABCD", r#"'a' ~ 'b' ~ ('c'* ~ 'd')?"#), 79 | ("ABCE", r#"'a' ~ 'b' ~ ('c'* ~ 'e')"#), 80 | ("ABCDM", r#"'a' ~ 'b' ~ 'c'* ~ 'd' ~ 'd'+"#), 81 | ("CS", r#"'c'+"#), 82 | ], 83 | None, 84 | ); 85 | } 86 | -------------------------------------------------------------------------------- /benches/json/benches/benchmarks.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use lalrpop_util::lalrpop_mod; 3 | use pag_json::{generate_random_json, parse}; 4 | use pest::Parser; 5 | use pest_json::Rule; 6 | use serde_json::Value; 7 | 8 | mod lalr_def; 9 | pub use lalr_def::{Pvalue, Token}; 10 | 11 | #[global_allocator] 12 | static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; 13 | 14 | lalrpop_mod!(lalrpop_json, "/benches/json.rs"); 15 | lalrpop_mod!(lalrpop_logos_json, "/benches/json_logos.rs"); 16 | 17 | mod pest_json { 18 | use pest_derive::Parser; 19 | 20 | #[derive(Parser)] 21 | #[grammar = "benches/json.pest"] 22 | pub struct JSONParser; 23 | } 24 | 25 | fn criterion_benchmark(c: &mut Criterion) { 26 | let mut g = c.benchmark_group("random-json"); 27 | let data = generate_random_json(10); 28 | g.throughput(criterion::Throughput::Bytes(data.bytes().len() as u64)); 29 | g.bench_function("pag", |b| { 30 | b.iter(|| { 31 | parse(&data).unwrap(); 32 | }) 33 | }); 34 | g.bench_function("serde", |b| { 35 | b.iter(|| { 36 | serde_json::from_str::(&data).unwrap(); 37 | }) 38 | }); 39 | g.bench_function("pest", |b| { 40 | b.iter(|| { 41 | pest_json::JSONParser::parse(Rule::json, &data).unwrap(); 
42 | }) 43 | }); 44 | g.bench_function("lalrpop", |b| { 45 | b.iter(|| { 46 | lalrpop_json::JsonParser::new().parse(&data).unwrap(); 47 | }) 48 | }); 49 | g.bench_function("lalrpop+logos", |b| { 50 | b.iter(|| { 51 | let lexer = Token::lalrpop_lexer(&data); 52 | lalrpop_logos_json::JsonParser::new().parse(lexer).unwrap(); 53 | }) 54 | }); 55 | g.finish(); 56 | 57 | let mut g = c.benchmark_group("twitter-json"); 58 | let data = include_str!("twitter.json"); 59 | g.throughput(criterion::Throughput::Bytes(data.bytes().len() as u64)); 60 | g.bench_function("pag", |b| { 61 | b.iter(|| { 62 | parse(data).unwrap(); 63 | }) 64 | }); 65 | g.bench_function("serde", |b| { 66 | b.iter(|| { 67 | serde_json::from_str::(data).unwrap(); 68 | }) 69 | }); 70 | g.bench_function("pest", |b| { 71 | b.iter(|| { 72 | pest_json::JSONParser::parse(Rule::json, data).unwrap(); 73 | }) 74 | }); 75 | g.bench_function("lalrpop", |b| { 76 | b.iter(|| { 77 | lalrpop_json::JsonParser::new().parse(data).unwrap(); 78 | }) 79 | }); 80 | g.bench_function("lalrpop+logos", |b| { 81 | b.iter(|| { 82 | let lexer = Token::lalrpop_lexer(data); 83 | lalrpop_logos_json::JsonParser::new().parse(lexer).unwrap(); 84 | }) 85 | }); 86 | g.finish(); 87 | } 88 | 89 | criterion_group!(benches, criterion_benchmark); 90 | criterion_main!(benches); 91 | -------------------------------------------------------------------------------- /pag-parser/src/frontend/grammar.pest: -------------------------------------------------------------------------------- 1 | any = { "." } 2 | bottom = { "" | "⊥" } 3 | silent = _{ "silent" } 4 | active = _{ "active" } 5 | skip = _{ "skip" } 6 | lexer = _{ "lexer" } 7 | parser = _{ "parser" } 8 | empty = { "_" } 9 | 10 | KEYWORD = { any | empty | bottom | silent | active | skip | lexer | parser } 11 | 12 | /// A newline character. 13 | newline = _{ "\n" | "\r\n" } 14 | /// A whitespace character. 15 | WHITESPACE = _{ " " | "\t" | newline } 16 | /// A single line comment. 17 | line_comment = _{ ("//" ~ !("/" | "!") ~ (!newline ~ ANY)*) } 18 | /// A multi-line comment. 19 | block_comment = _{ "/*" ~ (block_comment | !"*/" ~ ANY)* ~ "*/" } 20 | /// A grammar comment. 21 | COMMENT = _{ block_comment | line_comment } 22 | 23 | inner_chr = @{ escape | ANY } 24 | inner_str = @{ (!("\"" | "\\") ~ ANY)* ~ (escape ~ inner_str)? } 25 | hex_digit = @{ '0'..'9' | 'a'..'f' | 'A'..'F' } 26 | code = @{ "x" ~ hex_digit{2} } 27 | uppercase = @{ 'A'..'Z' } 28 | lowercase = @{ 'a'..'z' } 29 | digit = @{ '0'..'9' } 30 | unicode = @{ "u" ~ "{" ~ hex_digit{2, 6} ~ "}" } 31 | escape = @{ "\\" ~ ("\"" | "\\" | "r" | "n" | "t" | "0" | "'" | code | unicode) } 32 | 33 | character = ${ "'" ~ inner_chr ~ "'" } 34 | string = ${ "\"" ~ inner_str ~ "\"" } 35 | range = { character ~ ".." ~ character } 36 | token_id = ${ !KEYWORD ~ uppercase ~ (uppercase | digit | "_")* } 37 | parser_id = ${ !KEYWORD ~ lowercase ~ (lowercase | digit | "_")* } 38 | 39 | // pratt parser for lexical expressions 40 | lexical_primary = _{ any | bottom | empty | range | character | string | token_id | "(" ~ lexical_expr ~ ")" } 41 | lexical_expr = { lexical_prefix* ~ lexical_primary ~ lexical_postfix* ~ (lexical_infix ~ lexical_prefix* ~ lexical_primary ~ lexical_postfix* )* } 42 | lexical_postfix = _{ lexical_optional | lexical_star | lexical_plus } 43 | lexical_optional = { "?" } 44 | lexical_star = { "*" } 45 | lexical_plus = { "+" } 46 | lexical_not = { "!" 
} 47 | lexical_prefix = _{ lexical_not } 48 | lexical_infix = _{ lexical_sequence | lexical_alternative | lexical_and } 49 | lexical_sequence = { "~" } 50 | lexical_alternative = { "|" } 51 | lexical_and = { "&" } 52 | 53 | // pratt parser for parser expressions 54 | parser_primary = _{ bottom | empty | parser_id | token_id | "(" ~ parser_expr ~ ")" } 55 | parser_expr = { parser_primary ~ parser_postfix* ~ (parser_infix ~ parser_primary ~ parser_postfix* )* } 56 | parser_postfix = _{ parser_optional | parser_star | parser_plus } 57 | parser_optional = { "?" } 58 | parser_star = { "*" } 59 | parser_plus = { "+" } 60 | parser_infix = _{ parser_sequence | parser_alternative } 61 | parser_sequence = { "~" } 62 | parser_alternative = { "|" } 63 | 64 | // lexer definition 65 | lexer_def = { lexer ~ "{" ~ lexer_rules ~ "}" } 66 | lexer_rules = { ((lexical_rule | lexical_skip) ~ ";")+ } 67 | lexical_rule = { token_id ~ "=" ~ lexical_expr } 68 | lexical_skip = { skip ~ "=" ~ lexical_expr } 69 | 70 | // parser definition 71 | parser_def = { parser ~ parser_id ~ "{" ~ parser_rules ~ "}" } 72 | parser_rules = { ((active_parser_rule | silent_parser_rule) ~ ";")+ } 73 | active_parser_rule = { active ~ parser_id ~ "=" ~ parser_expr } 74 | silent_parser_rule = { silent ~ parser_id ~ "=" ~ parser_expr } 75 | 76 | grammar = { SOI ~ lexer_def ~ parser_def ~ EOI } 77 | -------------------------------------------------------------------------------- /tests/sexpr-calculator/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(portable_simd)] 2 | #![feature(core_intrinsics)] 3 | #![feature(array_chunks)] 4 | use std::num::Wrapping; 5 | 6 | mod parser; 7 | 8 | #[allow(dead_code)] 9 | fn eval(tree: &parser::ParserTree) -> Wrapping { 10 | match tree.tag() { 11 | parser::Tag::sexpr => eval(&tree.children()[0]), 12 | parser::Tag::int => Wrapping(tree.as_slice().parse::().unwrap()), 13 | parser::Tag::op => { 14 | unreachable!("op should be handled by sexpr") 15 | } 16 | parser::Tag::compound => match tree.children()[0].as_slice() { 17 | "+" | "加" => tree.children()[1..].iter().map(eval).sum(), 18 | "*" | "乘" => tree.children()[1..].iter().map(eval).product(), 19 | other => unreachable!("only '+' and '*' are supported, found '{other}'"), 20 | }, 21 | } 22 | } 23 | 24 | #[allow(dead_code)] 25 | fn generate_sexpr(mut limit: usize, gen: &mut G) -> (usize, Wrapping, String) { 26 | if limit <= 1 { 27 | let x = Wrapping(gen.next_u64() as usize % 100); 28 | return (1, x, format!("{}", x)); 29 | } 30 | match gen.next_u64() % 20 { 31 | 0 => { 32 | let x = Wrapping(gen.next_u64() as usize % 100); 33 | (1, x, format!("{}", x)) 34 | } 35 | 1..=15 => { 36 | let width = 2 + gen.next_u64() % (limit as u64).min(10); 37 | let mut buffer = if gen.gen_bool(0.5) { 38 | "(+".to_string() 39 | } else { 40 | "(加".to_string() 41 | }; 42 | let mut cnt = 0; 43 | let mut sum = Wrapping(0); 44 | for _ in 0..width { 45 | let (w, v, s) = generate_sexpr(limit, gen); 46 | cnt += w; 47 | limit = limit.saturating_sub(w); 48 | sum += v; 49 | buffer.push_str(&format!(" {}", s)); 50 | } 51 | buffer.push(')'); 52 | (cnt, sum, buffer) 53 | } 54 | _ => { 55 | let width = 2 + gen.next_u64() % (limit as u64).min(10); 56 | let mut buffer = if gen.gen_bool(0.5) { 57 | "(*".to_string() 58 | } else { 59 | "(乘".to_string() 60 | }; 61 | 62 | let mut cnt = 0; 63 | let mut prod = Wrapping(1); 64 | for _ in 0..width { 65 | let (w, v, s) = generate_sexpr(limit, gen); 66 | cnt += w; 67 | limit = 
limit.saturating_sub(w); 68 | prod *= v; 69 | buffer.push_str(&format!(" {}", s)); 70 | } 71 | buffer.push(')'); 72 | (cnt, prod, buffer) 73 | } 74 | } 75 | } 76 | 77 | #[test] 78 | fn simple_test() { 79 | let test = "(加 1 (* 5 55))"; 80 | let tree = parser::parse(test).unwrap(); 81 | assert_eq!(276, eval(&tree).0); 82 | let test = "(+ 1 (# 5 5))"; 83 | let err = parser::parse(test).unwrap_err().to_string(); 84 | assert_eq!(err, "expecting MULT or PLUS for compound at offset 6"); 85 | } 86 | 87 | #[test] 88 | fn randomized_test() { 89 | for _ in 0..1000 { 90 | let (_, value, expr) = generate_sexpr(20, &mut rand::thread_rng()); 91 | 92 | let tree = parser::parse(&expr).unwrap(); 93 | assert_eq!(value, eval(&tree)) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /pag-parser/src/type_system/fixpoint.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | // Modified from Tarjan's strongly connected components algorithm 10 | 11 | use std::cell::Cell; 12 | use std::collections::HashMap; 13 | 14 | use crate::core_syntax::{BindingContext, Term, TermArena, TermPtr}; 15 | use crate::frontend::WithSpan; 16 | use crate::utilities::Symbol; 17 | 18 | type NodeId = u32; 19 | 20 | #[derive(Default)] 21 | struct Node { 22 | neighbors: Vec, 23 | in_stack: Cell, 24 | low: Cell, 25 | dfn: Cell, 26 | in_cycle: Cell, // scc size > 1 or self reference 27 | } 28 | 29 | fn find_neighbors( 30 | term: TermPtr, 31 | neighbors: &mut Vec, 32 | sym_to_id: &HashMap, NodeId>, 33 | ) { 34 | match &term.node { 35 | Term::Sequence(lhs, rhs) | Term::Alternative(lhs, rhs) => { 36 | find_neighbors(lhs, neighbors, sym_to_id); 37 | find_neighbors(rhs, neighbors, sym_to_id); 38 | } 39 | Term::Fix(_, expr) => find_neighbors(expr, neighbors, sym_to_id), 40 | Term::ParserRef(symbol) => { 41 | // unexisted IDs refer to implicit fixpoints 42 | let Some(&id) = sym_to_id.get(symbol) else { return }; 43 | neighbors.push(id); 44 | } 45 | _ => {} 46 | } 47 | } 48 | 49 | fn tarjan(node_id: NodeId, dfn_cnt: &mut u32, stack: &mut Vec, nodes: &Vec) { 50 | let node = &nodes[node_id as usize]; 51 | 52 | *dfn_cnt += 1; 53 | node.low.set(*dfn_cnt); 54 | node.dfn.set(*dfn_cnt); 55 | stack.push(node_id); 56 | node.in_stack.set(true); 57 | 58 | for &next_id in &node.neighbors { 59 | // self reference 60 | if next_id == node_id { 61 | node.in_cycle.set(true); 62 | continue; 63 | } 64 | let next = &nodes[next_id as usize]; 65 | if next.dfn.get() == 0 { 66 | tarjan(next_id, dfn_cnt, stack, nodes); 67 | node.low.set(node.low.get().min(next.low.get())); // u.low = min(u.low, v.low) 68 | } else if next.in_stack.get() { 69 | node.low.set(node.low.get().min(next.dfn.get())); // u.low = min(u.low, v.dfn) 70 | } 71 | } 72 | 73 | if node.low.get() == node.dfn.get() { 74 | // scc size == 1 75 | if stack.last() == Some(&node_id) { 76 | node.in_stack.set(false); 77 | stack.pop(); 78 | return; 79 | } 80 | // scc size > 1 81 | while let Some(top_id) = stack.pop() { 82 | let top = &nodes[top_id as usize]; 83 | top.in_stack.set(false); 84 | top.in_cycle.set(true); 85 | if top_id == node_id { 86 | break; 87 | } 88 | } 89 | } 90 | } 91 | 92 | pub fn infer_fixpoints<'src, 'arena>( 93 | 
entrypoint: Symbol<'src>, 94 | arena: &'arena TermArena<'src, 'arena>, 95 | binding_ctx: &mut BindingContext<'src, 'arena>, 96 | ) { 97 | let mut sym_to_id = HashMap::new(); 98 | let mut id_to_sym = Vec::new(); 99 | for (idx, (symbol, _)) in binding_ctx.iter().enumerate() { 100 | sym_to_id.insert(*symbol, idx as NodeId); 101 | id_to_sym.push(*symbol); 102 | } 103 | 104 | let mut nodes = Vec::new(); 105 | for (_, rule) in binding_ctx.iter() { 106 | let mut neighbors = Vec::new(); 107 | find_neighbors(rule.term, &mut neighbors, &sym_to_id); 108 | nodes.push(Node { 109 | neighbors, 110 | ..Node::default() 111 | }) 112 | } 113 | 114 | let begin = sym_to_id[&entrypoint] as NodeId; 115 | let mut dfn_cnt = 0; 116 | let mut stack = Vec::new(); 117 | tarjan(begin, &mut dfn_cnt, &mut stack, &nodes); 118 | 119 | for (id, node) in nodes.iter().enumerate() { 120 | // unreachable rules 121 | if node.dfn.get() == 0 { 122 | let symbol = id_to_sym[id]; 123 | binding_ctx.remove(&symbol); 124 | continue; 125 | } 126 | // fixpoints 127 | if node.in_cycle.get() { 128 | let symbol = id_to_sym[id]; 129 | let rule = binding_ctx.get_mut(&symbol).unwrap(); 130 | rule.term = arena.alloc(WithSpan { 131 | span: rule.term.span, 132 | node: Term::Fix(symbol, rule.term), 133 | }) 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /pag-lexer/src/regex_tree.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use crate::intervals; 10 | use crate::intervals::Intervals; 11 | use smallvec::SmallVec; 12 | use std::fmt::{Display, Formatter}; 13 | use std::ops::RangeInclusive; 14 | use std::rc::Rc; 15 | 16 | #[derive(Ord, PartialOrd, Eq, PartialEq, Debug, Clone, Hash)] 17 | pub enum RegexTree { 18 | Bottom, // no character 19 | Set(Intervals), 20 | Epsilon, 21 | Concat(SmallVec<[Rc; 2]>), 22 | KleeneClosure(Rc), 23 | Union(SmallVec<[Rc; 2]>), 24 | Intersection(SmallVec<[Rc; 2]>), 25 | Complement(Rc), 26 | } 27 | 28 | use RegexTree::*; 29 | 30 | impl Display for RegexTree { 31 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 32 | match self { 33 | Bottom => write!(f, "⊥"), 34 | Concat(x) | Intersection(x) | Union(x) if x.is_empty() => write!(f, "⊥"), 35 | Set(x) => write!(f, "{x}"), 36 | Epsilon => write!(f, "ε"), 37 | Concat(children) => { 38 | write!(f, "({}", children[0])?; 39 | for i in &children[1..] { 40 | write!(f, " ~ {i}")?; 41 | } 42 | write!(f, ")") 43 | } 44 | KleeneClosure(x) => write!(f, "{x}*"), 45 | Union(children) => { 46 | write!(f, "({}", children[0])?; 47 | for i in &children[1..] { 48 | write!(f, " ∪ {i}")?; 49 | } 50 | write!(f, ")") 51 | } 52 | Intersection(children) => { 53 | write!(f, "({}", children[0])?; 54 | for i in &children[1..] { 55 | write!(f, " ∩ {i}")?; 56 | } 57 | write!(f, ")") 58 | } 59 | Complement(x) => write!(f, "¬{x}"), 60 | } 61 | } 62 | } 63 | 64 | thread_local! 
{ 65 | static EPSILON: Rc = Rc::new(RegexTree::Epsilon); 66 | static BOTTOM: Rc = Rc::new(RegexTree::Bottom); 67 | static TOP: Rc = BOTTOM.with(|x| Rc::new(RegexTree::Complement(x.clone()))); 68 | } 69 | 70 | impl RegexTree { 71 | pub fn epsilon() -> Rc { 72 | EPSILON.with(Rc::clone) 73 | } 74 | pub fn bottom() -> Rc { 75 | BOTTOM.with(Rc::clone) 76 | } 77 | pub fn top() -> Rc { 78 | TOP.with(Rc::clone) 79 | } 80 | pub fn is_byte_sequence(&self) -> bool { 81 | match self { 82 | Set(intervals) => intervals.is_single_byte(), 83 | Concat(children) => children.iter().all(|x| x.is_byte_sequence()), 84 | Epsilon => true, 85 | _ => false, 86 | } 87 | } 88 | pub fn as_byte_sequence(&self) -> Option> { 89 | match self { 90 | Set(intervals) if intervals.is_single_byte() => Some(vec![intervals.representative()]), 91 | Concat(children) => { 92 | let init = if let Some(x) = children.get(0) { 93 | x.as_byte_sequence() 94 | } else { 95 | return Some(Vec::new()); 96 | }; 97 | 98 | children[1..].iter().fold(init, |acc, x| { 99 | acc.and_then(|mut acc| { 100 | Some({ 101 | acc.extend(x.as_byte_sequence()?); 102 | acc 103 | }) 104 | }) 105 | }) 106 | } 107 | Epsilon => Some(Vec::new()), 108 | _ => None, 109 | } 110 | } 111 | pub fn single(x: u8) -> Self { 112 | Set(intervals!((x, x))) 113 | } 114 | pub fn range(x: RangeInclusive) -> Self { 115 | if x.is_empty() { 116 | return Bottom; 117 | } 118 | Set(intervals!((*x.start(), *x.end()))) 119 | } 120 | pub fn is_nullable(&self) -> bool { 121 | match self { 122 | Bottom => false, 123 | Set(_) => false, 124 | Epsilon => true, 125 | Concat(children) | Intersection(children) => children.iter().all(|x| x.is_nullable()), 126 | KleeneClosure(_) => true, 127 | Union(children) => children.iter().any(|x| x.is_nullable()), 128 | Complement(r) => !r.is_nullable(), 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /pag-lexer/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
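// NOTE (added summary, not from the upstream sources): rough data flow through
// this crate, as exercised by the tests below -- a `RegexTree` (regex_tree) is
// simplified by `normalize` (normalization), differentiated byte by byte with
// `derivative` (derivative), and the per-state branching is kept small by
// `approximate_congruence_class` (congruence); `Vector::generate_dfa` (vector)
// then emits the lexer as a Rust token stream, with `LoopOptimizer`
// (lookahead) turning hot self-loops into SIMD or lookup-table skip loops.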
8 | #![feature(portable_simd)] 9 | #![feature(core_intrinsics)] 10 | #![feature(array_chunks)] 11 | 12 | pub mod congruence; 13 | pub mod derivative; 14 | pub mod intervals; 15 | pub mod lookahead; 16 | pub mod normalization; 17 | pub mod regex_tree; 18 | pub mod utilities; 19 | pub mod vector; 20 | 21 | #[cfg(test)] 22 | mod tests { 23 | use crate::congruence::approximate_congruence_class; 24 | use crate::derivative::derivative; 25 | use crate::lookahead::LoopOptimizer; 26 | use crate::normalization::normalize; 27 | use crate::regex_tree::*; 28 | use crate::vector::Vector; 29 | use quote::quote; 30 | use smallvec::smallvec; 31 | use std::rc::Rc; 32 | use RegexTree::*; 33 | 34 | #[test] 35 | fn it_prints_basic() { 36 | let a = Rc::new(RegexTree::single(b'a')); 37 | let b = Rc::new(RegexTree::single(b'b')); 38 | let ab = Rc::new(Concat(smallvec![a, b])); 39 | let alt = Rc::new(Union(smallvec![ab.clone(), ab])); 40 | println!("{}", alt); 41 | let derivative = derivative(alt, b'a'); 42 | println!("{}", derivative); 43 | let normalized = normalize(derivative); 44 | println!("{}", normalized); 45 | println!("{:?}", approximate_congruence_class(&normalized)); 46 | } 47 | 48 | #[test] 49 | fn renormalize_tests() { 50 | // concat 51 | let a = Rc::new(RegexTree::single(b'a')); 52 | let b = Rc::new(RegexTree::single(b'b')); 53 | let concat = Rc::new(Concat(smallvec![a.clone(), b])); 54 | let normalized = normalize(concat.clone()); 55 | assert!(Rc::ptr_eq(&concat, &normalized)); 56 | // kleene closure 57 | let kleene = Rc::new(KleeneClosure(a)); 58 | let normalized = normalize(kleene.clone()); 59 | assert!(Rc::ptr_eq(&kleene, &normalized)); 60 | } 61 | 62 | #[test] 63 | fn beautify_mangle_tests() { 64 | // generate huge test for me 65 | let a = Rc::new(RegexTree::single(b'a')); 66 | let b = Rc::new(RegexTree::single(b'b')); 67 | let c = Rc::new(RegexTree::single(b'c')); 68 | let d = Rc::new(RegexTree::single(b'd')); 69 | let ba = Rc::new(Concat(smallvec![b, a.clone()])); 70 | let a_or_ba = Rc::new(Union(smallvec![a, ba])); 71 | let a_or_ba_or_c = Rc::new(Union(smallvec![a_or_ba, c])); 72 | let a_or_ba_or_c_con_d = 73 | Rc::new(KleeneClosure(Rc::new(Concat(smallvec![a_or_ba_or_c, d])))); 74 | let normalized = normalize(a_or_ba_or_c_con_d); 75 | let congruence = approximate_congruence_class(&normalized); 76 | println!("{:?}", congruence); 77 | let vectorized = Vector::new([normalized]); 78 | let mut optimizer = LoopOptimizer::new(); 79 | println!( 80 | "{}", 81 | vectorized.generate_dfa( 82 | "e!(0), 83 | &mut optimizer, 84 | &[quote!({ 85 | return Some(idx); 86 | })], 87 | "e!({ 88 | return None; 89 | }) 90 | ) 91 | ); 92 | } 93 | 94 | #[test] 95 | fn approximate_congruence_class_test() { 96 | let a = Rc::new(RegexTree::single(b'a')); 97 | let b = Rc::new(RegexTree::single(b'b')); 98 | let c = Rc::new(RegexTree::single(b'c')); 99 | let ba = Rc::new(Concat(smallvec![b, a.clone()])); 100 | let a_or_ba = Rc::new(Union(smallvec![a, ba])); 101 | let a_or_ba_or_c = Rc::new(Union(smallvec![a_or_ba, c])); 102 | println!("normalized: {}", normalize(a_or_ba_or_c.clone())); 103 | let star = Rc::new(KleeneClosure(a_or_ba_or_c.clone())); 104 | let a_or_ba_or_c = Rc::new(Concat(smallvec![a_or_ba_or_c, star])); 105 | println!("{}", a_or_ba_or_c); 106 | let normalized = normalize(a_or_ba_or_c); 107 | println!("normalized: {}", normalized); 108 | let congruence = approximate_congruence_class(&normalized); 109 | println!("{:?}", congruence); 110 | println!(); 111 | let vectorized = Vector::new([normalized]); 112 | let 
mut optimizer = LoopOptimizer::new(); 113 | println!( 114 | "{}", 115 | vectorized.generate_dfa( 116 | "e!(0), 117 | &mut optimizer, 118 | &[quote!({ 119 | return Some(idx); 120 | })], 121 | "e!({ 122 | return None; 123 | }) 124 | ) 125 | ); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /pag-lexer/src/lookahead.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use crate::intervals::{byte_char, Interval, Intervals}; 10 | use crate::vector::{DfaState, DfaTable}; 11 | use proc_macro2::TokenStream; 12 | use quote::quote; 13 | use std::collections::hash_map::Entry; 14 | use std::collections::HashMap; 15 | 16 | enum Kind { 17 | Positive, 18 | Negative, 19 | } 20 | 21 | fn generate_lut_routine(index: usize) -> TokenStream { 22 | let table = index / 8; 23 | let shift = index % 8; 24 | let bit = 1u8 << shift; 25 | quote! { 26 | idx = idx 27 | + input[idx..] 28 | .iter() 29 | .position(|x| GLOBAL_LUT[#table][*x as usize] & #bit > 0) 30 | .unwrap_or(input.len() - idx); 31 | } 32 | } 33 | 34 | fn byte_simd(byte: u8) -> TokenStream { 35 | let byte = byte_char(byte); 36 | quote! { 37 | data.simd_eq(u8x16::splat(#byte)) 38 | } 39 | } 40 | 41 | fn range_simd(min: u8, max: u8) -> TokenStream { 42 | let min = byte_char(min); 43 | let max = byte_char(max); 44 | quote! { 45 | data.simd_ge(u8x16::splat(#min)) & data.simd_le(u8x16::splat(#max)) 46 | } 47 | } 48 | 49 | fn generate_lookahead_routine(intervals: &Intervals, kind: Kind) -> TokenStream { 50 | let count_act = match kind { 51 | Kind::Positive => quote! { trailing_ones }, 52 | Kind::Negative => quote! { trailing_zeros }, 53 | }; 54 | let idx_offset = intervals 55 | .iter() 56 | .map(|&Interval(l, r)| match l == r { 57 | true => byte_simd(l), 58 | false => range_simd(l, r), 59 | }) 60 | .reduce(|acc, x| quote! { #acc | #x }) 61 | .map(|x| { 62 | if cfg!(target_arch = "aarch64") { 63 | quote! {{ 64 | let mask : u128 = unsafe { core::mem::transmute(#x) }; 65 | mask.#count_act() / 8 66 | }} 67 | } else { 68 | quote! { 69 | (#x).to_bitmask().#count_act() 70 | } 71 | } 72 | }); 73 | quote! 
{ 74 | for i in input[idx..].array_chunks::<16>() { 75 | use core::simd::*; 76 | let data = u8x16::from_slice(i); 77 | let idx_offset = #idx_offset; 78 | idx += idx_offset as usize; 79 | if core::intrinsics::unlikely(idx_offset != 16) { 80 | break; 81 | } 82 | } 83 | } 84 | } 85 | 86 | fn estimated_cost(intervals: &Intervals) -> u32 { 87 | intervals 88 | .iter() 89 | .map(|Interval(l, r)| if l == r { 1 } else { 2 }) 90 | .sum() 91 | } 92 | 93 | #[derive(Default)] 94 | pub struct LoopOptimizer { 95 | global_lut: Vec<[u8; 256]>, 96 | assigned: HashMap, 97 | } 98 | 99 | impl LoopOptimizer { 100 | pub fn new() -> Self { 101 | Self { 102 | global_lut: Vec::new(), 103 | assigned: HashMap::new(), 104 | } 105 | } 106 | 107 | fn assign_table(&mut self, negatives: &Intervals) -> usize { 108 | let assigned_table = self.assigned.len(); 109 | match self.assigned.entry(negatives.clone()) { 110 | Entry::Occupied(x) => { 111 | return *x.get(); 112 | } 113 | Entry::Vacant(x) => { 114 | x.insert(assigned_table); 115 | } 116 | }; 117 | let table = assigned_table / 8; 118 | let offset = assigned_table % 8; 119 | if self.global_lut.len() <= table { 120 | self.global_lut.push([0; 256]); 121 | } 122 | for &Interval(l, r) in negatives.iter() { 123 | for i in l..=r { 124 | self.global_lut[table][i as usize] |= 1u8 << offset; 125 | } 126 | } 127 | assigned_table 128 | } 129 | 130 | pub fn generate_lut(&self) -> Option { 131 | if self.assigned.is_empty() { 132 | return None; 133 | } 134 | let table_size = self.global_lut.len(); 135 | let table = self.global_lut.iter().map(|x| quote!([#(#x,)*])); 136 | Some(quote! { 137 | const GLOBAL_LUT : [[u8; 256]; #table_size] = [ #(#table,)* ]; 138 | }) 139 | } 140 | 141 | pub fn generate_lookahead(&mut self, dfa: &DfaTable, state: &DfaState) -> Option { 142 | let limit = 4; 143 | 144 | let positives = direct_self_loops(dfa, state)?; 145 | if estimated_cost(&positives) <= limit { 146 | return Some(generate_lookahead_routine(&positives, Kind::Positive)); 147 | } 148 | 149 | let negatives = positives.complement()?; 150 | if estimated_cost(&negatives) <= limit { 151 | return Some(generate_lookahead_routine(&negatives, Kind::Negative)); 152 | } 153 | 154 | let index = self.assign_table(&negatives); 155 | Some(generate_lut_routine(index)) 156 | } 157 | } 158 | 159 | fn direct_self_loops(dfa: &DfaTable, state: &DfaState) -> Option { 160 | let mut intervals: Option = None; 161 | for (edge, target) in &dfa.get(state)?.transitions { 162 | if target == state { 163 | intervals = Some(intervals.map_or_else(|| edge.clone(), |x| x.union(edge))); 164 | } 165 | } 166 | intervals 167 | } 168 | 169 | #[cfg(test)] 170 | mod test { 171 | use super::*; 172 | 173 | #[test] 174 | fn test_lookahead_codegen() { 175 | use crate::intervals; 176 | let positives = intervals!((b'0', b'9'), (b'0', b'9'), (b'A', b'F')); 177 | syn::parse2::(generate_lookahead_routine(&positives, Kind::Positive)).unwrap(); 178 | syn::parse2::(generate_lookahead_routine(&positives, Kind::Negative)).unwrap(); 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /pag-parser/src/tests/failure/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
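// NOTE (added sketch, not from the upstream sources): every case below feeds a
// deliberately ill-formed grammar from this directory through `write_error`
// and compares the trimmed diagnostic lines against the expected report.
// Adding a coverage case amounts to dropping another `.pag` file next to this
// module and writing (file name hypothetical):
//
//     expect_error!(
//         "err_some_new_case.pag",
//         err_some_new_case,
//         r#"
//     Error: ...expected diagnostic text...
//     "#
//     );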
8 | 9 | use crate::tests::write_error; 10 | 11 | macro_rules! expect_error { 12 | ( $path:expr, $name:ident, $expect:expr ) => { 13 | #[test] 14 | fn $name() { 15 | let input = include_str!($path); 16 | let errors = write_error(input, $path); 17 | let lines1 = errors.trim().lines().map(str::trim_end).collect::>(); 18 | let lines2 = $expect 19 | .trim() 20 | .lines() 21 | .map(str::trim_end) 22 | .collect::>(); 23 | assert_eq!(lines1, lines2, "\n\n{errors}"); 24 | } 25 | }; 26 | } 27 | 28 | expect_error!( 29 | "err_nullable_token.pag", 30 | err_nullable_token, 31 | r#" 32 | Error: Nullable token detected 33 | ╭─[err_nullable_token.pag:2:5] 34 | │ 35 | 2 │ A = 'a'*; 36 | │ ┬ 37 | │ ╰── token A is nullable 38 | ───╯ 39 | "# 40 | ); 41 | 42 | expect_error!( 43 | "err_unguarded_fixpoint.pag", 44 | err_unguarded_fixpoint, 45 | r#" 46 | Error: Unguarded fixpoint 47 | ╭─[err_unguarded_fixpoint.pag:6:19] 48 | │ 49 | 6 │ active test = test ~ A; 50 | │ ────┬─── 51 | │ ╰───── fixpoint rule test is not guarded -- your grammar is left-recursive 52 | ───╯ 53 | "# 54 | ); 55 | 56 | expect_error!( 57 | "err_alternation_ambiguity.pag", 58 | err_alternation_ambiguity, 59 | r#" 60 | Error: When type checking an alternation of rules, the following rules are ambiguous 61 | ╭─[err_alternation_ambiguity.pag:6:19] 62 | │ 63 | 6 │ active test = A+ | A ~ test; 64 | │ ─┬ ────┬─── 65 | │ ╰───────────── type info for left-hand side: nullable: false, first set: {A}, follow set: {A} 66 | │ │ 67 | │ ╰───── type info for right-hand side: nullable: false, first set: {A}, follow set: {} 68 | ───╯ 69 | "# 70 | ); 71 | 72 | expect_error!( 73 | "err_sequence_ambiguity.pag", 74 | err_sequence_ambiguity, 75 | r#" 76 | Error: When type checking a sequence of rules, the following rules are ambiguous 77 | ╭─[err_sequence_ambiguity.pag:6:19] 78 | │ 79 | 6 │ active test = A+ ~ A; 80 | │ ─┬ ┬ 81 | │ ╰────── type info for left-hand side: nullable: false, first set: {A}, follow set: {A} 82 | │ │ 83 | │ ╰── type info for right-hand side: nullable: false, first set: {A}, follow set: {} 84 | ───╯ 85 | "# 86 | ); 87 | 88 | expect_error!( 89 | "err_null_sequence_ambiguity.pag", 90 | err_null_sequence_ambiguity, 91 | r#" 92 | Error: When type checking a sequence of rules, the following rules are ambiguous 93 | ╭─[err_null_sequence_ambiguity.pag:6:19] 94 | │ 95 | 6 │ active test = _ ~ A; 96 | │ ┬ ┬ 97 | │ ╰────── type info for left-hand side: nullable: true, first set: {}, follow set: {} 98 | │ │ 99 | │ ╰── type info for right-hand side: nullable: false, first set: {A}, follow set: {} 100 | ───╯ 101 | "# 102 | ); 103 | 104 | expect_error!( 105 | "err_multiple_skips.pag", 106 | err_multiple_skips, 107 | r#" 108 | Error: Skipping lexical rule is already defined 109 | ╭─[err_multiple_skips.pag:3:5] 110 | │ 111 | 2 │ skip = "SKIP"; 112 | │ ──────┬────── 113 | │ ╰──────── first definition 114 | 3 │ skip = "ANOTHER_SKIP"; 115 | │ ──────────┬────────── 116 | │ ╰──────────── second definition 117 | ───╯ 118 | "# 119 | ); 120 | 121 | expect_error!( 122 | "err_cyclic_token.pag", 123 | err_cyclic_token, 124 | r#" 125 | Error: Cyclic lexical rule reference 126 | ╭─[err_cyclic_token.pag:2:15] 127 | │ 128 | 2 │ A = 'a' ~ A; 129 | │ ┬ 130 | │ ╰── this reference causes cyclic dependency 131 | ───╯ 132 | "# 133 | ); 134 | 135 | expect_error!( 136 | "err_undefined_token_in_lexer.pag", 137 | err_undefined_token_in_lexer, 138 | r#" 139 | Error: Undefined lexical rule reference 140 | ╭─[err_undefined_token_in_lexer.pag:2:13] 141 | │ 142 | 2 │ A = C; 143 | │ ┬ 144 | 
│ ╰── lexcical rule C is undefined 145 | ───╯ 146 | "# 147 | ); 148 | 149 | expect_error!( 150 | "err_undefined_token_in_parser.pag", 151 | err_undefined_token_in_parser, 152 | r#" 153 | Error: Undefined lexical rule reference 154 | ╭─[err_undefined_token_in_parser.pag:6:19] 155 | │ 156 | 6 │ active test = AA; 157 | │ ─┬ 158 | │ ╰── lexcical rule AA is undefined 159 | ───╯ 160 | "# 161 | ); 162 | 163 | expect_error!( 164 | "err_undefined_grammar_rule.pag", 165 | err_undefined_grammar_rule, 166 | r#" 167 | Error: Undefined parser rule reference 168 | ╭─[err_undefined_grammar_rule.pag:6:19] 169 | │ 170 | 6 │ active test = test2; 171 | │ ──┬── 172 | │ ╰──── parser rule test2 is undefined 173 | ───╯ 174 | "# 175 | ); 176 | 177 | expect_error!( 178 | "err_multiple_definitions_in_lexer.pag", 179 | err_multiple_definitions_in_lexer, 180 | r#" 181 | Error: Multiple definition of A 182 | ╭─[err_multiple_definitions_in_lexer.pag:3:5] 183 | │ 184 | 2 │ A = '0'; 185 | │ ┬ 186 | │ ╰── first definition 187 | 3 │ A = '1'; 188 | │ ┬ 189 | │ ╰── second definition 190 | ───╯ 191 | "# 192 | ); 193 | 194 | expect_error!( 195 | "err_multiple_definitions_in_parser.pag", 196 | err_multiple_definitions_in_parser, 197 | r#" 198 | Error: Multiple definition of test 199 | ╭─[err_multiple_definitions_in_parser.pag:7:12] 200 | │ 201 | 6 │ active test = A; 202 | │ ──┬─ 203 | │ ╰─── first definition 204 | 7 │ active test = A; 205 | │ ──┬─ 206 | │ ╰─── second definition 207 | ───╯ 208 | "# 209 | ); 210 | -------------------------------------------------------------------------------- /pag-lexer/src/normalization.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use crate::{intervals::Intervals, regex_tree::RegexTree}; 10 | use smallvec::SmallVec; 11 | use std::rc::Rc; 12 | use RegexTree::*; 13 | 14 | type RcVec = SmallVec<[Rc; 2]>; 15 | 16 | fn sequence_unchanged(x: &[Rc], y: &[Rc]) -> bool { 17 | x.iter() 18 | .map(|x| x.as_ref() as *const RegexTree) 19 | .eq(y.iter().map(|y| y.as_ref() as *const RegexTree)) 20 | } 21 | 22 | pub fn normalize(node: Rc) -> Rc { 23 | match node.as_ref() { 24 | Bottom | Epsilon | Set(..) 
=> node, 25 | Concat(old) => { 26 | let mut new = RcVec::new(); 27 | for x in old { 28 | let x = normalize(x.clone()); 29 | match x.as_ref() { 30 | Bottom => return RegexTree::bottom(), // x ~ bot == bot 31 | Epsilon => continue, // x ~ eps == x 32 | Concat(subvec) => new.extend(subvec.iter().cloned()), // flatten 33 | _ => new.push(x.clone()), 34 | } 35 | } 36 | if new.is_empty() { 37 | RegexTree::epsilon() 38 | } else if new.len() == 1 { 39 | new.pop().unwrap() 40 | } else if sequence_unchanged(&new, old) { 41 | node 42 | } else { 43 | Rc::new(Concat(new)) 44 | } 45 | } 46 | Union(old) => { 47 | let mut new = RcVec::new(); 48 | let mut set = None; 49 | for x in old { 50 | let x = normalize(x.clone()); 51 | match x.as_ref() { 52 | _ if x == RegexTree::top() => return RegexTree::top(), // x | top == top 53 | Bottom => continue, // x | bot == x 54 | Union(subvec) => { 55 | for y in subvec { 56 | match y.as_ref() { 57 | Set(subset) => { 58 | set = match set { 59 | None => Some(subset.clone()), 60 | Some(set) => Some(set.union(subset)), 61 | } 62 | } 63 | _ => new.push(y.clone()), 64 | } 65 | } 66 | } 67 | Set(subset) => { 68 | set = match set { 69 | None => Some(subset.clone()), 70 | Some(set) => Some(set.union(subset)), 71 | } 72 | } 73 | _ => new.push(x.clone()), 74 | } 75 | } 76 | if let Some(set) = set { 77 | new.push(Rc::new(Set(set))); 78 | } 79 | 80 | new.sort_unstable(); 81 | new.dedup_by(|x, y| Rc::ptr_eq(x, y) || x == y); 82 | 83 | if new 84 | .iter() 85 | .any(|x| !matches!(x.as_ref(), Epsilon) && x.is_nullable()) 86 | { 87 | new.retain(|x| !matches!(x.as_ref(), Epsilon)); 88 | } 89 | 90 | if new.is_empty() { 91 | RegexTree::bottom() 92 | } else if new.len() == 1 { 93 | new.pop().unwrap() 94 | } else if sequence_unchanged(&new, old) { 95 | node 96 | } else { 97 | Rc::new(Union(new)) 98 | } 99 | } 100 | Intersection(old) => { 101 | let mut new = RcVec::new(); 102 | let mut set = Intervals::full_set(); 103 | for x in old { 104 | let x = normalize(x.clone()); 105 | match x.as_ref() { 106 | Bottom => return RegexTree::bottom(), // x & bot == bot 107 | _ if x == RegexTree::top() => continue, // x & top == x 108 | Intersection(subvec) => { 109 | for y in subvec { 110 | match y.as_ref() { 111 | Set(subset) => match set.intersection(subset) { 112 | Some(new_set) => set = new_set, 113 | None => return RegexTree::bottom(), 114 | }, 115 | _ => new.push(y.clone()), 116 | } 117 | } 118 | } 119 | Set(subset) => match set.intersection(subset) { 120 | Some(new_set) => set = new_set, 121 | None => return RegexTree::bottom(), 122 | }, 123 | _ => new.push(x.clone()), 124 | } 125 | } 126 | new.push(Rc::new(Set(set))); 127 | 128 | new.sort_unstable(); 129 | new.dedup_by(|x, y| Rc::ptr_eq(x, y) || x == y); 130 | 131 | if new.is_empty() { 132 | RegexTree::top() 133 | } else if new.len() == 1 { 134 | new.pop().unwrap() 135 | } else if sequence_unchanged(&new, old) { 136 | node 137 | } else { 138 | Rc::new(Intersection(new)) 139 | } 140 | } 141 | KleeneClosure(old) => { 142 | let new = normalize(old.clone()); 143 | match new.as_ref() { 144 | KleeneClosure(_) => new, 145 | Bottom | Epsilon => RegexTree::epsilon(), 146 | _ => { 147 | if Rc::ptr_eq(old, &new) { 148 | node 149 | } else { 150 | Rc::new(KleeneClosure(new)) 151 | } 152 | } 153 | } 154 | } 155 | Complement(old) => { 156 | let new = normalize(old.clone()); 157 | match new.as_ref() { 158 | Set(x) => match x.complement() { 159 | Some(y) => Rc::new(Set(y)), 160 | None => RegexTree::bottom(), 161 | }, 162 | Complement(r) => r.clone(), 163 | // capture 
renormalization cases (no need to do allocations) 164 | _ if Rc::ptr_eq(old, &new) => node, 165 | _ => Rc::new(Complement(new)), 166 | } 167 | } 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /pag-parser/src/frontend/syntax.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use std::collections::{HashMap, HashSet}; 10 | 11 | use crate::{ 12 | core_syntax::BindingContext, 13 | core_syntax::{ParserRule, Term, TermArena, TermPtr}, 14 | nf::Tag, 15 | type_system::{infer_fixpoints, type_check, TypeError}, 16 | utilities::{merge_results, unreachable_branch, Symbol}, 17 | }; 18 | 19 | use super::{ 20 | lexical::LexerDatabase, 21 | FrontendError::*, 22 | FrontendResult, 23 | SurfaceSyntaxTree::{self, *}, 24 | WithSpan, 25 | }; 26 | 27 | pub struct Parser<'src, 'arena> { 28 | pub entrypoint: Symbol<'src>, 29 | pub arena: &'arena TermArena<'src, 'arena>, 30 | pub bindings: BindingContext<'src, 'arena>, 31 | pub symbol_set: HashSet<&'src str>, 32 | pub lexer_database: LexerDatabase<'src>, 33 | } 34 | 35 | impl<'src, 'arena> Parser<'src, 'arena> { 36 | pub fn infer_fixpoints(&mut self) { 37 | infer_fixpoints(self.entrypoint, self.arena, &mut self.bindings); 38 | } 39 | 40 | pub fn type_check(&self) -> Vec> { 41 | let target = &self.bindings[&self.entrypoint]; 42 | type_check(&self.bindings, target.term, self.entrypoint) 43 | } 44 | 45 | pub fn is_active(&self, tag: &Tag<'src>) -> bool { 46 | tag.is_original() && self.bindings.get(&tag.symbol()).map_or(false, |x| x.active) 47 | } 48 | } 49 | 50 | pub fn construct_parser<'src, 'arena>( 51 | arena: &'arena TermArena<'src, 'arena>, 52 | lexer_database: LexerDatabase<'src>, 53 | sst: &WithSpan<'src, SurfaceSyntaxTree<'src>>, 54 | ) -> FrontendResult<'src, Parser<'src, 'arena>> { 55 | let ParserDef { entrypoint, rules } = &sst.node else { 56 | unreachable_branch!("sst should be a parser definition") 57 | }; 58 | let symbol_set = construct_symbol_set(sst)?; 59 | let entrypoint = match symbol_set.get(entrypoint.span.as_str()) { 60 | Some(name) => Symbol::new(name), 61 | None => { 62 | return Err(vec![UndefinedParserRuleReference(entrypoint.span)]); 63 | } 64 | }; 65 | let mut parser = Parser { 66 | entrypoint, 67 | arena, 68 | bindings: HashMap::new(), 69 | lexer_database, 70 | symbol_set, 71 | }; 72 | let mut errs = Vec::new(); 73 | for rule in rules { 74 | let ParserRuleDef { active, name, expr, } = &rule.node else { 75 | unreachable_branch!("parser should only contain rule definitions") 76 | }; 77 | match construct_core_syntax_tree(&parser, expr) { 78 | Ok(term) => { 79 | let symbol = Symbol::new(name.span.as_str()); 80 | parser.bindings.insert( 81 | symbol, 82 | ParserRule { 83 | active: *active, 84 | term, 85 | }, 86 | ); 87 | } 88 | Err(e) => errs.extend(e), 89 | } 90 | } 91 | if !errs.is_empty() { 92 | return Err(errs); 93 | } 94 | Ok(parser) 95 | } 96 | 97 | fn construct_symbol_set<'src>( 98 | sst: &WithSpan<'src, SurfaceSyntaxTree<'src>>, 99 | ) -> FrontendResult<'src, HashSet<&'src str>> { 100 | let ParserDef { rules, .. 
} = &sst.node else { 101 | unreachable_branch!("sst should be a parser definition") 102 | }; 103 | let mut symbol_table = HashMap::with_capacity(rules.len()); 104 | for rule in rules { 105 | let ParserRuleDef { name, .. } = &rule.node else { 106 | unreachable_branch!("parser should only contain rule definitions") 107 | }; 108 | if let Some(previous) = symbol_table.get(name.span.as_str()) { 109 | return Err(vec![MultipleDefinition(*previous, name.span)]); 110 | } else { 111 | symbol_table.insert(name.span.as_str(), name.span); 112 | } 113 | } 114 | Ok(symbol_table.keys().copied().collect()) 115 | } 116 | 117 | fn construct_core_syntax_tree<'src, 'arena>( 118 | context: &Parser<'src, 'arena>, 119 | sst: &WithSpan<'src, SurfaceSyntaxTree<'src>>, 120 | ) -> FrontendResult<'src, TermPtr<'src, 'arena>> { 121 | let spanned = |node: Term<'src, 'arena>| { 122 | context.arena.alloc(WithSpan { 123 | span: sst.span, 124 | node, 125 | }) 126 | }; 127 | match &sst.node { 128 | ParserAlternative { lhs, rhs } => { 129 | let lhs = construct_core_syntax_tree(context, lhs); 130 | let rhs = construct_core_syntax_tree(context, rhs); 131 | merge_results(lhs, rhs, |l, r| &*spanned(Term::Alternative(l, r))) 132 | } 133 | ParserSequence { lhs, rhs } => { 134 | let lhs = construct_core_syntax_tree(context, lhs); 135 | let rhs = construct_core_syntax_tree(context, rhs); 136 | merge_results(lhs, rhs, |l, r| &*spanned(Term::Sequence(l, r))) 137 | } 138 | ParserStar { inner } => { 139 | let symbol = Symbol::new(sst.span.as_str()); 140 | let inner = construct_core_syntax_tree(context, inner)?; 141 | // \x . (i ~ x) | epsilon 142 | let sequence = spanned(Term::Sequence(inner, spanned(Term::ParserRef(symbol)))); 143 | let alternative = spanned(Term::Alternative(sequence, spanned(Term::Epsilon))); 144 | Ok(spanned(Term::Fix(symbol, alternative))) 145 | } 146 | ParserPlus { inner } => { 147 | let symbol = Symbol::new(sst.span.as_str()); 148 | let inner = construct_core_syntax_tree(context, inner)?; 149 | // i ~ (\x . 
(i ~ x) | epsilon) 150 | let sequence = spanned(Term::Sequence(inner, spanned(Term::ParserRef(symbol)))); 151 | let alternative = spanned(Term::Alternative(sequence, spanned(Term::Epsilon))); 152 | let fixpoint = spanned(Term::Fix(symbol, alternative)); 153 | Ok(spanned(Term::Sequence(inner, fixpoint))) 154 | } 155 | ParserOptional { inner } => { 156 | let inner = construct_core_syntax_tree(context, inner)?; 157 | Ok(spanned(Term::Alternative(inner, spanned(Term::Epsilon)))) 158 | } 159 | Bottom => Ok(spanned(Term::Bottom)), 160 | Empty => Ok(spanned(Term::Epsilon)), 161 | ParserRuleRef { name } => { 162 | match context.symbol_set.get(name.span.as_str()) { 163 | // Symbol::hash depends on the address so use the original &str 164 | Some(target) => Ok(spanned(Term::ParserRef(Symbol::new(target)))), 165 | None => Err(vec![UndefinedParserRuleReference(name.span)]), 166 | } 167 | } 168 | LexicalRuleRef { name } => { 169 | match context.lexer_database.symbol_set.get(name.span.as_str()) { 170 | // Symbol::hash depends on the address so use the original &str 171 | Some(target) => Ok(spanned(Term::LexerRef(Symbol::new(target)))), 172 | None => Err(vec![UndefinedLexicalRuleReference(name.span)]), 173 | } 174 | } 175 | _ => unreachable_branch!("called with unsupported node: {}", sst.span.as_str()), 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /pag-parser/src/frontend/lexical.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
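// NOTE (added summary, not from the upstream sources): lexical rules are
// resolved lazily through the three-state cells below -- `Unresolved` holds
// the surface syntax, `Pending` marks a rule whose body is currently being
// expanded, and `Resolved` caches the normalized `RegexTree`. Re-entering a
// `Pending` rule means the reference chain loops back on itself, which is
// reported as `CyclicLexicalRuleReference` (the err_cyclic_token failure
// test). Because a referenced rule is inlined into the referencing regex,
// tokens may build on other tokens but can never be recursive.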
8 | 9 | use smallvec::smallvec; 10 | use std::cell::Cell; 11 | use std::collections::{HashMap, HashSet}; 12 | use std::rc::Rc; 13 | 14 | use pag_lexer::{normalization::normalize, regex_tree::RegexTree}; 15 | use pest::Span; 16 | 17 | use crate::utilities::{merge_results, unreachable_branch, Symbol}; 18 | 19 | use super::{ 20 | unicode::{encode_char, encode_range}, 21 | FrontendError::*, 22 | FrontendResult, 23 | SurfaceSyntaxTree::{self, *}, 24 | WithSpan, 25 | }; 26 | 27 | type SpanRegexTree<'src> = WithSpan<'src, Rc>; 28 | 29 | pub struct LexerDatabase<'src> { 30 | pub symbol_set: HashSet<&'src str>, 31 | pub entries: HashMap, SpanRegexTree<'src>>, 32 | pub skip: Option>, 33 | } 34 | 35 | impl<'src> LexerDatabase<'src> { 36 | pub fn nullability_check(&self) -> FrontendResult<'src, ()> { 37 | let mut errs = Vec::new(); 38 | for (sym, rule) in &self.entries { 39 | if rule.node.is_nullable() { 40 | errs.push(NullableToken(sym.name(), rule.span)); 41 | } 42 | } 43 | if let Some(skip) = &self.skip { 44 | if skip.node.is_nullable() { 45 | errs.push(NullableToken("", skip.span)); 46 | } 47 | } 48 | if !errs.is_empty() { 49 | return Err(errs); 50 | } 51 | Ok(()) 52 | } 53 | } 54 | 55 | enum State<'src, 'local> { 56 | Unresolved(&'local WithSpan<'src, SurfaceSyntaxTree<'src>>), 57 | Pending, 58 | Resolved(Rc), 59 | } 60 | 61 | pub fn construct_lexer_database<'src>( 62 | sst: &WithSpan<'src, SurfaceSyntaxTree<'src>>, 63 | ) -> FrontendResult<'src, LexerDatabase<'src>> { 64 | let LexerDef { rules } = &sst.node else { 65 | unreachable_branch!("sst should be a lexical definition") 66 | }; 67 | 68 | let mut rule_defs = HashMap::new(); 69 | let mut skip_def = None; 70 | let mut errs = Vec::new(); 71 | for rule in rules { 72 | match &rule.node { 73 | LexicalRuleDef { name, expr } => { 74 | let value = (name.span, Cell::new(State::Unresolved(expr))); 75 | if let Some((previous, _)) = rule_defs.insert(name.span.as_str(), value) { 76 | errs.push(MultipleDefinition(previous, name.span)); 77 | } 78 | } 79 | LexicalSkipDef { expr } => { 80 | if let Some((previous, _)) = skip_def.replace((rule.span, expr)) { 81 | errs.push(MultipleSkippingRule(previous, rule.span)); 82 | } 83 | } 84 | _ => {} 85 | } 86 | } 87 | if !errs.is_empty() { 88 | return Err(errs); 89 | } 90 | 91 | let mut entries = HashMap::new(); 92 | for (name, (span, state)) in &rule_defs { 93 | let node = match state.replace(State::Pending) { 94 | State::Unresolved(expr_sst) => { 95 | let expr_regex = construct_regex_tree(expr_sst, &rule_defs)?; 96 | let expr_regex = normalize(expr_regex); 97 | state.set(State::Resolved(expr_regex.clone())); 98 | expr_regex 99 | } 100 | State::Pending => unreachable!(), 101 | State::Resolved(expr_regex) => { 102 | state.set(State::Resolved(expr_regex.clone())); 103 | expr_regex 104 | } 105 | }; 106 | entries.insert(Symbol::new(name), WithSpan { span: *span, node }); 107 | } 108 | 109 | let mut skip = None; 110 | if let Some((span, skip_sst)) = skip_def { 111 | let node = construct_regex_tree(skip_sst, &rule_defs)?; 112 | let node = normalize(node); 113 | skip = Some(WithSpan { span, node }); 114 | } 115 | 116 | Ok(LexerDatabase { 117 | entries, 118 | symbol_set: rule_defs.keys().copied().collect(), 119 | skip, 120 | }) 121 | } 122 | 123 | // 3-color DFS algorithm to detect cycle 124 | fn construct_regex_tree<'src>( 125 | sst: &WithSpan<'src, SurfaceSyntaxTree<'src>>, 126 | rule_defs: &HashMap<&'src str, (Span<'src>, Cell>)>, 127 | ) -> FrontendResult<'src, Rc> { 128 | match &sst.node { 129 | LexicalAlternative { 
lhs, rhs } => { 130 | let lhs = construct_regex_tree(lhs, rule_defs); 131 | let rhs = construct_regex_tree(rhs, rule_defs); 132 | merge_results(lhs, rhs, |l, r| Rc::new(RegexTree::Union(smallvec![l, r]))) 133 | } 134 | LexicalSequence { lhs, rhs } => { 135 | let lhs = construct_regex_tree(lhs, rule_defs); 136 | let rhs = construct_regex_tree(rhs, rule_defs); 137 | merge_results(lhs, rhs, |l, r| Rc::new(RegexTree::Concat(smallvec![l, r]))) 138 | } 139 | LexicalAnd { lhs, rhs } => { 140 | let lhs = construct_regex_tree(lhs, rule_defs); 141 | let rhs = construct_regex_tree(rhs, rule_defs); 142 | merge_results(lhs, rhs, |l, r| { 143 | Rc::new(RegexTree::Intersection(smallvec![l, r])) 144 | }) 145 | } 146 | LexicalStar { inner } => { 147 | let inner = construct_regex_tree(inner, rule_defs)?; 148 | Ok(Rc::new(RegexTree::KleeneClosure(inner))) 149 | } 150 | LexicalPlus { inner } => { 151 | let inner = construct_regex_tree(inner, rule_defs)?; 152 | Ok(Rc::new(RegexTree::Concat(smallvec![ 153 | inner.clone(), 154 | Rc::new(RegexTree::KleeneClosure(inner)) 155 | ]))) 156 | } 157 | LexicalOptional { inner } => { 158 | let inner = construct_regex_tree(inner, rule_defs)?; 159 | Ok(Rc::new(RegexTree::Union(smallvec![ 160 | inner, 161 | RegexTree::epsilon() 162 | ]))) 163 | } 164 | LexicalNot { inner } => { 165 | let inner = construct_regex_tree(inner, rule_defs)?; 166 | Ok(Rc::new(RegexTree::Complement(inner))) 167 | } 168 | RangeLit { start, end } => Ok(encode_range(*start, *end)), 169 | StringLit(x) => Ok(x 170 | .bytes() 171 | .map(|b| Rc::new(RegexTree::single(b))) 172 | .reduce(|acc, b| Rc::new(RegexTree::Concat(smallvec![acc, b]))) 173 | .unwrap_or_else(RegexTree::epsilon)), 174 | Bottom => Ok(RegexTree::bottom()), 175 | Empty => Ok(RegexTree::epsilon()), 176 | CharLit { value } => Ok(encode_char(value.node)), 177 | LexicalRuleRef { name } => match rule_defs.get(name.span.as_str()) { 178 | Some((_, state)) => match state.replace(State::Pending) { 179 | State::Unresolved(expr_sst) => { 180 | let expr_regex = construct_regex_tree(expr_sst, rule_defs)?; 181 | let expr_regex = normalize(expr_regex); 182 | state.set(State::Resolved(expr_regex.clone())); 183 | Ok(expr_regex) 184 | } 185 | State::Pending => Err(vec![CyclicLexicalRuleReference(name.span)]), 186 | State::Resolved(expr_regex) => { 187 | state.set(State::Resolved(expr_regex.clone())); 188 | Ok(expr_regex) 189 | } 190 | }, 191 | None => Err(vec![UndefinedLexicalRuleReference(name.span)]), 192 | }, 193 | _ => unreachable_branch!("called with unsupported node: {}", sst.span.as_str()), 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /pag-parser/src/type_system/type_check.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
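// NOTE (added summary, not from the upstream sources): each term is typed by
// four facts -- its FIRST set, its FOLLOW set, whether it is nullable, and
// whether it is guarded (a recursive reference must sit behind at least one
// consumed token, otherwise the fixpoint is rejected as left recursion). A
// sequence `x ~ y` type-checks only when `x` is not nullable and
// follow(x) ∩ first(y) = ∅; an alternation `x | y` only when at most one side
// is nullable and first(x) ∩ first(y) = ∅. For instance, in the failing
// grammar `active test = A+ ~ A;` the left-hand side has follow = {A} while
// the right-hand side has first = {A}, so the overlap is reported as the
// sequence ambiguity shown in src/tests/failure. Fixpoint types are computed
// by iterating from `Type::minimum()` until the result stabilises.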
8 | 9 | use pest::Span; 10 | use std::collections::HashSet; 11 | 12 | use crate::core_syntax::{BindingContext, Term, TermPtr}; 13 | use crate::utilities::Symbol; 14 | 15 | use super::{binding_proxy::BindingProxy, context::TypeContext}; 16 | 17 | #[derive(Debug)] 18 | pub enum TypeError<'a> { 19 | SequentialUniquenessViolation { 20 | lhs: (Span<'a>, Type<'a>), 21 | rhs: (Span<'a>, Type<'a>), 22 | total: Span<'a>, 23 | }, 24 | DisjunctiveUniquenessViolation { 25 | lhs: (Span<'a>, Type<'a>), 26 | rhs: (Span<'a>, Type<'a>), 27 | total: Span<'a>, 28 | }, 29 | UnguardedFixpoint(Symbol<'a>, Span<'a>), 30 | UnresolvedReference(Symbol<'a>, Span<'a>), 31 | } 32 | 33 | #[derive(PartialEq, Eq, Debug, Clone)] 34 | pub struct Type<'src> { 35 | pub first: HashSet>, 36 | pub follow: HashSet>, 37 | pub nullable: bool, 38 | pub guarded: bool, 39 | } 40 | 41 | impl<'src> Type<'src> { 42 | fn sequential_uniqueness(&self, other: &Self) -> bool { 43 | !self.nullable && self.follow.is_disjoint(&other.first) 44 | } 45 | 46 | fn disjunctive_uniqueness(&self, other: &Self) -> bool { 47 | !(self.nullable && other.nullable) && self.first.is_disjoint(&other.first) 48 | } 49 | 50 | fn epsilon() -> Self { 51 | Self { 52 | first: HashSet::new(), 53 | follow: HashSet::new(), 54 | nullable: true, 55 | guarded: true, 56 | } 57 | } 58 | fn token(token: Symbol<'src>) -> Self { 59 | Self { 60 | first: HashSet::from([token]), 61 | follow: HashSet::new(), 62 | nullable: false, 63 | guarded: true, 64 | } 65 | } 66 | 67 | fn sequence( 68 | t1: &Self, 69 | t2: &Self, 70 | lhs: Span<'src>, 71 | rhs: Span<'src>, 72 | total: Span<'src>, 73 | ) -> Result>> { 74 | if t1.sequential_uniqueness(t2) { 75 | Ok(Self { 76 | first: t1.first.clone(), 77 | follow: if t2.nullable { 78 | t2.follow 79 | .union(&t2.first) 80 | .chain(t1.follow.iter()) 81 | .cloned() 82 | .collect() 83 | } else { 84 | t2.follow.clone() 85 | }, 86 | nullable: false, 87 | guarded: t1.guarded, 88 | }) 89 | } else { 90 | Err(Box::new(TypeError::SequentialUniquenessViolation { 91 | lhs: (lhs, t1.clone()), 92 | rhs: (rhs, t2.clone()), 93 | total, 94 | })) 95 | } 96 | } 97 | 98 | fn bottom() -> Self { 99 | Self { 100 | first: HashSet::new(), 101 | follow: HashSet::new(), 102 | nullable: false, 103 | guarded: true, 104 | } 105 | } 106 | 107 | fn alternative( 108 | t1: &Self, 109 | t2: &Self, 110 | lhs: Span<'src>, 111 | rhs: Span<'src>, 112 | total: Span<'src>, 113 | ) -> Result>> { 114 | if t1.disjunctive_uniqueness(t2) { 115 | Ok(Self { 116 | first: t1.first.union(&t2.first).cloned().collect(), 117 | follow: t1.follow.union(&t2.follow).cloned().collect(), 118 | nullable: t1.nullable || t2.nullable, 119 | guarded: t1.guarded && t2.guarded, 120 | }) 121 | } else { 122 | Err(Box::new(TypeError::DisjunctiveUniquenessViolation { 123 | lhs: (lhs, t1.clone()), 124 | rhs: (rhs, t2.clone()), 125 | total, 126 | })) 127 | } 128 | } 129 | 130 | fn minimum() -> Self { 131 | Self { 132 | first: HashSet::new(), 133 | follow: HashSet::new(), 134 | nullable: false, 135 | guarded: false, 136 | } 137 | } 138 | 139 | fn fixpoint(mut f: F) -> (Self, Vec>) 140 | where 141 | F: FnMut(&Self) -> (Self, Vec>), 142 | { 143 | let mut last = Self::minimum(); 144 | loop { 145 | let (next, errs) = f(&last); 146 | if !errs.is_empty() || next == last { 147 | return (next, errs); 148 | } 149 | last = next; 150 | } 151 | } 152 | } 153 | 154 | fn type_check_impl<'src, 'a>( 155 | typing_ctx: &mut TypeContext<'src>, 156 | binding_ctx: &mut BindingProxy<'src, 'a>, 157 | term: TermPtr<'src, 'a>, 158 | ) -> 
(Type<'src>, Vec>) { 159 | match &term.node { 160 | Term::Epsilon => (Type::epsilon(), vec![]), 161 | Term::Sequence(x, y) => { 162 | let (x_type, x_errors) = type_check_impl(typing_ctx, binding_ctx, x); 163 | let (y_type, y_errors) = typing_ctx.guarded(|ctx| type_check_impl(ctx, binding_ctx, y)); 164 | let (r#type, err) = match Type::sequence(&x_type, &y_type, x.span, y.span, term.span) { 165 | Ok(r#type) => (r#type, None), 166 | Err(err) => (Type::bottom(), Some(err)), 167 | }; 168 | ( 169 | r#type, 170 | x_errors 171 | .into_iter() 172 | .chain(y_errors) 173 | .chain(err.map(|e| *e)) 174 | .collect(), 175 | ) 176 | } 177 | Term::LexerRef(name) => (Type::token(*name), vec![]), 178 | Term::Bottom => (Type::bottom(), vec![]), 179 | Term::Alternative(x, y) => { 180 | let (x_type, x_errors) = type_check_impl(typing_ctx, binding_ctx, x); 181 | let (y_type, y_errors) = type_check_impl(typing_ctx, binding_ctx, y); 182 | let (r#type, err) = match Type::alternative(&x_type, &y_type, x.span, y.span, term.span) 183 | { 184 | Ok(r#type) => (r#type, None), 185 | Err(err) => (Type::bottom(), Some(err)), 186 | }; 187 | ( 188 | r#type, 189 | x_errors 190 | .into_iter() 191 | .chain(y_errors) 192 | .chain(err.map(|e| *e)) 193 | .collect(), 194 | ) 195 | } 196 | Term::ParserRef(name) => { 197 | // first check if name is already typed in the context. 198 | // if so return that type directly. 199 | if let Some(ty) = typing_ctx.lookup(*name) { 200 | return (ty.as_ref().clone(), vec![]); 201 | } 202 | // otherwise, we need to type check the parser definition. 203 | if let Some(target) = binding_ctx.lookup(name) { 204 | // we should not cache the result, since it can be recursive and changed during the calculation of the fixpoint. 205 | let (r#type, errors) = binding_ctx.with_hiding(*name, |binding_ctx| { 206 | type_check_impl(typing_ctx, binding_ctx, target) 207 | }); 208 | (r#type, errors) 209 | } else { 210 | ( 211 | Type::bottom(), 212 | vec![TypeError::UnresolvedReference(*name, term.span)], 213 | ) 214 | } 215 | } 216 | Term::Fix(var, body) => { 217 | if let Some(ty) = typing_ctx.lookup(*var) { 218 | return (ty.as_ref().clone(), vec![]); 219 | } 220 | let (r#type, errs) = Type::fixpoint(|x| { 221 | typing_ctx.with(*var, x.clone(), |ctx| { 222 | type_check_impl(ctx, binding_ctx, body) 223 | }) 224 | }); 225 | if !errs.is_empty() { 226 | return (r#type, errs); 227 | } 228 | if r#type.guarded { 229 | typing_ctx.with(*var, r#type.clone(), |ctx| { 230 | type_check_impl(ctx, binding_ctx, body) 231 | }) 232 | } else { 233 | ( 234 | Type::bottom(), 235 | vec![TypeError::UnguardedFixpoint(*var, term.span)], 236 | ) 237 | } 238 | } 239 | } 240 | } 241 | 242 | pub fn type_check<'src, 'a>( 243 | binding_ctx: &BindingContext<'src, 'a>, 244 | term: TermPtr<'src, 'a>, 245 | name: Symbol<'src>, 246 | ) -> Vec> { 247 | let mut typing_ctx = TypeContext::new(); 248 | let mut proxy = BindingProxy::proxy(binding_ctx); 249 | proxy.with_hiding(name, |binding_ctx| { 250 | type_check_impl(&mut typing_ctx, binding_ctx, term).1 251 | }) 252 | } 253 | -------------------------------------------------------------------------------- /pag-lexer/src/intervals.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
8 | 9 | use proc_macro2::{Literal, TokenStream}; 10 | use quote::{quote, ToTokens}; 11 | use smallvec::{smallvec, SmallVec}; 12 | use std::fmt::{Display, Formatter}; 13 | 14 | #[macro_export] 15 | macro_rules! interval { 16 | ($start:expr, $end:expr) => { 17 | $crate::intervals::Interval($start as u8, $end as u8) 18 | }; 19 | } 20 | 21 | #[macro_export] 22 | macro_rules! intervals { 23 | ($(($start:expr, $end:expr)),+ $(,)?) => { 24 | unsafe { 25 | $crate::intervals::Intervals::new( 26 | [$($crate::interval!($start, $end)),+] 27 | ).unwrap_unchecked() 28 | } 29 | }; 30 | } 31 | 32 | // A closed interval of u8s. 33 | #[derive(Debug, Ord, PartialOrd, Eq, PartialEq, Hash, Copy, Clone)] 34 | pub struct Interval(pub u8, pub u8); 35 | 36 | impl Display for Interval { 37 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 38 | let start = self.0.escape_ascii(); 39 | let end = self.1.escape_ascii(); 40 | if self.0 == self.1 { 41 | write!(f, "{start}") 42 | } else { 43 | write!(f, "[{start}, {end}]") 44 | } 45 | } 46 | } 47 | 48 | impl Interval { 49 | // Check if two intervals overlap. 50 | pub fn overlaps(&self, other: &Self) -> bool { 51 | self.0 <= other.1 && other.0 <= self.1 52 | } 53 | 54 | pub fn intersection(&self, other: &Self) -> Self { 55 | debug_assert!(self.overlaps(other)); 56 | Self(self.0.max(other.0), self.1.min(other.1)) 57 | } 58 | 59 | pub fn contains(&self, other: &Self) -> bool { 60 | self.0 <= other.0 && other.1 <= self.1 61 | } 62 | } 63 | 64 | // Invariants: 65 | // - Ordered 66 | // - Non-empty 67 | // - Non-consecutive 68 | #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] 69 | pub struct Intervals(SmallVec<[Interval; 8]>); 70 | 71 | impl Intervals { 72 | pub fn new(data: I) -> Option 73 | where 74 | I: IntoIterator, 75 | { 76 | data.into_iter() 77 | .map(|x| Self(smallvec![x])) 78 | .reduce(|acc, x| acc.union(&x)) 79 | } 80 | 81 | pub fn full_set() -> Self { 82 | Self(smallvec![Interval(u8::MIN, u8::MAX)]) 83 | } 84 | 85 | pub fn iter(&self) -> impl Iterator { 86 | self.0.iter() 87 | } 88 | 89 | pub fn is_single_byte(&self) -> bool { 90 | self.0.len() == 1 && self.0[0].0 == self.0[0].1 91 | } 92 | 93 | pub fn representative(&self) -> u8 { 94 | self.0[0].0 95 | } 96 | 97 | pub fn is_full_set(&self) -> bool { 98 | self.0.len() == 1 && self.0[0] == Interval(u8::MIN, u8::MAX) 99 | } 100 | 101 | // it is okay it contains non-unicode code points; they will never be read anyway. 
102 | pub fn complement(&self) -> Option { 103 | let mut current = Some(0); 104 | let mut result = SmallVec::new(); 105 | for i in self.0.iter() { 106 | if let Some(c) = current { 107 | if c < i.0 { 108 | result.push(Interval(c, i.0 - 1)); 109 | } 110 | } 111 | current = i.1.checked_add(1); 112 | } 113 | if let Some(current) = current { 114 | result.push(Interval(current, u8::MAX)); 115 | } 116 | if result.is_empty() { 117 | None 118 | } else { 119 | Some(Self(result)) 120 | } 121 | } 122 | 123 | pub fn contains(&self, target: u8) -> bool { 124 | match self.0.binary_search_by_key(&target, |x| x.0) { 125 | Ok(_) => true, 126 | Err(0) => false, 127 | Err(idx) => self.0[idx - 1].1 >= target, 128 | } 129 | } 130 | 131 | pub fn intersection(&self, other: &Self) -> Option { 132 | let mut result: Option = None; 133 | for i in self.0.iter().copied() { 134 | for j in other.0.iter().copied() { 135 | if i.overlaps(&j) { 136 | let temp = Self(smallvec![i.intersection(&j)]); 137 | result = match result { 138 | None => Some(temp), 139 | Some(x) => Some(x.union(&temp)), 140 | }; 141 | } else if j.0 > i.1 { 142 | break; 143 | } 144 | } 145 | } 146 | result 147 | } 148 | 149 | pub fn union(&self, other: &Self) -> Self { 150 | let mut result = SmallVec::new(); 151 | let mut i = self.0.iter().copied().peekable(); 152 | let mut j = other.0.iter().copied().peekable(); 153 | loop { 154 | let mut current = match (i.peek(), j.peek()) { 155 | (Some(&x), Some(&y)) if x.0 < y.0 => i.next().unwrap(), 156 | (_, Some(_)) => j.next().unwrap(), 157 | (Some(_), _) => i.next().unwrap(), 158 | _ => break, 159 | }; 160 | loop { 161 | match (i.peek(), j.peek()) { 162 | (Some(x), _) if current.1.wrapping_add(1) >= x.0 => { 163 | current.1 = current.1.max(i.next().unwrap().1); 164 | } 165 | (_, Some(y)) if current.1.wrapping_add(1) >= y.0 => { 166 | current.1 = current.1.max(j.next().unwrap().1); 167 | } 168 | _ => break, 169 | } 170 | } 171 | result.push(current); 172 | } 173 | Self(result) 174 | } 175 | } 176 | 177 | impl Display for Intervals { 178 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 179 | match self.0.as_slice() { 180 | [] => Ok(()), 181 | [single] => write!(f, "{single}"), 182 | multiple => { 183 | let iter = multiple.iter().map(|i| i.to_string()); 184 | write!(f, "({})", iter.collect::>().join(" | ")) 185 | } 186 | } 187 | } 188 | } 189 | 190 | pub fn byte_char(c: u8) -> Literal { 191 | format!("b'{}'", c.escape_ascii()).parse().unwrap() 192 | } 193 | 194 | impl ToTokens for Intervals { 195 | fn to_tokens(&self, tokens: &mut TokenStream) { 196 | debug_assert!(!self.0.is_empty()); 197 | let iter = self.0.iter().map(|Interval(start, end)| { 198 | let start_lit = byte_char(*start); 199 | let end_lit = byte_char(*end); 200 | if start == end { 201 | quote! { #start_lit } 202 | } else { 203 | quote! { #start_lit ..= #end_lit } 204 | } 205 | }); 206 | tokens.extend(quote! 
{ #(#iter)|* }); 207 | } 208 | } 209 | 210 | #[cfg(test)] 211 | mod test { 212 | #[test] 213 | fn basic_format() { 214 | let interval = interval!(0x41, 0x5A); 215 | assert_eq!(format!("{interval}"), "[A, Z]"); 216 | let interval = interval!(0x41, 0x7A); 217 | assert_eq!(format!("{interval}"), "[A, z]"); 218 | let interval = interval!(0x41, 0x7B); 219 | assert_eq!(format!("{interval}"), "[A, {]"); 220 | // whitespace 221 | let interval = interval!(b'\t', b'\t'); 222 | assert_eq!(format!("{interval}"), r"\t"); 223 | } 224 | 225 | #[test] 226 | fn intervals_format() { 227 | let intervals = intervals!(('a', 'z'), ('A', 'Z'), ('0', '9')); 228 | assert_eq!(format!("{intervals}"), "([0, 9] | [A, Z] | [a, z])"); 229 | } 230 | 231 | #[test] 232 | fn union() { 233 | let x = intervals!(('a', 'z'), ('A', 'Z'), ('0', '9')); 234 | assert_eq!(x.union(&x), x); 235 | let y = intervals!(('!', '7')); 236 | assert_eq!(x.union(&y), intervals!(('!', '9'), ('A', 'Z'), ('a', 'z'))); 237 | let z = intervals!(('!', '7'), ('C', 'e')); 238 | assert_eq!(x.union(&z), intervals!(('!', '9'), ('A', 'z'))); 239 | } 240 | 241 | #[test] 242 | fn complement() { 243 | let x = intervals!(('a', 'z'), ('A', 'Z'), ('0', '9')); 244 | let y = intervals!((0, 47), (58, 64), (91, 96), (123, u8::MAX)); 245 | assert_eq!(x.complement(), Some(y)); 246 | let z = intervals!(('\0', '7')); 247 | assert_eq!(z.complement().unwrap(), intervals!(('8', u8::MAX))); 248 | assert_eq!(x.complement().unwrap().complement().unwrap(), x); 249 | assert_eq!(x.union(&x.complement().unwrap()), intervals!((0, u8::MAX))); 250 | } 251 | 252 | #[test] 253 | fn intersection() { 254 | let x = intervals!(('a', 'z'), ('A', 'Z'), ('0', '9')); 255 | let z = intervals!(('\0', '7')); 256 | assert_eq!(x.intersection(&z), Some(intervals!(('0', '7')))); 257 | assert!(x.intersection(&x.complement().unwrap()).is_none()); 258 | assert_eq!(x.intersection(&intervals!((0, u8::MAX))).unwrap(), x); 259 | let a = intervals!(('E', 'c')); 260 | assert_eq!(x.intersection(&a), Some(intervals!(('E', 'Z'), ('a', 'c')))); 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 4 |

5 | 6 | hermit-crab 7 |
8 | Paguroidea 9 |

10 | 11 | ![GITHUB-BADGE](https://github.com/SchrodingerZhu/paguroidea/workflows/Build/badge.svg) 12 | 13 | | Crate | Status | 14 | |----------------|----------------------------------------------------------------| 15 | | `pag-lexer` | ![crates.io](https://img.shields.io/crates/v/pag-lexer.svg) | 16 | | `pag-parser` | ![crates.io](https://img.shields.io/crates/v/pag-parser.svg) | 17 | | `pag-compiler` | ![crates.io](https://img.shields.io/crates/v/pag-compiler.svg) | 18 | 19 | 20 | 21 | A reimplementation of the Flap parser in Rust (with our own modifications applied)! 22 | 23 | ## 🚧 Under Construction 🚧 24 | This project is still in early-stage development. The grammar for Paguroidea is subject to change 25 | (see [Issue #22](https://github.com/SchrodingerZhu/paguroidea/issues/22)). The parser generation is not yet thoroughly tested, 26 | so bugs may still shake out from time to time. Work is also ongoing to improve the quality of the generated 27 | code. 28 | 29 | ## Introduction 30 | Paguroidea is a parser generator (a.k.a. a compiler of compilers). The theoretical foundation of Paguroidea is built 31 | on a few papers: 32 | 33 | - [Regular-expression derivatives reexamined](https://www.ccs.neu.edu/home/turon/re-deriv.pdf) introduced a way to generate 34 | lexer DFAs directly from language derivatives. The number of states in DFAs created by this approach is close to 35 | minimal. 36 | - [A Typed, Algebraic Approach to Parsing](https://www.cl.cam.ac.uk/~nk480/parsing.pdf) provides a method to "type check" 37 | context-free grammars such that a checked grammar is guaranteed to parse in linear time with single-token 38 | lookahead. This is especially useful in Flap/Paguroidea for ensuring the total correctness of normalization. 39 | - [flap: A Deterministic Parser with Fused Lexing](https://arxiv.org/abs/2304.05276v2) invented a novel approach to normalize 40 | context-free grammars into the so-called Deterministic Greibach Normal Form (DGNF), where each individual parser routine 41 | can use a small, localized lexer rather than lexing the whole input with a single lexer containing the regular expressions of 42 | all tokens. 43 | 44 | We modified the work of flap by extending DGNF with tree-generation actions, which are similar to the "reduce" operation in a 45 | traditional shift-reduce parser. 46 | 47 | ## How to use 48 | > **Notice**: the grammar syntax for parser definitions used in this section will change in the near future. 49 | 50 | It is simple: just define your grammar and pass it to our compiler; Paguroidea will then output a standalone parser file. 51 | 52 | For example, a simple S-expression parser can be defined as follows: 53 | ```text 54 | lexer { 55 | definition BLANK = ' '; 56 | definition DIGIT = '0' .. '9'; 57 | definition ALPHA = 'a' .. 'z' | 'A' .. 'Z'; 58 | active token LPAREN = '('; 59 | active token RPAREN = ')'; 60 | active token ATOM = ALPHA ~ (ALPHA | DIGIT)*; 61 | silent token WHITESPACE = (BLANK | '\t' | '\n' | '\r')+; 62 | } 63 | parser sexpr { 64 | active fixpoint compound 65 | = LPAREN ~ (compound | atom) * ~ RPAREN; 66 | 67 | active definition atom 68 | = ATOM; 69 | 70 | active definition sexpr 71 | = compound | atom; 72 | } 73 | ``` 74 | 75 |
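A build script for this grammar would look like the sketch below (the file names `sexpr.pag` and `src/parser.rs` are illustrative; the full workflow is described under "How to compile and use a grammar file"):

```rust
// build.rs -- an illustrative sketch: compile the grammar above into a parser module.
// Adjust the input grammar path and the generated output path to your crate layout.
fn main() {
    pag_compiler::compile("sexpr.pag", "src/parser.rs");
    // Re-run the build script whenever the grammar changes.
    println!("cargo:rerun-if-changed=sexpr.pag");
}
```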
76 | How to write a grammar file 77 | 78 | You can write your own grammar file by following these rules: 79 | 80 | - A grammar file must contain both lexer and parser parts. 81 | - A `definition` in the lexer part is a `macro` representing some common lexical rules. A definition 82 | itself does not count as a token, similar to `fragment` in ANTLR. 83 | - A lexer can have at most one `silent` token. `silent` tokens are automatically skipped during 84 | parsing. 85 | - All rules defined in the lexer part must be fully uppercase. 86 | - You can use 87 | - empty (`'_'`) 88 | - characters (`'a', '\x12', '😊'`) 89 | - strings (`"你好", "Rust"`) 90 | - ranges (`'A' .. 'Z'`) 91 | - sequences (`'a' ~ 'b'`) 92 | - alternatives (`'a' | 'b'`) 93 | - optionals (`'a'?`) 94 | - zero-or-mores (`'a'*`) 95 | - one-or-mores (`'a'+`) 96 | - complements (`!'a'`) 97 | 98 | to build your regular expressions (a small example follows after this list). Notice that complement is not negative lookahead in the usual sense. Rather, 99 | it denotes the complement of the negated characters or languages. Active tokens must not be nullable. 100 | - The parser part must have an entrypoint specified in its header. 101 | - Strings/characters/ranges cannot be used directly in the parser part, but parser rules can refer to tokens defined in the lexer. 102 | - Parser rules are all in lowercase. 103 | - Most combinators from the lexer part are also supported in the parser part, except for complement. 104 | 105 | For more complicated examples, see [json.pag](benches/json/json.pag). 106 |
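As a small illustration of the constructs listed above, here is a lexer fragment (token names are made up; a complete grammar file would also need a parser part):

```text
lexer {
    definition HEXDIG = '0' .. '9' | 'a' .. 'f' | 'A' .. 'F';
    active token INT    = '-'? ~ ('0' .. '9')+;
    active token HEX    = "0x" ~ HEXDIG+;
    silent token BLANKS = (' ' | '\t' | '\n' | '\r')+;
}
```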
107 | 108 |
109 | How to compile and use a grammar file 110 | 111 | The recommended way to compile your grammar file is to add `pag-compiler` as a build dependency. With `pag-compiler`, 112 | the parser file can be generated in a build script as follows: 113 | ```rust 114 | fn main() { 115 | pag_compiler::compile("csv.pag", "src/parser.rs"); 116 | println!("cargo:rerun-if-changed=csv.pag"); 117 | } 118 | ``` 119 | 120 | For performance reasons, only nightly Rust (1.71+) is supported for now. The crate containing the generated parser file 121 | must also be annotated with (see the sketch below for how the pieces fit together) 122 | ```rust 123 | #![feature(portable_simd)] 124 | #![feature(core_intrinsics)] 125 | #![feature(array_chunks)] 126 | ``` 127 |
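Putting the pieces together, the crate root might look like the following minimal sketch; the `parser` module name simply matches the `src/parser.rs` output path used in the build script above, so adjust both together if you pick a different path:

```rust
// src/lib.rs -- a minimal sketch of a crate embedding a generated parser.
#![feature(portable_simd)]
#![feature(core_intrinsics)]
#![feature(array_chunks)]

// src/parser.rs is produced by pag-compiler from the build script at compile time.
mod parser;
```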
128 | 129 | ## Performance 130 | 131 | We are continuously working on improving the quality of the generated parsers. For now, on CSV/JSON workloads, 132 | the performance is close to, or even better than, that of specialized parsers. 133 | ``` 134 | === Random Generated CSV === 135 | throughput/pag time: [635.88 µs 637.64 µs 639.46 µs] 136 | thrpt: [622.63 MiB/s 624.41 MiB/s 626.14 MiB/s] 137 | throughput/csv time: [528.36 µs 541.72 µs 559.54 µs] 138 | thrpt: [711.56 MiB/s 734.97 MiB/s 753.55 MiB/s] 139 | throughput/pest time: [3.7278 ms 3.7364 ms 3.7460 ms] 140 | thrpt: [106.29 MiB/s 106.56 MiB/s 106.80 MiB/s] 141 | === Random Generated JSON === 142 | random-json/pag-json time: [22.634 ns 22.650 ns 22.666 ns] 143 | thrpt: [84.149 MiB/s 84.209 MiB/s 84.271 MiB/s] 144 | random-json/serde-json time: [12.493 ns 12.587 ns 12.694 ns] 145 | thrpt: [150.26 MiB/s 151.54 MiB/s 152.67 MiB/s] 146 | random-json/pest-json time: [177.38 ns 178.17 ns 179.17 ns] 147 | thrpt: [10.645 MiB/s 10.705 MiB/s 10.753 MiB/s] 148 | === twitter.json === 149 | twitter-json/pag-json time: [1.0923 ms 1.0941 ms 1.0961 ms] 150 | thrpt: [667.24 MiB/s 668.46 MiB/s 669.59 MiB/s] 151 | twitter-json/serde-json time: [1.2281 ms 1.2295 ms 1.2312 ms] 152 | thrpt: [594.02 MiB/s 594.88 MiB/s 595.54 MiB/s] 153 | twitter-json/pest-json time: [5.2977 ms 5.3055 ms 5.3148 ms] 154 | thrpt: [137.61 MiB/s 137.85 MiB/s 138.06 MiB/s] 155 | ``` 156 | 157 |
158 | Why is it fast and how can I make my grammar faster 159 | 160 | - Thanks to the work of the Flap parser, we can fuse the lexer and parser together so that lexers can be localized. 161 | - We apply tail-call optimizations explicitly. To take advantage of this, define more grammar rules using `*` or `+`, or 162 | mark rules as silent where possible (see the sketch after this list). 163 | - We apply a batched lookahead strategy using SIMD or lookup tables. This optimization applies when you repeat simple character sets 164 | (for instance, `(BLANK | '\t' | '\n' | '\r')+`). 165 | - We are working on inlining/reducing more operations involving state transitions and lexer-parser communication. 166 |
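For example, repetition-style rules such as the following (a sketch with assumed token names `FIELD`, `COMMA`, and `NEWLINE`) compile into loops that benefit from the explicit tail-call optimization:

```text
parser csv {
    active definition csv  = line ~ (NEWLINE ~ line)*;
    active definition line = FIELD ~ (COMMA ~ FIELD)*;
}
```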
167 | 168 | ## Diagnostic Grammar Error Check 169 | We provide diagnostic information for "type errors" in your grammar definitions. Here are some examples: 170 | 171 | **Left-recursion** 172 | ``` 173 | Error: Unguarded fixpoint 174 | ╭─[json.pag:39:5] 175 | │ 176 | 39 │ active fixpoint json = json ~ value; 177 | │ ─────────────────┬───────────────── 178 | │ ╰─────────────────── fixpoint rule json is not guarded -- your grammar is left-recursive 179 | ────╯ 180 | ``` 181 | **Sequence Ambiguity** 182 | 183 | > **Explanation**: there may be ambiguity when separating a sequence into two parts according to the grammar definition 184 | 185 | ``` 186 | Error: When type checking a sequence of rules, the following rules are ambiguous 187 | ╭─[json.pag:39:28] 188 | │ 189 | 39 │ active fixpoint test = NUMBER+ ~ NUMBER+; 190 | │ ───┬─── ───┬─── 191 | │ ╰─────────────── type info for left-hand side: nullable: false, first set: {NUMBER}, follow set: {NUMBER} 192 | │ │ 193 | │ ╰───── type info for right-hand side: nullable: false, first set: {NUMBER}, follow set: {NUMBER} 194 | ────╯ 195 | ``` 196 | 197 | **Alternation Ambiguity** 198 | > **Explanation**: there may be ambiguity when selecting a match in an alternation of two rules. 199 | ``` 200 | Error: When type checking an alternation of rules, the following rules are ambiguous 201 | ╭─[json.pag:39:28] 202 | │ 203 | 39 │ active fixpoint test = NUMBER+ | NUMBER; 204 | │ ───┬─── ───┬── 205 | │ ╰────────────── type info for left-hand side: nullable false, first set: NUMBER, follow set: NUMBER 206 | │ │ 207 | │ ╰──── type info for right-hand side: nullable false, first set: NUMBER, follow set: 208 | ────╯ 209 | ``` 210 | 211 | There is also diagnostic information for undefined references, nullable tokens in the lexer, character format errors, etc. 212 | -------------------------------------------------------------------------------- /pag-lexer/src/vector.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms.
8 | 9 | use crate::congruence::{approximate_congruence_class, meet}; 10 | use crate::derivative::derivative; 11 | use crate::intervals::Intervals; 12 | use crate::normalization::normalize; 13 | use crate::regex_tree::RegexTree; 14 | use crate::utilities::dbg_sort; 15 | 16 | use crate::lookahead::LoopOptimizer; 17 | use proc_macro2::{Literal, TokenStream}; 18 | use quote::{format_ident, quote}; 19 | use std::collections::{HashMap, HashSet}; 20 | use std::fmt::{Display, Formatter}; 21 | use std::rc::Rc; 22 | 23 | #[derive(Hash, PartialEq, Eq, Debug, Clone, Ord, PartialOrd)] 24 | pub struct Vector { 25 | regex_trees: Vec>, 26 | } 27 | 28 | impl Display for Vector { 29 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 30 | write!(f, "(")?; 31 | for (i, regex_tree) in self.regex_trees.iter().enumerate() { 32 | if i != 0 { 33 | write!(f, ", ")?; 34 | } 35 | write!(f, "{}", regex_tree)?; 36 | } 37 | write!(f, ")") 38 | } 39 | } 40 | 41 | impl Vector { 42 | pub fn new(iter: I) -> Self 43 | where 44 | I: IntoIterator>, 45 | { 46 | let regex_trees = iter.into_iter().collect(); 47 | Self { regex_trees } 48 | } 49 | 50 | pub fn is_byte_sequence(&self) -> bool { 51 | let mut iter = self 52 | .regex_trees 53 | .iter() 54 | .filter(|x| !matches!(x.as_ref(), RegexTree::Bottom)) 55 | .map(|x| x.is_byte_sequence()); 56 | matches!(iter.next(), Some(true)) && iter.next().is_none() 57 | } 58 | 59 | pub fn as_byte_sequence(&self) -> Option<(usize, Vec)> { 60 | let failing = self 61 | .regex_trees 62 | .iter() 63 | .filter(|x| matches!(x.as_ref(), RegexTree::Bottom)) 64 | .count(); 65 | if failing == self.regex_trees.len() - 1 { 66 | self.regex_trees 67 | .iter() 68 | .enumerate() 69 | .find_map(|(idx, x)| x.as_byte_sequence().map(|x| (idx, x))) 70 | } else { 71 | None 72 | } 73 | } 74 | 75 | pub fn derivative(&self, x: u8) -> Self { 76 | Vector { 77 | regex_trees: self 78 | .regex_trees 79 | .iter() 80 | .map(|t| derivative(t.clone(), x)) 81 | .collect(), 82 | } 83 | } 84 | 85 | pub fn accepting_state(&self) -> Option { 86 | self.regex_trees.iter().enumerate().find_map(|t| { 87 | if t.1.is_nullable() { 88 | Some(t.0) 89 | } else { 90 | None 91 | } 92 | }) 93 | } 94 | 95 | pub fn is_rejecting_state(&self) -> bool { 96 | self.regex_trees 97 | .iter() 98 | .all(|t| matches!(t.as_ref(), RegexTree::Bottom)) 99 | } 100 | 101 | pub fn approximate_congruence_class(&self) -> Vec { 102 | // meet all congruence classes for each regex tree 103 | self.regex_trees 104 | .iter() 105 | .map(|x| approximate_congruence_class(x)) 106 | .reduce(|acc, x| meet(acc.as_slice(), x.as_slice())) 107 | .unwrap_or_default() 108 | } 109 | 110 | pub fn normalize(&self) -> Self { 111 | let regex_trees = self 112 | .regex_trees 113 | .iter() 114 | .map(|x| normalize(x.clone())) 115 | .collect(); 116 | Self { regex_trees } 117 | } 118 | 119 | pub fn generate_dfa( 120 | &self, 121 | initial_idx: &TokenStream, 122 | optimizer: &mut LoopOptimizer, 123 | success_actions: &[TokenStream], 124 | failure_action: &TokenStream, 125 | ) -> TokenStream { 126 | let initial_state = { 127 | let initial_state = self.normalize(); 128 | let last_success = initial_state.accepting_state(); 129 | DfaState { 130 | state_vec: initial_state, 131 | last_success, 132 | } 133 | }; 134 | let mut dfa = build_dfa(initial_state.state_vec.clone()); 135 | let leaf_states = extract_leaf_states(&mut dfa); 136 | let initial_label = format_ident!("S{}", dfa[&initial_state].state_id); 137 | let actions = dbg_sort(&dfa, |(_, info)| info.state_id).map(|(state, info)| { 138 
| let label = format_ident!("S{}", info.state_id); 139 | if let Some((rule_idx, seq)) = state.state_vec.as_byte_sequence() { 140 | let literal = Literal::byte_string(&seq); 141 | let length = seq.len(); 142 | let on_success = &success_actions[rule_idx]; 143 | return quote! { 144 | State::#label => { 145 | if input[idx..].starts_with(#literal) { 146 | cursor = idx + #length; 147 | #on_success 148 | } else { 149 | #failure_action 150 | } 151 | }, 152 | }; 153 | } 154 | let transitions = info.transitions.iter().map(|(interval, target)| { 155 | if leaf_states.contains(target) { 156 | let rule_idx = target.last_success.unwrap(); 157 | let on_success = &success_actions[rule_idx]; 158 | return quote! { Some(#interval) => { cursor = idx + 1; #on_success }, }; 159 | } 160 | let target_label = format_ident!("S{}", dfa[target].state_id); 161 | quote! { Some(#interval) => state = State::#target_label, } 162 | }); 163 | let lookahead = optimizer.generate_lookahead(&dfa, state); 164 | let otherwise = state 165 | .last_success 166 | .and_then(|x| success_actions.get(x)) 167 | .unwrap_or(failure_action); 168 | let advance_cursor = if state.state_vec.accepting_state().is_some() { 169 | Some(quote!(cursor = idx;)) 170 | } else { 171 | None 172 | }; 173 | quote! { 174 | State::#label => { 175 | #lookahead 176 | #advance_cursor 177 | match input.get(idx) { 178 | #(#transitions)* 179 | _ => { #otherwise } 180 | } 181 | }, 182 | } 183 | }); 184 | 185 | let labels = dbg_sort(dfa.values(), |info| info.state_id) 186 | .map(|info| format_ident!("S{}", info.state_id)); 187 | 188 | quote! { 189 | enum State { 190 | #(#labels,)* 191 | } 192 | let mut idx = #initial_idx; 193 | let mut state = State::#initial_label; 194 | loop { 195 | match state { 196 | #(#actions)* 197 | } 198 | idx += 1; 199 | } 200 | } 201 | } 202 | } 203 | 204 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] 205 | pub struct DfaState { 206 | state_vec: Vector, 207 | last_success: Option, 208 | } 209 | 210 | #[derive(Debug, Clone)] 211 | pub struct DfaInfo { 212 | state_id: usize, 213 | pub(crate) transitions: Vec<(Intervals, DfaState)>, 214 | } 215 | 216 | pub type DfaTable = HashMap; 217 | 218 | fn explore_dfa_node(dfa: &mut DfaTable, state: DfaState, state_id: &mut usize) { 219 | dfa.insert( 220 | state.clone(), 221 | DfaInfo { 222 | state_id: *state_id, 223 | transitions: vec![], 224 | }, 225 | ); 226 | *state_id += 1; 227 | 228 | if state.state_vec.is_byte_sequence() { 229 | return; 230 | } 231 | 232 | let classes = state.state_vec.approximate_congruence_class(); 233 | let mut transitions = Vec::with_capacity(classes.len()); 234 | 235 | for intervals in classes { 236 | let char = intervals.representative(); 237 | let target = state.state_vec.derivative(char).normalize(); 238 | let last_success = target.accepting_state().or(state.last_success); 239 | let next = DfaState { 240 | state_vec: target, 241 | last_success, 242 | }; 243 | if !next.state_vec.is_rejecting_state() { 244 | transitions.push((intervals, next.clone())); 245 | if !dfa.contains_key(&next) { 246 | explore_dfa_node(dfa, next, state_id) 247 | } 248 | } 249 | } 250 | 251 | dfa.get_mut(&state).unwrap().transitions = transitions; 252 | } 253 | 254 | pub fn build_dfa(state: Vector) -> DfaTable { 255 | let mut state_id = 0; 256 | let mut dfa = HashMap::new(); 257 | let last_success = state.accepting_state(); 258 | let state = DfaState { 259 | state_vec: state, 260 | last_success, 261 | }; 262 | explore_dfa_node(&mut dfa, state, &mut state_id); 263 | #[cfg(pag_print_dfa)] 264 | 
print_dfa(&dfa); 265 | dfa 266 | } 267 | 268 | fn extract_leaf_states(dfa: &mut DfaTable) -> HashSet { 269 | // TODO: switch to `drain_filter` (nightly) / `extract_if` (hashbrown) 270 | let leaf_states = dfa 271 | .iter() 272 | .filter_map(|(state, info)| { 273 | if info.transitions.is_empty() && state.last_success.is_some() { 274 | Some(state.clone()) 275 | } else { 276 | None 277 | } 278 | }) 279 | .collect(); 280 | for s in &leaf_states { 281 | dfa.remove(s); 282 | } 283 | leaf_states 284 | } 285 | 286 | #[cfg(pag_print_dfa)] 287 | fn print_dfa(dfa: &DfaTable) { 288 | for (state, info) in dfa { 289 | println!( 290 | "S{}({:?}): {}", 291 | info.state_id, state.last_success, state.state_vec 292 | ); 293 | for (intervals, target) in &info.transitions { 294 | println!(" {} -> S{}", intervals, dfa[target].state_id); 295 | } 296 | } 297 | } 298 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /pag-parser/src/frontend/unicode.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 
8 | 9 | use pag_lexer::normalization::normalize; 10 | use pag_lexer::regex_tree::RegexTree; 11 | use smallvec::smallvec; 12 | use std::rc::Rc; 13 | 14 | pub fn encode_char(x: char) -> Rc { 15 | let mut buf = [0; 4]; 16 | normalize(Rc::new(RegexTree::Concat( 17 | x.encode_utf8(&mut buf) 18 | .bytes() 19 | .map(|b| Rc::new(RegexTree::single(b))) 20 | .collect(), 21 | ))) 22 | } 23 | 24 | fn full_range_2() -> Rc { 25 | Rc::new(RegexTree::Concat(smallvec![ 26 | Rc::new(RegexTree::range(0xc0..=0xdf)), 27 | Rc::new(RegexTree::range(0x80..=0xbf)) 28 | ])) 29 | } 30 | 31 | fn full_range_3() -> Rc { 32 | Rc::new(RegexTree::Concat(smallvec![ 33 | Rc::new(RegexTree::range(0xe0..=0xef)), 34 | Rc::new(RegexTree::range(0x80..=0xbf)), 35 | Rc::new(RegexTree::range(0x80..=0xbf)) 36 | ])) 37 | } 38 | 39 | fn encode_same_level1(x: char, y: char) -> Rc { 40 | encode_same_level_expanded(1, &[x as u8], &[y as u8]) 41 | } 42 | 43 | fn encode_same_level2(x: char, y: char) -> Rc { 44 | let x_fst = (0xc0 | (x as u32 >> 6)) as u8; 45 | let x_snd = (0x80 | (x as u32 & 0x3f)) as u8; 46 | let y_fst = (0xc0 | (y as u32 >> 6)) as u8; 47 | let y_snd = (0x80 | (y as u32 & 0x3f)) as u8; 48 | encode_same_level_expanded(2, &[x_fst, x_snd], &[y_fst, y_snd]) 49 | } 50 | 51 | fn encode_same_level3(x: char, y: char) -> Rc { 52 | let x_fst = (0xe0 | (x as u32 >> 12)) as u8; 53 | let x_snd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8; 54 | let x_trd = (0x80 | (x as u32 & 0x3f)) as u8; 55 | let y_fst = (0xe0 | (y as u32 >> 12)) as u8; 56 | let y_snd = (0x80 | ((y as u32 >> 6) & 0x3f)) as u8; 57 | let y_trd = (0x80 | (y as u32 & 0x3f)) as u8; 58 | encode_same_level_expanded(3, &[x_fst, x_snd, x_trd], &[y_fst, y_snd, y_trd]) 59 | } 60 | 61 | fn encode_same_level4(x: char, y: char) -> Rc { 62 | let x_fst = (0xf0 | (x as u32 >> 18)) as u8; 63 | let x_snd = (0x80 | ((x as u32 >> 12) & 0x3f)) as u8; 64 | let x_trd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8; 65 | let x_fth = (0x80 | (x as u32 & 0x3f)) as u8; 66 | let y_fst = (0xf0 | (y as u32 >> 18)) as u8; 67 | let y_snd = (0x80 | ((y as u32 >> 12) & 0x3f)) as u8; 68 | let y_trd = (0x80 | ((y as u32 >> 6) & 0x3f)) as u8; 69 | let y_fth = (0x80 | (y as u32 & 0x3f)) as u8; 70 | encode_same_level_expanded( 71 | 4, 72 | &[x_fst, x_snd, x_trd, x_fth], 73 | &[y_fst, y_snd, y_trd, y_fth], 74 | ) 75 | } 76 | 77 | const ALL_BF: [u8; 4] = [0xbf, 0xbf, 0xbf, 0xbf]; 78 | const ALL_80: [u8; 4] = [0x80, 0x80, 0x80, 0x80]; 79 | 80 | fn encode_same_level_expanded(level: usize, tuple_x: &[u8], tuple_y: &[u8]) -> Rc { 81 | if level == 1 { 82 | return Rc::new(RegexTree::range(tuple_x[0]..=tuple_y[0])); 83 | } 84 | if tuple_x[0] == tuple_y[0] { 85 | Rc::new(RegexTree::Concat(smallvec![ 86 | Rc::new(RegexTree::single(tuple_x[0])), 87 | encode_same_level_expanded(level - 1, &tuple_x[1..], &tuple_y[1..]), 88 | ])) 89 | } else { 90 | Rc::new(RegexTree::Union(smallvec![ 91 | Rc::new(RegexTree::Concat(smallvec![ 92 | Rc::new(RegexTree::single(tuple_x[0])), 93 | encode_same_level_expanded(level - 1, &tuple_x[1..], &ALL_BF), 94 | ])), 95 | Rc::new(RegexTree::Concat(smallvec![ 96 | Rc::new(RegexTree::range(tuple_x[0] + 1..=tuple_y[0] - 1)), 97 | encode_same_level_expanded(level - 1, &ALL_80, &ALL_BF), 98 | ])), 99 | Rc::new(RegexTree::Concat(smallvec![ 100 | Rc::new(RegexTree::single(tuple_y[0])), 101 | encode_same_level_expanded(level - 1, &ALL_80, &tuple_y[1..]), 102 | ])), 103 | ])) 104 | } 105 | } 106 | 107 | fn encode_le_expanded(level: usize, fst_bound: u8, tuple: &[u8]) -> Rc { 108 | if level == 1 { 109 | return 
Rc::new(RegexTree::range(fst_bound..=tuple[0])); 110 | } 111 | Rc::new(RegexTree::Union(smallvec![ 112 | Rc::new(RegexTree::Concat(smallvec![ 113 | Rc::new(RegexTree::single(tuple[0])), 114 | encode_le_expanded(level - 1, 0x80, &tuple[1..]), 115 | ])), 116 | Rc::new(RegexTree::Concat(smallvec![ 117 | Rc::new(RegexTree::range(fst_bound..=tuple[0] - 1)), 118 | encode_le_expanded(level - 1, 0x80, &ALL_BF), 119 | ])), 120 | ])) 121 | } 122 | 123 | fn encode_ge_expanded(level: usize, fst_bound: u8, tuple: &[u8]) -> Rc { 124 | if level == 1 { 125 | return Rc::new(RegexTree::range(tuple[0]..=fst_bound)); 126 | } 127 | Rc::new(RegexTree::Union(smallvec![ 128 | Rc::new(RegexTree::Concat(smallvec![ 129 | Rc::new(RegexTree::single(tuple[0])), 130 | encode_ge_expanded(level - 1, 0xBF, &tuple[1..]), 131 | ])), 132 | Rc::new(RegexTree::Concat(smallvec![ 133 | Rc::new(RegexTree::range(tuple[0] + 1..=fst_bound)), 134 | encode_ge_expanded(level - 1, 0xBF, &ALL_80), 135 | ])), 136 | ])) 137 | } 138 | 139 | fn encode_ge1(x: char) -> Rc { 140 | encode_ge_expanded(1, 0x7F, &[x as u8]) 141 | } 142 | 143 | fn encode_ge2(x: char) -> Rc { 144 | let x_fst = (0xc0 | (x as u32 >> 6)) as u8; 145 | let x_snd = (0x80 | (x as u32 & 0x3f)) as u8; 146 | encode_ge_expanded(2, 0xDF, &[x_fst, x_snd]) 147 | } 148 | 149 | fn encode_ge3(x: char) -> Rc { 150 | let x_fst = (0xe0 | (x as u32 >> 12)) as u8; 151 | let x_snd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8; 152 | let x_trd = (0x80 | (x as u32 & 0x3f)) as u8; 153 | encode_ge_expanded(3, 0xEF, &[x_fst, x_snd, x_trd]) 154 | } 155 | 156 | fn encode_le2(x: char) -> Rc { 157 | let x_fst = (0xc0 | (x as u32 >> 6)) as u8; 158 | let x_snd = (0x80 | (x as u32 & 0x3f)) as u8; 159 | encode_le_expanded(2, 0xC0, &[x_fst, x_snd]) 160 | } 161 | 162 | fn encode_le3(x: char) -> Rc { 163 | let x_fst = (0xe0 | (x as u32 >> 12)) as u8; 164 | let x_snd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8; 165 | let x_trd = (0x80 | (x as u32 & 0x3f)) as u8; 166 | encode_le_expanded(3, 0xE0, &[x_fst, x_snd, x_trd]) 167 | } 168 | 169 | fn encode_le4(x: char) -> Rc { 170 | let x_fst = (0xf0 | (x as u32 >> 18)) as u8; 171 | let x_snd = (0x80 | ((x as u32 >> 12) & 0x3f)) as u8; 172 | let x_trd = (0x80 | ((x as u32 >> 6) & 0x3f)) as u8; 173 | let x_fth = (0x80 | (x as u32 & 0x3f)) as u8; 174 | encode_le_expanded(4, 0xF0, &[x_fst, x_snd, x_trd, x_fth]) 175 | } 176 | 177 | fn try_encode_same_level(x: char, y: char) -> Option> { 178 | match (x as u32, y as u32) { 179 | (0x00..=0x7F, 0x00..=0x7F) => Some(encode_same_level1(x, y)), 180 | (0x80..=0x7FF, 0x80..=0x7FF) => Some(encode_same_level2(x, y)), 181 | (0x800..=0xFFFF, 0x800..=0xFFFF) => Some(encode_same_level3(x, y)), 182 | (0x10000..=0x10FFFF, 0x10000..=0x10FFFF) => Some(encode_same_level4(x, y)), 183 | _ => None, 184 | } 185 | } 186 | 187 | pub fn encode_range(x: char, y: char) -> Rc { 188 | if let Some(tree) = try_encode_same_level(x, y) { 189 | return normalize(tree); 190 | } 191 | let ranges = match (x as u32, y as u32) { 192 | (0x00..=0x7F, 0x80..=0x7FF) => vec![encode_ge1(x), encode_le2(y)], 193 | (0x00..=0x7F, 0x800..=0xFFFF) => vec![encode_ge1(x), full_range_2(), encode_le3(y)], 194 | (0x00..=0x7F, 0x10000..=0x10FFFF) => { 195 | vec![encode_ge1(x), full_range_2(), full_range_3(), encode_le4(y)] 196 | } 197 | (0x80..=0x7FF, 0x800..=0xFFFF) => vec![encode_ge2(x), encode_le3(y)], 198 | (0x80..=0x7FF, 0x10000..=0x10FFFF) => vec![encode_ge2(x), full_range_3(), encode_le4(y)], 199 | (0x800..=0xFFFF, 0x10000..=0x10FFFF) => vec![encode_ge3(x), encode_le4(y)], 200 
| _ => unreachable!(), 201 | }; 202 | // fold union 203 | normalize(Rc::new(RegexTree::Union(ranges.into_iter().collect()))) 204 | } 205 | 206 | #[cfg(test)] 207 | mod test { 208 | use super::*; 209 | 210 | #[test] 211 | fn test_encode_char() { 212 | assert_eq!(encode_char('a').to_string(), "a"); 213 | assert_eq!(encode_char('b').to_string(), "b"); 214 | assert_eq!(encode_char('æ').to_string(), r"(\xc3 ~ \xa6)"); 215 | assert_eq!(encode_char('我').to_string(), r"(\xe6 ~ \x88 ~ \x91)"); 216 | } 217 | 218 | #[test] 219 | fn test_encode_range() { 220 | assert_eq!(encode_range('a', 'a').to_string(), "a"); 221 | assert_eq!(encode_range('a', 'b').to_string(), "[a, b]"); 222 | assert_eq!( 223 | encode_range('\u{80}', '\u{88}').to_string(), 224 | r"(\xc2 ~ [\x80, \x88])" 225 | ); 226 | assert_eq!( 227 | encode_range('\u{81}', '\u{7FA}').to_string(), 228 | r"((\xc2 ~ [\x81, \xbf]) ∪ ([\xc3, \xde] ~ [\x80, \xbf]) ∪ (\xdf ~ [\x80, \xba]))" 229 | ); 230 | assert_eq!( 231 | encode_range('\u{800}', '\u{808}').to_string(), 232 | r"(\xe0 ~ \xa0 ~ [\x80, \x88])" 233 | ); 234 | assert_eq!( 235 | encode_range('\u{881}', '\u{FFA}').to_string(), 236 | r"(\xe0 ~ ((\xa2 ~ [\x81, \xbf]) ∪ ([\xa3, \xbe] ~ [\x80, \xbf]) ∪ (\xbf ~ [\x80, \xba])))" 237 | ); 238 | assert_eq!( 239 | encode_range('\u{901}', '\u{FF00}').to_string(), 240 | "((\\xe0 ~ ((\\xa4 ~ [\\x81, \\xbf]) ∪ ([\\xa5, \\xbe] ~ [\\x80, \\xbf]) ∪ (\\xbf ~ [\\x80, \\xbf]))) ∪ ([\\xe1, \\xee] ~ ((\\x80 ~ [\\x80, \\xbf]) ∪ ([\\x81, \\xbe] ~ [\\x80, \\xbf]) ∪ (\\xbf ~ [\\x80, \\xbf]))) ∪ (\\xef ~ ((\\x80 ~ [\\x80, \\xbf]) ∪ ([\\x81, \\xbb] ~ [\\x80, \\xbf]) ∪ (\\xbc ~ \\x80))))" 241 | ); 242 | assert_eq!( 243 | encode_range('a', '\u{90}').to_string(), 244 | r"([a, \x7f] ∪ ([\xc0, \xc1] ~ [\x80, \xbf]) ∪ (\xc2 ~ [\x80, \x90]))" 245 | ); 246 | assert_eq!( 247 | encode_range('a', '\u{801}').to_string(), 248 | r"([a, \x7f] ∪ ([\xc0, \xdf] ~ [\x80, \xbf]) ∪ (\xe0 ~ (([\x80, \x9f] ~ [\x80, \xbf]) ∪ (\xa0 ~ [\x80, \x81]))))" 249 | ); 250 | assert_eq!( 251 | encode_range('\u{99}', '\u{2771}').to_string(), 252 | "((\\xc2 ~ [\\x99, \\xbf]) ∪ ([\\xc3, \\xdf] ~ [\\x80, \\xbf]) ∪ ([\\xe0, \\xe1] ~ (([\\x80, \\xbe] ~ [\\x80, \\xbf]) ∪ (\\xbf ~ [\\x80, \\xbf]))) ∪ (\\xe2 ~ (([\\x80, \\x9c] ~ [\\x80, \\xbf]) ∪ (\\x9d ~ [\\x80, \\xb1]))))" 253 | ) 254 | } 255 | } 256 | -------------------------------------------------------------------------------- /pag-parser/src/nf.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // or the MIT 5 | // license , at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | use std::collections::{HashMap, HashSet}; 10 | use std::fmt::Display; 11 | 12 | use smallvec::{smallvec, SmallVec}; 13 | use typed_arena::Arena; 14 | 15 | use crate::{core_syntax::Term, frontend::syntax::Parser, utilities::Symbol}; 16 | 17 | // thinking a while... 
18 | 19 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] 20 | pub struct Tag<'src> { 21 | symbol: Symbol<'src>, 22 | version: u32, 23 | } 24 | 25 | impl<'src> Tag<'src> { 26 | pub fn new(symbol: Symbol<'src>) -> Self { 27 | Self { symbol, version: 0 } 28 | } 29 | 30 | pub fn is_original(&self) -> bool { 31 | self.version == 0 32 | } 33 | 34 | pub fn symbol(&self) -> Symbol<'src> { 35 | self.symbol 36 | } 37 | } 38 | 39 | impl<'src> Display for Tag<'src> { 40 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 41 | self.symbol.fmt(f)?; 42 | if self.version > 0 { 43 | write!(f, "_{}", self.version)?; 44 | } 45 | Ok(()) 46 | } 47 | } 48 | 49 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 50 | pub enum Action<'src> { 51 | Subroutine(Tag<'src>), 52 | Summarize(Symbol<'src>), 53 | } 54 | 55 | impl<'src> Display for Action<'src> { 56 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 57 | match self { 58 | Action::Subroutine(tag) => write!(f, "{}", tag), 59 | Action::Summarize(tag) => write!(f, "[{}]", tag), 60 | } 61 | } 62 | } 63 | 64 | impl<'src> Action<'src> { 65 | fn symbol(&self) -> Symbol<'src> { 66 | match self { 67 | Action::Subroutine(tag) => tag.symbol, 68 | Action::Summarize(sym) => *sym, 69 | } 70 | } 71 | } 72 | 73 | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] 74 | pub enum NormalForm<'src> { 75 | Empty(SmallVec<[Symbol<'src>; 1]>), 76 | Unexpanded(SmallVec<[Action<'src>; 1]>), 77 | Sequence { 78 | terminal: Symbol<'src>, 79 | nonterminals: SmallVec<[Action<'src>; 1]>, 80 | }, 81 | } 82 | 83 | impl<'src> Display for NormalForm<'src> { 84 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 85 | match self { 86 | NormalForm::Empty(trees) => trees.iter().fold(write!(f, "ε"), |acc, x| { 87 | acc.and_then(|_| write!(f, " [{x}]")) 88 | }), 89 | NormalForm::Sequence { 90 | terminal, 91 | nonterminals, 92 | } => { 93 | write!(f, "{terminal}")?; 94 | for i in nonterminals { 95 | write!(f, " {i}")?; 96 | } 97 | Ok(()) 98 | } 99 | NormalForm::Unexpanded(tags) => { 100 | write!(f, "{}", tags[0])?; 101 | for i in &tags[1..] 
{ 102 | write!(f, " {i}")?; 103 | } 104 | Ok(()) 105 | } 106 | } 107 | } 108 | } 109 | 110 | pub struct NormalForms<'src, 'a> { 111 | pub entries: HashMap<Tag<'src>, SmallVec<[&'a NormalForm<'src>; 4]>>, 112 | } 113 | 114 | impl<'src, 'a> NormalForms<'src, 'a> { 115 | pub fn new() -> Self { 116 | Self { 117 | entries: HashMap::new(), 118 | } 119 | } 120 | } 121 | 122 | impl<'src, 'a> Display for NormalForms<'src, 'a> { 123 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 124 | for (tag, nf) in self.entries.iter() { 125 | for i in nf { 126 | writeln!(f, "{tag} -> {i}")?; 127 | } 128 | } 129 | Ok(()) 130 | } 131 | } 132 | 133 | pub fn semi_normalize<'src, 'p, 'nf>( 134 | target: &Term<'src, 'p>, 135 | symbol: Symbol<'src>, 136 | arena: &'nf Arena<NormalForm<'src>>, 137 | nfs: &mut NormalForms<'src, 'nf>, 138 | tag_cnt: &mut u32, 139 | parser: &Parser<'src, 'p>, 140 | ) -> Tag<'src> { 141 | let version = *tag_cnt; 142 | *tag_cnt += 1; 143 | let tag = Tag { symbol, version }; 144 | 145 | match target { 146 | Term::Epsilon => { 147 | let nf = smallvec![&*arena.alloc(NormalForm::Empty(Default::default()))]; 148 | nfs.entries.insert(tag, nf); 149 | tag 150 | } 151 | Term::Sequence(x, y) => { 152 | let x_tag = semi_normalize(&x.node, symbol, arena, nfs, tag_cnt, parser); 153 | let y_tag = semi_normalize(&y.node, symbol, arena, nfs, tag_cnt, parser); 154 | let acts = smallvec![Action::Subroutine(x_tag), Action::Subroutine(y_tag)]; 155 | let nf = smallvec![&*arena.alloc(NormalForm::Unexpanded(acts))]; 156 | nfs.entries.insert(tag, nf); 157 | tag 158 | } 159 | Term::LexerRef(lexer) => { 160 | let nf = smallvec![&*arena.alloc(NormalForm::Sequence { 161 | terminal: *lexer, 162 | nonterminals: SmallVec::new(), 163 | })]; 164 | nfs.entries.insert(tag, nf); 165 | tag 166 | } 167 | Term::Bottom => { 168 | let nf = SmallVec::new(); 169 | nfs.entries.insert(tag, nf); 170 | tag 171 | } 172 | Term::Alternative(x, y) => { 173 | let x_tag = semi_normalize(&x.node, symbol, arena, nfs, tag_cnt, parser); 174 | let y_tag = semi_normalize(&y.node, symbol, arena, nfs, tag_cnt, parser); 175 | let nf = smallvec![ 176 | &*arena.alloc(NormalForm::Unexpanded(smallvec![Action::Subroutine(x_tag)])), 177 | &*arena.alloc(NormalForm::Unexpanded(smallvec![Action::Subroutine(y_tag)])), 178 | ]; 179 | nfs.entries.insert(tag, nf); 180 | tag 181 | } 182 | Term::Fix(var, body) => { 183 | let body_tag = semi_normalize(&body.node, *var, arena, nfs, &mut 0, parser); 184 | if symbol != *var { 185 | nfs.entries.insert(tag, nfs.entries[&body_tag].clone()); 186 | } 187 | body_tag 188 | } 189 | Term::ParserRef(x) => { 190 | let ref_tag = Tag::new(*x); 191 | if parser.is_active(&ref_tag) { 192 | let acts = smallvec![Action::Subroutine(ref_tag), Action::Summarize(*x)]; 193 | let nf = smallvec![&*arena.alloc(NormalForm::Unexpanded(acts))]; 194 | nfs.entries.insert(tag, nf); 195 | tag 196 | } else if tag.version == 0 { 197 | let acts = smallvec![Action::Subroutine(ref_tag)]; 198 | let nf = smallvec![&*arena.alloc(NormalForm::Unexpanded(acts))]; 199 | nfs.entries.insert(tag, nf); 200 | tag 201 | } else { 202 | ref_tag 203 | } 204 | } 205 | } 206 | } 207 | 208 | pub fn fully_normalize<'src, 'nf>( 209 | arena: &'nf Arena<NormalForm<'src>>, 210 | nfs: &mut NormalForms<'src, 'nf>, 211 | ) { 212 | let mut updates = Vec::new(); 213 | loop { 214 | for (tag, i) in nfs.entries.iter() { 215 | if !i.iter().any(|x| matches!(x, NormalForm::Unexpanded(..))) { 216 | continue; 217 | } 218 | let mut result = SmallVec::new(); 219 | for j in i.iter() { 220 | let
NormalForm::Unexpanded(actions) = j else { 221 | result.push(*j); 222 | continue; 223 | }; 224 | let first_subroutine = actions.iter().enumerate().find_map(|(index, act)| { 225 | if let Action::Subroutine(x) = act { 226 | Some((index, x)) 227 | } else { 228 | None 229 | } 230 | }); 231 | match first_subroutine { 232 | None => { 233 | let nf = NormalForm::Empty(actions.iter().map(|x| x.symbol()).collect()); 234 | result.push(&*arena.alloc(nf)); 235 | } 236 | Some((index, x)) => { 237 | let variable_nf = &nfs.entries[x]; 238 | for k in variable_nf.iter().copied() { 239 | let head = actions[..index].iter().cloned(); 240 | let tail = actions[index + 1..].iter().cloned(); 241 | match k { 242 | NormalForm::Empty(trees) => { 243 | let insert = trees.iter().map(|x| Action::Summarize(*x)); 244 | let acts = head.chain(insert).chain(tail).collect(); 245 | result.push(&*arena.alloc(NormalForm::Unexpanded(acts))); 246 | } 247 | NormalForm::Unexpanded(subacts) => { 248 | let insert = subacts.iter().cloned(); 249 | let acts = head.chain(insert).chain(tail).collect(); 250 | result.push(&*arena.alloc(NormalForm::Unexpanded(acts))); 251 | } 252 | NormalForm::Sequence { 253 | terminal, 254 | nonterminals, 255 | } => { 256 | let insert = nonterminals.iter().cloned(); 257 | let acts = head.chain(insert).chain(tail).collect(); 258 | result.push(&*arena.alloc(NormalForm::Sequence { 259 | terminal: *terminal, 260 | nonterminals: acts, 261 | })); 262 | } 263 | } 264 | } 265 | } 266 | } 267 | } 268 | updates.push((*tag, result)); 269 | } 270 | if updates.is_empty() { 271 | break; 272 | } 273 | nfs.entries.extend(updates.drain(..)); 274 | } 275 | } 276 | 277 | pub fn merge_inactive_rules<'src, 'nf>( 278 | nfs: &mut NormalForms<'src, 'nf>, 279 | parser: &Parser<'src, '_>, 280 | arena: &'nf Arena<NormalForm<'src>>, 281 | ) { 282 | // sort all rules 283 | for i in nfs.entries.values_mut() { 284 | i.sort_unstable(); 285 | } 286 | let mut table: HashMap<&[&NormalForm], Tag<'src>> = HashMap::new(); 287 | let mut rename = Vec::new(); 288 | for (tag, nf) in nfs.entries.iter() { 289 | if parser.is_active(tag) { 290 | continue; 291 | } 292 | table 293 | .entry(nf.as_slice()) 294 | .and_modify(|new_tag| rename.push((*tag, *new_tag))) 295 | .or_insert(*tag); 296 | } 297 | for (tag, new_tag) in rename { 298 | nfs.entries.remove(&tag); 299 | for i in nfs.entries.values_mut() { 300 | for j in i.iter_mut() { 301 | let NormalForm::Sequence { 302 | terminal, 303 | nonterminals, 304 | } = j else { continue }; 305 | if nonterminals.contains(&Action::Subroutine(tag)) { 306 | *j = &*arena.alloc(NormalForm::Sequence { 307 | terminal: *terminal, 308 | nonterminals: nonterminals 309 | .iter() 310 | .map(|x| { 311 | if *x == Action::Subroutine(tag) { 312 | Action::Subroutine(new_tag) 313 | } else { 314 | *x 315 | } 316 | }) 317 | .collect(), 318 | }); 319 | } 320 | } 321 | } 322 | } 323 | } 324 | 325 | pub fn remove_unreachable_rules<'src>(nfs: &mut NormalForms<'src, '_>, parser: &Parser<'src, '_>) { 326 | fn dfs<'src>( 327 | nfs: &NormalForms<'src, '_>, 328 | current: Tag<'src>, 329 | visited: &mut HashSet<Tag<'src>>, 330 | ) { 331 | if visited.contains(&current) { 332 | return; 333 | } 334 | visited.insert(current); 335 | let Some(tag) = nfs.entries.get(&current) else { return }; 336 | for i in tag { 337 | let NormalForm::Sequence { nonterminals, ..
} = i else { continue }; 338 | for i in nonterminals { 339 | let Action::Subroutine(x) = i else { continue }; 340 | dfs(nfs, *x, visited); 341 | } 342 | } 343 | } 344 | 345 | let mut visited = HashSet::new(); 346 | dfs(nfs, Tag::new(parser.entrypoint), &mut visited); 347 | nfs.entries.retain(|k, _| visited.contains(k)); 348 | } 349 | -------------------------------------------------------------------------------- /pag-parser/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Paguroidea Developers 2 | // 3 | // Licensed under the Apache License, Version 2.0 4 | // <LICENSE-APACHE or https://www.apache.org/licenses/LICENSE-2.0> or the MIT 5 | // license <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 6 | // option. All files in the project carrying such notice may not be copied, 7 | // modified, or distributed except according to those terms. 8 | 9 | mod core_syntax; 10 | mod frontend; 11 | mod fusion; 12 | mod nf; 13 | mod type_system; 14 | mod utilities; 15 | 16 | use ariadne::{Color, Label, Report, ReportKind, Source}; 17 | use proc_macro2::TokenStream; 18 | use quote::format_ident; 19 | use typed_arena::Arena; 20 | 21 | use std::ops::Range; 22 | 23 | use core_syntax::TermArena; 24 | use frontend::{ 25 | lexical::construct_lexer_database, syntax::construct_parser, FrontendError, 26 | GrammarDefinitionError, 27 | }; 28 | use fusion::fusion_parser; 29 | use nf::{ 30 | fully_normalize, merge_inactive_rules, remove_unreachable_rules, semi_normalize, NormalForms, 31 | }; 32 | use type_system::TypeError; 33 | use utilities::unreachable_branch; 34 | 35 | pub enum Error<'src> { 36 | GrammarDefinitionError(GrammarDefinitionError<'src>), 37 | FrontendErrors(Vec<FrontendError<'src>>), 38 | TypeErrors(Vec<TypeError<'src>>), 39 | } 40 | 41 | impl<'src> From<Vec<FrontendError<'src>>> for Error<'src> { 42 | fn from(errors: Vec<FrontendError<'src>>) -> Self { 43 | Error::FrontendErrors(errors) 44 | } 45 | } 46 | 47 | impl<'src> From<Vec<TypeError<'src>>> for Error<'src> { 48 | fn from(errors: Vec<TypeError<'src>>) -> Self { 49 | Error::TypeErrors(errors) 50 | } 51 | } 52 | 53 | impl<'src> Error<'src> { 54 | pub fn report_stderr(&self, input_name: &str, input: &'src str) -> Result<(), std::io::Error> { 55 | let mut cache = (input_name, Source::from(input)); 56 | for i in self.to_reports(input_name) { 57 | i.eprint(&mut cache)?; 58 | } 59 | Ok(()) 60 | } 61 | 62 | pub fn report_stdout(&self, input_name: &str, input: &'src str) -> Result<(), std::io::Error> { 63 | let mut cache = (input_name, Source::from(input)); 64 | for i in self.to_reports(input_name) { 65 | i.print(&mut cache)?; 66 | } 67 | Ok(()) 68 | } 69 | 70 | pub fn to_reports<'a>(&self, input_name: &'a str) -> Vec<Report<'a, (&'a str, Range<usize>)>> { 71 | match self { 72 | Error::GrammarDefinitionError(e) => { 73 | use GrammarDefinitionError::*; 74 | vec![match e { 75 | SyntaxError(x) => { 76 | let span = match x.location { 77 | pest::error::InputLocation::Pos(x) => x..x+1, 78 | pest::error::InputLocation::Span((x, y)) => x..y, 79 | }; 80 | Report::build(ReportKind::Error, input_name, span.start) 81 | .with_message("Syntax error in grammar definition") 82 | .with_label(Label::new((input_name, span)) 83 | .with_message(format!("{}", x.variant.message())) 84 | .with_color(Color::Red)) 85 | .finish() 86 | }, 87 | FormatError { span, message } => { 88 | Report::build(ReportKind::Error, input_name, span.start()) 89 | .with_message("Format error in grammar definition") 90 | .with_label(Label::new((input_name, span.start()..span.end())) 91 | .with_message(format!("{}", message)) 92 | .with_color(Color::Red)) 93 | .finish() 94 | }, 95 | ParserLogicError(e) => { 96 | Report::build(ReportKind::Error, input_name, 0) 97 |
.with_message(format!("Internal logical error when parsing grammar definition {}", e)) 98 | .finish() 99 | }, 100 | UnexpectedEOI(e) => { 101 | Report::build(ReportKind::Error, input_name, 0) 102 | .with_message(format!("Internal logical error when parsing grammar definition, pest parser failed to give {}", e)) 103 | .finish() 104 | }, 105 | }] 106 | }, 107 | Error::FrontendErrors(errors) => errors 108 | .iter() 109 | .map(|e| { 110 | use FrontendError::*; 111 | match &e { 112 | // InternalLogicalError(span, msg) => { 113 | // Report::build(ReportKind::Error, input_name, span.start()) 114 | // .with_message("Internal logical error encountered") 115 | // .with_label(Label::new((input_name, span.start()..span.end())) 116 | // .with_message(msg) 117 | // .with_color(Color::Red)) 118 | // .finish() 119 | // }, 120 | MultipleDefinition(fst, snd) => { 121 | Report::build(ReportKind::Error, input_name, snd.start()) 122 | .with_message(format!("Multiple definition of {}", fst.as_str())) 123 | .with_label(Label::new((input_name, fst.start()..fst.end())) 124 | .with_message("first definition") 125 | .with_color(Color::Green)) 126 | .with_label(Label::new((input_name, snd.start()..snd.end())) 127 | .with_message("second definition") 128 | .with_color(Color::Blue)) 129 | .finish() 130 | }, 131 | UndefinedLexicalRuleReference(span) => { 132 | Report::build(ReportKind::Error, input_name, span.start()) 133 | .with_message("Undefined lexical rule reference") 134 | .with_label(Label::new((input_name, span.start()..span.end())) 135 | .with_message(format!("lexcical rule {} is undefined", span.as_str())) 136 | .with_color(Color::Red)) 137 | .finish() 138 | }, 139 | CyclicLexicalRuleReference(span) => { 140 | Report::build(ReportKind::Error, input_name, span.start()) 141 | .with_message("Cyclic lexical rule reference") 142 | .with_label(Label::new((input_name, span.start()..span.end())) 143 | .with_message("this reference causes cyclic dependency") 144 | .with_color(Color::Red)) 145 | .finish() 146 | }, 147 | UndefinedParserRuleReference(span) => { 148 | Report::build(ReportKind::Error, input_name, span.start()) 149 | .with_message("Undefined parser rule reference") 150 | .with_label(Label::new((input_name, span.start()..span.end())) 151 | .with_message(format!("parser rule {} is undefined", span.as_str())) 152 | .with_color(Color::Red)) 153 | .finish() 154 | }, 155 | MultipleSkippingRule(fst, snd) => { 156 | Report::build(ReportKind::Error, input_name, snd.start()) 157 | .with_message("Skipping lexical rule is already defined") 158 | .with_label(Label::new((input_name, fst.start()..fst.end())) 159 | .with_message("first definition") 160 | .with_color(Color::Green)) 161 | .with_label(Label::new((input_name, snd.start()..snd.end())) 162 | .with_message("second definition") 163 | .with_color(Color::Blue)) 164 | .finish() 165 | }, 166 | NullableToken(name, span) => { 167 | Report::build(ReportKind::Error, input_name, span.start()) 168 | .with_message("Nullable token detected") 169 | .with_label(Label::new((input_name, span.start()..span.end())) 170 | .with_message(format!("token {name} is nullable")) 171 | .with_color(Color::Red)) 172 | .finish() 173 | }, 174 | } 175 | }) 176 | .collect::>(), 177 | Error::TypeErrors(errors) => errors 178 | .iter() 179 | .map(|e| { 180 | use TypeError::*; 181 | match e { 182 | SequentialUniquenessViolation { lhs, rhs, total } => { 183 | Report::build(ReportKind::Error, input_name, total.start()) 184 | .with_message("When type checking a sequence of rules, the following rules are 
ambiguous") 185 | .with_label(Label::new((input_name, lhs.0.start()..lhs.0.end())) 186 | .with_message(format!("type info for left-hand side: nullable: {}, first set: {{{}}}, follow set: {{{}}}", 187 | lhs.1.nullable, lhs.1.first.iter().map(|x|x.name()).collect::>().join(", "), 188 | lhs.1.follow.iter().map(|x|x.name()).collect::>().join(", ") 189 | )) 190 | .with_color(Color::Green)) 191 | .with_label(Label::new((input_name, rhs.0.start()..rhs.0.end())) 192 | .with_message(format!("type info for right-hand side: nullable: {}, first set: {{{}}}, follow set: {{{}}}", 193 | rhs.1.nullable, rhs.1.first.iter().map(|x|x.name()).collect::>().join(", "), 194 | rhs.1.follow.iter().map(|x|x.name()).collect::>().join(", ") 195 | )) 196 | .with_color(Color::Blue)) 197 | .finish() 198 | }, 199 | DisjunctiveUniquenessViolation { lhs, rhs, total } => { 200 | Report::build(ReportKind::Error, input_name, total.start()) 201 | .with_message("When type checking an alternation of rules, the following rules are ambiguous") 202 | .with_label(Label::new((input_name, lhs.0.start()..lhs.0.end())) 203 | .with_message(format!("type info for left-hand side: nullable: {}, first set: {{{}}}, follow set: {{{}}}", 204 | lhs.1.nullable, lhs.1.first.iter().map(|x|x.name()).collect::>().join(", "), 205 | lhs.1.follow.iter().map(|x|x.name()).collect::>().join(", ") 206 | )) 207 | .with_color(Color::Green)) 208 | .with_label(Label::new((input_name, rhs.0.start()..rhs.0.end())) 209 | .with_message(format!("type info for right-hand side: nullable: {}, first set: {{{}}}, follow set: {{{}}}", 210 | rhs.1.nullable, rhs.1.first.iter().map(|x|x.name()).collect::>().join(", "), 211 | rhs.1.follow.iter().map(|x|x.name()).collect::>().join(", ") 212 | )) 213 | .with_color(Color::Blue)) 214 | .finish() 215 | }, 216 | UnguardedFixpoint(sym, span) => { 217 | Report::build(ReportKind::Error, input_name, span.start()) 218 | .with_message("Unguarded fixpoint") 219 | .with_label(Label::new((input_name, span.start()..span.end())) 220 | .with_message(format!("fixpoint rule {} is not guarded -- your grammar is left-recursive", sym)) 221 | .with_color(Color::Red)) 222 | .finish() 223 | }, 224 | UnresolvedReference(sym, span) => { 225 | Report::build(ReportKind::Error, input_name, span.start()) 226 | .with_message("Unresolved reference") 227 | .with_label(Label::new((input_name, span.start()..span.end())) 228 | .with_message(format!("cannot resolve parser rule {} within context -- did you forget to put recursive rule into fixpoint", sym)) 229 | .with_color(Color::Red)) 230 | .finish() 231 | }, 232 | } 233 | }) 234 | .collect::>(), 235 | } 236 | } 237 | } 238 | 239 | pub fn generate_parser(input: &str) -> Result { 240 | use frontend::SurfaceSyntaxTree::Grammar; 241 | 242 | let sst = frontend::parse(input)?; 243 | let Grammar { lexer, parser } = &sst.node else { 244 | unreachable_branch!("the entrypoint of sst can only be Grammar") 245 | }; 246 | let lexer_database = construct_lexer_database(lexer)?; 247 | lexer_database.nullability_check()?; 248 | let term_arena = TermArena::new(); 249 | let mut parser = construct_parser(&term_arena, lexer_database, parser)?; 250 | parser.infer_fixpoints(); 251 | let type_errs = parser.type_check(); 252 | if !type_errs.is_empty() { 253 | return Err(Error::TypeErrors(type_errs)); 254 | } 255 | let nf_arena = Arena::new(); 256 | let mut nfs = NormalForms::new(); 257 | for (symbol, rule) in parser.bindings.iter() { 258 | semi_normalize( 259 | &rule.term.node, 260 | *symbol, 261 | &nf_arena, 262 | &mut nfs, 263 | &mut 
0, 264 | &parser, 265 | ); 266 | } 267 | fully_normalize(&nf_arena, &mut nfs); 268 | merge_inactive_rules(&mut nfs, &parser, &nf_arena); 269 | remove_unreachable_rules(&mut nfs, &parser); 270 | let parser_routines = fusion_parser(&nfs, &parser); 271 | let entrypoint = format_ident!("parse_{}", parser.entrypoint.name()); 272 | Ok(quote::quote! { 273 | #![allow( 274 | dead_code, 275 | non_camel_case_types, 276 | unused_variables, 277 | unused_mut, 278 | unreachable_code, 279 | unused_assignments, 280 | clippy::single_match, 281 | clippy::never_loop, 282 | clippy::match_single_binding, 283 | )] 284 | #parser_routines 285 | pub fn parse(input: &str) -> Result { 286 | #entrypoint(input, 0) 287 | } 288 | }) 289 | } 290 | 291 | #[cfg(test)] 292 | mod tests; 293 | --------------------------------------------------------------------------------
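
For orientation, here is a minimal sketch of how a build script could drive the `generate_parser` entry point from `pag-parser/src/lib.rs`, with failures rendered through `Error::report_stderr` (the ariadne-based diagnostics defined above). This is an illustrative assumption, not the actual pag-compiler implementation: the helper name `compile_grammar` and the step that writes the `TokenStream` out verbatim are invented for the example.

// Sketch only: a hypothetical build-script driver for pag_parser::generate_parser.
// The name `compile_grammar` and the verbatim TokenStream dump are assumptions;
// the real pag-compiler crate may differ.
fn compile_grammar(grammar_path: &str, output_path: &str) {
    let input = std::fs::read_to_string(grammar_path).expect("cannot read grammar file");
    match pag_parser::generate_parser(&input) {
        Ok(tokens) => {
            // proc_macro2::TokenStream implements Display, so its text form is valid
            // Rust source; a real driver would likely pretty-print it before writing.
            std::fs::write(output_path, tokens.to_string()).expect("cannot write generated parser");
        }
        Err(err) => {
            // Render ariadne diagnostics whose spans point into the .pag source.
            err.report_stderr(grammar_path, &input)
                .expect("failed to render diagnostics");
            panic!("failed to compile grammar {grammar_path}");
        }
    }
}

The same `Error` value also exposes `report_stdout` and `to_reports`, so a caller that wants to collect or redirect the diagnostics instead of printing them to stderr can do so without changing the pipeline above.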