├── .github ├── FUNDING.yml ├── dependabot.yml └── workflows │ ├── clearcache.yml │ ├── coverage.yml │ ├── pages.yml │ ├── rustbench.yml │ ├── rustcheck.yml │ ├── rustdoc.yml │ ├── rustlib.yml │ ├── rustlints.yml │ └── rustmsrv.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Cargo.lock ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── RELEASE-PROCESS.md ├── book ├── book.toml ├── mdbook-admonish.css └── src │ ├── SUMMARY.md │ ├── assets │ ├── calculator_example_flow.png │ └── calculator_example_how_evaluator_works.png │ ├── attributes.md │ ├── attributes │ ├── logos.md │ └── token_and_regex.md │ ├── callbacks.md │ ├── common-regex.md │ ├── context-dependent-lexing.md │ ├── contributing.md │ ├── contributing │ ├── fuzzing.md │ ├── internals.md │ └── setup.md │ ├── debugging.md │ ├── examples.md │ ├── examples │ ├── brainfuck.md │ ├── calculator.md │ ├── json.md │ ├── json_borrowed.md │ └── string-interpolation.md │ ├── extras.md │ ├── getting-help.md │ ├── getting-started.md │ ├── intro.md │ ├── token-disambiguation.md │ └── unsafe.md ├── examples ├── brainfuck.rs ├── calculator.rs ├── custom_error.rs ├── example.json ├── extras.rs ├── hello_world.bf ├── json.rs ├── json_borrowed.rs └── string-interpolation.rs ├── fuzz ├── Cargo.toml ├── in │ ├── literal │ └── regex └── src │ └── main.rs ├── logos-cli ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── src │ └── main.rs └── tests │ ├── data │ ├── fmt_output.rs │ ├── input.rs │ └── output.rs │ └── tests.rs ├── logos-codegen ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── build.rs ├── src │ ├── error.rs │ ├── generator │ │ ├── context.rs │ │ ├── fork.rs │ │ ├── leaf.rs │ │ ├── mod.rs │ │ ├── rope.rs │ │ └── tables.rs │ ├── graph │ │ ├── fork.rs │ │ ├── impls.rs │ │ ├── meta.rs │ │ ├── mod.rs │ │ ├── range.rs │ │ ├── regex.rs │ │ └── rope.rs │ ├── leaf.rs │ ├── lib.rs │ ├── macros.rs │ ├── mir.rs │ ├── parser │ │ ├── definition.rs │ │ ├── ignore_flags.rs │ │ ├── mod.rs │ │ ├── nested.rs │ │ ├── subpattern.rs │ │ └── type_params.rs │ └── util.rs └── tests │ ├── codegen.rs │ └── data │ ├── no_error_lut │ ├── input.rs │ └── output.rs │ └── simple │ ├── input.rs │ └── output.rs ├── logos-derive ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT └── src │ └── lib.rs ├── logos.png ├── logos.svg ├── release.toml ├── src ├── internal.rs ├── lexer.rs ├── lib.rs └── source.rs └── tests ├── Cargo.toml ├── benches └── bench.rs ├── src └── lib.rs └── tests ├── advanced.rs ├── binary.rs ├── callbacks.rs ├── clone.rs ├── crate_.rs ├── css.rs ├── custom_error.rs ├── edgecase.rs ├── ignore_case.rs ├── lexer_modes.rs ├── properties.rs ├── simple.rs ├── source.rs ├── string.rs └── unicode_dot.rs /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [maciejhirsz] 4 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: github-actions 9 | # Workflow files stored in the 10 | # default location of `.github/workflows` 11 | directory: / 12 | schedule: 13 | interval: daily 14 | labels: 15 | - github-actions 16 | - dependencies 17 | -------------------------------------------------------------------------------- /.github/workflows/clearcache.yml: -------------------------------------------------------------------------------- 1 | # From: https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries 2 | name: Cleanup caches by a branch 3 | on: 4 | pull_request: 5 | types: 6 | - closed 7 | 8 | jobs: 9 | cleanup: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out code 13 | uses: actions/checkout@v4 14 | 15 | - name: Cleanup 16 | run: | 17 | gh extension install actions/gh-actions-cache 18 | 19 | REPO=${{ github.repository }} 20 | BRANCH="refs/pull/${{ github.event.pull_request.number }}/merge" 21 | 22 | echo "Fetching list of cache key" 23 | cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH | cut -f 1 ) 24 | 25 | ## Setting this to not fail the workflow while deleting cache keys. 26 | set +e 27 | echo "Deleting caches..." 28 | for cacheKey in $cacheKeysForPR 29 | do 30 | gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm 31 | done 32 | echo "Done" 33 | env: 34 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 35 | -------------------------------------------------------------------------------- /.github/workflows/coverage.yml: -------------------------------------------------------------------------------- 1 | name: Code Coverage 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - '**.rs' 7 | - .github/workflows/coverage.yml 8 | - '**/Cargo.toml' 9 | push: 10 | branches: [master] 11 | workflow_dispatch: 12 | 13 | jobs: 14 | test: 15 | name: Coverage 16 | runs-on: ubuntu-latest 17 | container: 18 | image: xd009642/tarpaulin:develop-nightly 19 | options: --security-opt seccomp=unconfined 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v4 23 | 24 | - name: Install rustfmt 25 | run: | 26 | rustup component add rustfmt 27 | 28 | - name: Generate code coverage 29 | run: | 30 | cargo +nightly tarpaulin --verbose --features debug --workspace --timeout 120 --out Xml 31 | 32 | - name: Upload to codecov.io 33 | uses: codecov/codecov-action@v5 34 | with: 35 | token: ${{ secrets.CODECOV_TOKEN }} 36 | fail_ci_if_error: true 37 | -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | # Workflow for building and deploying a mdBook site to GitHub Pages 2 | name: Book 3 | 4 | on: 5 | # Runs on pushes targeting the default branch 6 | push: 7 | branches: [book, master] 8 | 9 | # Also runs on PR 10 | pull_request: 11 | 12 | # Allows you to run this workflow manually from the Actions tab 13 | workflow_dispatch: 14 | 15 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 16 | permissions: 17 | contents: read 18 | pages: write 19 | id-token: write 20 | 21 | # Allow one concurrent deployment 22 | concurrency: 23 | group: pages 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | # Build job 28 | build-book: 29 | runs-on: ubuntu-latest 30 | steps: 
31 | - name: Checkout 32 | uses: actions/checkout@v4 33 | 34 | - name: Setup mdBook 35 | uses: peaceiris/actions-mdbook@v2 36 | with: 37 | mdbook-version: 0.4.28 38 | 39 | - name: Install mdbook-admonish 40 | uses: taiki-e/install-action@v2 41 | with: 42 | tool: mdbook-admonish 43 | 44 | - name: Download assets 45 | run: | 46 | cd book/ 47 | mdbook-admonish install 48 | 49 | - name: Build book 50 | run: mdbook build book 51 | 52 | - name: Upload artifact 53 | uses: actions/upload-pages-artifact@v3 54 | with: 55 | path: ./book/book 56 | 57 | # Deployment job 58 | deploy: 59 | if: github.event_name != 'pull_request' 60 | environment: 61 | name: github-pages 62 | url: ${{ steps.deployment.outputs.page_url }} 63 | runs-on: ubuntu-latest 64 | needs: [build-book] 65 | steps: 66 | - name: Deploy to GitHub Pages 67 | id: deployment 68 | uses: actions/deploy-pages@v4 69 | -------------------------------------------------------------------------------- /.github/workflows/rustbench.yml: -------------------------------------------------------------------------------- 1 | name: Benchmark 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - '**.rs' 7 | - .github/workflows/rustbench.yml 8 | - '**/Cargo.toml' 9 | push: 10 | branches: [master] 11 | workflow_dispatch: 12 | 13 | permissions: 14 | pull-requests: write 15 | 16 | jobs: 17 | benchmark: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - name: Setup rust toolchain, cache and cargo-codspeed binary 23 | uses: moonrepo/setup-rust@v1 24 | with: 25 | channel: stable 26 | cache-target: release 27 | bins: cargo-codspeed 28 | 29 | - name: Build the benchmark target(s) 30 | run: cargo codspeed build --workspace 31 | 32 | - name: Run the benchmarks 33 | uses: CodSpeedHQ/action@v3 34 | with: 35 | run: cargo codspeed run --workspace -------------------------------------------------------------------------------- /.github/workflows/rustcheck.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | paths: 4 | - '**.rs' 5 | - '**/Cargo.toml' 6 | workflow_dispatch: 7 | 8 | name: Cargo check 9 | 10 | jobs: 11 | cargo_check: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | - uses: taiki-e/install-action@cargo-hack 16 | - run: cargo hack check --feature-powerset --no-dev-deps 17 | -------------------------------------------------------------------------------- /.github/workflows/rustdoc.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | paths: 4 | - '**.rs' 5 | - '**/Cargo.toml' 6 | workflow_dispatch: 7 | 8 | name: Rustdoc 9 | 10 | jobs: 11 | publish: 12 | name: Build docs 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout sources 16 | uses: actions/checkout@v4 17 | 18 | - name: Install nightly toolchain 19 | uses: dtolnay/rust-toolchain@nightly 20 | 21 | - name: Cache dependencies 22 | uses: Swatinem/rust-cache@v2 23 | 24 | - name: Check rustdoc build 25 | run: RUSTDOCFLAGS='--cfg docsrs' cargo +nightly doc --features debug -Zunstable-options -Zrustdoc-scrape-examples 26 | -------------------------------------------------------------------------------- /.github/workflows/rustlib.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | paths: 4 | - '**.rs' 5 | - '**/Cargo.toml' 6 | workflow_dispatch: 7 | 8 | name: Library testing 9 | 10 | jobs: 11 | rustdoc: 12 | name: Rustdoc 13 | runs-on: ubuntu-latest 14 | steps: 15
| - name: Checkout sources 16 | uses: actions/checkout@v4 17 | 18 | - name: Install nightly toolchain 19 | uses: dtolnay/rust-toolchain@nightly 20 | 21 | - name: Cache dependencies 22 | uses: Swatinem/rust-cache@v2 23 | 24 | - name: Check rustdoc build 25 | run: RUSTDOCFLAGS='--cfg docsrs' cargo +nightly doc --features debug -Zunstable-options -Zrustdoc-scrape-examples 26 | 27 | tests: 28 | name: Tests 29 | strategy: 30 | matrix: 31 | rust: 32 | - 1.74.0 # current MSRV 33 | - 1.82.0 # precise capturing 34 | - stable 35 | - beta 36 | - nightly 37 | os: 38 | - macos-latest 39 | - ubuntu-latest 40 | - windows-latest 41 | features: 42 | - "" # default features 43 | - "--features forbid_unsafe" 44 | 45 | runs-on: ${{ matrix.os }} 46 | steps: 47 | - name: Checkout sources 48 | uses: actions/checkout@v4 49 | 50 | - name: Install Rust toolchain 51 | uses: dtolnay/rust-toolchain@stable 52 | with: 53 | toolchain: ${{ matrix.rust }} 54 | components: rustfmt 55 | 56 | - name: Cache dependencies 57 | uses: Swatinem/rust-cache@v2 58 | 59 | - name: Check that tests run 60 | run: cargo test --workspace --verbose ${{ matrix.features }} 61 | -------------------------------------------------------------------------------- /.github/workflows/rustlints.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | paths: 4 | - '**.rs' 5 | - '**/Cargo.toml' 6 | workflow_dispatch: 7 | 8 | name: Rust lints 9 | 10 | jobs: 11 | clippy: 12 | name: Clippy 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout sources 16 | uses: actions/checkout@v4 17 | 18 | - name: Install stable toolchain 19 | uses: dtolnay/rust-toolchain@stable 20 | with: 21 | components: clippy 22 | 23 | - name: Check clippy 24 | run: cargo clippy --features debug -- -D warnings 25 | 26 | rustfmt: 27 | name: Rustfmt 28 | runs-on: ubuntu-latest 29 | steps: 30 | - name: Checkout sources 31 | uses: actions/checkout@v4 32 | 33 | - name: Install stable toolchain 34 | uses: dtolnay/rust-toolchain@stable 35 | with: 36 | components: rustfmt 37 | 38 | - name: Check format 39 | run: cargo fmt --check 40 | -------------------------------------------------------------------------------- /.github/workflows/rustmsrv.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | paths: 4 | - '**.rs' 5 | - '**/Cargo.toml' 6 | workflow_dispatch: 7 | 8 | name: MSRV check 9 | 10 | jobs: 11 | msrv_check: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: Install stable toolchain 17 | uses: dtolnay/rust-toolchain@stable 18 | 19 | - name: Install Cargo MSRV 20 | uses: baptiste0928/cargo-install@v3 21 | with: 22 | crate: cargo-msrv 23 | args: --no-default-features 24 | version: ^0.18.1 25 | 26 | - name: Check MSRV 27 | run: cargo msrv verify -- cargo check --workspace --features debug 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | /book/book 3 | **/*.rs.bk 4 | 5 | # ignore fuzzing output 6 | out 7 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: check-yaml 6 | - id: check-toml 7 | - id: end-of-file-fixer 8 | - id:
trailing-whitespace 9 | - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks 10 | rev: v2.14.0 11 | hooks: 12 | - id: pretty-format-yaml 13 | args: [--autofix] 14 | - id: pretty-format-toml 15 | exclude: Cargo.lock 16 | args: [--autofix, --trailing-commas] 17 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["logos-cli", "logos-codegen", "logos-derive", "tests"] 3 | exclude = ["fuzz"] 4 | resolver = "2" 5 | 6 | [workspace.package] 7 | authors = [ 8 | "Maciej Hirsz ", 9 | "Jérome Eertmans (maintainer) ", 10 | ] 11 | categories = ["parsing", "text-processing"] 12 | description = "Create ridiculously fast Lexers" 13 | edition = "2021" 14 | homepage = "https://logos.maciej.codes/" 15 | keywords = ["lexer", "lexical", "tokenizer", "parser", "no_std"] 16 | license = "MIT OR Apache-2.0" 17 | readme = "README.md" 18 | repository = "https://github.com/maciejhirsz/logos" 19 | rust-version = "1.74.0" 20 | version = "0.15.0" 21 | 22 | [package] 23 | name = "logos" 24 | authors.workspace = true 25 | categories.workspace = true 26 | description.workspace = true 27 | edition.workspace = true 28 | homepage.workspace = true 29 | keywords.workspace = true 30 | license.workspace = true 31 | readme.workspace = true 32 | repository.workspace = true 33 | rust-version.workspace = true 34 | version.workspace = true 35 | 36 | [package.metadata.release] 37 | pre-release-replacements = [ 38 | {file="book/src/getting-started.md", search="logos = \"[0-9]+\\.[0-9]+\\.[0-9]+\"", replace="logos = \"{{version}}\"", exactly=1}, 39 | ] 40 | shared-version = true 41 | 42 | [lib] 43 | bench = false 44 | 45 | [features] 46 | # Enables debug messages 47 | debug = ["logos-derive?/debug"] 48 | default = ["export_derive", "std"] 49 | # Re-exports the `Logos` derive macro, so that end user only needs to 50 | # import this crate and `use logos::Logos` to get both the trait and 51 | # derive proc macro. 52 | export_derive = ["logos-derive"] 53 | # Should the crate use the standard library? 54 | std = [] 55 | # Use safe alternatives for unsafe code (may impact performance)? 
56 | forbid_unsafe = ["logos-derive?/forbid_unsafe"] 57 | 58 | [package.metadata.docs.rs] 59 | features = ["debug"] 60 | cargo-args = ["-Zunstable-options", "-Zrustdoc-scrape-examples"] 61 | rustdoc-args = ["--cfg", "docsrs"] 62 | 63 | [profile] 64 | bench = {lto = true} 65 | release = {lto = true} 66 | 67 | [dependencies] 68 | logos-derive = {version = "0.15.0", path = "./logos-derive", optional = true} 69 | 70 | [dev-dependencies] 71 | ariadne = {version = "0.4", features = ["auto-color"]} 72 | chumsky = {version = "0.9.3" } 73 | 74 | [[example]] 75 | doc-scrape-examples = true # Only needed once, because requires dev-dependencies 76 | name = "brainfuck" 77 | path = "examples/brainfuck.rs" 78 | 79 | [[example]] 80 | name = "calculator" 81 | path = "examples/calculator.rs" 82 | 83 | [[example]] 84 | name = "string-interpolation" 85 | path = "examples/string-interpolation.rs" 86 | 87 | [[example]] 88 | name = "custom_error" 89 | path = "examples/custom_error.rs" 90 | 91 | [[example]] 92 | name = "extras" 93 | path = "examples/extras.rs" 94 | 95 | [[example]] 96 | name = "json" 97 | path = "examples/json.rs" 98 | 99 | [[example]] 100 | name = "json-borrowed" 101 | path = "examples/json_borrowed.rs" 102 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 Maciej Hirsz 2 | 3 | The MIT License (MIT) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Logos logo 2 | 3 | # Logos 4 | 5 | [![Book](https://github.com/maciejhirsz/logos/actions/workflows/pages.yml/badge.svg?branch=master)](https://logos.maciej.codes/) 6 | [![Crates.io version shield](https://img.shields.io/crates/v/logos.svg)](https://crates.io/crates/logos) 7 | [![Docs](https://docs.rs/logos/badge.svg)](https://docs.rs/logos) 8 | [![Crates.io license shield](https://img.shields.io/crates/l/logos.svg)](https://crates.io/crates/logos) 9 | [![Code coverage](https://codecov.io/gh/maciejhirsz/logos/branch/master/graph/badge.svg)](https://codecov.io/gh/maciejhirsz/logos) 10 | 11 | _Create ridiculously fast Lexers._ 12 | 13 | **Logos** has two goals: 14 | 15 | + To make it easy to create a Lexer, so you can focus on more complex problems. 
16 | + To make the generated Lexer faster than anything you'd write by hand. 17 | 18 | To achieve those, **Logos**: 19 | 20 | + Combines all token definitions into a single [deterministic state machine](https://en.wikipedia.org/wiki/Deterministic_finite_automaton). 21 | + Optimizes branches into [lookup tables](https://en.wikipedia.org/wiki/Lookup_table) or [jump tables](https://en.wikipedia.org/wiki/Branch_table). 22 | + Prevents [backtracking](https://en.wikipedia.org/wiki/ReDoS) inside token definitions. 23 | + [Unwinds loops](https://en.wikipedia.org/wiki/Loop_unrolling), and batches reads to minimize bounds checking. 24 | + Does all of that heavy lifting at compile time. 25 | 26 | ## Example 27 | 28 | ```rust 29 | use logos::Logos; 30 | 31 | #[derive(Logos, Debug, PartialEq)] 32 | #[logos(skip r"[ \t\n\f]+")] // Ignore this regex pattern between tokens 33 | enum Token { 34 | // Tokens can be literal strings, of any length. 35 | #[token("fast")] 36 | Fast, 37 | 38 | #[token(".")] 39 | Period, 40 | 41 | // Or regular expressions. 42 | #[regex("[a-zA-Z]+")] 43 | Text, 44 | } 45 | 46 | fn main() { 47 | let mut lex = Token::lexer("Create ridiculously fast Lexers."); 48 | 49 | assert_eq!(lex.next(), Some(Ok(Token::Text))); 50 | assert_eq!(lex.span(), 0..6); 51 | assert_eq!(lex.slice(), "Create"); 52 | 53 | assert_eq!(lex.next(), Some(Ok(Token::Text))); 54 | assert_eq!(lex.span(), 7..19); 55 | assert_eq!(lex.slice(), "ridiculously"); 56 | 57 | assert_eq!(lex.next(), Some(Ok(Token::Fast))); 58 | assert_eq!(lex.span(), 20..24); 59 | assert_eq!(lex.slice(), "fast"); 60 | 61 | assert_eq!(lex.next(), Some(Ok(Token::Text))); 62 | assert_eq!(lex.slice(), "Lexers"); 63 | assert_eq!(lex.span(), 25..31); 64 | 65 | assert_eq!(lex.next(), Some(Ok(Token::Period))); 66 | assert_eq!(lex.span(), 31..32); 67 | assert_eq!(lex.slice(), "."); 68 | 69 | assert_eq!(lex.next(), None); 70 | } 71 | ``` 72 | 73 | For more examples and documentation, please refer to the 74 | [Logos handbook](https://maciejhirsz.github.io/logos/) or the 75 | [crate documentation](https://docs.rs/logos/latest/logos/). 76 | 77 | ## How fast? 78 | 79 | Ridiculously fast! 80 | 81 | ```norust 82 | test identifiers ... bench: 647 ns/iter (+/- 27) = 1204 MB/s 83 | test keywords_operators_and_punctators ... bench: 2,054 ns/iter (+/- 78) = 1037 MB/s 84 | test strings ... bench: 553 ns/iter (+/- 34) = 1575 MB/s 85 | ``` 86 | 87 | ## Acknowledgements 88 | 89 | + [Pedrors](https://pedrors.pt/) for the **Logos** logo. 90 | 91 | ## Thank you 92 | 93 | **Logos** is very much a labor of love. If you find it useful, consider 94 | [getting me some coffee](https://github.com/sponsors/maciejhirsz). ☕ 95 | 96 | If you'd like to contribute to Logos, then consider reading the 97 | [Contributing guide](https://maciejhirsz.github.io/logos/contributing). 98 | 99 | ## Contributing 100 | 101 | **Logos** welcomes any kind of contribution: bug reports, suggestions, 102 | or new features! 103 | 104 | Please use the 105 | [issues](https://github.com/maciejhirsz/logos/issues) or 106 | [pull requests](https://github.com/maciejhirsz/logos/pulls) tabs 107 | when appropriate. 108 | 109 | To release a new version, follow the [RELEASE-PROCESS](RELEASE-PROCESS.md) guide. 110 | 111 | ## License 112 | 113 | This code is distributed under the terms of both the MIT license 114 | and the Apache License (Version 2.0); choose whichever works for you. 115 | 116 | See [LICENSE-APACHE](LICENSE-APACHE) and [LICENSE-MIT](LICENSE-MIT) for details.
117 | -------------------------------------------------------------------------------- /RELEASE-PROCESS.md: -------------------------------------------------------------------------------- 1 | # Release process 2 | 3 | First, make sure you are logged in to https://crates.io with `cargo login`. 4 | If you don't have write access to **Logos**' crates, you can still 5 | perform steps 1-4, and ask a maintainer with access to perform step 5. 6 | 7 | This project uses `cargo-release` to publish all packages more easily. 8 | Note that, by default, every command runs in *dry mode*, and you need to append `--execute` 9 | to actually perform the action. 10 | 11 | Here are the steps to release a new version: 12 | 13 | 1. create a branch `release-x.y.z` from the `master` branch; 14 | 2. run and commit `cargo release version --workspace <version>`; 15 | 3. run and commit `cargo release replace --workspace`; 16 | 4. push your branch and create a pull request; 17 | 5. and, once your branch has been merged to `master`, run the following: 18 | ```bash 19 | cargo release publish --package logos-codegen 20 | cargo release publish --package logos-derive 21 | cargo release publish --package logos-cli 22 | cargo release publish --package logos 23 | ``` 24 | 25 | And voilà! 26 | -------------------------------------------------------------------------------- /book/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Maciej Hirsz ", "Jérome Eertmans (maintainer) "] 3 | language = "en" 4 | multilingual = false 5 | src = "src" 6 | title = "Logos Handbook" 7 | 8 | [preprocessor.admonish] 9 | command = "mdbook-admonish" 10 | assets_version = "3.0.2" # do not edit: managed by `mdbook-admonish install` 11 | 12 | [output] 13 | 14 | [output.html] 15 | additional-css = ["./mdbook-admonish.css"] 16 | -------------------------------------------------------------------------------- /book/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | + [Intro](./intro.md) 4 | + [Getting Started](./getting-started.md) 5 | + [Help](./getting-help.md) 6 | + [Attributes](./attributes.md) 7 | + [`#[logos]`](./attributes/logos.md) 8 | + [`#[token]` and `#[regex]`](./attributes/token_and_regex.md) 9 | + [Token disambiguation](./token-disambiguation.md) 10 | + [Using `Extras`](./extras.md) 11 | + [Using callbacks](./callbacks.md) 12 | + [Context-dependent lexing](./context-dependent-lexing.md) 13 | + [Common regular expressions](./common-regex.md) 14 | + [Debugging](./debugging.md) 15 | + [Unsafe Code](./unsafe.md) 16 | + [Examples](./examples.md) 17 | + [Brainfuck interpreter](./examples/brainfuck.md) 18 | + [Simple calculator](./examples/calculator.md) 19 | + [JSON parser](./examples/json.md) 20 | + [JSON-borrowed parser](./examples/json_borrowed.md) 21 | + [String interpolation](./examples/string-interpolation.md) 22 | + [Contributing](./contributing.md) 23 | + [Setup](./contributing/setup.md) 24 | + [Internals](./contributing/internals.md) 25 | + [Fuzzing](./contributing/fuzzing.md) 26 | -------------------------------------------------------------------------------- /book/src/assets/calculator_example_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maciejhirsz/logos/96765c0be188f3e8005b48db82bf1e904d2e6650/book/src/assets/calculator_example_flow.png --------------------------------------------------------------------------------
/book/src/assets/calculator_example_how_evaluator_works.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maciejhirsz/logos/96765c0be188f3e8005b48db82bf1e904d2e6650/book/src/assets/calculator_example_how_evaluator_works.png -------------------------------------------------------------------------------- /book/src/attributes.md: -------------------------------------------------------------------------------- 1 | # Attributes 2 | 3 | The `#[derive(Logos)]` procedural macro recognizes three different attribute 4 | names. 5 | 6 | + [`#[logos]`](./attributes/logos.md) is the main attribute which can be 7 | attached to the `enum` of your token definition. It allows you to define the 8 | `Extras` associated type in order to put custom state into the `Lexer`, or 9 | declare concrete types for generic type parameters, if your `enum` uses any. 10 | It is strictly optional. It also allows you to define parts that must be skipped 11 | by the lexer, the error type, or regex subpatterns. 12 | + And most importantly, the 13 | [`#[token]` and `#[regex]`](./attributes/token_and_regex.md) 14 | attributes. Those allow you to define patterns to match against the input, 15 | either plain text strings with `#[token]`, or using regular expression 16 | syntax with `#[regex]`. Aside from that difference, they are equivalent, 17 | and any extra arguments you can pass to one, you can pass to the other. 18 | -------------------------------------------------------------------------------- /book/src/attributes/logos.md: -------------------------------------------------------------------------------- 1 | # `#[logos]` 2 | 3 | As previously mentioned, the `#[logos]` attribute can be attached to the `enum` 4 | of your token definition to customize your lexer. Note that they are all 5 | **optional**. 6 | 7 | The syntax is as follows: 8 | 9 | ```rust,no_run,no_playground 10 | #[derive(Logos)] 11 | #[logos(skip "regex literal")] 12 | #[logos(extras = ExtrasType)] 13 | #[logos(error = ErrorType)] 14 | #[logos(crate = path::to::logos)] 15 | #[logos(source = SourceType)] 16 | #[logos(subpattern subpattern_name = "regex literal")] 17 | enum Token { 18 | /* ... */ 19 | } 20 | ``` 21 | 22 | where `"regex literal"` can be any regex supported by 23 | [`#[regex]`](../common-regex.md), and `ExtrasType` can be of any type! 24 | 25 | An example usage of `skip` is provided in the [JSON parser example](../examples/json.md). 26 | 27 | For more details about extras, read the [eponymous section](../extras.md). 28 | 29 | ## Custom error type 30 | 31 | By default, **Logos** uses `()` as the error type, which means that it 32 | doesn't store any information about the error. 33 | This can be changed by using the `#[logos(error = ErrorType)]` attribute on the enum. 34 | The type `ErrorType` can be any type that implements `Clone`, `PartialEq`, 35 | `Default`, and `From<E>` for each callback's error type `E`. 36 | 37 | `ErrorType` must implement the `Default` trait because invalid tokens, i.e., 38 | literals that do not match any variant, will produce `Err(ErrorType::default())`. 39 | 40 | Here is an example using a custom error type: 41 | 42 | ```rust,no_run,noplayground 43 | {{#include ../../../examples/custom_error.rs:all}} 44 | ``` 45 | 46 | You can add error variants to `LexingError`, 47 | and implement `From<E>` for each error type `E` that could 48 | be returned by a callback. See [callbacks](../callbacks.md).
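For instance, a minimal sketch of such an error type might look like this (the variant names and the `Number` token below are illustrative, not taken from the bundled example):

```rust,no_run,noplayground
use logos::Logos;
use std::num::ParseIntError;

#[derive(Default, Debug, Clone, PartialEq)]
enum LexingError {
    // Hypothetical variant: produced when a number fails to parse.
    InvalidInteger(ParseIntError),
    // Invalid tokens produce `Err(LexingError::default())`, i.e. this variant.
    #[default]
    Other,
}

// Lets a callback returning `Result<u64, ParseIntError>` surface its error.
impl From<ParseIntError> for LexingError {
    fn from(err: ParseIntError) -> Self {
        LexingError::InvalidInteger(err)
    }
}

#[derive(Logos, Debug, PartialEq)]
#[logos(error = LexingError)]
enum Token {
    #[regex(r"[0-9]+", |lex| lex.slice().parse())]
    Number(u64),
}
```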
49 | 50 | ## Specifying path to logos 51 | 52 | You can force the derive macro to use a different path to `Logos`'s crate 53 | with `#[logos(crate = path::to::logos)]`. 54 | 55 | ## Custom source type 56 | 57 | By default, **Logos**'s lexer will accept `&str` as input, unless any of the 58 | pattern literals match a non-UTF-8 byte sequence. In that case, it will fall 59 | back to `&[u8]`. You can override this behavior by forcing one of the two 60 | source types. You can also specify any custom type that implements 61 | [`Source`](https://docs.rs/logos/latest/logos/source/trait.Source.html). 62 | 63 | ## Subpatterns 64 | 65 | We can use subpatterns to reuse regular expressions in our tokens or other subpatterns. 66 | 67 | The syntax to use a previously defined subpattern, like `#[logos(subpattern subpattern_name = "regex literal")]`, 68 | in a new regular expression is `"(?&subpattern_name)"`. 69 | 70 | For example: 71 | 72 | ```rust,no_run,noplayground 73 | use logos::Logos; 74 | 75 | #[derive(Logos, Debug, PartialEq)] 76 | #[logos(skip r"\s+")] 77 | #[logos(subpattern alpha = r"[a-zA-Z]")] 78 | #[logos(subpattern digit = r"[0-9]")] 79 | #[logos(subpattern alphanum = r"(?&alpha)|(?&digit)")] 80 | enum Token { 81 | #[regex("(?&alpha)+")] 82 | Word, 83 | #[regex("(?&digit)+")] 84 | Number, 85 | #[regex("(?&alphanum){2}")] 86 | TwoAlphanum, 87 | #[regex("(?&alphanum){3}")] 88 | ThreeAlphanum, 89 | } 90 | 91 | fn main() { 92 | let mut lex = Token::lexer("Word 1234 ab3 12"); 93 | 94 | assert_eq!(lex.next(), Some(Ok(Token::Word))); 95 | assert_eq!(lex.slice(), "Word"); 96 | 97 | assert_eq!(lex.next(), Some(Ok(Token::Number))); 98 | assert_eq!(lex.slice(), "1234"); 99 | 100 | assert_eq!(lex.next(), Some(Ok(Token::ThreeAlphanum))); 101 | assert_eq!(lex.slice(), "ab3"); 102 | 103 | assert_eq!(lex.next(), Some(Ok(Token::TwoAlphanum))); 104 | assert_eq!(lex.slice(), "12"); 105 | 106 | assert_eq!(lex.next(), None); 107 | } 108 | ``` 109 | 110 | (Note that the above subpatterns are redundant, as the same can be achieved with [existing character classes](https://docs.rs/regex/latest/regex/#ascii-character-classes).) 111 | -------------------------------------------------------------------------------- /book/src/attributes/token_and_regex.md: -------------------------------------------------------------------------------- 1 | # `#[token]` and `#[regex]` 2 | 3 | For each variant you declare in your `enum` that uses the `Logos` derive macro, 4 | you can specify one or more string literals or regexes it can match. 5 | 6 | The usage syntax is as follows: 7 | 8 | ```rust,no_run,no_playground 9 | #[derive(Logos)] 10 | enum Token { 11 | #[token(literal[, callback][, priority = <integer>][, ignore(<flag>, ...)])] 12 | #[regex(literal[, callback][, priority = <integer>][, ignore(<flag>, ...)])] 13 | SomeVariant, 14 | } 15 | ``` 16 | 17 | where `literal` can be any `&str` or `&[u8]` string literal, 18 | `callback` can either be a closure, or a literal path to a function 19 | (see [Using callbacks section](../callbacks.md)), 20 | `priority` can be any positive integer 21 | (see [Token disambiguation section](../token-disambiguation.md)), 22 | and `flag` can be one of: `case`, `ascii_case`. Only `literal` is **required**; 23 | the others are optional. 24 | 25 | You can stack any number of `#[token]` and/or `#[regex]` attributes on top of 26 | the same variant. 27 | 28 | ```admonish info 29 | For a list of supported `regex` literals, read the 30 | [Common regular expressions section](../common-regex.md).
31 | ``` 32 | -------------------------------------------------------------------------------- /book/src/callbacks.md: -------------------------------------------------------------------------------- 1 | # Using callbacks 2 | 3 | **Logos** can also call arbitrary functions whenever a pattern is matched, 4 | which can be used to put data into a variant: 5 | 6 | ```rust,no_run,no_playground 7 | use logos::{Logos, Lexer}; 8 | 9 | // Note: callbacks can return `Option` or `Result` 10 | fn kilo(lex: &mut Lexer<Token>) -> Option<u64> { 11 | let slice = lex.slice(); 12 | let n: u64 = slice[..slice.len() - 1].parse().ok()?; // skip 'k' 13 | Some(n * 1_000) 14 | } 15 | 16 | fn mega(lex: &mut Lexer<Token>) -> Option<u64> { 17 | let slice = lex.slice(); 18 | let n: u64 = slice[..slice.len() - 1].parse().ok()?; // skip 'm' 19 | Some(n * 1_000_000) 20 | } 21 | 22 | #[derive(Logos, Debug, PartialEq)] 23 | #[logos(skip r"[ \t\n\f]+")] 24 | enum Token { 25 | // Callbacks can use closure syntax, or refer 26 | // to a function defined elsewhere. 27 | // 28 | // Each pattern can have its own callback. 29 | #[regex("[0-9]+", |lex| lex.slice().parse().ok())] 30 | #[regex("[0-9]+k", kilo)] 31 | #[regex("[0-9]+m", mega)] 32 | Number(u64), 33 | } 34 | 35 | fn main() { 36 | let mut lex = Token::lexer("5 42k 75m"); 37 | 38 | assert_eq!(lex.next(), Some(Ok(Token::Number(5)))); 39 | assert_eq!(lex.slice(), "5"); 40 | 41 | assert_eq!(lex.next(), Some(Ok(Token::Number(42_000)))); 42 | assert_eq!(lex.slice(), "42k"); 43 | 44 | assert_eq!(lex.next(), Some(Ok(Token::Number(75_000_000)))); 45 | assert_eq!(lex.slice(), "75m"); 46 | 47 | assert_eq!(lex.next(), None); 48 | } 49 | ``` 50 | 51 | Logos can handle callbacks with the following return types: 52 | 53 | | Return type | Produces | 54 | | --------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------- | 55 | | `()` | `Ok(Token::Unit)` | 56 | | `bool` | `Ok(Token::Unit)` **or** `Err(<Token as Logos>::Error::default())` | 57 | | `Result<(), E>` | `Ok(Token::Unit)` **or** `Err(<Token as Logos>::Error::from(err))` | 58 | | `T` | `Ok(Token::Value(T))` | 59 | | `Option<T>` | `Ok(Token::Value(T))` **or** `Err(<Token as Logos>::Error::default())` | 60 | | `Result<T, E>` | `Ok(Token::Value(T))` **or** `Err(<Token as Logos>::Error::from(err))` | 61 | | [`Skip`](https://docs.rs/logos/latest/logos/struct.Skip.html) | _skips matched input_ | 62 | | `Result<Skip, E>` | _skips matched input_ **or** `Err(<Token as Logos>::Error::from(err))` | 63 | | [`Filter<T>`](https://docs.rs/logos/latest/logos/enum.Filter.html) | `Ok(Token::Value(T))` **or** _skips matched input_ | 64 | | [`FilterResult<T, E>`](https://docs.rs/logos/latest/logos/enum.FilterResult.html) | `Ok(Token::Value(T))` **or** `Err(<Token as Logos>::Error::from(err))` **or** _skips matched input_ | 65 | 66 | Callbacks can also be used to perform more specialized lexing in places 67 | where regular expressions are too limiting. For specifics look at 68 | [`Lexer::remainder`](https://docs.rs/logos/latest/logos/struct.Lexer.html#method.remainder) and 69 | [`Lexer::bump`](https://docs.rs/logos/latest/logos/struct.Lexer.html#method.bump). 70 | -------------------------------------------------------------------------------- /book/src/common-regex.md: -------------------------------------------------------------------------------- 1 | # Common regular expressions 2 | 3 | Maybe the most important feature of **Logos** is its ability to accept 4 | regex patterns in your tokens' definition.
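For instance, here is a minimal sketch of a token defined with a regex (the `LicensePlate` token is purely illustrative; the pattern itself is explained just below):

```rust,no_run,noplayground
use logos::Logos;

#[derive(Logos, Debug, PartialEq)]
#[logos(skip r"\s+")]
enum Token {
    // Three ASCII uppercase letters followed by three digits, e.g. "ABC123".
    #[regex("[A-Z]{3}[0-9]{3}")]
    LicensePlate,
}
```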
5 | 6 | [Regular expressions](https://en.wikipedia.org/wiki/Regular_expression), 7 | or regexes for short, are sequences of characters (or bytes) that define a match 8 | pattern. When constructing lexers, this is especially useful to define tokens 9 | that should match a set of *similar* literals. E.g., a sequence of 10 | 3 ASCII uppercase letters and 3 digits could define a license plate, 11 | and could be matched with the following regex: `"[A-Z]{3}[0-9]{3}"`. 12 | 13 | For more details about regexes in Rust, refer to the 14 | [regex](https://crates.io/crates/regex) crate. 15 | 16 | ## Valid regexes that are not supported 17 | 18 | Because **Logos** aims at generating high-performance code, it never allows 19 | backtracking. This means that anytime a byte is read from the input source, 20 | it will never be read again. This implementation choice comes at a cost: not 21 | all valid regexes are supported by **Logos**[^1]. 22 | 23 | For reference, **Logos** parses regexes using `regex-syntax = 0.8.2`, and 24 | transforms its high-level intermediate representation (HIR) into its own 25 | medium-level intermediate representation (MIR). Compared to HIR, MIR does not support 26 | the following 27 | [`HirKind`](https://docs.rs/regex-syntax/0.8.2/regex_syntax/hir/enum.HirKind.html)s: 28 | 29 | + Non-greedy repetitions, i.e., matching as little of a given pattern as possible. 30 | + `".*"` and `".+"` repetition patterns, because they will potentially consume 31 | all the input source, breaking the non-backtracking rule. 32 | For solutions, see footnote[^1] or read the error message. 33 | + Word boundaries, i.e., `r"\b"`. 34 | + Anchors, because the input source does not treat lines separately. 35 | 36 | Additionally, note that capture groups will silently be *ungrouped*, 37 | because **Logos** does not support capturing groups, only the main slice 38 | (`lex.slice()`). 39 | 40 | [^1]: Most of the time, however, it is possible to circumvent this issue by 41 | rewriting your regex another way, or by using callbacks. 42 | E.g., see 43 | [#302](https://github.com/maciejhirsz/logos/issues/302#issuecomment-1521342541). 44 | 45 | ## Other issues 46 | 47 | **Logos**' support for regexes is not yet complete, and errors can still exist. 48 | Some are found at compile time, and others will create wrong matches or panic. 49 | 50 | If you ever feel like your patterns do not match the expected source slices, 51 | please check the 52 | [GitHub issues](https://github.com/maciejhirsz/logos/issues?q=is%3Aissue). 53 | If no issue covers your problem, we encourage 54 | you to create a 55 | [new issue](https://github.com/maciejhirsz/logos/issues/new), 56 | and document it as best as you can so that the issue 57 | can be reproduced locally. 58 | -------------------------------------------------------------------------------- /book/src/context-dependent-lexing.md: -------------------------------------------------------------------------------- 1 | # Context-dependent lexing 2 | 3 | Sometimes, a single lexer is insufficient to properly handle complex grammars. To address this, many lexer generators offer the ability to have separate lexers with their own set of patterns and tokens, allowing you to dynamically switch between them based on the context. 4 | 5 | In Logos, context switching is handled using the [`morph`](https://docs.rs/logos/0.11.0-rc2/logos/struct.Lexer.html#method.morph) method of the `logos::Lexer` struct. 6 | This method takes ownership of the current lexer and transforms it into a lexer for a new token type.
7 | 8 | It is important to note that: 9 | 10 | - Both the original lexer and the new lexer must share the same [`Source`](./attributes/logos.md#custom-source-type) type. 11 | - The [`Extras`](./extras.md) type from the original lexer must be convertible into the `Extras` type of the new lexer. 12 | 13 | ## Example 14 | 15 | The following example demonstrates how to use `morph` to handle a C-style language that also supports Python blocks: 16 | 17 | ```rust 18 | #[derive(Logos, Debug, PartialEq, Clone)] 19 | #[logos(skip r"\s+")] 20 | enum CToken { 21 | /* Tokens supporting C syntax */ 22 | // ... 23 | #[regex(r#"extern\s+"python"\s*\{"#, python_block_callback)] 24 | PythonBlock(Vec<PythonToken>), 25 | } 26 | 27 | #[derive(Logos, Debug, PartialEq, Clone)] 28 | #[logos(skip r"\s+")] 29 | enum PythonToken { 30 | #[token("}")] 31 | ExitPythonBlock, 32 | /* Tokens supporting Python syntax */ 33 | // ... 34 | } 35 | 36 | fn python_block_callback(lex: &mut Lexer<CToken>) -> Option<Vec<PythonToken>> { 37 | let mut python_lexer = lex.clone().morph::<PythonToken>(); 38 | let mut tokens = Vec::new(); 39 | while let Some(token) = python_lexer.next() { 40 | match token { 41 | Ok(PythonToken::ExitPythonBlock) => break, 42 | Err(_) => return None, 43 | Ok(tok) => tokens.push(tok), 44 | } 45 | } 46 | *lex = python_lexer.morph(); 47 | Some(tokens) 48 | } 49 | ``` 50 | 51 | Note that if we want to use `morph` inside a callback, we need to be able to clone the original lexer, as `morph` needs to take ownership while the callback receives only a reference to the lexer. 52 | 53 | For a more in-depth example, check out [String interpolation](./examples/string-interpolation.md). 54 | -------------------------------------------------------------------------------- /book/src/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | If you are considering contributing to Logos, then this place is for you! 4 | 5 | First, we really appreciate people that can help this project grow, and we 6 | would like to guide you through the standard contribution process. 7 | 8 | There are many ways to help us, and here is a short list of some of them: 9 | 10 | + fixing a [BUG](https://github.com/maciejhirsz/logos/labels/bug), by providing 11 | a patch (or suggesting in the comments how one could fix it); 12 | + correcting some typos in the documentation, the book, or anywhere else; 13 | + raising an issue about a problem (i.e., 14 | [opening an issue](https://github.com/maciejhirsz/logos/issues/new) on GitHub); 15 | + proposing new features (either with 16 | [an issue](https://github.com/maciejhirsz/logos/issues/new) or 17 | [a pull request](https://github.com/maciejhirsz/logos/pulls) on GitHub); 18 | + or improving the documentation (either in the crate or in the book). 19 | 20 | In any case, GitHub is the go-to place for anything related to contributing. 21 | 22 | Below, we provide a few help pages (or links) to content that can help you 23 | understand Logos' internals and how you can create and submit a contribution. 24 | 25 | + If you are new to GitHub or git, please consider reading those two guides: 26 | + [GitHub’s Hello World](https://docs.github.com/en/get-started/quickstart/hello-world); 27 | + and [GitHub Pull Request in 100 Seconds](https://www.youtube.com/watch?v=8lGpZkjnkt4&ab_channel=Fireship) 28 | (video). 29 | + To set up and test your code locally, see the [Setup](./contributing/setup.md) 30 | page. 31 | + To learn a bit more about how Logos works, check the 32 | [Internals](./contributing/internals.md).
33 | -------------------------------------------------------------------------------- /book/src/contributing/fuzzing.md: -------------------------------------------------------------------------------- 1 | # Fuzzing 2 | 3 | 4 | Fuzzing is a technique to test a piece of software by injecting randomly generated inputs. This can be pretty useful to discover bugs, as pointed out in [#407](https://github.com/maciejhirsz/logos/pull/407). 5 | 6 | **Logos**' fuzzing crate is powered by [afl.rs](https://github.com/rust-fuzz/afl.rs), which 7 | finds panics in **Logos**' methods. 8 | 9 | ## Usage 10 | 11 | First, make sure you have `cargo-afl` installed; 12 | [see the rust-fuzz afl setup guide for installation information](https://rust-fuzz.github.io/book/afl/setup.html). 13 | 14 | Next, change your current working directory to be the `fuzz` folder. 15 | 16 | ### Building 17 | 18 | Before fuzzing, you need to build the target with: 19 | 20 | ```bash 21 | cargo afl build 22 | ``` 23 | 24 | ### Fuzz testing 25 | 26 | The recommended way to run tests is with: 27 | 28 | ```bash 29 | cargo afl fuzz -i in -o out ../target/debug/logos-fuzz 30 | ``` 31 | 32 | Note that it may run for a (very) long time before 33 | it encounters any bug. 34 | 35 | ## Replaying a Crash 36 | 37 | If you happen to find a bug that crashes the program, 38 | you can replay it with: 39 | 40 | ```bash 41 | cargo afl run logos-fuzz < out/default/crashes/crash_file 42 | ``` 43 | 44 | ### Reporting a Bug 45 | 46 | If you encounter a crash and you feel the error message 47 | is not appropriate, 48 | please report it by opening 49 | [an issue](https://github.com/maciejhirsz/logos/issues/new). 50 | Don't forget to include your crash file so we can later 51 | reproduce it. 52 | -------------------------------------------------------------------------------- /book/src/contributing/internals.md: -------------------------------------------------------------------------------- 1 | # Internals 2 | 3 | **Logos**' core functionalities are split across five crates: 4 | 5 | - `logos` is the main crate, that you add to your project (in `Cargo.toml`) 6 | to obtain the `Logos` derive macro. The public API is limited to this crate, 7 | and most users should only use this crate, not the others. 8 | - `logos-derive` is a very simple but necessary crate to expose `logos-codegen`'s code as a derive macro. 9 | - `logos-codegen` contains the most technical parts of **Logos**: the code 10 | that **reads** your token definitions, and **generates** optimized code 11 | to create blazingly fast lexers. 12 | You can [read a blog post](https://maciej.codes/2020-04-19-stacking-luts-in-logos.html) 13 | from the author of **Logos** to get a small insight into what the 14 | `logos-codegen` crate does. In the future, we hope to provide more documents 15 | about how this crate works, so people are more likely to understand it and 16 | improve it with pull requests (see the 17 | [Contributing section](../contributing.md)). 18 | - `logos-cli` is a separate crate that installs a binary of the same name, 19 | and allows you to expand the `Logos` derive macro into code. 20 | It can be installed with `cargo install logos-cli`, 21 | and usage help can be obtained through the `logos-cli --help` command. 22 | This tool can be useful if your token definitions stay constant, and 23 | you want to reduce compilation time overhead caused by derive macros.
24 | - `logos-fuzz` is an internal crate (i.e., unpublished) that uses [afl.rs](https://github.com/rust-fuzz/afl.rs) 25 | to find confusing panics before they reach the developer. 26 | To use this tool, see the [Fuzzing guide](./fuzzing.md). 27 | -------------------------------------------------------------------------------- /book/src/contributing/setup.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | 3 | On this page, you will find all the information needed to run and test your 4 | own version of the Logos crate, locally. 5 | 6 | We assume you have basic knowledge of git and GitHub. If that is not the 7 | case, please refer to the links mentioned in [Contributing](./contributing.md). 8 | 9 | ## Prerequisites 10 | 11 | You need to have both git and Rust installed on your computer; 12 | see the installation procedures: 13 | 14 | + for [git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git); 15 | + and [Rust](https://www.rust-lang.org/tools/install). 16 | 17 | Once that's done, clone the Logos repository on your computer: 18 | 19 | ```bash 20 | git clone https://github.com/maciejhirsz/logos.git 21 | ``` 22 | 23 | If you have a fork of this repository, make sure to clone it instead. 24 | 25 | Finally, launch a terminal (i.e., command-line) session and go to the 26 | `logos` directory. 27 | 28 | ## Checking the code compiles 29 | 30 | A good way to see if your code compiles is to use the eponymous command: 31 | 32 | ```bash 33 | cargo check --workspace 34 | ``` 35 | 36 | ## Formatting and linting your code 37 | 38 | Prior to suggesting changes in a pull request, it is important to both 39 | format your code: 40 | 41 | ```bash 42 | cargo fmt 43 | ``` 44 | 45 | and check against Rust's linter: 46 | 47 | ```bash 48 | cargo clippy 49 | ``` 50 | 51 | Make sure to run those frequently, otherwise your pull request will probably 52 | fail to pass the automated tests. 53 | 54 | ## Testing your code 55 | 56 | Code that compiles isn't necessarily correct, and testing it against known 57 | cases is good practice: 58 | 59 | ```bash 60 | cargo test --workspace 61 | ``` 62 | 63 | You can also run benchmarks: 64 | 65 | ```bash 66 | cargo bench --workspace --benches 67 | ``` 68 | 69 | ## Building the documentation 70 | 71 | Logos' documentation needs to be built with Rust's nightly toolchain. 72 | 73 | You can install the latest nightly channel with: 74 | 75 | ```bash 76 | rustup install nightly 77 | ``` 78 | 79 | Then, use the following command to build the documentation with a similar 80 | configuration to the one used by [docs.rs](https://docs.rs/logos/latest/logos/): 81 | 82 | ```bash 83 | RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc \ 84 | --features debug \ 85 | -Zunstable-options \ 86 | -Zrustdoc-scrape-examples \ 87 | --no-deps \ 88 | --open 89 | ``` 90 | 91 | 92 | ## Building the book 93 | 94 | Logos' book can be built with mdBook.
95 | 96 | This tool can be installed with `cargo`: 97 | 98 | ```bash 99 | cargo install mdbook 100 | ``` 101 | 102 | You also need to install `mdbook-admonish` and its assets: 103 | 104 | ```bash 105 | cargo install mdbook-admonish 106 | cd book/ # You must run the next command from the book/ directory 107 | mdbook-admonish install 108 | cd ../ # Back to project root 109 | ``` 110 | 111 | Then, you can build the book with: 112 | 113 | ```bash 114 | mdbook serve book --open 115 | ``` 116 | 117 | Any change in the `./book` folder will automatically trigger a new build, 118 | and the pages will be live-reloaded. 119 | -------------------------------------------------------------------------------- /book/src/debugging.md: -------------------------------------------------------------------------------- 1 | # Debugging 2 | 3 | Instructions on how to debug your Logos lexer. 4 | 5 | ## Visualizing Logos Graph 6 | 7 | Logos works by creating a graph that is derived from 8 | the tokens that you define. 9 | This graph describes how the lexer moves through different 10 | states when processing input. 11 | 12 | Hence, it may be beneficial during debugging to be able to 13 | visualize this graph, to understand how Logos will match the various tokens. 14 | 15 | If we take this example: 16 | 17 | ```rust,no_run,noplayground 18 | use logos::Logos; 19 | 20 | #[derive(Debug, Logos, PartialEq)] 21 | enum Token { 22 | // Tokens can be literal strings, of any length. 23 | #[token("fast")] 24 | Fast, 25 | 26 | #[token(".")] 27 | Period, 28 | 29 | // Or regular expressions. 30 | #[regex("[a-zA-Z]+")] 31 | Text, 32 | } 33 | fn main() { 34 | let input = "Create ridiculously fast Lexers."; 35 | 36 | let mut lexer = Token::lexer(input); 37 | while let Some(token) = lexer.next() { 38 | println!("{:?}", token); 39 | } 40 | } 41 | ``` 42 | 43 | Logos actually constructs a graph that contains the logic for matching tokens: 44 | 45 | ``` 46 | graph = { 47 | 1: ::Fast, 48 | 2: ::Period, 49 | 3: ::Text, 50 | 4: { 51 | [A-Z] ⇒ 4, 52 | [a-z] ⇒ 4, 53 | _ ⇒ 3, 54 | }, 55 | 7: [ 56 | ast ⇒ 8, 57 | _ ⇒ 4*, 58 | ], 59 | 8: { 60 | [A-Z] ⇒ 4, 61 | [a-z] ⇒ 4, 62 | _ ⇒ 1, 63 | }, 64 | 9: { 65 | . ⇒ 2, 66 | [A-Z] ⇒ 4, 67 | [a-e] ⇒ 4, 68 | f ⇒ 7, 69 | [g-z] ⇒ 4, 70 | }, 71 | } 72 | ``` 73 | This graph can help us understand how our patterns are matched, 74 | and maybe understand why we have a bug at some point. 75 | 76 | Let's get started by trying to understand how Logos is matching the 77 | `.` character, which we've tokenized as `Token::Period`. 78 | 79 | We can begin our search by looking at number `9` for the character `.`. 80 | We can see that if Logos matches a `.`, it will jump `=>` to number `2`. 81 | We can then follow that by looking at `2`, which resolves to our `::Period` token. 82 | 83 | Logos will then continue to look for any matches past our `.` character. 84 | This is required in case there is potential continuation after the `.` character. 85 | However, in the *input* we provided, there are no additional characters, 86 | since it is the end of our input. 87 | 88 | We can also try to identify how the token `fast` works by looking at `9`, 89 | first, and seeing that `f` will cause Logos to jump to `7`. 90 | This will then resolve the last letters of our word *fast* by matching `ast`, 91 | which jumps to `8`. Since our provided _input_ to the lexer does not include 92 | alphabetic characters after the word "fast", but rather a whitespace, 93 | the token `::Fast` will be recognized.
94 | Then, the graph will look for further potential continuation (here, `[g-z] => 4`). 95 | 96 | ## Enabling 97 | 98 | To enable debugging output, you can enable the `debug` feature of **Logos** in your 99 | `Cargo.toml` file, like this: 100 | 101 | ```toml 102 | # Cargo.toml 103 | [dependencies] 104 | logos = { version = "1.2.3", features = ["debug"] } 105 | ``` 106 | 107 | Next, you can build your project with `cargo build` and 108 | the output will contain a debug representation of your graph(s). 109 | -------------------------------------------------------------------------------- /book/src/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | The following examples are ordered by increasing level of complexity. 4 | 5 | **[Brainfuck interpreter](./examples/brainfuck.md)**: Lexers are very powerful tools for parsing programs into meaningful instructions. We show you how you can build an interpreter for the Brainfuck programming language in under 100 lines of code! 6 | 7 | **[Simple calculator](./examples/calculator.md)**: For a relatively large domain-specific language (DSL), or any programming language, implementing an interpreter typically involves converting the tokens generated by a lexer into an abstract syntax tree (AST) via a parser, and then evaluating it. We show you how you can build a simple calculator that evaluates arithmetic expressions by combining Logos and a parser generator library. 8 | 9 | **[JSON parser](./examples/json.md)**: We present a JSON parser written with Logos that does nice error reporting when invalid values are encountered. 10 | 11 | **[JSON-borrowed parser](./examples/json_borrowed.md)**: A variant of the previous parser that does not own its data. 12 | 13 | **[String interpolation](./examples/string-interpolation.md)**: An example of using context-dependent lexing to parse a simple language with string interpolation. 14 | -------------------------------------------------------------------------------- /book/src/examples/brainfuck.md: -------------------------------------------------------------------------------- 1 | # Brainfuck interpreter 2 | 3 | In most programming languages, commands can be made of multiple program tokens, where a token is simply a string slice that has a particular meaning for the language. For example, in Rust, the function signature `pub fn main()` could be split by the **lexer** into tokens `pub`, `fn`, `main`, `(`, and `)`. Then, the **parser** combines tokens into meaningful program instructions. 4 | 5 | However, there exist programming languages, such as Brainfuck, that are so simple that each token can be mapped to a single instruction. There are actually 8 single-character tokens: 6 | 7 | ```rust,no_run,noplayground 8 | {{#include ../../../examples/brainfuck.rs:tokens}} 9 | ``` 10 | 11 | All other characters must be ignored. 12 | 13 | Once the tokens are obtained, a Brainfuck interpreter can be easily created using a [Finite-state machine](https://en.wikipedia.org/wiki/Finite-state_machine). For the sake of simplicity, we collected all the tokens into one vector called `operations`. 14 | 15 | Now, creating an interpreter becomes straightforward[^1]: 16 | ```rust,no_run,noplayground 17 | {{#include ../../../examples/brainfuck.rs:fsm}} 18 | ``` 19 | 20 | [^1]: There is a small trick to make it easy. As can be seen in the full code, we first perform a check that all beginning loops (`'['`) have a matching end (`']'`).
21 | 
22 | Finally, we provide you the full code that you should be able to run with[^2]:
23 | ```bash
24 | cargo run --example brainfuck examples/hello_world.bf
25 | ```
26 | 
27 | [^2]: You first need to clone [this repository](https://github.com/maciejhirsz/logos).
28 | 
29 | ```rust,no_run,noplayground
30 | {{#include ../../../examples/brainfuck.rs:all}}
31 | ```
32 | 
--------------------------------------------------------------------------------
/book/src/examples/calculator.md:
--------------------------------------------------------------------------------
1 | # Simple calculator
2 | 
3 | This page (including the images) was contributed by [ynn](https://github.com/your-diary).
4 | 
5 | When you implement an interpreter for a [domain-specific language (DSL)](https://en.wikipedia.org/wiki/Domain-specific_language), or any programming language, the process typically involves the following steps:
6 | 
7 | 1. **Lexing**: Splitting the input stream (i.e., source code string) into tokens via a lexer.
8 | 
9 | 2. **Parsing**: Converting the tokens into an [abstract syntax tree (AST)](https://en.wikipedia.org/wiki/Abstract_syntax_tree) via a parser.
10 | 
11 | 3. **Evaluation**: Evaluating the AST to produce the result.
12 | 
13 | In this example, we implement a simple calculator that evaluates arithmetic expressions such as `1 + 2 * 3` or `((1 + 2) * 3 + 4) * 2 + 4 / 3`.
14 | 
15 | We use `logos` as the lexer generator and [`chumsky`](https://github.com/zesterer/chumsky) as the parser generator.
16 | 
17 | ![flow chart](/assets/calculator_example_flow.png)
18 | 
19 | ## 1. Try It
20 | 
21 | Before diving into the implementation details, let's play with it[^1].
22 | 
23 | ```bash
24 | $ cargo run --example calculator '1 + 7 * (3 - 4) / 2'
25 | ```
26 | 
27 | [^1]: You first need to clone [this repository](https://github.com/maciejhirsz/logos).
28 | 
29 | **Output**:
30 | 
31 | ```
32 | [AST]
33 | Add(
34 |     Int(
35 |         1,
36 |     ),
37 |     Div(
38 |         Mul(
39 |             Int(
40 |                 7,
41 |             ),
42 |             Sub(
43 |                 Int(
44 |                     3,
45 |                 ),
46 |                 Int(
47 |                     4,
48 |                 ),
49 |             ),
50 |         ),
51 |         Int(
52 |             2,
53 |         ),
54 |     ),
55 | )
56 | 
57 | [result]
58 | -2
59 | ```
60 | 
61 | ~~~admonish note title="Full Code" collapsible=true
62 | 
63 | ```rust,no_run,noplayground
64 | {{#include ../../../examples/calculator.rs:all}}
65 | ```
66 | 
67 | ~~~
68 | 
69 | ## 2. Lexer
70 | 
71 | Our calculator supports the following tokens:
72 | 
73 | - Integer literals: `0`, `1`, `15`, etc.;
74 | 
75 | - Unary operator: `-`;
76 | 
77 | - Binary operators: `+`, `-`, `*`, `/`;
78 | 
79 | - Parenthesized expressions: `(3 + 5) * 2`, `((1 + 2) * 3 + 4) * 2 + 3 / 2`, etc.
80 | 
81 | ```rust,no_run,noplayground
82 | {{#include ../../../examples/calculator.rs:tokens}}
83 | ```
84 | 
85 | ## 3. Parser
86 | 
87 | While it is easy enough to manually implement a parser in this case (e.g., [Pratt parsing](https://en.wikipedia.org/wiki/Operator-precedence_parser#Pratt_parsing)), let's just use the [`chumsky`](https://github.com/zesterer/chumsky) crate, which is one of the most popular parser generator libraries in Rust.
88 | 
89 | ### 3.1 AST Definition
90 | 
91 | First, we define the AST.
92 | 
93 | ```rust,no_run,noplayground
94 | {{#include ../../../examples/calculator.rs:ast}}
95 | ```
96 | 
97 | Note that
98 | 
99 | - We name the enum not `AST` but `Expr` because an AST is just nested expressions.
100 | 
101 | - There is no `Parenthesized` variant because parentheses only affect the order of operations (i.e., precedence), which is reflected in the AST structure.
102 | 
103 | - `Box` is used because [a recursive enum is not allowed in Rust](https://stackoverflow.com/questions/25296195/why-are-recursive-struct-types-illegal-in-rust), as illustrated below.
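As a quick illustration of the `Box` indirection, the expression `-(1 + 2)` would be represented by the (hypothetical) value below; note how every child expression lives behind a `Box`:

```rust,no_run,noplayground
// -(1 + 2), written out as an `Expr` value:
Expr::Neg(Box::new(Expr::Add(
    Box::new(Expr::Int(1)),
    Box::new(Expr::Int(2)),
)))
```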
104 | 
105 | ### 3.2 Parser Implementation
106 | 
107 | Next, we define the parser. The code may look a bit complicated if you are not familiar with parser combinator libraries, but it is actually quite simple. See [Chumsky's official tutorial](https://github.com/zesterer/chumsky/blob/main/tutorial.md) for the details.
108 | 
109 | ```rust,no_run,noplayground
110 | {{#include ../../../examples/calculator.rs:parser}}
111 | ```
112 | 
113 | ## 4. Evaluator
114 | 
115 | Evaluating the AST is straightforward. We just implement it using [depth-first search (DFS)](https://en.wikipedia.org/wiki/Depth-first_search) such that the mathematical operations are processed in the correct order.
116 | 
117 | ```rust,no_run,noplayground
118 | {{#include ../../../examples/calculator.rs:evaluator}}
119 | ```
120 | 
121 | **Example**
122 | 
123 | Evaluating `1 + 3 * 12` will proceed as below.
124 | 
125 | ![how evaluator works](/assets/calculator_example_how_evaluator_works.png)
126 | 
127 | ## 5. `main()` Function
128 | 
129 | Finally, we put everything together in the `main()` function.
130 | 
131 | ```rust,no_run,noplayground
132 | {{#include ../../../examples/calculator.rs:main}}
133 | ```
134 | 
135 | ## 6. Extend the Calculator
136 | 
137 | Now that you've implemented a basic calculator, try extending its functionality with the following tasks:
138 | 
139 | - **Handle zero-division gracefully**: The current evaluator panics when zero-division occurs. Change the return type of the evaluator from `isize` to `Result`, making it possible to return an error message.
140 | 
141 | - **Add support for the modulo operator (`%`)**: Update the lexer, parser, and evaluator to handle expressions like `10 % 3`.
142 | 
143 | - **Add support for built-in functions**: Implement built-in functions such as `abs(x)`, `pow(x, y)` or `rand()`.
--------------------------------------------------------------------------------
/book/src/examples/json.md:
--------------------------------------------------------------------------------
1 | # JSON parser
2 | 
3 | JSON is a widely used format for exchanging data between systems, while being human-readable.
4 | 
5 | Possible values are defined recursively and can be any of the following:
6 | 
7 | ```rust,no_run,noplayground
8 | {{#include ../../../examples/json.rs:values}}
9 | ```
10 | 
11 | Objects are delimited with braces `{` and `}`, arrays with brackets `[` and `]`, and values are separated by commas `,`. Newlines, tabs, or spaces should be ignored by the lexer.
12 | 
13 | Knowing that, we can construct a lexer with `Logos` that will identify all those cases:
14 | 
15 | ```rust,no_run,noplayground
16 | {{#include ../../../examples/json.rs:tokens}}
17 | ```
18 | 
19 | ```admonish note
20 | The hardest part is to define valid regexes for the `Number` and `String` variants.
21 | The present solution was inspired by
22 | [this stackoverflow thread](https://stackoverflow.com/questions/32155133/regex-to-match-a-json-string)
23 | and checked against [the JSON specification](https://www.json.org/json-en.html).
24 | ```
25 | 
26 | Once we have our tokens, we must parse them into actual JSON values. We will proceed by creating three functions:
27 | 
28 | + `parse_value` for parsing any JSON value, without prior knowledge of its type;
29 | + `parse_array` for parsing an array, assuming we matched `[`;
30 | + and `parse_object` for parsing an object, assuming we matched `{`.
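All three functions share the same shape; as the diff on the next page confirms, their signatures look like this (`Result` here is an alias defined in the full example, not `std::result::Result` directly):

```rust,no_run,noplayground
fn parse_value<'source>(lexer: &mut Lexer<'source, Token>) -> Result<Value>
fn parse_array<'source>(lexer: &mut Lexer<'source, Token>) -> Result<Value>
fn parse_object<'source>(lexer: &mut Lexer<'source, Token>) -> Result<Value>
```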
31 | 
32 | Starting with parsing an arbitrary value, we can easily obtain the four scalar types, `Bool`, `Null`, `Number`, and `String`, and we call the next two functions for parsing arrays and objects.
33 | 
34 | ```rust,no_run,noplayground
35 | {{#include ../../../examples/json.rs:value}}
36 | ```
37 | 
38 | To parse an array, we simply loop over the tokens, alternating between parsing values and commas, until a closing bracket is found.
39 | 
40 | ```rust,no_run,noplayground
41 | {{#include ../../../examples/json.rs:array}}
42 | ```
43 | 
44 | A similar approach is used for objects, where the only difference is that we expect (key, value) pairs, separated by a colon.
45 | 
46 | ```rust,no_run,noplayground
47 | {{#include ../../../examples/json.rs:object}}
48 | ```
49 | 
50 | Finally, we provide you the full code that you should be able to run with[^1]:
51 | ```bash
52 | cargo run --example json examples/example.json
53 | ```
54 | 
55 | [^1]: You first need to clone [this repository](https://github.com/maciejhirsz/logos).
56 | 
57 | ```rust,no_run,noplayground
58 | {{#include ../../../examples/json.rs:all}}
59 | ```
--------------------------------------------------------------------------------
/book/src/examples/json_borrowed.md:
--------------------------------------------------------------------------------
1 | # JSON parser with borrowed values
2 | 
3 | The previous parser owned its data by allocating strings. This can require quite
4 | some memory, and using borrowed string slices can help us save space, while
5 | potentially also improving performance.
6 | 
7 | If you are familiar with Rust's concept of lifetimes,
8 | using `&str` string slices instead of owned `String`
9 | is straightforward:
10 | 
11 | ```diff
12 | @ 33c29
13 | - enum Token {
14 | + enum Token<'source> {
15 | @ 62,63c58,59
16 | - #[regex(r#""([^"\\\x00-\x1F]|\\(["\\bnfrt/]|u[a-fA-F0-9]{4}))*""#, |lex| lex.slice().to_owned())]
17 | - String(String),
18 | + #[regex(r#""([^"\\\x00-\x1F]|\\(["\\bnfrt/]|u[a-fA-F0-9]{4}))*""#, |lex| lex.slice())]
19 | + String(&'source str),
20 | @ 70c66
21 | - enum Value {
22 | + enum Value<'source> {
23 | @ 78c74
24 | - String(String),
25 | + String(&'source str),
26 | @ 80c76
27 | - Array(Vec<Value>),
28 | + Array(Vec<Value<'source>>),
29 | @ 82c78
30 | - Object(HashMap<String, Value>),
31 | + Object(HashMap<&'source str, Value<'source>>),
32 | @ 88c84
33 | - fn parse_value<'source>(lexer: &mut Lexer<'source, Token>) -> Result<Value> {
34 | + fn parse_value<'source>(lexer: &mut Lexer<'source, Token<'source>>) -> Result<Value<'source>> {
35 | @ 113c109
36 | - fn parse_array<'source>(lexer: &mut Lexer<'source, Token>) -> Result<Value> {
37 | + fn parse_array<'source>(lexer: &mut Lexer<'source, Token<'source>>) -> Result<Value<'source>> {
38 | @ 167c163
39 | - fn parse_object<'source>(lexer: &mut Lexer<'source, Token>) -> Result<Value> {
40 | + fn parse_object<'source>(lexer: &mut Lexer<'source, Token<'source>>) -> Result<Value<'source>> {
41 | ```
42 | 
43 | The above code shows the lines you need to change from the previous example
44 | to use borrowed data.
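For instance, after applying the diff, the `String` token variant borrows its contents straight from the source instead of allocating a new `String`:

```rust,no_run,noplayground
#[regex(r#""([^"\\\x00-\x1F]|\\(["\\bnfrt/]|u[a-fA-F0-9]{4}))*""#, |lex| lex.slice())]
String(&'source str),
```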
45 | 
46 | Finally, we provide you the full code that you should be able to run with[^1]:
47 | ```bash
48 | cargo run --example json_borrowed examples/example.json
49 | ```
50 | 
51 | [^1]: You first need to clone [this repository](https://github.com/maciejhirsz/logos).
52 | 
53 | ```rust,no_run,noplayground
54 | {{#include ../../../examples/json_borrowed.rs:all}}
55 | ```
56 | 
--------------------------------------------------------------------------------
/book/src/extras.md:
--------------------------------------------------------------------------------
1 | # Using `Extras`
2 | 
3 | When deriving the `Logos` trait, you may want to convey some internal state
4 | between your tokens. That is where `Logos::Extras` comes to the rescue.
5 | 
6 | Each `Lexer` has a public field called `extras` that can be accessed and
7 | mutated to keep track of and modify some internal state. By default,
8 | this field is set to `()`, but its type can be modified using the derive
9 | attribute `#[logos(extras = <type>)]` on your `enum` declaration.
10 | 
11 | For example, one may want to know the location, both line and column indices,
12 | of each token. This is especially useful when one needs to report an erroneous
13 | token to the user, in a user-friendly manner.
14 | 
15 | ```rust,no_run,noplayground
16 | {{#include ../../examples/extras.rs:tokens}}
17 | ```
18 | 
19 | The above token definition will hold two tokens: `Newline` and `Word`.
20 | The former is only used to keep track of the line numbering and will be skipped
21 | using `Skip` as a return value from its callback function. The latter will be
22 | a word with `(line, column)` indices.
23 | 
24 | To make it easy, the lexer will contain the following two extras:
25 | 
26 | + `extras.0`: the line number;
27 | + `extras.1`: the char index of the current line.
28 | 
29 | We now have to define the two callback functions:
30 | 
31 | ```rust,no_run,noplayground
32 | {{#include ../../examples/extras.rs:callbacks}}
33 | ```
34 | 
35 | Extras can of course be used for more complicated logic, and there is no limit
36 | to what you can store within the public `extras` field.
37 | 
38 | Finally, we provide you the full code that you should be able to run with[^1]:
39 | ```bash
40 | cargo run --example extras Cargo.toml
41 | ```
42 | 
43 | [^1]: You first need to clone [this repository](https://github.com/maciejhirsz/logos).
44 | 
45 | ```rust,no_run,noplayground
46 | {{#include ../../examples/extras.rs:all}}
47 | ```
48 | 
--------------------------------------------------------------------------------
/book/src/getting-help.md:
--------------------------------------------------------------------------------
1 | # Getting Help
2 | 
3 | If you need help using **Logos**, there are three places you can go to, depending
4 | on what you are looking for:
5 | 
6 | + [this book](./) for a documented walkthrough of **Logos**' usage, with detailed
7 |   examples, and more. A **must read** for any newcomer;
8 | + [the API documentation](https://docs.rs/logos/latest/logos/) to obtain precise
9 |   information about function signatures and what the Logos crate exposes in
10 |   terms of features;
11 | + and [GitHub issues](https://github.com/maciejhirsz/logos/issues) for anything
12 |   else that is not covered by either of the two above.
13 | 
14 | Regarding [GitHub issues](https://github.com/maciejhirsz/logos/issues),
15 | it's highly recommended to first check if another issue, either open or closed,
16 | already covers the topic you are looking for.
If not, consider creating a
17 | new issue with the necessary information about your question or problem.
--------------------------------------------------------------------------------
/book/src/getting-started.md:
--------------------------------------------------------------------------------
1 | # Getting Started
2 | 
3 | **Logos** can be included in your Rust project using the `cargo add logos` command, or by directly modifying your `Cargo.toml` file:
4 | 
5 | ```toml
6 | [dependencies]
7 | logos = "0.15.0"
8 | ```
9 | 
10 | Then, you can automatically derive the [`Logos`](https://docs.rs/logos/latest/logos/trait.Logos.html) trait on your `enum` using the `Logos` derive macro:
11 | 
12 | ```rust,no_run,noplayground
13 | use logos::Logos;
14 | 
15 | #[derive(Logos, Debug, PartialEq)]
16 | #[logos(skip r"[ \t\n\f]+")] // Ignore this regex pattern between tokens
17 | enum Token {
18 |     // Tokens can be literal strings, of any length.
19 |     #[token("fast")]
20 |     Fast,
21 | 
22 |     #[token(".")]
23 |     Period,
24 | 
25 |     // Or regular expressions.
26 |     #[regex("[a-zA-Z]+")]
27 |     Text,
28 | }
29 | ```
30 | 
31 | Then, you can use the `Logos::lexer` method to turn any `&str` into an iterator of tokens[^1]:
32 | 
33 | ```rust,no_run,noplayground
34 | let mut lex = Token::lexer("Create ridiculously fast Lexers.");
35 | 
36 | assert_eq!(lex.next(), Some(Ok(Token::Text)));
37 | assert_eq!(lex.span(), 0..6);
38 | assert_eq!(lex.slice(), "Create");
39 | 
40 | assert_eq!(lex.next(), Some(Ok(Token::Text)));
41 | assert_eq!(lex.span(), 7..19);
42 | assert_eq!(lex.slice(), "ridiculously");
43 | 
44 | assert_eq!(lex.next(), Some(Ok(Token::Fast)));
45 | assert_eq!(lex.span(), 20..24);
46 | assert_eq!(lex.slice(), "fast");
47 | 
48 | assert_eq!(lex.next(), Some(Ok(Token::Text)));
49 | assert_eq!(lex.slice(), "Lexers");
50 | assert_eq!(lex.span(), 25..31);
51 | 
52 | assert_eq!(lex.next(), Some(Ok(Token::Period)));
53 | assert_eq!(lex.span(), 31..32);
54 | assert_eq!(lex.slice(), ".");
55 | 
56 | assert_eq!(lex.next(), None);
57 | ```
58 | 
59 | [^1]: Each item is actually a [`Result`](https://docs.rs/logos/latest/logos/struct.Lexer.html#associatedtype.Item), because the lexer returns an error if some part of the string slice does not match any variant of `Token`.
60 | 
61 | Because [`Lexer`](https://docs.rs/logos/latest/logos/struct.Lexer.html), returned by [`Logos::lexer`](https://docs.rs/logos/latest/logos/trait.Logos.html#method.lexer), implements the `Iterator` trait, you can use a `for .. in` construct:
62 | 
63 | ```rust,no_run,noplayground
64 | for result in Token::lexer("Create ridiculously fast Lexers.") {
65 |     match result {
66 |         Ok(token) => println!("{:#?}", token),
67 |         Err(e) => panic!("some error occurred: {:?}", e),
68 |     }
69 | }
70 | ```
--------------------------------------------------------------------------------
/book/src/intro.md:
--------------------------------------------------------------------------------
1 | # Logos Handbook
2 | 
3 | [![Crates.io version shield](https://img.shields.io/crates/v/logos.svg)](https://crates.io/crates/logos)
4 | [![Docs](https://docs.rs/logos/badge.svg)](https://docs.rs/logos)
5 | [![Crates.io license shield](https://img.shields.io/crates/l/logos.svg)](https://crates.io/crates/logos)
6 | 
7 | <img src="https://raw.githubusercontent.com/maciejhirsz/logos/master/logos.svg?sanitize=true" alt="Logos logo" width="250" align="right">
8 | 
9 | Hi there!
10 | 
11 | **Logos** is a fast and easy to use [lexer](https://en.wikipedia.org/wiki/Lexical_analysis)
12 | generator written in Rust.
While Rust has excellent documentation tools (and you can access
13 | the [API docs for Logos at docs.rs](https://docs.rs/logos/)), it's not the easiest thing to
14 | document custom syntax used by procedural macros, of which Logos has a bit. This Handbook
15 | seeks to remedy this!
16 | 
17 | ## In a nutshell
18 | 
19 | There are two main types in **Logos**:
20 | 
21 | + The `Logos` trait, which comes with its own derive macro. The derive
22 |   macro uses custom attributes (the things using these brackets: `#[...]`)
23 |   with plain string or [regular expression](https://en.wikipedia.org/wiki/Regular_expression)
24 |   syntax on `enum` variants as _patterns_ for some input.
25 | + The `Lexer`, which is an iterator that takes some input (`&str`,
26 |   sometimes `&[u8]`) and performs lexical analysis on the input on the go,
27 |   producing variants of the enum `T` matching the defined patterns.
--------------------------------------------------------------------------------
/book/src/token-disambiguation.md:
--------------------------------------------------------------------------------
1 | # Token disambiguation
2 | 
3 | When two or more tokens can match a given sequence, **Logos** computes the
4 | priority of each pattern (`#[token]` or `#[regex]`), and uses that priority
5 | to decide which pattern should match.
6 | 
7 | The rule of thumb is:
8 | 
9 | + Longer beats shorter.
10 | + Specific beats generic.
11 | 
12 | If any two definitions could match the same input, like `fast` and `[a-zA-Z]+`
13 | in the earlier examples, it's the longer and more specific definition of `Token::Fast`
14 | that will be the result.
15 | 
16 | This is done by comparing the numeric priority attached to each definition. Every
17 | consecutive, non-repeating single byte adds 2 to the priority, while every range
18 | or regex class adds 1.
19 | Loops or optional blocks are ignored, while alternations count the shortest alternative:
20 | 
21 | + `[a-zA-Z]+` has a priority of 2 (lowest possible), because at minimum it can
22 |   match a single byte to a class;
23 | + `foobar` has a priority of 12;
24 | + and `(foo|hello)(bar)?` has a priority of 6, `foo` being its shortest possible match.
25 | 
26 | Generally speaking, equivalent regex patterns have the same priority. E.g.,
27 | `a|b` is equivalent to `[a-b]`, and both have a priority of 2.
28 | 
29 | ```admonish info
30 | When two different patterns have the same priority,
31 | **Logos** will issue a compilation error.
32 | To prevent this from happening, you can manually set the priority of a given
33 | pattern with, e.g., `#[token("foobar", priority = 20)]`.
34 | ```
--------------------------------------------------------------------------------
/book/src/unsafe.md:
--------------------------------------------------------------------------------
1 | # Unsafe Code
2 | 
3 | By default, **Logos** uses unsafe code to avoid unnecessary bounds checks while
4 | accessing slices of the input `Source`.
5 | 
6 | This unsafe code also exists in the code generated by the `Logos` derive macro,
7 | which generates a deterministic finite automaton (DFA). Reasoning about the correctness
8 | of this generated code can be difficult: if the derivation of the DFA in `Logos`
9 | is correct, then this generated code will be correct, and any mistakes in implementation
10 | would be caught given sufficient fuzz testing.
11 | 
12 | Use of unsafe code is the default, as this typically provides the fastest parser.
13 | 
14 | ## Disabling Unsafe Code
15 | 
16 | However, for applications accepting untrusted input in a trusted context, this
17 | may not be a sufficient correctness justification.
18 | 
19 | For those applications which cannot tolerate unsafe code, the feature `forbid-unsafe`
20 | may be enabled. This replaces unchecked accesses in the `Logos` crate with safe,
21 | checked alternatives which will panic on out-of-bounds access rather than cause
22 | undefined behavior. Additionally, code generated by the macro will not use the
23 | `unsafe` keyword, so generated code may be used in crates using the
24 | `#![forbid(unsafe_code)]` attribute.
25 | 
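As a minimal sketch, opting into the feature looks like this (the version number mirrors the one used in the Getting Started chapter):

```toml
[dependencies]
logos = { version = "0.15.0", features = ["forbid-unsafe"] }
```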
26 | When the `forbid-unsafe` feature is added to a direct dependency on the `Logos` crate,
27 | [Feature Unification](https://doc.rust-lang.org/cargo/reference/features.html#feature-unification)
28 | ensures any transitive inclusion of `Logos` via other dependencies also has unsafe
29 | code disabled.
30 | 
31 | Generally, disabling unsafe code will result in a slower parser.
32 | 
33 | However, making definitive statements about the performance of safe-only code is difficult,
34 | as there are too many variables to consider between compiler optimizations,
35 | the specific grammar being parsed, and the target processor. The automated benchmarks
36 | of this crate show around a 10% slowdown in safe-only code at the time of this writing.
--------------------------------------------------------------------------------
/examples/brainfuck.rs:
--------------------------------------------------------------------------------
1 | //! Brainfuck interpreter written in Rust, using Logos.
2 | //!
3 | //! Usage:
4 | //!     cargo run --example brainfuck <path/to/file.bf>
5 | //!
6 | //! Example:
7 | //!     cargo run --example brainfuck examples/hello_world.bf
8 | //!
9 | //! Brainfuck is an esoteric programming language that only
10 | //! uses 8 single-character commands:
11 | //! - '>';
12 | //! - '<';
13 | //! - '+';
14 | //! - '-';
15 | //! - '.';
16 | //! - ',';
17 | //! - '[';
18 | //! - and ']'.
19 | //!
20 | //! Despite being very hard to use in practice, this makes
21 | //! this language very simple to interpret. The following code
22 | //! defines an [`execute`] function that runs Brainfuck code.
23 | //!
24 | //! Logos is used here to directly transform the code stream
25 | //! into meaningful `Op` operations (or commands).
26 | //! Errors, i.e., unknown tokens, are discarded using `filter_map`.
27 | //!
28 | //! More details can be found on Wikipedia:
29 | //! <https://en.wikipedia.org/wiki/Brainfuck>.
30 | //!
31 | //! or on <https://esolangs.org/wiki/Brainfuck>.
32 | 
33 | /* ANCHOR: all */
34 | use logos::Logos;
35 | use std::collections::HashMap;
36 | use std::env;
37 | use std::fs;
38 | use std::io::{self, Read};
39 | 
40 | /* ANCHOR: tokens */
41 | /// Each [`Op`] variant is a single character.
42 | #[derive(Debug, Logos)]
43 | enum Op {
44 |     /// Increment pointer.
45 |     #[token(">")]
46 |     IncPointer,
47 |     /// Decrement pointer.
48 |     #[token("<")]
49 |     DecPointer,
50 |     /// Increment data at pointer.
51 |     #[token("+")]
52 |     IncData,
53 |     /// Decrement data at pointer.
54 |     #[token("-")]
55 |     DecData,
56 |     /// Output data at pointer.
57 |     #[token(".")]
58 |     OutData,
59 |     /// Input (read) to data at pointer.
60 |     #[token(",")]
61 |     InpData,
62 |     /// Conditionally jump to matching `']'`.
63 |     #[token("[")]
64 |     CondJumpForward,
65 |     /// Conditionally jump to matching `'['`.
66 |     #[token("]")]
67 |     CondJumpBackward,
68 | }
69 | /* ANCHOR_END: tokens */
70 | 
71 | /// Print one byte to the terminal.
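///
/// In this interpreter, the `.` (`OutData`) command calls this function with
/// the byte at the current data pointer.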
72 | #[inline(always)] 73 | fn print_byte(byte: u8) { 74 | print!("{}", byte as char); 75 | } 76 | 77 | /// Read one byte from the terminal. 78 | #[inline(always)] 79 | fn read_byte() -> u8 { 80 | let mut input = [0u8; 1]; 81 | io::stdin() 82 | .read_exact(&mut input) 83 | .expect("An error occurred while reading byte!"); 84 | input[0] 85 | } 86 | 87 | /// Execute Brainfuck code from a string slice. 88 | pub fn execute(code: &str) { 89 | let operations: Vec<_> = Op::lexer(code).filter_map(|op| op.ok()).collect(); 90 | let mut data = [0u8; 30_000]; // Minimum recommended size 91 | let mut pointer: usize = 0; 92 | let len = operations.len(); 93 | 94 | // We pre-process matching jump commands, and we create 95 | // a mapping between them. 96 | let mut queue = Vec::new(); 97 | let mut pairs = HashMap::new(); 98 | let mut pairs_reverse = HashMap::new(); 99 | 100 | for (i, op) in operations.iter().enumerate() { 101 | match op { 102 | Op::CondJumpForward => queue.push(i), 103 | Op::CondJumpBackward => { 104 | if let Some(start) = queue.pop() { 105 | pairs.insert(start, i); 106 | pairs_reverse.insert(i, start); 107 | } else { 108 | panic!( 109 | "Unexpected conditional backward jump at position {}, does not match any '['", 110 | i 111 | ); 112 | } 113 | } 114 | _ => (), 115 | } 116 | } 117 | 118 | if !queue.is_empty() { 119 | panic!("Unmatched conditional forward jump at positions {:?}, expecting a closing ']' for each of them", queue); 120 | } 121 | 122 | /* ANCHOR: fsm */ 123 | let mut i: usize = 0; 124 | // True program execution. 125 | loop { 126 | match operations[i] { 127 | Op::IncPointer => pointer += 1, 128 | Op::DecPointer => pointer -= 1, 129 | Op::IncData => data[pointer] = data[pointer].wrapping_add(1), 130 | Op::DecData => data[pointer] = data[pointer].wrapping_sub(1), 131 | Op::OutData => print_byte(data[pointer]), 132 | Op::InpData => data[pointer] = read_byte(), 133 | Op::CondJumpForward => { 134 | if data[pointer] == 0 { 135 | // Skip until matching end. 136 | i = *pairs.get(&i).unwrap(); 137 | } 138 | } 139 | Op::CondJumpBackward => { 140 | if data[pointer] != 0 { 141 | // Go back to matching start. 142 | i = *pairs_reverse.get(&i).unwrap(); 143 | } 144 | } 145 | } 146 | i += 1; 147 | 148 | if i >= len { 149 | break; 150 | } 151 | } 152 | /* ANCHOR_END: fsm */ 153 | } 154 | 155 | fn main() { 156 | let src = fs::read_to_string(env::args().nth(1).expect("Expected file argument")) 157 | .expect("Failed to read file"); 158 | 159 | execute(src.as_str()); 160 | } 161 | /* ANCHOR_END: all */ 162 | -------------------------------------------------------------------------------- /examples/calculator.rs: -------------------------------------------------------------------------------- 1 | //! Simple calculator. 2 | //! 3 | //! Usage: 4 | //! cargo run --example calculator 5 | //! 6 | //! Example: 7 | //! cargo run --example calculator '1 + 7 * (3 - 4) / 2' 8 | //! 9 | //! Following constructs are supported: 10 | //! - integer literals: `0`, `1`, `15`, etc. 11 | //! - unary operator: `-` 12 | //! - binary operators: `+`, `-`, `*`, `/` 13 | //! 
- parentheses: `(`, `)`
14 | 
15 | /* ANCHOR: all */
16 | use std::env;
17 | 
18 | use chumsky::prelude::*;
19 | use logos::Logos;
20 | 
21 | /* ANCHOR: tokens */
22 | #[derive(Logos, Debug, PartialEq, Eq, Hash, Clone)]
23 | #[logos(skip r"[ \t\n]+")]
24 | #[logos(error = String)]
25 | enum Token {
26 |     #[token("+")]
27 |     Plus,
28 | 
29 |     #[token("-")]
30 |     Minus,
31 | 
32 |     #[token("*")]
33 |     Multiply,
34 | 
35 |     #[token("/")]
36 |     Divide,
37 | 
38 |     #[token("(")]
39 |     LParen,
40 | 
41 |     #[token(")")]
42 |     RParen,
43 | 
44 |     #[regex("[0-9]+", |lex| lex.slice().parse::<isize>().unwrap())]
45 |     Integer(isize),
46 | }
47 | /* ANCHOR_END: tokens */
48 | 
49 | /* ANCHOR: ast */
50 | #[derive(Debug)]
51 | enum Expr {
52 |     // Integer literal.
53 |     Int(isize),
54 | 
55 |     // Unary minus.
56 |     Neg(Box<Expr>),
57 | 
58 |     // Binary operators.
59 |     Add(Box<Expr>, Box<Expr>),
60 |     Sub(Box<Expr>, Box<Expr>),
61 |     Mul(Box<Expr>, Box<Expr>),
62 |     Div(Box<Expr>, Box<Expr>),
63 | }
64 | /* ANCHOR_END: ast */
65 | 
66 | /* ANCHOR: evaluator */
67 | impl Expr {
68 |     fn eval(&self) -> isize {
69 |         match self {
70 |             Expr::Int(n) => *n,
71 |             Expr::Neg(rhs) => -rhs.eval(),
72 |             Expr::Add(lhs, rhs) => lhs.eval() + rhs.eval(),
73 |             Expr::Sub(lhs, rhs) => lhs.eval() - rhs.eval(),
74 |             Expr::Mul(lhs, rhs) => lhs.eval() * rhs.eval(),
75 |             Expr::Div(lhs, rhs) => lhs.eval() / rhs.eval(),
76 |         }
77 |     }
78 | }
79 | /* ANCHOR_END: evaluator */
80 | 
81 | #[allow(clippy::let_and_return)]
82 | /* ANCHOR: parser */
83 | fn parser() -> impl Parser<Token, Expr, Error = Simple<Token>> {
84 |     recursive(|p| {
85 |         let atom = {
86 |             let parenthesized = p
87 |                 .clone()
88 |                 .delimited_by(just(Token::LParen), just(Token::RParen));
89 | 
90 |             let integer = select! {
91 |                 Token::Integer(n) => Expr::Int(n),
92 |             };
93 | 
94 |             parenthesized.or(integer)
95 |         };
96 | 
97 |         let unary = just(Token::Minus)
98 |             .repeated()
99 |             .then(atom)
100 |             .foldr(|_op, rhs| Expr::Neg(Box::new(rhs)));
101 | 
102 |         let binary_1 = unary
103 |             .clone()
104 |             .then(
105 |                 just(Token::Multiply)
106 |                     .or(just(Token::Divide))
107 |                     .then(unary)
108 |                     .repeated(),
109 |             )
110 |             .foldl(|lhs, (op, rhs)| match op {
111 |                 Token::Multiply => Expr::Mul(Box::new(lhs), Box::new(rhs)),
112 |                 Token::Divide => Expr::Div(Box::new(lhs), Box::new(rhs)),
113 |                 _ => unreachable!(),
114 |             });
115 | 
116 |         let binary_2 = binary_1
117 |             .clone()
118 |             .then(
119 |                 just(Token::Plus)
120 |                     .or(just(Token::Minus))
121 |                     .then(binary_1)
122 |                     .repeated(),
123 |             )
124 |             .foldl(|lhs, (op, rhs)| match op {
125 |                 Token::Plus => Expr::Add(Box::new(lhs), Box::new(rhs)),
126 |                 Token::Minus => Expr::Sub(Box::new(lhs), Box::new(rhs)),
127 |                 _ => unreachable!(),
128 |             });
129 | 
130 |         binary_2
131 |     })
132 |     .then_ignore(end())
133 | }
134 | /* ANCHOR_END: parser */
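// Note on precedence: `unary` binds tightest; `binary_1` layers `*` and `/`
// on top of it, and `binary_2` layers `+` and `-` on top of `binary_1`, so
// multiplication and division group before addition and subtraction.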
`1 + 7 * (3 - 4) / 5`)"); 142 | 143 | //creates a lexer instance from the input 144 | let lexer = Token::lexer(&input); 145 | 146 | //splits the input into tokens, using the lexer 147 | let mut tokens = vec![]; 148 | for (token, span) in lexer.spanned() { 149 | match token { 150 | Ok(token) => tokens.push(token), 151 | Err(e) => { 152 | println!("lexer error at {:?}: {}", span, e); 153 | return; 154 | } 155 | } 156 | } 157 | 158 | //parses the tokens to construct an AST 159 | let ast = match parser().parse(tokens) { 160 | Ok(expr) => { 161 | println!("[AST]\n{:#?}", expr); 162 | expr 163 | } 164 | Err(e) => { 165 | println!("parse error: {:#?}", e); 166 | return; 167 | } 168 | }; 169 | 170 | //evaluates the AST to get the result 171 | println!("\n[result]\n{}", ast.eval()); 172 | } 173 | /* ANCHOR_END: main */ 174 | /* ANCHOR_END: all */ 175 | -------------------------------------------------------------------------------- /examples/custom_error.rs: -------------------------------------------------------------------------------- 1 | //! ASCII tokens lexer with custom error type. 2 | //! 3 | //! Takes tabs-or-spaces separated words or u8 numbers, 4 | //! only accepting ascii letters. 5 | //! 6 | //! Usage: 7 | //! cargo run --example custom_error 8 | 9 | /* ANCHOR: all */ 10 | use logos::Logos; 11 | 12 | use std::num::ParseIntError; 13 | 14 | #[derive(Default, Debug, Clone, PartialEq)] 15 | enum LexingError { 16 | InvalidInteger(String), 17 | #[default] 18 | NonAsciiCharacter, 19 | } 20 | 21 | /// Error type returned by calling `lex.slice().parse()` to u8. 22 | impl From for LexingError { 23 | fn from(err: ParseIntError) -> Self { 24 | use std::num::IntErrorKind::*; 25 | match err.kind() { 26 | PosOverflow | NegOverflow => LexingError::InvalidInteger("overflow error".to_owned()), 27 | _ => LexingError::InvalidInteger("other error".to_owned()), 28 | } 29 | } 30 | } 31 | 32 | #[derive(Debug, Logos, PartialEq)] 33 | #[logos(error = LexingError)] 34 | #[logos(skip r"[ \t]+")] 35 | enum Token { 36 | #[regex(r"[a-zA-Z]+")] 37 | Word, 38 | #[regex(r"[0-9]+", |lex| lex.slice().parse())] 39 | Integer(u8), 40 | } 41 | 42 | fn main() { 43 | // 256 overflows u8, since u8's max value is 255. 44 | // 'é' is not a valid ascii letter. 45 | let mut lex = Token::lexer("Hello 256 Jérome"); 46 | 47 | assert_eq!(lex.next(), Some(Ok(Token::Word))); 48 | assert_eq!(lex.slice(), "Hello"); 49 | 50 | assert_eq!( 51 | lex.next(), 52 | Some(Err(LexingError::InvalidInteger( 53 | "overflow error".to_owned() 54 | ))) 55 | ); 56 | assert_eq!(lex.slice(), "256"); 57 | 58 | assert_eq!(lex.next(), Some(Ok(Token::Word))); 59 | assert_eq!(lex.slice(), "J"); 60 | 61 | assert_eq!(lex.next(), Some(Err(LexingError::NonAsciiCharacter))); 62 | assert_eq!(lex.slice(), "é"); 63 | 64 | assert_eq!(lex.next(), Some(Ok(Token::Word))); 65 | assert_eq!(lex.slice(), "rome"); 66 | 67 | assert_eq!(lex.next(), None); 68 | } 69 | /* ANCHOR_END: all */ 70 | -------------------------------------------------------------------------------- /examples/example.json: -------------------------------------------------------------------------------- 1 | [ 2 | 3 | { 4 | "selftext" : "Hey, folks!\n\nWhile /r/Funny has always had a strong preference for original content – it's right there in Rule 3, after all – we've never required users in good standing to post only things that they personally created. However, we *have* frequently taken steps to cut down on low-effort, low-quality submissions (like memes, screenshots of social media, and so on)... 
and although we're a little bit late to the game with this, we're going to take another such step:\n\n**Henceforth, AI-generated content of any kind may not be posted in /r/Funny.**\n\nWe know, we know. \"Welcome to 2022,\" right? We're well aware that the novelty of things like Midjourney, ChatGPT, Bing, Rutabaga, Bard, DALL-E, StorFisa, DeepAI, and other such programs is quickly wearing off, and we've seen the growing disillusionment, disapproval, and general annoyance that folks have been voicing... but in our defense, we made up two of those services, so you can't *really* be upset about people using them.\n\nAnyway, this change was prompted by a few different factors (in addition to addressing users' concerns), but one of the most prominent is the fact that AI-generated content requires almost no involvement on the part of a given submitter: While a glorified algorithm may spit out some images, the *user's* only contribution – assuming that they didn't design, code, and train said algorithm, of course – is a short prompt. That requires even less effort than \"making\" memes or taking screenshots of social media does, so if the goal is to encourage high-quality, original content... well, you see the obvious conclusion.\n\nThe TL;DR is that we want to keep /r/Funny as pleasant as possible for contributors, participants, and lurkers alike, so until such time as *real* AIs start registering Reddit accounts (which our counterparts from the future¹ say will happen on September 12th, 2097), AI-generated content will not be allowed.\n\n------\n\n^¹ ^(Yes, we have a time-machine, and no, it isn't just a Magic 8-Ball that we duct-taped to a frog.)", 5 | 6 | "WHO": "Joe", 7 | "WEEK": [ 8 | { 9 | "NUMBER": 3, 10 | "EXPENSE": [ 11 | { 12 | "WHAT": "Beer", 13 | "AMOUNT": 18.00 14 | }, 15 | { 16 | "WHAT": "Food", 17 | "AMOUNT": 12.00 18 | }, 19 | { 20 | "WHAT": "Food", 21 | "AMOUNT": 19.00 22 | }, 23 | { 24 | "WHAT": "Car", 25 | "AMOUNT": 20.00 26 | } 27 | ] 28 | } 29 | ] 30 | }, 31 | { 32 | "updated_at": "2015-01-01T15:00:06Z", 33 | "glossary": { 34 | "title": "example glossary", 35 | "GlossDiv": { 36 | "title": "S", 37 | "GlossList": { 38 | "GlossEntry": { 39 | "ID": "SGML", 40 | "SortAs": "SGML", 41 | "GlossTerm": "Standard Generalized Markup Language", 42 | "Acronym": "SGML", 43 | "Abbrev": "ISO 8879:1986", 44 | "GlossDef": { 45 | "para": "A meta-markup language, used to create markup languages such as DocBook.", 46 | "GlossSeeAlso": ["GML", "XML"] 47 | }, 48 | "GlossSee": "markup" 49 | } 50 | } 51 | } 52 | } 53 | } 54 | ] 55 | -------------------------------------------------------------------------------- /examples/extras.rs: -------------------------------------------------------------------------------- 1 | //! Print line and column positions for each word in a file. 2 | //! 3 | //! Usage: 4 | //! cargo run --example extras 5 | //! 6 | //! Example: 7 | //! cargo run --example extras Cargo.toml 8 | //! 9 | //! This is a small example on how to use 10 | //! [`Extras`](https://docs.rs/logos/latest/logos/trait.Logos.html#associatedtype.Extras) 11 | //! to convey some (mutable) internal state from token to token. 12 | //! 13 | //! Here, the extras will be a tuple with the following fields: 14 | //! 15 | //! + 0. the line number; 16 | //! + 1. the char index of the current line. 17 | //! 18 | //! From then, one can easily compute the column number of some token by computing: 19 | //! 20 | //! ```rust,no_run,no_playground 21 | //! fn get_column(lex: &Lexer) -> usize { 22 | //! 
lex.span().start - lex.extras.1
23 | //! }
24 | //! ```
25 | 
26 | /* ANCHOR: all */
27 | use logos::{Lexer, Logos, Skip};
28 | use std::env;
29 | use std::fs;
30 | 
31 | /* ANCHOR: callbacks */
32 | /// Update the line count and the char index.
33 | fn newline_callback(lex: &mut Lexer<Token>) -> Skip {
34 |     lex.extras.0 += 1;
35 |     lex.extras.1 = lex.span().end;
36 |     Skip
37 | }
38 | 
39 | /// Compute the line and column position for the current word.
40 | fn word_callback(lex: &mut Lexer<Token>) -> (usize, usize) {
41 |     let line = lex.extras.0;
42 |     let column = lex.span().start - lex.extras.1;
43 | 
44 |     (line, column)
45 | }
46 | /* ANCHOR_END: callbacks */
47 | 
48 | /* ANCHOR: tokens */
49 | /// Simple tokens to retrieve words and their location.
50 | #[derive(Debug, Logos)]
51 | #[logos(extras = (usize, usize))]
52 | enum Token {
53 |     #[regex(r"\n", newline_callback)]
54 |     Newline,
55 | 
56 |     #[regex(r"\w+", word_callback)]
57 |     Word((usize, usize)),
58 | }
59 | /* ANCHOR_END: tokens */
60 | 
61 | fn main() {
62 |     let src = fs::read_to_string(env::args().nth(1).expect("Expected file argument"))
63 |         .expect("Failed to read file");
64 | 
65 |     let mut lex = Token::lexer(src.as_str());
66 | 
67 |     while let Some(token) = lex.next() {
68 |         if let Ok(Token::Word((line, column))) = token {
69 |             println!("Word '{}' found at ({}, {})", lex.slice(), line, column);
70 |         }
71 |     }
72 | }
73 | /* ANCHOR_END: all */
--------------------------------------------------------------------------------
/examples/hello_world.bf:
--------------------------------------------------------------------------------
1 | [ This program prints "Hello World!" and a newline to the screen, its
2 | length is 106 active command characters. [It is not the shortest.]
3 | 
4 | This loop is an "initial comment loop", a simple way of adding a comment
5 | to a BF program such that you don't have to worry about any command
6 | characters. Any ".", ",", "+", "-", "<" and ">" characters are simply
7 | ignored, the "[" and "]" characters just have to be balanced. This
8 | loop and the commands it contains are ignored because the current cell
9 | defaults to a value of 0; the 0 value causes this loop to be skipped.
10 | ]
11 | ++++++++ Set Cell #0 to 8
12 | [
13 | >++++ Add 4 to Cell #1; this will always set Cell #1 to 4
14 | [ as the cell will be cleared by the loop
15 | >++ Add 2 to Cell #2
16 | >+++ Add 3 to Cell #3
17 | >+++ Add 3 to Cell #4
18 | >+ Add 1 to Cell #5
19 | <<<<- Decrement the loop counter in Cell #1
20 | ] Loop until Cell #1 is zero; number of iterations is 4
21 | >+ Add 1 to Cell #2
22 | >+ Add 1 to Cell #3
23 | >- Subtract 1 from Cell #4
24 | >>+ Add 1 to Cell #6
25 | [<] Move back to the first zero cell you find; this will
26 | be Cell #1 which was cleared by the previous loop
27 | <- Decrement the loop Counter in Cell #0
28 | ] Loop until Cell #0 is zero; number of iterations is 8
29 | 
30 | The result of this is:
31 | Cell no : 0 1 2 3 4 5 6
32 | Contents: 0 0 72 104 88 32 8
33 | Pointer : ^
34 | 
35 | >>. Cell #2 has value 72 which is 'H'
36 | >---. Subtract 3 from Cell #3 to get 101 which is 'e'
37 | +++++++..+++. Likewise for 'llo' from Cell #3
38 | >>. Cell #5 is 32 for the space
39 | <-. Subtract 1 from Cell #4 for 87 to give a 'W'
40 | <. Cell #3 was set to 'o' from the end of 'Hello'
41 | +++.------.--------. Cell #3 for 'rl' and 'd'
42 | >>+. Add 1 to Cell #5 gives us an exclamation point
43 | >++. And finally a newline from Cell #6
--------------------------------------------------------------------------------
/examples/string-interpolation.rs:
--------------------------------------------------------------------------------
1 | /* ANCHOR: all */
2 | use std::collections::HashMap;
3 | 
4 | use logos::{Lexer, Logos};
5 | 
6 | /* ANCHOR: lexers */
7 | type SymbolTable = HashMap<String, String>;
8 | 
9 | #[derive(Logos, Debug, PartialEq, Clone)]
10 | #[logos(skip r"\s+")]
11 | #[logos(extras = SymbolTable)]
12 | enum VariableDefinitionContext {
13 |     #[regex(r"[[:alpha:]][[:alnum:]]*", variable_definition)]
14 |     Id((String /* variable name */, String /* value */)),
15 |     #[token("=")]
16 |     Equals,
17 |     #[token("'")]
18 |     Quote,
19 | }
20 | 
21 | #[derive(Logos, Debug, PartialEq, Clone)]
22 | #[logos(extras = SymbolTable)]
23 | enum StringContext {
24 |     #[token("'")]
25 |     Quote,
26 |     #[regex("[^'$]+")]
27 |     Content,
28 |     #[token("${", evaluate_interpolation)]
29 |     InterpolationStart(String /* evaluated value of the interpolation */),
30 |     #[token("$")]
31 |     DollarSign,
32 | }
33 | 
34 | #[derive(Logos, Debug, PartialEq, Clone)]
35 | #[logos(skip r"\s+")]
36 | #[logos(extras = SymbolTable)]
37 | enum StringInterpolationContext {
38 |     #[regex(r"[[:alpha:]][[:alnum:]]*", get_variable_value)]
39 |     Id(String /* value for the given id */),
40 |     #[token("'")]
41 |     Quote,
42 |     #[token("}")]
43 |     InterpolationEnd,
44 | }
45 | /* ANCHOR_END: lexers */
46 | 
47 | /* ANCHOR: variable_definition */
48 | fn get_string_content(lex: &mut Lexer<StringContext>) -> String {
49 |     let mut s = String::new();
50 |     while let Some(Ok(token)) = lex.next() {
51 |         match token {
52 |             StringContext::Content => s.push_str(lex.slice()),
53 |             StringContext::DollarSign => s.push_str("$"),
54 |             StringContext::InterpolationStart(value) => s.push_str(&value),
55 |             StringContext::Quote => break,
56 |         }
57 |     }
58 |     s
59 | }
60 | 
61 | fn variable_definition(lex: &mut Lexer<VariableDefinitionContext>) -> Option<(String, String)> {
62 |     let id = lex.slice().to_string();
63 |     if let Some(Ok(VariableDefinitionContext::Equals)) = lex.next() {
64 |         if let Some(Ok(VariableDefinitionContext::Quote)) = lex.next() {
65 |             let mut lex2 = lex.clone().morph::<StringContext>();
66 |             let value = get_string_content(&mut lex2);
67 |             *lex = lex2.morph();
68 |             lex.extras.insert(id.clone(), value.clone());
69 |             return Some((id, value));
70 |         }
71 |     }
72 |     None
73 | }
74 | /* ANCHOR_END: variable_definition */
75 | 
76 | /* ANCHOR: evaluate_interpolation */
77 | fn evaluate_interpolation(lex: &mut Lexer<StringContext>) -> Option<String> {
78 |     let mut lex2 = lex.clone().morph::<StringInterpolationContext>();
79 |     let mut interpolation = String::new();
80 |     while let Some(result) = lex2.next() {
81 |         match result {
82 |             Ok(token) => match token {
83 |                 StringInterpolationContext::Id(value) => interpolation.push_str(&value),
84 |                 StringInterpolationContext::Quote => {
85 |                     *lex = lex2.morph();
86 |                     interpolation.push_str(&get_string_content(lex));
87 |                     lex2 = lex.clone().morph();
88 |                 }
89 |                 StringInterpolationContext::InterpolationEnd => break,
90 |             },
91 |             Err(()) => panic!("Interpolation error"),
92 |         }
93 |     }
94 |     *lex = lex2.morph();
95 |     Some(interpolation)
96 | }
97 | /* ANCHOR_END: evaluate_interpolation */
98 | 
99 | /* ANCHOR: get_variable_value */
100 | fn get_variable_value(lex: &mut Lexer<StringInterpolationContext>) -> Option<String> {
101 |     if let Some(value) = lex.extras.get(lex.slice()) {
102 |         return Some(value.clone());
103 |     }
104 |     None
105 | }
106 | /* ANCHOR_END: get_variable_value */
107 | 
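// Note how the three token contexts above share one `SymbolTable` through
// `extras`, and how `morph()` hands the remaining input (and the extras) from
// one context to the next; this is what lets the nested interpolations in
// `main` below resolve correctly.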
108 | /* ANCHOR: main */
109 | fn test_variable_definition(
110 |     expected_id: &str,
111 |     expected_value: &str,
112 |     token: Option<Result<VariableDefinitionContext, ()>>,
113 | ) {
114 |     if let Some(Ok(VariableDefinitionContext::Id((id, value)))) = token {
115 |         assert_eq!(id, expected_id);
116 |         assert_eq!(value, expected_value);
117 |     } else {
118 |         panic!("Expected key: {} not found", expected_id);
119 |     }
120 | }
121 | 
122 | fn main() {
123 |     let mut lex = VariableDefinitionContext::lexer(
124 |         "\
125 |         name = 'Mark'\n\
126 |         greeting = 'Hi ${name}!'\n\
127 |         surname = 'Scott'\n\
128 |         greeting2 = 'Hi ${name ' ' surname}!'\n\
129 |         greeting3 = 'Hi ${name ' ${surname}!'}!'\n\
130 |         ",
131 |     );
132 |     test_variable_definition("name", "Mark", lex.next());
133 |     test_variable_definition("greeting", "Hi Mark!", lex.next());
134 |     test_variable_definition("surname", "Scott", lex.next());
135 |     test_variable_definition("greeting2", "Hi Mark Scott!", lex.next());
136 |     test_variable_definition("greeting3", "Hi Mark Scott!!", lex.next());
137 | }
138 | /* ANCHOR_END: main */
139 | /* ANCHOR_END: all */
--------------------------------------------------------------------------------
/fuzz/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | edition.workspace = true
3 | name = "logos-fuzz"
4 | publish = false
5 | rust-version.workspace = true
6 | 
7 | [dependencies]
8 | afl = "0.15"
9 | arbitrary = "1.3"
10 | logos-codegen = { path = "../logos-codegen", features = ["fuzzing"] }
11 | 
12 | [package.metadata.release]
13 | shared-version = true
--------------------------------------------------------------------------------
/fuzz/in/literal:
--------------------------------------------------------------------------------
1 | literal
--------------------------------------------------------------------------------
/fuzz/in/regex:
--------------------------------------------------------------------------------
1 | a+b[cd-h]+?
--------------------------------------------------------------------------------
/fuzz/src/main.rs:
--------------------------------------------------------------------------------
1 | use afl::fuzz;
2 | use logos_codegen::{
3 |     graph::{Graph, Node},
4 |     mir::Mir,
5 | };
6 | 
7 | fn main() {
8 |     fuzz!(|regex: String| {
9 |         let mut graph = Graph::new();
10 | 
11 |         if let Ok(mir) = Mir::utf8(&regex) {
12 |             let leaf = graph.push(Node::Leaf("LEAF"));
13 |             let _ = graph.regex(mir, leaf);
14 |         }
15 |     });
16 | }
--------------------------------------------------------------------------------
/logos-cli/Cargo.toml:
--------------------------------------------------------------------------------
1 | [dependencies]
2 | anyhow = "1.0.57"
3 | clap = {version = "3.1.18", features = ["derive"]}
4 | fs-err = "2.7.0"
5 | logos-codegen = {version = "0.15.0", path = "../logos-codegen"}
6 | proc-macro2 = "1.0.39"
7 | 
8 | [dev-dependencies]
9 | assert_cmd = "2.0.4"
10 | assert_fs = "1.0.7"
11 | predicates = "2.1.1"
12 | 
13 | [features]
14 | # Enables debug messages
15 | debug = ["logos-codegen/debug"]
16 | 
17 | [package]
18 | name = "logos-cli"
19 | authors.workspace = true
20 | categories.workspace = true
21 | description.workspace = true
22 | edition.workspace = true
23 | homepage.workspace = true
24 | keywords.workspace = true
25 | license.workspace = true
26 | readme.workspace = true
27 | repository.workspace = true
28 | rust-version.workspace = true
29 | version.workspace = true
30 | 
31 | [package.metadata.release]
32 | shared-version = true
--------------------------------------------------------------------------------
/logos-cli/LICENSE-APACHE:
--------------------------------------------------------------------------------
1 | ../LICENSE-APACHE
--------------------------------------------------------------------------------
/logos-cli/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | ../LICENSE-MIT
--------------------------------------------------------------------------------
/logos-cli/src/main.rs:
--------------------------------------------------------------------------------
1 | use std::{
2 |     fmt::Write,
3 |     io,
4 |     path::PathBuf,
5 |     process::{Command, Stdio},
6 | };
7 | 
8 | use anyhow::{Context, Result};
9 | use clap::Parser;
10 | use fs_err as fs;
11 | use proc_macro2::{LexError, TokenStream};
12 | 
13 | /// Logos as a CLI!
14 | #[derive(Parser)]
15 | #[clap(author, version, about, long_about = None)]
16 | pub struct Args {
17 |     /// Input file to process
18 |     #[clap(parse(from_os_str))]
19 |     input: PathBuf,
20 |     /// Path to write output. By default output is printed to stdout.
21 |     #[clap(long, short, parse(from_os_str))]
22 |     output: Option<PathBuf>,
23 |     /// Checks whether the output file is up-to-date instead of writing to it. Requires --output to be specified.
24 |     #[clap(long, requires = "output")]
25 |     check: bool,
26 |     /// Invokes `rustfmt` on the generated code. `rustfmt` must be in $PATH.
27 |     #[clap(long)]
28 |     format: bool,
29 | }
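// Hypothetical invocations (the file names are placeholders, not from this
// repository):
//
//     logos-cli tokens.rs
//     logos-cli tokens.rs --format --output tokens_gen.rs
//
// The first prints the expanded code to stdout; the second writes a
// rustfmt-formatted copy to the given path.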
30 | 
31 | pub fn main() -> Result<()> {
32 |     let args = Args::parse();
33 | 
34 |     let input = fs::read_to_string(args.input)?;
35 |     let mut output = codegen(input).context("failed to generate code")?;
36 | 
37 |     if args.format {
38 |         output = rustfmt(output)?;
39 |     }
40 | 
41 |     if let Some(output_path) = args.output {
42 |         let changed = match fs::read_to_string(&output_path) {
43 |             Ok(existing_output) => !eq_ignore_newlines(&existing_output, &output),
44 |             Err(err) if err.kind() == io::ErrorKind::NotFound => true,
45 |             Err(err) => return Err(err.into()),
46 |         };
47 | 
48 |         if !changed {
49 |             Ok(())
50 |         } else if args.check {
51 |             Err(anyhow::format_err!(
52 |                 "contents of {} differed from generated code",
53 |                 output_path.display()
54 |             ))
55 |         } else {
56 |             fs::write(output_path, output)?;
57 |             Ok(())
58 |         }
59 |     } else {
60 |         println!("{}", output);
61 |         Ok(())
62 |     }
63 | }
64 | 
65 | fn codegen(input: String) -> Result<String> {
66 |     let input_tokens: TokenStream = input
67 |         .parse()
68 |         .map_err(|err: LexError| anyhow::Error::msg(err.to_string()))
69 |         .context("failed to parse input as rust code")?;
70 | 
71 |     let mut output = String::new();
72 |     write!(
73 |         output,
74 |         "{}",
75 |         logos_codegen::strip_attributes(input_tokens.clone())
76 |     )?;
77 |     write!(output, "{}", logos_codegen::generate(input_tokens))?;
78 |     Ok(output)
79 | }
80 | 
81 | fn rustfmt(input: String) -> Result<String> {
82 |     let mut command = Command::new("rustfmt")
83 |         .stdin(Stdio::piped())
84 |         .stderr(Stdio::inherit())
85 |         .stdout(Stdio::piped())
86 |         .spawn()?;
87 |     io::Write::write_all(&mut command.stdin.take().unwrap(), input.as_bytes())?;
88 |     let output = command.wait_with_output()?;
89 |     if !output.status.success() {
90 |         anyhow::bail!("rustfmt returned unsuccessful exit code");
91 |     }
92 | 
93 |     String::from_utf8(output.stdout).context("failed to parse rustfmt output as utf-8")
94 | }
95 | 
96 | fn eq_ignore_newlines(lhs: &str, rhs: &str) -> bool {
97 |     lhs.lines().eq(rhs.lines())
98 | }
--------------------------------------------------------------------------------
/logos-cli/tests/data/fmt_output.rs:
--------------------------------------------------------------------------------
1 | #[derive(Debug, Clone, Copy, PartialEq)]
2 | enum Token {
3 |     Letter,
4 | }
5 | impl<'s> ::logos::Logos<'s> for Token {
6 |     type Error = ();
7 |     type Extras = ();
8 |     type Source = str;
9 |     fn lex(lex: &mut ::logos::Lexer<'s, Self>) {
10 |         use logos::internal::{CallbackResult, LexerInternal};
11 |         type Lexer<'s> = ::logos::Lexer<'s, Token>;
12 |         fn _end<'s>(lex: &mut Lexer<'s>) {
13 |             lex.end()
14 |         }
15 |         fn _error<'s>(lex: &mut Lexer<'s>) {
16 |             lex.bump_unchecked(1);
17 |             lex.error();
18 |         }
19 |         macro_rules ! _fast_loop { ($ lex : ident , $ test : ident , $ miss : expr) => { while let Some (arr) = $ lex . read :: < & [u8 ; 16] > () { if $ test (arr [0]) { if $ test (arr [1]) { if $ test (arr [2]) { if $ test (arr [3]) { if $ test (arr [4]) { if $ test (arr [5]) { if $ test (arr [6]) { if $ test (arr [7]) { if $ test (arr [8]) { if $ test (arr [9]) { if $ test (arr [10]) { if $ test (arr [11]) { if $ test (arr [12]) { if $ test (arr [13]) { if $ test (arr [14]) { if $ test (arr [15]) { $ lex . bump_unchecked (16) ; continue ; } $ lex . bump_unchecked (15) ; return $ miss ; } $ lex . bump_unchecked (14) ; return $ miss ; } $ lex . bump_unchecked (13) ; return $ miss ; } $ lex . bump_unchecked (12) ; return $ miss ; } $ lex . bump_unchecked (11) ; return $ miss ; } $ lex .
bump_unchecked (10) ; return $ miss ; } $ lex . bump_unchecked (9) ; return $ miss ; } $ lex . bump_unchecked (8) ; return $ miss ; } $ lex . bump_unchecked (7) ; return $ miss ; } $ lex . bump_unchecked (6) ; return $ miss ; } $ lex . bump_unchecked (5) ; return $ miss ; } $ lex . bump_unchecked (4) ; return $ miss ; } $ lex . bump_unchecked (3) ; return $ miss ; } $ lex . bump_unchecked (2) ; return $ miss ; } $ lex . bump_unchecked (1) ; return $ miss ; } return $ miss ; } while $ lex . test ($ test) { $ lex . bump_unchecked (1) ; } $ miss } ; } 20 | #[inline] 21 | fn goto1_x<'s>(lex: &mut Lexer<'s>) { 22 | lex.set(Ok(Token::Letter)); 23 | } 24 | #[inline] 25 | fn goto3_at1_with3<'s>(lex: &mut Lexer<'s>) { 26 | match lex.read_at::<&[u8; 2usize]>(1usize) { 27 | Some(b"-z") => { 28 | lex.bump_unchecked(3usize); 29 | goto1_x(lex) 30 | } 31 | _ => _error(lex), 32 | } 33 | } 34 | #[inline] 35 | fn goto4<'s>(lex: &mut Lexer<'s>) { 36 | let arr = match lex.read::<&[u8; 3usize]>() { 37 | Some(arr) => arr, 38 | None => return _end(lex), 39 | }; 40 | match arr[0] { 41 | b'a' => goto3_at1_with3(lex), 42 | _ => _error(lex), 43 | } 44 | } 45 | goto4(lex) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /logos-cli/tests/data/input.rs: -------------------------------------------------------------------------------- 1 | #[derive(Logos, Debug, Clone, Copy, PartialEq)] 2 | enum Token { 3 | #[regex("a-z")] 4 | Letter, 5 | } 6 | -------------------------------------------------------------------------------- /logos-cli/tests/data/output.rs: -------------------------------------------------------------------------------- 1 | # [derive (Debug , Clone , Copy , PartialEq)] enum Token { Letter , }impl < 's > :: logos :: Logos < 's > for Token { type Error = () ; type Extras = () ; type Source = str ; fn lex (lex : & mut :: logos :: Lexer < 's , Self >) { use :: logos :: internal :: { LexerInternal , CallbackResult } ; type Lexer < 's > = :: logos :: Lexer < 's , Token > ; fn _end < 's > (lex : & mut Lexer < 's >) { lex . end () } fn _error < 's > (lex : & mut Lexer < 's >) { lex . bump_unchecked (1) ; lex . error () ; } macro_rules ! _fast_loop { ($ lex : ident , $ test : ident , $ miss : expr) => { while let Some (arr) = $ lex . read :: < & [u8 ; 16] > () { if $ test (arr [0]) { if $ test (arr [1]) { if $ test (arr [2]) { if $ test (arr [3]) { if $ test (arr [4]) { if $ test (arr [5]) { if $ test (arr [6]) { if $ test (arr [7]) { if $ test (arr [8]) { if $ test (arr [9]) { if $ test (arr [10]) { if $ test (arr [11]) { if $ test (arr [12]) { if $ test (arr [13]) { if $ test (arr [14]) { if $ test (arr [15]) { $ lex . bump_unchecked (16) ; continue ; } $ lex . bump_unchecked (15) ; return $ miss ; } $ lex . bump_unchecked (14) ; return $ miss ; } $ lex . bump_unchecked (13) ; return $ miss ; } $ lex . bump_unchecked (12) ; return $ miss ; } $ lex . bump_unchecked (11) ; return $ miss ; } $ lex . bump_unchecked (10) ; return $ miss ; } $ lex . bump_unchecked (9) ; return $ miss ; } $ lex . bump_unchecked (8) ; return $ miss ; } $ lex . bump_unchecked (7) ; return $ miss ; } $ lex . bump_unchecked (6) ; return $ miss ; } $ lex . bump_unchecked (5) ; return $ miss ; } $ lex . bump_unchecked (4) ; return $ miss ; } $ lex . bump_unchecked (3) ; return $ miss ; } $ lex . bump_unchecked (2) ; return $ miss ; } $ lex . bump_unchecked (1) ; return $ miss ; } return $ miss ; } while $ lex . test ($ test) { $ lex . 
bump_unchecked (1) ; } $ miss } ; } # [inline] fn goto1_x < 's > (lex : & mut Lexer < 's >) { lex . set (Ok (Token :: Letter)) ; } # [inline] fn goto3_at1_with3 < 's > (lex : & mut Lexer < 's >) { match lex . read_at :: < & [u8 ; 2usize] > (1usize) { Some (b"-z") => { lex . bump_unchecked (3usize) ; goto1_x (lex) } , _ => _error (lex) , } } # [inline] fn goto4 < 's > (lex : & mut Lexer < 's >) { let arr = match lex . read :: < & [u8 ; 3usize] > () { Some (arr) => arr , None => return _end (lex) , } ; match arr [0] { b'a' => goto3_at1_with3 (lex) , _ => _error (lex) , } } goto4 (lex) } } -------------------------------------------------------------------------------- /logos-cli/tests/tests.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | 3 | use assert_cmd::Command; 4 | use assert_fs::{assert::PathAssert, fixture::FileWriteStr, NamedTempFile}; 5 | use predicates::prelude::*; 6 | 7 | const INPUT_FILE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/input.rs"); 8 | const OUTPUT_FILE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/output.rs"); 9 | const FMT_OUTPUT_FILE: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/fmt_output.rs"); 10 | 11 | #[test] 12 | fn test_codegen() { 13 | let tempfile = NamedTempFile::new("output.gen.rs").unwrap(); 14 | 15 | let mut cmd = Command::cargo_bin("logos-cli").unwrap(); 16 | cmd.arg(INPUT_FILE) 17 | .arg("--output") 18 | .arg(tempfile.path()) 19 | .assert() 20 | .success(); 21 | 22 | tempfile.assert(normalize_newlines(OUTPUT_FILE)); 23 | } 24 | 25 | #[test] 26 | fn test_codegen_check() { 27 | Command::cargo_bin("logos-cli") 28 | .unwrap() 29 | .arg(INPUT_FILE) 30 | .arg("--check") 31 | .arg("--output") 32 | .arg(OUTPUT_FILE) 33 | .assert() 34 | .success(); 35 | } 36 | 37 | #[test] 38 | fn test_codegen_check_format() { 39 | Command::cargo_bin("logos-cli") 40 | .unwrap() 41 | .arg(INPUT_FILE) 42 | .arg("--format") 43 | .arg("--check") 44 | .arg("--output") 45 | .arg(FMT_OUTPUT_FILE) 46 | .assert() 47 | .success(); 48 | } 49 | 50 | #[test] 51 | fn test_codegen_fail_check() { 52 | let tempfile = NamedTempFile::new("output.gen.rs").unwrap(); 53 | 54 | tempfile.write_str("some random data").unwrap(); 55 | 56 | Command::cargo_bin("logos-cli") 57 | .unwrap() 58 | .arg(INPUT_FILE) 59 | .arg("--check") 60 | .arg("--output") 61 | .arg(tempfile.path()) 62 | .assert() 63 | .failure(); 64 | } 65 | 66 | #[test] 67 | fn test_codegen_format() { 68 | let tempfile = NamedTempFile::new("output.gen.rs").unwrap(); 69 | 70 | let mut cmd = Command::cargo_bin("logos-cli").unwrap(); 71 | cmd.arg(INPUT_FILE) 72 | .arg("--format") 73 | .arg("--output") 74 | .arg(tempfile.path()) 75 | .assert() 76 | .success(); 77 | 78 | tempfile.assert(normalize_newlines(FMT_OUTPUT_FILE)); 79 | } 80 | 81 | fn normalize_newlines(s: impl AsRef) -> impl Predicate { 82 | predicates::str::diff(fs_err::read_to_string(s).unwrap().replace("\r\n", "\n")).normalize() 83 | } 84 | -------------------------------------------------------------------------------- /logos-codegen/Cargo.toml: -------------------------------------------------------------------------------- 1 | [dependencies] 2 | beef = "0.5.0" 3 | fnv = "1.0.6" 4 | lazy_static = "1.4.0" 5 | proc-macro2 = "1.0.9" 6 | quote = "1.0.3" 7 | regex-syntax = "0.8.2" 8 | syn = { version = "2.0.13", features = ["full"] } 9 | 10 | [dev-dependencies] 11 | pretty_assertions = "1.4.0" 12 | rstest = "0.23.0" 13 | 14 | [build-dependencies] 15 | rustc_version = "0.4.1" 16 | 17 | 
[features] 18 | # Enables debug messages 19 | debug = [] 20 | # Exports internal methods for fuzzing 21 | fuzzing = [] 22 | # Don't use or generate unsafe code 23 | forbid_unsafe = [] 24 | 25 | [lib] 26 | bench = false 27 | 28 | [package] 29 | name = "logos-codegen" 30 | authors.workspace = true 31 | categories.workspace = true 32 | description.workspace = true 33 | edition.workspace = true 34 | homepage.workspace = true 35 | keywords.workspace = true 36 | license.workspace = true 37 | readme.workspace = true 38 | repository.workspace = true 39 | rust-version.workspace = true 40 | version.workspace = true 41 | 42 | [package.metadata.release] 43 | shared-version = true 44 | -------------------------------------------------------------------------------- /logos-codegen/LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | ../LICENSE-APACHE -------------------------------------------------------------------------------- /logos-codegen/LICENSE-MIT: -------------------------------------------------------------------------------- 1 | ../LICENSE-MIT -------------------------------------------------------------------------------- /logos-codegen/build.rs: -------------------------------------------------------------------------------- 1 | use rustc_version::{version_meta, Version}; 2 | 3 | fn main() { 4 | let version_meta = version_meta().expect("Could not get Rust version"); 5 | 6 | let rustc_version = version_meta.semver; 7 | let trimmed_rustc_version = Version::new( 8 | rustc_version.major, 9 | rustc_version.minor, 10 | rustc_version.patch, 11 | ); 12 | 13 | // Add cfg flag for Rust >= 1.82 14 | // Required for precise capturing in edition 2024 15 | // Due to changes in lifetime and type capture behavior for impl trait 16 | // see: https://github.com/maciejhirsz/logos/issues/434, https://github.com/rust-lang/rfcs/pull/3498 17 | println!("cargo:rustc-check-cfg=cfg(rust_1_82)"); 18 | if trimmed_rustc_version >= Version::new(1, 82, 0) { 19 | println!("cargo:rustc-cfg=rust_1_82"); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /logos-codegen/src/error.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | use beef::lean::Cow; 4 | use proc_macro2::{Span, TokenStream}; 5 | use quote::quote; 6 | use quote::{quote_spanned, ToTokens, TokenStreamExt}; 7 | 8 | pub type Result<T> = std::result::Result<T, Error>; 9 | 10 | #[derive(Default)] 11 | pub struct Errors { 12 | collected: Vec<SpannedError>, 13 | } 14 | 15 | impl Errors { 16 | pub fn err<M>(&mut self, message: M, span: Span) -> &mut Self 17 | where 18 | M: Into<Cow<'static, str>>, 19 | { 20 | self.collected.push(SpannedError { 21 | message: message.into(), 22 | span, 23 | }); 24 | 25 | self 26 | } 27 | 28 | pub fn render(self) -> Option<TokenStream> { 29 | let errors = self.collected; 30 | 31 | match errors.len() { 32 | 0 => None, 33 | _ => Some(quote! { 34 | fn _logos_derive_compile_errors() { 35 | #(#errors)* 36 | } 37 | }), 38 | } 39 | } 40 | } 41 | 42 | pub struct Error(Cow<'static, str>); 43 | 44 | #[derive(Debug)] 45 | pub struct SpannedError { 46 | message: Cow<'static, str>, 47 | span: Span, 48 | } 49 | 50 | impl Error { 51 | pub fn new<M>(message: M) -> Self 52 | where 53 | M: Into<Cow<'static, str>>, 54 | { 55 | Error(message.into()) 56 | } 57 | 58 | pub fn span(self, span: Span) -> SpannedError { 59 | SpannedError { 60 | message: self.0, 61 | span, 62 | } 63 | } 64 | } 65 | 66 | impl fmt::Display for Error { 67 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 68 | self.0.fmt(f) 69 | } 70 | } 71 | 72 | impl fmt::Debug for Error { 73 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 74 | fmt::Display::fmt(self, f) 75 | } 76 | } 77 | 78 | impl From<regex_syntax::Error> for Error { 79 | fn from(err: regex_syntax::Error) -> Error { 80 | Error(err.to_string().into()) 81 | } 82 | } 83 | 84 | impl From<&'static str> for Error { 85 | fn from(err: &'static str) -> Error { 86 | Error(err.into()) 87 | } 88 | } 89 | 90 | impl From<String> for Error { 91 | fn from(err: String) -> Error { 92 | Error(err.into()) 93 | } 94 | } 95 | 96 | impl From<Error> for Cow<'static, str> { 97 | fn from(err: Error) -> Self { 98 | err.0 99 | } 100 | } 101 | 102 | impl ToTokens for SpannedError { 103 | fn to_tokens(&self, tokens: &mut TokenStream) { 104 | let message = &*self.message; 105 | 106 | tokens.append_all(quote_spanned!(self.span => { 107 | compile_error!(#message) 108 | })) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /logos-codegen/src/generator/context.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::TokenStream; 2 | use quote::quote; 3 | 4 | use crate::generator::Generator; 5 | use crate::graph::NodeId; 6 | 7 | /// This struct keeps track of bytes available to be read without 8 | /// bounds checking across the tree. 9 | /// 10 | /// For example, a branch that matches 4 bytes followed by a fork 11 | /// whose smallest branch contains 2 bytes can do a bounds check 12 | /// for 6 bytes ahead, and leave the remaining 2-byte array (fixed size) 13 | /// to be handled by the fork, avoiding bounds checks there. 14 | #[derive(Default, Clone, Copy, PartialEq, Eq, Hash, Debug)] 15 | pub struct Context { 16 | /// Number of bytes that haven't been bumped yet but should be 17 | /// before a new read is performed 18 | at: usize, 19 | /// Number of bytes available without bounds checks 20 | available: usize, 21 | /// Whether or not the Lexer has been bumped by at least 1 byte 22 | bumped: bool, 23 | /// Node to backtrack to in case an explicit match has failed. 24 | /// If `None`, an error token is produced instead.
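// Editor's worked example (illustration only, not from the crate's source):
// `ctx.read(6)` emits a single bounds-checked read of six bytes,
// `lex.read::<&[u8; 6]>()`, and records `available = 6`. If a branch then
// consumes a 4-byte prefix, `ctx.advance(4)` leaves `at = 4`, so
// `remainder() == 2`: the following fork can match its two bytes out of the
// already-checked array without performing a second bounds check.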
25 | backtrack: Option<NodeId>, 26 | } 27 | 28 | impl Context { 29 | pub fn can_backtrack(&self) -> bool { 30 | self.backtrack.is_some() 31 | } 32 | 33 | pub fn switch(&mut self, miss: Option<NodeId>) -> Option<TokenStream> { 34 | self.backtrack = Some(miss?); 35 | self.bump() 36 | } 37 | 38 | pub const fn advance(self, n: usize) -> Self { 39 | Context { 40 | at: self.at + n, 41 | ..self 42 | } 43 | } 44 | 45 | pub fn bump(&mut self) -> Option<TokenStream> { 46 | match self.at { 47 | 0 => None, 48 | n => { 49 | let tokens = quote!(lex.bump_unchecked(#n);); 50 | self.at = 0; 51 | self.available = 0; 52 | self.bumped = true; 53 | Some(tokens) 54 | } 55 | } 56 | } 57 | 58 | pub fn remainder(&self) -> usize { 59 | self.available.saturating_sub(self.at) 60 | } 61 | 62 | pub fn read_byte(&mut self) -> TokenStream { 63 | let at = self.at; 64 | 65 | self.advance(1); 66 | 67 | #[cfg(not(feature = "forbid_unsafe"))] 68 | { 69 | quote!(unsafe { lex.read_byte_unchecked(#at) }) 70 | } 71 | 72 | #[cfg(feature = "forbid_unsafe")] 73 | { 74 | quote!(lex.read_byte(#at)) 75 | } 76 | } 77 | 78 | pub fn read(&mut self, len: usize) -> TokenStream { 79 | self.available = len; 80 | 81 | match (self.at, len) { 82 | (0, 0) => quote!(lex.read::<u8>()), 83 | (a, 0) => quote!(lex.read_at::<u8>(#a)), 84 | (0, l) => quote!(lex.read::<&[u8; #l]>()), 85 | (a, l) => quote!(lex.read_at::<&[u8; #l]>(#a)), 86 | } 87 | } 88 | 89 | pub fn wipe(&mut self) { 90 | self.available = 0; 91 | } 92 | 93 | const fn backtrack(self) -> Self { 94 | Context { 95 | at: 0, 96 | available: 0, 97 | bumped: self.bumped, 98 | backtrack: None, 99 | } 100 | } 101 | 102 | pub fn miss(mut self, miss: Option<NodeId>, gen: &mut Generator) -> TokenStream { 103 | self.wipe(); 104 | match (miss, self.backtrack) { 105 | (Some(id), _) => gen.goto(id, self).clone(), 106 | (_, Some(id)) => gen.goto(id, self.backtrack()).clone(), 107 | _ if self.bumped => quote!(lex.error()), 108 | _ => quote!(_error(lex)), 109 | } 110 | } 111 | 112 | pub fn write_suffix(&self, buf: &mut String) { 113 | use std::fmt::Write; 114 | 115 | if self.at > 0 { 116 | let _ = write!(buf, "_at{}", self.at); 117 | } 118 | if self.available > 0 { 119 | let _ = write!(buf, "_with{}", self.available); 120 | } 121 | if let Some(id) = self.backtrack { 122 | let _ = write!(buf, "_ctx{}", id); 123 | } 124 | if self.bumped { 125 | buf.push_str("_x"); 126 | } 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /logos-codegen/src/generator/leaf.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::TokenStream; 2 | use quote::quote; 3 | 4 | use crate::generator::{Context, Generator}; 5 | use crate::leaf::{Callback, Leaf}; 6 | use crate::util::MaybeVoid; 7 | 8 | impl Generator<'_> { 9 | pub fn generate_leaf(&mut self, leaf: &Leaf, mut ctx: Context) -> TokenStream { 10 | let bump = ctx.bump(); 11 | 12 | let ident = &leaf.ident; 13 | let name = self.name; 14 | let this = self.this; 15 | let ty = &leaf.field; 16 | 17 | let constructor = match leaf.field { 18 | MaybeVoid::Some(_) => quote!(#name::#ident), 19 | MaybeVoid::Void => quote!(|()| #name::#ident), 20 | }; 21 | 22 | match &leaf.callback { 23 | Some(Callback::Label(callback)) => quote!
{ 24 | #bump 25 | #callback(lex).construct(#constructor, lex); 26 | }, 27 | Some(Callback::Inline(inline)) => { 28 | let arg = &inline.arg; 29 | let body = &inline.body; 30 | 31 | #[cfg(not(rust_1_82))] 32 | let ret = quote!(impl CallbackResult<'s, #ty, #this>); 33 | 34 | #[cfg(rust_1_82)] 35 | let ret = quote!(impl CallbackResult<'s, #ty, #this> + use<'s>); 36 | 37 | quote! { 38 | #bump 39 | 40 | #[inline] 41 | fn callback<'s>(#arg: &mut Lexer<'s>) -> #ret { 42 | #body 43 | } 44 | 45 | callback(lex).construct(#constructor, lex); 46 | } 47 | } 48 | Some(Callback::Skip(_)) => { 49 | quote! { 50 | #bump 51 | 52 | lex.trivia(); 53 | #name::lex(lex); 54 | } 55 | } 56 | None if matches!(leaf.field, MaybeVoid::Void) => quote! { 57 | #bump 58 | lex.set(Ok(#name::#ident)); 59 | }, 60 | None => quote! { 61 | #bump 62 | let token = #name::#ident(lex.slice()); 63 | lex.set(Ok(token)); 64 | }, 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /logos-codegen/src/generator/rope.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::TokenStream; 2 | use quote::quote; 3 | 4 | use crate::generator::{Context, Generator}; 5 | use crate::graph::Rope; 6 | 7 | impl Generator<'_> { 8 | pub fn generate_rope(&mut self, rope: &Rope, mut ctx: Context) -> TokenStream { 9 | let miss = ctx.miss(rope.miss.first(), self); 10 | let read = ctx.read(rope.pattern.len()); 11 | let then = self.goto(rope.then, ctx.advance(rope.pattern.len())); 12 | 13 | let pat = match rope.pattern.to_bytes() { 14 | Some(bytes) => byte_slice_literal(&bytes), 15 | None => { 16 | let ranges = rope.pattern.iter(); 17 | 18 | quote!([#(#ranges),*]) 19 | } 20 | }; 21 | 22 | quote! { 23 | match #read { 24 | Some(#pat) => #then, 25 | _ => #miss, 26 | } 27 | } 28 | } 29 | } 30 | 31 | fn byte_slice_literal(bytes: &[u8]) -> TokenStream { 32 | if bytes.iter().any(|&b| !(0x20..0x7F).contains(&b)) { 33 | return quote!(&[#(#bytes),*]); 34 | } 35 | 36 | let slice = std::str::from_utf8(bytes).unwrap(); 37 | 38 | syn::parse_str(&format!("b{:?}", slice)).unwrap() 39 | } 40 | -------------------------------------------------------------------------------- /logos-codegen/src/generator/tables.rs: -------------------------------------------------------------------------------- 1 | use crate::util::ToIdent; 2 | use proc_macro2::{Literal, TokenStream}; 3 | use quote::{quote, ToTokens}; 4 | use syn::Ident; 5 | 6 | pub struct TableStack { 7 | tables: Vec<(Ident, [u8; 256])>, 8 | shift: u8, 9 | } 10 | 11 | pub struct TableView<'a> { 12 | ident: &'a Ident, 13 | table: &'a mut [u8; 256], 14 | mask: u8, 15 | } 16 | 17 | impl TableStack { 18 | pub fn new() -> Self { 19 | TableStack { 20 | tables: vec![("COMPACT_TABLE_0".to_ident(), [0; 256])], 21 | shift: 0, 22 | } 23 | } 24 | 25 | pub fn view(&mut self) -> TableView { 26 | let mask = if self.shift < 8 { 27 | // Reusing existing table with a shifted mask 28 | let mask = 1u8 << self.shift; 29 | 30 | self.shift += 1; 31 | 32 | mask 33 | } else { 34 | // Need to create a new table 35 | let ident = format!("COMPACT_TABLE_{}", self.tables.len()).to_ident(); 36 | 37 | self.tables.push((ident, [0; 256])); 38 | self.shift = 1; 39 | 40 | 1 41 | }; 42 | 43 | let (ref ident, ref mut table) = self.tables.last_mut().unwrap(); 44 | 45 | TableView { ident, table, mask } 46 | } 47 | } 48 | 49 | impl<'a> TableView<'a> { 50 | pub fn ident(&self) -> &'a Ident { 51 | self.ident 52 | } 53 | 54 | pub fn flag(&mut self, byte: u8) { 55 | 
self.table[byte as usize] |= self.mask; 56 | } 57 | 58 | pub fn mask(&self) -> Literal { 59 | Literal::u8_unsuffixed(self.mask) 60 | } 61 | } 62 | 63 | impl ToTokens for TableStack { 64 | fn to_tokens(&self, out: &mut TokenStream) { 65 | if self.shift == 0 { 66 | return; 67 | } 68 | 69 | for (ident, table) in self.tables.iter() { 70 | let bytes = table.iter().copied().map(Literal::u8_unsuffixed); 71 | 72 | out.extend(quote! { 73 | static #ident: [u8; 256] = [#(#bytes),*]; 74 | }); 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /logos-codegen/src/graph/impls.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::{self, Debug, Display}; 2 | use std::hash::{Hash, Hasher}; 3 | 4 | use crate::graph::{Fork, Graph, Node, NodeId, Range, Rope}; 5 | 6 | impl<Leaf> From<Fork> for Node<Leaf> { 7 | fn from(fork: Fork) -> Self { 8 | Node::Fork(fork) 9 | } 10 | } 11 | impl<Leaf> From<Rope> for Node<Leaf> { 12 | fn from(rope: Rope) -> Self { 13 | Node::Rope(rope) 14 | } 15 | } 16 | 17 | fn is_ascii(byte: u8) -> bool { 18 | (0x20..0x7F).contains(&byte) 19 | } 20 | 21 | impl Hash for Fork { 22 | fn hash<H: Hasher>(&self, state: &mut H) { 23 | for branch in self.branches() { 24 | branch.hash(state); 25 | } 26 | self.miss.hash(state); 27 | } 28 | } 29 | 30 | impl<Leaf> Hash for Node<Leaf> { 31 | fn hash<H: Hasher>(&self, state: &mut H) { 32 | match self { 33 | Node::Rope(rope) => { 34 | b"ROPE".hash(state); 35 | rope.hash(state); 36 | } 37 | Node::Fork(fork) => { 38 | b"FORK".hash(state); 39 | fork.hash(state); 40 | } 41 | Node::Leaf(_) => b"LEAF".hash(state), 42 | } 43 | } 44 | } 45 | 46 | impl Debug for NodeId { 47 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 48 | Debug::fmt(&self.0, f) 49 | } 50 | } 51 | 52 | /// We don't need debug impls in release builds 53 | // #[cfg(test)] 54 | mod debug { 55 | use super::*; 56 | use crate::graph::rope::Miss; 57 | use crate::graph::Disambiguate; 58 | use std::cmp::{Ord, Ordering}; 59 | 60 | impl Disambiguate for &str { 61 | fn cmp(left: &&str, right: &&str) -> Ordering { 62 | Ord::cmp(left, right) 63 | } 64 | } 65 | 66 | impl Debug for Range { 67 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 68 | let Range { start, end } = *self; 69 | 70 | if start != end || !is_ascii(start) { 71 | f.write_str("[")?; 72 | } 73 | match is_ascii(start) { 74 | true => write!(f, "{}", start as char), 75 | false => write!(f, "{:02X}", start), 76 | }?; 77 | if start != end { 78 | match is_ascii(end) { 79 | true => write!(f, "-{}]", end as char), 80 | false => write!(f, "-{:02X}]", end), 81 | }?; 82 | } else if !is_ascii(start) { 83 | f.write_str("]")?; 84 | } 85 | Ok(()) 86 | } 87 | } 88 | 89 | impl Display for Range { 90 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 91 | <Range as Debug>::fmt(self, f) 92 | } 93 | } 94 | 95 | impl<Leaf: Debug> Debug for Graph<Leaf> { 96 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 97 | let entries = self 98 | .nodes() 99 | .iter() 100 | .enumerate() 101 | .filter_map(|(i, n)| n.as_ref().map(|n| (i, n))); 102 | 103 | f.debug_map().entries(entries).finish() 104 | } 105 | } 106 | 107 | struct Arm<T, U>(T, U); 108 | 109 | impl<T, U> Debug for Arm<T, U> 110 | where 111 | T: Display, 112 | U: Display, 113 | { 114 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 115 | write!(f, "{} ⇒ {}", self.0, self.1) 116 | } 117 | } 118 | 119 | impl Debug for Fork { 120 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 121 | let mut list = f.debug_set(); 122 | 123 | for (range, then) in self.branches() { 124 | list.entry(&Arm(range, then)); 125 | } 126 | if let Some(id) = self.miss { 127 | list.entry(&Arm('_', id)); 128 | } 129 | 130 | list.finish() 131 | } 132 | } 133 | 134 | impl Display for Miss { 135 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 136 | match self { 137 | Miss::First(id) => Display::fmt(id, f), 138 | Miss::Any(id) => write!(f, "{}*", id), 139 | Miss::None => f.write_str("n/a"), 140 | } 141 | } 142 | } 143 | 144 | impl Debug for Rope { 145 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 146 | use std::fmt::Write; 147 | 148 | let mut rope = String::with_capacity(self.pattern.len()); 149 | for range in self.pattern.iter() { 150 | write!(rope, "{}", range)?; 151 | } 152 | 153 | match self.miss.is_none() { 154 | false => { 155 | let mut list = f.debug_list(); 156 | 157 | list.entry(&Arm(rope, self.then)); 158 | list.entry(&Arm('_', self.miss)); 159 | 160 | list.finish() 161 | } 162 | true => Arm(rope, self.then).fmt(f), 163 | } 164 | } 165 | } 166 | 167 | impl PartialEq for Fork { 168 | fn eq(&self, other: &Self) -> bool { 169 | self.miss == other.miss && self.branches().eq(other.branches()) 170 | } 171 | } 172 | 173 | impl<Leaf: Debug> Debug for Node<Leaf> { 174 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 175 | match self { 176 | Node::Fork(fork) => fork.fmt(f), 177 | Node::Rope(rope) => rope.fmt(f), 178 | Node::Leaf(leaf) => leaf.fmt(f), 179 | } 180 | } 181 | } 182 | 183 | use std::ops::RangeInclusive; 184 | 185 | impl From<RangeInclusive<u8>> for Range { 186 | fn from(range: RangeInclusive<u8>) -> Range { 187 | Range { 188 | start: *range.start(), 189 | end: *range.end(), 190 | } 191 | } 192 | } 193 | 194 | impl From<RangeInclusive<char>> for Range { 195 | fn from(range: RangeInclusive<char>) -> Range { 196 | Range { 197 | start: *range.start() as u8, 198 | end: *range.end() as u8, 199 | } 200 | } 201 | } 202 | 203 | impl<Leaf> PartialEq<Rope> for Node<Leaf> { 204 | fn eq(&self, other: &Rope) -> bool { 205 | match self { 206 | Node::Rope(rope) => rope == other, 207 | _ => false, 208 | } 209 | } 210 | } 211 | 212 | impl<Leaf> PartialEq<Fork> for Node<Leaf> { 213 | fn eq(&self, other: &Fork) -> bool { 214 | match self { 215 | Node::Fork(fork) => fork == other, 216 | _ => false, 217 | } 218 | } 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /logos-codegen/src/graph/meta.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::min; 2 | use std::collections::BTreeMap; 3 | use std::ops::{Index, IndexMut}; 4 | 5 | use crate::graph::{Graph, Node, NodeId}; 6 | 7 | #[derive(Debug)] 8 | pub struct Meta { 9 | map: BTreeMap<NodeId, MetaItem>, 10 | } 11 | 12 | #[derive(Debug, Default)] 13 | pub struct MetaItem { 14 | /// Number of references to this node 15 | pub refcount: usize, 16 | /// Minimum number of bytes that ought to be read for this 17 | /// node to find a match 18 | pub min_read: usize, 19 | /// Marks whether or not this node leads to a loop entry node. 20 | pub is_loop_init: bool, 21 | /// Ids of other nodes that point to this node while this 22 | /// node is on a stack (creating a loop) 23 | pub loop_entry_from: Vec<NodeId>, 24 | } 25 | 26 | impl Index<NodeId> for Meta { 27 | type Output = MetaItem; 28 | 29 | fn index(&self, id: NodeId) -> &MetaItem { 30 | &self.map[&id] 31 | } 32 | } 33 | 34 | impl IndexMut<NodeId> for Meta { 35 | fn index_mut(&mut self, id: NodeId) -> &mut MetaItem { 36 | self.map.entry(id).or_default() 37 | } 38 | } 39 | 40 | impl MetaItem { 41 | fn loop_entry(&mut self, id: NodeId) { 42 | if let Err(idx) = self.loop_entry_from.binary_search(&id) { 43 | self.loop_entry_from.insert(idx, id); 44 | } 45 | } 46 | } 47 | 48 | impl Meta { 49 | pub fn analyze<Leaf>(root: NodeId, graph: &Graph<Leaf>) -> Self { 50 | let mut meta = Meta { 51 | map: Default::default(), 52 | }; 53 | 54 | meta.first_pass(root, root, graph, &mut Vec::new()); 55 | 56 | meta 57 | } 58 | 59 | pub fn first_pass<Leaf>( 60 | &mut self, 61 | this: NodeId, 62 | parent: NodeId, 63 | graph: &Graph<Leaf>, 64 | stack: &mut Vec<NodeId>, 65 | ) -> &MetaItem { 66 | let meta = &mut self[this]; 67 | let is_done = meta.refcount > 0; 68 | 69 | meta.refcount += 1; 70 | 71 | if stack.contains(&this) { 72 | meta.loop_entry(parent); 73 | self[parent].is_loop_init = true; 74 | } 75 | if is_done { 76 | return &self[this]; 77 | } 78 | 79 | stack.push(this); 80 | 81 | let mut min_read; 82 | 83 | match &graph[this] { 84 | Node::Fork(fork) => { 85 | min_read = usize::MAX; 86 | for (_, id) in fork.branches() { 87 | let meta = self.first_pass(id, this, graph, stack); 88 | 89 | if meta.is_loop_init { 90 | min_read = 1; 91 | } else { 92 | min_read = min(min_read, meta.min_read + 1); 93 | } 94 | } 95 | if let Some(id) = fork.miss { 96 | let meta = self.first_pass(id, this, graph, stack); 97 | 98 | if meta.is_loop_init { 99 | min_read = 0; 100 | } else { 101 | min_read = min(min_read, meta.min_read); 102 | } 103 | } 104 | if min_read == usize::MAX { 105 | min_read = 0; 106 | } 107 | } 108 | Node::Rope(rope) => { 109 | min_read = rope.pattern.len(); 110 | let meta = self.first_pass(rope.then, this, graph, stack); 111 | 112 | if !meta.is_loop_init { 113 | min_read += meta.min_read; 114 | } 115 | 116 | if let Some(id) = rope.miss.first() { 117 | let meta = self.first_pass(id, this, graph, stack); 118 | 119 | if meta.is_loop_init { 120 | min_read = 0; 121 | } else { 122 | min_read = min(min_read, meta.min_read); 123 | } 124 | } 125 | } 126 | Node::Leaf(_) => min_read = 0, 127 | } 128 | 129 | stack.pop(); 130 | 131 | let meta = &mut self[this]; 132 | meta.min_read = min_read; 133 | let second_pass = meta.loop_entry_from.clone(); 134 | 135 | for id in second_pass { 136 | self.meta_second_pass(id, graph); 137 | } 138 | 139 | &self[this] 140 | } 141 | 142 | fn meta_second_pass<Leaf>(&mut self, id: NodeId, graph: &Graph<Leaf>) { 143 | let mut min_read; 144 | 145 | match &graph[id] { 146 | Node::Fork(fork) => { 147 | min_read = usize::MAX; 148 | for (_, id) in fork.branches() { 149 | let meta = &self[id]; 150 | 151 | if meta.is_loop_init { 152 | min_read = 1; 153 | } else { 154 | min_read = min(min_read, meta.min_read + 1); 155 | } 156 | } 157 | if min_read == usize::MAX { 158 | min_read = 0; 159 | } 160 | } 161 | Node::Rope(rope) => { 162 | min_read = rope.pattern.len(); 163 | let meta = &self[rope.then]; 164 | 165 | if !meta.is_loop_init { 166 | min_read += meta.min_read; 167 | } 168 | } 169 | Node::Leaf(_) => unreachable!(), 170 | } 171 | 172 | self[id].min_read = min_read; 173 | } 174 | } 175 |
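The two passes above amount to a shortest-path computation over the graph: `min_read` is the smallest number of bytes that must be readable before a match at a node can possibly be decided, which is what lets the generator hoist a single bounds check instead of checking byte-by-byte. A minimal, self-contained sketch of that idea (editor's illustration with toy types, not the crate's API; the loop bookkeeping done via `is_loop_init` and `loop_entry_from` is deliberately omitted, so this only handles acyclic graphs):

enum Toy {
    Fork(Vec<(u8, usize)>), // branch on a single byte, then jump to a node
    Rope(usize, usize),     // match a fixed pattern of N bytes, then jump
    Leaf,                   // a finished token
}

fn min_read(nodes: &[Toy], id: usize) -> usize {
    match &nodes[id] {
        Toy::Leaf => 0,
        // A rope must read its whole pattern before its target is reached.
        Toy::Rope(len, then) => len + min_read(nodes, *then),
        // A fork is decided as soon as its *cheapest* branch can complete.
        Toy::Fork(branches) => branches
            .iter()
            .map(|(_, then)| 1 + min_read(nodes, *then))
            .min()
            .unwrap_or(0),
    }
}

fn main() {
    // Tokens "+" and "let": '+' is complete after one byte, "let" needs three.
    let nodes = vec![
        Toy::Fork(vec![(b'+', 1), (b'l', 2)]), // 0: dispatch on the first byte
        Toy::Leaf,                             // 1: token finished
        Toy::Rope(2, 1),                       // 2: match the fixed tail "et"
    ];
    assert_eq!(min_read(&nodes, 0), 1); // one byte can already decide '+'
    assert_eq!(min_read(&nodes, 2), 2); // the rope needs its full 2-byte pattern
    println!("min_read(root) = {}", min_read(&nodes, 0));
}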
-------------------------------------------------------------------------------- /logos-codegen/src/graph/range.rs: -------------------------------------------------------------------------------- 1 | use regex_syntax::hir::ClassBytesRange; 2 | use regex_syntax::hir::ClassUnicodeRange; 3 | use regex_syntax::utf8::Utf8Range; 4 | 5 | use std::cmp::{Ord, Ordering}; 6 | 7 | #[derive(Clone, Copy, PartialEq, Eq, Hash)] 8 | pub struct Range { 9 | pub start: u8, 10 | pub end: u8, 11 | } 12 | 13 | impl Range { 14 | pub fn as_byte(&self) -> Option<u8> { 15 | if self.is_byte() { 16 | Some(self.start) 17 | } else { 18 | None 19 | } 20 | } 21 | 22 | pub fn is_byte(&self) -> bool { 23 | self.start == self.end 24 | } 25 | } 26 | 27 | impl From<u8> for Range { 28 | fn from(byte: u8) -> Range { 29 | Range { 30 | start: byte, 31 | end: byte, 32 | } 33 | } 34 | } 35 | 36 | impl From<&u8> for Range { 37 | fn from(byte: &u8) -> Range { 38 | Range::from(*byte) 39 | } 40 | } 41 | 42 | impl Iterator for Range { 43 | type Item = u8; 44 | 45 | fn next(&mut self) -> Option<u8> { 46 | match self.start.cmp(&self.end) { 47 | std::cmp::Ordering::Less => { 48 | let res = self.start; 49 | self.start += 1; 50 | 51 | Some(res) 52 | } 53 | std::cmp::Ordering::Equal => { 54 | let res = self.start; 55 | 56 | // Necessary so that range 0xFF-0xFF doesn't loop forever 57 | self.start = 0xFF; 58 | self.end = 0x00; 59 | 60 | Some(res) 61 | } 62 | std::cmp::Ordering::Greater => None, 63 | } 64 | } 65 | } 66 | 67 | impl PartialOrd for Range { 68 | fn partial_cmp(&self, other: &Range) -> Option<Ordering> { 69 | Some(self.cmp(other)) 70 | } 71 | } 72 | 73 | impl Ord for Range { 74 | fn cmp(&self, other: &Self) -> Ordering { 75 | self.start.cmp(&other.start) 76 | } 77 | } 78 | 79 | impl From<Utf8Range> for Range { 80 | fn from(r: Utf8Range) -> Range { 81 | Range { 82 | start: r.start, 83 | end: r.end, 84 | } 85 | } 86 | } 87 | 88 | impl From<ClassUnicodeRange> for Range { 89 | fn from(r: ClassUnicodeRange) -> Range { 90 | let start = r.start() as u32; 91 | let end = r.end() as u32; 92 | 93 | if start >= 128 || end >= 128 && end != 0x0010FFFF { 94 | panic!("Casting non-ascii ClassUnicodeRange to Range") 95 | } 96 | 97 | Range { 98 | start: start as u8, 99 | end: end as u8, 100 | } 101 | } 102 | } 103 | 104 | impl From<ClassBytesRange> for Range { 105 | fn from(r: ClassBytesRange) -> Range { 106 | Range { 107 | start: r.start(), 108 | end: r.end(), 109 | } 110 | } 111 | } 112 | 113 | #[cfg(test)] 114 | mod tests { 115 | use super::*; 116 | 117 | #[test] 118 | fn range_iter_one() { 119 | let byte = Range::from(b'!'); 120 | let collected = byte.take(1000).collect::<Vec<_>>(); 121 | 122 | assert_eq!(b"!", &collected[..]); 123 | } 124 | 125 | #[test] 126 | fn range_iter_few() { 127 | let byte = Range { 128 | start: b'a', 129 | end: b'd', 130 | }; 131 | let collected = byte.take(1000).collect::<Vec<_>>(); 132 | 133 | assert_eq!(b"abcd", &collected[..]); 134 | } 135 | 136 | #[test] 137 | fn range_iter_bounds() { 138 | let byte = Range::from(0xFA..=0xFF); 139 | 140 | let collected = byte.take(1000).collect::<Vec<_>>(); 141 | 142 | assert_eq!(b"\xFA\xFB\xFC\xFD\xFE\xFF", &collected[..]); 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /logos-codegen/src/leaf.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::{Ord, Ordering}; 2 | use std::fmt::{self, Debug, Display}; 3 | 4 | use proc_macro2::{Span, TokenStream}; 5 | use syn::{spanned::Spanned, Ident}; 6 | 7 | use crate::graph::{Disambiguate, Node}; 8 | use crate::util::MaybeVoid;
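// Editor's sketch of how a token definition maps onto `Leaf` (hypothetical
// values, not taken from the source): an attribute such as
// `#[token("let", lex_let, priority = 3)]` on a unit variant `Let` is
// assembled, roughly, as
//
//     Leaf::new(&let_ident, span)
//         .priority(3)
//         .callback(Some(Callback::Label(quote!(lex_let))))
//
// with `field` left as `MaybeVoid::Void`, since the variant carries no data.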
9 | 10 | #[derive(Clone)] 11 | pub struct Leaf<'t> { 12 | pub ident: Option<&'t Ident>, 13 | pub span: Span, 14 | pub priority: usize, 15 | pub field: MaybeVoid, 16 | pub callback: Option<Callback>, 17 | } 18 | 19 | #[derive(Clone)] 20 | pub enum Callback { 21 | Label(TokenStream), 22 | Inline(Box<InlineCallback>), 23 | Skip(Span), 24 | } 25 | 26 | #[derive(Clone)] 27 | pub struct InlineCallback { 28 | pub arg: Ident, 29 | pub body: TokenStream, 30 | pub span: Span, 31 | } 32 | 33 | impl From<InlineCallback> for Callback { 34 | fn from(inline: InlineCallback) -> Callback { 35 | Callback::Inline(Box::new(inline)) 36 | } 37 | } 38 | 39 | impl Callback { 40 | pub fn span(&self) -> Span { 41 | match self { 42 | Callback::Label(tokens) => tokens.span(), 43 | Callback::Inline(inline) => inline.span, 44 | Callback::Skip(span) => *span, 45 | } 46 | } 47 | } 48 | 49 | impl<'t> Leaf<'t> { 50 | pub fn new(ident: &'t Ident, span: Span) -> Self { 51 | Leaf { 52 | ident: Some(ident), 53 | span, 54 | priority: 0, 55 | field: MaybeVoid::Void, 56 | callback: None, 57 | } 58 | } 59 | 60 | pub fn new_skip(span: Span) -> Self { 61 | Leaf { 62 | ident: None, 63 | span, 64 | priority: 0, 65 | field: MaybeVoid::Void, 66 | callback: Some(Callback::Skip(span)), 67 | } 68 | } 69 | 70 | pub fn callback(mut self, callback: Option<Callback>) -> Self { 71 | self.callback = callback; 72 | self 73 | } 74 | 75 | pub fn field(mut self, field: MaybeVoid) -> Self { 76 | self.field = field; 77 | self 78 | } 79 | 80 | pub fn priority(mut self, priority: usize) -> Self { 81 | self.priority = priority; 82 | self 83 | } 84 | } 85 | 86 | impl Disambiguate for Leaf<'_> { 87 | fn cmp(left: &Leaf, right: &Leaf) -> Ordering { 88 | Ord::cmp(&left.priority, &right.priority) 89 | } 90 | } 91 | 92 | impl<'t> From<Leaf<'t>> for Node<Leaf<'t>> { 93 | fn from(leaf: Leaf<'t>) -> Self { 94 | Node::Leaf(leaf) 95 | } 96 | } 97 | 98 | impl Debug for Leaf<'_> { 99 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 100 | write!(f, "::{}", self)?; 101 | 102 | match self.callback { 103 | Some(Callback::Label(ref label)) => write!(f, " ({})", label), 104 | Some(Callback::Inline(_)) => f.write_str(" (<inline>)"), 105 | Some(Callback::Skip(_)) => f.write_str(" (<skip>)"), 106 | None => Ok(()), 107 | } 108 | } 109 | } 110 | 111 | impl Display for Leaf<'_> { 112 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 113 | match self.ident { 114 | Some(ident) => Display::fmt(ident, f), 115 | None => f.write_str("<skip>"), 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /logos-codegen/src/macros.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "debug")] 2 | macro_rules! debug { 3 | ($($arg:tt)*) => { 4 | eprint!("[{}:{}:{}] ", file!(), line!(), column!()); 5 | eprintln!($($arg)*) 6 | } 7 | } 8 | 9 | #[cfg(not(feature = "debug"))] 10 | macro_rules! debug { 11 | ($($arg:tt)*) => {}; 12 | } 13 | -------------------------------------------------------------------------------- /logos-codegen/src/parser/definition.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::{Ident, Span}; 2 | use syn::{spanned::Spanned, LitByteStr, LitStr}; 3 | 4 | use crate::error::{Errors, Result}; 5 | use crate::leaf::Callback; 6 | use crate::mir::Mir; 7 | use crate::parser::nested::NestedValue; 8 | use crate::parser::{IgnoreFlags, Parser, Subpatterns}; 9 | 10 | use super::ignore_flags::ascii_case::MakeAsciiCaseInsensitive; 11 | 12 | pub struct Definition { 13 | pub literal: Literal, 14 | pub priority: Option<usize>, 15 | pub callback: Option<Callback>, 16 | pub ignore_flags: IgnoreFlags, 17 | } 18 | 19 | pub enum Literal { 20 | Utf8(LitStr), 21 | Bytes(LitByteStr), 22 | } 23 | 24 | impl Definition { 25 | pub fn new(literal: Literal) -> Self { 26 | Definition { 27 | literal, 28 | priority: None, 29 | callback: None, 30 | ignore_flags: IgnoreFlags::Empty, 31 | } 32 | } 33 | 34 | pub fn named_attr(&mut self, name: Ident, value: NestedValue, parser: &mut Parser) { 35 | match (name.to_string().as_str(), value) { 36 | ("priority", NestedValue::Assign(tokens)) => { 37 | let prio = match tokens.to_string().parse() { 38 | Ok(prio) => prio, 39 | Err(_) => { 40 | parser.err("Expected an unsigned integer", tokens.span()); 41 | return; 42 | } 43 | }; 44 | 45 | if self.priority.replace(prio).is_some() { 46 | parser.err("Resetting previously set priority", tokens.span()); 47 | } 48 | } 49 | ("priority", _) => { 50 | parser.err("Expected: priority = <integer>", name.span()); 51 | } 52 | ("callback", NestedValue::Assign(tokens)) => { 53 | let span = tokens.span(); 54 | let callback = match parser.parse_callback(tokens) { 55 | Some(callback) => callback, 56 | None => { 57 | parser.err("Not a valid callback", span); 58 | return; 59 | } 60 | }; 61 | 62 | if let Some(previous) = self.callback.replace(callback) { 63 | parser 64 | .err( 65 | "Callback has been already set", 66 | span.join(name.span()).unwrap(), 67 | ) 68 | .err("Previous callback set here", previous.span()); 69 | } 70 | } 71 | ("callback", _) => { 72 | parser.err("Expected: callback = ...", name.span()); 73 | } 74 | ("ignore", NestedValue::Group(tokens)) => { 75 | self.ignore_flags.parse_group(name, tokens, parser); 76 | } 77 | ("ignore", _) => { 78 | parser.err("Expected: ignore(<flag>, ...)", name.span()); 79 | } 80 | (unknown, _) => { 81 | parser.err( 82 | format!( 83 | "\ 84 | Unknown nested attribute: {}\n\ 85 | \n\ 86 | Expected one of: priority, callback\ 87 | ", 88 | unknown 89 | ), 90 | name.span(), 91 | ); 92 | } 93 | } 94 | } 95 | } 96 | 97 | impl Literal { 98 | pub fn to_bytes(&self) -> Vec<u8> { 99 | match self { 100 | Literal::Utf8(string) => string.value().into_bytes(), 101 | Literal::Bytes(bytes) => bytes.value(), 102 | } 103 | } 104 | 105 | pub fn escape_regex(&self) -> Literal { 106 | match self { 107 | Literal::Utf8(string) => Literal::Utf8(LitStr::new( 108 | regex_syntax::escape(&string.value()).as_str(), 109 | self.span(), 110 | )), 111 | Literal::Bytes(bytes) => Literal::Bytes(LitByteStr::new( 112 | regex_syntax::escape(&bytes_to_regex_string(bytes.value())).as_bytes(), 113 | self.span(), 114 | )), 115 | } 116 | } 117 | 118 | pub fn to_mir( 119 | &self, 120 | subpatterns: &Subpatterns, 121 | ignore_flags: IgnoreFlags, 122 | errors: &mut Errors, 123 | ) -> Result<Mir> { 124 | let value = subpatterns.fix(self, errors); 125 | 126 | if ignore_flags.contains(IgnoreFlags::IgnoreAsciiCase) { 127 | match self { 128 | Literal::Utf8(_) => { 129 | Mir::utf8(&value).map(MakeAsciiCaseInsensitive::make_ascii_case_insensitive) 130 | } 131 | Literal::Bytes(_) => Mir::binary_ignore_case(&value), 132 | } 133 | } else if ignore_flags.contains(IgnoreFlags::IgnoreCase) { 134 | match self { 135 | Literal::Utf8(_) => Mir::utf8_ignore_case(&value), 136 | Literal::Bytes(_) => Mir::binary_ignore_case(&value), 137 | } 138 | } else { 139 | match self { 140 | Literal::Utf8(_) => Mir::utf8(&value), 141 | Literal::Bytes(_) => Mir::binary(&value), 142 | } 143 | } 144 | } 145 | 146 | pub fn span(&self) -> Span { 147 | match self { 148 | Literal::Utf8(string) => string.span(), 149 | Literal::Bytes(bytes) => bytes.span(), 150 | } 151 | } 152 | } 153 | 154 | impl syn::parse::Parse for Literal { 155 | fn parse(input: syn::parse::ParseStream) -> syn::Result<Self> { 156 | let la = input.lookahead1(); 157 | if la.peek(LitStr) { 158 | Ok(Literal::Utf8(input.parse()?)) 159 | } else if la.peek(LitByteStr) { 160 | Ok(Literal::Bytes(input.parse()?)) 161 | } else { 162 | Err(la.error()) 163 | } 164 | } 165 | } 166 | 167 | pub fn bytes_to_regex_string(bytes: Vec<u8>) -> String { 168 | if bytes.is_ascii() { 169 | unsafe { 170 | // Unicode values are prohibited, so we can't use the 171 | // safe version of String::from_utf8 172 | // 173 | // We can, however, construct a safe ASCII string 174 | return String::from_utf8_unchecked(bytes); 175 | } 176 | } 177 | 178 | let mut string = String::with_capacity(bytes.len() * 2); 179 | 180 | for byte in bytes { 181 | if byte < 0x80 { 182 | string.push(byte as char); 183 | } else { 184 | static DIGITS: [u8; 16] = *b"0123456789abcdef"; 185 | 186 | string.push_str(r"\x"); 187 | string.push(DIGITS[(byte / 16) as usize] as char); 188 | string.push(DIGITS[(byte % 16) as usize] as char); 189 | } 190 | } 191 | 192 | string 193 | } 194 | -------------------------------------------------------------------------------- /logos-codegen/src/parser/nested.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::token_stream::IntoIter as TokenIter; 2 | use proc_macro2::{Ident, Literal, TokenStream, TokenTree}; 3 | use quote::quote; 4 | 5 | use crate::util::{expect_punct, is_punct}; 6 | 7 | pub enum NestedValue { 8 | /// `name = ...` 9 | Assign(TokenStream), 10 | /// `name "literal"` 11 | Literal(Literal), 12 | /// `name(...)` 13 | Group(TokenStream), 14 | /// `name ident = ...` 15 | KeywordAssign(Ident, TokenStream), 16 | } 17 | 18 | pub enum Nested { 19 | /// Unnamed nested attribute, such as a string, 20 | /// callback closure, or a lone ident/path 21 | /// 22 | /// Note: a lone ident will be Named with no value instead 23 | Unnamed(TokenStream), 24 | /// Named: name ...
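// Editor's examples (hypothetical attribute contents, not from the source)
// of how individual arguments parse into `Nested` / `NestedValue`:
//
//     priority = 10       -> Named(priority, NestedValue::Assign(`10`))
//     ignore(case)        -> Named(ignore, NestedValue::Group(`case`))
//     type S = &'s str    -> Named(type, NestedValue::KeywordAssign(S, `&'s str`))
//     |lex| lex.slice()   -> Unnamed(`|lex| lex.slice()`)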
25 | Named(Ident, NestedValue), 26 | /// Unexpected token 27 | Unexpected(TokenStream), 28 | } 29 | 30 | pub struct AttributeParser { 31 | inner: TokenIter, 32 | } 33 | 34 | pub struct Empty; 35 | 36 | impl From<Empty> for TokenStream { 37 | fn from(_: Empty) -> TokenStream { 38 | TokenStream::new() 39 | } 40 | } 41 | 42 | impl AttributeParser { 43 | pub fn new(stream: TokenStream) -> Self { 44 | AttributeParser { 45 | inner: stream.into_iter(), 46 | } 47 | } 48 | 49 | pub fn parsed<T>(&mut self) -> Option<syn::Result<T>> 50 | where 51 | T: syn::parse::Parse, 52 | { 53 | let tokens = self.collect_tail(TokenStream::new()); 54 | 55 | if tokens.is_empty() { 56 | return None; 57 | } 58 | 59 | Some(syn::parse2(tokens)) 60 | } 61 | 62 | fn next_tt(&mut self) -> Option<TokenTree> { 63 | expect_punct(self.inner.next(), ',') 64 | } 65 | 66 | fn collect_tail<T>(&mut self, first: T) -> TokenStream 67 | where 68 | T: Into<TokenStream>, 69 | { 70 | let mut out = first.into(); 71 | 72 | while let Some(tt) = self.next_tt() { 73 | out.extend(Some(tt)); 74 | } 75 | 76 | out 77 | } 78 | 79 | fn parse_unnamed(&mut self, first: Ident, next: TokenTree) -> Nested { 80 | let mut out = TokenStream::from(TokenTree::Ident(first)); 81 | 82 | out.extend(self.collect_tail(next)); 83 | 84 | Nested::Unnamed(out.into_iter().collect()) 85 | } 86 | 87 | fn parse_assign(&mut self, name: Ident) -> Nested { 88 | let value = self.collect_tail(Empty); 89 | 90 | Nested::Named(name, NestedValue::Assign(value)) 91 | } 92 | 93 | fn parse_literal(&mut self, name: Ident, lit: Literal) -> Nested { 94 | // TODO: Error if there are any tokens following 95 | let _ = self.collect_tail(Empty); 96 | 97 | Nested::Named(name, NestedValue::Literal(lit)) 98 | } 99 | 100 | fn parse_group(&mut self, name: Ident, group: TokenStream) -> Nested { 101 | Nested::Named(name, NestedValue::Group(group)) 102 | } 103 | 104 | fn parse_keyword(&mut self, keyword: Ident, name: Ident) -> Nested { 105 | let error = expect_punct(self.next_tt(), '='); 106 | 107 | match error { 108 | Some(error) => { 109 | let error = self.collect_tail(error); 110 | 111 | Nested::Unexpected(error) 112 | } 113 | None => { 114 | let value = self.collect_tail(Empty); 115 | 116 | Nested::Named(keyword, NestedValue::KeywordAssign(name, value)) 117 | } 118 | } 119 | } 120 | } 121 | 122 | impl Iterator for AttributeParser { 123 | type Item = Nested; 124 | 125 | fn next(&mut self) -> Option<Nested> { 126 | let first = self.inner.next()?; 127 | 128 | let name = match first { 129 | TokenTree::Ident(ident) => ident, 130 | tt => { 131 | let stream = self.collect_tail(tt); 132 | 133 | return Some(Nested::Unnamed(stream.into_iter().collect())); 134 | } 135 | }; 136 | 137 | match self.next_tt() { 138 | Some(tt) if is_punct(&tt, '=') => Some(self.parse_assign(name)), 139 | Some(TokenTree::Literal(lit)) => Some(self.parse_literal(name, lit)), 140 | Some(TokenTree::Group(group)) => Some(self.parse_group(name, group.stream())), 141 | Some(TokenTree::Ident(next)) => Some(self.parse_keyword(name, next)), 142 | Some(next) => Some(self.parse_unnamed(name, next)), 143 | None => Some(Nested::Unnamed(quote!(#name))), 144 | } 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /logos-codegen/src/parser/subpattern.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::TokenStream; 2 | use syn::Ident; 3 | 4 | use crate::error::Errors; 5 | use crate::mir::Mir; 6 | use crate::parser::definition::{bytes_to_regex_string, Literal}; 7 | 8 | #[derive(Default)] 9 | pub struct Subpatterns { 10 | map: Vec<(Ident, String)>, 11 | } 12 | 13 | impl Subpatterns { 14 | pub fn add(&mut self, param: Ident, pattern: TokenStream, errors: &mut Errors) { 15 | let lit = match syn::parse2::<Literal>(pattern) { 16 | Ok(lit) => lit, 17 | Err(e) => { 18 | errors.err(e.to_string(), e.span()); 19 | return; 20 | } 21 | }; 22 | 23 | if let Some((name, _)) = self.map.iter().find(|(name, _)| *name == param) { 24 | errors 25 | .err(format!("{} can only be assigned once", param), param.span()) 26 | .err("Previously assigned here", name.span()); 27 | return; 28 | } 29 | 30 | let fixed = self.fix(&lit, errors); 31 | 32 | // Validate the literal as proper regex. If it's not, emit an error. 33 | let mir = match &lit { 34 | Literal::Utf8(_) => Mir::utf8(&fixed), 35 | Literal::Bytes(_) => Mir::binary(&fixed), 36 | }; 37 | 38 | if let Err(err) = mir { 39 | errors.err(err, lit.span()); 40 | }; 41 | 42 | self.map.push((param, fixed)); 43 | } 44 | 45 | pub fn fix(&self, lit: &Literal, errors: &mut Errors) -> String { 46 | let mut i = 0; 47 | let mut pattern = match lit { 48 | Literal::Utf8(s) => s.value(), 49 | Literal::Bytes(b) => bytes_to_regex_string(b.value()), 50 | }; 51 | 52 | while let Some(f) = pattern[i..].find("(?&") { 53 | i += f; 54 | pattern.replace_range(i..i + 3, "(?:"); 55 | i += 3; 56 | 57 | let subref_end = if let Some(f) = pattern[i..].find(')') { 58 | i + f 59 | } else { 60 | pattern.truncate(i); // truncate so the later error isn't suppressed 61 | break; // regex-syntax will report the unclosed group 62 | }; 63 | 64 | let name = &pattern[i..subref_end]; 65 | let name = match syn::parse_str::<Ident>(name) { 66 | Ok(name) => name, 67 | Err(_) => { 68 | errors.err( 69 | format!("subpattern reference `{}` is not an identifier", name), 70 | lit.span(), 71 | ); 72 | // we emitted the error; make something up and continue 73 | pattern.replace_range(i..subref_end, "_"); 74 | i += 2; 75 | continue; 76 | } 77 | }; 78 | 79 | match self.map.iter().find(|(def, _)| *def == name) { 80 | Some((_, subpattern)) => { 81 | pattern.replace_range(i..subref_end, subpattern); 82 | i += subpattern.len() + 1; 83 | } 84 | None => { 85 | errors.err( 86 | format!("subpattern reference `{}` has not been defined", name), 87 | lit.span(), 88 | ); 89 | // leaving `(?:name)` is fine 90 | i = subref_end + 1; 91 | } 92 | } 93 | } 94 | 95 | pattern 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /logos-codegen/src/parser/type_params.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::{Ident, Span, TokenStream}; 2 | use quote::quote; 3 | use syn::spanned::Spanned; 4 | use syn::{Lifetime, LifetimeParam, Path, Type}; 5 | 6 | use crate::error::Errors; 7 | 8 | #[derive(Default)] 9 | pub struct TypeParams { 10 | lifetime: bool, 11 | type_params: Vec<(Ident, Option<Type>)>, 12 | } 13 | 14 | impl TypeParams { 15 | pub fn explicit_lifetime(&mut self, lt: LifetimeParam, errors: &mut Errors) { 16 | if self.lifetime { 17 | let span = lt.span(); 18 | 19 | errors.err("Logos types can only have one lifetime", span); 20 | } 21 | 22 | self.lifetime = true; 23 | } 24 | 25 | pub fn add(&mut self, param: Ident) { 26 | self.type_params.push((param, None)); 27 | } 28 | 29 | pub fn set(&mut self, param: Ident, ty: TokenStream, errors: &mut Errors) { 30 | let ty = match syn::parse2::<Type>(ty) { 31 | Ok(mut ty) => { 32 | replace_lifetimes(&mut ty); 33 | ty 34 | } 35 | Err(err) => { 36 | errors.err(err.to_string(), err.span()); 37 | return; 38 | } 39 | }; 40 | 41 | match self.type_params.iter_mut().find(|(name, _)| *name == param) { 42 | Some((_, slot)) => { 43 | if let Some(previous) = slot.replace(ty) { 44 | errors 45 | .err( 46 | format!("{} can only have one type assigned to it", param), 47 | param.span(), 48 | ) 49 | .err("Previously assigned here", previous.span()); 50 | } 51 | } 52 | None => { 53 | errors.err( 54 | format!("{} is not a declared type parameter", param), 55 | param.span(), 56 | ); 57 | } 58 | } 59 | } 60 | 61 | pub fn find(&self, path: &Path) -> Option<Type> { 62 | for (ident, ty) in &self.type_params { 63 | if path.is_ident(ident) { 64 | return ty.clone(); 65 | } 66 | } 67 | 68 | None 69 | } 70 | 71 | pub fn generics(&self, errors: &mut Errors) -> Option<TokenStream> { 72 | if !self.lifetime && self.type_params.is_empty() { 73 | return None; 74 | } 75 | 76 | let mut generics = Vec::new(); 77 | 78 | if self.lifetime { 79 | generics.push(quote!('s)); 80 | } 81 | 82 | for (ty, replace) in self.type_params.iter() { 83 | match replace { 84 | Some(ty) => generics.push(quote!(#ty)), 85 | None => { 86 | errors.err( 87 | format!( 88 | "Generic type parameter without a concrete type\n\ 89 | \n\ 90 | Define a concrete type Logos can use: #[logos(type {} = Type)]", 91 | ty, 92 | ), 93 | ty.span(), 94 | ); 95 | } 96 | } 97 | } 98 | 99 | if generics.is_empty() { 100 | None 101 | } else { 102 | Some(quote!(<#(#generics),*>)) 103 | } 104 | } 105 | } 106 | 107 | pub fn replace_lifetimes(ty: &mut Type) { 108 | traverse_type(ty, &mut replace_lifetime) 109 | } 110 | 111 | pub fn replace_lifetime(ty: &mut Type) { 112 | use syn::{GenericArgument, PathArguments}; 113 | 114 | match ty { 115 | Type::Path(p) => { 116 | p.path 117 | .segments 118 | .iter_mut() 119 | .filter_map(|segment| match &mut segment.arguments { 120 | PathArguments::AngleBracketed(ab) => Some(ab), 121 | _ => None, 122 | }) 123 | .flat_map(|ab| ab.args.iter_mut()) 124 | .for_each(|arg| { 125 | if let GenericArgument::Lifetime(lt) = arg { 126 | *lt = Lifetime::new("'s", lt.span()); 127 | } 128 | }); 129 | } 130 | Type::Reference(r) => { 131 | let span = match r.lifetime.take() { 132 | Some(lt) => lt.span(), 133 | None => Span::call_site(), 134 | }; 135 | 136 | r.lifetime = Some(Lifetime::new("'s", span)); 137 | } 138 | _ => (), 139 | } 140 | } 141 | 142 | pub fn traverse_type(ty: &mut Type, f: &mut impl FnMut(&mut Type)) { 143 | f(ty); 144 | match ty { 145 | Type::Array(array) => traverse_type(&mut array.elem, f), 146 | Type::BareFn(bare_fn) => { 147 | for input in &mut bare_fn.inputs { 148 | traverse_type(&mut input.ty, f); 149 | } 150 | if let syn::ReturnType::Type(_, ty) = &mut bare_fn.output { 151 | traverse_type(ty, f); 152 | } 153 | } 154 | Type::Group(group) => traverse_type(&mut group.elem, f), 155 | Type::Paren(paren) => traverse_type(&mut paren.elem, f), 156 | Type::Path(path) => traverse_path(&mut path.path, f), 157 | Type::Ptr(p) => traverse_type(&mut p.elem, f), 158 | Type::Reference(r) => traverse_type(&mut r.elem, f), 159 | Type::Slice(slice) => traverse_type(&mut slice.elem, f), 160 | Type::TraitObject(object) => object.bounds.iter_mut().for_each(|bound| { 161 | if let syn::TypeParamBound::Trait(trait_bound) = bound { 162 | traverse_path(&mut trait_bound.path, f); 163 | } 164 | }), 165 | Type::Tuple(tuple) => tuple 166 | .elems 167 | .iter_mut() 168 | .for_each(|elem| traverse_type(elem, f)), 169 | _ => (), 170 | } 171 | } 172 | 173 | fn traverse_path(path: &mut Path, f: &mut impl FnMut(&mut Type)) { 174 | for segment in &mut path.segments { 175 | match &mut segment.arguments { 176 | syn::PathArguments::None => (), 177 | syn::PathArguments::AngleBracketed(args) => { 178 | for arg in &mut args.args { 179 | match arg { 180 | syn::GenericArgument::Type(ty) => { 181 | traverse_type(ty, f); 182 | } 183 | syn::GenericArgument::AssocType(assoc) => { 184 | traverse_type(&mut assoc.ty, f); 185 | } 186 | _ => (), 187 | } 188 | } 189 | } 190 | syn::PathArguments::Parenthesized(args) => { 191 | for arg in &mut args.inputs { 192 | traverse_type(arg, f); 193 | } 194 | if let syn::ReturnType::Type(_, ty) = &mut args.output { 195 | traverse_type(ty, f); 196 | } 197 | } 198 | } 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /logos-codegen/src/util.rs: -------------------------------------------------------------------------------- 1 | use proc_macro2::{Spacing, Span, TokenStream, TokenTree}; 2 | use quote::{quote, ToTokens}; 3 | use syn::Ident; 4 | 5 | /// Analog to Option<TokenStream>, except when put into the quote! 6 | /// macro, `MaybeVoid::Void` will produce `()` 7 | #[derive(Clone, Default)] 8 | pub enum MaybeVoid { 9 | Some(TokenStream), 10 | #[default] 11 | Void, 12 | } 13 | 14 | impl MaybeVoid { 15 | pub fn replace(&mut self, stream: TokenStream) -> MaybeVoid { 16 | std::mem::replace(self, MaybeVoid::Some(stream)) 17 | } 18 | 19 | pub fn take(&mut self) -> MaybeVoid { 20 | std::mem::replace(self, MaybeVoid::Void) 21 | } 22 | } 23 | 24 | impl ToTokens for MaybeVoid { 25 | fn to_tokens(&self, out: &mut TokenStream) { 26 | match self { 27 | MaybeVoid::Some(stream) => out.extend(stream.clone()), 28 | MaybeVoid::Void => out.extend(quote!(())), 29 | } 30 | } 31 | 32 | fn to_token_stream(&self) -> TokenStream { 33 | match self { 34 | MaybeVoid::Some(stream) => stream.clone(), 35 | MaybeVoid::Void => quote!(()), 36 | } 37 | } 38 | 39 | fn into_token_stream(self) -> TokenStream { 40 | match self { 41 | MaybeVoid::Some(stream) => stream, 42 | MaybeVoid::Void => quote!(()), 43 | } 44 | } 45 | } 46 | 47 | pub fn is_punct(tt: &TokenTree, expect: char) -> bool { 48 | matches!(tt, TokenTree::Punct(punct) if punct.as_char() == expect && punct.spacing() == Spacing::Alone) 49 | } 50 | 51 | /// If the supplied `tt` is a punct matching `expect`, returns `None`, else returns `tt` 52 | pub fn expect_punct(tt: Option<TokenTree>, expect: char) -> Option<TokenTree> { 53 | tt.filter(|tt| !is_punct(tt, expect)) 54 | } 55 | 56 | pub trait ToIdent { 57 | fn to_ident(&self) -> Ident; 58 | } 59 | 60 | impl ToIdent for str { 61 | fn to_ident(&self) -> Ident { 62 | Ident::new(self, Span::call_site()) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /logos-codegen/tests/codegen.rs: -------------------------------------------------------------------------------- 1 | use std::{error::Error, path::PathBuf}; 2 | 3 | #[rstest::rstest] 4 | #[case("simple")] 5 | #[case("no_error_lut")] 6 | pub fn test_codegen(#[case] fixture: &str) -> Result<(), Box<dyn Error>> { 7 | let mut fixture_dir = PathBuf::new(); 8 | fixture_dir.push(env!("CARGO_MANIFEST_DIR")); 9 | fixture_dir.push("tests"); 10 | fixture_dir.push("data"); 11 | fixture_dir.push(fixture); 12 | 13 | let input = fixture_dir.join("input.rs"); 14 | fixture_dir.push("output.rs"); 15 | let output_file = fixture_dir; 16 | 17 | let input = std::fs::read_to_string(input)?; 18 | let output = std::fs::read_to_string(&output_file)?; 19 | 20 | let generated = logos_codegen::generate(input.parse()?); 21 | let generated = generated.to_string(); 22 | 23 | if std::env::var("BLESS_CODEGEN").is_ok_and(|value| value
== "1") { 24 | std::fs::write(&output_file, &generated)?; 25 | return Ok(()); 26 | } 27 | 28 | assert_eq!(generated, output, "Codegen test failed: `{fixture}`, run tests again with env var `BLESS_CODEGEN=1` to bless these changes"); 29 | 30 | Ok(()) 31 | } 32 | -------------------------------------------------------------------------------- /logos-codegen/tests/data/no_error_lut/input.rs: -------------------------------------------------------------------------------- 1 | #[derive(Logos)] 2 | #[logos(source = [u8])] 3 | enum Token { 4 | #[token("\n")] 5 | Newline, 6 | #[regex(".")] 7 | AnyUnicode, 8 | #[regex(b".", priority = 0)] 9 | Any, 10 | } 11 | -------------------------------------------------------------------------------- /logos-codegen/tests/data/no_error_lut/output.rs: -------------------------------------------------------------------------------- 1 | impl < 's > :: logos :: Logos < 's > for Token { type Error = () ; type Extras = () ; type Source = [u8] ; fn lex (lex : & mut :: logos :: Lexer < 's , Self >) { use :: logos :: internal :: { LexerInternal , CallbackResult } ; type Lexer < 's > = :: logos :: Lexer < 's , Token > ; fn _end < 's > (lex : & mut Lexer < 's >) { lex . end () } fn _error < 's > (lex : & mut Lexer < 's >) { lex . bump_unchecked (1) ; lex . error () ; } macro_rules ! _fast_loop { ($ lex : ident , $ test : ident , $ miss : expr) => { while let Some (arr) = $ lex . read :: < & [u8 ; 16] > () { if $ test (arr [0]) { if $ test (arr [1]) { if $ test (arr [2]) { if $ test (arr [3]) { if $ test (arr [4]) { if $ test (arr [5]) { if $ test (arr [6]) { if $ test (arr [7]) { if $ test (arr [8]) { if $ test (arr [9]) { if $ test (arr [10]) { if $ test (arr [11]) { if $ test (arr [12]) { if $ test (arr [13]) { if $ test (arr [14]) { if $ test (arr [15]) { $ lex . bump_unchecked (16) ; continue ; } $ lex . bump_unchecked (15) ; return $ miss ; } $ lex . bump_unchecked (14) ; return $ miss ; } $ lex . bump_unchecked (13) ; return $ miss ; } $ lex . bump_unchecked (12) ; return $ miss ; } $ lex . bump_unchecked (11) ; return $ miss ; } $ lex . bump_unchecked (10) ; return $ miss ; } $ lex . bump_unchecked (9) ; return $ miss ; } $ lex . bump_unchecked (8) ; return $ miss ; } $ lex . bump_unchecked (7) ; return $ miss ; } $ lex . bump_unchecked (6) ; return $ miss ; } $ lex . bump_unchecked (5) ; return $ miss ; } $ lex . bump_unchecked (4) ; return $ miss ; } $ lex . bump_unchecked (3) ; return $ miss ; } $ lex . bump_unchecked (2) ; return $ miss ; } $ lex . bump_unchecked (1) ; return $ miss ; } return $ miss ; } while $ lex . test ($ test) { $ lex . bump_unchecked (1) ; } $ miss } ; } # [inline] fn goto1_x < 's > (lex : & mut Lexer < 's >) { lex . set (Ok (Token :: Newline)) ; } # [inline] fn goto11_ctx11_x < 's > (lex : & mut Lexer < 's >) { lex . set (Ok (Token :: Any)) ; } # [inline] fn goto2_ctx11_x < 's > (lex : & mut Lexer < 's >) { lex . set (Ok (Token :: AnyUnicode)) ; } # [inline] fn goto16_ctx11_x < 's > (lex : & mut Lexer < 's >) { match lex . read :: < & [u8 ; 2usize] > () { Some ([128u8 ..= 159u8 , 128u8 ..= 191u8]) => { lex . bump_unchecked (2usize) ; goto2_ctx11_x (lex) } , _ => goto11_ctx11_x (lex) , } } # [inline] fn goto17_ctx11_x < 's > (lex : & mut Lexer < 's >) { match lex . read :: < & [u8 ; 3usize] > () { Some ([144u8 ..= 191u8 , 128u8 ..= 191u8 , 128u8 ..= 191u8]) => { lex . bump_unchecked (3usize) ; goto2_ctx11_x (lex) } , _ => goto11_ctx11_x (lex) , } } # [inline] fn goto2_x < 's > (lex : & mut Lexer < 's >) { lex . 
set (Ok (Token :: AnyUnicode)) ; } # [inline] fn goto13_ctx11_x < 's > (lex : & mut Lexer < 's >) { match lex . read :: < & [u8 ; 1usize] > () { Some ([128u8 ..= 191u8]) => { lex . bump_unchecked (1usize) ; goto2_ctx11_x (lex) } , _ => goto11_ctx11_x (lex) , } } # [inline] fn goto18_ctx11_x < 's > (lex : & mut Lexer < 's >) { match lex . read :: < & [u8 ; 3usize] > () { Some ([128u8 ..= 191u8 , 128u8 ..= 191u8 , 128u8 ..= 191u8]) => { lex . bump_unchecked (3usize) ; goto2_ctx11_x (lex) } , _ => goto11_ctx11_x (lex) , } } # [inline] fn goto15_ctx11_x < 's > (lex : & mut Lexer < 's >) { match lex . read :: < & [u8 ; 2usize] > () { Some ([128u8 ..= 191u8 , 128u8 ..= 191u8]) => { lex . bump_unchecked (2usize) ; goto2_ctx11_x (lex) } , _ => goto11_ctx11_x (lex) , } } # [inline] fn goto14_ctx11_x < 's > (lex : & mut Lexer < 's >) { match lex . read :: < & [u8 ; 2usize] > () { Some ([160u8 ..= 191u8 , 128u8 ..= 191u8]) => { lex . bump_unchecked (2usize) ; goto2_ctx11_x (lex) } , _ => goto11_ctx11_x (lex) , } } # [inline] fn goto19_ctx11_x < 's > (lex : & mut Lexer < 's >) { match lex . read :: < & [u8 ; 3usize] > () { Some ([128u8 ..= 143u8 , 128u8 ..= 191u8 , 128u8 ..= 191u8]) => { lex . bump_unchecked (3usize) ; goto2_ctx11_x (lex) } , _ => goto11_ctx11_x (lex) , } } # [inline] fn goto11_x < 's > (lex : & mut Lexer < 's >) { lex . set (Ok (Token :: Any)) ; } # [inline] fn goto20 < 's > (lex : & mut Lexer < 's >) { enum Jump { J1 , J16 , J17 , J2 , J13 , J18 , J15 , J14 , J19 , J11 , } const LUT : [Jump ; 256] = { use Jump :: * ; [J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J1 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J2 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J13 , J14 , J15 , J15 , J15 , J15 , J15 , J15 , J15 , J15 , J15 , J15 , J15 , J15 , J16 , J15 , J15 , J17 , J18 , J18 , J18 , J19 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11 , J11] } ; let byte = match lex . read :: < u8 > () { Some (byte) => byte , None => return _end (lex) , } ; match LUT [byte as usize] { Jump :: J1 => { lex . bump_unchecked (1usize) ; goto1_x (lex) } , Jump :: J16 => { lex . bump_unchecked (1usize) ; goto16_ctx11_x (lex) } , Jump :: J17 => { lex . bump_unchecked (1usize) ; goto17_ctx11_x (lex) } , Jump :: J2 => { lex . bump_unchecked (1usize) ; goto2_x (lex) } , Jump :: J13 => { lex . bump_unchecked (1usize) ; goto13_ctx11_x (lex) } , Jump :: J18 => { lex . bump_unchecked (1usize) ; goto18_ctx11_x (lex) } , Jump :: J15 => { lex . 
bump_unchecked (1usize) ; goto15_ctx11_x (lex) } , Jump :: J14 => { lex . bump_unchecked (1usize) ; goto14_ctx11_x (lex) } , Jump :: J19 => { lex . bump_unchecked (1usize) ; goto19_ctx11_x (lex) } , Jump :: J11 => { lex . bump_unchecked (1usize) ; goto11_x (lex) } , } } goto20 (lex) } } -------------------------------------------------------------------------------- /logos-codegen/tests/data/simple/input.rs: -------------------------------------------------------------------------------- 1 | #[derive(Logos, Debug, Clone, Copy, PartialEq)] 2 | enum Token { 3 | #[regex("a-z")] 4 | Letter, 5 | } 6 | -------------------------------------------------------------------------------- /logos-codegen/tests/data/simple/output.rs: -------------------------------------------------------------------------------- 1 | impl < 's > :: logos :: Logos < 's > for Token { type Error = () ; type Extras = () ; type Source = str ; fn lex (lex : & mut :: logos :: Lexer < 's , Self >) { use :: logos :: internal :: { LexerInternal , CallbackResult } ; type Lexer < 's > = :: logos :: Lexer < 's , Token > ; fn _end < 's > (lex : & mut Lexer < 's >) { lex . end () } fn _error < 's > (lex : & mut Lexer < 's >) { lex . bump_unchecked (1) ; lex . error () ; } macro_rules ! _fast_loop { ($ lex : ident , $ test : ident , $ miss : expr) => { while let Some (arr) = $ lex . read :: < & [u8 ; 16] > () { if $ test (arr [0]) { if $ test (arr [1]) { if $ test (arr [2]) { if $ test (arr [3]) { if $ test (arr [4]) { if $ test (arr [5]) { if $ test (arr [6]) { if $ test (arr [7]) { if $ test (arr [8]) { if $ test (arr [9]) { if $ test (arr [10]) { if $ test (arr [11]) { if $ test (arr [12]) { if $ test (arr [13]) { if $ test (arr [14]) { if $ test (arr [15]) { $ lex . bump_unchecked (16) ; continue ; } $ lex . bump_unchecked (15) ; return $ miss ; } $ lex . bump_unchecked (14) ; return $ miss ; } $ lex . bump_unchecked (13) ; return $ miss ; } $ lex . bump_unchecked (12) ; return $ miss ; } $ lex . bump_unchecked (11) ; return $ miss ; } $ lex . bump_unchecked (10) ; return $ miss ; } $ lex . bump_unchecked (9) ; return $ miss ; } $ lex . bump_unchecked (8) ; return $ miss ; } $ lex . bump_unchecked (7) ; return $ miss ; } $ lex . bump_unchecked (6) ; return $ miss ; } $ lex . bump_unchecked (5) ; return $ miss ; } $ lex . bump_unchecked (4) ; return $ miss ; } $ lex . bump_unchecked (3) ; return $ miss ; } $ lex . bump_unchecked (2) ; return $ miss ; } $ lex . bump_unchecked (1) ; return $ miss ; } return $ miss ; } while $ lex . test ($ test) { $ lex . bump_unchecked (1) ; } $ miss } ; } # [inline] fn goto1_x < 's > (lex : & mut Lexer < 's >) { lex . set (Ok (Token :: Letter)) ; } # [inline] fn goto3_at1_with3 < 's > (lex : & mut Lexer < 's >) { match lex . read_at :: < & [u8 ; 2usize] > (1usize) { Some (b"-z") => { lex . bump_unchecked (3usize) ; goto1_x (lex) } , _ => _error (lex) , } } # [inline] fn goto4 < 's > (lex : & mut Lexer < 's >) { let arr = match lex . 
read :: < & [u8 ; 3usize] > () { Some (arr) => arr , None => return _end (lex) , } ; match arr [0] { b'a' => goto3_at1_with3 (lex) , _ => _error (lex) , } } goto4 (lex) } } -------------------------------------------------------------------------------- /logos-derive/Cargo.toml: -------------------------------------------------------------------------------- 1 | [dependencies] 2 | logos-codegen = {version = "0.15.0", path = "../logos-codegen"} 3 | 4 | [features] 5 | # Enables debug messages 6 | debug = ["logos-codegen/debug"] 7 | # Don't use or generate unsafe code 8 | forbid_unsafe = ["logos-codegen/forbid_unsafe"] 9 | 10 | [lib] 11 | bench = false 12 | proc-macro = true 13 | 14 | [package] 15 | name = "logos-derive" 16 | authors.workspace = true 17 | categories.workspace = true 18 | description.workspace = true 19 | edition.workspace = true 20 | homepage.workspace = true 21 | keywords.workspace = true 22 | license.workspace = true 23 | readme.workspace = true 24 | repository.workspace = true 25 | rust-version.workspace = true 26 | version.workspace = true 27 | 28 | [package.metadata.release] 29 | shared-version = true 30 | -------------------------------------------------------------------------------- /logos-derive/LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | ../LICENSE-APACHE -------------------------------------------------------------------------------- /logos-derive/LICENSE-MIT: -------------------------------------------------------------------------------- 1 | ../LICENSE-MIT -------------------------------------------------------------------------------- /logos-derive/src/lib.rs: -------------------------------------------------------------------------------- 1 | use proc_macro::TokenStream; 2 | 3 | #[proc_macro_derive(Logos, attributes(logos, extras, error, end, token, regex))] 4 | pub fn logos(input: TokenStream) -> TokenStream { 5 | logos_codegen::generate(input.into()).into() 6 | } 7 | -------------------------------------------------------------------------------- /logos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maciejhirsz/logos/96765c0be188f3e8005b48db82bf1e904d2e6650/logos.png -------------------------------------------------------------------------------- /release.toml: -------------------------------------------------------------------------------- 1 | pre-release-commit-message = "chore(version): bump logos version to {{version}}" 2 | push = false 3 | tag = false 4 | -------------------------------------------------------------------------------- /tests/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | edition.workspace = true 3 | name = "tests" 4 | publish = false 5 | rust-version.workspace = true 6 | version = "0.0.0" 7 | 8 | [dependencies] 9 | logos-derive = {path = "../logos-derive"} 10 | logos = {path = "../", default-features = false, features = ["std"]} 11 | 12 | [features] 13 | forbid_unsafe = [ 14 | "logos-derive/forbid_unsafe", 15 | "logos/forbid_unsafe" 16 | ] 17 | 18 | [dev-dependencies] 19 | criterion = { version = "2.10.1", package = "codspeed-criterion-compat" } 20 | 21 | [package.metadata.release] 22 | release = false 23 | 24 | [[bench]] 25 | harness = false 26 | name = "bench" 27 | -------------------------------------------------------------------------------- /tests/benches/bench.rs: 
-------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; 2 | use logos_derive::Logos; 3 | 4 | #[derive(Debug, Clone, Copy, PartialEq, Logos)] 5 | pub enum Token { 6 | #[regex(r"[ \n\t\f]", logos::skip)] 7 | InvalidToken, 8 | 9 | #[regex("[a-zA-Z_$][a-zA-Z0-9_$]*")] 10 | Identifier, 11 | 12 | #[regex(r#""([^"\\]|\\t|\\u|\\n|\\")*""#)] 13 | String, 14 | 15 | #[token("private")] 16 | Private, 17 | 18 | #[token("primitive")] 19 | Primitive, 20 | 21 | #[token("protected")] 22 | Protected, 23 | 24 | #[token("in")] 25 | In, 26 | 27 | #[token("instanceof")] 28 | Instanceof, 29 | 30 | #[token(".")] 31 | Accessor, 32 | 33 | #[token("...")] 34 | Ellipsis, 35 | 36 | #[token("(")] 37 | ParenOpen, 38 | 39 | #[token(")")] 40 | ParenClose, 41 | 42 | #[token("{")] 43 | BraceOpen, 44 | 45 | #[token("}")] 46 | BraceClose, 47 | 48 | #[token("+")] 49 | OpAddition, 50 | 51 | #[token("++")] 52 | OpIncrement, 53 | 54 | #[token("=")] 55 | OpAssign, 56 | 57 | #[token("==")] 58 | OpEquality, 59 | 60 | #[token("===")] 61 | OpStrictEquality, 62 | 63 | #[token("=>")] 64 | FatArrow, 65 | } 66 | 67 | static SOURCE: &str = " 68 | foobar(protected primitive private instanceof in) { + ++ = == === => } 69 | foobar(protected primitive private instanceof in) { + ++ = == === => } 70 | foobar(protected primitive private instanceof in) { + ++ = == === => } 71 | foobar(protected primitive private instanceof in) { + ++ = == === => } 72 | foobar(protected primitive private instanceof in) { + ++ = == === => } 73 | foobar(protected primitive private instanceof in) { + ++ = == === => } 74 | foobar(protected primitive private instanceof in) { + ++ = == === => } 75 | foobar(protected primitive private instanceof in) { + ++ = == === => } 76 | foobar(protected primitive private instanceof in) { + ++ = == === => } 77 | foobar(protected primitive private instanceof in) { + ++ = == === => } 78 | foobar(protected primitive private instanceof in) { + ++ = == === => } 79 | foobar(protected primitive private instanceof in) { + ++ = == === => } 80 | foobar(protected primitive private instanceof in) { + ++ = == === => } 81 | foobar(protected primitive private instanceof in) { + ++ = == === => } 82 | foobar(protected primitive private instanceof in) { + ++ = == === => } 83 | foobar(protected primitive private instanceof in) { + ++ = == === => } 84 | foobar(protected primitive private instanceof in) { + ++ = == === => } 85 | foobar(protected primitive private instanceof in) { + ++ = == === => } 86 | foobar(protected primitive private instanceof in) { + ++ = == === => } 87 | foobar(protected primitive private instanceof in) { + ++ = == === => } 88 | foobar(protected primitive private instanceof in) { + ++ = == === => } 89 | foobar(protected primitive private instanceof in) { + ++ = == === => } 90 | foobar(protected primitive private instanceof in) { + ++ = == === => } 91 | foobar(protected primitive private instanceof in) { + ++ = == === => } 92 | foobar(protected primitive private instanceof in) { + ++ = == === => } 93 | foobar(protected primitive private instanceof in) { + ++ = == === => } 94 | foobar(protected primitive private instanceof in) { + ++ = == === => } 95 | foobar(protected primitive private instanceof in) { + ++ = == === => } 96 | foobar(protected primitive private instanceof in) { + ++ = == === => } 97 | foobar(protected primitive private instanceof in) { + ++ = == === => } 98 | "; 99 | 100 | static IDENTIFIERS: &str = "It 
was the year when they finally immanentized the Eschaton \ 101 | It was the year when they finally immanentized the Eschaton \ 102 | It was the year when they finally immanentized the Eschaton \ 103 | It was the year when they finally immanentized the Eschaton \ 104 | It was the year when they finally immanentized the Eschaton \ 105 | It was the year when they finally immanentized the Eschaton \ 106 | It was the year when they finally immanentized the Eschaton \ 107 | It was the year when they finally immanentized the Eschaton \ 108 | It was the year when they finally immanentized the Eschaton \ 109 | It was the year when they finally immanentized the Eschaton \ 110 | It was the year when they finally immanentized the Eschaton \ 111 | It was the year when they finally immanentized the Eschaton \ 112 | It was the year when they finally immanentized the Eschaton"; 113 | 114 | static STRINGS: &str = r#""tree" "to" "a" "graph" "that can" "more adequately represent" "loops and arbitrary state jumps" "with\"\"\"out" "the\n\n\n\n\n" "expl\"\"\"osive" "nature\"""of trying to build up all possible permutations in a tree." "tree" "to" "a" "graph" "that can" "more adequately represent" "loops and arbitrary state jumps" "with\"\"\"out" "the\n\n\n\n\n" "expl\"\"\"osive" "nature\"""of trying to build up all possible permutations in a tree." "tree" "to" "a" "graph" "that can" "more adequately represent" "loops and arbitrary state jumps" "with\"\"\"out" "the\n\n\n\n\n" "expl\"\"\"osive" "nature\"""of trying to build up all possible permutations in a tree." "tree" "to" "a" "graph" "that can" "more adequately represent" "loops and arbitrary state jumps" "with\"\"\"out" "the\n\n\n\n\n" "expl\"\"\"osive" "nature\"""of trying to build up all possible permutations in a tree.""#; 115 | 116 | static CANDIDATES: [(&str, &str); 3] = [ 117 | ("identifiers", IDENTIFIERS), 118 | ("keywords_operators_and_punctators", SOURCE), 119 | ("strings", STRINGS), 120 | ]; 121 | 122 | #[allow(unused_must_use)] 123 | fn iterate(s: &str) { 124 | use logos::Logos; 125 | 126 | let mut lex = Token::lexer(s); 127 | 128 | while let Some(token) = lex.next() { 129 | black_box(token); 130 | } 131 | } 132 | 133 | fn count_ok(s: &str) -> usize { 134 | use logos::Logos; 135 | 136 | Token::lexer(s).filter_map(|res| res.ok()).count() 137 | } 138 | 139 | fn bench_iterate(c: &mut Criterion) { 140 | let mut group = c.benchmark_group("iterate"); 141 | 142 | for (name, source) in CANDIDATES { 143 | group.throughput(Throughput::Bytes(source.len() as u64)); 144 | group.bench_with_input(name, &source, |b, &s| b.iter(|| iterate(s))); 145 | } 146 | } 147 | 148 | fn bench_count_ok(c: &mut Criterion) { 149 | let mut group = c.benchmark_group("count_ok"); 150 | 151 | for (name, source) in CANDIDATES { 152 | group.throughput(Throughput::Bytes(source.len() as u64)); 153 | group.bench_with_input(name, &source, |b, &s| b.iter(|| count_ok(s))); 154 | } 155 | } 156 | 157 | criterion_group!(benches, bench_iterate, bench_count_ok); 158 | criterion_main!(benches); 159 | -------------------------------------------------------------------------------- /tests/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! ```compile_fail 2 | //! use logos::Logos; 3 | //! use logos_derive::Logos; 4 | //! 5 | //! #[derive(Logos)] 6 | //! enum Token { 7 | //! #[token(b"\xFF")] 8 | //! NonUtf8, 9 | //! } 10 | //! 11 | //! Token::lexer("This shouldn't work with a string literal!"); 12 | //! ``` 13 | //! 14 | //! 
Same, but with regex: 15 | 16 | //! ```compile_fail 17 | //! use logos::Logos; 18 | //! use logos_derive::Logos; 19 | //! 20 | //! #[derive(Logos)] 21 | //! enum Token { 22 | //! #[regex(b"\xFF")] 23 | //! NonUtf8, 24 | //! } 25 | //! 26 | //! Token::lexer("This shouldn't work with a string literal!"); 27 | //! ``` 28 | //! 29 | //! Matching against .* (or .+) should fail to compile: 30 | //! 31 | //! ```compile_fail 32 | //! use logos::Logos; 33 | //! use logos_derive::Logos; 34 | //! 35 | //! #[derive(Logos, Debug, PartialEq)] 36 | //! enum Token { 37 | //! #[regex(r"\(.*\)")] 38 | //! BetweenParen, 39 | 40 | //! } 41 | //! ``` 42 | //! 43 | //! ```compile_fail 44 | //! use logos::Logos; 45 | //! use logos_derive::Logos; 46 | //! 47 | //! #[derive(Logos, Debug, PartialEq)] 48 | //! enum Token { 49 | //! #[regex(r"\(.+\)")] 50 | //! BetweenParen, 51 | 52 | //! } 53 | //! ``` 54 | //! 55 | //! And also when working with bytes: 56 | //! 57 | //! ```compile_fail 58 | //! use logos::Logos; 59 | //! use logos_derive::Logos; 60 | //! 61 | //! #[derive(Logos, Debug, PartialEq)] 62 | //! enum Token { 63 | //! #[regex(b"\x00.*")] 64 | //! NonUtf8, 65 | 66 | //! } 67 | //! ``` 68 | //! 69 | //! ```compile_fail 70 | //! use logos::Logos; 71 | //! use logos_derive::Logos; 72 | //! 73 | //! #[derive(Logos, Debug, PartialEq)] 74 | //! enum Token { 75 | //! #[regex(b"\x00.+")] 76 | //! NonUtf8, 77 | 78 | //! } 79 | //! ``` 80 | use logos::source::Source; 81 | use logos::Logos; 82 | 83 | use std::fmt; 84 | use std::ops::Range; 85 | 86 | #[allow(clippy::type_complexity)] 87 | pub fn assert_lex<'a, Token>( 88 | source: &'a Token::Source, 89 | tokens: &[( 90 | Result<Token, Token::Error>, 91 | <Token::Source as Source>::Slice<'a>, 92 | Range<usize>, 93 | )], 94 | ) where 95 | Token: Logos<'a> + fmt::Debug + PartialEq, 96 | Token::Extras: Default, 97 | { 98 | let mut lex = Token::lexer(source); 99 | 100 | for tuple in tokens { 101 | assert_eq!( 102 | &(lex.next().expect("Unexpected end"), lex.slice(), lex.span()), 103 | tuple 104 | ); 105 | } 106 | 107 | assert_eq!(lex.next(), None); 108 | } 109 | -------------------------------------------------------------------------------- /tests/tests/binary.rs: -------------------------------------------------------------------------------- 1 | use logos_derive::Logos; 2 | use tests::assert_lex; 3 | 4 | #[derive(Logos, Debug, Clone, Copy, PartialEq)] 5 | enum Token { 6 | #[token("foo")] 7 | Foo, 8 | 9 | #[regex(b"\x42+")] 10 | Life, 11 | 12 | #[regex(b"[\xA0-\xAF]+")] 13 | Aaaaaaa, 14 | 15 | #[token(b"\xCA\xFE\xBE\xEF")] 16 | CafeBeef, 17 | 18 | #[token(b"\x00")] 19 | Zero, 20 | } 21 | 22 | #[test] 23 | fn handles_non_utf8() { 24 | assert_lex( 25 | &[ 26 | 0, 0, 0xCA, 0xFE, 0xBE, 0xEF, b'f', b'o', b'o', 0x42, 0x42, 0x42, 0xAA, 0xAA, 0xA2, 27 | 0xAE, 0x10, 0x20, 0, 28 | ][..], 29 | &[ 30 | (Ok(Token::Zero), &[0], 0..1), 31 | (Ok(Token::Zero), &[0], 1..2), 32 | (Ok(Token::CafeBeef), &[0xCA, 0xFE, 0xBE, 0xEF], 2..6), 33 | (Ok(Token::Foo), b"foo", 6..9), 34 | (Ok(Token::Life), &[0x42, 0x42, 0x42], 9..12), 35 | (Ok(Token::Aaaaaaa), &[0xAA, 0xAA, 0xA2, 0xAE], 12..16), 36 | (Err(()), &[0x10], 16..17), 37 | (Err(()), &[0x20], 17..18), 38 | (Ok(Token::Zero), &[0], 18..19), 39 | ], 40 | ); 41 | } 42 | -------------------------------------------------------------------------------- /tests/tests/callbacks.rs: -------------------------------------------------------------------------------- 1 | use logos::{Lexer, Logos as _, Skip}; 2 | use logos_derive::Logos; 3 | use tests::assert_lex; 4 | 5 | #[derive(Default, 
Debug, Clone, PartialEq)] 6 | enum LexingError { 7 | ParseNumberError, 8 | #[default] 9 | Other, 10 | } 11 | 12 | impl From<std::num::ParseIntError> for LexingError { 13 | fn from(_: std::num::ParseIntError) -> Self { 14 | LexingError::ParseNumberError 15 | } 16 | } 17 | 18 | impl From<std::num::ParseFloatError> for LexingError { 19 | fn from(_: std::num::ParseFloatError) -> Self { 20 | LexingError::ParseNumberError 21 | } 22 | } 23 | 24 | mod data { 25 | use super::*; 26 | 27 | #[derive(Logos, Debug, PartialEq)] 28 | #[logos(error = LexingError)] 29 | #[logos(skip r"[ \t\n\f]+")] 30 | enum Token<'a> { 31 | #[regex(r"[a-zA-Z]+", |lex| lex.slice())] 32 | Text(&'a str), 33 | 34 | #[regex(r"-?[0-9]+", |lex| lex.slice().parse())] 35 | Integer(i64), 36 | 37 | #[regex(r"-?[0-9]+\.[0-9]+", |lex| lex.slice().parse())] 38 | Float(f64), 39 | } 40 | 41 | #[test] 42 | fn numbers() { 43 | let tokens: Vec<_> = Token::lexer("Hello 1 42 -100 pi 3.14 -77.77").collect(); 44 | 45 | assert_eq!( 46 | tokens, 47 | &[ 48 | Ok(Token::Text("Hello")), 49 | Ok(Token::Integer(1)), 50 | Ok(Token::Integer(42)), 51 | Ok(Token::Integer(-100)), 52 | Ok(Token::Text("pi")), 53 | Ok(Token::Float(3.14)), 54 | Ok(Token::Float(-77.77)), 55 | ] 56 | ); 57 | } 58 | } 59 | 60 | mod nested_lifetime { 61 | use super::*; 62 | use std::borrow::Cow; 63 | 64 | #[derive(Logos, Debug, PartialEq)] 65 | #[logos(error = LexingError)] 66 | #[logos(skip r"[ \t\n\f]+")] 67 | enum Token<'a> { 68 | #[regex(r"[0-9]+", |lex| { 69 | let slice = lex.slice(); 70 | 71 | slice.parse::<u64>().map(|n| { 72 | (slice, n) 73 | }) 74 | })] 75 | Integer((&'a str, u64)), 76 | 77 | #[regex(r"[a-z]+", |lex| Cow::Borrowed(lex.slice()))] 78 | Text(Cow<'a, str>), 79 | } 80 | 81 | #[test] 82 | fn supplement_lifetime_in_types() { 83 | let tokens: Vec<_> = Token::lexer("123 hello 42").collect(); 84 | 85 | assert_eq!( 86 | tokens, 87 | &[ 88 | Ok(Token::Integer(("123", 123))), 89 | Ok(Token::Text(Cow::Borrowed("hello"))), 90 | Ok(Token::Integer(("42", 42))), 91 | ], 92 | ); 93 | } 94 | } 95 | 96 | mod rust { 97 | use super::*; 98 | 99 | /// Adaptation of implementation by matklad: 100 | /// https://github.com/matklad/fall/blob/527ab331f82b8394949041bab668742868c0c282/lang/rust/syntax/src/rust.fall#L1294-L1324 101 | fn parse_raw_string(lexer: &mut Lexer<Token>) -> bool { 102 | // Who needs more than 25 hashes anyway? 
:) 103 | let q_hashes = concat!('"', "######", "######", "######", "######", "######"); 104 | let closing = &q_hashes[..lexer.slice().len() - 1]; // skip initial 'r' 105 | 106 | lexer 107 | .remainder() 108 | .find(closing) 109 | .map(|i| lexer.bump(i + closing.len())) 110 | .is_some() 111 | } 112 | 113 | #[derive(Logos, Debug, Clone, Copy, PartialEq)] 114 | #[logos(error = LexingError)] 115 | #[logos(skip r"[ \t\n\f]+")] 116 | enum Token { 117 | #[regex("[a-zA-Z_][a-zA-Z0-9_]*")] 118 | Ident, 119 | 120 | #[regex("r#*\"", parse_raw_string)] 121 | RawString, 122 | } 123 | 124 | #[test] 125 | fn raw_strings() { 126 | assert_lex( 127 | " r\"foo\" r#\"bar\"# r#####\"baz\"##### r###\"error\"## ", 128 | &[ 129 | (Ok(Token::RawString), "r\"foo\"", 1..7), 130 | (Ok(Token::RawString), "r#\"bar\"#", 8..16), 131 | (Ok(Token::RawString), "r#####\"baz\"#####", 17..33), 132 | (Err(LexingError::Other), "r###\"", 34..39), 133 | (Ok(Token::Ident), "error", 39..44), 134 | (Err(LexingError::Other), "\"", 44..45), 135 | (Err(LexingError::Other), "#", 45..46), 136 | (Err(LexingError::Other), "#", 46..47), 137 | ], 138 | ); 139 | } 140 | } 141 | 142 | mod any_token_callback { 143 | use super::*; 144 | 145 | // Adaptation of the data test for (_) -> Token callbacks 146 | #[derive(Logos, Debug, PartialEq)] 147 | #[logos(skip r"[ \t\n\f]+")] 148 | enum Token { 149 | #[regex(r"[a-zA-Z]+", |_| Token::Text)] 150 | #[regex(r"-?[0-9]+", |_| Token::Integer)] 151 | #[regex(r"-?[0-9]+\.[0-9]+", |_| Token::Float)] 152 | Text, 153 | Integer, 154 | Float, 155 | } 156 | 157 | #[test] 158 | fn any_token_callback() { 159 | let tokens: Vec<_> = Token::lexer("Hello 1 42 -100 pi 3.14 -77.77").collect(); 160 | 161 | assert_eq!( 162 | tokens, 163 | &[ 164 | Ok(Token::Text), 165 | Ok(Token::Integer), 166 | Ok(Token::Integer), 167 | Ok(Token::Integer), 168 | Ok(Token::Text), 169 | Ok(Token::Float), 170 | Ok(Token::Float), 171 | ] 172 | ); 173 | } 174 | } 175 | 176 | mod return_result_skip { 177 | use super::*; 178 | 179 | #[derive(Debug, Default, PartialEq, Clone)] 180 | enum LexerError { 181 | UnterminatedComment, 182 | #[default] 183 | Other, 184 | } 185 | 186 | #[derive(Logos, Debug, PartialEq)] 187 | #[logos(skip r"[ \t\n\f]+")] 188 | #[logos(error = LexerError)] 189 | enum Token<'src> { 190 | #[regex(r"<[a-zA-Z0-9-]+>", |lex| &lex.slice()[1..lex.slice().len()-1])] 191 | Tag(&'src str), 192 | 193 | #[token("<!--", skip_comment)] 194 | Comment, 195 | } 196 | 197 | fn skip_comment<'src>(lexer: &mut Lexer<'src, Token<'src>>) -> Result<Skip, LexerError> { 198 | let end = lexer 199 | .remainder() 200 | .find("-->") 201 | .ok_or(LexerError::UnterminatedComment)?; 202 | lexer.bump(end + 3); 203 | 204 | Ok(Skip) 205 | } 206 | 207 | #[test] 208 | fn return_result_skip() { 209 | let mut lexer = Token::lexer("<foo> <!-- comment --> <bar>"); 210 | assert_eq!(lexer.next(), Some(Ok(Token::Tag("foo")))); 211 | assert_eq!(lexer.next(), Some(Ok(Token::Tag("bar")))); 212 | assert_eq!(lexer.next(), None); 213 | 214 | let mut lexer = Token::lexer("