├── .appveyor.yml ├── .github └── workflows │ └── rust.yml ├── .gitignore ├── CONTRIBUTING.md ├── Cargo.toml ├── README.md ├── assets ├── arc_lookbehind.svg ├── lookBehind.js ├── look_behind.svg └── special_punct.svg ├── benches ├── chars_vs_jsbuffer.rs ├── major_libs.rs └── ref_perf_vs.rs ├── code_of_conduct.md ├── codecov.yml ├── examples ├── clear-comments │ └── src │ │ └── main.rs ├── count_tokens.rs ├── find_regexes.rs ├── instruments │ ├── bools.rs │ ├── comments.rs │ ├── idents.rs │ ├── keywords.rs │ ├── null.rs │ ├── numbers.rs │ ├── puncts.rs │ ├── regexes.rs │ ├── strings.rs │ └── templates.rs ├── major_libs │ └── src │ │ └── main.rs ├── semi_finder │ └── src │ │ └── main.rs ├── tokenize.rs └── tokens.js ├── license.txt ├── package.json ├── proptest-regressions ├── comments.txt ├── keywords.txt ├── numeric.txt ├── punct.txt ├── regex.txt └── strings.txt ├── regex.md ├── rustfmt.toml ├── src ├── error.rs ├── lib.rs ├── look_behind.rs ├── manual_scanner.rs ├── tokenizer │ ├── buffer.rs │ ├── keyword_trie.rs │ ├── mod.rs │ ├── tokens.rs │ └── unicode.rs └── tokens │ ├── boolean.rs │ ├── comment.rs │ ├── ident.rs │ ├── keyword.rs │ ├── mod.rs │ ├── number.rs │ ├── regex.rs │ ├── string.rs │ └── template.rs └── tests ├── ecma262 ├── es2015m.rs ├── es2015s.rs ├── es5.rs └── main.rs ├── moz_central └── main.rs ├── prop └── main.rs ├── proptest-regressions └── main.txt ├── readme ├── index.js └── main.rs └── snippets └── main.rs /.appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | global: 3 | RUSTFLAGS: -Zunstable-options -Ctarget-feature=+crt-static 4 | RUST_BACKTRACE: 1 5 | CARGO_INCREMENTAL: 0 # should turn this back on when fixed! 
6 | matrix: 7 | - TARGET: x86_64-pc-windows-msvc 8 | 9 | install: 10 | - ps: Install-Product node 10 11 | - appveyor-retry appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe 12 | - rustup-init.exe -y --default-host x86_64-pc-windows-msvc --default-toolchain nightly 13 | - set PATH=%PATH%;C:\Users\appveyor\.cargo\bin 14 | - rustc -V 15 | - cargo -V 16 | 17 | build: false 18 | 19 | test_script: 20 | - npm i 21 | - cargo test 22 | - cargo run --example major_libs --release 23 | 24 | branches: 25 | only: 26 | - master -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Setup Node.js for use with actions 13 | uses: actions/setup-node@v2.1.5 14 | - name: install js test libs from npm 15 | run: npm install 16 | - name: Build 17 | run: cargo build 18 | - name: get moz_central files 19 | run: curl https://hg.mozilla.org/mozilla-central/archive/tip.zip/js/src/jit-test/tests/ --output moz_central.zip 20 | - name: unzip moz_central 21 | run: unzip -qq moz_central -d moz_central 22 | - name: Run tests 23 | run: cargo test --features moz_central 24 | if: success() 25 | - name: Run Major Libs example 26 | run: cargo run --example major_libs 27 | if: success() 28 | - name: Check syntax 29 | run: cargo fmt --all -- --check 30 | if: success() 31 | - name: Get tarpaulin install script 32 | run: cargo install cargo-tarpaulin 33 | if: success() 34 | - name: Run tarpaulin and upload to CodeCov.io 35 | run: cargo tarpaulin --out Xml && bash <(curl -s https://codecov.io/bash) 36 | env: 37 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 38 | RUST_LOG: trace 39 | if: success() 40 | - name: Cache node_modules 41 | uses: actions/cache@v1.0.3 42 | with: 43 | path: ./node_modules 44 | key: 
${{ runner.os }}.node_modules 45 | - name: before cargo cache 46 | run: rm -rf ~/.cargo/registry 47 | - name: Cache cargo directory 48 | uses: actions/cache@v2.1.4 49 | with: 50 | key: ${{ runner.os }}.cargo 51 | path: ~/.cargo 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | /target 3 | **/*.rs.bk 4 | **/.DS_Store 5 | **/node_modules 6 | Cargo.lock 7 | *.log 8 | /*.js 9 | package-lock.json 10 | moz_central 11 | moz-central 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to RESS 2 | 3 | If you are interested in contributing to RESS know that your help would be appreciated! 4 | 5 | Feel free to open issues and/or pull requests for anything that you see that might be an improvement. 6 | Please note that [ressa](https://github.com/freemasen/ressa) and [resast](https://github.com/freemasen/resast) may already have an issue opened. 7 | 8 | I do not work on this full time, please be patient if I am not able to respond quickly. 9 | 10 | The primary development branch is the `next` branch. It would be ideal to create any pull requests against that branch over `master` or one of the other feature branches that might have been missed when cleaning up. 11 | 12 | For any PRs know that the code must pass CI tests before they will be reviewed/merged. These tests include the following commands you could use to check your version. 13 | ```sh 14 | $ npm i 15 | $ cargo test 16 | $ cargo run --example major_libs 17 | ``` 18 | When run with the `--release` flag, this example acts as a naive benchmark to validate that changes haven't completely ruined the performance. Feel free to leave this flag off when you are testing for a PR. 
19 | 20 | This will run all of the project's unit tests as well as a test against some major js libraries, namely [Angular-js](angularjs.org), [Jquery](jquery.com), [React/React-Dom](reactjs.org), [Vue](vuejs.org), [Moment.js](momentjs.com) and [Dexie](dexie.org). 21 | 22 | If you are interested in becoming a maintainer send me an email and we can talk more about what that looks like. 23 | 24 | 25 | # Getting Started 26 | There are a few things you might need to know to get started. First, the tests and benchmarks require that `npm` is installed to pull down the javascript they evaluate so you'll need [node.js](https://nodejs.org/en/) installed. 27 | 28 | Because the benchmarks use Criterion, it can be difficult to use them with profiling so each of the single token benchmarks is extracted out as an example (you can find these in the examples/instruments folder). For the major_libs benchmark, you can use the example with the same name. These are helpful for working with tools like [`cargo instruments`](https://crates.io/crates/cargo-instruments). 29 | 30 | The overall code layout works like this. 31 | 32 | - lib.rs 33 | - `Scanner`: The primary interface for this crate 34 | - Mostly this is a wrapper around Tokenizer that handles detecting regexes and calculating line/column numbers 35 | - `ScannerState`: This is used for caching the state and resetting it. See the `Scanner::get_state` and `Scanner::set_state` methods 36 | - error.rs 37 | - This is where the error structs live. If you add a new error type to the `Tokenizer` you will need to add an Into/From implementation here 38 | - look_behind.rs: 39 | - `LookBehind`: This is a ring-like structure that is used to keep the look behind tokens. 40 | - For regex detection we only care about the last token we have seen and the three tokens before an open parentheses, so the Scanner keeps two of these on hand. 
41 | - The basic idea here is to just use a 3 element array and keep track of where we last put an element to be able to calculate which is `last`, `two` or `three`. 42 | - `MetaToken`: a cheaper token variant which only holds the bare minimum of information for regex detection 43 | - tokenizer 44 | - mod.rs 45 | - `RawItem`: a cheaper version of the `Item` struct from above, it has only as much information as the `Tokenizer` can determine; a `RawToken` and the byte index of the start and end. 46 | - `Tokenizer`: This is the primary export of this module. This struct will perform the actual separation and classification of tokens 47 | - One note about the matching logic, matching on the length of a byte array or string a bunch of times with an if clause is cheaper than matching on the strings directly. Until [phf](https://github.com/sfackler/rust-phf) can handle byte slices, this is the fastest method available 48 | - buffer.rs 49 | - `JSBuffer`: Mostly a reimplementation of [std::Chars](https://doc.rust-lang.org/std/str/struct.Chars.html) 50 | - For most look_ahead operations there is `look_ahead_matches` which takes a byte slice, however if you are looking for a single byte character the `look_ahead_byte_matches` is slightly faster 51 | - `at_new_line` the `cmp` operation on u8 is faster than matching or `eq` so checking if something is smaller than a target is faster than doing bounds checks between `||`s 52 | - tokens.rs 53 | - `RawToken`: This is a token more tailored to directing the Scanner about how to construct a `tokens::Token` 54 | - The three cases that can have new lines carry some extra information with them, the `new_line_count` and the `last_len` (length of the last line) 55 | - `CommentKind`: empty version of `tokens::Comment` 56 | - `StringKind`: empty version of `tokens::StringLit` 57 | - `TemplateKind`: empty version of `tokens::Template` 58 | - unicode.rs 59 | - bounds checks on `char`s is more effective than binary search (which the two unicode 
implementations I could find use) so these function bodies are generated using the appropriate table 60 | - The generation code may become available in the future but right now it isn't very effective 61 | - `is_ident_start`: check if a `char` has the attribute of ident_start 62 | - `is_id_continue`: check if a `char` has the attribute of ident_continue 63 | - `is_other_whitespace`: the ECMA spec says that any Zs category character is valid whitespace. This function will test any exotic whitespaces 64 | 65 | # Testing 66 | There are a few sets of JavaScript files that are required to run the tests in this repository. The first set can be easily acquired by running `npm install` in the root of this project. An additional test is also available behind a feature flag `moz_central` that requires the JIT Test files from the Firefox repository; the expectation is that these will exist in the folder `moz-central` in the root of this project. To get these files you can either manually download and unzip them by following [this link](https://hg.mozilla.org/mozilla-central/archive/tip.zip/js/src/jit-test/tests/) or you can execute the following command. 67 | 68 | ```sh 69 | curl https://hg.mozilla.org/mozilla-central/archive/tip.zip/js/src/jit-test/tests/ --output moz-central.zip 70 | unzip -q moz-central.zip -d moz-central 71 | ``` 72 | 73 | To run these tests simply execute the following command. 
74 | 75 | ```sh 76 | cargo test --features moz_central -- moz_central 77 | ``` -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ress" 3 | version = "0.11.7" 4 | authors = ["Robert Masen "] 5 | description = "A scanner/tokenizer for JS files" 6 | keywords = ["JavaScript", "parsing", "JS", "ES", "ECMA"] 7 | categories = ["parsing", "text-processing", "web-programming"] 8 | license = "MIT" 9 | repository = "https://github.com/rusty-ecma/ress" 10 | readme = "./README.md" 11 | edition = "2018" 12 | 13 | [dependencies] 14 | log = "0.4" 15 | unicode-xid = "0.2" 16 | 17 | [dev-dependencies] 18 | walkdir = "2" 19 | docopt = "1" 20 | serde = "1" 21 | serde_derive = "1" 22 | proptest = "0.10" 23 | pretty_env_logger = "0.4" 24 | regex_generate = "0.2" 25 | criterion = "0.3" 26 | lazy_static = "1" 27 | res-regex = "0.1" 28 | 29 | [features] 30 | default = [] 31 | moz_central = [] 32 | 33 | [[example]] 34 | name = "major_libs" 35 | path = "examples/major_libs/src/main.rs" 36 | 37 | [[example]] 38 | name = "clear-comments" 39 | path = "examples/clear-comments/src/main.rs" 40 | 41 | [[example]] 42 | name = "semi_finder" 43 | path = "examples/semi_finder/src/main.rs" 44 | 45 | #instrument's examples 46 | [[example]] 47 | name = "keywords" 48 | path = "examples/instruments/keywords.rs" 49 | [[example]] 50 | name = "puncts" 51 | path = "examples/instruments/puncts.rs" 52 | [[example]] 53 | name = "idents" 54 | path = "examples/instruments/idents.rs" 55 | [[example]] 56 | name = "strings" 57 | path = "examples/instruments/strings.rs" 58 | [[example]] 59 | name = "templates" 60 | path = "examples/instruments/templates.rs" 61 | [[example]] 62 | name = "regexes" 63 | path = "examples/instruments/regexes.rs" 64 | [[example]] 65 | name = "numbers" 66 | path = "examples/instruments/numbers.rs" 67 | [[example]] 68 | name = "bools" 69 | path = 
"examples/instruments/bools.rs" 70 | [[example]] 71 | name = "null" 72 | path = "examples/instruments/null.rs" 73 | [[example]] 74 | name = "comments" 75 | path = "examples/instruments/comments.rs" 76 | 77 | [[bench]] 78 | name = "major_libs" 79 | harness = false 80 | 81 | [[bench]] 82 | name = "ref_perf_vs" 83 | harness = false 84 | 85 | [[bench]] 86 | name = "chars_vs_jsbuffer" 87 | harness = false 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RESS 2 | 3 | > Rusty EcmaScript Scanner 4 | 5 | [![Github Actions](https://img.shields.io/github/workflow/status/rusty-ecma/RESS/Rust)](https://travis-ci.org/FreeMasen/RESS) 6 | [![crates.io](https://img.shields.io/crates/v/ress.svg)](https://crates.io/crates/ress) 7 | [![last commit master](https://img.shields.io/github/last-commit/FreeMasen/RESS.svg)](https://github.com/FreeMasen/RESS/commits/master) 8 | 9 | A scanner/tokenizer for JS written in Rust 10 | 11 | ## Usage 12 | 13 | The primary way to interact with ress is through the `Scanner` struct which implements `Iterator` over the `Item` struct. `Item` has three fields `token` for the `Token` found, `span` which represents the start and end of the byte position in the original string and `location` which represents start and end character position with a line and column. Its definition looks like this. 14 | 15 | ```rust 16 | Item { 17 | token: Token::Punct(Punct::Bang), 18 | span: Span { 19 | start: 0, 20 | end: 1, 21 | }, 22 | location: SourceLocation { 23 | start: Position { 24 | line: 1, 25 | column: 1, 26 | }, 27 | end: Position { 28 | line: 1, 29 | column: 2, 30 | } 31 | } 32 | } 33 | ``` 34 | 35 | Note: the EcmaScript spec allows for 4 new line characters, only two of which are normally rendered by modern text editors; the location line numbers will count these un-rendered lines. 
36 | 37 | Here is an example that will check some JS text for the existence of a semicolon and panics if one 38 | is found. 39 | 40 | ```rust 41 | use ress::Scanner; 42 | 43 | static JS: &str = include_str!("index.js"); 44 | 45 | fn main() { 46 | let s = Scanner::new(JS); 47 | for item in s { 48 | let token = item.unwrap().token; 49 | if token.matches_punct_str(";") { 50 | panic!("A semi-colon!? Heathen!"); 51 | } 52 | } 53 | println!("Good show! Why use something that's optional?") 54 | } 55 | ``` 56 | 57 | By far the most important part of `Item` is the `Token` enum, which will represent the 11 different types of tokens supported by the [ECMAScript specification](https://tc39.es/ecma262/#sec-ecmascript-language-lexical-grammar). 58 | 59 | In Javascript [it is hard to know if a forward slash means divide or is the start of a regular expression](https://github.com/rusty-ecma/RESS/blob/master/regex.md). 60 | The above `Scanner` will detect RegEx automatically by keeping track of the previously 61 | parsed tokens, this makes things very convenient, however if you are parsing Javascript 62 | into an AST, you likely already need to keep track of the same information. In that 63 | case, you may not want to pay the performance cost of that automatic RegEx detection, 64 | you would want to reach for the `ManualScanner`. Instead of exposing 65 | the basic `Iterator` interface, it exposes two primary methods for driving the scanner 66 | `next_token` and `next_regex`. The first of those will always return a `/` or `/=` when 67 | encountering a regular expression, the latter will fail if the next token isn't 68 | a regular expression. 
69 | 70 | ```rust 71 | use ress::{ManualScanner, prelude::*}; 72 | 73 | fn main() { 74 | let mut s = ManualScanner::new("let x = /[a-z]+/g"); 75 | while let Some(Ok(item)) = s.next_token() { 76 | if item.token.matches_punct(Punct::ForwardSlash) 77 | || item.token.matches_punct(Punct::ForwardSlashEqual) { 78 | // it could be a 1 or 2 length prefix 79 | let regex = s.next_regex(1).unwrap().unwrap(); 80 | println!("{:?}", regex); 81 | } else { 82 | println!("{:?}", item); 83 | } 84 | } 85 | } 86 | ``` 87 | 88 | ### ES Tokens 89 | 90 | - Boolean Literal 91 | - End of File 92 | - Identifier 93 | - Keyword 94 | - Null Literal 95 | - Numeric Literal 96 | - Punctuation 97 | - String Literal 98 | - Regular Expression Literal 99 | - Template String 100 | - Comment 101 | 102 | Keep in mind that keywords have been moving around a lot in JS between ES3 through ES2019 so you might find some items parsed as keywords in the ES2019 context that are not in the ES3 context, this should be dealt with at a higher level. A good example of this is `yield` which is sometimes a keyword and sometimes an identifier, this package will always parse this as a Keyword. As of the writing of this readme `ress` supports all tokens in the [Stage 2 and Stage 3 ECMAScript Proposals](https://github.com/tc39/proposals) with the exception of the `#!` comments and number separators. 103 | 104 | For each of the token cases there is either a struct or enum to provide additional information with the exception of `NullLiteral` and `EoF` which should be self-explanatory. The more complicated items do implement `ToString` which should get you back to the original js text for that token. The `Token` enum also provides a number of helper functions for building that picture without pulling the inner data out of the enum. Using the `Punct` case as an example the helper functions look like this. 
105 | 106 | ```rust 107 | fn is_punct(&self) -> bool; 108 | fn matches_punct(&self, p: Punct) -> bool; 109 | fn matches_punct_str(&self, s: &str) -> bool; 110 | ``` 111 | 112 | A similar set of functions are available for each case. 113 | 114 | Like all `Iterators` the `Scanner` has a `next` method. It also has a `look_ahead` method that will allow you to parse the next value without advancing. Using this method can be a convenient way to get the next token without performing a mutable borrow, however you will be incurring the cost of parsing that token twice. All `Iterators` can be converted into a `Peekable` Iterator with a `peek` method, this will allow you to look ahead while only paying the cost once however `peek` performs a mutable borrow which means it needs to be in a different scope than a call to `next`. 115 | 116 | ```rust 117 | // look_ahead 118 | let js = "function() { return; }"; 119 | let mut s = Scanner::new(js); 120 | let current = s.next(); 121 | let next = s.look_ahead(); 122 | let new_current = s.next(); 123 | assert_eq!(next, new_current); 124 | // peekable (fails to compile) 125 | let p = Scanner::new(js).peekable(); 126 | let current = s.next(); // <-- first mutable borrow 127 | let next = p.peek(); // <-- second mutable borrow 128 | ``` 129 | 130 | For more intense lookahead scenarios `Scanner` makes available the `get_state` and `set_state` methods. These methods will allow you to capture a snapshot of the current position and any context, and then later reset to that position and context. 
131 | 132 | ```rust 133 | let js = "function() { 134 | return 0; 135 | };"; 136 | let mut s = Scanner::new(js); 137 | let start = s.get_state(); 138 | assert_eq!(s.next().unwrap().unwrap().token, Token::Keyword(Keyword::Function)); 139 | assert_eq!(s.next().unwrap().unwrap().token, Token::Punct(Punct::OpenParen)); 140 | assert_eq!(s.next().unwrap().unwrap().token, Token::Punct(Punct::CloseParen)); 141 | s.set_state(start); 142 | assert_eq!(s.next().unwrap().unwrap().token, Token::Keyword(Keyword::Function)); 143 | ``` 144 | 145 | ## Why? 146 | 147 | Wouldn't it be nice to write new JS development tools in Rust? The [clear-comments](https://github.com/FreeMasen/RESS/blob/master/examples/clear-comments/src/main.rs) example is a proof of concept on how you might use this crate to do just that. This example will take in a JS file and output a version with all of the comments removed. An example of how you might see it in action is below (assuming you have a file called in.js in the project root). 148 | 149 | ```sh 150 | cargo run --example clear-comments -- ./in.js ./out.js 151 | ``` 152 | 153 | ## Performance 154 | 155 | The below stats are from running `cargo +nightly bench` on an MBP (2.9 GHz i9-8850H & 16gb RAM). 156 | 157 | | Lib | Size | Time | +/- | 158 | | ----------- | -------- | --------- | ---------- | 159 | | Angular 1.5 | 1.16mb | 18.991 ms | 4.393 ms | 160 | | jquery | 271.75kb | 7.218 ms | 577.236 μs | 161 | | React | 59.09kb | 1.976 ms | 116.139 μs | 162 | | React-dom | 641.51kb | 16.880 ms | 3.614 ms | 163 | | Vue | 289.30kb | 9.675 ms | 1.402 ms | 164 | 165 | If you are interested in getting an idea about performance without waiting for `cargo bench` to complete you can run the following command. 
166 | 167 | ```sh 168 | cargo run --example major_libs 169 | ``` 170 | 171 | ## Contributing 172 | 173 | [see contributing.md](https://github.com/FreeMasen/RESS/blob/master/CONTRIBUTING.md) 174 | -------------------------------------------------------------------------------- /assets/lookBehind.js: -------------------------------------------------------------------------------- 1 | var runLookBehindAnimation = (function () { 2 | /** 3 | * Flag to avoid running more 4 | * than once 5 | */ 6 | let running = false; 7 | /** 8 | * Set the arrow's fill to "black" at the 9 | * provided index 10 | * @param {number} current index 11 | */ 12 | function setBlack(idx) { 13 | if (idx < 0 || idx > 11) { 14 | return; 15 | } 16 | const arrow = document.getElementById(`index-${idx}`); 17 | arrow.style.fill = 'black'; 18 | } 19 | /** 20 | * Set the arrow's fill to "none" at the 21 | * provided index 22 | * @param {number} idx currentIndex 23 | */ 24 | function setNone(idx) { 25 | if (idx < 0 || idx > 11) { 26 | return; 27 | } 28 | const arrow = document.getElementById(`index-${idx}`); 29 | arrow.style.fill = 'none'; 30 | } 31 | /** 32 | * Perform the fill setting correctly 33 | * - the last 3 are "black" 34 | * - all others are "none" 35 | * @param {number} idx current token index 36 | */ 37 | function updateArrowColors(idx) { 38 | for (let i = 0; i < 11; i++) { 39 | if (i < idx - 2 || i > idx) { 40 | setNone(i); 41 | } else { 42 | setBlack(i); 43 | } 44 | } 45 | } 46 | /** 47 | * Set all arrow's fill to "none" 48 | */ 49 | function clearAll() { 50 | for (let i = 0; i < 11; i++) { 51 | setNone(i); 52 | } 53 | running = false; 54 | } 55 | /** 56 | * Perform one step in the animation 57 | * 58 | * Calling this once will start an async loop 59 | * for 10 counts finally clearing all arrows 60 | * @param {number} idx Current iteration count 61 | */ 62 | function oneTick(idx) { 63 | if (!idx) idx = 0; 64 | if (idx > 10) { 65 | return clearAll(); 66 | } 67 | updateArrowColors(idx); 68 | 
setTimeout(run, 1000, idx + 1) 69 | } 70 | /** 71 | * Exported member, starts the async loop 72 | * but checks if we are already running 73 | * and short-circuits if we are 74 | */ 75 | return function run() { 76 | if (running) { 77 | return; 78 | } 79 | running = true; 80 | oneTick(); 81 | } 82 | })(); -------------------------------------------------------------------------------- /benches/chars_vs_jsbuffer.rs: -------------------------------------------------------------------------------- 1 | #![cfg(test)] 2 | extern crate ress; 3 | 4 | #[macro_use] 5 | extern crate criterion; 6 | 7 | use criterion::black_box; 8 | use criterion::Criterion; 9 | 10 | fn ascii_string() -> String { 11 | string_from_range(0..256) 12 | } 13 | fn non_ascii_string() -> String { 14 | string_from_range(0x7FF..0x110000) 15 | } 16 | fn string_from_range(r: std::ops::Range) -> String { 17 | let mut ret = String::new(); 18 | for i in r { 19 | if let Some(ch) = std::char::from_u32(i) { 20 | ret.push(ch); 21 | } 22 | } 23 | ret 24 | } 25 | fn chars_ascii_chars(c: &mut Criterion) { 26 | let s = ascii_string(); 27 | chars(c, &s, "chars_ascii_chars"); 28 | } 29 | fn chars_non_ascii_chars(c: &mut Criterion) { 30 | let mut s = non_ascii_string(); 31 | chars(c, &s, "chars_non_ascii_chars"); 32 | } 33 | fn jsb_ascii_chars(c: &mut Criterion) { 34 | let s = ascii_string(); 35 | js_buffer(c, &s, "jsb_ascii_chars"); 36 | } 37 | fn jsb_non_ascii_chars(c: &mut Criterion) { 38 | let s = non_ascii_string(); 39 | js_buffer(c, &s, "jsb_non_ascii_chars") 40 | } 41 | fn chars(c: &mut Criterion, s: &str, name: &str) { 42 | c.bench_function(name, |b| { 43 | b.iter(|| { 44 | let mut chs = s.chars(); 45 | while let Some(ch) = chs.next() { 46 | black_box(ch); 47 | } 48 | }); 49 | }); 50 | } 51 | fn js_buffer(c: &mut Criterion, s: &str, name: &str) { 52 | c.bench_function(name, |b| { 53 | b.iter(|| { 54 | let mut chs = ress::JSBuffer::new(s.as_bytes()); 55 | while let Some(ch) = chs.next_char() { 56 | black_box(ch); 
57 | } 58 | }); 59 | }); 60 | } 61 | 62 | criterion_group!( 63 | benches, 64 | chars_ascii_chars, 65 | chars_non_ascii_chars, 66 | jsb_ascii_chars, 67 | jsb_non_ascii_chars, 68 | ); 69 | criterion_main!(benches); 70 | -------------------------------------------------------------------------------- /benches/major_libs.rs: -------------------------------------------------------------------------------- 1 | #![cfg(test)] 2 | extern crate ress; 3 | 4 | #[macro_use] 5 | extern crate criterion; 6 | 7 | use criterion::black_box; 8 | use criterion::Criterion; 9 | 10 | use ress::Scanner; 11 | use std::fs::read_to_string; 12 | use std::path::PathBuf; 13 | 14 | fn angular(c: &mut Criterion) { 15 | run_bench(c, Lib::Angular, "angular", false); 16 | } 17 | 18 | fn angular_min(c: &mut Criterion) { 19 | run_bench(c, Lib::Angular, "angular_min", true); 20 | } 21 | 22 | fn jq(c: &mut Criterion) { 23 | run_bench(c, Lib::Jquery, "jq", false); 24 | } 25 | 26 | fn jq_min(c: &mut Criterion) { 27 | run_bench(c, Lib::Jquery, "jq_min", true); 28 | } 29 | 30 | fn react(c: &mut Criterion) { 31 | run_bench(c, Lib::React, "react", false); 32 | } 33 | 34 | fn react_min(c: &mut Criterion) { 35 | run_bench(c, Lib::React, "react_min", true); 36 | } 37 | 38 | fn react_dom(c: &mut Criterion) { 39 | run_bench(c, Lib::ReactDom, "react_dom", false); 40 | } 41 | 42 | fn react_dom_min(c: &mut Criterion) { 43 | run_bench(c, Lib::ReactDom, "react_dom_min", true); 44 | } 45 | 46 | fn vue(c: &mut Criterion) { 47 | run_bench(c, Lib::Vue, "vue", false); 48 | } 49 | 50 | fn vue_min(c: &mut Criterion) { 51 | run_bench(c, Lib::Vue, "vue_min", true); 52 | } 53 | 54 | fn everything_es5(c: &mut Criterion) { 55 | run_bench(c, Lib::EveryEs5, "everything_es5", false); 56 | } 57 | 58 | fn everything_es2015_s(c: &mut Criterion) { 59 | run_bench(c, Lib::EveryEs2015Script, "everything_es2015_s", false); 60 | } 61 | 62 | fn everything_es2015_m(c: &mut Criterion) { 63 | run_bench(c, Lib::EveryEs2015Mod, 
"everything_es2015_m", false); 64 | } 65 | 66 | enum Lib { 67 | Jquery, 68 | Angular, 69 | React, 70 | ReactDom, 71 | Vue, 72 | EveryEs5, 73 | EveryEs2015Script, 74 | EveryEs2015Mod, 75 | } 76 | 77 | fn get_js(l: Lib) -> Result { 78 | let path = PathBuf::from(l.path()); 79 | if !path.exists() { 80 | npm_install(); 81 | if !path.exists() { 82 | panic!("npm install failed to make {} available", path.display()); 83 | } 84 | } 85 | read_to_string(path) 86 | } 87 | 88 | fn get_min_js(l: Lib) -> Result { 89 | let path = PathBuf::from(l.min_path()); 90 | if !path.exists() { 91 | npm_install(); 92 | if !path.exists() { 93 | panic!("npm install failed to make {} available", path.display()); 94 | } 95 | } 96 | read_to_string(path) 97 | } 98 | 99 | impl Lib { 100 | pub fn path(&self) -> String { 101 | match self { 102 | Lib::Jquery => "node_modules/jquery/dist/jquery.js", 103 | Lib::Angular => "node_modules/angular/angular.js", 104 | Lib::React => "node_modules/react/umd/react.development.js", 105 | Lib::ReactDom => "node_modules/react-dom/umd/react-dom.development.js", 106 | Lib::Vue => "node_modules/vue/dist/vue.js", 107 | Lib::EveryEs5 => "node_modules/everything.js/es5.js", 108 | Lib::EveryEs2015Script => "node_modules/everything.js/es2015-script.js", 109 | Lib::EveryEs2015Mod => "node_modules/everything.js/es2015-module.js", 110 | } 111 | .into() 112 | } 113 | 114 | pub fn min_path(&self) -> String { 115 | match self { 116 | &Lib::Jquery => "node_modules/jquery/dist/jquery.min.js".into(), 117 | &Lib::Angular => "node_modules/angular/angular.min.js".into(), 118 | &Lib::React => "node_modules/react/umd/react.production.min.js".into(), 119 | &Lib::ReactDom => "node_modules/react-dom/umd/react-dom.production.min.js".into(), 120 | &Lib::Vue => "node_modules/vue/dist/vue.min.js".into(), 121 | _ => String::new(), 122 | } 123 | } 124 | } 125 | 126 | fn npm_install() { 127 | eprintln!("Downloading required js dependencies"); 128 | let mut c = ::std::process::Command::new("npm"); 
129 | c.arg("i"); 130 | let out = c.output().expect("Failed to read output from npm"); 131 | if !out.status.success() { 132 | panic!( 133 | "{}", 134 | format!( 135 | "Failed to run npm i\n{:?}", 136 | String::from_utf8_lossy(&out.stderr) 137 | ) 138 | ); 139 | } 140 | } 141 | 142 | #[inline(always)] 143 | fn run_bench(c: &mut Criterion, lib: Lib, name: &str, min: bool) { 144 | let js = if min { 145 | get_min_js(lib).unwrap() 146 | } else { 147 | get_js(lib).unwrap() 148 | }; 149 | run_bench_(c, &js, name) 150 | } 151 | 152 | #[inline(always)] 153 | fn run_bench_(c: &mut Criterion, js: &str, name: &str) { 154 | let mut group = c.benchmark_group(name); 155 | group.throughput(criterion::Throughput::Bytes(js.len() as u64)); 156 | group.bench_function(name, |b| { 157 | b.iter(|| { 158 | for i in Scanner::new(&js) { 159 | black_box(i.unwrap()); 160 | } 161 | }) 162 | }); 163 | group.finish(); 164 | } 165 | 166 | criterion_group!( 167 | benches, 168 | angular, 169 | angular_min, 170 | jq, 171 | jq_min, 172 | react, 173 | react_min, 174 | react_dom, 175 | react_dom_min, 176 | vue, 177 | vue_min, 178 | everything_es5, 179 | everything_es2015_s, 180 | everything_es2015_m 181 | ); 182 | criterion_main!(benches); 183 | -------------------------------------------------------------------------------- /benches/ref_perf_vs.rs: -------------------------------------------------------------------------------- 1 | #![cfg(test)] 2 | extern crate ress; 3 | #[macro_use] 4 | extern crate lazy_static; 5 | #[macro_use] 6 | extern crate criterion; 7 | 8 | use criterion::{black_box, Criterion}; 9 | use ress::{Scanner, Tokenizer}; 10 | 11 | static KEYWORDS: &[&str] = &[ 12 | "implements", 13 | "interface", 14 | "package", 15 | "private", 16 | "protected", 17 | "public", 18 | "static", 19 | "yield", 20 | "let", 21 | "enum", 22 | "export", 23 | "import", 24 | "super", 25 | "break", 26 | "case", 27 | "catch", 28 | "continue", 29 | "debugger", 30 | "default", 31 | "delete", 32 | "do", 33 | 
"else", 34 | "finally", 35 | "for", 36 | "function", 37 | "if", 38 | "instanceof", 39 | "in", 40 | "new", 41 | "return", 42 | "switch", 43 | "this", 44 | "throw", 45 | "try", 46 | "typeof", 47 | "var", 48 | "void", 49 | "while", 50 | "with", 51 | ]; 52 | static PUNCTS: &[&str] = &[ 53 | "{", "}", "(", ")", ".", ";", ",", "[", "]", ":", "?", "~", ">", "<", "=", "!", "+", "-", "/", 54 | "*", "%", "&", "|", "^", "#", "@", ">>>=", "...", "===", "!==", ">>>", "<<=", ">>=", "**=", 55 | "&&", "||", "==", "!=", "+=", "-=", "*=", "/=", "++", "--", "<<", ">>", "&=", "|=", "^=", "%=", 56 | "<=", ">=", "=>", "**", 57 | ]; 58 | 59 | static STRINGS: &[&str] = &[ 60 | r#""things and stuff""#, 61 | r#"'people and places'"#, 62 | r#""with and escaped \"""#, 63 | r#"'another escaped \''"#, 64 | r#""with a new \ 65 | line""#, 66 | r#"'another new line \ 67 | hahaha'"#, 68 | "\"sequence double quoted\\\r\nis hard\"", 69 | "'new line sequence\\\r\nmight be harder'", 70 | ]; 71 | 72 | static COMMENTS: &[&str] = &[ 73 | "//this is a comment", 74 | "/*this is a 75 | multi-line comment*/", 76 | "", 77 | " with a trailer", 78 | ]; 79 | 80 | static NUMBERS: &[&str] = &[ 81 | "0", 82 | "00", 83 | "1234567890", 84 | "01234567", 85 | "0.", 86 | "0.00", 87 | "10.00", 88 | ".0", 89 | "0e0", 90 | "0E0", 91 | "0.e0", 92 | "0.00e+0", 93 | ".00e-0", 94 | "0x0", 95 | "0X0", 96 | "0x0123456789abcdefABCDEF", 97 | "0b0", 98 | "0b0100101", 99 | "0o0", 100 | "0o01234567", 101 | "2e308", 102 | ]; 103 | static REGEX: &[&str] = &[ 104 | r#"x/"#, 105 | r#"|/"#, 106 | r#"|||/"#, 107 | r#"^$\b\B/"#, 108 | r#"(?=(?!(?:(.))))/"#, 109 | r#"a.\f\n\r\t\v\0\[\-\/\\\x00\u0000/"#, 110 | r#"\d\D\s\S\w\W/"#, 111 | r#"\ca\cb\cc\cd\ce\cf\cg\ch\ci\cj\ck\cl\cm\cn\co\cp\cq\cr\cs\ct\cu\cv\cw\cx\cy\cz/"#, 112 | r#"\cA\cB\cC\cD\cE\cF\cG\cH\cI\cJ\cK\cL\cM\cN\cO\cP\cQ\cR\cS\cT\cU\cV\cW\cX\cY\cZ/"#, 113 | r#"[a-z-]/"#, 114 | r#"[^\b\-^]/"#, 115 | r#"[/\]\\]/"#, 116 | r#"./i"#, 117 | r#"./g"#, 118 | r#"./m"#, 119 | r#"./igm"#, 120 | 
r#".*/"#, 121 | r#".*?/"#, 122 | r#".+/"#, 123 | r#".+?/"#, 124 | r#".?/"#, 125 | r#".??/"#, 126 | r#".{0}/"#, 127 | r#".{0,}/"#, 128 | r#".{0,0}/"#, 129 | ]; 130 | 131 | static TEMPLATE_STARTS: &[&str] = &[ 132 | "`things and stuff times ${", 133 | "`things and stuff`", 134 | r#"`a\${b`"#, 135 | r#"`\0\n\x0A\u000A\u{A}${"#, 136 | ]; 137 | 138 | static TEMPLATE_CONTINUATIONS: &[&str] = &[ 139 | "`${} and animals and minerals`", 140 | "`${}`", 141 | "`${} and animals and minerals`", 142 | "`${} and places and people ${", 143 | ]; 144 | 145 | static IDENTS: &[&str] = &[ 146 | r#"$"#, 147 | r#"_"#, 148 | r#"\u0078"#, 149 | r#"x$"#, 150 | r#"x_"#, 151 | r#"x\u0030"#, 152 | r#"xa"#, 153 | r#"x0"#, 154 | r#"x0a"#, 155 | r#"x0123456789"#, 156 | r#"qwertyuiopasdfghjklzxcvbnm"#, 157 | r#"QWERTYUIOPASDFGHJKLZXCVBNM"#, 158 | r#"œ一"#, 159 | r#"ǻ둘"#, 160 | r#"ɤ〩"#, 161 | r#"φ"#, 162 | r#"fiⅷ"#, 163 | r#"ユニコード"#, 164 | r#"x‌‍"#, 165 | ]; 166 | 167 | static BOOLS: &[&str] = &["true", "false"]; 168 | 169 | static NULL: &[&str] = &["null"]; 170 | 171 | lazy_static! 
{ 172 | static ref TOKENS: Vec<&'static str> = COMMENTS 173 | .into_iter() 174 | .chain(KEYWORDS.into_iter()) 175 | .chain(NUMBERS.into_iter()) 176 | .chain(PUNCTS.into_iter()) 177 | .chain(IDENTS.into_iter()) 178 | .chain(BOOLS.into_iter()) 179 | .chain(NULL.into_iter()) 180 | .chain(TEMPLATE_STARTS.into_iter()) 181 | .map(|s| *s) 182 | .collect(); 183 | static ref JS: String = TOKENS.join("\n"); 184 | } 185 | 186 | fn keywords(c: &mut Criterion) { 187 | c.bench_function("keywords", |b| { 188 | b.iter(|| { 189 | for key in KEYWORDS { 190 | black_box(Tokenizer::new(key).next(true).unwrap()); 191 | } 192 | }) 193 | }); 194 | } 195 | 196 | fn punct(c: &mut Criterion) { 197 | c.bench_function("punct", |b| { 198 | b.iter(|| { 199 | for punct in PUNCTS { 200 | black_box(Tokenizer::new(punct).next(true).unwrap()); 201 | } 202 | }) 203 | }); 204 | } 205 | 206 | fn strings(c: &mut Criterion) { 207 | c.bench_function("strings", |b| { 208 | b.iter(|| { 209 | for s in STRINGS { 210 | black_box(Tokenizer::new(s).next(true).unwrap()); 211 | } 212 | }) 213 | }); 214 | } 215 | 216 | fn comments(c: &mut Criterion) { 217 | c.bench_function("comments", |b| { 218 | b.iter(|| { 219 | for c in COMMENTS { 220 | black_box(Tokenizer::new(c).next(true).unwrap()); 221 | } 222 | }) 223 | }); 224 | } 225 | 226 | fn numbers(c: &mut Criterion) { 227 | c.bench_function("numbers", |b| { 228 | b.iter(|| { 229 | for n in NUMBERS { 230 | black_box(Tokenizer::new(n).next(true).unwrap()); 231 | } 232 | }) 233 | }); 234 | } 235 | 236 | fn regex(c: &mut Criterion) { 237 | c.bench_function("regex", |b| { 238 | b.iter(|| { 239 | for r in REGEX { 240 | black_box(Tokenizer::new(r).next_regex(1).unwrap()); 241 | } 242 | }) 243 | }); 244 | } 245 | 246 | fn templates(c: &mut Criterion) { 247 | c.bench_function("TEMPLATE_CONTINUATIONS", |b| { 248 | b.iter(|| { 249 | for s in TEMPLATE_CONTINUATIONS { 250 | let mut t = Tokenizer::new(&s); 251 | let _ = t.next(true).unwrap(); 252 | 
black_box(t.next(true).unwrap()); 253 | } 254 | }) 255 | }); 256 | c.bench_function("TEMPLATE_STARTS", |b| { 257 | b.iter(|| { 258 | for s in TEMPLATE_STARTS { 259 | black_box(Tokenizer::new(s).next(true).unwrap()); 260 | } 261 | }) 262 | }); 263 | } 264 | 265 | fn bools(c: &mut Criterion) { 266 | c.bench_function("bools", |b| { 267 | b.iter(|| { 268 | for b in BOOLS { 269 | black_box(Tokenizer::new(b).next(true).unwrap()); 270 | } 271 | }) 272 | }); 273 | } 274 | 275 | fn null(c: &mut Criterion) { 276 | c.bench_function("null", |b| { 277 | b.iter(|| { 278 | for b in NULL { 279 | black_box(Tokenizer::new(b).next(true).unwrap()); 280 | } 281 | }) 282 | }); 283 | } 284 | 285 | fn idents(c: &mut Criterion) { 286 | c.bench_function("idents", |b| { 287 | b.iter(|| { 288 | for i in IDENTS { 289 | black_box(Tokenizer::new(i).next(true).unwrap()); 290 | } 291 | }) 292 | }); 293 | } 294 | 295 | pub fn token(c: &mut Criterion) { 296 | c.bench_function("token", |b| { 297 | b.iter(|| { 298 | for s in TOKENS.iter() { 299 | black_box(Tokenizer::new(s).next(true).unwrap()); 300 | } 301 | }) 302 | }); 303 | } 304 | 305 | fn scanner(c: &mut Criterion) { 306 | c.bench_function("scanner", |b| { 307 | b.iter(|| { 308 | let s = Scanner::new(&JS); 309 | black_box(s.collect::>()) 310 | }) 311 | }); 312 | } 313 | 314 | criterion_group!( 315 | benches, punct, keywords, idents, strings, comments, numbers, regex, templates, bools, null, 316 | token, scanner 317 | ); 318 | criterion_main!(benches); 319 | -------------------------------------------------------------------------------- /code_of_conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity 
and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 8 | 9 | ## Our Standards 10 | 11 | Examples of behavior that contributes to a positive environment for our community include: 12 | 13 | * Demonstrating empathy and kindness toward other people 14 | * Being respectful of differing opinions, viewpoints, and experiences 15 | * Giving and gracefully accepting constructive feedback 16 | * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience 17 | * Focusing on what is best not just for us as individuals, but for the overall community 18 | 19 | Examples of unacceptable behavior include: 20 | 21 | * The use of sexualized language or imagery, and sexual attention or 22 | advances of any kind 23 | * Trolling, insulting or derogatory comments, and personal or political attacks 24 | * Public or private harassment 25 | * Publishing others' private information, such as a physical or email 26 | address, without their explicit permission 27 | * Other conduct which could reasonably be considered inappropriate in a 28 | professional setting 29 | 30 | ## Enforcement Responsibilities 31 | 32 | Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. 33 | 34 | Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. 
35 | 36 | ## Scope 37 | 38 | This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 39 | 40 | ## Enforcement 41 | 42 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at [INSERT CONTACT METHOD]. All complaints will be reviewed and investigated promptly and fairly. 43 | 44 | All community leaders are obligated to respect the privacy and security of the reporter of any incident. 45 | 46 | ## Enforcement Guidelines 47 | 48 | Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: 49 | 50 | ### 1. Correction 51 | 52 | **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. 53 | 54 | **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. 55 | 56 | ### 2. Warning 57 | 58 | **Community Impact**: A violation through a single incident or series of actions. 59 | 60 | **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. 61 | 62 | ### 3. 
Temporary Ban 63 | 64 | **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. 65 | 66 | **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. 67 | 68 | ### 4. Permanent Ban 69 | 70 | **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. 71 | 72 | **Consequence**: A permanent ban from any sort of public interaction within the project community. 73 | 74 | ## Attribution 75 | 76 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, 77 | available at https://www.contributor-covenant.org/version/2/0/code-of-conduct.html. 78 | 79 | Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). 80 | 81 | [homepage]: https://www.contributor-covenant.org 82 | 83 | For answers to common questions about this code of conduct, see the FAQ at 84 | https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 
85 | 86 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | informational: true 6 | patch: 7 | default: 8 | enabled: no 9 | if_not_found: success 10 | -------------------------------------------------------------------------------- /examples/clear-comments/src/main.rs: -------------------------------------------------------------------------------- 1 | //! This example is a quick and dirty example of 2 | //! what someone might want to do with a JS token stream. 3 | //! Essentially this is reading in the file and writing it out 4 | //! with no comments. It successfully stripped all of the comments 5 | //! out of a webpack output file though it cannot handle object literals 6 | //! very well. It does a pretty good job of showing how you might use the Scanner. 7 | extern crate docopt; 8 | extern crate ress; 9 | extern crate serde; 10 | #[macro_use] 11 | extern crate serde_derive; 12 | 13 | use std::{ 14 | fs::{read_to_string, File}, 15 | io::{BufWriter, Write}, 16 | path::PathBuf, 17 | string::ToString, 18 | }; 19 | 20 | use docopt::Docopt; 21 | 22 | use ress::prelude::*; 23 | type RefToken<'a> = Token<&'a str>; 24 | 25 | const USAGE: &str = " 26 | clear-comments 27 | 28 | Usage: 29 | clear-comments 30 | "; 31 | 32 | fn main() { 33 | let opts: Opts = Docopt::new(USAGE) 34 | .and_then(|d| d.deserialize()) 35 | .unwrap_or_else(|e| { 36 | println!("error: {:?}", e); 37 | e.exit() 38 | }); 39 | let js = if let Ok(s) = read_to_string(opts.arg_in_path) { 40 | s 41 | } else { 42 | eprintln!("Unable to read in-path"); 43 | ::std::process::exit(1); 44 | }; 45 | let s = Scanner::new(&js); 46 | let mut indent = 0; 47 | let f = File::create(&opts.arg_out_path).expect("Error opening outfile"); 48 | let mut out = BufWriter::new(f); 49 | let mut last_token = Token::EoF; 50 | let mut new_line = false; 51 
| let mut in_loop = false; 52 | let mut in_case = false; 53 | let mut in_if = false; 54 | let mut if_parens = 0; 55 | let mut unbraced_if = false; 56 | for item in s { 57 | let item = item.unwrap(); 58 | println!("{:?}", item); 59 | let token = item.token; 60 | if token.matches_keyword(Keyword::If(())) { 61 | in_if = true; 62 | } 63 | if in_if && token.matches_punct(Punct::OpenParen) { 64 | if_parens += 1; 65 | } 66 | if in_if && token.matches_punct(Punct::CloseParen) { 67 | if_parens -= 1; 68 | } 69 | if last_token.matches_keyword(Keyword::For(())) { 70 | in_loop = true; 71 | } 72 | if last_token.matches_keyword(Keyword::Case(())) 73 | || last_token.matches_keyword(Keyword::Default(())) 74 | { 75 | in_case = true; 76 | } 77 | if last_token.matches_punct(Punct::Colon) && in_case { 78 | new_line = true; 79 | } 80 | if in_loop && last_token.matches_punct(Punct::CloseParen) { 81 | in_loop = false; 82 | } 83 | if token.is_comment() { 84 | continue; 85 | } 86 | if last_token.matches_punct(Punct::OpenBrace) { 87 | indent += 1; 88 | new_line = true; 89 | } 90 | if in_if 91 | && if_parens == 0 92 | && last_token.matches_punct(Punct::CloseParen) 93 | && !token.is_punct() 94 | { 95 | unbraced_if = true; 96 | new_line = true; 97 | indent += 1; 98 | } 99 | if last_token.matches_punct(Punct::CloseParen) && !token.is_punct() { 100 | new_line = true; 101 | } 102 | if last_token.matches_punct(Punct::SemiColon) && !in_loop { 103 | new_line = true; 104 | } 105 | if last_token.matches_punct(Punct::CloseBrace) && !token.is_punct() { 106 | new_line = true; 107 | } 108 | if token.matches_punct(Punct::CloseBrace) { 109 | indent -= 1; 110 | new_line = !last_token.matches_punct(Punct::OpenBrace); 111 | } 112 | if last_token.is_comment() { 113 | new_line = true; 114 | } 115 | if new_line { 116 | out.write_all(format!("\n{}", " ".repeat(indent)).as_bytes()) 117 | .expect("error writing indent"); 118 | new_line = false; 119 | in_if = false; 120 | if_parens = 0; 121 | if unbraced_if { 122 | 
/// Decides whether a space should be emitted between `last_token` and `token`
/// when re-printing the comment-stripped source.
///
/// NOTE: this is an ordered chain of guard clauses — the FIRST matching rule
/// wins, and several puncts (e.g. `Punct::Colon`) appear in more than one
/// rule with opposite answers, so the statement order is load-bearing.
/// Returns `false` (no space) when no rule matches.
fn space_before(last_token: &RefToken, token: &RefToken) -> bool {
    if last_token.matches_punct(Punct::Equal) || token.matches_punct(Punct::DoubleEqual) {
        return true;
    }
    // No space around member access: `obj.prop` / `obj.this`.
    if last_token.matches_punct(Punct::Period)
        && (token.is_ident() || token.matches_keyword(Keyword::This(())))
    {
        return false;
    }
    if (last_token.is_ident() || last_token.matches_keyword(Keyword::This(())))
        && token.matches_punct(Punct::Period)
    {
        return false;
    }
    if token.matches_keyword(Keyword::If(())) {
        return false;
    }
    if last_token.matches_keyword(Keyword::If(())) {
        return true;
    }
    // `return x` needs a space, `return;` does not.
    if last_token.matches_keyword(Keyword::Return(())) && !token.is_punct() {
        return true;
    }
    if last_token.matches_keyword(Keyword::For(())) {
        return true;
    }
    if last_token.matches_keyword(Keyword::Switch(())) {
        return true;
    }
    // A colon as the PREVIOUS token gets a trailing space (object literals,
    // case labels); a colon as the NEXT token is handled further down and
    // gets none — order between these two rules matters.
    if last_token.matches_punct(Punct::Colon) {
        return true;
    }
    if token.matches_keyword(Keyword::This(())) {
        return false;
    }
    if token.matches_punct(Punct::OpenParen) {
        return false;
    }
    if token.matches_punct(Punct::CloseParen) {
        return false;
    }
    if token.matches_punct(Punct::CloseBracket) {
        return false;
    }
    if token.matches_punct(Punct::OpenBracket) {
        return false;
    }
    if token.matches_punct(Punct::CloseBrace) {
        return false;
    }
    if last_token.matches_punct(Punct::OpenBrace) {
        return false;
    }
    if last_token.matches_punct(Punct::CloseBrace) {
        return false;
    }
    // `) {` — e.g. between a condition and its block.
    if last_token.matches_punct(Punct::CloseParen) && token.matches_punct(Punct::OpenBrace) {
        return true;
    }
    if last_token.matches_punct(Punct::OpenBracket) {
        return false;
    }
    if last_token.matches_punct(Punct::OpenParen) {
        return false;
    }
    if token.matches_punct(Punct::SemiColon) {
        return false;
    }
    if token.matches_punct(Punct::Period) {
        return false;
    }
    if last_token.matches_punct(Punct::Period) {
        return false;
    }
    if token.matches_punct(Punct::Comma) {
        return false;
    }
    if token.matches_punct(Punct::Colon) {
        return false;
    }
    if last_token.matches_punct(Punct::Bang) {
        return false;
    }
    if last_token.matches_punct(Punct::Comma) {
        return true;
    }
    if token.matches_punct(Punct::Bang) {
        return false;
    }
    if last_token.matches_keyword(Keyword::Function(())) && token.matches_punct(Punct::OpenBrace) {
        return false;
    }
    // `for (x in y)` / `for (x of y)` need spaces on both sides of the
    // in/of keyword; `of` is contextual so it arrives as an ident.
    if last_token.matches_keyword(Keyword::In(()))
        || last_token.matches_ident_str("of")
        || last_token.matches_keyword(Keyword::For(()))
    {
        return true;
    }
    if token.matches_keyword(Keyword::In(())) || token.matches_ident_str("of") {
        return true;
    }
    if last_token.is_keyword() {
        return true;
    }
    if last_token.matches_punct(Punct::SemiColon) {
        return false;
    }
    if token.is_punct() || last_token.is_punct() {
        return true;
    }
    false
}
key.to_string(), 261 | Token::Null => "null".to_string(), 262 | Token::Number(ref number) => number.to_string(), 263 | Token::Punct(ref p) => p.to_string(), 264 | Token::RegEx(ref regex) => match regex.flags { 265 | Some(ref f) => format!("/{}/{}", regex.body, f), 266 | None => format!("/{}/", regex.body), 267 | }, 268 | Token::String(ref s) => s.to_string(), 269 | _ => String::new(), 270 | } 271 | } 272 | 273 | #[derive(Deserialize)] 274 | struct Opts { 275 | arg_in_path: PathBuf, 276 | arg_out_path: PathBuf, 277 | } 278 | -------------------------------------------------------------------------------- /examples/count_tokens.rs: -------------------------------------------------------------------------------- 1 | use docopt::Docopt; 2 | use ress::prelude::*; 3 | #[macro_use] 4 | extern crate serde_derive; 5 | 6 | use std::{collections::HashMap, fs::read_to_string, path::PathBuf}; 7 | 8 | static USAGE: &str = " 9 | count-tokens 10 | 11 | Usage: 12 | count-tokens 13 | "; 14 | 15 | #[derive(Deserialize)] 16 | struct Opts { 17 | arg_in_path: PathBuf, 18 | } 19 | 20 | fn main() { 21 | let _ = pretty_env_logger::try_init(); 22 | let opts: Opts = Docopt::new(USAGE) 23 | .and_then(|d| d.deserialize()) 24 | .unwrap_or_else(|e| { 25 | println!("error: {:?}", e); 26 | e.exit() 27 | }); 28 | let js = read_to_string(opts.arg_in_path).expect("Failed to read file"); 29 | let mut counts = get_initial_counts(); 30 | let mut total = 0; 31 | 32 | for maybe in Scanner::new(&js) { 33 | let item = maybe.expect("failed to scan token"); 34 | let key = token_type_str(&item.token); 35 | counts.entry(key).and_modify(|c| *c += 1); 36 | total += 1; 37 | } 38 | for (key, value) in counts { 39 | println!("{}: {}", key, value); 40 | } 41 | println!("total: {}", total); 42 | } 43 | 44 | fn token_type_str(tok: &Token<&str>) -> &'static str { 45 | match tok { 46 | Token::Null => "null", 47 | Token::Boolean(_) => "bool", 48 | Token::Ident(_) => "ident", 49 | Token::Number(_) => "number", 50 | 
Token::String(_) => "string", 51 | Token::Keyword(_) => "keyword", 52 | Token::Punct(_) => "punct", 53 | Token::RegEx(_) => "regex", 54 | Token::Template(_) => "template", 55 | Token::Comment(_) => "comment", 56 | Token::EoF => "eof", 57 | } 58 | } 59 | 60 | fn get_initial_counts() -> HashMap<&'static str, usize> { 61 | let mut counts = HashMap::new(); 62 | counts.insert("regex", 0); 63 | counts.insert("ident", 0); 64 | counts.insert("template", 0); 65 | counts.insert("bool", 0); 66 | counts.insert("string", 0); 67 | counts.insert("number", 0); 68 | counts.insert("keyword", 0); 69 | counts.insert("punct", 0); 70 | counts.insert("comment", 0); 71 | counts.insert("null", 0); 72 | counts.insert("eof", 0); 73 | counts 74 | } 75 | -------------------------------------------------------------------------------- /examples/find_regexes.rs: -------------------------------------------------------------------------------- 1 | use ress::prelude::*; 2 | use walkdir::WalkDir; 3 | 4 | use std::{env::args, fs::read_to_string}; 5 | 6 | fn main() { 7 | let mut args = args(); 8 | let _ = args.next(); 9 | let start = args 10 | .next() 11 | .expect("No directory provided as starting location."); 12 | println!("static REGEXES: &[&str] = &["); 13 | let mut set = std::collections::HashSet::new(); 14 | for path in WalkDir::new(start) { 15 | if let Ok(entry) = path { 16 | let path = entry.path(); 17 | if path.is_file() { 18 | if let Some(ext) = path.extension() { 19 | if ext == "js" { 20 | if let Ok(js) = read_to_string(path) { 21 | let s = Scanner::new(&js); 22 | for item in s { 23 | if let Ok(item) = item { 24 | if item.token.is_regex() { 25 | let s = js[item.span.start..item.span.end].to_string(); 26 | if set.insert(s) { 27 | println!( 28 | " r#\"{}\"#,", 29 | &js[item.span.start..item.span.end] 30 | ); 31 | } 32 | } 33 | } 34 | } 35 | } 36 | } 37 | } 38 | } 39 | } 40 | } 41 | println!("];"); 42 | } 43 | -------------------------------------------------------------------------------- 
/examples/instruments/bools.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::forget_non_drop)] 2 | // This example exists to allow for profiling 3 | // applications to provide details about 4 | // the criterion benchmarks 5 | use ress::Tokenizer; 6 | 7 | fn main() { 8 | for _ in 0..1000 { 9 | let t = Tokenizer::new("true").next(true).unwrap(); 10 | core::mem::forget(t); 11 | let f = Tokenizer::new("false").next(true).unwrap(); 12 | core::mem::forget(f); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /examples/instruments/comments.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::forget_non_drop)] 2 | // This example exists to allow for profiling 3 | // applications to provide details about 4 | // the criterion benchmarks 5 | use ress::Tokenizer; 6 | 7 | static COMMENTS: &[&str] = &[ 8 | "//this is a comment", 9 | "/*this is a 10 | multi-line comment*/", 11 | "", 12 | " with a trailer", 13 | ]; 14 | 15 | fn main() { 16 | for _ in 0..1000 { 17 | for c in COMMENTS { 18 | let d = Tokenizer::new(c).next(true).unwrap(); 19 | core::mem::forget(d); 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /examples/instruments/idents.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::forget_non_drop)] 2 | // This example exists to allow for profiling 3 | // applications to provide details about 4 | // the criterion benchmarks 5 | use ress::Tokenizer; 6 | 7 | static IDENTS: &[&str] = &[ 8 | r#"$"#, 9 | r#"_"#, 10 | r#"\u0078"#, 11 | r#"x$"#, 12 | r#"x_"#, 13 | r#"x\u0030"#, 14 | r#"xa"#, 15 | r#"x0"#, 16 | r#"x0a"#, 17 | r#"x0123456789"#, 18 | r#"qwertyuiopasdfghjklzxcvbnm"#, 19 | r#"QWERTYUIOPASDFGHJKLZXCVBNM"#, 20 | r#"œ一"#, 21 | r#"ǻ둘"#, 22 | r#"ɤ〩"#, 23 | r#"φ"#, 24 | r#"fiⅷ"#, 25 | r#"ユニコード"#, 26 | 
r#"x‌‍"#, 27 | ]; 28 | 29 | fn main() { 30 | for _ in 0..1000 { 31 | for i in IDENTS { 32 | let d = Tokenizer::new(i).next(true).unwrap(); 33 | core::mem::forget(d); 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /examples/instruments/keywords.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::forget_non_drop)] 2 | // This example exists to allow for profiling 3 | // applications to provide details about 4 | // the criterion benchmarks 5 | use ress::Tokenizer; 6 | 7 | static KEYWORDS: &[&str] = &[ 8 | "implements", 9 | "interface", 10 | "package", 11 | "private", 12 | "protected", 13 | "public", 14 | "static", 15 | "yield", 16 | "let", 17 | "enum", 18 | "export", 19 | "import", 20 | "super", 21 | "break", 22 | "case", 23 | "catch", 24 | "continue", 25 | "debugger", 26 | "default", 27 | "delete", 28 | "do", 29 | "else", 30 | "finally", 31 | "for", 32 | "function", 33 | "if", 34 | "instanceof", 35 | "in", 36 | "new", 37 | "return", 38 | "switch", 39 | "this", 40 | "throw", 41 | "try", 42 | "typeof", 43 | "var", 44 | "void", 45 | "while", 46 | "with", 47 | ]; 48 | 49 | fn main() { 50 | for _ in 0..1000 { 51 | for key in KEYWORDS { 52 | let d = Tokenizer::new(key).next(true).unwrap(); 53 | core::mem::forget(d); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /examples/instruments/null.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::forget_non_drop)] 2 | // This example exists to allow for profiling 3 | // applications to provide details about 4 | // the criterion benchmarks 5 | use ress::Tokenizer; 6 | 7 | fn main() { 8 | for _ in 0..1000 { 9 | let null = Tokenizer::new("null").next(true).unwrap(); 10 | core::mem::forget(null); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- 
/examples/instruments/numbers.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::forget_non_drop)] 2 | // This example exists to allow for profiling 3 | // applications to provide details about 4 | // the criterion benchmarks 5 | use ress::Tokenizer; 6 | static NUMBERS: &[&str] = &[ 7 | "0", 8 | "00", 9 | "1234567890", 10 | "01234567", 11 | "0.", 12 | "0.00", 13 | "10.00", 14 | ".0", 15 | "0e0", 16 | "0E0", 17 | "0.e0", 18 | "0.00e+0", 19 | ".00e-0", 20 | "0x0", 21 | "0X0", 22 | "0x0123456789abcdefABCDEF", 23 | "0b0", 24 | "0b0100101", 25 | "0o0", 26 | "0o01234567", 27 | "2e308", 28 | ]; 29 | 30 | fn main() { 31 | for _ in 0..1000 { 32 | for n in NUMBERS { 33 | let d = Tokenizer::new(n).next(true).unwrap(); 34 | core::mem::forget(d); 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /examples/instruments/puncts.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::forget_non_drop)] 2 | // This example exists to allow for profiling 3 | // applications to provide details about 4 | // the criterion benchmarks 5 | use ress::Tokenizer; 6 | 7 | static PUNCTS: &[&str] = &[ 8 | "{", "}", "(", ")", ".", ";", ",", "[", "]", ":", "?", "~", ">", "<", "=", "!", "+", "-", "/", 9 | "*", "%", "&", "|", "^", ">>>=", //3 char 10 | "...", "===", "!==", ">>>", "<<=", ">>=", "**=", //2 char 11 | "&&", "||", "==", "!=", "+=", "-=", "*=", "/=", "++", "--", "<<", ">>", "&=", "|=", "^=", "%=", 12 | "<=", ">=", "=>", "**", 13 | ]; 14 | 15 | fn main() { 16 | for _ in 0..1000 { 17 | for punct in PUNCTS { 18 | let d = Tokenizer::new(punct).next(true).unwrap(); 19 | core::mem::forget(d); 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /examples/instruments/regexes.rs: -------------------------------------------------------------------------------- 1 | 
#![allow(clippy::forget_non_drop)] 2 | // This example exists to allow for profiling 3 | // applications to provide details about 4 | // the criterion benchmarks 5 | use ress::Tokenizer; 6 | 7 | static REGEX: &[&str] = &[ 8 | r#"x/"#, 9 | r#"|/"#, 10 | r#"|||/"#, 11 | r#"^$\b\B/"#, 12 | r#"(?=(?!(?:(.))))/"#, 13 | r#"a.\f\n\r\t\v\0\[\-\/\\\x00\u0000/"#, 14 | r#"\d\D\s\S\w\W/"#, 15 | r#"\ca\cb\cc\cd\ce\cf\cg\ch\ci\cj\ck\cl\cm\cn\co\cp\cq\cr\cs\ct\cu\cv\cw\cx\cy\cz/"#, 16 | r#"\cA\cB\cC\cD\cE\cF\cG\cH\cI\cJ\cK\cL\cM\cN\cO\cP\cQ\cR\cS\cT\cU\cV\cW\cX\cY\cZ/"#, 17 | r#"[a-z-]/"#, 18 | r#"[^\b\-^]/"#, 19 | r#"[/\]\\]/"#, 20 | r#"./i"#, 21 | r#"./g"#, 22 | r#"./m"#, 23 | r#"./igm"#, 24 | r#".*/"#, 25 | r#".*?/"#, 26 | r#".+/"#, 27 | r#".+?/"#, 28 | r#".?/"#, 29 | r#".??/"#, 30 | r#".{0}/"#, 31 | r#".{0,}/"#, 32 | r#".{0,0}/"#, 33 | ]; 34 | 35 | fn main() { 36 | for _ in 0..1000 { 37 | for r in REGEX { 38 | let d = Tokenizer::new(r).next_regex(1).unwrap(); 39 | core::mem::forget(d); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /examples/instruments/strings.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::forget_non_drop)] 2 | // This example exists to allow for profiling 3 | // applications to provide details about 4 | // the criterion benchmarks 5 | use ress::Tokenizer; 6 | 7 | static STRINGS: &[&str] = &[ 8 | r#""things and stuff""#, 9 | r#"'people and places'"#, 10 | r#""with and escaped \"""#, 11 | r#"'another escaped \''"#, 12 | r#""with a new \ 13 | line""#, 14 | r#"'another new line \ 15 | hahaha'"#, 16 | "\"sequence double quoted\\\r\nis hard\"", 17 | "'new line sequence\\\r\nmight be harder'", 18 | ]; 19 | 20 | fn main() { 21 | for _ in 0..1000 { 22 | for s in STRINGS { 23 | let d = Tokenizer::new(s).next(true).unwrap(); 24 | core::mem::forget(d); 25 | } 26 | } 27 | } 28 | 
-------------------------------------------------------------------------------- /examples/instruments/templates.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::forget_non_drop)] 2 | // This example exists to allow for profiling 3 | // applications to provide details about 4 | // the criterion benchmarks 5 | use ress::Tokenizer; 6 | 7 | static TEMPLATE_STARTS: &[&str] = &[ 8 | "`things and stuff times ${", 9 | "`things and stuff`", 10 | r#"`a\${b`"#, 11 | r#"`\0\n\x0A\u000A\u{A}${"#, 12 | ]; 13 | 14 | static TEMPLATE_CONTINUATIONS: &[&str] = &[ 15 | "`${} and animals and minerals`", 16 | "`${}`", 17 | "`${} and animals and minerals`", 18 | "`${} and places and people ${", 19 | ]; 20 | 21 | fn main() { 22 | for _ in 0..1000 { 23 | for s in TEMPLATE_CONTINUATIONS { 24 | parse_two(s); 25 | } 26 | for s in TEMPLATE_STARTS { 27 | parse(s); 28 | } 29 | } 30 | } 31 | #[inline] 32 | fn parse_two(s: &str) { 33 | let mut t = Tokenizer::new(s); 34 | let _ = t.next(true).unwrap(); 35 | let d = t.next(true).unwrap(); 36 | core::mem::forget(d); 37 | } 38 | #[inline] 39 | fn parse(s: &str) { 40 | let e = Tokenizer::new(s).next(true).unwrap(); 41 | core::mem::forget(e); 42 | } 43 | -------------------------------------------------------------------------------- /examples/major_libs/src/main.rs: -------------------------------------------------------------------------------- 1 | //! This example is primarily for illustrating the 2 | //! project's performance w/o waiting for the current 3 | //! set of benches. It simply pulls down some major 4 | //! JS libraries and attempts to tokenize them with 5 | //! both methods and then reports the size, time and method 6 | //! for each lib. 
/// Command-line selection flags: one per library that can be benchmarked.
/// Every flag defaults to `false` (via `#[derive(Default)]`, which replaces
/// the original hand-written all-`false` `Default` impl); `pristine` uses
/// that default state to detect that no library was requested, in which case
/// `main` runs all of them.
#[derive(Default)]
struct Args {
    pub angular: bool,
    pub jquery: bool,
    pub react: bool,
    pub react_dom: bool,
    pub vue: bool,
    pub moment: bool,
    pub dexie: bool,
}

impl Args {
    /// `true` when no library flag has been set, i.e. the struct is still in
    /// its default state.
    fn pristine(&self) -> bool {
        !self.angular
            && !self.jquery
            && !self.react
            && !self.react_dom
            && !self.vue
            && !self.moment
            && !self.dexie
    }
}
get_js(Lib::Jquery) { 108 | test_js(js, "jquery"); 109 | } 110 | } 111 | 112 | fn angular1() { 113 | println!("trying angular1"); 114 | if let Ok(ref js) = get_js(Lib::Angular) { 115 | test_js(js, "angular"); 116 | } 117 | } 118 | 119 | fn react() { 120 | println!("trying react"); 121 | if let Ok(ref js) = get_js(Lib::React) { 122 | test_js(js, "react"); 123 | } 124 | } 125 | 126 | fn react_dom() { 127 | println!("trying react_dom"); 128 | if let Ok(ref js) = get_js(Lib::ReactDom) { 129 | test_js(js, "react-dom"); 130 | } 131 | } 132 | 133 | fn vue() { 134 | println!("trying vue"); 135 | if let Ok(ref js) = get_js(Lib::Vue) { 136 | test_js(js, "vue"); 137 | } 138 | } 139 | 140 | fn moment() { 141 | println!("trying moment"); 142 | if let Ok(ref js) = get_js(Lib::Moment) { 143 | test_js(js, "moment") 144 | } 145 | } 146 | 147 | fn dexie() { 148 | println!("trying dexie"); 149 | if let Ok(ref js) = get_js(Lib::Dexie) { 150 | test_js(js, "dexie"); 151 | } 152 | } 153 | 154 | fn test_js(text: &str, name: &str) { 155 | let size = text.len(); 156 | let now = SystemTime::now(); 157 | test(text); 158 | if let Ok(e) = now.elapsed() { 159 | report(size, e, "scanner", name) 160 | } else { 161 | println!("error capturing scanner duration for {}", name); 162 | } 163 | } 164 | 165 | fn test(text: &str) { 166 | let s = ress::Scanner::new(text); 167 | let _: Vec<_> = s.collect(); 168 | } 169 | 170 | fn report(bytes: usize, elapsed: Duration, method: &str, name: &str) { 171 | let size = get_size(bytes); 172 | println!( 173 | "{} ({}) using {} in {}s {:.2}ms", 174 | name, 175 | size, 176 | method, 177 | elapsed.as_secs(), 178 | elapsed.subsec_millis() 179 | ) 180 | } 181 | 182 | fn get_size(b: usize) -> String { 183 | let mut size = b as f32; 184 | let mut i = 0; 185 | while size > 1000.0 { 186 | if i > 4 { 187 | break; 188 | } 189 | size /= 1000.0; 190 | i += 1; 191 | } 192 | let bytes = match i { 193 | 0 => "b", 194 | 1 => "kb", 195 | 2 => "mb", 196 | 3 => "gb", 197 | _ => "tb", 
198 | }; 199 | format!("{:.2}{}", size, bytes) 200 | } 201 | 202 | fn npm_install() -> Result<(), ::std::io::Error> { 203 | let mut c = ::std::process::Command::new("npm"); 204 | c.arg("i"); 205 | c.output()?; 206 | Ok(()) 207 | } 208 | 209 | enum Lib { 210 | Jquery, 211 | Angular, 212 | React, 213 | ReactDom, 214 | Vue, 215 | Moment, 216 | Dexie, 217 | } 218 | 219 | impl Lib { 220 | fn path(&self) -> String { 221 | match self { 222 | Lib::Jquery => "node_modules/jquery/dist/jquery.js".into(), 223 | Lib::Angular => "node_modules/angular/angular.js".into(), 224 | Lib::React => "node_modules/react/umd/react.development.js".into(), 225 | Lib::ReactDom => "node_modules/react-dom/umd/react-dom.development.js".into(), 226 | Lib::Vue => "node_modules/vue/dist/vue.js".into(), 227 | Lib::Moment => "node_modules/moment/moment.js".into(), 228 | Lib::Dexie => "node_modules/dexie/dist/dexie.js".into(), 229 | } 230 | } 231 | } 232 | 233 | fn get_js(l: Lib) -> Result { 234 | let path = PathBuf::from(l.path()); 235 | if !path.exists() { 236 | npm_install()?; 237 | if !path.exists() { 238 | println!("cannot find {:?}", path); 239 | } 240 | } 241 | read_to_string(path) 242 | } 243 | -------------------------------------------------------------------------------- /examples/semi_finder/src/main.rs: -------------------------------------------------------------------------------- 1 | extern crate ress; 2 | extern crate walkdir; 3 | 4 | use ress::prelude::*; 5 | use walkdir::WalkDir; 6 | 7 | use std::{collections::HashMap, env::args, fs::read_to_string, path::PathBuf}; 8 | 9 | fn main() { 10 | // get the command line arguments that started this process 11 | let mut args = args(); 12 | // discard the first argument, this will be the path to our 13 | // executable 14 | let _ = args.next(); 15 | // The next argument will be the path to check 16 | // panic and display an error to the user if no path 17 | // was provided 18 | let start = args 19 | .next() 20 | .expect("No directory provided 
as starting location."); 21 | // Pass the argument off to our `check_files` function 22 | let issues = check_files(start); 23 | // If no issues were found 24 | if issues.is_empty() { 25 | // Print the success message 26 | println!("Good to go, no semicolons found"); 27 | } else { 28 | // Otherwise loop over the hashmap and 29 | // tell the user where we found semi-colons that need to be 30 | // removed 31 | for (path, indexes) in issues { 32 | println!("Issues found in {:?} at indexes:", path); 33 | println!("\t{:?}\n", indexes) 34 | } 35 | } 36 | } 37 | 38 | fn check_files(start: String) -> HashMap> { 39 | // We are going to store the location of any semi-colons we have found 40 | let mut ret: HashMap> = HashMap::new(); 41 | // loop over the directories in our path 42 | // set the min_depth to 1, so we will skip the 43 | // path passed in as `start` 44 | for entry in WalkDir::new(start).min_depth(1) { 45 | match entry { 46 | Ok(entry) => { 47 | // If the entry doesn't error 48 | // capture the path of this entry 49 | let path = entry.path(); 50 | //if the path ends with js, we want to check for semicolons 51 | if path.extension() == Some(::std::ffi::OsStr::new("js")) { 52 | // if we can read the file to a string 53 | // pass the text off to our check_js fn 54 | // if we can't we'll just skip it for now 55 | if let Ok(js) = read_to_string(path) { 56 | let indexes = check_js(&js); 57 | // if we found any semicolons, add them to our hashmap 58 | if !indexes.is_empty() { 59 | ret.insert(path.to_path_buf(), indexes); 60 | } 61 | } 62 | } 63 | } 64 | Err(e) => eprintln!("failed to get a directory entry: {:?}", e), 65 | } 66 | } 67 | ret 68 | } 69 | 70 | fn check_js(js: &str) -> Vec { 71 | // Create a scanner with the text then 72 | // filter out any tokens that are not semi-colons 73 | // then collect them all into a `Vec` of the start indexes 74 | Scanner::new(js) 75 | .filter_map(|item| { 76 | let item = item.unwrap(); 77 | // If this token matches the 
`Punct::SemiColon` 78 | if let Token::Punct(ref inner) = item.token { 79 | match inner { 80 | // we want to return the first position of this token 81 | // since semi-colons are only 1 character wide we would 82 | // only need this part of the `Span` 83 | Punct::SemiColon => Some(item.span.start), 84 | _ => None, 85 | } 86 | } else { 87 | None 88 | } 89 | }) 90 | .collect() 91 | } 92 | -------------------------------------------------------------------------------- /examples/tokenize.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | let mut args = std::env::args(); 3 | let _ = args.next(); 4 | let path = args.next().expect("First argument must be a file path"); 5 | let path = std::path::Path::new(&path); 6 | if !path.exists() { 7 | panic!("First argument must be a file path"); 8 | } 9 | let js = std::fs::read_to_string(path).expect("Couldn't read the path provide"); 10 | for item in ress::Scanner::new(&js) { 11 | println!("{:?}", item.expect("failed to lex token")); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /examples/tokens.js: -------------------------------------------------------------------------------- 1 | const cp = require('child_process'); 2 | const fs = require('fs'); 3 | const prog = require('progress'); 4 | 5 | 6 | function dd(infile, outfile, bytesize) { 7 | console.log('Getting started'); 8 | var bar; 9 | var currentBytes; 10 | 11 | fs.stat(infile, function(err, stat) { 12 | if (err) return console.error('Unable to get infile stats', err.message); 13 | console.log(`moving \n\t${infile}`); 14 | console.log(`to \n\t${outfile}`); 15 | var inFileSize = stat.size; 16 | bar = new prog('Progress [:bar] :percent :current :total', 17 | { 18 | total: inFileSize, 19 | complete: '‡', 20 | incomplete: ' ' 21 | }); 22 | 23 | var dd = cp.spawn('dd', [`if=${infile}`, `of=${outfile}`, `bs=${bytesize || '1m'}`]); 24 | var interval = setInterval(function() { 
25 | if (bar.complete) { 26 | clearInterval(interval) 27 | console.log('Finishing up'); 28 | } else { 29 | dd.kill('SIGINFO'); 30 | } 31 | }, 100); 32 | dd.addListener('exit', function(code, sig) { 33 | if (code == 0) { 34 | bar.tick(bar.total - bar.curr); 35 | console.log('Complete'); 36 | process.exit(); 37 | } else { 38 | console.log(`Exit with code ${code}: ${sig}`); 39 | process.exit(); 40 | } 41 | }); 42 | // TODO: Add color formatting 43 | dd.stderr.on('data', function(data) { 44 | console.log('dd.stderr.on("data", ' + data); 45 | if (typeof data != 'string') data = data.toString('utf8'); 46 | var status = parse(data); 47 | var update; 48 | if (status) { 49 | update = status - currentBytes; 50 | currentBytes = status; 51 | if (!bar.complete) bar.tick(update); 52 | } 53 | }); 54 | }); 55 | } 56 | 57 | function parse(text) { 58 | var lines = text.split('\n') 59 | var line = lines[2] 60 | if (!line) { 61 | line = lines[0] 62 | } 63 | var words = line.split(' ') 64 | return Number.parseInt(words[0]) 65 | } 66 | 67 | var ifile; 68 | var ofile; 69 | var bs; 70 | 71 | if (process.argv[2]) { 72 | ifile = process.argv[2] 73 | } else { 74 | console.error('no ifile'); 75 | process.exit(); 76 | } 77 | if (process.argv[3]) { 78 | ofile = process.argv[3] 79 | } else { 80 | console.error('no ofile'); 81 | process.exit(); 82 | } 83 | 84 | if (process.argv[4]) { 85 | bs = process.argv[4] 86 | } 87 | 88 | dd(ifile, ofile, bs); 89 | 90 | //FIXME nothing used after this 91 | var gen = function*() { 92 | yield 'one'; 93 | yield 'two'; 94 | yield 'three'; 95 | } 96 | let generator = gen(); 97 | let current = generator.next(); 98 | while (!current.done) { 99 | console.log('current value:', current.value); 100 | current = generator.next(); 101 | } 102 | 103 | var {a, b, c} = {a: 1, b: 2, c: 3}; -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | Copyright 
2018 Robert F. Masen 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "devDependencies": { 3 | "angular": "^1.5.6", 4 | "dexie": "^2.0.4", 5 | "everything.js": "^1.0.3", 6 | "jquery": "^3.3.1", 7 | "moment": "^2.22.2", 8 | "react": "^16.4.1", 9 | "react-dom": "^16.4.1", 10 | "vue": "^2.5.16" 11 | }, 12 | "dependencies": { 13 | "esprima": "^4.0.1" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /proptest-regressions/comments.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 
4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | xs 823165937 3524042579 1300144645 3460888313 # shrinks to s = "//\r¡" 8 | xs 2546364303 4055620366 3887015968 2037831009 # shrinks to s = "" 9 | -------------------------------------------------------------------------------- /proptest-regressions/keywords.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | xs 1831916767 2446038119 372885449 39983890 # shrinks to s = "class" 8 | xs 3620432093 1598325935 2776960468 1839814061 # shrinks to s = "of" 9 | xs 715549365 4242199435 3354376143 888258416 # shrinks to s = "await" 10 | xs 1617284926 3538474885 2036666429 3668690609 # shrinks to s = "arguments" 11 | -------------------------------------------------------------------------------- /proptest-regressions/numeric.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 
7 | xs 3219093276 2773970703 4154894776 3021806892 # shrinks to s = "0O8" 8 | xs 891558002 1584292879 3558343646 3847314476 # shrinks to s = "+0" 9 | xs 2600770942 2224153764 1401590777 3215062306 # shrinks to s = "a" 10 | xs 1886057689 3756146993 2034981319 1909346210 # shrinks to s = "0e-0" 11 | -------------------------------------------------------------------------------- /proptest-regressions/punct.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | xs 2545902955 1260655647 3767056268 2752144796 # shrinks to s = "" 8 | -------------------------------------------------------------------------------- /proptest-regressions/regex.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 
This library implements an algorithm to detect if any given forward slash is the beginning of a regular expression literal or should be considered a single forward slash
Even if we were to pare down the data to an un-nested enum that would be 1 byte per token
23 | 24 | When we find a forward slash, the first thing we need to do is look backwards 1 token. If the token 1 before the `/` is a punctuation but not `}` or `)` or a keyword but not `this`, we found a regular expression. `}` and `)` are special cases we will get into next but all other previous tokens would mean it is not a regular expression. Now we have just two cases left, first is `)`. If the token before the `/` is a `)`, we need to jump backwards to the token before the `(` that would be paired with this `)`, if that is `if`, `while`, `for`, or `with`, we found a regex otherwise not. If the token one before the `/` is `}`, we need to determine if the pair of `{` and `}` is a "block" ([see below](#is-a-block)). If the `}` isn't part of a "block", we are not at a regex, if it is a block we need to check if that block is the body of a function expression ([see below](#is-a-function-expression-body)). If the block is the body of a function expression it is not a regular expression otherwise it is a regular expression. 25 | 26 | #### Is a Block 27 | To determine if a pair of curly braces is a block we first look 1 before the `{`, if it is a `(`, `[`, an _operator_ ([see below](#punctuation-or-keyword-represents-operation)), or the keyword `case` it is not a block. If the token 1 before the `{` is the keyword `return` or `yield`, we need to compare the line number of the keyword and the `{`, if they match then it is not a block otherwise it is a block. if the token 1 before the `{` is a `:`, we need to look at the possible parent `{`. If there is a parent we run the same test on that `{`, if that is a block, this `{` is also a block, otherwise it is not a block. If the token 1 before the `{` is anything else, it is a block. 28 | 29 | #### Is a Function Expression Body 30 | if the token 1 before the `{` is `)`, we need to look at the two tokens before the paired `(`, if either of them are the keyword `function`, we need to look 1 token before _that_. 
If the token one before `function` is `(`, `[`, an _operator_ ([see below](#punctuation-or-keyword-represents-operation)), or the keyword `case` or `return` the block is the body of a function expression, in all other cases it is not. 31 | 32 |
- check the token before its `(`
68 | 69 | #### _Operators_ 70 | > `=`, `+=`, `-=`, `*=`, `/=`, `%=`, `<<=`, `>>=`, `>>>=`, `&=`, `|=`, `^=`, `,`, `+`, `-`, `*`, `/`, `%`, `<<`, `>>`, `>>>`, `&`, `|`, `^`, `&&`, `||`, `?`, `:`, `instanceof`, `in`, `===`, `==`, `>=`, `<=`, `<`, `>`, `!=`, `!==`, `++`, `--`, `~`, `!`, `delete`, `void`, `typeof`, `throw`, `new` 71 | 72 | With all of that in mind, let's look at an example: 73 | 74 |
75 | types of tokens 76 |
77 | 78 | As you can see, each of the tokens has a type, the key describes how we think about tokens when checking for a regular expression. There are 4 types of token we care about the rest get lumped into `other`, we can refer to this set as `MetaToken`s. Because of how the detecting a block works, we need each of these to know what line it was on, so all of the `MetaToken`s will carry their line number. Looking through the above description of our algorithm, the furthest we need to look backwards from an `(` is 3 tokens, so our scanner should always keep track of the last 3 tokens we have seen. 79 | 80 | You may have noticed that one of the variants of `MetaToken` is "special punctuation", this is because we need to treat `(`, `)`, `{`, and `}` in a special way. 81 | 82 | Using the same example, this is what special means: 83 |
84 | special punctuation 85 |
86 | 87 | Every `)` or `}` needs to point to their paired `(` or `{` and every `{` needs to point to a parent `{` if one exists. In addition both the `(` and `{` need to point to the 3 tokens before them, which might look something like this: 88 | 89 |
90 | opens with lookbehind 91 |
92 | 93 | First we encounter the red `(`, it would need to hold the `things` ident at position 1 and `function` keyword at position 2, position 3 would be empty. Next we would encounter the orange `{`, this would hold the `)` at 1, `(` at 2 and `things` at 3. Finally we would encounter the blue `{`, this would hold the orange `{` at 1, the `)` at 2 and the red `(` at 3, it also hold the orange `{` as its _parent_. 94 | 95 | This means our scanner needs to keep 3 book keeping lists. The first is the last 3 tokens when scanning the next token, as covered above. This essentially needs to act like a queue with a fixed size where the `enqueue` action would `dequeue` when full. Here is an example of how this would look for the first 4 tokens in our example. 96 | 97 | ```rust 98 | // 3 2 1 99 | // step 1 100 | [ None, None, "function"] 101 | // step 2 102 | [ None, "function", "thing"] 103 | // step 3 104 | ["function", "thing", "("] 105 | // step 4 106 | [ "thing", "(", ")"] 107 | ``` 108 | 109 | The next two are going to be one stack for opening parentheses and one for opening curly braces. They are stacks because once we find a close, we don't need that open any more. With these three book keeping constructs we can build our chain of parentheses and curly brace pairs. 110 | 111 | When we encounter an `(`, we attach the last three tokens to it and push that into both the last three queue and the parentheses stack. When we find a `)`, we pop the last `(` and attach it to the `)` and then push the `)` into the last three queue. When we find an `{` we attach the last 3 tokens we have seen and if the curly brace stack is not empty we attach the top of that stack to this `{` as the _parent_. With all that done we can push the `{` into both the open curly stack and the last three queue. Now when we find a `}` we can pop the open curly off it's stack and link it to the `}`, with the `{` and `}` connected we can push the `}` onto the last three queue. 
we can pop the open curly off its stack and link it to the `}`
118 | 119 | ```rust 120 | [ 121 | // 3 122 | MetaToken::CloseParen(MetaToken::OpenParen([ 123 | None, 124 | MetaToken::Keyword(Keyword::Function), 125 | MetaToken::Ident, 126 | ])), 127 | // 2 128 | MetaToken::OpenBrace { 129 | look_behind: [ 130 | MetaToken::Ident, 131 | MetaToken::OpenParen([ 132 | None, 133 | MetaToken::Keyword(Keyword::Function), 134 | MetaToken::Ident, 135 | ]), 136 | MetaToken::CloseParen(MetaToken::OpenParen([ 137 | None, 138 | MetaToken::Keyword(Keyword::Function), 139 | MetaToken::Ident, 140 | ])), 141 | ], 142 | parent: None, 143 | }, 144 | // 1 145 | MetaToken::OpenBrace { 146 | look_behind: [ 147 | MetaToken::OpenParen([ 148 | None, 149 | MetaToken::Keyword(Keyword::Function), 150 | MetaToken::Ident, 151 | ]), 152 | MetaToken::CloseParen(MetaToken::OpenParen([ 153 | None, 154 | MetaToken::Keyword(Keyword::Function), 155 | MetaToken::Ident, 156 | ])), 157 | MetaToken::OpenBrace { 158 | look_behind: [ 159 | MetaToken::Ident, 160 | MetaToken::OpenParen([ 161 | None, 162 | MetaToken::Keyword(Keyword::Function), 163 | MetaToken::Ident, 164 | ]), 165 | MetaToken::CloseParen(MetaToken::OpenParen([ 166 | None, 167 | MetaToken::Keyword(Keyword::Function), 168 | MetaToken::Ident, 169 | ])), 170 | ], 171 | parent: None, 172 | }, 173 | ], 174 | parent: Some(MetaToken::OpenBrace { 175 | look_behind: [ 176 | MetaToken::Ident, 177 | MetaToken::OpenParen([ 178 | None, 179 | MetaToken::Keyword(Keyword::Function), 180 | MetaToken::Ident, 181 | ]), 182 | MetaToken::CloseParen(MetaToken::OpenParen([ 183 | None, 184 | MetaToken::Keyword(Keyword::Function), 185 | MetaToken::Ident, 186 | ])), 187 | ], 188 | parent: None, 189 | }), 190 | }, 191 | ] 192 | ``` 193 | 194 | We have essentially created a list of linked lists and they can get pretty big too! 
This means that each time we move 3 past a `}`, we might have a lot of things to `drop` and by default rust does that in a recursive manner ([which can get expensive](https://rust-unofficial.github.io/too-many-lists/first-drop.html)). If we look at our example JS from above, there are a total of 9 tokens, and when we reach the end of this block, 8 of them are still hanging around in memory. We could try and use some of Rust's smart pointers to make sure we don't have any clones lying around come drop time but picking apart when things should be `Rc`'d and when they cannot be is a pretty challenging problem. Another solution would be to re-write the drop implementation but that just seems like it might get messy. A third option is to try and find a way to capture this information with a `Copy` type. 195 | 196 | If we look over the logic tree above, we can gather most of the information we need when we encounter any `(`, is the token before it `if`, `while`, `for` or `with` or is the token 1 or 2 before it the keyword `function` and is that an expression? Those are really the two key pieces of information we need. What if we just attached those two booleans to the `(` instead of always attaching the last 3 tokens to it? Then when we pop the `(` off its stack, we can transfer the same two booleans to the `)`. 197 | 198 | Now when we find an `{` we can see if it is a block, if the token before is a `)`, we can also attach the paren flags into our `{`, finally we can copy that information over to the `}` when we pop the open off the curly brace stack. While this means we need to do the computation eagerly, it also means we don't have as much to clean up when we move past a `}`. 
We could capture all of the information we need in a couple of `struct`s that might look like this: 199 | 200 | ```rust 201 | struct Paren { 202 | is_conditional: bool, 203 | is_func_expr: bool, 204 | }
5 | /// A 3 element buffer of 6 | /// MetaTokens, this will use a 7 | /// "ring buffer"-esque scheme 8 | /// for automatically overwriting 9 | /// the oldest element after 3
wrapping_add(self.pointer, 1, 2); 27 | self.list[self.pointer as usize] = Some(token) 28 | } 29 | #[inline] 30 | pub fn one(&self) -> &Option { 31 | &self.list[self.pointer as usize] 32 | } 33 | #[inline] 34 | pub fn two(&self) -> &Option { 35 | let idx = wrapping_sub(self.pointer, 1, 2) as usize; 36 | &self.list[idx] 37 | } 38 | #[inline] 39 | pub fn three(&self) -> &Option { 40 | let idx = wrapping_sub(self.pointer, 2, 2) as usize; 41 | &self.list[idx] 42 | } 43 | } 44 | 45 | #[inline] 46 | pub fn wrapping_sub(lhs: u8, rhs: u8, max: u8) -> u8 { 47 | if lhs >= rhs { 48 | lhs - rhs 49 | } else { 50 | let diff = rhs - lhs; 51 | (max + 1) - diff 52 | } 53 | } 54 | #[inline] 55 | pub fn wrapping_add(lhs: u8, rhs: u8, max: u8) -> u8 { 56 | let maybe = lhs + rhs; 57 | if maybe > max { 58 | let diff = maybe - max; 59 | diff.saturating_sub(1) 60 | } else { 61 | maybe 62 | } 63 | } 64 | 65 | /// Token classes needed for look behind 66 | /// 67 | /// All variants will carry their line number 68 | /// 69 | #[derive(Debug, Clone, Copy)] 70 | pub enum MetaToken { 71 | Keyword(RawKeyword, u32), 72 | Punct(Punct), 73 | OpenParen(Paren), 74 | CloseParen(Paren), 75 | OpenBrace(Brace, u32), 76 | CloseBrace(Brace), 77 | Ident, 78 | Other, 79 | } 80 | #[derive(Debug, Clone, Copy)] 81 | pub struct Paren { 82 | pub func_expr: bool, 83 | pub conditional: bool, 84 | } 85 | #[derive(Debug, Clone, Copy)] 86 | pub struct Brace { 87 | pub is_block: bool, 88 | pub paren: Option, 89 | } 90 | 91 | impl MetaToken { 92 | pub fn line_number(self) -> u32 { 93 | match self { 94 | MetaToken::Keyword(_, line) | MetaToken::OpenBrace(_, line) => line, 95 | _ => 0, 96 | } 97 | } 98 | } 99 | 100 | impl PartialEq for MetaToken { 101 | fn eq(&self, other: &MetaToken) -> bool { 102 | match (self, other) { 103 | (MetaToken::Keyword(lhs, _), MetaToken::Keyword(rhs, _)) => lhs == rhs, 104 | (MetaToken::Punct(lhs), MetaToken::Punct(rhs)) => lhs == rhs, 105 | (MetaToken::Ident, MetaToken::Ident) | 
(MetaToken::Other, MetaToken::Other) => true, 106 | _ => false, 107 | } 108 | } 109 | } 110 | 111 | impl From<(&crate::Token, u32)> for MetaToken { 112 | fn from((other, line): (&crate::Token, u32)) -> Self { 113 | match other { 114 | crate::Token::Keyword(k) => MetaToken::Keyword(k.into(), line), 115 | crate::Token::Punct(p) => MetaToken::Punct(*p), 116 | crate::Token::Ident(_) => MetaToken::Ident, 117 | _ => MetaToken::Other, 118 | } 119 | } 120 | } 121 | 122 | #[derive(Debug, Clone)] 123 | pub struct OpenBrace { 124 | pub look_behind: LookBehind, 125 | pub parent: Option>, 126 | } 127 | 128 | #[derive(Debug, Clone)] 129 | pub struct CloseBrace { 130 | pub open: Rc, 131 | } 132 | 133 | #[derive(Debug, Clone)] 134 | pub struct CloseParen { 135 | pub open: LookBehind, 136 | } 137 | 138 | impl std::ops::Deref for OpenBrace { 139 | type Target = LookBehind; 140 | fn deref(&self) -> &Self::Target { 141 | &self.look_behind 142 | } 143 | } 144 | 145 | #[cfg(test)] 146 | mod test { 147 | use super::*; 148 | use crate::tokens::Punct; 149 | 150 | #[test] 151 | fn wrapping_collection() { 152 | let first = MetaToken::Other; 153 | let second = MetaToken::Ident; 154 | let third = MetaToken::Keyword(RawKeyword::Function, 1); 155 | let fourth = MetaToken::Punct(Punct::Ampersand); 156 | let fifth = MetaToken::Punct(Punct::Bang); 157 | let sixth = MetaToken::Punct(Punct::Caret); 158 | let seventh = MetaToken::Punct(Punct::Pipe); 159 | let eighth = MetaToken::Punct(Punct::Tilde); 160 | let mut l = LookBehind::new(); 161 | l.push(first); 162 | test(&l, Some(first), None, None); 163 | l.push(second); 164 | test(&l, Some(second), Some(first), None); 165 | l.push(third); 166 | test(&l, Some(third), Some(second), Some(first)); 167 | l.push(fourth); 168 | test(&l, Some(fourth), Some(third), Some(second)); 169 | l.push(fifth); 170 | test(&l, Some(fifth), Some(fourth), Some(third)); 171 | l.push(sixth); 172 | test(&l, Some(sixth), Some(fifth), Some(fourth)); 173 | l.push(seventh); 174 | 
test(&l, Some(seventh), Some(sixth), Some(fifth)); 175 | l.push(eighth); 176 | test(&l, Some(eighth), Some(seventh), Some(sixth)); 177 | } 178 | 179 | fn test( 180 | l: &LookBehind, 181 | first: Option, 182 | second: Option, 183 | third: Option, 184 | ) { 185 | println!("{:?}", l); 186 | assert_eq!(l.one(), &first, "one didn't match"); 187 | assert_eq!(l.two(), &second, "two didn't match"); 188 | assert_eq!(l.three(), &third, "three didn't match"); 189 | } 190 | 191 | #[test] 192 | fn wrapping() { 193 | assert_eq!(wrapping_sub(4, 1, 4), 3); 194 | assert_eq!(wrapping_sub(1, 1, 4), 0); 195 | assert_eq!(wrapping_sub(0, 1, 4), 4); 196 | assert_eq!(wrapping_add(0, 1, 4), 1); 197 | assert_eq!(wrapping_add(4, 1, 4), 0); 198 | assert_eq!(wrapping_add(0, 6, 4), 1) 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /src/tokenizer/buffer.rs: -------------------------------------------------------------------------------- 1 | use std::char; 2 | #[derive(Clone)] 3 | pub struct JSBuffer<'a> { 4 | pub buffer: &'a [u8], 5 | pub idx: usize, 6 | pub len: usize, 7 | } 8 | const CONT_MASK: u8 = 0b0011_1111; 9 | const TAG_CONT_U8: u8 = 0b1000_0000; 10 | /// Re-implementation of 11 | /// the std::str::Chars logic 12 | impl<'a> JSBuffer<'a> { 13 | #[inline] 14 | pub fn next_char(&mut self) -> Option { 15 | if self.at_end() { 16 | return None; 17 | } 18 | let x = self.next_or_zero(); 19 | if x < 128 { 20 | return Some(x as char); 21 | } 22 | 23 | // Multibyte case follows 24 | // Decode from a byte combination out of: [[[x y] z] w] 25 | // NOTE: Performance is sensitive to the exact formulation here 26 | let init = (x & (0x7F >> 2)) as u32; 27 | let y = self.next_or_zero(); 28 | let mut ch = Self::utf8_acc_cont_byte(init, y); 29 | if x < 0xE0 { 30 | return char::from_u32(ch); 31 | } 32 | // [[x y z] w] case 33 | // 5th bit in 0xE0 .. 
0xEF is always clear, so `init` is still valid 34 | let z = self.next_or_zero(); 35 | let y_z = Self::utf8_acc_cont_byte((y & CONT_MASK) as u32, z); 36 | ch = init << 12 | y_z; 37 | if x < 0xF0 { 38 | return char::from_u32(ch); 39 | } 40 | // [x y z w] case 41 | // use only the lower 3 bits of `init` 42 | let w = self.next_or_zero(); 43 | ch = (init & 7) << 18 | Self::utf8_acc_cont_byte(y_z, w); 44 | char::from_u32(ch) 45 | } 46 | #[inline] 47 | pub fn prev_char(&mut self) -> Option { 48 | // Decode UTF-8 49 | if self.idx == 0 { 50 | return None; 51 | } 52 | let w = self.prev_or_zero(); 53 | if w < 128 { 54 | return char::from_u32(w as u32); 55 | } 56 | 57 | // Multibyte case follows 58 | // Decode from a byte combination out of: [x [y [z w]]] 59 | let mut ch; 60 | let z = self.prev_or_zero(); 61 | ch = Self::utf8_first_byte(z, 2); 62 | if Self::utf8_is_cont_byte(z) { 63 | let y = self.prev_or_zero(); 64 | ch = Self::utf8_first_byte(y, 3); 65 | if Self::utf8_is_cont_byte(y) { 66 | let x = self.prev_or_zero(); 67 | ch = Self::utf8_first_byte(x, 4); 68 | ch = Self::utf8_acc_cont_byte(ch, y); 69 | } 70 | ch = Self::utf8_acc_cont_byte(ch, z); 71 | } 72 | ch = Self::utf8_acc_cont_byte(ch, w); 73 | 74 | char::from_u32(ch) 75 | } 76 | #[inline] 77 | fn next_or_zero(&mut self) -> u8 { 78 | if self.at_end() { 79 | 0 80 | } else { 81 | let old = self.idx; 82 | self.idx += 1; 83 | self.buffer[old] 84 | } 85 | } 86 | #[inline] 87 | fn prev_or_zero(&mut self) -> u8 { 88 | if self.idx < 1 { 89 | return 0; 90 | } 91 | self.idx = self.idx.saturating_sub(1); 92 | self.buffer[self.idx] 93 | } 94 | #[inline] 95 | #[allow(clippy::all)] 96 | fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { 97 | (ch << 6) | (byte & CONT_MASK) as u32 98 | } 99 | #[inline] 100 | #[allow(clippy::all)] 101 | fn utf8_first_byte(byte: u8, width: u32) -> u32 { 102 | (byte & (0x7F >> width)) as u32 103 | } 104 | #[inline] 105 | fn utf8_is_cont_byte(byte: u8) -> bool { 106 | (byte & !CONT_MASK) == TAG_CONT_U8 
107 | } 108 | } 109 | 110 | impl<'a> JSBuffer<'a> { 111 | pub fn new(buffer: &'a [u8]) -> Self { 112 | Self { 113 | buffer, 114 | idx: 0, 115 | len: buffer.len(), 116 | } 117 | } 118 | /// Check if the buffer is at or past the 119 | /// end of the bytes provided 120 | #[inline] 121 | pub fn at_end(&self) -> bool { 122 | self.idx >= self.len 123 | } 124 | 125 | /// Check if the next few bytes match the provided bytes 126 | #[inline] 127 | pub fn look_ahead_matches(&self, s: &[u8]) -> bool { 128 | let len = s.len(); 129 | let end = self.idx + len; 130 | if end > self.len { 131 | return false; 132 | } 133 | end <= self.len && &self.buffer[self.idx..end] == s 134 | } 135 | /// Check if the next byte matches a single byte provided 136 | #[inline] 137 | pub fn look_ahead_byte_matches(&self, b: u8) -> bool { 138 | if self.at_end() { 139 | false 140 | } else { 141 | self.buffer[self.idx] == b 142 | } 143 | } 144 | 145 | /// Skip the number of characters provided returning the number of bytes skipped 146 | /// note: these are full unicode characters, not just bytes 147 | #[inline] 148 | pub fn skip(&mut self, count: usize) { 149 | for _ in 0..count { 150 | self.next_char(); 151 | } 152 | } 153 | #[inline] 154 | pub fn skip_back(&mut self, count: usize) { 155 | for _ in 0..count { 156 | self.prev_char(); 157 | } 158 | } 159 | /// Skip a single byte 160 | /// note: this can cause the buffer to become unaligned 161 | /// be sure to always know the character you are skipping 162 | /// is 1 byte wide or use `skip` instead when unsure 163 | #[inline] 164 | pub fn skip_bytes(&mut self, count: usize) { 165 | self.idx += count; 166 | } 167 | 168 | /// check if current char is a valid 169 | /// js whitespace character 170 | pub fn at_whitespace(&mut self) -> bool { 171 | if self.at_end() { 172 | return false; 173 | } 174 | self.buffer[self.idx] == 9 //\t 175 | || self.buffer[self.idx] == 10 // \n 176 | || self.buffer[self.idx] == 11 // \u{000b} 177 | || self.buffer[self.idx] == 12 // 
\f 178 | || self.buffer[self.idx] == 13 // \r 179 | || self.buffer[self.idx] == 32 // ' ' 180 | || (self.buffer[self.idx] == 194 && self.idx + 1 < self.len && self.buffer[self.idx+1] == 160) 181 | || (self.buffer[self.idx] >= 226 && self.buffer[self.idx] <= 239 && self.len > self.idx + 2 && { 182 | match &self.buffer[self.idx..self.idx+3] { 183 | [239, 187, 191] //"\u{feff}", 184 | | [226, 128, 168] //"\u{2028}", 185 | | [226, 128, 169] //"\u{2029}", 186 | | [226, 128, 128] //"\u{2000}", 187 | | [226, 128, 129] //"\u{2001}", 188 | | [226, 128, 130] //"\u{2002}", 189 | | [226, 128, 131] //"\u{2003}", 190 | | [226, 128, 132] //"\u{2004}", 191 | | [226, 128, 133] //"\u{2005}", 192 | | [226, 128, 134] //"\u{2006}", 193 | | [226, 128, 135] //"\u{2007}", 194 | | [226, 128, 136] //"\u{2008}", 195 | | [226, 128, 137] //"\u{2009}", 196 | | [226, 128, 138] //"\u{200a}", 197 | | [226, 128, 175] //"\u{202f}", 198 | | [226, 129, 159] //"\u{205f}", 199 | | [227, 128, 128] => true, //"\u{3000}", 200 | _ => false, 201 | } 202 | } ) 203 | } 204 | /// Check of the look ahead character is 205 | /// a valid js new line character 206 | #[inline] 207 | pub fn at_new_line(&mut self) -> bool { 208 | if self.at_end() { 209 | return false; 210 | } 211 | let byte = self.buffer[self.idx]; 212 | if byte < 10 { 213 | false 214 | } else if byte == 10 { 215 | true 216 | } else if byte < 13 { 217 | false 218 | } else if byte == 13 { 219 | true 220 | } else if byte < 226 { 221 | false 222 | } else if byte == 226 { 223 | self.look_ahead_matches("\u{2028}".as_bytes()) 224 | || self.look_ahead_matches("\u{2029}".as_bytes()) 225 | } else { 226 | false 227 | } 228 | } 229 | /// check if the look ahead character is `0` or `1` 230 | #[inline] 231 | pub fn at_binary(&self) -> bool { 232 | if self.at_end() { 233 | return false; 234 | } 235 | self.buffer[self.idx] >= b'0' && self.buffer[self.idx] <= b'1' 236 | } 237 | /// check if the look ahead character is a number 238 | /// between `0` and `9`, inclusive 
239 | #[inline] 240 | pub fn at_decimal(&self) -> bool { 241 | if self.at_end() { 242 | return false; 243 | } 244 | self.buffer[self.idx] >= b'0' && self.buffer[self.idx] <= b'9' 245 | } 246 | /// check if the look ahead character is a number 247 | /// between `0` and `7`, inclusive 248 | #[inline] 249 | pub fn at_octal(&self) -> bool { 250 | if self.at_end() { 251 | return false; 252 | } 253 | self.buffer[self.idx] >= b'0' && self.buffer[self.idx] <= b'7' 254 | } 255 | /// check if the look ahead character is a number 256 | /// between `0` and `9` or `a` and `f` or `A` and `F`, inclusive 257 | #[inline] 258 | pub fn at_hex(&self) -> bool { 259 | if self.at_end() { 260 | return false; 261 | } 262 | (self.buffer[self.idx] >= b'0' && self.buffer[self.idx] <= b'9') 263 | || (self.buffer[self.idx] >= b'a' && self.buffer[self.idx] <= b'f') 264 | || (self.buffer[self.idx] >= b'A' && self.buffer[self.idx] <= b'F') 265 | } 266 | /// Peek forward 1 char with out updating the 267 | /// `idx` to this new position. 
268 | /// 269 | /// note: this will still cost the same amount 270 | /// of work as `next_char` but cleans up the 271 | /// book keeping for you 272 | #[inline] 273 | pub fn peek_char(&mut self) -> Option { 274 | let ch = self.next_char()?; 275 | self.skip_back_bytes(ch.len_utf8()); 276 | Some(ch) 277 | } 278 | /// Skip backwards a number of bytes 279 | /// note: this can cause the buffer to become unaligned 280 | /// be sure to always know the character you are skipping 281 | /// is [count] bytes wide or use `skip` instead when unsure 282 | /// the right width is skipped 283 | #[inline] 284 | pub fn skip_back_bytes(&mut self, count: usize) { 285 | self.idx -= count; 286 | } 287 | } 288 | 289 | impl<'a> From<&'a str> for JSBuffer<'a> { 290 | fn from(s: &'a str) -> JSBuffer { 291 | Self::new(s.as_bytes()) 292 | } 293 | } 294 | 295 | #[cfg(test)] 296 | mod test { 297 | use super::*; 298 | 299 | #[test] 300 | fn ascii_chars() { 301 | let mut bytes = Vec::new(); 302 | for i in 0..=255u8 { 303 | if i.is_ascii() { 304 | bytes.push(i); 305 | } 306 | } 307 | let mut buf = JSBuffer::new(&bytes); 308 | for &byte in &bytes { 309 | let ch = buf.next_char().unwrap(); 310 | assert_eq!(ch, byte as char); 311 | } 312 | } 313 | #[test] 314 | fn non_ascii_chars() { 315 | let mut s = String::new(); 316 | eprintln!("collecting u32 chars"); 317 | for (i, v) in (0x7FF..=0x10FFFF).enumerate() { 318 | if let Some(ch) = char::from_u32(v) { 319 | s.push(ch); 320 | } 321 | if i % 100 == 0 { 322 | eprintln!("{}", (v as f32 / (0x10FFFF - 0x7FF) as f32) * 100.0); 323 | } 324 | } 325 | eprintln!("creating buffer"); 326 | let mut buf = JSBuffer::new(s.as_bytes()); 327 | for (i, c1) in s.char_indices() { 328 | let c2 = buf.next_char().unwrap(); 329 | assert_eq!( 330 | c1, c2, 331 | "failed at character {}:\n{} vs {}\n{:08b}\n{:08b}", 332 | i, c1 as u32, c2 as u32, c1 as u32, c2 as u32 333 | ); 334 | } 335 | } 336 | #[test] 337 | fn at_whitespace() { 338 | let whitespaces = &[ 339 | 9, // \t 340 | 
10, // \n 341 | 11, // \u{000b} 342 | 12, // \f 343 | 13, // \r 344 | 32, // ' ' 345 | 194, 160, //\u{00A0} 346 | 239, 187, 191, // \u{FEFF} 347 | 226, 128, 168, // \u{2028} 348 | 226, 128, 169, // \u{2029} 349 | 226, 128, 128, // \u{2000} 350 | 226, 128, 129, // \u{2001} 351 | 226, 128, 130, // \u{2002} 352 | 226, 128, 131, // \u{2003} 353 | 226, 128, 132, // \u{2004} 354 | 226, 128, 133, // \u{2005} 355 | 226, 128, 134, // \u{2006} 356 | 226, 128, 135, // \u{2007} 357 | 226, 128, 136, // \u{2008} 358 | 226, 128, 137, // \u{2009} 359 | 226, 128, 138, // \u{200A} 360 | 226, 128, 175, // \u{202F} 361 | 226, 129, 159, // \u{205F} 362 | 227, 128, 128, // \u{3000} 363 | ]; 364 | let mut buf = JSBuffer::new(whitespaces); 365 | while !buf.at_end() { 366 | assert!( 367 | buf.at_whitespace(), 368 | "buffer was not at whitespace {}", 369 | buf.idx 370 | ); 371 | buf.skip(1); 372 | } 373 | } 374 | #[test] 375 | fn at_oct_number() { 376 | let s = "012345678"; 377 | let mut buf = JSBuffer::from(s); 378 | for _ in 0..8 { 379 | assert!(buf.at_octal()); 380 | let _ = buf.next_char(); 381 | } 382 | assert!(!buf.at_octal()); 383 | } 384 | #[test] 385 | fn at_dec_number() { 386 | let s = "0123456789a"; 387 | 388 | let mut buf = JSBuffer::from(s); 389 | for _ in 0..10 { 390 | assert!(buf.at_decimal()); 391 | let _ = buf.next_char(); 392 | } 393 | assert!(!buf.at_decimal()); 394 | } 395 | #[test] 396 | fn check() { 397 | let s = "🦜🦡🐁kł둘"; 398 | let mut b = JSBuffer::from(s); 399 | assert!(b.next_char().unwrap() == '🦜'); 400 | assert!(b.next_char().unwrap() == '🦡'); 401 | assert!(b.next_char().unwrap() == '🐁'); 402 | assert!(b.next_char().unwrap() == 'k'); 403 | assert!(b.next_char().unwrap() == 'ł'); 404 | assert!(b.next_char().unwrap() == '둘'); 405 | assert!(b.next_char().is_none()); 406 | assert!(b.prev_char().unwrap() == '둘'); 407 | assert!(b.prev_char().unwrap() == 'ł'); 408 | assert!(b.prev_char().unwrap() == 'k'); 409 | assert!(b.prev_char().unwrap() == '🐁'); 410 | 
assert!(b.prev_char().unwrap() == '🦡'); 411 | assert!(b.prev_char().unwrap() == '🦜'); 412 | assert!(b.prev_char().is_none()); 413 | } 414 | 415 | #[test] 416 | fn at_end() { 417 | let js = "'things and stuff'"; 418 | let mut buf = JSBuffer::from(js); 419 | for (i, c) in js.char_indices() { 420 | assert!(c == buf.next_char().unwrap()); 421 | if i < js.len() - 1 { 422 | assert!(!buf.at_end()); 423 | } 424 | } 425 | assert!(buf.at_end()); 426 | } 427 | 428 | #[test] 429 | fn look_ahead_matches() { 430 | let js = r#""things and stuff""#; 431 | let mut buf = JSBuffer::from(js); 432 | for i in 0..js.len() { 433 | let c = &js[i..i + 1]; 434 | assert!(buf.look_ahead_matches(c.as_bytes())); 435 | let _ = buf.next_char(); 436 | } 437 | } 438 | } 439 | -------------------------------------------------------------------------------- /src/tokenizer/keyword_trie.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::{RawKeyword, RawToken, Res, Tokenizer}; 2 | 3 | type MaybeKeyword = Res>; 4 | 5 | impl<'a> Tokenizer<'a> { 6 | /// Detect if an ident is a keyword starting from and id_start 7 | /// character 8 | /// 9 | /// note: the expectation of the start char is that if it were a 10 | /// unicode escape, it would already have been parsed to its approprate 11 | /// character 12 | pub(crate) fn keyword(&mut self, start: char) -> MaybeKeyword { 13 | match start { 14 | 'a' => self.a_keywords(), 15 | 'b' => self.b_keywords(), 16 | 'c' => self.c_keywords(), 17 | 'd' => self.d_keywords(), 18 | 'e' => self.e_keywords(), 19 | 'f' => self.f_keywords(), 20 | 'i' => self.i_keywords(), 21 | 'l' => self.l_keywords(), 22 | 'n' => self.n_keywords(), 23 | 'p' => self.p_keywords(), 24 | 'r' => self.r_keywords(), 25 | 's' => self.s_keywords(), 26 | 't' => self.t_keywords(), 27 | 'v' => self.v_keywords(), 28 | 'w' => self.w_keywords(), 29 | 'y' => self.y_keywords(), 30 | _ => Ok(None), 31 | } 32 | } 33 | /// attempt to parse `await` 34 | fn 
a_keywords(&mut self) -> MaybeKeyword { 35 | self.suffix_for_token("wait", RawToken::Keyword(RawKeyword::Await)) 36 | } 37 | /// attempt to parse `break` 38 | fn b_keywords(&mut self) -> MaybeKeyword { 39 | self.suffix_for_token("reak", RawToken::Keyword(RawKeyword::Break)) 40 | } 41 | /// attempt to parse `case`, `catch`, `class`, `const` or 42 | /// `continue` 43 | fn c_keywords(&mut self) -> MaybeKeyword { 44 | if self.eat_ch_or_escaped('a')? { 45 | if self.eat_ch_or_escaped('s')? { 46 | self.suffix_for_token("e", RawToken::Keyword(RawKeyword::Case)) 47 | } else if self.eat_ch_or_escaped('t')? { 48 | self.suffix_for_token("ch", RawToken::Keyword(RawKeyword::Catch)) 49 | } else { 50 | Ok(None) 51 | } 52 | } else if self.eat_ch_or_escaped('l')? { 53 | self.suffix_for_token("ass", RawToken::Keyword(RawKeyword::Class)) 54 | } else if self.eat_ch_or_escaped('o')? && self.eat_ch_or_escaped('n')? { 55 | if self.eat_ch_or_escaped('s')? { 56 | self.suffix_for_token("t", RawToken::Keyword(RawKeyword::Const)) 57 | } else if self.eat_ch_or_escaped('t')? { 58 | self.suffix_for_token("inue", RawToken::Keyword(RawKeyword::Continue)) 59 | } else { 60 | Ok(None) 61 | } 62 | } else { 63 | Ok(None) 64 | } 65 | } 66 | /// attempt to parse `debugger`, `default`, `delete` or `do` 67 | fn d_keywords(&mut self) -> MaybeKeyword { 68 | if self.eat_ch_or_escaped('e')? { 69 | if self.eat_ch_or_escaped('b')? { 70 | self.suffix_for_token("ugger", RawToken::Keyword(RawKeyword::Debugger)) 71 | } else if self.eat_ch_or_escaped('f')? { 72 | self.suffix_for_token("ault", RawToken::Keyword(RawKeyword::Default)) 73 | } else if self.eat_ch_or_escaped('l')? { 74 | self.suffix_for_token("ete", RawToken::Keyword(RawKeyword::Delete)) 75 | } else { 76 | Ok(None) 77 | } 78 | } else if self.eat_ch_or_escaped('o')? 
&& self.at_ident_end() { 79 | Ok(Some(RawToken::Keyword(RawKeyword::Do))) 80 | } else { 81 | Ok(None) 82 | } 83 | } 84 | /// attempt to parse `else`, `enum`, `export`, or `extends` 85 | fn e_keywords(&mut self) -> MaybeKeyword { 86 | if self.eat_ch_or_escaped('l')? { 87 | self.suffix_for_token("se", RawToken::Keyword(RawKeyword::Else)) 88 | } else if self.eat_ch_or_escaped('n')? { 89 | self.suffix_for_token("um", RawToken::Keyword(RawKeyword::Enum)) 90 | } else if self.eat_ch_or_escaped('x')? { 91 | if self.eat_ch_or_escaped('p')? { 92 | self.suffix_for_token("ort", RawToken::Keyword(RawKeyword::Export)) 93 | } else if self.eat_ch_or_escaped('t')? { 94 | self.suffix_for_token("ends", RawToken::Keyword(RawKeyword::Extends)) 95 | } else { 96 | Ok(None) 97 | } 98 | } else { 99 | Ok(None) 100 | } 101 | } 102 | /// attempt to parse `false`, `finally`, `for` or `function` 103 | fn f_keywords(&mut self) -> MaybeKeyword { 104 | if self.eat_ch_or_escaped('a')? { 105 | self.suffix_for_token("lse", RawToken::Boolean(false)) 106 | } else if self.eat_ch_or_escaped('i')? { 107 | self.suffix_for_token("nally", RawToken::Keyword(RawKeyword::Finally)) 108 | } else if self.eat_ch_or_escaped('o')? { 109 | self.suffix_for_token("r", RawToken::Keyword(RawKeyword::For)) 110 | } else if self.eat_ch_or_escaped('u')? { 111 | self.suffix_for_token("nction", RawToken::Keyword(RawKeyword::Function)) 112 | } else { 113 | Ok(None) 114 | } 115 | } 116 | /// attempt to parse `if`, `implements`, `import`, `in`, `instanceof`, 117 | /// or `interface` 118 | fn i_keywords(&mut self) -> MaybeKeyword { 119 | if self.eat_ch_or_escaped('f')? && self.at_ident_end() { 120 | Ok(Some(RawToken::Keyword(RawKeyword::If))) 121 | } else if self.eat_ch_or_escaped('m')? && self.eat_ch_or_escaped('p')? { 122 | if self.eat_ch_or_escaped('l')? { 123 | self.suffix_for_token("ements", RawToken::Keyword(RawKeyword::Implements)) 124 | } else if self.eat_ch_or_escaped('o')? 
{ 125 | self.suffix_for_token("rt", RawToken::Keyword(RawKeyword::Import)) 126 | } else { 127 | Ok(None) 128 | } 129 | } else if self.eat_ch_or_escaped('n')? { 130 | if self.eat_ch_or_escaped('s')? { 131 | self.suffix_for_token("tanceof", RawToken::Keyword(RawKeyword::InstanceOf)) 132 | } else if self.eat_ch_or_escaped('t')? { 133 | self.suffix_for_token("erface", RawToken::Keyword(RawKeyword::Interface)) 134 | } else if self.at_ident_end() { 135 | Ok(Some(RawToken::Keyword(RawKeyword::In))) 136 | } else { 137 | Ok(None) 138 | } 139 | } else { 140 | Ok(None) 141 | } 142 | } 143 | /// attempt to parse `let` 144 | fn l_keywords(&mut self) -> MaybeKeyword { 145 | self.suffix_for_token("et", RawToken::Keyword(RawKeyword::Let)) 146 | } 147 | /// attempt to parse `new` or `null` 148 | fn n_keywords(&mut self) -> MaybeKeyword { 149 | if self.eat_ch_or_escaped('e')? { 150 | self.suffix_for_token("w", RawToken::Keyword(RawKeyword::New)) 151 | } else if self.eat_ch_or_escaped('u')? { 152 | self.suffix_for_token("ll", RawToken::Null) 153 | } else { 154 | Ok(None) 155 | } 156 | } 157 | /// attempt to parse `package`, `private`, `protected`, or 158 | /// `public`, 159 | fn p_keywords(&mut self) -> MaybeKeyword { 160 | if self.eat_ch_or_escaped('a')? { 161 | self.suffix_for_token("ckage", RawToken::Keyword(RawKeyword::Package)) 162 | } else if self.eat_ch_or_escaped('r')? { 163 | if self.eat_ch_or_escaped('i')? { 164 | self.suffix_for_token("vate", RawToken::Keyword(RawKeyword::Private)) 165 | } else if self.eat_ch_or_escaped('o')? { 166 | self.suffix_for_token("tected", RawToken::Keyword(RawKeyword::Protected)) 167 | } else { 168 | Ok(None) 169 | } 170 | } else if self.eat_ch_or_escaped('u')? 
{ 171 | self.suffix_for_token("blic", RawToken::Keyword(RawKeyword::Public)) 172 | } else { 173 | Ok(None) 174 | } 175 | } 176 | 177 | fn r_keywords(&mut self) -> MaybeKeyword { 178 | self.suffix_for_token("eturn", RawToken::Keyword(RawKeyword::Return)) 179 | } 180 | /// attempt to parse `static`, `super`, or `switch` 181 | fn s_keywords(&mut self) -> MaybeKeyword { 182 | if self.eat_ch_or_escaped('t')? { 183 | self.suffix_for_token("atic", RawToken::Keyword(RawKeyword::Static)) 184 | } else if self.eat_ch_or_escaped('u')? { 185 | self.suffix_for_token("per", RawToken::Keyword(RawKeyword::Super)) 186 | } else if self.eat_ch_or_escaped('w')? { 187 | self.suffix_for_token("itch", RawToken::Keyword(RawKeyword::Switch)) 188 | } else { 189 | Ok(None) 190 | } 191 | } 192 | /// attempt to parse `this`, `throw`, `true`, 193 | /// `try`, or `typeof` 194 | fn t_keywords(&mut self) -> MaybeKeyword { 195 | if self.eat_ch_or_escaped('h')? { 196 | if self.eat_ch_or_escaped('i')? { 197 | self.suffix_for_token("s", RawToken::Keyword(RawKeyword::This)) 198 | } else if self.eat_ch_or_escaped('r')? { 199 | self.suffix_for_token("ow", RawToken::Keyword(RawKeyword::Throw)) 200 | } else { 201 | Ok(None) 202 | } 203 | } else if self.eat_ch_or_escaped('r')? { 204 | if self.eat_ch_or_escaped('u')? { 205 | self.suffix_for_token("e", RawToken::Boolean(true)) 206 | } else if self.eat_ch_or_escaped('y')? && self.at_ident_end() { 207 | Ok(Some(RawToken::Keyword(RawKeyword::Try))) 208 | } else { 209 | Ok(None) 210 | } 211 | } else if self.eat_ch_or_escaped('y')? { 212 | self.suffix_for_token("peof", RawToken::Keyword(RawKeyword::TypeOf)) 213 | } else { 214 | Ok(None) 215 | } 216 | } 217 | /// ttempt to parse `var` or `void`, 218 | fn v_keywords(&mut self) -> MaybeKeyword { 219 | if self.eat_ch_or_escaped('a')? { 220 | self.suffix_for_token("r", RawToken::Keyword(RawKeyword::Var)) 221 | } else if self.eat_ch_or_escaped('o')? 
{ 222 | self.suffix_for_token("id", RawToken::Keyword(RawKeyword::Void)) 223 | } else { 224 | Ok(None) 225 | } 226 | } 227 | /// attempt to parse `while` or `with` 228 | fn w_keywords(&mut self) -> MaybeKeyword { 229 | if self.eat_ch_or_escaped('h')? { 230 | self.suffix_for_token("ile", RawToken::Keyword(RawKeyword::While)) 231 | } else if self.eat_ch_or_escaped('i')? { 232 | self.suffix_for_token("th", RawToken::Keyword(RawKeyword::With)) 233 | } else { 234 | Ok(None) 235 | } 236 | } 237 | /// attempt to parse `yield` 238 | fn y_keywords(&mut self) -> MaybeKeyword { 239 | self.suffix_for_token("ield", RawToken::Keyword(RawKeyword::Yield)) 240 | } 241 | /// This will attempt to consumer the suffix, if successful and 242 | /// the stream is at the end of an identifier, it will return 243 | /// the `tok` provided 244 | /// 245 | /// This is useful for when we have reached a leaf on a trie 246 | fn suffix_for_token(&mut self, suffix: &str, tok: RawToken) -> MaybeKeyword { 247 | if self.eat_chs_or_escaped(suffix)? 
{ 248 | if self.at_ident_end() { 249 | Ok(Some(tok)) 250 | } else { 251 | Ok(None) 252 | } 253 | } else { 254 | Ok(None) 255 | } 256 | } 257 | /// Test if the stream has moved past the end of an identifier 258 | fn at_ident_end(&mut self) -> bool { 259 | if self.look_ahead_matches(r"\u") { 260 | false 261 | } else if let Some(c) = self.stream.next_char() { 262 | if !Self::is_id_continue(c) && c != '\u{200C}' && c != '\u{200D}' { 263 | let _ = self.stream.prev_char(); 264 | true 265 | } else { 266 | false 267 | } 268 | } else { 269 | true 270 | } 271 | } 272 | /// If the characters in the provided &str matche the look ahead _bytes_ 273 | /// or a unicode escape of the characters, it will move 274 | /// the stream's index forward to the approrate position 275 | /// it will stop moving forward after at the first failed 276 | /// match (this means it will consume any leading positive matches) 277 | /// 278 | /// note: the character provided must be an ascii character 279 | /// to get a positive match 280 | fn eat_chs_or_escaped(&mut self, chars: &str) -> Res { 281 | for c in chars.chars() { 282 | if !self.eat_ch_or_escaped(c)? 
{
                return Ok(false);
            }
        }
        Ok(true)
    }
    /// If the character provided matches the look ahead _byte_
    /// or a unicode escape of the character, it will move
    /// the stream's index forward to the appropriate position
    ///
    /// note: the character provided must be an ascii character
    /// to get a positive match
    pub(crate) fn eat_ch_or_escaped(&mut self, ch: char) -> Res<bool> {
        debug_assert!(
            ch.len_utf8() == 1,
            "cannot use eat_ch_or_escaped with characters larger than 1 byte wide"
        );
        Ok(if self.look_ahead_byte_matches(ch) {
            // plain (unescaped) match, consume the single byte
            self.stream.skip_bytes(1);
            true
        } else if self.look_ahead_matches("\\u") {
            let start = self.stream.idx;
            // skip the slash only, `escaped_ident_part` consumes from the `u`
            self.stream.skip_bytes(1);
            let c = self.escaped_ident_part()?;
            if c != ch {
                // not the character we were looking for,
                // rewind to before the escape sequence
                self.stream.idx = start;
                false
            } else {
                true
            }
        } else {
            false
        })
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn keyword_await() {
        test_with_escapes("await", RawToken::Keyword(RawKeyword::Await));
    }

    #[test]
    fn keyword_break() {
        test_with_escapes("break", RawToken::Keyword(RawKeyword::Break));
    }

    #[test]
    fn keyword_case() {
        test_with_escapes("case", RawToken::Keyword(RawKeyword::Case));
    }

    #[test]
    fn keyword_catch() {
        test_with_escapes("catch", RawToken::Keyword(RawKeyword::Catch));
    }
    #[test]
    fn keyword_const() {
        test_with_escapes("const", RawToken::Keyword(RawKeyword::Const));
    }
    #[test]
    fn keyword_continue() {
        test_with_escapes("continue", RawToken::Keyword(RawKeyword::Continue));
    }
    #[test]
    fn keyword_class() {
        test_with_escapes("class", RawToken::Keyword(RawKeyword::Class));
    }
    #[test]
    fn keyword_debugger() {
        test_with_escapes("debugger", RawToken::Keyword(RawKeyword::Debugger));
    }
    #[test]
    fn keyword_default() {
        test_with_escapes("default", RawToken::Keyword(RawKeyword::Default));
    }
    #[test]
    fn keyword_delete() {
        test_with_escapes("delete", RawToken::Keyword(RawKeyword::Delete));
    }
    #[test]
    fn keyword_do() {
        test_with_escapes("do", RawToken::Keyword(RawKeyword::Do));
    }

    #[test]
    fn keyword_else() {
        test_with_escapes("else", RawToken::Keyword(RawKeyword::Else));
    }
    #[test]
    fn keyword_enum() {
        test_with_escapes("enum", RawToken::Keyword(RawKeyword::Enum));
    }
    #[test]
    fn keyword_export() {
        test_with_escapes("export", RawToken::Keyword(RawKeyword::Export));
    }
    #[test]
    fn keyword_extends() {
        test_with_escapes("extends", RawToken::Keyword(RawKeyword::Extends));
    }
    #[test]
    fn keyword_false() {
        test_with_escapes("false", RawToken::Boolean(false));
    }
    #[test]
    fn keyword_finally() {
        test_with_escapes("finally", RawToken::Keyword(RawKeyword::Finally));
    }
    #[test]
    fn keyword_for() {
        test_with_escapes("for", RawToken::Keyword(RawKeyword::For));
    }
    #[test]
    fn keyword_function() {
        test_with_escapes("function", RawToken::Keyword(RawKeyword::Function));
    }
    #[test]
    fn keyword_if() {
        test_with_escapes("if", RawToken::Keyword(RawKeyword::If));
    }
    #[test]
    fn keyword_implements() {
        test_with_escapes("implements", RawToken::Keyword(RawKeyword::Implements));
    }
    #[test]
    fn keyword_import() {
        test_with_escapes("import", RawToken::Keyword(RawKeyword::Import));
    }
    #[test]
    fn keyword_in() {
        test_with_escapes("in", RawToken::Keyword(RawKeyword::In));
    }
    #[test]
    fn keyword_instance_of() {
        test_with_escapes("instanceof", RawToken::Keyword(RawKeyword::InstanceOf));
    }
    #[test]
    fn keyword_interface() {
        test_with_escapes("interface", RawToken::Keyword(RawKeyword::Interface));
    }
    #[test]
    fn keyword_let() {
        test_with_escapes("let", RawToken::Keyword(RawKeyword::Let));
    }
    #[test]
    fn keyword_new() {
        test_with_escapes("new", RawToken::Keyword(RawKeyword::New));
    }
    #[test]
    fn keyword_null() {
        test_with_escapes("null", RawToken::Null);
    }
    #[test]
    fn keyword_package() {
        test_with_escapes("package", RawToken::Keyword(RawKeyword::Package));
    }
    #[test]
    fn keyword_private() {
        test_with_escapes("private", RawToken::Keyword(RawKeyword::Private));
    }
    #[test]
    fn keyword_protected() {
        test_with_escapes("protected", RawToken::Keyword(RawKeyword::Protected));
    }
    #[test]
    fn keyword_public() {
        test_with_escapes("public", RawToken::Keyword(RawKeyword::Public));
    }
    #[test]
    fn keyword_return() {
        test_with_escapes("return", RawToken::Keyword(RawKeyword::Return));
    }
    #[test]
    fn keyword_static() {
        test_with_escapes("static", RawToken::Keyword(RawKeyword::Static));
    }
    #[test]
    fn keyword_super() {
        test_with_escapes("super", RawToken::Keyword(RawKeyword::Super));
    }
    #[test]
    fn keyword_switch() {
        test_with_escapes("switch", RawToken::Keyword(RawKeyword::Switch));
    }
    #[test]
    fn keyword_this() {
        test_with_escapes("this", RawToken::Keyword(RawKeyword::This));
    }
    #[test]
    fn keyword_throw() {
        test_with_escapes("throw", RawToken::Keyword(RawKeyword::Throw));
    }
    #[test]
    fn keyword_true() {
        test_with_escapes("true", RawToken::Boolean(true));
    }
    #[test]
    fn keyword_try() {
        test_with_escapes("try", RawToken::Keyword(RawKeyword::Try));
    }
    #[test]
    fn keyword_type_of() {
        test_with_escapes("typeof", RawToken::Keyword(RawKeyword::TypeOf));
    }
    #[test]
    fn keyword_var() {
        test_with_escapes("var", RawToken::Keyword(RawKeyword::Var));
    }
    #[test]
    fn keyword_void() {
        test_with_escapes("void", RawToken::Keyword(RawKeyword::Void));
    }
    #[test]
    fn keyword_while() {
        test_with_escapes("while", RawToken::Keyword(RawKeyword::While));
    }
    #[test]
    fn keyword_with() {
        test_with_escapes("with", RawToken::Keyword(RawKeyword::With));
    }
    #[test]
    fn keyword_yield() {
        test_with_escapes("yield", RawToken::Keyword(RawKeyword::Yield));
    }

    /// Tokenize `k` in its plain form and then with each character
    /// replaced (one position at a time) by both the `\uXXXX` and the
    /// `\u{XXXXXX}` escape forms, asserting every variant yields
    /// `expect`. Finally assert that `{k}_not` is *not* a keyword.
    fn test_with_escapes(k: &str, expect: RawToken) {
        let start = k.chars().next().expect("empty keyword");
        let first = test_keyword(start, k)
            .unwrap_or_else(|e| panic!("failed to parse {}: {}", k, e))
            .unwrap_or_else(|| panic!("failed to parse {}", k));
        assert_eq!(first, expect);
        let mut escape_char_code;
        let mut escape_code_points;
        for i in 0..k.chars().count() {
            escape_char_code = String::new();
            escape_code_points = String::new();
            for (j, c) in k.chars().enumerate() {
                if j == i {
                    // keywords are all ascii so `c as u8` is lossless here
                    escape_char_code.push_str(&format!(r#"\u{:04X}"#, c as u8));
                    escape_code_points.push_str(&format!(r#"\u{{{:06X}}}"#, c as u8));
                } else {
                    escape_char_code.push(c);
                    escape_code_points.push(c);
                }
            }
            let second = test_keyword(start, &escape_char_code)
                .unwrap_or_else(|e| {
                    panic!(
                        "failed to parse escaped keyword {}: {}",
                        escape_char_code, e
                    )
                })
                .unwrap_or_else(|| panic!("failed to parse escaped keyword {}", escape_char_code));
            assert_eq!(
                second, expect,
                "{} doesn't match expected keyword",
                escape_char_code
            );
            let third = test_keyword(start, &escape_code_points)
                .unwrap_or_else(|e| {
                    panic!(
                        "failed to parse escaped keyword {}: {}",
                        escape_code_points, e
                    )
                })
                .unwrap_or_else(|| {
                    panic!("failed to parse escaped keyword {}", escape_code_points)
                });
            assert_eq!(
                third, expect,
                "{} doesn't match expected keyword",
                escape_code_points
            );
        }
        let not = format!("{}_not", k);
        assert_eq!(
            test_keyword(start, &not)
                .unwrap_or_else(|e| panic!("Failed to parse not keyword {}: {}", not, e)),
            None
        );
    }

    /// Drive the tokenizer far enough into `k` to produce a keyword
    /// token, starting with its first (possibly escaped) character.
    fn test_keyword(start: char, k: &str) -> MaybeKeyword {
        let mut t = Tokenizer::new(k);
        assert!(
            t.eat_ch_or_escaped(start)?,
            "start didn't match first character {}, {}",
            start,
            k
        );
        t.keyword(start)
    }
}

// ---------------------------------------------------------------------------
// src/tokenizer/tokens.rs
// ---------------------------------------------------------------------------
use crate::tokens::{CommentKind, Keyword, NumberKind, Punct};

#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub enum RawToken {
    /// `true` or `false`
    Boolean(bool),
    /// The end of the file
    EoF,
    /// An identifier this will be either a variable name
    /// or a function/method name
    Ident,
    /// A word that has been reserved to not be used as an identifier
    Keyword(RawKeyword),
    /// A `null` literal value
    Null,
    /// A number, this includes integers (`1`), decimals (`0.1`),
    /// hex (`0x8f`), binary (`0b010011010`), and octal (`0o273`)
    Number(NumberKind),
    /// A punctuation mark, this includes all mathematical operators
    /// logical operators and general syntax punctuation
    Punct(Punct),
    /// A string literal, either double or single quoted, the associated
    /// value will be the unquoted string
    String {
        kind: StringKind,
        new_line_count: usize,
        last_len:
usize,
        found_octal_escape: bool,
    },
    /// A regular expression literal.
    /// ```js
    /// let regex = /[a-zA-Z]+/g;
    /// ```
    RegEx(usize),
    /// The string parts of a template string
    /// ```js
    /// `things and stuff times ${10}`
    /// // ^^^^^^^^^^^^^^^^^^^^^^ ^
    /// ```
    Template {
        kind: TemplateKind,
        new_line_count: usize,
        last_len: usize,
        has_octal_escape: bool,
        found_invalid_unicode_escape: bool,
        found_invalid_hex_escape: bool,
    },
    /// A comment, the associated value will contain the raw comment
    /// This will capture both inline comments `// I am an inline comment`
    /// and multi-line comments
    /// ```js
    /// /*multi lines
    /// * comments
    /// */
    /// ```
    Comment {
        kind: CommentKind,
        new_line_count: usize,
        last_len: usize,
        end_index: usize,
    },
}

// `Keyword<()>` carries no data, so a bitwise copy is always valid
impl Copy for Keyword<()> {}

impl RawToken {
    /// Is this token any punctuation mark?
    pub fn is_punct(&self) -> bool {
        matches!(self, RawToken::Punct(_))
    }

    /// Is this token a comment (any kind)?
    pub fn is_comment(&self) -> bool {
        matches!(self, RawToken::Comment { .. })
    }
    /// Is this token `/` or `/=`? These are the puncts that matter
    /// when deciding between division and a regex literal.
    pub fn is_div_punct(&self) -> bool {
        matches!(
            self,
            RawToken::Punct(Punct::ForwardSlash | Punct::ForwardSlashEqual)
        )
    }
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum StringKind {
    Double,
    Single,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TemplateKind {
    NoSub,
    Head,
    Body,
    Tail,
}
/// A keyword token with no associated source text, the data-free
/// counterpart of [`crate::tokens::Keyword`]
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub enum RawKeyword {
    Await,
    Break,
    Case,
    Catch,
    Class,
    Const,
    Continue,
    Debugger,
    Default,
    Delete,
    Do,
    Else,
    Enum,
    Export,
    Extends,
    Finally,
    For,
    Function,
    If,
    Implements,
    Import,
    In,
    InstanceOf,
    Interface,
    Let,
    New,
    Package,
    Private,
    Protected,
    Public,
    Return,
    Static,
    Super,
    Switch,
    This,
    Throw,
    Try,
    TypeOf,
    Var,
    Void,
    While,
    With,
    Yield,
}

impl RawKeyword {
    /// Pair this raw keyword back up with its source text slice
    pub fn with_str(self, s: &str) -> crate::tokens::Keyword<&str> {
        match self {
            RawKeyword::Await => Keyword::Await(s),
            RawKeyword::Break => Keyword::Break(s),
            RawKeyword::Case => Keyword::Case(s),
            RawKeyword::Catch => Keyword::Catch(s),
            RawKeyword::Class => Keyword::Class(s),
            RawKeyword::Const => Keyword::Const(s),
            RawKeyword::Continue => Keyword::Continue(s),
            RawKeyword::Debugger => Keyword::Debugger(s),
            RawKeyword::Default => Keyword::Default(s),
            RawKeyword::Delete => Keyword::Delete(s),
            RawKeyword::Do => Keyword::Do(s),
            RawKeyword::Else => Keyword::Else(s),
            RawKeyword::Enum => Keyword::Enum(s),
            RawKeyword::Export => Keyword::Export(s),
            RawKeyword::Extends => Keyword::Extends(s),
            RawKeyword::Finally => Keyword::Finally(s),
            RawKeyword::For => Keyword::For(s),
            RawKeyword::Function => Keyword::Function(s),
            RawKeyword::If => Keyword::If(s),
            RawKeyword::Implements => Keyword::Implements(s),
            RawKeyword::Import => Keyword::Import(s),
            RawKeyword::In => Keyword::In(s),
            RawKeyword::InstanceOf => Keyword::InstanceOf(s),
            RawKeyword::Interface => Keyword::Interface(s),
            RawKeyword::Let => Keyword::Let(s),
            RawKeyword::New => Keyword::New(s),
            RawKeyword::Package => Keyword::Package(s),
            RawKeyword::Private => Keyword::Private(s),
            RawKeyword::Protected => Keyword::Protected(s),
            RawKeyword::Public => Keyword::Public(s),
            RawKeyword::Return => Keyword::Return(s),
            RawKeyword::Static => Keyword::Static(s),
            RawKeyword::Super => Keyword::Super(s),
            RawKeyword::Switch => Keyword::Switch(s),
            RawKeyword::This => Keyword::This(s),
            RawKeyword::Throw => Keyword::Throw(s),
            RawKeyword::Try => Keyword::Try(s),
            RawKeyword::TypeOf => Keyword::TypeOf(s),
            RawKeyword::Var => Keyword::Var(s),
            RawKeyword::Void => Keyword::Void(s),
            RawKeyword::While => Keyword::While(s),
            RawKeyword::With => Keyword::With(s),
            RawKeyword::Yield => Keyword::Yield(s),
        }
    }
}

impl<T> From<&Keyword<T>> for RawKeyword {
    fn from(k: &Keyword<T>) -> Self {
        match k {
            Keyword::Await(_) => RawKeyword::Await,
            Keyword::Break(_) => RawKeyword::Break,
            Keyword::Case(_) => RawKeyword::Case,
            Keyword::Catch(_) => RawKeyword::Catch,
            Keyword::Class(_) => RawKeyword::Class,
            Keyword::Const(_) => RawKeyword::Const,
            Keyword::Continue(_) => RawKeyword::Continue,
            Keyword::Debugger(_) => RawKeyword::Debugger,
            Keyword::Default(_) => RawKeyword::Default,
            Keyword::Delete(_) => RawKeyword::Delete,
            Keyword::Do(_) => RawKeyword::Do,
            Keyword::Else(_) => RawKeyword::Else,
            Keyword::Enum(_) => RawKeyword::Enum,
            Keyword::Export(_) => RawKeyword::Export,
            Keyword::Extends(_) => RawKeyword::Extends,
            Keyword::Finally(_) => RawKeyword::Finally,
            Keyword::For(_) => RawKeyword::For,
            Keyword::Function(_) => RawKeyword::Function,
            Keyword::If(_) => RawKeyword::If,
            Keyword::Implements(_) => RawKeyword::Implements,
            Keyword::Import(_) => RawKeyword::Import,
            Keyword::In(_) => RawKeyword::In,
            Keyword::InstanceOf(_) => RawKeyword::InstanceOf,
            Keyword::Interface(_) => RawKeyword::Interface,
            Keyword::Let(_) => RawKeyword::Let,
            Keyword::New(_) => RawKeyword::New,
            Keyword::Package(_) => RawKeyword::Package,
            Keyword::Private(_) => RawKeyword::Private,
            Keyword::Protected(_) => RawKeyword::Protected,
            Keyword::Public(_) => RawKeyword::Public,
            Keyword::Return(_) => RawKeyword::Return,
            Keyword::Static(_) => RawKeyword::Static,
            Keyword::Super(_) => RawKeyword::Super,
            Keyword::Switch(_) => RawKeyword::Switch,
            Keyword::This(_) => RawKeyword::This,
            Keyword::Throw(_) => RawKeyword::Throw,
            Keyword::Try(_) => RawKeyword::Try,
            Keyword::TypeOf(_) => RawKeyword::TypeOf,
            Keyword::Var(_) => RawKeyword::Var,
            Keyword::Void(_) => RawKeyword::Void,
            Keyword::While(_) => RawKeyword::While,
            Keyword::With(_) => RawKeyword::With,
            Keyword::Yield(_) => RawKeyword::Yield,
        }
    }
}

// ---------------------------------------------------------------------------
// src/tokenizer/unicode.rs
// ---------------------------------------------------------------------------
#![allow(clippy::all)]
use unicode_xid::UnicodeXID;

/// wrap the `unicode_xid` crate's `is_xid_start`,
/// first short-circuiting around the ascii
/// and other non `CJK` characters
#[inline]
pub(crate) fn is_id_start(c: char) -> bool {
    if c >= 'a' && c <= 'z' {
        true
    } else if c >= 'A' && c <= 'Z' {
        true
    } else if c == '\\' || c == '_' || c == '$' {
        true
    }
else if c < '\u{AA}' {
        // everything below U+00AA that wasn't caught above is not a start
        false
    } else if c == '\u{2118}'
        || c == '\u{212E}'
        || c == '\u{309B}'
        || c == '\u{309C}'
        || c == '\u{1885}'
        || c == '\u{1886}'
    {
        // the ecma262 Other_ID_Start exceptions
        true
    } else {
        UnicodeXID::is_xid_start(c)
    }
}
/// wrap the `unicode_xid` crate's `is_xid_continue`,
/// first short-circuiting around the ascii
/// and other non `CJK` characters
#[inline]
pub(crate) fn is_id_continue(c: char) -> bool {
    if c >= 'a' && c <= 'z' {
        true
    } else if c >= 'A' && c <= 'Z' {
        true
    } else if c >= '0' && c <= '9' {
        true
    } else if c == '\\' || c == '_' || c == '$' {
        true
    } else if c < '\u{AA}' {
        false
    } else if c == '\u{200C}'
        || c == '\u{200D}'
        || c == '\u{2118}'
        || c == '\u{212E}'
        || c == '\u{309B}'
        || c == '\u{309C}'
        || c == '\u{1885}'
        || c == '\u{1886}'
        || c == '\u{1369}'
        || c == '\u{136A}'
        || c == '\u{136B}'
        || c == '\u{136C}'
        || c == '\u{136D}'
        || c == '\u{136E}'
        || c == '\u{136F}'
        || c == '\u{1370}'
        || c == '\u{1371}'
        || c == '\u{B7}'
        || c == '\u{387}'
        || c == '\u{19DA}'
    {
        // ZWNJ/ZWJ plus the Other_ID_Start/Other_ID_Continue exceptions
        true
    } else {
        UnicodeXID::is_xid_continue(c)
    }
}

// ---------------------------------------------------------------------------
// src/tokens/boolean.rs
// ---------------------------------------------------------------------------
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
/// The tokenized representation of `true` or `false`
pub enum Boolean {
    True,
    False,
}
impl PartialEq<bool> for Boolean {
    fn eq(&self, other: &bool) -> bool {
        matches!(
            (self, other),
            (Boolean::True, true) | (Boolean::False, false)
        )
    }
}
impl PartialEq<str> for Boolean {
    fn eq(&self, other: &str) -> bool {
        matches!(
            (self, other),
            (Boolean::True, "true") | (Boolean::False, "false")
        )
    }
}
impl Boolean {
    /// Test if this instance represents `true`
    pub fn is_true(self) -> bool {
        matches!(self, Boolean::True)
    }
}

impl Boolean {
    /// Create a Boolean from raw text, `None` for anything
    /// other than exactly `"true"` or `"false"`
    pub fn from(s: &str) -> Option<Boolean> {
        if s == "true" {
            Some(Boolean::True)
        } else if s == "false" {
            Some(Boolean::False)
        } else {
            None
        }
    }
}

impl From<bool> for Boolean {
    /// Creates a JS Bool for a rust bool
    fn from(b: bool) -> Self {
        if b {
            Boolean::True
        } else {
            Boolean::False
        }
    }
}

impl From<Boolean> for String {
    /// Return this Boolean to the text
    /// that was parsed to create it
    fn from(b: Boolean) -> String {
        match b {
            Boolean::True => "true".into(),
            Boolean::False => "false".into(),
        }
    }
}

impl ToString for Boolean {
    /// Return this Boolean to the text
    /// that was parsed to create it
    fn to_string(&self) -> String {
        match self {
            Boolean::True => "true".into(),
            Boolean::False => "false".into(),
        }
    }
}

impl From<Boolean> for bool {
    /// Creates a Rust bool for a js bool
    fn from(b: Boolean) -> bool {
        match b {
            Boolean::True => true,
            Boolean::False => false,
        }
    }
}

impl From<&Boolean> for bool {
    /// Creates a rust bool for a borrowed js bool
    fn from(b: &Boolean) -> bool {
        match b {
            Boolean::True => true,
            Boolean::False => false,
        }
    }
}

// ---------------------------------------------------------------------------
// src/tokens/comment.rs
// ---------------------------------------------------------------------------
#[derive(Debug, PartialEq, Eq, Clone)]
/// A comment, effectively should be treated
/// as white space. There are 3 kinds of comments
/// according to the specification.
///
/// - Single line comments: `//comment`
/// - Multi line comments: `/* comment */`
/// - HTML comments: `<!-- comment -->` plus more!
pub struct Comment<T> {
    pub kind: CommentKind,
    pub content: T,
    // only used by HTML comments: any trailing text after `-->`
    pub tail_content: Option<T>,
}

impl<T> Comment<T> {
    pub fn from_parts(content: T, kind: CommentKind, tail_content: Option<T>) -> Self {
        Comment {
            content,
            kind,
            tail_content,
        }
    }
    pub fn new_single_line(content: T) -> Self {
        Comment::from_parts(content, CommentKind::Single, None)
    }

    pub fn new_multi_line(content: T) -> Self {
        Comment::from_parts(content, CommentKind::Multi, None)
    }

    pub fn new_html(content: T, tail_content: Option<T>) -> Self {
        Comment::from_parts(content, CommentKind::Html, tail_content)
    }

    pub fn new_html_no_tail(content: T) -> Self {
        Comment::new_html(content, None)
    }

    pub fn new_html_with_tail(content: T, tail: T) -> Self {
        Comment::new_html(content, Some(tail))
    }

    pub fn new_hashbang(content: T) -> Self {
        Comment::from_parts(content, CommentKind::Hashbang, None)
    }
    pub fn is_multi_line(&self) -> bool {
        self.kind == CommentKind::Multi
    }

    pub fn is_single_line(&self) -> bool {
        self.kind == CommentKind::Single
    }

    pub fn is_html(&self) -> bool {
        self.kind == CommentKind::Html
    }

    pub fn is_hashbang(&self) -> bool {
        self.kind == CommentKind::Hashbang
    }
}

impl<T> ToString for Comment<T>
where
    T: AsRef<str>,
{
    /// Re-wrap the content in the delimiters it was parsed from
    fn to_string(&self) -> String {
        match self.kind {
            CommentKind::Single => format!("//{}", self.content.as_ref()),
            CommentKind::Multi => format!("/*{}*/", self.content.as_ref()),
            CommentKind::Html => format!("<!--{}-->", self.content.as_ref()),
            CommentKind::Hashbang => format!("#!{}", self.content.as_ref()),
        }
    }
}

#[derive(Debug, PartialEq, Eq, Clone, Copy)]
/// The 4 kinds of comments
pub enum CommentKind {
    Single,
    Multi,
    Html,
    Hashbang,
}

// ---------------------------------------------------------------------------
// src/tokens/ident.rs
// ---------------------------------------------------------------------------
#[derive(Debug, PartialEq, Eq, Clone)]
/// An identifier
pub struct Ident<T>(T);

impl<T> AsRef<str> for Ident<T>
where
    T: AsRef<str>,
{
    fn as_ref(&self) -> &str {
        self.0.as_ref()
    }
}

impl<T> PartialEq<str> for &Ident<T>
where
    T: AsRef<str>,
{
    fn eq(&self, other: &str) -> bool {
        self.0.as_ref().eq(other)
    }
}

impl<'a> From<&'a str> for Ident<&'a str> {
    fn from(s: &'a str) -> Self {
        Ident(s)
    }
}

impl<T> ToString for Ident<T>
where
    T: AsRef<str>,
{
    fn to_string(&self) -> String {
        self.0.as_ref().to_string()
    }
}

impl<T> From<Ident<T>> for String
where
    T: ToString,
{
    fn from(id: Ident<T>) -> Self {
        id.0.to_string()
    }
}

// ---------------------------------------------------------------------------
// src/tokens/keyword.rs
// ---------------------------------------------------------------------------
#[derive(Debug)]
/// A JS Keyword
///
/// # Standard
/// await
/// break
/// case
/// catch
/// class
/// const
/// continue
/// debugger
/// default
/// delete (10)
/// do
/// else
/// export
/// extends
/// finally
/// for
/// function
/// if
/// import
/// in (20)
/// instanceof
/// new
/// return
/// super
/// switch
/// this
/// throw
/// try
/// typeof
/// var (30)
/// void
/// while
/// with
/// yield
/// # Future Reserved
/// enum
/// # Strict Mode Future Reserved
/// implements
/// package
/// protected
/// interface 46 | /// private (40) 47 | /// public 48 | pub enum Keyword { 49 | Await(T), 50 | Break(T), 51 | Case(T), 52 | Catch(T), 53 | Class(T), 54 | Const(T), 55 | Continue(T), 56 | Debugger(T), 57 | Default(T), 58 | Delete(T), 59 | Do(T), 60 | Else(T), 61 | Enum(T), 62 | Export(T), 63 | Extends(T), 64 | Finally(T), 65 | For(T), 66 | Function(T), 67 | If(T), 68 | Implements(T), 69 | Import(T), 70 | In(T), 71 | InstanceOf(T), 72 | Interface(T), 73 | Let(T), 74 | New(T), 75 | Package(T), 76 | Private(T), 77 | Protected(T), 78 | Public(T), 79 | Return(T), 80 | Static(T), 81 | Super(T), 82 | Switch(T), 83 | This(T), 84 | Throw(T), 85 | Try(T), 86 | TypeOf(T), 87 | Var(T), 88 | Void(T), 89 | While(T), 90 | With(T), 91 | Yield(T), 92 | } 93 | 94 | impl Clone for Keyword 95 | where 96 | T: Clone, 97 | { 98 | fn clone(&self) -> Self { 99 | match self { 100 | Self::Await(i) => Self::Await(i.clone()), 101 | Self::Break(i) => Self::Break(i.clone()), 102 | Self::Case(i) => Self::Case(i.clone()), 103 | Self::Catch(i) => Self::Catch(i.clone()), 104 | Self::Class(i) => Self::Class(i.clone()), 105 | Self::Const(i) => Self::Const(i.clone()), 106 | Self::Continue(i) => Self::Continue(i.clone()), 107 | Self::Debugger(i) => Self::Debugger(i.clone()), 108 | Self::Default(i) => Self::Default(i.clone()), 109 | Self::Delete(i) => Self::Delete(i.clone()), 110 | Self::Do(i) => Self::Do(i.clone()), 111 | Self::Else(i) => Self::Else(i.clone()), 112 | Self::Enum(i) => Self::Enum(i.clone()), 113 | Self::Export(i) => Self::Export(i.clone()), 114 | Self::Extends(i) => Self::Extends(i.clone()), 115 | Self::Finally(i) => Self::Finally(i.clone()), 116 | Self::For(i) => Self::For(i.clone()), 117 | Self::Function(i) => Self::Function(i.clone()), 118 | Self::If(i) => Self::If(i.clone()), 119 | Self::Implements(i) => Self::Implements(i.clone()), 120 | Self::Import(i) => Self::Import(i.clone()), 121 | Self::In(i) => Self::In(i.clone()), 122 | Self::InstanceOf(i) => Self::InstanceOf(i.clone()), 123 
| Self::Interface(i) => Self::Interface(i.clone()), 124 | Self::Let(i) => Self::Let(i.clone()), 125 | Self::New(i) => Self::New(i.clone()), 126 | Self::Package(i) => Self::Package(i.clone()), 127 | Self::Private(i) => Self::Private(i.clone()), 128 | Self::Protected(i) => Self::Protected(i.clone()), 129 | Self::Public(i) => Self::Public(i.clone()), 130 | Self::Return(i) => Self::Return(i.clone()), 131 | Self::Static(i) => Self::Static(i.clone()), 132 | Self::Super(i) => Self::Super(i.clone()), 133 | Self::Switch(i) => Self::Switch(i.clone()), 134 | Self::This(i) => Self::This(i.clone()), 135 | Self::Throw(i) => Self::Throw(i.clone()), 136 | Self::Try(i) => Self::Try(i.clone()), 137 | Self::TypeOf(i) => Self::TypeOf(i.clone()), 138 | Self::Var(i) => Self::Var(i.clone()), 139 | Self::Void(i) => Self::Void(i.clone()), 140 | Self::While(i) => Self::While(i.clone()), 141 | Self::With(i) => Self::With(i.clone()), 142 | Self::Yield(i) => Self::Yield(i.clone()), 143 | } 144 | } 145 | } 146 | 147 | impl PartialEq> for Keyword { 148 | fn eq(&self, other: &Keyword) -> bool { 149 | use Keyword::*; 150 | matches!( 151 | (self, other), 152 | (Await(_), Await(_)) 153 | | (Break(_), Break(_)) 154 | | (Case(_), Case(_)) 155 | | (Catch(_), Catch(_)) 156 | | (Class(_), Class(_)) 157 | | (Const(_), Const(_)) 158 | | (Continue(_), Continue(_)) 159 | | (Debugger(_), Debugger(_)) 160 | | (Default(_), Default(_)) 161 | | (Delete(_), Delete(_)) 162 | | (Do(_), Do(_)) 163 | | (Else(_), Else(_)) 164 | | (Enum(_), Enum(_)) 165 | | (Export(_), Export(_)) 166 | | (Extends(_), Extends(_)) 167 | | (Finally(_), Finally(_)) 168 | | (For(_), For(_)) 169 | | (Function(_), Function(_)) 170 | | (If(_), If(_)) 171 | | (Implements(_), Implements(_)) 172 | | (Import(_), Import(_)) 173 | | (In(_), In(_)) 174 | | (InstanceOf(_), InstanceOf(_)) 175 | | (Interface(_), Interface(_)) 176 | | (Let(_), Let(_)) 177 | | (New(_), New(_)) 178 | | (Package(_), Package(_)) 179 | | (Private(_), Private(_)) 180 | | 
(Protected(_), Protected(_)) 181 | | (Public(_), Public(_)) 182 | | (Return(_), Return(_)) 183 | | (Static(_), Static(_)) 184 | | (Super(_), Super(_)) 185 | | (Switch(_), Switch(_)) 186 | | (This(_), This(_)) 187 | | (Throw(_), Throw(_)) 188 | | (Try(_), Try(_)) 189 | | (TypeOf(_), TypeOf(_)) 190 | | (Var(_), Var(_)) 191 | | (Void(_), Void(_)) 192 | | (While(_), While(_)) 193 | | (With(_), With(_)) 194 | | (Yield(_), Yield(_)) 195 | ) 196 | } 197 | } 198 | 199 | impl Keyword<()> { 200 | pub fn with_str(self, s: &str) -> Keyword<&str> { 201 | match self { 202 | Keyword::Await(_) => Keyword::Await(s), 203 | Keyword::Break(_) => Keyword::Break(s), 204 | Keyword::Case(_) => Keyword::Case(s), 205 | Keyword::Catch(_) => Keyword::Catch(s), 206 | Keyword::Class(_) => Keyword::Class(s), 207 | Keyword::Const(_) => Keyword::Const(s), 208 | Keyword::Continue(_) => Keyword::Continue(s), 209 | Keyword::Debugger(_) => Keyword::Debugger(s), 210 | Keyword::Default(_) => Keyword::Default(s), 211 | Keyword::Delete(_) => Keyword::Delete(s), 212 | Keyword::Do(_) => Keyword::Do(s), 213 | Keyword::Else(_) => Keyword::Else(s), 214 | Keyword::Enum(_) => Keyword::Enum(s), 215 | Keyword::Export(_) => Keyword::Export(s), 216 | Keyword::Extends(_) => Keyword::Extends(s), 217 | Keyword::Finally(_) => Keyword::Finally(s), 218 | Keyword::For(_) => Keyword::For(s), 219 | Keyword::Function(_) => Keyword::Function(s), 220 | Keyword::If(_) => Keyword::If(s), 221 | Keyword::Implements(_) => Keyword::Implements(s), 222 | Keyword::Import(_) => Keyword::Import(s), 223 | Keyword::In(_) => Keyword::In(s), 224 | Keyword::InstanceOf(_) => Keyword::InstanceOf(s), 225 | Keyword::Interface(_) => Keyword::Interface(s), 226 | Keyword::Let(_) => Keyword::Let(s), 227 | Keyword::New(_) => Keyword::New(s), 228 | Keyword::Package(_) => Keyword::Package(s), 229 | Keyword::Private(_) => Keyword::Private(s), 230 | Keyword::Protected(_) => Keyword::Protected(s), 231 | Keyword::Public(_) => Keyword::Public(s), 232 | 
Keyword::Return(_) => Keyword::Return(s), 233 | Keyword::Static(_) => Keyword::Static(s), 234 | Keyword::Super(_) => Keyword::Super(s), 235 | Keyword::Switch(_) => Keyword::Switch(s), 236 | Keyword::This(_) => Keyword::This(s), 237 | Keyword::Throw(_) => Keyword::Throw(s), 238 | Keyword::Try(_) => Keyword::Try(s), 239 | Keyword::TypeOf(_) => Keyword::TypeOf(s), 240 | Keyword::Var(_) => Keyword::Var(s), 241 | Keyword::Void(_) => Keyword::Void(s), 242 | Keyword::While(_) => Keyword::While(s), 243 | Keyword::With(_) => Keyword::With(s), 244 | Keyword::Yield(_) => Keyword::Yield(s), 245 | } 246 | } 247 | } 248 | 249 | impl ToString for Keyword { 250 | /// Convert a keyword into a string 251 | fn to_string(&self) -> String { 252 | self.as_str().into() 253 | } 254 | } 255 | 256 | impl PartialEq for Keyword { 257 | fn eq(&self, other: &str) -> bool { 258 | self.as_str() == other 259 | } 260 | } 261 | 262 | impl Keyword { 263 | /// Is this keyword one of the future reserved words 264 | /// 265 | /// - enum 266 | /// - export 267 | /// - implements 268 | /// - super 269 | pub fn is_future_reserved(&self) -> bool { 270 | matches!( 271 | self, 272 | Keyword::Enum(_) | Keyword::Export(_) | Keyword::Implements(_) | Keyword::Super(_) 273 | ) 274 | } 275 | /// Is this keyword a reserved word when the context 276 | /// has a 'use strict' directive. 
277 | /// 278 | /// ## Keywords 279 | /// - implements 280 | /// - interface 281 | /// - package 282 | /// - private 283 | /// - protected 284 | /// - public 285 | /// - static 286 | /// - yield 287 | /// - let 288 | pub fn is_strict_reserved(&self) -> bool { 289 | matches!( 290 | self, 291 | Keyword::Implements(_) 292 | | Keyword::Interface(_) 293 | | Keyword::Package(_) 294 | | Keyword::Private(_) 295 | | Keyword::Protected(_) 296 | | Keyword::Public(_) 297 | | Keyword::Static(_) 298 | | Keyword::Yield(_) 299 | | Keyword::Let(_) 300 | ) 301 | } 302 | /// Is this keyword a reserved word 303 | /// 304 | /// ## Keywords 305 | /// - break 306 | /// - case 307 | /// - catch 308 | /// - continue 309 | /// - debugger 310 | /// - default 311 | /// - delete 312 | /// - do 313 | /// - else 314 | /// - for 315 | /// - function 316 | /// - if 317 | /// - instanceof 318 | /// - in 319 | /// - new 320 | /// - return 321 | /// - switch 322 | /// - this 323 | /// - throw 324 | /// - try 325 | /// - typeof 326 | /// - var 327 | /// - void 328 | /// - while 329 | /// - with 330 | pub fn is_reserved(&self) -> bool { 331 | matches!( 332 | self, 333 | Keyword::Break(_) 334 | | Keyword::Case(_) 335 | | Keyword::Catch(_) 336 | | Keyword::Class(_) 337 | | Keyword::Continue(_) 338 | | Keyword::Debugger(_) 339 | | Keyword::Default(_) 340 | | Keyword::Delete(_) 341 | | Keyword::Do(_) 342 | | Keyword::Else(_) 343 | | Keyword::Export(_) 344 | | Keyword::Extends(_) 345 | | Keyword::Finally(_) 346 | | Keyword::For(_) 347 | | Keyword::Function(_) 348 | | Keyword::If(_) 349 | | Keyword::Import(_) 350 | | Keyword::In(_) 351 | | Keyword::InstanceOf(_) 352 | | Keyword::New(_) 353 | | Keyword::Return(_) 354 | | Keyword::Switch(_) 355 | | Keyword::Super(_) 356 | | Keyword::This(_) 357 | | Keyword::Throw(_) 358 | | Keyword::Try(_) 359 | | Keyword::TypeOf(_) 360 | | Keyword::Var(_) 361 | | Keyword::Void(_) 362 | | Keyword::While(_) 363 | | Keyword::With(_) 364 | ) 365 | } 366 | 367 | pub fn 
as_str(&self) -> &str { 368 | match self { 369 | Keyword::Await(_) => "await", 370 | Keyword::Break(_) => "break", 371 | Keyword::Case(_) => "case", 372 | Keyword::Catch(_) => "catch", 373 | Keyword::Class(_) => "class", 374 | Keyword::Const(_) => "const", 375 | Keyword::Continue(_) => "continue", 376 | Keyword::Debugger(_) => "debugger", 377 | Keyword::Default(_) => "default", 378 | Keyword::Import(_) => "import", 379 | Keyword::Delete(_) => "delete", 380 | Keyword::Do(_) => "do", 381 | Keyword::Else(_) => "else", 382 | Keyword::Enum(_) => "enum", 383 | Keyword::Export(_) => "export", 384 | Keyword::Extends(_) => "extends", 385 | Keyword::Finally(_) => "finally", 386 | Keyword::For(_) => "for", 387 | Keyword::Function(_) => "function", 388 | Keyword::If(_) => "if", 389 | Keyword::In(_) => "in", 390 | Keyword::Implements(_) => "implements", 391 | Keyword::InstanceOf(_) => "instanceof", 392 | Keyword::Interface(_) => "interface", 393 | Keyword::Let(_) => "let", 394 | Keyword::New(_) => "new", 395 | Keyword::Package(_) => "package", 396 | Keyword::Private(_) => "private", 397 | Keyword::Protected(_) => "protected", 398 | Keyword::Public(_) => "public", 399 | Keyword::Static(_) => "static", 400 | Keyword::Return(_) => "return", 401 | Keyword::Super(_) => "super", 402 | Keyword::Switch(_) => "switch", 403 | Keyword::This(_) => "this", 404 | Keyword::Throw(_) => "throw", 405 | Keyword::Try(_) => "try", 406 | Keyword::TypeOf(_) => "typeof", 407 | Keyword::Var(_) => "var", 408 | Keyword::Void(_) => "void", 409 | Keyword::While(_) => "while", 410 | Keyword::With(_) => "with", 411 | Keyword::Yield(_) => "yield", 412 | } 413 | } 414 | 415 | pub fn to_empty(&self) -> Keyword<()> { 416 | match self { 417 | Keyword::Await(_) => Keyword::Await(()), 418 | Keyword::Break(_) => Keyword::Break(()), 419 | Keyword::Case(_) => Keyword::Case(()), 420 | Keyword::Catch(_) => Keyword::Catch(()), 421 | Keyword::Class(_) => Keyword::Class(()), 422 | Keyword::Const(_) => Keyword::Const(()), 
423 | Keyword::Continue(_) => Keyword::Continue(()), 424 | Keyword::Debugger(_) => Keyword::Debugger(()), 425 | Keyword::Default(_) => Keyword::Default(()), 426 | Keyword::Import(_) => Keyword::Import(()), 427 | Keyword::Delete(_) => Keyword::Delete(()), 428 | Keyword::Do(_) => Keyword::Do(()), 429 | Keyword::Else(_) => Keyword::Else(()), 430 | Keyword::Enum(_) => Keyword::Enum(()), 431 | Keyword::Export(_) => Keyword::Export(()), 432 | Keyword::Extends(_) => Keyword::Extends(()), 433 | Keyword::Finally(_) => Keyword::Finally(()), 434 | Keyword::For(_) => Keyword::For(()), 435 | Keyword::Function(_) => Keyword::Function(()), 436 | Keyword::If(_) => Keyword::If(()), 437 | Keyword::In(_) => Keyword::In(()), 438 | Keyword::Implements(_) => Keyword::Implements(()), 439 | Keyword::InstanceOf(_) => Keyword::InstanceOf(()), 440 | Keyword::Interface(_) => Keyword::Interface(()), 441 | Keyword::Let(_) => Keyword::Let(()), 442 | Keyword::New(_) => Keyword::New(()), 443 | Keyword::Package(_) => Keyword::Package(()), 444 | Keyword::Private(_) => Keyword::Private(()), 445 | Keyword::Protected(_) => Keyword::Protected(()), 446 | Keyword::Public(_) => Keyword::Public(()), 447 | Keyword::Static(_) => Keyword::Static(()), 448 | Keyword::Return(_) => Keyword::Return(()), 449 | Keyword::Super(_) => Keyword::Super(()), 450 | Keyword::Switch(_) => Keyword::Switch(()), 451 | Keyword::This(_) => Keyword::This(()), 452 | Keyword::Throw(_) => Keyword::Throw(()), 453 | Keyword::Try(_) => Keyword::Try(()), 454 | Keyword::TypeOf(_) => Keyword::TypeOf(()), 455 | Keyword::Var(_) => Keyword::Var(()), 456 | Keyword::Void(_) => Keyword::Void(()), 457 | Keyword::While(_) => Keyword::While(()), 458 | Keyword::With(_) => Keyword::With(()), 459 | Keyword::Yield(_) => Keyword::Yield(()), 460 | } 461 | } 462 | } 463 | 464 | impl<'a> Keyword<&'a str> { 465 | #[cfg(test)] 466 | pub fn new(s: &str) -> Self { 467 | match s { 468 | "await" => Keyword::Await("await"), 469 | "break" => Keyword::Break("break"), 
470 | "case" => Keyword::Case("case"), 471 | "catch" => Keyword::Catch("catch"), 472 | "class" => Keyword::Class("class"), 473 | "const" => Keyword::Const("const"), 474 | "continue" => Keyword::Continue("continue"), 475 | "debugger" => Keyword::Debugger("debugger"), 476 | "default" => Keyword::Default("default"), 477 | "import" => Keyword::Import("import"), 478 | "delete" => Keyword::Delete("delete"), 479 | "do" => Keyword::Do("do"), 480 | "else" => Keyword::Else("else"), 481 | "enum" => Keyword::Enum("enum"), 482 | "export" => Keyword::Export("export"), 483 | "extends" => Keyword::Extends("extends"), 484 | "finally" => Keyword::Finally("finally"), 485 | "for" => Keyword::For("for"), 486 | "function" => Keyword::Function("function"), 487 | "if" => Keyword::If("if"), 488 | "in" => Keyword::In("in"), 489 | "implements" => Keyword::Implements("implements"), 490 | "instanceof" => Keyword::InstanceOf("instanceof"), 491 | "interface" => Keyword::Interface("interface"), 492 | "let" => Keyword::Let("let"), 493 | "new" => Keyword::New("new"), 494 | "package" => Keyword::Package("package"), 495 | "private" => Keyword::Private("private"), 496 | "protected" => Keyword::Protected("protected"), 497 | "public" => Keyword::Public("public"), 498 | "static" => Keyword::Static("static"), 499 | "return" => Keyword::Return("return"), 500 | "super" => Keyword::Super("super"), 501 | "switch" => Keyword::Switch("switch"), 502 | "this" => Keyword::This("this"), 503 | "throw" => Keyword::Throw("throw"), 504 | "try" => Keyword::Try("try"), 505 | "typeof" => Keyword::TypeOf("typeof"), 506 | "var" => Keyword::Var("var"), 507 | "void" => Keyword::Void("void"), 508 | "while" => Keyword::While("while"), 509 | "with" => Keyword::With("with"), 510 | "yield" => Keyword::Yield("yield"), 511 | _ => panic!("Invalid keyword..."), 512 | } 513 | } 514 | pub fn has_unicode_escape(&self) -> bool { 515 | match self { 516 | Keyword::Await(s) => s, 517 | Keyword::Break(s) => s, 518 | Keyword::Case(s) => s, 519 
| Keyword::Catch(s) => s, 520 | Keyword::Class(s) => s, 521 | Keyword::Const(s) => s, 522 | Keyword::Continue(s) => s, 523 | Keyword::Debugger(s) => s, 524 | Keyword::Default(s) => s, 525 | Keyword::Import(s) => s, 526 | Keyword::Delete(s) => s, 527 | Keyword::Do(s) => s, 528 | Keyword::Else(s) => s, 529 | Keyword::Enum(s) => s, 530 | Keyword::Export(s) => s, 531 | Keyword::Extends(s) => s, 532 | Keyword::Finally(s) => s, 533 | Keyword::For(s) => s, 534 | Keyword::Function(s) => s, 535 | Keyword::If(s) => s, 536 | Keyword::In(s) => s, 537 | Keyword::Implements(s) => s, 538 | Keyword::InstanceOf(s) => s, 539 | Keyword::Interface(s) => s, 540 | Keyword::Let(s) => s, 541 | Keyword::New(s) => s, 542 | Keyword::Package(s) => s, 543 | Keyword::Private(s) => s, 544 | Keyword::Protected(s) => s, 545 | Keyword::Public(s) => s, 546 | Keyword::Static(s) => s, 547 | Keyword::Return(s) => s, 548 | Keyword::Super(s) => s, 549 | Keyword::Switch(s) => s, 550 | Keyword::This(s) => s, 551 | Keyword::Throw(s) => s, 552 | Keyword::Try(s) => s, 553 | Keyword::TypeOf(s) => s, 554 | Keyword::Var(s) => s, 555 | Keyword::Void(s) => s, 556 | Keyword::While(s) => s, 557 | Keyword::With(s) => s, 558 | Keyword::Yield(s) => s, 559 | } 560 | .contains("\\u") 561 | } 562 | } 563 | -------------------------------------------------------------------------------- /src/tokens/number.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, PartialEq, Eq, Clone)] 2 | /// A JS number literal. There are 4 kinds of number 3 | /// literals allowed in JS. 
4 | /// 5 | /// - Decimal Literals - This includes integers and decimals with 6 | /// optional exponent notation 7 | /// - Hexadecimal Literals - These begin with 0x and consist of numbers 8 | /// 0-9 and letters A-F (case insensitive) 9 | /// - Octal Literals - These being with 0o and consist of numbers 10 | /// 0-7 11 | /// - Binary Literals - These begin with 0b and consist of numbers 0 and 1 12 | pub struct Number(T); 13 | 14 | /// Extension methods for allowing Number 15 | /// to work with both &str and String 16 | pub trait NumberExt { 17 | fn kind(&self) -> NumberKind; 18 | fn is_hex(&self) -> bool; 19 | fn is_bin(&self) -> bool; 20 | fn is_oct(&self) -> bool; 21 | fn is_dec(&self) -> bool; 22 | fn has_exponent(&self) -> bool; 23 | fn is_big_int(&self) -> bool; 24 | } 25 | 26 | impl Number 27 | where 28 | T: AsRef, 29 | { 30 | pub fn kind(&self) -> NumberKind { 31 | let s = self.0.as_ref(); 32 | match self.0.as_ref().get(0..2) { 33 | Some("0x") | Some("0X") => NumberKind::Hex, 34 | Some("0b") | Some("0B") => NumberKind::Bin, 35 | Some("0o") | Some("0O") => NumberKind::Oct, 36 | _ => { 37 | if s.ends_with('n') { 38 | NumberKind::BigInt 39 | } else { 40 | NumberKind::Dec 41 | } 42 | } 43 | } 44 | } 45 | 46 | pub fn is_hex(&self) -> bool { 47 | self.kind() == NumberKind::Hex 48 | } 49 | pub fn is_bin(&self) -> bool { 50 | self.kind() == NumberKind::Bin 51 | } 52 | pub fn is_oct(&self) -> bool { 53 | self.kind() == NumberKind::Oct 54 | } 55 | pub fn is_dec(&self) -> bool { 56 | self.kind() == NumberKind::Dec 57 | } 58 | pub fn has_exponent(&self) -> bool { 59 | match self.kind() { 60 | NumberKind::Dec => self.0.as_ref().contains(|c| c == 'e' || c == 'E'), 61 | _ => false, 62 | } 63 | } 64 | pub fn is_big_int(&self) -> bool { 65 | self.kind() == NumberKind::BigInt 66 | } 67 | } 68 | 69 | impl<'a> From<&'a str> for Number<&'a str> { 70 | fn from(s: &'a str) -> Self { 71 | Number(s) 72 | } 73 | } 74 | 75 | impl ToString for Number 76 | where 77 | T: AsRef, 78 | { 
79 | fn to_string(&self) -> String { 80 | self.0.as_ref().to_string() 81 | } 82 | } 83 | 84 | impl PartialEq for &Number 85 | where 86 | T: AsRef, 87 | { 88 | fn eq(&self, other: &str) -> bool { 89 | self.0.as_ref().eq(other) 90 | } 91 | } 92 | 93 | #[derive(Debug, PartialEq, Eq, Clone, Copy)] 94 | /// The 5 kinds of numbers 95 | pub enum NumberKind { 96 | Dec, 97 | Hex, 98 | Bin, 99 | Oct, 100 | BigInt, 101 | } 102 | -------------------------------------------------------------------------------- /src/tokens/regex.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, PartialEq, Eq, Clone)] 2 | /// A Regular Expression Literal 3 | /// 4 | /// These being with a `/` and the 5 | /// body ends with another `/` 6 | /// optionally a series of one letter 7 | /// flags can be included after the `/` 8 | pub struct RegEx { 9 | pub body: T, 10 | pub flags: Option, 11 | } 12 | 13 | impl RegEx { 14 | pub fn from_parts(body: T, flags: Option) -> Self { 15 | RegEx { body, flags } 16 | } 17 | } 18 | 19 | impl ToString for RegEx 20 | where 21 | T: AsRef, 22 | { 23 | fn to_string(&self) -> String { 24 | let f = if let Some(f) = &self.flags { 25 | f.as_ref().to_string() 26 | } else { 27 | String::new() 28 | }; 29 | format!("/{}/{}", self.body.as_ref(), f) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/tokens/string.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, PartialEq, Eq, Clone)] 2 | /// A single or double quoted string 3 | /// literal 4 | pub enum StringLit { 5 | Single(InnerString), 6 | Double(InnerString), 7 | } 8 | #[derive(Debug, PartialEq, Eq, Clone)] 9 | pub struct InnerString { 10 | pub content: T, 11 | pub contains_octal_escape: bool, 12 | } 13 | 14 | impl ToString for StringLit 15 | where 16 | T: AsRef, 17 | { 18 | fn to_string(&self) -> String { 19 | match self { 20 | StringLit::Single(ref s) => 
format!(r#"'{}'"#, s.content.as_ref()), 21 | StringLit::Double(ref s) => format!(r#""{}""#, s.content.as_ref()), 22 | } 23 | } 24 | } 25 | 26 | impl AsRef for StringLit 27 | where 28 | T: AsRef, 29 | { 30 | fn as_ref(&self) -> &str { 31 | match self { 32 | StringLit::Single(s) | StringLit::Double(s) => s.as_ref(), 33 | } 34 | } 35 | } 36 | 37 | impl AsRef for InnerString 38 | where 39 | T: AsRef, 40 | { 41 | fn as_ref(&self) -> &str { 42 | self.content.as_ref() 43 | } 44 | } 45 | 46 | impl StringLit { 47 | pub fn single(content: T, oct: bool) -> Self { 48 | StringLit::Single(InnerString { 49 | content, 50 | contains_octal_escape: oct, 51 | }) 52 | } 53 | pub fn double(content: T, oct: bool) -> Self { 54 | StringLit::Double(InnerString { 55 | content, 56 | contains_octal_escape: oct, 57 | }) 58 | } 59 | pub fn is_single(&self) -> bool { 60 | matches!(self, StringLit::Single(_)) 61 | } 62 | pub fn is_double(&self) -> bool { 63 | matches!(self, StringLit::Double(_)) 64 | } 65 | pub fn has_octal_escape(&self) -> bool { 66 | match self { 67 | StringLit::Single(ref inner) | StringLit::Double(ref inner) => { 68 | inner.contains_octal_escape 69 | } 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/tokens/template.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, PartialEq, Eq, Clone)] 2 | /// A template string 3 | /// 4 | /// These include strings that are wrapped in back ticks (`) 5 | /// which allows for interpolating any js expression between `${` 6 | /// and `}` 7 | pub enum Template { 8 | NoSub(TemplateLiteral), 9 | Head(TemplateLiteral), 10 | Middle(TemplateLiteral), 11 | Tail(TemplateLiteral), 12 | } 13 | 14 | #[derive(Debug, PartialEq, Eq, Clone)] 15 | pub struct TemplateLiteral { 16 | pub content: T, 17 | pub contains_octal_escape: bool, 18 | pub contains_invalid_unicode_escape: bool, 19 | pub contains_invalid_hex_escape: bool, 20 | } 21 | impl 
TemplateLiteral { 22 | pub fn new( 23 | content: T, 24 | contains_octal_escape: bool, 25 | contains_invalid_unicode_escape: bool, 26 | contains_invalid_hex_escape: bool, 27 | ) -> Self { 28 | Self { 29 | content, 30 | contains_octal_escape, 31 | contains_invalid_unicode_escape, 32 | contains_invalid_hex_escape, 33 | } 34 | } 35 | } 36 | 37 | impl Template { 38 | pub fn no_sub_template(content: T, oct: bool, uni: bool, hex: bool) -> Self { 39 | Template::NoSub(TemplateLiteral::new(content, oct, uni, hex)) 40 | } 41 | pub fn template_head(content: T, oct: bool, uni: bool, hex: bool) -> Self { 42 | Template::Head(TemplateLiteral::new(content, oct, uni, hex)) 43 | } 44 | pub fn template_middle(content: T, oct: bool, uni: bool, hex: bool) -> Self { 45 | Template::Middle(TemplateLiteral::new(content, oct, uni, hex)) 46 | } 47 | pub fn template_tail(content: T, oct: bool, uni: bool, hex: bool) -> Self { 48 | Template::Tail(TemplateLiteral::new(content, oct, uni, hex)) 49 | } 50 | pub fn is_head(&self) -> bool { 51 | matches!(self, Template::Head(_)) 52 | } 53 | pub fn is_middle(&self) -> bool { 54 | matches!(self, Template::Middle(_)) 55 | } 56 | pub fn is_tail(&self) -> bool { 57 | matches!(self, Template::Tail(_)) 58 | } 59 | pub fn is_no_sub(&self) -> bool { 60 | matches!(self, Template::NoSub(_)) 61 | } 62 | } 63 | 64 | impl ToString for Template 65 | where 66 | T: AsRef, 67 | { 68 | fn to_string(&self) -> String { 69 | match self { 70 | Template::NoSub(ref t) => format!("`{}`", t.content.as_ref()), 71 | Template::Head(ref t) => format!("`{}${{", t.content.as_ref()), 72 | Template::Middle(ref t) => format!("}}{}${{", t.content.as_ref()), 73 | Template::Tail(ref t) => format!("}}{}`", t.content.as_ref()), 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /tests/ecma262/main.rs: -------------------------------------------------------------------------------- 1 | #![cfg(test)] 2 | extern crate pretty_env_logger; 3 
| extern crate ress; 4 | #[macro_use] 5 | extern crate log; 6 | #[macro_use] 7 | extern crate lazy_static; 8 | 9 | use std::{fs::read_to_string, path::Path, process::Command}; 10 | 11 | use ress::Scanner; 12 | mod es2015m; 13 | mod es2015s; 14 | mod es5; 15 | 16 | #[test] 17 | fn es5_test() { 18 | println!("testing es5"); 19 | ensure_logging(); 20 | let js = get_js(EsVersion::Es5); 21 | for (i, (lhs, rhs)) in Scanner::new(&js).zip(es5::ES5.iter()).enumerate() { 22 | let lhs = lhs.unwrap(); 23 | debug!("{:?}:{:?}", lhs.token, rhs); 24 | assert_eq!( 25 | (i, &lhs.token), 26 | (i, rhs), 27 | "{}:{}\n{}", 28 | EsVersion::Es5.path(), 29 | lhs.location.start, 30 | &js[lhs.span.start..lhs.span.end] 31 | ); 32 | } 33 | } 34 | 35 | #[test] 36 | fn es2015_script_test() { 37 | println!("testing es2015 script"); 38 | ensure_logging(); 39 | let js = get_js(EsVersion::Es2015Script); 40 | for (i, (lhs, rhs)) in Scanner::new(&js).zip(es2015s::TOKENS.iter()).enumerate() { 41 | let lhs = lhs.unwrap(); 42 | debug!("{:?}:{:?}", lhs.token, rhs); 43 | assert_eq!( 44 | (i, &lhs.token), 45 | (i, rhs), 46 | "{}:{}\n{}", 47 | EsVersion::Es2015Script.path(), 48 | lhs.location.start, 49 | &js[lhs.span.start..lhs.span.end] 50 | ); 51 | } 52 | } 53 | 54 | #[test] 55 | fn es2015_module_test() { 56 | ensure_logging(); 57 | debug!("testing es2015 module"); 58 | let js = get_js(EsVersion::Es2015Module); 59 | for (i, (lhs, rhs)) in Scanner::new(&js).zip(es2015m::TOKENS.iter()).enumerate() { 60 | let lhs = lhs.unwrap(); 61 | debug!("{:?}:{:?}", lhs.token, rhs); 62 | assert_eq!( 63 | (i, &lhs.token), 64 | (i, rhs), 65 | "{}:{}\n{}", 66 | EsVersion::Es2015Module.path(), 67 | lhs.location.start, 68 | &js[lhs.span.start..lhs.span.end] 69 | ); 70 | } 71 | } 72 | 73 | fn ensure_logging() { 74 | let _ = pretty_env_logger::try_init(); 75 | } 76 | 77 | enum EsVersion { 78 | Es5, 79 | Es2015Module, 80 | Es2015Script, 81 | } 82 | 83 | impl EsVersion { 84 | pub fn path(&self) -> String { 85 | format!( 86 | 
"node_modules/everything.js/{}", 87 | match self { 88 | EsVersion::Es5 => "es5.js", 89 | EsVersion::Es2015Module => "es2015-module.js", 90 | EsVersion::Es2015Script => "es2015-script.js", 91 | } 92 | ) 93 | } 94 | } 95 | 96 | fn get_js(version: EsVersion) -> String { 97 | get_file(version.path()) 98 | } 99 | 100 | fn get_file(path: impl AsRef) -> String { 101 | let path = path.as_ref(); 102 | if !path.exists() { 103 | npm_install(); 104 | if !path.exists() { 105 | panic!("npm install failed to make {:?} available", path) 106 | } 107 | } 108 | read_to_string(path).unwrap_or_else(|e| panic!("Failed to read {:?} to a string {}", path, e)) 109 | } 110 | 111 | fn npm_install() { 112 | Command::new("npm") 113 | .arg("install") 114 | .output() 115 | .expect("Failed to npm install"); 116 | } 117 | -------------------------------------------------------------------------------- /tests/moz_central/main.rs: -------------------------------------------------------------------------------- 1 | #![cfg(all(test, feature = "moz_central"))] 2 | 3 | use ress::*; 4 | use std::fs::read_to_string; 5 | use std::path::{Path, PathBuf}; 6 | 7 | #[test] 8 | fn moz_central() { 9 | let _ = pretty_env_logger::try_init(); 10 | let moz_central_path = Path::new("moz_central"); 11 | if !moz_central_path.exists() { 12 | panic!("please download the JIT tests from the firefox repository. 
see CONTRIBUTING.md for more info"); 13 | } 14 | let paths = get_paths(&moz_central_path); 15 | let (failures, total) = walk(&paths); 16 | eprintln!("completed {:?} tests", total); 17 | if !failures.is_empty() { 18 | panic!( 19 | "{:?} tests failed\n{:?}", 20 | failures.len(), 21 | failures.join("\n") 22 | ); 23 | } 24 | } 25 | 26 | fn get_paths(root: &Path) -> Vec { 27 | walkdir::WalkDir::new(root) 28 | .min_depth(1) 29 | .into_iter() 30 | .filter_map(|e| { 31 | let entry = e.expect("bad entry"); 32 | let path = entry.into_path(); 33 | if path.is_file() { 34 | if let Some(ext) = path.extension() { 35 | if ext == "js" { 36 | Some(path) 37 | } else { 38 | None 39 | } 40 | } else { 41 | None 42 | } 43 | } else { 44 | None 45 | } 46 | }) 47 | .collect() 48 | } 49 | 50 | fn walk(paths: &[PathBuf]) -> (Vec, usize) { 51 | let mut ret = Vec::new(); 52 | let mut ct = 0; 53 | for path in paths { 54 | ct += 1; 55 | let js = read_to_string(&path).unwrap(); 56 | let s = Scanner::new(js.as_str()); 57 | for item in s { 58 | if let Err(e) = item { 59 | ret.push(format!("{:?}, path: {:?}", e, path.display())); 60 | } 61 | } 62 | } 63 | (ret, ct) 64 | } 65 | -------------------------------------------------------------------------------- /tests/prop/main.rs: -------------------------------------------------------------------------------- 1 | extern crate ress; 2 | #[macro_use] 3 | extern crate proptest; 4 | 5 | proptest! { 6 | #[test] 7 | fn function_idents(s in r#"function [a-zA-Z_$\u2118\u212E\u309B\u309C\u1885\u1886][a-zA-Z_]+"#) { 8 | ress::tokenize(&s).unwrap(); 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /tests/proptest-regressions/main.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 
4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | xs 3761395854 1747442289 2023256964 1719391646 # shrinks to s = "𑜰" 8 | -------------------------------------------------------------------------------- /tests/readme/index.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | console.log('hello world!') 3 | })() -------------------------------------------------------------------------------- /tests/readme/main.rs: -------------------------------------------------------------------------------- 1 | #![cfg(test)] 2 | 3 | use ress::prelude::*; 4 | 5 | #[test] 6 | fn semi_example() { 7 | static JS: &str = include_str!("index.js"); 8 | let s = Scanner::new(JS); 9 | for token in s { 10 | let token = token.unwrap().token; 11 | if token.matches_punct_str(";") { 12 | panic!("A semi-colon!? Heathen!"); 13 | } 14 | } 15 | println!("Good show! Why use something that's optional?") 16 | } 17 | 18 | #[test] 19 | #[allow(unused_variables)] 20 | fn failed_compile_borrow() { 21 | // look_ahead 22 | let js = "function() { return; }"; 23 | let mut s = Scanner::new(js); 24 | let current = s.next(); 25 | let next = s.look_ahead(); 26 | let new_current = s.next(); 27 | assert_eq!(next, new_current); 28 | // peekable (fails to compile) 29 | let p = Scanner::new(js).peekable(); 30 | let current = s.next(); // <-- first mutable borrow 31 | // let next = p.peek(); // <-- second mutable borrow 32 | } 33 | 34 | #[test] 35 | fn get_set_state() { 36 | let js = "function() { 37 | return 0; 38 | };"; 39 | let mut s = Scanner::new(js); 40 | let start = s.get_state(); 41 | assert_eq!( 42 | s.next().unwrap().unwrap().token, 43 | Token::Keyword(Keyword::Function("Function")) 44 | ); 45 | assert_eq!( 46 | s.next().unwrap().unwrap().token, 47 | Token::Punct(Punct::OpenParen) 48 | ); 49 | assert_eq!( 50 | s.next().unwrap().unwrap().token, 51 | 
Token::Punct(Punct::CloseParen) 52 | ); 53 | s.set_state(start); 54 | assert_eq!( 55 | s.next().unwrap().unwrap().token, 56 | Token::Keyword(Keyword::Function("Function")) 57 | ); 58 | } 59 | -------------------------------------------------------------------------------- /tests/snippets/main.rs: -------------------------------------------------------------------------------- 1 | use ress::prelude::*; 2 | 3 | #[test] 4 | fn vue_number_error() { 5 | let js = "refElm = isUndef(newCh[newEndIdx + 1]) ? null : newCh[newEndIdx + 1].elm;"; 6 | for item in Scanner::new(js) { 7 | println!("{:?}", item); 8 | } 9 | } 10 | #[test] 11 | fn moment_regex_error() { 12 | let js = r"function removeFormattingTokens(input) { 13 | if (input.match(/\[[\s\S]/)) { 14 | return input.replace(/^\[|\]$/g, ''); 15 | } 16 | return input.replace(/\\/g, ''); 17 | }"; 18 | for item in Scanner::new(js) { 19 | println!("{:?}", item); 20 | } 21 | } 22 | 23 | #[test] 24 | fn number_member() { 25 | compare( 26 | "20..toString()", 27 | &[ 28 | Token::Number("20.".into()), 29 | Token::Punct(Punct::Period), 30 | Token::Ident("toString".into()), 31 | Token::Punct(Punct::OpenParen), 32 | Token::Punct(Punct::CloseParen), 33 | ], 34 | ); 35 | } 36 | #[test] 37 | fn if_then_regex() { 38 | compare( 39 | "if (1) /a/", 40 | &[ 41 | Token::Keyword(Keyword::If("If")), 42 | Token::Punct(Punct::OpenParen), 43 | Token::Number("1".into()), 44 | Token::Punct(Punct::CloseParen), 45 | Token::RegEx(RegEx { 46 | body: "a", 47 | flags: None, 48 | }), 49 | ], 50 | ); 51 | } 52 | 53 | #[test] 54 | fn line_terminator_in_string_literal() { 55 | let js = "'
'"; 56 | for _ in Scanner::new(js) { 57 | // just testing for panics on the byte index 58 | // for now 59 | //TODO: Allow this character in string literals 60 | // as per spec under feature "json superset" 61 | } 62 | } 63 | 64 | #[test] 65 | fn lots_of_arcs() { 66 | let mut top = "".to_string(); 67 | let mut bottom = "[".to_string(); 68 | let ascii_start = 97; 69 | for i in 0..26 { 70 | let id = std::char::from_u32(ascii_start + i).unwrap(); 71 | let obj = format!("{{{}:{}}}", id, i); 72 | top.push_str(&format!("({})", obj)); 73 | if i != 25 { 74 | top.push_str(", "); 75 | } 76 | bottom.push_str(&format!("{},", obj)); 77 | } 78 | bottom.push(']'); 79 | let js = format!("{}\n\n{}", top, bottom); 80 | 81 | let s = Scanner::new(&js); 82 | for item in s { 83 | println!("{:?}", item.unwrap()); 84 | } 85 | } 86 | 87 | #[test] 88 | fn div_over_regex() { 89 | let js = "if (true) { 90 | ({} / function(){return 1}); 91 | } 92 | "; 93 | for tok in panicking_scanner(js) { 94 | eprintln!("{:?}", tok) 95 | } 96 | } 97 | #[test] 98 | fn regex_over_div() { 99 | let js = "{}/\\d/g;;"; 100 | compare( 101 | js, 102 | &[ 103 | Token::Punct(Punct::OpenBrace), 104 | Token::Punct(Punct::CloseBrace), 105 | Token::RegEx(RegEx::from_parts("\\d", Some("g"))), 106 | Token::Punct(Punct::SemiColon), 107 | Token::Punct(Punct::SemiColon), 108 | ], 109 | ); 110 | } 111 | #[test] 112 | fn regex_over_div2() { 113 | let js = "function(){}/\\d/g;;"; 114 | compare( 115 | js, 116 | &[ 117 | Token::Keyword(Keyword::Function("function")), 118 | Token::Punct(Punct::OpenParen), 119 | Token::Punct(Punct::CloseParen), 120 | Token::Punct(Punct::OpenBrace), 121 | Token::Punct(Punct::CloseBrace), 122 | Token::RegEx(RegEx::from_parts("\\d", Some("g"))), 123 | Token::Punct(Punct::SemiColon), 124 | Token::Punct(Punct::SemiColon), 125 | ], 126 | ); 127 | } 128 | #[test] 129 | fn regex_over_div3() { 130 | let js = "function name(){}/\\d/g;;"; 131 | compare( 132 | js, 133 | &[ 134 | 
/// A regex after a function declaration preceded by a directive prologue.
#[test]
fn regex_over_div4() {
    let _ = pretty_env_logger::try_init();
    let js = "'use strict';function name(){}/\\d/g;;";
    compare(
        js,
        &[
            Token::String(StringLit::single("use strict", false)),
            Token::Punct(Punct::SemiColon),
            Token::Keyword(Keyword::Function("function")),
            Token::Ident("name".into()),
            Token::Punct(Punct::OpenParen),
            Token::Punct(Punct::CloseParen),
            Token::Punct(Punct::OpenBrace),
            Token::Punct(Punct::CloseBrace),
            Token::RegEx(RegEx::from_parts("\\d", Some("g"))),
            Token::Punct(Punct::SemiColon),
            Token::Punct(Punct::SemiColon),
        ],
    );
}

/// `-->` at the start of a line (or after only comments) is an
/// HTML-close comment; the rest of the line is its trailer.
#[test]
fn html_comment_close() {
    let js = "
--> stuff is in a comment
 --> also a comment
/*multi-comment*/--> with trailer
/*---*/
let a;
/*first comment*/ /*second comment*/--> with trailer";
    compare(
        js,
        &[
            Token::Comment(Comment {
                kind: ress::tokens::CommentKind::Html,
                content: "",
                tail_content: Some(" stuff is in a comment"),
            }),
            Token::Comment(Comment {
                kind: ress::tokens::CommentKind::Html,
                content: "",
                tail_content: Some(" also a comment"),
            }),
            Token::Comment(Comment {
                kind: ress::tokens::CommentKind::Multi,
                content: "multi-comment",
                tail_content: Some(" with trailer"),
            }),
            Token::Comment(Comment {
                kind: ress::tokens::CommentKind::Multi,
                content: "---",
                tail_content: None,
            }),
            Token::Keyword(Keyword::Let("let")),
            Token::Ident("a".into()),
            Token::Punct(Punct::SemiColon),
            Token::Comment(Comment {
                kind: ress::tokens::CommentKind::Multi,
                content: "first comment",
                tail_content: None,
            }),
            Token::Comment(Comment {
                kind: ress::tokens::CommentKind::Multi,
                content: "second comment",
                tail_content: Some(" with trailer"),
            }),
        ],
    );
}
/// `x --> 0` mid-expression is decrement + greater-than, not a comment.
#[test]
fn decrement_greater_than() {
    compare(
        "for (var x = 0; x --> 0;);",
        &[
            Token::Keyword(Keyword::For("for")),
            Token::Punct(Punct::OpenParen),
            Token::Keyword(Keyword::Var("var")),
            Token::Ident("x".into()),
            Token::Punct(Punct::Equal),
            Token::Number("0".into()),
            Token::Punct(Punct::SemiColon),
            Token::Ident("x".into()),
            Token::Punct(Punct::DoubleDash),
            Token::Punct(Punct::GreaterThan),
            Token::Number("0".into()),
            Token::Punct(Punct::SemiColon),
            Token::Punct(Punct::CloseParen),
            Token::Punct(Punct::SemiColon),
        ],
    )
}
/// Same as above with an empty multi-line comment wedged in between.
#[test]
fn decrement_greater_than_inline_multi() {
    compare(
        "for (var x = 0; x /**/--> 0;);",
        &[
            Token::Keyword(Keyword::For("for")),
            Token::Punct(Punct::OpenParen),
            Token::Keyword(Keyword::Var("var")),
            Token::Ident("x".into()),
            Token::Punct(Punct::Equal),
            Token::Number("0".into()),
            Token::Punct(Punct::SemiColon),
            Token::Ident("x".into()),
            Token::Comment(Comment::new_multi_line("")),
            Token::Punct(Punct::DoubleDash),
            Token::Punct(Punct::GreaterThan),
            Token::Number("0".into()),
            Token::Punct(Punct::SemiColon),
            Token::Punct(Punct::CloseParen),
            Token::Punct(Punct::SemiColon),
        ],
    )
}

/// `/*/` opens a multi-line comment that never closes.
#[test]
#[should_panic = "unterminated multi-line comment"]
fn star_only_regex() {
    run_failure("/*/");
}

/// A regex body may begin with a space.
#[test]
fn leading_space_regex() {
    let js = r"/ \{[\s\S]*$/";
    compare(
        js,
        &[Token::RegEx(RegEx {
            body: r" \{[\s\S]*$",
            flags: None,
        })],
    )
}
leading_space_regex() { 270 | let js = r"/ \{[\s\S]*$/"; 271 | compare( 272 | js, 273 | &[Token::RegEx(RegEx { 274 | body: r" \{[\s\S]*$", 275 | flags: None, 276 | })], 277 | ) 278 | } 279 | 280 | #[test] 281 | #[should_panic] 282 | fn var_escaped_cr() { 283 | let js = r"var\u000Dx;"; 284 | run_failure(js); 285 | } 286 | 287 | #[test] 288 | fn long_comment() { 289 | let _ = pretty_env_logger::try_init(); 290 | let inner = "\n* \n*\n"; 291 | let js = format!("/*{}*/", inner); 292 | compare( 293 | &js, 294 | &[Token::Comment(Comment { 295 | kind: ress::tokens::CommentKind::Multi, 296 | content: inner, 297 | tail_content: None, 298 | })], 299 | ) 300 | } 301 | 302 | #[test] 303 | fn regex_column() { 304 | compare_with_position( 305 | "'abc'.match(/abc/);", 306 | &[ 307 | (Token::String(StringLit::single("abc", false)), 1, 1), 308 | (Token::Punct(Punct::Period), 1, 6), 309 | (Token::Ident("match".into()), 1, 7), 310 | (Token::Punct(Punct::OpenParen), 1, 12), 311 | (Token::RegEx(RegEx::from_parts("abc", None)), 1, 13), 312 | (Token::Punct(Punct::CloseParen), 1, 18), 313 | (Token::Punct(Punct::SemiColon), 1, 19), 314 | ], 315 | ); 316 | } 317 | 318 | #[test] 319 | fn regex_spaces() { 320 | let scanner = Scanner::new("var = / a /"); 321 | let mut last_end = 0; 322 | for (i, item) in scanner.enumerate() { 323 | let item = item.unwrap(); 324 | if item.token.is_eof() { 325 | break; 326 | } 327 | 328 | assert_eq!( 329 | 1, 330 | item.location.start.column - last_end, 331 | "{} for {:?}", 332 | i, 333 | item 334 | ); 335 | last_end = item.location.end.column; 336 | } 337 | } 338 | 339 | #[test] 340 | fn regex_out_of_order() { 341 | pretty_env_logger::try_init().ok(); 342 | let regex = r#"((?:[^BEGHLMOSWYZabcdhmswyz']+)|(?:'(?:[^']|'')*')|(?:G{1,5}|y{1,4}|Y{1,4}|M{1,5}|L{1,5}|w{1,2}|W{1}|d{1,2}|E{1,6}|c{1,6}|a{1,5}|b{1,5}|B{1,5}|h{1,2}|H{1,2}|m{1,2}|s{1,2}|S{1,3}|z{1,4}|Z{1,5}|O{1,4}))([\s\S]*)"#; 343 | let js = format!("var DATE_FORMATS_SPLIT = /{}/", ®ex); 344 | 
compare_with_position( 345 | js.as_str(), 346 | &[ 347 | (Token::Keyword(Keyword::Var("var")), 1, 1), 348 | (Token::Ident("DATE_FORMATS_SPLIT".into()), 1, 5), 349 | (Token::Punct(Punct::Equal), 1, 24), 350 | (Token::RegEx(RegEx::from_parts(regex, None)), 1, 26), 351 | ], 352 | ); 353 | } 354 | 355 | #[test] 356 | fn regex_pattern() { 357 | pretty_env_logger::try_init().ok(); 358 | let re = r#" \{[\s\S]*$"#; 359 | let js = format!("/{re}/"); 360 | 361 | let mut scanner = Scanner::new(&js); 362 | let Item { 363 | location, 364 | token: Token::RegEx(re2), 365 | .. 366 | } = scanner.next().unwrap().unwrap() 367 | else { 368 | panic!("Expected regex"); 369 | }; 370 | assert_eq!(location.start.line, 1); 371 | assert_eq!(location.end.line, 1); 372 | assert_eq!(location.start.column, 1); 373 | assert_eq!(re2.body, re); 374 | assert_eq!(location.end.column, re.len() + 3); 375 | } 376 | 377 | #[test] 378 | fn regex_over_a0() { 379 | let js = r#"val = / /"#; 380 | compare( 381 | js, 382 | &[ 383 | Token::Ident("val".into()), 384 | Token::Punct(Punct::Equal), 385 | Token::RegEx(RegEx { 386 | body: "\u{a0}", 387 | flags: None, 388 | }), 389 | ], 390 | ) 391 | } 392 | 393 | #[test] 394 | fn regex_over_a0_manual() { 395 | use ress::ManualScanner; 396 | let js = r#"val = / /"#; 397 | let mut scanner = ManualScanner::new(js); 398 | assert_eq!( 399 | scanner.next_token().unwrap().unwrap().token, 400 | Token::Ident("val".into()) 401 | ); 402 | assert_eq!( 403 | scanner.next_token().unwrap().unwrap().token, 404 | Token::Punct(Punct::Equal) 405 | ); 406 | assert_eq!( 407 | scanner.next_token().unwrap().unwrap().token, 408 | Token::Punct(Punct::ForwardSlash) 409 | ); 410 | assert_eq!( 411 | scanner.next_regex(1).unwrap().unwrap().token, 412 | Token::RegEx(RegEx { 413 | body: "\u{a0}", 414 | flags: None 415 | }) 416 | ); 417 | } 418 | 419 | #[test] 420 | fn regex_all_whitespaces() { 421 | let re: String = [ 422 | '\t', '\u{000b}', '\u{000c}', ' ', '\u{feff}', '\u{2000}', '\u{2001}', 
'\u{2002}', 423 | '\u{2003}', '\u{2004}', '\u{2005}', '\u{2006}', '\u{2007}', '\u{2008}', '\u{2009}', 424 | '\u{200a}', '\u{202f}', '\u{205f}', '\u{3000}', 425 | ] 426 | .iter() 427 | .collect(); 428 | run_failure(&format!("var = /{re}/")); 429 | } 430 | 431 | fn compare(js: &str, expectation: &[Token<&str>]) { 432 | for (i, (par, ex)) in panicking_scanner(js).zip(expectation.iter()).enumerate() { 433 | assert_eq!((i, &par), (i, ex)); 434 | } 435 | } 436 | 437 | fn compare_with_position(js: &str, expectation: &[(Token<&str>, usize, usize)]) { 438 | let scanner = Scanner::new(js); 439 | let mut i = 0; 440 | let mut expectation = expectation.iter(); 441 | for r in scanner { 442 | let r = r.unwrap(); 443 | if r.is_eof() { 444 | return; 445 | } 446 | i += 1; 447 | let ex = expectation 448 | .next() 449 | .ok_or_else(|| { 450 | panic!("expectations too short for {:?}", r); 451 | }) 452 | .unwrap(); 453 | assert_eq!((i, &r.token), (i, &ex.0), "{:?} vs {:?}", r, ex.0); 454 | assert_eq!( 455 | (i, r.location.start.line), 456 | (i, ex.1), 457 | "{:?} vs {:?}", 458 | r, 459 | ex.0 460 | ); 461 | assert_eq!( 462 | (i, r.location.start.column), 463 | (i, ex.2), 464 | "{:?} vs {:?}", 465 | r, 466 | ex.0 467 | ); 468 | } 469 | } 470 | 471 | fn run_failure(js: &str) { 472 | for _ in panicking_scanner(js) {} 473 | } 474 | 475 | fn panicking_scanner(js: &str) -> impl Iterator> { 476 | Scanner::new(js).map(|r| r.unwrap().token) 477 | } 478 | --------------------------------------------------------------------------------