├── .github └── workflows │ └── ci.yaml ├── .gitignore ├── Cargo.toml ├── README.md ├── bors.toml ├── lua-patterns ├── .gitignore ├── Cargo.toml ├── LICENSE.txt ├── examples │ ├── errors.rs │ ├── iter.rs │ ├── multiple_captures.rs │ ├── range.rs │ └── strings.rs ├── readme.md └── src │ ├── errors.rs │ ├── lib.rs │ └── luapat.rs ├── rustfmt.toml ├── src ├── annot.rs ├── annot │ └── generated.rs ├── ast.rs ├── ast │ └── generated.rs ├── attribute.rs ├── block.rs ├── emoji.rs ├── html.rs ├── inline.rs ├── lib.rs ├── main.rs ├── patterns.rs ├── sourcegen.rs ├── sourcegen │ ├── annot.rs │ └── ast.rs └── tree.rs └── tests ├── data ├── attributes.test ├── code_blocks.test ├── emoji.test ├── emphasis.test ├── hello_world.test ├── insert_delete_mark.test ├── links_and_images.test ├── para.test ├── regression.test ├── super_subscript.test └── verbatim.test ├── spec.rs └── tidy.rs
/.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | push: 5 | branches: ["master", "staging", "trying"] 6 | 7 | env: 8 | CARGO_INCREMENTAL: 0 9 | CARGO_NET_RETRY: 10 10 | CI: 1 11 | RUST_BACKTRACE: short 12 | RUSTFLAGS: -D warnings 13 | RUSTUP_MAX_RETRIES: 10 14 | 15 | jobs: 16 | test: 17 | name: Rust 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - uses: Swatinem/rust-cache@6720f05bc48b77f96918929a9019fb2203ff71f8 23 | - run: rustup update --no-self-update stable 24 | - run: sudo apt-get install lua5.3 25 | - run: cargo test 26 | --------------------------------------------------------------------------------
/.gitignore: -------------------------------------------------------------------------------- 1 | /.vscode 2 | /target 3 | /Cargo.lock 4 | /ref 5 | --------------------------------------------------------------------------------
/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "djot" 3 | version = "0.1.0" 4 | license = "MIT OR Apache-2.0" 5 | authors = ["Aleksey Kladov "] 6 | edition = "2021" 7 | 8 | [dependencies] 9 | anyhow = "1.0.66" 10 | indexmap = { version = "1.9.1", features = ["serde"] } 11 | lexopt = "0.2.1" 12 | lua-patterns = { path = "lua-patterns" } 13 | serde = { version = "1.0.147", features = ["derive"] } 14 | serde_json = "1.0.87" 15 | 16 | [dev-dependencies] 17 | xshell = "0.2.0" 18 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # djot-rs 2 | 3 | UPDATE: 4 | 5 | This didn't go particularly far; check out 6 | 7 | 8 | 9 | instead. 10 | 11 | 12 | An experimental Rust implementation of the [Djot](https://djot.net) light markup 13 | language. 14 | 15 | ## Design Rules 16 | 17 | Djot is in development; these are the _current_ design rules: 18 | 19 | 1. 100% compatibility with the reference Lua implementation, bugs and all. We 20 | don't want to fork a language which barely exists. 21 | 2. Reasonable source compatibility with the reference Lua implementation. We 22 | want to make it easy to incorporate changes, though we don't necessarily want 23 | to bend Rust into Lua. 24 | 25 | Currently this is very incomplete; feel free to submit PRs to fill in the blank 26 | spaces, just try to stay close to the original code. 27 | 28 | There are some tests; run them with `cargo test`.
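A minimal sketch of the kind of data-driven test `cargo test` runs (the real harness presumably lives in `tests/spec.rs`; the test name, the assertions, and the way fixtures are loaded below are illustrative assumptions, not the actual code):

```rust
// Sketch only: walk tests/data and load every `.test` fixture.
// How a fixture is split into Djot input and expected output is up to
// the real harness and is deliberately not shown here.
#[test]
fn fixtures_are_readable() {
    let mut found = 0;
    for entry in std::fs::read_dir("tests/data").unwrap() {
        let path = entry.unwrap().path();
        if path.extension().and_then(|e| e.to_str()) == Some("test") {
            let text = std::fs::read_to_string(&path).unwrap();
            assert!(!text.is_empty(), "empty fixture: {}", path.display());
            found += 1;
        }
    }
    assert!(found > 0, "no .test fixtures found");
}
```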
We are using the same test suite as 29 | the upstream project (see `.test` files in `tests/data`) 30 | 31 | ## Aspirations 32 | 33 | * "Easy", obvious API -- no streaming parsing, no allocation minimization, just 34 | gives you a full ast 35 | * core + alloc. We don't need OS. Getting rid of the allocator would be nice, but not for this library. 36 | * in general, leave pulldown-djot to someone else (or to the next iteration of this library) 37 | * djot.ts module for convenience 38 | * typescript extensible visitor API for rendering: `./djot.ts intput.adoc --template slides.ts` 39 | `ast.to_html({ code_block: (tag) => { ... }})`. 40 | 41 | ## See Also 42 | 43 | * https://git.sr.ht/~kmaasrud/djr a pulldown-cmark inspired parser 44 | -------------------------------------------------------------------------------- /bors.toml: -------------------------------------------------------------------------------- 1 | status = [ "Rust" ] 2 | delete_merged_branches = true 3 | -------------------------------------------------------------------------------- /lua-patterns/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | scratch 4 | -------------------------------------------------------------------------------- /lua-patterns/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lua-patterns" 3 | version = "0.3.0" 4 | authors = ["steve donovan "] 5 | description = "Binding to Lua String Patterns" 6 | license = "MIT" 7 | repository = "https://github.com/stevedonovan/lua-patterns" 8 | documentation = "https://docs.rs/lua-patterns" 9 | 10 | keywords = ["string","matching","lua"] 11 | 12 | categories = ["parsing","api-bindings"] 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /lua-patterns/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright © 2017 Steve Donovan 2 | 3 | Copyright © 1994–2017 Lua.org, PUC-Rio. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), 7 | to deal in the Software without restriction, including without 8 | limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, 10 | and to permit persons to whom the Software is furnished to do so, 11 | subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included 14 | in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS 19 | OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lua-patterns/examples/errors.rs: -------------------------------------------------------------------------------- 1 | extern crate lua_patterns; 2 | use lua_patterns::errors::PatternError; 3 | 4 | fn main() { 5 | let bad = [ 6 | ("bonzo %","malformed pattern (ends with '%')"), 7 | ("bonzo (dog%(","unfinished capture"), 8 | ("alles [%a%[","malformed pattern (missing ']')"), 9 | ("bonzo (dog (cat)","unfinished capture"), 10 | ("frodo %f[%A","malformed pattern (missing ']')"), 11 | ("frodo (1) (2(3)%2)%1","invalid capture index %2"), 12 | ]; 13 | 14 | fn error(s: &str) -> PatternError { 15 | PatternError(s.into()) 16 | } 17 | 18 | for p in bad.iter() { 19 | let res = lua_patterns::LuaPattern::new_try(p.0); 20 | if let Err(e) = res { 21 | assert_eq!(e, error(p.1)); 22 | } else { 23 | println!("'{}' was fine",p.0); 24 | } 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /lua-patterns/examples/iter.rs: -------------------------------------------------------------------------------- 1 | extern crate lua_patterns as lp; 2 | 3 | 4 | 5 | fn main() { 6 | 7 | //~ let mut m = lp::LuaPattern::new("hello%"); 8 | //~ m.matches("hello"); 9 | //~ println!("ok"); 10 | 11 | let mut m = lp::LuaPattern::new("(%a+)"); 12 | let mut iter = m.gmatch("one two three"); 13 | assert_eq!(iter.next(), Some("one")); 14 | assert_eq!(iter.next(), Some("two")); 15 | assert_eq!(iter.next(), Some("three")); 16 | assert_eq!(iter.next(), None); 17 | 18 | let mut m = lp::LuaPattern::new("%S+"); 19 | let split: Vec<_> = m.gmatch("dog cat leopard wolf").collect(); 20 | assert_eq!(split,&["dog","cat","leopard","wolf"]); 21 | 22 | let mut m = lp::LuaPattern::new("%s*(%S+)%s*=%s*(.-);"); 23 | let cc = m.captures(" hello= bonzo dog;"); 24 | assert_eq!(cc[0], " hello= bonzo dog;"); 25 | assert_eq!(cc[1],"hello"); 26 | assert_eq!(cc[2],"bonzo dog"); 27 | 28 | for cc in m.gmatch_captures("hello=bonzo dog; bye=cat;") { 29 | println!("'{}'='{}'",cc.get(1),cc.get(2)); 30 | } 31 | 32 | let mut m = lp::LuaPattern::new("%$(%S+)"); 33 | let res = m.gsub_with("hello $dolly you're so $fine", 34 | |cc| cc.get(1).to_uppercase() 35 | ); 36 | assert_eq!(res,"hello DOLLY you're so FINE"); 37 | 38 | let mut m = lp::LuaPattern::new("(%S+)%s*=%s*([^;]+);"); 39 | let res = m.gsub_with("alpha=bonzo; beta=felix;", 40 | |cc| format!("{}:'{}',", cc.get(1), cc.get(2)) 41 | ); 42 | assert_eq!(res, "alpha:'bonzo', beta:'felix',"); 43 | 44 | 45 | 46 | } 47 | -------------------------------------------------------------------------------- /lua-patterns/examples/multiple_captures.rs: -------------------------------------------------------------------------------- 1 | extern crate lua_patterns as lp; 2 | 3 | fn main() { 4 | let mut p = lp::LuaPattern::new("%s*(%d+)%s+(%S+)"); 5 | if let Some((int,rest)) = p.match_maybe_2(" 233 hello dolly") { 6 | assert_eq!(int,"233"); 7 | assert_eq!(rest,"hello"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /lua-patterns/examples/range.rs: -------------------------------------------------------------------------------- 1 | extern crate lua_patterns; 2 | use lua_patterns::{LuaPattern,LuaPatternBuilder}; 3 | 4 | fn main() { 5 | let mut m = LuaPattern::new("(%a+) one"); 6 | let text = " hello one two"; 7 | assert!(m.matches(text)); 8 | assert_eq!(m.capture(1),1..6); 9 | assert_eq!(m.capture(0),1..10); 10 | 11 | let v = m.captures(text); 12 | 
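// captures() allocates a Vec: index 0 is the whole match, the rest are the explicit captures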
assert_eq!(v, &["hello one","hello"]); 13 | 14 | let mut v = Vec::new(); 15 | assert!(m.capture_into(text,&mut v)); 16 | assert_eq!(v, &["hello one","hello"]); 17 | 18 | let bytes = &[0xFF,0xEE,0x0,0xDE,0x24,0x24,0xBE,0x0,0x0]; 19 | 20 | let patt = LuaPatternBuilder::new() 21 | .bytes_as_hex("DE24") 22 | .text("+") 23 | .bytes(&[0xBE]) 24 | .build(); 25 | 26 | let mut m = LuaPattern::from_bytes(&patt); 27 | assert!(m.matches_bytes(bytes)); 28 | assert_eq!(&bytes[m.capture(0)], &[0xDE,0x24,0x24,0xBE]); 29 | 30 | let mut m = LuaPattern::new("(%S+)%s*=%s*(%S+);%s*"); 31 | let res = m.gsub("a=2; b=3; c = 4;","'%2':%1 "); 32 | println!("{}",res); 33 | 34 | let mut m = LuaPattern::new("%s+"); 35 | let res = m.gsub("hello dolly you're so fine",""); 36 | println!("{}",res); 37 | 38 | } 39 | -------------------------------------------------------------------------------- /lua-patterns/examples/strings.rs: -------------------------------------------------------------------------------- 1 | // Similar to the strings(1) utility 2 | // We print any sequences involving four or more ASCII letters 3 | extern crate lua_patterns; 4 | use lua_patterns::LuaPattern; 5 | 6 | use std::env; 7 | use std::str; 8 | use std::fs::File; 9 | use std::io::prelude::*; 10 | 11 | fn main() { 12 | let file = env::args().skip(1).next().expect("provide a binary file"); 13 | let mut f = File::open(&file).expect("can't open file"); 14 | let mut buf = Vec::new(); 15 | f.read_to_end(&mut buf).expect("can't read file"); 16 | 17 | let mut words = LuaPattern::new("%a%a%a%a+"); 18 | for w in words.gmatch_bytes(&buf) { 19 | println!("{}",str::from_utf8(w).unwrap()); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /lua-patterns/readme.md: -------------------------------------------------------------------------------- 1 | ## Lua string patterns in Rust 2 | 3 | [Lua string patterns](https://www.lua.org/pil/20.2.html) are a powerful 4 | yet lightweight alternative to full regular expressions. They are not 5 | regexps, since there is no alternation (the `|` operator), but this 6 | is not usually a problem. In fact, full regexps become _too powerful_ and 7 | power can be dangerous or just plain confusing. 8 | This is why OpenBSD's httpd has [Lua patterns](http://man.openbsd.org/patterns.7). 9 | The decision to use `%` as the escape rather than the traditional `\` is refreshing. 10 | In the Rust context, `lua-patterns` is a very lightweight dependency, if you 11 | don't need the full power of the `regex` crate. 12 | 13 | This library reuses the original source from Lua 5.2 - only 14 | 400 lines of battle-tested C. I originally did this for a similar project to bring 15 | [these patterns to C++](https::/github.com/stevedonovan/rx-cpp). 16 | 17 | More information can be found on [the Lua wiki](http://lua-users.org/wiki/PatternsTutorial). 18 | The cool thing is that Lua is a 300KB download, if you want to test patterns out 19 | without going through Rust. 20 | 21 | I've organized the Rust interface much as the original Lua library, 'match', 22 | 'gmatch' and 'gsub', but made these methods of a `LuaPattern` struct. 
This is 23 | for two main reasons: 24 | 25 | - although string patterns are not compiled, they can be validated upfront 26 | - after a match, the struct contains the results 27 | 28 | ```rust 29 | extern crate lua_patterns; 30 | use lua_patterns::LuaPattern; 31 | 32 | let mut m = LuaPattern::new("one"); 33 | let text = "hello one two"; 34 | assert!(m.matches(text)); 35 | let r = m.range(); 36 | assert_eq!(r.start, 6); 37 | assert_eq!(r.end, 9); 38 | ``` 39 | This not in itself impressive, since it can be done with the string `find` 40 | method. (`new` will panic if you feed it a bad pattern, so use `new_try` if 41 | you want more control.) 42 | 43 | Once we start using patterns it gets more exciting, especially 44 | with _captures_: 45 | 46 | ```rust 47 | let mut m = LuaPattern::new("(%a+) one"); 48 | let text = " hello one two"; 49 | assert!(m.matches(text)); 50 | assert_eq!(m.capture(0),1..10); // "hello one" 51 | assert_eq!(m.capture(1),1..6); // "hello" 52 | ``` 53 | Lua patterns (like regexps) are not anchored by default, so this finds 54 | the first match and works from there. The 0 capture always exists 55 | (the full match) and here the 1 capture just picks up the first word. 56 | 57 | > There is an obvious limitation: "%a" refers specifically to a single byte 58 | > representing a letter according to the C locale. Lua people will often 59 | > look for 'sequence of non-spaces' ("%S+"), etc - that is, identify maybe-UTF-8 60 | > sequences using surronding punctuation or spaces. 61 | 62 | If you want your captures as strings, then there are several options. If there's 63 | just one, then `match_maybe` is useful: 64 | 65 | ```rust 66 | let mut m = LuaPattern::new("OK%s+(%d+)"); 67 | let res = m.match_maybe("and that's OK 400 to you"); 68 | assert_eq!(res, Some("400")); 69 | ``` 70 | You can grab them as a vector (it will be empty if the match fails.) 71 | 72 | ```rust 73 | let mut m = LuaPattern::new("(%a+) one"); 74 | let text = " hello one two"; 75 | let v = m.captures(text); 76 | assert_eq!(v, &["hello one","hello"]); 77 | ``` 78 | This will create a vector. You can avoid excessive allocations with `capture_into`: 79 | 80 | ```rust 81 | let mut v = Vec::new(); 82 | if m.capture_into(text,&mut v) { 83 | assert_eq!(v, &["hello one","hello"]); 84 | } 85 | ``` 86 | Imagine that this is happening in a loop - the vector is only allocated the first 87 | time it is filled, and thereafter there are no allocations. It's a convenient 88 | method if you are checking text against several patterns, and is actually 89 | more ergonomic than using Lua's `string.match`. (Personally I prefer 90 | to use those marvelous things called "if statements" rather than elaborate 91 | regular expressions.) 92 | 93 | The `gmatch` method creates an interator over all matched strings. 94 | 95 | ```rust 96 | let mut m = lp::LuaPattern::new("%S+"); 97 | let split: Vec<_> = m.gmatch("dog cat leopard wolf ").collect(); 98 | assert_eq!(split,&["dog","cat","leopard","wolf"]); 99 | ``` 100 | A single match is returned; if the pattern has no captures, you get the full match, 101 | otherwise you get the first match. So "(%S+)" would give you the same result. 102 | 103 | A more general version is `gmatch_captures` which creates a _streaming_ iterator 104 | over captures. You have to be a little careful with this one; in particular, you 105 | will get nonsense if you try to `collect` on the return captures: don't try to 106 | keep these values. 
107 | It is fine to collect from an expression involving the `get` method however! 108 | 109 | ```rust 110 | let mut m = lua_patterns::LuaPattern::new("(%S)%S+"); 111 | let split: Vec<_> = m.gmatch_captures("dog cat leopard wolf") 112 | .map(|cc| cc.get(1)).collect(); 113 | assert_eq!(split,&["d","c","l","w"]); 114 | ``` 115 | 116 | Text substitution is an old favourite of mine, so here's `gsub_with`: 117 | 118 | ```rust 119 | let mut m = lp::LuaPattern::new("%$(%S+)"); 120 | let res = m.gsub_with("hello $dolly you're so $fine", 121 | |cc| cc.get(1).to_uppercase() 122 | ); 123 | assert_eq!(res,"hello DOLLY you're so FINE"); 124 | ``` 125 | The closure is passed a `Closures` object and the captures are accessed 126 | using the `get` method; it returns a `String`. 127 | 128 | The second form of `gsub` is convenient when you have a replacement 129 | string, which may contain closure references. (To add a literal "%" escape 130 | it like so "%%") 131 | 132 | ```rust 133 | let mut m = LuaPattern::new("%s+"); 134 | let res = m.gsub("hello dolly you're so fine",""); 135 | assert_eq!(res, "hellodollyyou'resofine"); 136 | 137 | let mut m = LuaPattern::new("(%S+)%s*=%s*(%S+);%s*"); 138 | let res = m.gsub("a=2; b=3; c = 4;", "'%2':%1 "); 139 | assert_eq!(res, "'2':a '3':b '4':c "); 140 | ``` 141 | The third form of `string.gsub` in Lua does lookup with a table - that is, a map. 142 | But for maps you really want to handle the 'not found' case in some special way: 143 | 144 | ```rust 145 | let mut map = HashMap::new(); 146 | // updating old lines for the 21st Century 147 | map.insert("dolly", "baby"); 148 | map.insert("fine", "cool"); 149 | map.insert("good-looking", "pretty"); 150 | 151 | let mut m = LuaPattern::new("%$%((.-)%)"); 152 | let res = m.gsub_with("hello $(dolly) you're so $(fine) and $(good-looking)", 153 | |cc| map.get(cc.get(1)).unwrap_or(&"?").to_string() 154 | ); 155 | assert_eq!(res,"hello baby you're so cool and pretty"); 156 | ``` 157 | 158 | (The ".-" pattern means 'match as little as possible' - often called 'lazy' 159 | matching.) 160 | 161 | This is equivalent to a replace string "%1:'%2'": 162 | 163 | ```rust 164 | let mut m = lp::LuaPattern::new("(%S+)%s*=%s*([^;]+);"); 165 | let res = m.gsub_with("alpha=bonzo; beta=felix;", 166 | |cc| format!("{}:'{}',", cc.get(1), cc.get(2)) 167 | ); 168 | assert_eq!(res, "alpha:'bonzo', beta:'felix',"); 169 | ``` 170 | Having a byte-oriented pattern matcher can be useful. For instance, this 171 | is basically the old `strings` utility - we read all of a 'binary' file into 172 | a vector of bytes, and then use `gmatch_bytes` to iterate over all `&[u8]` 173 | matches corresponding to two or more adjacent ASCII letters: 174 | 175 | ```rust 176 | let mut words = LuaPattern::new("%a%a+"); 177 | for w in words.gmatch_bytes(&buf) { 178 | println!("{}",std::str::from_utf8(w).unwrap()); 179 | } 180 | ``` 181 | The pattern itself may be arbitrary bytes - Lua 'string' matching does 182 | not care about embedded nul bytes: 183 | 184 | ```rust 185 | let patt = &[0xDE,0x00,b'+',0xBE]; 186 | let bytes = &[0xFF,0xEE,0x0,0xDE,0x0,0x0,0xBE,0x0,0x0]; 187 | 188 | let mut m = LuaPattern::from_bytes(patt); 189 | assert!(m.matches_bytes(bytes)); 190 | assert_eq!(&bytes[m.capture(0)], &[0xDE,0x00,0x00,0xBE]); 191 | ``` 192 | The problem here is that it's not obvious when our 'arbitrary' bytes 193 | include one of the special matching characters like `$` (which is 0x24) 194 | and so on. 
Hence there is `LuaPatternBuilder`: 195 | 196 | ```rust 197 | let bytes = &[0xFF,0xEE,0x0,0xDE,0x24,0x24,0xBE,0x0,0x0]; 198 | 199 | let patt = LuaPatternBuilder::new() 200 | .bytes_as_hex("DE24") // less tedious than a byte slice 201 | .text("+") // unescaped 202 | .bytes(&[0xBE]) // byte slice 203 | .build(); 204 | 205 | let mut m = LuaPattern::from_bytes(&patt); 206 | // picks up "DE2424BE" 207 | ``` 208 | > Static verification: this version attempts to verify string patterns. If you 209 | > want errors, use `new_try` and `from_bytes_try`, otherwise the constructors panic. 210 | > If a match panics after successful verification, it is a __BUG__ - please 211 | > report the offending pattern. 212 | 213 | -------------------------------------------------------------------------------- /lua-patterns/src/errors.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::error::Error; 3 | 4 | /// Error type returned by _try methods 5 | #[derive(Debug,PartialEq)] 6 | pub struct PatternError(pub String); 7 | 8 | impl fmt::Display for PatternError { 9 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 10 | write!(f,"{}",self.0) 11 | } 12 | } 13 | 14 | impl Error for PatternError { 15 | fn description(&self) -> &str { 16 | &self.0 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /lua-patterns/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! This is a Rust binding to [Lua string patterns](https://www.lua.org/pil/20.2.html), 2 | //! using the original code from Lua 5.2. 3 | //! 4 | //! Although not regular expressions (they lack alternation) they are a powerful 5 | //! and lightweight way to process text. Please note that they are not 6 | //! UTF-8-aware, and in fact can process arbitrary binary data. 7 | //! 8 | //! `LuaPattern` can be created from a string _or_ a byte slice, and has 9 | //! methods which are similar to the original Lua API. Please see 10 | //! [the README](https://github.com/stevedonovan/lua-patterns/blob/master/readme.md) 11 | //! for more discussion. 12 | //! 13 | //! [LuaPattern](struct.LuaPattern.html) implements the public API. 14 | //! 15 | //! ## Examples 16 | //! 17 | //! ```rust 18 | //! extern crate lua_patterns; 19 | //! let mut m = lua_patterns::LuaPattern::new("one"); 20 | //! let text = "hello one two"; 21 | //! assert!(m.matches(text)); 22 | //! let r = m.range(); 23 | //! assert_eq!(r.start, 6); 24 | //! assert_eq!(r.end, 9); 25 | //! ``` 26 | //! 27 | //! Collecting captures from a match: 28 | //! 29 | //! ```rust 30 | //! extern crate lua_patterns; 31 | //! let text = " hello one"; 32 | //! let mut m = lua_patterns::LuaPattern::new("(%S+) one"); 33 | //! 34 | //! // allocates a vector of captures 35 | //! let v = m.captures(text); 36 | //! assert_eq!(v, &["hello one","hello"]); 37 | //! let mut v = Vec::new(); 38 | //! // writes captures into preallocated vector 39 | //! if m.capture_into(text,&mut v) { 40 | //! assert_eq!(v, &["hello one","hello"]); 41 | //! } 42 | //! 
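//! // on a failed match, capture_into returns false and leaves the vector empty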
``` 43 | 44 | use std::ops; 45 | 46 | pub mod errors; 47 | use errors::*; 48 | mod luapat; 49 | use luapat::*; 50 | 51 | 52 | /// Represents a Lua string pattern and the results of a match 53 | pub struct LuaPattern<'a> { 54 | patt: &'a [u8], 55 | matches: [LuaMatch; LUA_MAXCAPTURES], 56 | n_match: usize 57 | } 58 | 59 | impl <'a> LuaPattern<'a> { 60 | /// Maybe create a new Lua pattern from a slice of bytes 61 | pub fn from_bytes_try (bytes: &'a [u8]) -> Result,PatternError> { 62 | str_check(bytes)?; 63 | let matches = [LuaMatch{start: 0, end: 0}; LUA_MAXCAPTURES]; 64 | Ok(LuaPattern{patt: bytes, matches: matches, n_match: 0}) 65 | } 66 | 67 | /// Maybe create a new Lua pattern from a string 68 | pub fn new_try(patt: &'a str) -> Result,PatternError> { 69 | LuaPattern::from_bytes_try(patt.as_bytes()) 70 | } 71 | 72 | /// Create a new Lua pattern from a string, panicking if bad 73 | pub fn new(patt: &'a str) -> LuaPattern<'a> { 74 | LuaPattern::new_try(patt).expect("bad pattern") 75 | } 76 | 77 | /// Create a new Lua pattern from a slice of bytes, panicking if bad 78 | pub fn from_bytes (bytes: &'a [u8]) -> LuaPattern<'a> { 79 | LuaPattern::from_bytes_try(bytes).expect("bad pattern") 80 | } 81 | 82 | /// Match a slice of bytes with a pattern 83 | /// 84 | /// ``` 85 | /// let patt = &[0xFE,0xEE,b'+',0xED]; 86 | /// let mut m = lua_patterns::LuaPattern::from_bytes(patt); 87 | /// let bytes = &[0x00,0x01,0xFE,0xEE,0xEE,0xED,0xEF]; 88 | /// assert!(m.matches_bytes(bytes)); 89 | /// assert_eq!(&bytes[m.range()], &[0xFE,0xEE,0xEE,0xED]); 90 | /// ``` 91 | pub fn matches_bytes(&mut self, s: &[u8]) -> bool { 92 | self.n_match = str_match(s,self.patt,&mut self.matches).expect("Should not fail - report as bug"); 93 | self.n_match > 0 94 | } 95 | 96 | /// Match a string with a pattern 97 | /// 98 | /// ``` 99 | /// let mut m = lua_patterns::LuaPattern::new("(%a+) one"); 100 | /// let text = " hello one two"; 101 | /// assert!(m.matches(text)); 102 | /// ``` 103 | pub fn matches(&mut self, text: &str) -> bool { 104 | self.matches_bytes(text.as_bytes()) 105 | } 106 | 107 | /// Match a string, returning first capture if successful 108 | /// 109 | /// ``` 110 | /// let mut m = lua_patterns::LuaPattern::new("OK%s+(%d+)"); 111 | /// let res = m.match_maybe("and that's OK 400 to you"); 112 | /// assert_eq!(res, Some("400")); 113 | /// ``` 114 | pub fn match_maybe<'t>(&mut self, text: &'t str) -> Option<&'t str> { 115 | if self.matches(text) { 116 | Some(&text[self.first_capture()]) 117 | } else { 118 | None 119 | } 120 | } 121 | 122 | /// Match a string, returning first two explicit captures if successful 123 | /// 124 | /// ``` 125 | /// let mut p = lua_patterns::LuaPattern::new("%s*(%d+)%s+(%S+)"); 126 | /// let (int,rest) = p.match_maybe_2(" 233 hello dolly").unwrap(); 127 | /// assert_eq!(int,"233"); 128 | /// assert_eq!(rest,"hello"); 129 | /// ``` 130 | pub fn match_maybe_2<'t>(&mut self, text: &'t str) -> Option<(&'t str,&'t str)> { 131 | if self.matches(text) { 132 | let cc = self.match_captures(text); 133 | if cc.num_matches() != 3 { return None; } 134 | Some((cc.get(1),cc.get(2))) 135 | } else { 136 | None 137 | } 138 | } 139 | 140 | /// Match a string, returning first three explicit captures if successful 141 | /// 142 | /// ``` 143 | /// let mut p = lua_patterns::LuaPattern::new("(%d+)/(%d+)/(%d+)"); 144 | /// let (y,m,d) = p.match_maybe_3("2017/11/10").unwrap(); 145 | /// assert_eq!(y,"2017"); 146 | /// assert_eq!(m,"11"); 147 | /// assert_eq!(d,"10"); 148 | /// ``` 149 | pub fn 
match_maybe_3<'t>(&mut self, text: &'t str) -> Option<(&'t str,&'t str,&'t str)> { 150 | if self.matches(text) { 151 | let cc = self.match_captures(text); 152 | if cc.num_matches() != 4 { return None; } 153 | Some((cc.get(1),cc.get(2),cc.get(3))) 154 | } else { 155 | None 156 | } 157 | } 158 | 159 | /// Match and collect all captures as a vector of string slices 160 | /// 161 | /// ``` 162 | /// let mut m = lua_patterns::LuaPattern::new("(one).+"); 163 | /// assert_eq!(m.captures(" one two"), &["one two","one"]); 164 | /// ``` 165 | pub fn captures<'b>(&mut self, text: &'b str) -> Vec<&'b str> { 166 | let mut res = Vec::new(); 167 | self.capture_into(text, &mut res); 168 | res 169 | } 170 | 171 | /// A convenient way to access the captures with no allocation 172 | /// 173 | /// ```rust 174 | /// let text = " hello one"; 175 | /// let mut m = lua_patterns::LuaPattern::new("(%S+) one"); 176 | /// if m.matches(text) { 177 | /// let cc = m.match_captures(text); 178 | /// assert_eq!(cc.get(0), "hello one"); 179 | /// assert_eq!(cc.get(1), "hello"); 180 | /// } 181 | /// ``` 182 | pub fn match_captures<'b,'c>(&'c self, text: &'b str) -> Captures<'a,'b,'c> { 183 | Captures {m: self, text: text} 184 | } 185 | 186 | /// Match and collect all captures into the provided vector. 187 | /// 188 | /// ```rust 189 | /// let text = " hello one"; 190 | /// let mut m = lua_patterns::LuaPattern::new("(%S+) one"); 191 | /// let mut v = Vec::new(); 192 | /// if m.capture_into(text,&mut v) { 193 | /// assert_eq!(v, &["hello one","hello"]); 194 | /// } 195 | /// ``` 196 | pub fn capture_into<'b>(&mut self, text: &'b str, vec: &mut Vec<&'b str>) -> bool { 197 | self.matches(text); 198 | vec.clear(); 199 | for i in 0..self.n_match { 200 | vec.push(&text[self.capture(i)]); 201 | } 202 | self.n_match > 0 203 | } 204 | 205 | /// The full match (same as `capture(0)`) 206 | pub fn range(&self) -> ops::Range { 207 | self.capture(0) 208 | } 209 | 210 | /// Get the nth capture of the match. 211 | /// 212 | /// ``` 213 | /// let mut m = lua_patterns::LuaPattern::new("(%a+) one"); 214 | /// let text = " hello one two"; 215 | /// assert!(m.matches(text)); 216 | /// assert_eq!(m.capture(0),1..10); 217 | /// assert_eq!(m.capture(1),1..6); 218 | /// ``` 219 | pub fn capture(&self, i: usize) -> ops::Range { 220 | ops::Range{ 221 | start: self.matches[i].start as usize, 222 | end: self.matches[i].end as usize 223 | } 224 | } 225 | 226 | /// Get the 'first' capture of the match 227 | /// 228 | /// If there are no matches, this is the same as `range`, 229 | /// otherwise it's `capture(1)` 230 | pub fn first_capture(&self) -> ops::Range { 231 | let idx = if self.n_match > 1 {1} else {0}; 232 | self.capture(idx) 233 | } 234 | 235 | /// An iterator over all matches in a string. 236 | /// 237 | /// The matches are returned as string slices; if there are no 238 | /// captures the full match is used, otherwise the first capture. 239 | /// That is, this example will also work with the pattern "(%S+)". 240 | /// 241 | /// ``` 242 | /// let mut m = lua_patterns::LuaPattern::new("%S+"); 243 | /// let split: Vec<_> = m.gmatch("dog cat leopard wolf").collect(); 244 | /// assert_eq!(split,&["dog","cat","leopard","wolf"]); 245 | /// ``` 246 | pub fn gmatch<'b,'c>(&'c mut self, text: &'b str) -> GMatch<'a,'b,'c> { 247 | GMatch{m: self, text: text} 248 | } 249 | 250 | /// An iterator over all captures in a string. 
251 | /// 252 | /// The matches are returned as captures; this is a _streaming_ 253 | /// iterator, so don't try to collect the captures directly; extract 254 | /// the string slices using `get`. 255 | /// 256 | /// ``` 257 | /// let mut m = lua_patterns::LuaPattern::new("(%S)%S+"); 258 | /// let split: Vec<_> = m.gmatch_captures("dog cat leopard wolf") 259 | /// .map(|cc| cc.get(1)).collect(); 260 | /// assert_eq!(split,&["d","c","l","w"]); 261 | /// ``` 262 | pub fn gmatch_captures<'b,'c>(&'c mut self, text: &'b str) -> GMatchCaptures<'a,'b,'c> { 263 | GMatchCaptures{m: self, text: text} 264 | } 265 | 266 | /// An iterator over all matches in a slice of bytes. 267 | /// 268 | /// ``` 269 | /// let bytes = &[0xAA,0x01,0x01,0x03,0xBB,0x01,0x01,0x01]; 270 | /// let patt = &[0x01,b'+']; 271 | /// let mut m = lua_patterns::LuaPattern::from_bytes(patt); 272 | /// let mut iter = m.gmatch_bytes(bytes); 273 | /// assert_eq!(iter.next().unwrap(), &[0x01,0x01]); 274 | /// assert_eq!(iter.next().unwrap(), &[0x01,0x01,0x01]); 275 | /// assert_eq!(iter.next(), None); 276 | /// ``` 277 | pub fn gmatch_bytes<'b>(&'a mut self, bytes: &'b [u8]) -> GMatchBytes<'a,'b> { 278 | GMatchBytes{m: self, bytes: bytes} 279 | } 280 | 281 | /// Globally substitute all matches with a replacement 282 | /// provided by a function of the captures. 283 | /// 284 | /// ``` 285 | /// let mut m = lua_patterns::LuaPattern::new("%$(%S+)"); 286 | /// let res = m.gsub_with("hello $dolly you're so $fine!", 287 | /// |cc| cc.get(1).to_uppercase() 288 | /// ); 289 | /// assert_eq!(res, "hello DOLLY you're so FINE!"); 290 | /// ``` 291 | pub fn gsub_with (&mut self, text: &str, lookup: F) -> String 292 | where F: Fn(Captures)-> String { 293 | let mut slice = text; 294 | let mut res = String::new(); 295 | while self.matches(slice) { 296 | // full range of match 297 | let all = self.range(); 298 | // append everything up to match 299 | res.push_str(&slice[0..all.start]); 300 | let captures = Captures{m: self, text: slice}; 301 | let repl = lookup(captures); 302 | res.push_str(&repl); 303 | slice = &slice[all.end..]; 304 | } 305 | res.push_str(slice); 306 | res 307 | } 308 | 309 | /// Globally substitute all matches with a replacement string 310 | /// 311 | /// This string _may_ have capture references ("%0",..). Use "%%" 312 | /// to represent "%". Plain strings like "" work just fine ;) 313 | /// 314 | /// ``` 315 | /// let mut m = lua_patterns::LuaPattern::new("(%S+)%s*=%s*(%S+);%s*"); 316 | /// let res = m.gsub("a=2; b=3; c = 4;", "'%2':%1 "); 317 | /// assert_eq!(res,"'2':a '3':b '4':c "); 318 | /// ``` 319 | pub fn gsub (&mut self, text: &str, repl: &str) -> String { 320 | let repl = generate_gsub_patterns(repl); 321 | let mut slice = text; 322 | let mut res = String::new(); 323 | while self.matches(slice) { 324 | let all = self.range(); 325 | res.push_str(&slice[0..all.start]); 326 | let captures = Captures{m: self, text: slice}; 327 | for r in &repl { 328 | match *r { 329 | Subst::Text(ref s) => res.push_str(&s), 330 | Subst::Capture(i) => res.push_str(captures.get(i)) 331 | } 332 | } 333 | slice = &slice[all.end..]; 334 | } 335 | res.push_str(slice); 336 | res 337 | } 338 | 339 | /// Globally substitute all _byte_ matches with a replacement 340 | /// provided by a function of the captures. 
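/// The captures are raw `&[u8]` slices and the closure returns the replacement bytes as a `Vec<u8>`.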
341 | /// 342 | /// ``` 343 | /// let bytes = &[0xAA,0x01,0x02,0x03,0xBB]; 344 | /// let patt = &[0x01,0x02]; 345 | /// let mut m = lua_patterns::LuaPattern::from_bytes(patt); 346 | /// let res = m.gsub_bytes_with(bytes,|cc| vec![0xFF]); 347 | /// assert_eq!(res, &[0xAA,0xFF,0x03,0xBB]); 348 | /// ``` 349 | pub fn gsub_bytes_with (&mut self, bytes: &[u8], lookup: F) -> Vec 350 | where F: Fn(ByteCaptures)-> Vec { 351 | let mut slice = bytes; 352 | let mut res = Vec::new(); 353 | while self.matches_bytes(slice) { 354 | let all = self.range(); 355 | let capture = &slice[0..all.start]; 356 | res.extend_from_slice(capture); 357 | let captures = ByteCaptures{m: self, bytes: slice}; 358 | let repl = lookup(captures); 359 | res.extend(repl); 360 | slice = &slice[all.end..]; 361 | } 362 | res.extend_from_slice(slice); 363 | res 364 | } 365 | 366 | } 367 | 368 | #[derive(Debug)] 369 | pub enum Subst { 370 | Text(String), 371 | Capture(usize) 372 | } 373 | 374 | impl Subst { 375 | fn new_text(text: &str) -> Subst { 376 | Subst::Text(text.to_string()) 377 | } 378 | } 379 | 380 | pub fn generate_gsub_patterns(repl: &str) -> Vec { 381 | let mut m = LuaPattern::new("%%([%%%d])"); 382 | let mut res = Vec::new(); 383 | let mut slice = repl; 384 | while m.matches(slice) { 385 | let all = m.range(); 386 | let before = &slice[0..all.start]; 387 | if before != "" { 388 | res.push(Subst::new_text(before)); 389 | } 390 | let capture = &slice[m.capture(1)]; 391 | if capture == "%" { // escaped literal '%' 392 | res.push(Subst::new_text("%")); 393 | } else { // has to be a digit 394 | let index: usize = capture.parse().unwrap(); 395 | res.push(Subst::Capture(index)); 396 | } 397 | slice = &slice[all.end..]; 398 | } 399 | res.push(Subst::new_text(slice)); 400 | res 401 | } 402 | 403 | pub struct Substitute { 404 | repl: Vec 405 | } 406 | 407 | impl Substitute { 408 | pub fn new(repl: &str) -> Substitute { 409 | Substitute{ 410 | repl: generate_gsub_patterns(repl) 411 | } 412 | } 413 | 414 | pub fn subst(&self, patt: &LuaPattern, text: &str) -> String { 415 | let mut res = String::new(); 416 | let captures = patt.match_captures(text); 417 | for r in &self.repl { 418 | match *r { 419 | Subst::Text(ref s) => res.push_str(&s), 420 | Subst::Capture(i) => res.push_str(captures.get(i)) 421 | } 422 | } 423 | res 424 | } 425 | 426 | } 427 | 428 | 429 | 430 | /// Low-overhead convenient access to string match captures 431 | // note: there are three borrows going on here. 
432 | // The lifetime 'a is for the _pattern_, the lifetime 'b is 433 | // for the _source string_, and 'c is for the reference to LuaPattern 434 | // And the LuaPattern reference cannot live longer than the pattern reference 435 | pub struct Captures<'a,'b,'c> where 'a: 'c { 436 | m: &'c LuaPattern<'a>, 437 | text: &'b str 438 | } 439 | 440 | impl <'a,'b,'c> Captures<'a,'b,'c> { 441 | /// get the capture as a string slice 442 | pub fn get(&self, i: usize) -> &'b str { 443 | &self.text[self.m.capture(i)] 444 | } 445 | 446 | /// number of matches 447 | pub fn num_matches(&self) -> usize { 448 | self.m.n_match 449 | } 450 | } 451 | 452 | 453 | /// Low-overhead convenient access to byte match captures 454 | pub struct ByteCaptures<'a,'b> { 455 | m: &'a LuaPattern<'a>, 456 | bytes: &'b [u8] 457 | } 458 | 459 | impl <'a,'b> ByteCaptures<'a,'b> { 460 | /// get the capture as a byte slice 461 | pub fn get(&self, i: usize) -> &'b [u8] { 462 | &self.bytes[self.m.capture(i)] 463 | } 464 | 465 | /// number of matches 466 | pub fn num_matches(&self) -> usize { 467 | self.m.n_match 468 | } 469 | } 470 | 471 | /// Iterator for all string slices from `gmatch` 472 | // note lifetimes as for Captures above! 473 | pub struct GMatch<'a,'b,'c> where 'a: 'c { 474 | m: &'c mut LuaPattern<'a>, 475 | text: &'b str 476 | } 477 | 478 | impl <'a,'b,'c>Iterator for GMatch<'a,'b,'c> { 479 | type Item = &'b str; 480 | 481 | fn next(&mut self) -> Option { 482 | if ! self.m.matches(self.text) { 483 | None 484 | } else { 485 | let slice = &self.text[self.m.first_capture()]; 486 | self.text = &self.text[self.m.range().end..]; 487 | Some(slice) 488 | } 489 | } 490 | 491 | } 492 | 493 | /// Unsafe version of Captures, needed for gmatch_captures 494 | // It's unsafe because the lifetime only depends on the original 495 | // text, not the borrowed matches. 496 | pub struct CapturesUnsafe<'b>{ 497 | matches: *const LuaMatch, 498 | text: &'b str 499 | } 500 | 501 | impl <'b> CapturesUnsafe<'b> { 502 | /// get the capture as a string slice 503 | pub fn get(&self, i: usize) -> &'b str { 504 | unsafe { 505 | let p = self.matches.offset(i as isize); 506 | let range = 507 | ops::Range{ 508 | start: (*p).start as usize, 509 | end: (*p).end as usize 510 | }; 511 | &self.text[range] 512 | } 513 | } 514 | } 515 | 516 | /// Streaming iterator for all captures from `gmatch_captures` 517 | // lifetimes as for Captures above! 518 | // 'a is pattern, 'b is text, 'c is ref to LuaPattern 519 | pub struct GMatchCaptures<'a,'b,'c> where 'a: 'c { 520 | m: &'c mut LuaPattern<'a>, 521 | text: &'b str 522 | } 523 | 524 | impl <'a,'b,'c> Iterator for GMatchCaptures<'a,'b,'c> where 'a: 'c { 525 | type Item = CapturesUnsafe<'b>; 526 | 527 | fn next(&mut self) -> Option { 528 | if ! self.m.matches(self.text) { 529 | None 530 | } else { 531 | let split = self.text.split_at(self.m.range().end); 532 | self.text = split.1; 533 | let match_ptr: *const LuaMatch = self.m.matches.as_ptr(); 534 | Some(CapturesUnsafe{matches: match_ptr, text: split.0}) 535 | } 536 | } 537 | 538 | } 539 | 540 | /// Iterator for all byte slices from `gmatch_bytes` 541 | pub struct GMatchBytes<'a,'b> { 542 | m: &'a mut LuaPattern<'a>, 543 | bytes: &'b [u8] 544 | } 545 | 546 | impl <'a,'b>Iterator for GMatchBytes<'a,'b> { 547 | type Item = &'b [u8]; 548 | 549 | fn next(&mut self) -> Option { 550 | if ! 
self.m.matches_bytes(self.bytes) { 551 | None 552 | } else { 553 | let slice = &self.bytes[self.m.first_capture()]; 554 | self.bytes = &self.bytes[self.m.range().end..]; 555 | Some(slice) 556 | } 557 | } 558 | 559 | } 560 | 561 | /// Build a byte Lua pattern, optionally escaping 'magic' characters 562 | pub struct LuaPatternBuilder { 563 | bytes: Vec 564 | } 565 | 566 | impl LuaPatternBuilder { 567 | /// Create a new Lua pattern builder 568 | pub fn new() -> LuaPatternBuilder { 569 | LuaPatternBuilder{bytes: Vec::new()} 570 | } 571 | 572 | /// Add unescaped characters from a string 573 | /// 574 | /// ``` 575 | /// let patt = lua_patterns::LuaPatternBuilder::new() 576 | /// .text("(boo)") 577 | /// .build(); 578 | /// assert_eq!(std::str::from_utf8(&patt).unwrap(), "(boo)"); 579 | /// ``` 580 | pub fn text(&mut self, s: &str) -> &mut Self { 581 | self.bytes.extend_from_slice(s.as_bytes()); 582 | self 583 | } 584 | 585 | /// Add unescaped characters from lines 586 | /// 587 | /// This looks for first non-whitespace run in each line, 588 | /// useful for spreading patterns out and commmenting them. 589 | /// Works with patterns that use '%s' religiously! 590 | /// 591 | /// ``` 592 | /// let patt = lua_patterns::LuaPatternBuilder::new() 593 | /// .text_lines(" 594 | /// hello-dolly 595 | /// you-are-fine # comment 596 | /// cool 597 | /// ") 598 | /// .build(); 599 | /// assert_eq!(std::str::from_utf8(&patt).unwrap(), 600 | /// "hello-dollyyou-are-finecool"); 601 | /// ``` 602 | pub fn text_lines(&mut self, lines: &str) -> &mut Self { 603 | let mut text = String::new(); 604 | for line in lines.lines() { 605 | if let Some(first) = line.split_whitespace().next() { 606 | text.push_str(first); 607 | } 608 | } 609 | self.text(&text) 610 | } 611 | 612 | /// Add escaped bytes from a slice 613 | /// 614 | /// ``` 615 | /// let patt = lua_patterns::LuaPatternBuilder::new() 616 | /// .text("^") 617 | /// .bytes(b"^") // magic character! 618 | /// .build(); 619 | /// assert_eq!(std::str::from_utf8(&patt).unwrap(), "^%^"); 620 | /// ``` 621 | pub fn bytes(&mut self, b: &[u8]) -> &mut Self { 622 | let mut m = LuaPattern::new("[%-%.%+%[%]%(%)%$%^%%%?%*]"); 623 | let bb = m.gsub_bytes_with(b,|cc| { 624 | let mut res = Vec::new(); 625 | res.push(b'%'); 626 | res.push(cc.get(0)[0]); 627 | res 628 | }); 629 | self.bytes.extend(bb); 630 | self 631 | } 632 | 633 | /// Add escaped bytes from hex string 634 | /// 635 | /// This consists of adjacent pairs of hex digits. 
636 | /// 637 | /// ``` 638 | /// let patt = lua_patterns::LuaPatternBuilder::new() 639 | /// .text("^") 640 | /// .bytes_as_hex("5E") // which is ASCII '^' 641 | /// .build(); 642 | /// assert_eq!(std::str::from_utf8(&patt).unwrap(), "^%^"); 643 | /// ``` 644 | pub fn bytes_as_hex(&mut self, bs: &str) -> &mut Self { 645 | let bb = LuaPatternBuilder::hex_to_bytes(bs); 646 | self.bytes(&bb) 647 | } 648 | 649 | /// Create the pattern 650 | pub fn build(&mut self) -> Vec { 651 | let mut v = Vec::new(); 652 | std::mem::swap(&mut self.bytes, &mut v); 653 | v 654 | } 655 | 656 | /// Utility to create a vector of bytes from a hex string 657 | /// 658 | /// ``` 659 | /// let bb = lua_patterns::LuaPatternBuilder::hex_to_bytes("AEFE00FE"); 660 | /// assert_eq!(bb, &[0xAE,0xFE,0x00,0xFE]); 661 | /// ``` 662 | pub fn hex_to_bytes(s: &str) -> Vec { 663 | let mut m = LuaPattern::new("%x%x"); 664 | m.gmatch(s).map(|pair| u8::from_str_radix(pair,16).unwrap()).collect() 665 | } 666 | 667 | /// Utility to create a hex string from a slice of bytes 668 | /// 669 | /// ``` 670 | /// let hex = lua_patterns::LuaPatternBuilder::bytes_to_hex(&[0xAE,0xFE,0x00,0xFE]); 671 | /// assert_eq!(hex,"AEFE00FE"); 672 | /// 673 | /// ``` 674 | pub fn bytes_to_hex(s: &[u8]) -> String { 675 | s.iter().map(|b| format!("{:02X}",b)).collect() 676 | } 677 | 678 | } 679 | 680 | #[cfg(test)] 681 | mod tests { 682 | use super::*; 683 | 684 | #[test] 685 | fn captures_and_matching() { 686 | let mut m = LuaPattern::new("(one).+"); 687 | assert_eq!(m.captures(" one two"), &["one two","one"]); 688 | let empty: &[&str] = &[]; 689 | assert_eq!(m.captures("four"), empty); 690 | 691 | assert_eq!(m.matches("one dog"), true); 692 | assert_eq!(m.matches("dog one "), true); 693 | assert_eq!(m.matches("dog one"), false); 694 | 695 | let text = "one dog"; 696 | let mut m = LuaPattern::new("^(%a+)"); 697 | assert_eq!(m.matches(text), true); 698 | assert_eq!(&text[m.capture(1)], "one"); 699 | assert_eq!(m.matches(" one dog"), false); 700 | 701 | // captures without allocation 702 | m.matches(text); 703 | let captures = m.match_captures(text); 704 | assert_eq!(captures.get(0), "one"); 705 | assert_eq!(captures.get(1), "one"); 706 | 707 | let mut m = LuaPattern::new("(%S+)%s*=%s*(.+)"); 708 | 709 | // captures as Vec 710 | let cc = m.captures(" hello= bonzo dog"); 711 | assert_eq!(cc[0], "hello= bonzo dog"); 712 | assert_eq!(cc[1], "hello"); 713 | assert_eq!(cc[2], "bonzo dog"); 714 | 715 | } 716 | 717 | #[test] 718 | fn multiple_captures() { 719 | let mut p = LuaPattern::new("%s*(%d+)%s+(%S+)"); 720 | let (int,rest) = p.match_maybe_2(" 233 hello dolly").unwrap(); 721 | assert_eq!(int,"233"); 722 | assert_eq!(rest,"hello"); 723 | } 724 | 725 | #[test] 726 | fn gmatch() { 727 | let mut m = LuaPattern::new("%a+"); 728 | let mut iter = m.gmatch("one two three"); 729 | assert_eq!(iter.next(), Some("one")); 730 | assert_eq!(iter.next(), Some("two")); 731 | assert_eq!(iter.next(), Some("three")); 732 | assert_eq!(iter.next(), None); 733 | 734 | let mut m = LuaPattern::new("(%a+)"); 735 | let mut iter = m.gmatch("one two three"); 736 | assert_eq!(iter.next(), Some("one")); 737 | assert_eq!(iter.next(), Some("two")); 738 | assert_eq!(iter.next(), Some("three")); 739 | assert_eq!(iter.next(), None); 740 | 741 | let mut m = LuaPattern::new("(%a+)"); 742 | let mut iter = m.gmatch_captures("one two three"); 743 | assert_eq!(iter.next().unwrap().get(1), "one"); 744 | assert_eq!(iter.next().unwrap().get(1), "two"); 745 | assert_eq!(iter.next().unwrap().get(1), 
"three"); 746 | } 747 | 748 | #[test] 749 | fn gsub() { 750 | use std::collections::HashMap; 751 | 752 | let mut m = LuaPattern::new("%$(%S+)"); 753 | let res = m.gsub_with("hello $dolly you're so $fine!", 754 | |cc| cc.get(1).to_uppercase() 755 | ); 756 | assert_eq!(res, "hello DOLLY you're so FINE!"); 757 | 758 | let mut map = HashMap::new(); 759 | map.insert("dolly", "baby"); 760 | map.insert("fine", "cool"); 761 | map.insert("good-looking", "pretty"); 762 | 763 | let mut m = LuaPattern::new("%$%((.-)%)"); 764 | let res = m.gsub_with("hello $(dolly) you're so $(fine) and $(good-looking)", 765 | |cc| map.get(cc.get(1)).unwrap_or(&"?").to_string() 766 | ); 767 | assert_eq!(res, "hello baby you're so cool and pretty"); 768 | 769 | let mut m = LuaPattern::new("%s+"); 770 | let res = m.gsub("hello dolly you're so fine",""); 771 | assert_eq!(res, "hellodollyyou'resofine"); 772 | 773 | let mut m = LuaPattern::new("(%S+)%s*=%s*(%S+);%s*"); 774 | let res = m.gsub("a=2; b=3; c = 4;", "'%2':%1 "); 775 | assert_eq!(res,"'2':a '3':b '4':c "); 776 | } 777 | 778 | #[test] 779 | fn bad_patterns() { 780 | let bad = [ 781 | ("bonzo %","malformed pattern (ends with '%')"), 782 | ("bonzo (dog%(","unfinished capture"), 783 | ("alles [%a%[","malformed pattern (missing ']')"), 784 | ("bonzo (dog (cat)","unfinished capture"), 785 | ("frodo %f[%A","malformed pattern (missing ']')"), 786 | ("frodo (1) (2(3)%2)%1","invalid capture index %2"), 787 | ]; 788 | for p in bad.iter() { 789 | let res = LuaPattern::new_try(p.0); 790 | if let Err(e) = res { 791 | assert_eq!(e, PatternError(p.1.into())); 792 | } else { 793 | panic!("false positive"); 794 | } 795 | } 796 | } 797 | } 798 | -------------------------------------------------------------------------------- /lua-patterns/src/luapat.rs: -------------------------------------------------------------------------------- 1 | // translation of Lua 5.2 string pattern code 2 | 3 | use errors::*; 4 | use std::ptr::null; 5 | 6 | pub const LUA_MAXCAPTURES: usize = 32; 7 | /* maximum recursion depth for 'match' */ 8 | const MAXCCALLS: usize = 200; 9 | 10 | const L_ESC: u8 = b'%'; 11 | 12 | fn add(p: CPtr, count: usize) -> CPtr { 13 | unsafe {p.offset(count as isize)} 14 | } 15 | 16 | fn sub(p: CPtr, count: usize) -> CPtr { 17 | unsafe {p.offset(-(count as isize))} 18 | } 19 | 20 | fn next(p: CPtr) -> CPtr { 21 | add(p, 1) 22 | } 23 | 24 | fn at(p: CPtr) -> u8 { 25 | unsafe { *p } 26 | } 27 | 28 | fn diff(p1: CPtr, p2: CPtr) -> usize { 29 | let d = (p1 as isize).wrapping_sub(p2 as isize); 30 | d as usize 31 | } 32 | 33 | #[derive(Copy,Clone,Debug)] 34 | pub struct LuaMatch { 35 | pub start: usize, 36 | pub end: usize, 37 | } 38 | 39 | #[derive(Copy,Clone)] 40 | enum CapLen { 41 | Len(usize), 42 | Unfinished, 43 | Position, 44 | } 45 | 46 | impl CapLen { 47 | fn is_unfinished(&self) -> bool { 48 | match *self { 49 | CapLen::Unfinished => true, 50 | _ => false 51 | } 52 | } 53 | 54 | fn size(&self) -> Result { 55 | match *self { 56 | CapLen::Len(size) => Ok(size), 57 | _ => error("capture was unfinished or positional") 58 | } 59 | } 60 | 61 | } 62 | 63 | type CPtr = *const u8; 64 | 65 | #[derive(Copy,Clone)] 66 | struct Capture { 67 | init: CPtr, 68 | len: CapLen, 69 | } 70 | 71 | impl Capture { 72 | fn is_unfinished(&self) -> bool { 73 | self.len.is_unfinished() 74 | } 75 | } 76 | 77 | use std::result; 78 | 79 | type Result = result::Result; 80 | 81 | fn error(msg: &str) -> Result { 82 | Err(PatternError(msg.into())) 83 | } 84 | 85 | struct MatchState { 86 | matchdepth: usize, 
/* control for recursive depth (to avoid stack overflow) */ 87 | src_init: CPtr, /* init of source string */ 88 | src_end: CPtr, /* end ('\0') of source string */ 89 | p_end: CPtr, /* end ('\0') of pattern */ 90 | level: usize, /* total number of captures (finished or unfinished) */ 91 | capture: [Capture; LUA_MAXCAPTURES], 92 | } 93 | 94 | impl MatchState { 95 | fn new(s: CPtr, se: CPtr, pe: CPtr) -> MatchState { 96 | MatchState { 97 | matchdepth: MAXCCALLS, 98 | src_init: s, 99 | src_end: se, 100 | p_end: pe, 101 | level: 0, 102 | capture: [Capture{init: null(), len: CapLen::Len(0) }; LUA_MAXCAPTURES], 103 | } 104 | } 105 | 106 | fn check_capture(&self, l: usize) -> Result { 107 | let l = l as i8 - b'1' as i8; 108 | if l < 0 || l as usize >= self.level || self.capture[l as usize].is_unfinished() { 109 | return error(&format!("invalid capture index %{}", l + 1)); 110 | } 111 | Ok(l as usize) 112 | } 113 | 114 | fn capture_to_close(&self) -> Result { 115 | let mut level = (self.level - 1) as isize; 116 | while level >= 0 { 117 | if self.capture[level as usize].is_unfinished() { 118 | return Ok(level as usize); 119 | } 120 | level -= 1; 121 | } 122 | error("invalid pattern capture") 123 | } 124 | 125 | fn classend (&self, p: CPtr) -> Result { 126 | let ch = at(p); 127 | let mut next_p = next(p); 128 | Ok(match ch { 129 | L_ESC => { 130 | if next_p == self.p_end { 131 | return error("malformed pattern (ends with '%')"); 132 | } 133 | next(next_p) 134 | }, 135 | b'[' => { 136 | if at(next_p) == b'^' { 137 | next_p = next(next_p); 138 | } 139 | while at(next_p) != b']' { 140 | if next_p == self.p_end { 141 | return error("malformed pattern (missing ']')"); 142 | } 143 | let ch = at(next_p); 144 | next_p = next(next_p); 145 | if ch == L_ESC && p < self.p_end { 146 | next_p = next(next_p); /* skip escapes (e.g. `%]') */ 147 | } 148 | } 149 | next(next_p) 150 | }, 151 | _ => next_p 152 | }) 153 | } 154 | 155 | } 156 | 157 | fn match_class (ch: u8, class: u8) -> bool { 158 | let res = match class.to_ascii_lowercase() { 159 | b'a' => ch.is_ascii_alphabetic(), 160 | b'c' => ch.is_ascii_control(), 161 | b'd' => ch.is_ascii_digit(), 162 | b'g' => ch.is_ascii_graphic(), 163 | b'l' => ch.is_ascii_lowercase(), 164 | b'p' => ch.is_ascii_punctuation(), 165 | b's' => ch.is_ascii_whitespace(), 166 | b'u' => ch.is_ascii_uppercase(), 167 | b'w' => ch.is_ascii_alphanumeric(), 168 | b'x' => ch.is_ascii_hexdigit(), 169 | lc => return lc == ch, 170 | }; 171 | if class.is_ascii_lowercase() { res } else {! res} 172 | } 173 | 174 | 175 | fn matchbracketclass (c: u8, p: CPtr, ec: CPtr) -> bool { 176 | let mut p = p; 177 | // [^ inverts match 178 | let sig = if at(next(p)) == b'^' { 179 | p = next(p); 180 | false 181 | } else { 182 | true 183 | }; 184 | p = next(p); 185 | while p < ec { 186 | if at(p) == L_ESC { // e.g %s 187 | p = next(p); 188 | if match_class(c, at(p)) { 189 | return sig; 190 | } 191 | } else 192 | // e.g a-z 193 | if at(next(p)) == b'-' && add(p,2) < ec { 194 | let lastc = at(p); 195 | p = add(p,2); 196 | if lastc <= c && c <= at(p) { 197 | return sig; 198 | } 199 | } else 200 | if at(p) == c { 201 | return sig; 202 | } 203 | p = next(p); 204 | } 205 | return ! sig; 206 | } 207 | 208 | impl MatchState { 209 | 210 | fn singlematch (&self, s: CPtr, p: CPtr, ep: CPtr) -> bool { 211 | if s >= self.src_end { 212 | return false; 213 | } 214 | let c = at(s); 215 | let pc = at(p); 216 | match pc { 217 | b'.' 
=> true, /* matches any char */ 218 | L_ESC => match_class(c, at(next(p))), 219 | b'[' => matchbracketclass(c, p, sub(ep,1)), 220 | _ => c == pc 221 | } 222 | } 223 | 224 | fn matchbalance (&self, s: CPtr, p: CPtr) -> Result { 225 | if p >= sub(self.p_end,1) { 226 | return error("malformed pattern (missing arguments to '%b')"); 227 | } 228 | if at(s) != at(p) { 229 | return Ok(null()); 230 | } 231 | // e.g. %b() 232 | let b = at(p); 233 | let e = at(next(p)); 234 | let mut cont = 1; 235 | let mut s = next(s); 236 | while s < self.src_end { 237 | let ch = at(s); 238 | if ch == e { 239 | cont -= 1; 240 | if cont == 0 { 241 | return Ok(next(s)); 242 | } 243 | } else 244 | if ch == b { 245 | cont += 1; 246 | } 247 | s = next(s); 248 | } 249 | Ok(null()) /* string ends out of balance */ 250 | } 251 | 252 | fn max_expand(&mut self, s: CPtr, p: CPtr, ep: CPtr) -> Result { 253 | let mut i = 0isize; /* counts maximum expand for item */ 254 | while self.singlematch(add(s,i as usize),p,ep) { 255 | i += 1; 256 | } 257 | /* keeps trying to match with the maximum repetitions */ 258 | while i >= 0 { 259 | let res = self.patt_match(add(s,i as usize),next(ep))?; 260 | if ! res.is_null() { 261 | return Ok(res); 262 | } 263 | i -= 1; /* else didn't match; reduce 1 repetition to try again */ 264 | } 265 | Ok(null()) 266 | } 267 | 268 | fn min_expand(&mut self, s: CPtr, p: CPtr, ep: CPtr) -> Result { 269 | let mut s = s; 270 | loop { 271 | let res = self.patt_match(s,next(ep))?; 272 | if ! res.is_null() { 273 | return Ok(res); 274 | } else 275 | if self.singlematch(s, p, ep) { 276 | s = next(s); 277 | } else { 278 | return Ok(null()); 279 | } 280 | } 281 | } 282 | 283 | fn start_capture(&mut self, s: CPtr, p: CPtr, what: CapLen) -> Result { 284 | let level = self.level; 285 | if level >= LUA_MAXCAPTURES { 286 | return error("too many captures"); 287 | } 288 | self.capture[level].init = s; 289 | self.capture[level].len = what; 290 | self.level = level + 1; 291 | let res = self.patt_match(s, p)?; 292 | if res.is_null() { /* match failed? */ 293 | self.level -= 1; /* undo capture */ 294 | } 295 | Ok(res) 296 | } 297 | 298 | fn end_capture(&mut self, s: CPtr, p: CPtr) -> Result { 299 | let l = self.capture_to_close()?; 300 | self.capture[l].len = CapLen::Len(diff(s,self.capture[l].init)); /* close capture */ 301 | let res = self.patt_match(s, p)?; 302 | if res.is_null() { /* match failed? */ 303 | self.capture[l].len = CapLen::Unfinished; 304 | } 305 | Ok(res) 306 | } 307 | 308 | fn match_capture(&mut self, s: CPtr, l: usize) -> Result { 309 | let l = self.check_capture(l)?; 310 | let len = self.capture[l].len.size()?; 311 | if diff(self.src_end, s) >= len { 312 | unsafe {s.copy_to_nonoverlapping(self.capture[l].init as *mut u8, len);} 313 | return Ok(add(s,len)); 314 | } 315 | Ok(null()) 316 | } 317 | 318 | 319 | fn patt_match(&mut self, s: CPtr, p: CPtr) -> Result { 320 | let mut s = s; 321 | let mut p = p; 322 | self.matchdepth -= 1; 323 | if self.matchdepth == 0 { 324 | return error("pattern too complex"); 325 | } 326 | 327 | if p == self.p_end { /* end of pattern? */ 328 | self.matchdepth += 1; 329 | return Ok(s); 330 | } 331 | match at(p) { 332 | b'(' => { /* start capture */ 333 | if at(next(p)) == b')' { /* position capture? 
*/ 334 | s = self.start_capture(s, add(p,2), CapLen::Position)?; 335 | } else { 336 | s = self.start_capture(s, next(p), CapLen::Unfinished)?; 337 | } 338 | }, 339 | b')' => { /* end capture */ 340 | s = self.end_capture(s, next(p))?; 341 | }, 342 | b'$' => { 343 | if next(p) != self.p_end { /* is the `$' the last char in pattern? */ 344 | /* no; go to default */ 345 | return self.patt_default_match(s, p); 346 | } 347 | s = if s == self.src_end {s} else {null()}; /* check end of string */ 348 | } 349 | L_ESC => { /* escaped sequences not in the format class[*+?-]? */ 350 | match at(next(p)) { 351 | b'b' => { /* balanced string? */ 352 | s = self.matchbalance(s, add(p,2))?; 353 | if ! s.is_null() { 354 | // e.g, after %b() 355 | return self.patt_match(s, add(p,4)); 356 | } 357 | }, 358 | b'f' => { /* frontier? */ 359 | p = add(p,2); 360 | if at(p) != b'[' { 361 | return error("missing '[' after '%f' in pattern"); 362 | } 363 | let ep = self.classend(p)?; /* points to what is next */ 364 | let previous = if s == self.src_init {b'\0'} else {at(sub(s,1))}; 365 | let epl = sub(ep,1); 366 | if ! matchbracketclass(previous,p,epl) 367 | && matchbracketclass(at(s),p,epl) { 368 | return self.patt_match(s, ep); 369 | } 370 | s = null(); /* match failed */ 371 | }, 372 | b'0'..=b'9' => { /* capture results (%0-%9)? */ 373 | s = self.match_capture(s,at(next(p)) as usize)?; 374 | if ! s.is_null() { 375 | return self.patt_match(s, add(p,2)); 376 | } 377 | }, 378 | _ => return self.patt_default_match(s, p) 379 | } 380 | 381 | }, 382 | _ => return self.patt_default_match(s, p) 383 | 384 | } 385 | self.matchdepth += 1; 386 | Ok(s) 387 | } 388 | 389 | fn patt_default_match(&mut self, s: CPtr, p: CPtr) -> Result { 390 | let mut s = s; 391 | /* pattern class plus optional suffix */ 392 | let ep = self.classend(p)?; /* points to optional suffix */ 393 | let epc = if ep == self.p_end { 0 } else { at(ep) }; 394 | /* does not match at least once? */ 395 | if ! self.singlematch(s, p, ep) { 396 | if epc == b'*' || epc == b'?' || epc == b'-' { /* accept empty? */ 397 | return self.patt_match(s, next(ep)); 398 | } else { /* '+' or no suffix */ 399 | s = null(); /* fail */ 400 | } 401 | } else { /* matched once */ 402 | match at(ep) { /* handle optional suffix */ 403 | b'?' => { 404 | let res = self.patt_match(next(s),next(ep))?; 405 | if ! res.is_null() { 406 | s = res; 407 | } else { 408 | return self.patt_match(s, next(ep)); 409 | } 410 | }, 411 | b'+' => { /* 1 or more repetitions */ 412 | s = next(s); 413 | s = self.max_expand(s, p, ep)?; 414 | }, 415 | b'*' => { /* 0 or more repetitions */ 416 | s = self.max_expand(s, p, ep)?; 417 | }, 418 | b'-' => { /* 0 or more repetitions (minimum) */ 419 | s = self.min_expand(s, p, ep)? 
; 420 | }, 421 | _ => { /* no suffix */ 422 | return self.patt_match(next(s),ep); 423 | } 424 | } 425 | } 426 | self.matchdepth += 1; 427 | Ok(s) 428 | } 429 | 430 | fn push_onecapture(&mut self, i: usize, s: CPtr, e: CPtr, mm: &mut [LuaMatch]) -> Result<()> { 431 | if i >= self.level { 432 | if i == 0 { /* ms->level == 0, too */ 433 | mm[0].start = 0; 434 | mm[0].end = diff(e,s); 435 | Ok(()) 436 | } else { 437 | return error("invalid capture index"); 438 | } 439 | } else { 440 | let init = self.capture[i].init; 441 | match self.capture[i].len { 442 | CapLen::Unfinished => error("unfinished capture"), 443 | CapLen::Position => { 444 | mm[i].start = diff(init,next(self.src_init)); 445 | mm[i].end = mm[i].start; 446 | Ok(()) 447 | }, 448 | CapLen::Len(l) => { 449 | mm[i].start = diff(init,self.src_init); 450 | mm[i].end = mm[i].start + l; 451 | Ok(()) 452 | } 453 | } 454 | } 455 | 456 | } 457 | 458 | fn push_captures(&mut self, s: CPtr, e: CPtr, mm: &mut [LuaMatch]) -> Result { 459 | let nlevels = if self.level == 0 && ! s.is_null() {1} else {self.level}; 460 | for i in 0..nlevels { 461 | self.push_onecapture(i, s, e, mm)?; 462 | } 463 | Ok(nlevels) /* number of strings pushed */ 464 | } 465 | 466 | pub fn str_match_check(&mut self, p: CPtr) -> Result<()> { 467 | let mut level_stack = [0; LUA_MAXCAPTURES]; 468 | let mut stack_idx = 0; 469 | let mut p = p; 470 | while p < self.p_end { 471 | let ch = at(p); 472 | p = next(p); 473 | match ch { 474 | L_ESC => { 475 | //p = next(p); 476 | let c = at(p); 477 | match c { 478 | b'b' => { 479 | p = next(p); 480 | if p >= self.p_end { 481 | return error("malformed pattern (missing arguments to '%b')"); 482 | } 483 | }, 484 | b'f' => { 485 | p = next(p); 486 | if at(p) != b'[' { 487 | return error("missing '[' after '%f' in pattern"); 488 | } 489 | p = sub(p,1); // so we see [...] 
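// Backing up one byte here lets the outer scan loop reach the '[' again, so the b'[' arm below validates the frontier's bracket class like any other character class.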
490 | }, 491 | b'0' ..= b'9' => { 492 | let l = (c as i8) - (b'1' as i8); 493 | println!("level {}", self.level); 494 | if l < 0 || l as usize >= self.level || self.capture[l as usize].is_unfinished() { 495 | return error(&format!("invalid capture index %{}", l + 1)); 496 | } 497 | p = sub(p,1); 498 | }, 499 | _ => {} 500 | } 501 | }, 502 | b'[' => { 503 | while at(p) != b']' { 504 | if p == self.p_end { 505 | return error("malformed pattern (missing ']')"); 506 | } 507 | if at(p) == L_ESC && p < self.p_end { 508 | p = next(p); 509 | } 510 | p = next(p); 511 | } 512 | }, 513 | b'(' => { 514 | if at(p) != b')' { // not a position capture 515 | level_stack[stack_idx] = self.level; 516 | stack_idx += 1; 517 | self.capture[self.level].len = CapLen::Unfinished; 518 | self.level += 1; 519 | if self.level >= LUA_MAXCAPTURES { 520 | return error("too many captures"); 521 | } 522 | } else { 523 | p = next(p); 524 | } 525 | }, 526 | b')' => { 527 | if stack_idx == 0 { 528 | return error("no open capture"); 529 | } 530 | stack_idx -= 1; 531 | self.capture[level_stack[stack_idx]].len = CapLen::Position; 532 | }, 533 | _ => {} 534 | } 535 | } 536 | if stack_idx > 0 { 537 | return error("unfinished capture"); 538 | } 539 | Ok(()) 540 | } 541 | } 542 | 543 | pub fn str_match(s: &[u8], p: &[u8], mm: &mut [LuaMatch]) -> Result { 544 | let mut lp = p.len(); 545 | let mut p = p.as_ptr(); 546 | let ls = s.len(); 547 | let s = s.as_ptr(); 548 | let mut s1 = s; 549 | let anchor = at(p) == b'^'; 550 | if anchor { 551 | p = next(p); 552 | lp -= 1; /* skip anchor character */ 553 | } 554 | 555 | let mut ms = MatchState::new(s,add(s,ls),add(p,lp)); 556 | loop { 557 | let res = ms.patt_match(s1, p)?; 558 | if ! res.is_null() { 559 | mm[0].start = diff(s1,s); /* start */ 560 | mm[0].end = diff(res,s); /* end */ 561 | return Ok(ms.push_captures(null(),null(),&mut mm[1..])? + 1); 562 | } 563 | s1 = next(s1); 564 | if ! (s1 < ms.src_end && ! anchor) { 565 | break; 566 | } 567 | } 568 | Ok(0) 569 | } 570 | 571 | pub fn str_check(p: &[u8]) -> Result<()> { 572 | let mut lp = p.len(); 573 | let mut p = p.as_ptr(); 574 | let anchor = at(p) == b'^'; 575 | if anchor { 576 | p = next(p); 577 | lp -= 1; /* skip anchor character */ 578 | } 579 | let mut ms = MatchState::new(null(),null(),add(p,lp)); 580 | if at(sub(ms.p_end,1)) == b'%' { 581 | return error("malformed pattern (ends with '%')"); 582 | } 583 | ms.str_match_check(p)?; 584 | Ok(()) 585 | } 586 | 587 | /* 588 | fn check(s: &[u8], p: &[u8]) { 589 | if let Err(e) = str_check(p) { 590 | println!("check error {}",e); 591 | return; 592 | } 593 | 594 | let mut matches = [LuaMatch{start: 0, end: 0}; 10]; 595 | match str_match(s, p, &mut matches) { 596 | Ok(n) => { 597 | println!("ok {} matches", n); 598 | for i in 0..n { 599 | println!("match {:?} {:?}", 600 | matches[i], 601 | String::from_utf8(s[matches[i].start .. 
matches[i].end].to_vec()) 602 | ); 603 | } 604 | }, 605 | Err(e) => { 606 | println!("error: {}", e) 607 | } 608 | } 609 | } 610 | 611 | 612 | 613 | fn main() { 614 | let mut args = std::env::args().skip(1); 615 | let pat = args.next().unwrap(); 616 | let s = args.next().unwrap(); 617 | check(s.as_bytes(), pat.as_bytes()); 618 | 619 | //~ check(b"hello",b"%a"); 620 | //~ check(b"0hello",b"%a+"); 621 | //~ check(b"hello",b"%l(%a)"); 622 | //check(b"hello",b"he(l+)"); 623 | //check(b"k {and {so}}",b"k%s+(%b{})"); 624 | } 625 | */ 626 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | reorder_modules = false 2 | use_small_heuristics = "Max" 3 | tab_spaces = 2 4 | -------------------------------------------------------------------------------- /src/annot.rs: -------------------------------------------------------------------------------- 1 | mod generated; 2 | 3 | use std::fmt; 4 | 5 | pub(crate) use self::generated::*; 6 | 7 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 8 | pub(crate) enum Annot { 9 | Atom(Atom), 10 | Add(Comp), 11 | Sub(Comp), 12 | } 13 | 14 | impl PartialEq<Atom> for Annot { 15 | fn eq(&self, other: &Atom) -> bool { 16 | match self { 17 | Annot::Atom(it) => it == other, 18 | _ => false, 19 | } 20 | } 21 | } 22 | 23 | impl From<Atom> for Annot { 24 | fn from(value: Atom) -> Annot { 25 | Annot::Atom(value) 26 | } 27 | } 28 | 29 | impl fmt::Display for Annot { 30 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 31 | match self { 32 | Annot::Atom(it) => write!(f, "{it}"), 33 | Annot::Add(it) => write!(f, "+{it}"), 34 | Annot::Sub(it) => write!(f, "-{it}"), 35 | } 36 | } 37 | } 38 | 39 | impl Comp { 40 | pub(crate) fn add(self) -> Annot { 41 | Annot::Add(self) 42 | } 43 | pub(crate) fn sub(self) -> Annot { 44 | Annot::Sub(self) 45 | } 46 | } 47 | 48 | impl Default for Comp { 49 | fn default() -> Self { 50 | Comp::Para 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/annot/generated.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 3 | pub(crate) enum Comp { 4 | Verbatim, 5 | Email, 6 | Url, 7 | Subscript, 8 | Superscript, 9 | Para, 10 | CodeBlock, 11 | Imagetext, 12 | Linktext, 13 | Reference, 14 | Destination, 15 | Emph, 16 | Strong, 17 | Span, 18 | DoubleQuoted, 19 | ReferenceDefinition, 20 | Insert, 21 | Delete, 22 | Mark, 23 | Attributes, 24 | } 25 | 26 | impl fmt::Display for Comp { 27 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 28 | f.write_str(match self { 29 | Comp::Verbatim => "verbatim", 30 | Comp::Email => "email", 31 | Comp::Url => "url", 32 | Comp::Subscript => "subscript", 33 | Comp::Superscript => "superscript", 34 | Comp::Para => "para", 35 | Comp::CodeBlock => "code_block", 36 | Comp::Imagetext => "imagetext", 37 | Comp::Linktext => "linktext", 38 | Comp::Reference => "reference", 39 | Comp::Destination => "destination", 40 | Comp::Emph => "emph", 41 | Comp::Strong => "strong", 42 | Comp::Span => "span", 43 | Comp::DoubleQuoted => "double_quoted", 44 | Comp::ReferenceDefinition => "reference_definition", 45 | Comp::Insert => "insert", 46 | Comp::Delete => "delete", 47 | Comp::Mark => "mark", 48 | Comp::Attributes => "attributes", 49 | }) 50 | } 51 | } 52 | 53 | #[derive(Debug, Clone, Copy, PartialEq, Eq, 
PartialOrd, Ord, Hash)] 54 | pub(crate) enum Atom { 55 | Str, 56 | Escape, 57 | Hardbreak, 58 | Nbsp, 59 | Blankline, 60 | ImageMarker, 61 | LeftDoubleQuote, 62 | RightDoubleQuote, 63 | Ellipses, 64 | Softbreak, 65 | FootnoteReference, 66 | OpenMarker, 67 | Emoji, 68 | ReferenceKey, 69 | ReferenceValue, 70 | CodeLanguage, 71 | EmDash, 72 | EnDash, 73 | Id, 74 | Key, 75 | Value, 76 | Class, 77 | } 78 | 79 | impl Atom { 80 | pub(crate) fn is_left_atom(self) -> bool { 81 | matches!(self, | Atom::LeftDoubleQuote) 82 | } 83 | pub(crate) fn is_right_atom(self) -> bool { 84 | matches!(self, | Atom::RightDoubleQuote) 85 | } 86 | pub(crate) fn corresponding_left_atom(self) -> Atom { 87 | match self { 88 | Atom::RightDoubleQuote => Atom::LeftDoubleQuote, 89 | 90 | _ => self, 91 | } 92 | } 93 | pub(crate) fn corresponding_right_atom(self) -> Atom { 94 | match self { 95 | Atom::LeftDoubleQuote => Atom::RightDoubleQuote, 96 | 97 | _ => self, 98 | } 99 | } 100 | } 101 | 102 | impl fmt::Display for Atom { 103 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 104 | f.write_str(match self { 105 | Atom::Str => "str", 106 | Atom::Escape => "escape", 107 | Atom::Hardbreak => "hardbreak", 108 | Atom::Nbsp => "nbsp", 109 | Atom::Blankline => "blankline", 110 | Atom::ImageMarker => "image_marker", 111 | Atom::LeftDoubleQuote => "left_double_quote", 112 | Atom::RightDoubleQuote => "right_double_quote", 113 | Atom::Ellipses => "ellipses", 114 | Atom::Softbreak => "softbreak", 115 | Atom::FootnoteReference => "footnote_reference", 116 | Atom::OpenMarker => "open_marker", 117 | Atom::Emoji => "emoji", 118 | Atom::ReferenceKey => "reference_key", 119 | Atom::ReferenceValue => "reference_value", 120 | Atom::CodeLanguage => "code_language", 121 | Atom::EmDash => "em_dash", 122 | Atom::EnDash => "en_dash", 123 | Atom::Id => "id", 124 | Atom::Key => "key", 125 | Atom::Value => "value", 126 | Atom::Class => "class", 127 | }) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/ast.rs: -------------------------------------------------------------------------------- 1 | mod generated; 2 | 3 | use indexmap::IndexMap; 4 | 5 | pub use self::generated::*; 6 | 7 | pub type Attrs = IndexMap<String, String>; 8 | 9 | #[derive(Debug, Default, Clone, serde::Serialize)] 10 | pub struct ReferenceDefinition { 11 | #[serde(skip_serializing_if = "Attrs::is_empty")] 12 | pub attrs: Attrs, 13 | pub destination: String, 14 | } 15 | -------------------------------------------------------------------------------- /src/ast/generated.rs: -------------------------------------------------------------------------------- 1 | use super::Attrs; 2 | 3 | #[derive(Debug, Default, Clone, serde::Serialize)] 4 | pub struct Heading { 5 | #[serde(skip_serializing_if = "Attrs::is_empty")] 6 | pub attrs: Attrs, 7 | pub children: Vec<Tag>, 8 | pub level: u32, 9 | } 10 | 11 | #[derive(Debug, Default, Clone, serde::Serialize)] 12 | pub struct Para { 13 | #[serde(skip_serializing_if = "Attrs::is_empty")] 14 | pub attrs: Attrs, 15 | pub children: Vec<Tag>, 16 | } 17 | 18 | #[derive(Debug, Default, Clone, serde::Serialize)] 19 | pub struct Link { 20 | #[serde(skip_serializing_if = "Attrs::is_empty")] 21 | pub attrs: Attrs, 22 | pub children: Vec<Tag>, 23 | pub destination: Option<String>, 24 | pub reference: Option<String>, 25 | } 26 | 27 | #[derive(Debug, Default, Clone, serde::Serialize)] 28 | pub struct Image { 29 | #[serde(skip_serializing_if = "Attrs::is_empty")] 30 | pub attrs: Attrs, 31 | pub children: Vec<Tag>, 32 | pub destination: Option<String>, 
pub reference: Option<String>, 34 | } 35 | 36 | #[derive(Debug, Default, Clone, serde::Serialize)] 37 | pub struct CodeBlock { 38 | #[serde(skip_serializing_if = "Attrs::is_empty")] 39 | pub attrs: Attrs, 40 | pub children: Vec<Tag>, 41 | pub lang: Option<String>, 42 | pub text: String, 43 | } 44 | 45 | #[derive(Debug, Default, Clone, serde::Serialize)] 46 | pub struct Strong { 47 | #[serde(skip_serializing_if = "Attrs::is_empty")] 48 | pub attrs: Attrs, 49 | pub children: Vec<Tag>, 50 | } 51 | 52 | #[derive(Debug, Default, Clone, serde::Serialize)] 53 | pub struct Emph { 54 | #[serde(skip_serializing_if = "Attrs::is_empty")] 55 | pub attrs: Attrs, 56 | pub children: Vec<Tag>, 57 | } 58 | 59 | #[derive(Debug, Default, Clone, serde::Serialize)] 60 | pub struct Insert { 61 | #[serde(skip_serializing_if = "Attrs::is_empty")] 62 | pub attrs: Attrs, 63 | pub children: Vec<Tag>, 64 | } 65 | 66 | #[derive(Debug, Default, Clone, serde::Serialize)] 67 | pub struct Delete { 68 | #[serde(skip_serializing_if = "Attrs::is_empty")] 69 | pub attrs: Attrs, 70 | pub children: Vec<Tag>, 71 | } 72 | 73 | #[derive(Debug, Default, Clone, serde::Serialize)] 74 | pub struct Mark { 75 | #[serde(skip_serializing_if = "Attrs::is_empty")] 76 | pub attrs: Attrs, 77 | pub children: Vec<Tag>, 78 | } 79 | 80 | #[derive(Debug, Default, Clone, serde::Serialize)] 81 | pub struct Superscript { 82 | #[serde(skip_serializing_if = "Attrs::is_empty")] 83 | pub attrs: Attrs, 84 | pub children: Vec<Tag>, 85 | } 86 | 87 | #[derive(Debug, Default, Clone, serde::Serialize)] 88 | pub struct Subscript { 89 | #[serde(skip_serializing_if = "Attrs::is_empty")] 90 | pub attrs: Attrs, 91 | pub children: Vec<Tag>, 92 | } 93 | 94 | #[derive(Debug, Default, Clone, serde::Serialize)] 95 | pub struct Span { 96 | #[serde(skip_serializing_if = "Attrs::is_empty")] 97 | pub attrs: Attrs, 98 | pub children: Vec<Tag>, 99 | } 100 | 101 | #[derive(Debug, Default, Clone, serde::Serialize)] 102 | pub struct DoubleQuoted { 103 | #[serde(skip_serializing_if = "Attrs::is_empty")] 104 | pub attrs: Attrs, 105 | pub children: Vec<Tag>, 106 | } 107 | 108 | #[derive(Debug, Default, Clone, serde::Serialize)] 109 | pub struct Url { 110 | #[serde(skip_serializing_if = "Attrs::is_empty")] 111 | pub attrs: Attrs, 112 | pub children: Vec<Tag>, 113 | pub destination: String, 114 | } 115 | 116 | #[derive(Debug, Default, Clone, serde::Serialize)] 117 | pub struct SoftBreak { 118 | #[serde(skip_serializing_if = "Attrs::is_empty")] 119 | pub attrs: Attrs, 120 | } 121 | 122 | #[derive(Debug, Default, Clone, serde::Serialize)] 123 | pub struct EmDash { 124 | #[serde(skip_serializing_if = "Attrs::is_empty")] 125 | pub attrs: Attrs, 126 | } 127 | 128 | #[derive(Debug, Default, Clone, serde::Serialize)] 129 | pub struct EnDash { 130 | #[serde(skip_serializing_if = "Attrs::is_empty")] 131 | pub attrs: Attrs, 132 | } 133 | 134 | #[derive(Debug, Default, Clone, serde::Serialize)] 135 | pub struct Verbatim { 136 | #[serde(skip_serializing_if = "Attrs::is_empty")] 137 | pub attrs: Attrs, 138 | pub text: String, 139 | } 140 | 141 | #[derive(Debug, Default, Clone, serde::Serialize)] 142 | pub struct Str { 143 | #[serde(skip_serializing_if = "Attrs::is_empty")] 144 | pub attrs: Attrs, 145 | pub text: String, 146 | } 147 | 148 | #[derive(Debug, Default, Clone, serde::Serialize)] 149 | pub struct Emoji { 150 | #[serde(skip_serializing_if = "Attrs::is_empty")] 151 | pub attrs: Attrs, 152 | pub alias: String, 153 | } 154 | 155 | #[derive(Debug, Clone, serde::Serialize)] 156 | #[serde(tag = "tag", rename_all = "snake_case")] 157 | pub enum Tag { 158 | 
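// `Tag` is the single AST node type: each variant wraps one of the structs above. With serde's internally-tagged representation (`tag = "tag"`, snake_case names) a node should serialize as e.g. `{"tag": "para", "children": [...]}`, with an empty `attrs` map omitted by `skip_serializing_if`.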
Heading(Heading), 159 | Para(Para), 160 | Link(Link), 161 | Image(Image), 162 | CodeBlock(CodeBlock), 163 | Strong(Strong), 164 | Emph(Emph), 165 | Insert(Insert), 166 | Delete(Delete), 167 | Mark(Mark), 168 | Superscript(Superscript), 169 | Subscript(Subscript), 170 | Span(Span), 171 | DoubleQuoted(DoubleQuoted), 172 | Url(Url), 173 | SoftBreak(SoftBreak), 174 | EmDash(EmDash), 175 | EnDash(EnDash), 176 | Verbatim(Verbatim), 177 | Str(Str), 178 | Emoji(Emoji), 179 | } 180 | -------------------------------------------------------------------------------- /src/attribute.rs: -------------------------------------------------------------------------------- 1 | use std::ops::Range; 2 | 3 | use crate::{ 4 | annot::{Annot, Atom}, 5 | patterns::find_at, 6 | Match, 7 | }; 8 | 9 | #[derive(Default)] 10 | pub(crate) struct Tokenizer { 11 | subject: String, 12 | state: State, 13 | begin: usize, 14 | lastpos: usize, 15 | matches: Vec<Match>, 16 | } 17 | 18 | #[derive(Default)] 19 | enum State { 20 | Scanning, 21 | ScanningId, 22 | ScanningClass, 23 | ScanningKey, 24 | ScanningValue, 25 | ScanningBareValue, 26 | ScanningQuotedValue, 27 | ScanningEscaped, 28 | ScanningComment, 29 | Fail, 30 | Done, 31 | #[default] 32 | Start, 33 | } 34 | 35 | pub(crate) enum Status { 36 | Done, 37 | Fail, 38 | Continue, 39 | } 40 | 41 | impl Tokenizer { 42 | pub(crate) fn new(subject: String) -> Tokenizer { 43 | let mut res = Tokenizer::default(); 44 | res.subject = subject; 45 | res 46 | } 47 | 48 | fn add_match(&mut self, range: Range<usize>, annot: impl Into<Annot>) { 49 | self.matches.push(Match::new(range, annot)) 50 | } 51 | 52 | pub(crate) fn get_matches(&mut self) -> Vec<Match> { 53 | std::mem::take(&mut self.matches) 54 | } 55 | 56 | // Feed tokenizer a slice of text from the subject, between 57 | // startpos and endpos inclusive. Return status, position, 58 | // where status is either "done" (position should point to 59 | // final '}'), "fail" (position should point to first character 60 | // that could not be tokenized), or "continue" (position should 61 | // point to last character parsed). 62 | pub(crate) fn feed(&mut self, startpos: usize, endpos: usize) -> (Status, usize) { 63 | let mut pos = startpos; 64 | while pos <= endpos { 65 | self.state = self.step(pos); 66 | match self.state { 67 | State::Done => return (Status::Done, pos), 68 | State::Fail => { 69 | self.lastpos = pos + 1; 70 | return (Status::Fail, pos); 71 | } 72 | _ => { 73 | self.lastpos = pos + 1; 74 | pos = pos + 1 75 | } 76 | } 77 | } 78 | (Status::Continue, pos) 79 | } 80 | 81 | fn step(&mut self, pos: usize) -> State { 82 | match self.state { 83 | State::Start => { 84 | if find_at(&self.subject, "^{", pos).is_match { 85 | State::Scanning 86 | } else { 87 | State::Fail 88 | } 89 | } 90 | State::Fail => State::Fail, 91 | State::Done => State::Done, 92 | State::Scanning => match self.subject.as_bytes()[pos] { 93 | b' ' | b'\t' | b'\n' | b'\r' => State::Scanning, 94 | b'}' => State::Done, 95 | b'#' => { 96 | self.begin = pos; 97 | State::ScanningId 98 | } 99 | b'%' => { 100 | self.begin = pos; 101 | State::ScanningComment 102 | } 103 | b'.' 
=> { 104 | self.begin = pos; 105 | State::ScanningClass 106 | } 107 | _ => { 108 | if find_at(&self.subject, "^[%a%d_:-]", pos).is_match { 109 | self.begin = pos; 110 | State::ScanningKey 111 | } else { 112 | State::Fail 113 | } 114 | } 115 | }, 116 | State::ScanningComment => { 117 | if self.subject.as_bytes()[pos] == b'%' { 118 | State::Scanning 119 | } else { 120 | State::ScanningComment 121 | } 122 | } 123 | State::ScanningId => self.step_ident(pos, Atom::Id, State::ScanningId), 124 | State::ScanningClass => self.step_ident(pos, Atom::Class, State::ScanningClass), 125 | State::ScanningKey => { 126 | let c = self.subject.as_bytes()[pos]; 127 | if c == b'=' { 128 | self.add_match(self.begin..self.lastpos, Atom::Key); 129 | self.begin = !0; 130 | State::ScanningValue 131 | } else if find_at(&self.subject, "^[%a%d_:-]", pos).is_match { 132 | State::ScanningKey 133 | } else { 134 | State::Fail 135 | } 136 | } 137 | State::ScanningValue => { 138 | let c = self.subject.as_bytes()[pos]; 139 | if c == b'"' { 140 | self.begin = pos; 141 | State::ScanningQuotedValue 142 | } else if find_at(&self.subject, "^[%a%d_:-]", pos).is_match { 143 | self.begin = pos; 144 | State::ScanningBareValue 145 | } else { 146 | State::Fail 147 | } 148 | } 149 | State::ScanningBareValue => { 150 | let c = self.subject.as_bytes()[pos]; 151 | if find_at(&self.subject, "^[%a%d_:-]", pos).is_match { 152 | State::ScanningBareValue 153 | } else if c == b'}' { 154 | self.add_match(self.begin..self.lastpos, Atom::Value); 155 | self.begin = !0; 156 | State::Done 157 | } else if find_at(&self.subject, "^%s", pos).is_match { 158 | self.add_match(self.begin..self.lastpos, Atom::Value); 159 | self.begin = !0; 160 | State::Scanning 161 | } else { 162 | State::Fail 163 | } 164 | } 165 | State::ScanningEscaped => State::ScanningQuotedValue, 166 | State::ScanningQuotedValue => { 167 | let c = self.subject.as_bytes()[pos]; 168 | match c { 169 | b'"' => { 170 | self.add_match(self.begin + 1..self.lastpos, Atom::Value); 171 | self.begin = !0; 172 | State::Scanning 173 | } 174 | b'\\' => State::ScanningEscaped, 175 | b'{' | b'}' => State::Fail, 176 | b'\n' => { 177 | self.add_match(self.begin + 1..self.lastpos, Atom::Value); 178 | State::ScanningQuotedValue 179 | } 180 | _ => State::ScanningQuotedValue, 181 | } 182 | } 183 | } 184 | } 185 | 186 | fn step_ident(&mut self, pos: usize, atom: Atom, state: State) -> State { 187 | let c = self.subject.as_bytes()[pos]; 188 | match c { 189 | b'_' | b'-' | b':' => state, 190 | b'}' => { 191 | if self.lastpos > self.begin + 1 { 192 | self.add_match(self.begin + 1..self.lastpos, atom) 193 | } 194 | self.begin = !0; 195 | State::Done 196 | } 197 | _ => { 198 | if find_at(&self.subject, "^[^%s%p]", pos).is_match { 199 | state 200 | } else if find_at(&self.subject, "^%s", pos).is_match { 201 | if self.lastpos > self.begin { 202 | self.add_match(self.begin + 1..self.lastpos, atom) 203 | } 204 | self.begin = !0; 205 | State::Scanning 206 | } else { 207 | State::Fail 208 | } 209 | } 210 | } 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /src/block.rs: -------------------------------------------------------------------------------- 1 | use std::ops::Range; 2 | 3 | use crate::{ 4 | annot::{Annot, Atom, Comp}, 5 | format_to, inline, 6 | patterns::{find, find_at, PatMatch}, 7 | Match, ParseOpts, 8 | }; 9 | 10 | #[derive(Default)] 11 | pub struct Tokenizer { 12 | pub subject: String, 13 | indent: usize, 14 | startline: usize, 15 | starteol: usize, 16 | 
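// `starteol`/`endeol` hold the byte range of the current line's terminator (matched as "[\r]?[\n]"); `get_eol` refreshes them for every line that `parse` visits.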
endeol: usize, 17 | pub(crate) matches: Vec<Match>, 18 | pos: usize, 19 | last_matched_container: usize, 20 | opts: ParseOpts, 21 | finished_line: bool, 22 | 23 | pub(crate) debug: String, 24 | } 25 | 26 | trait Container { 27 | fn content(&self) -> &'static str; 28 | fn inline_parser(&mut self) -> Option<&mut inline::Tokenizer> { 29 | None 30 | } 31 | fn restore_indent(&self) -> Option<usize> { 32 | None 33 | } 34 | fn open(p: &mut Tokenizer, stack: &mut Vec<Box<dyn Container>>) -> bool 35 | where 36 | Self: Sized; 37 | fn cont(&mut self, p: &mut Tokenizer) -> bool; 38 | fn close(self: Box<Self>, p: &mut Tokenizer); 39 | } 40 | 41 | const CONTAINERS: &[fn(&mut Tokenizer, &mut Vec<Box<dyn Container>>) -> bool] = 42 | &[Para::open, CodeBlock::open, ReferenceDefinition::open]; 43 | 44 | struct Para { 45 | inline_parser: inline::Tokenizer, 46 | } 47 | 48 | impl Container for Para { 49 | fn content(&self) -> &'static str { 50 | "inline" 51 | } 52 | fn inline_parser(&mut self) -> Option<&mut inline::Tokenizer> { 53 | Some(&mut self.inline_parser) 54 | } 55 | fn open(p: &mut Tokenizer, stack: &mut Vec<Box<dyn Container>>) -> bool 56 | where 57 | Self: Sized, 58 | { 59 | p.add_container( 60 | stack, 61 | Para { inline_parser: inline::Tokenizer::new(p.subject.clone(), p.opts.clone()) }, 62 | ); 63 | p.add_match(p.pos..p.pos, Comp::Para.add()); 64 | true 65 | } 66 | 67 | fn cont(&mut self, p: &mut Tokenizer) -> bool { 68 | p.find("^%S").is_match 69 | } 70 | 71 | fn close(mut self: Box<Self>, p: &mut Tokenizer) { 72 | p.matches.extend(self.inline_parser.get_matches()); 73 | p.add_match(p.pos - 1..p.pos - 1, Comp::Para.sub()) 74 | } 75 | } 76 | 77 | struct CodeBlock { 78 | border: char, 79 | indent: usize, 80 | } 81 | 82 | impl Container for CodeBlock { 83 | fn content(&self) -> &'static str { 84 | "text" 85 | } 86 | fn restore_indent(&self) -> Option<usize> { 87 | Some(self.indent) 88 | } 89 | fn open(p: &mut Tokenizer, stack: &mut Vec<Box<dyn Container>>) -> bool 90 | where 91 | Self: Sized, 92 | { 93 | let mut border = '`'; 94 | let mut m = p.find("^```([ \t]*)([^%s`]*)[ \t]*[\r\n]"); 95 | if !m.is_match { 96 | border = '~'; 97 | m = p.find("^~~~([ \t]*)([^%s`]*)[ \t]*[\r\n]"); 98 | } 99 | if !m.is_match { 100 | return false; 101 | } 102 | p.add_container(stack, CodeBlock { border, indent: p.indent }); 103 | let lang = m.cap2; 104 | 105 | p.add_match(p.pos..p.pos + 3, Comp::CodeBlock.add()); 106 | if !lang.is_empty() { 107 | p.add_match(lang.start..lang.end, Atom::CodeLanguage) 108 | } 109 | 110 | p.pos = p.pos + 2; 111 | p.finished_line = true; 112 | true 113 | } 114 | 115 | fn cont(&mut self, p: &mut Tokenizer) -> bool { 116 | let m = 117 | if self.border == '`' { p.find("^(```)[ \t]*[\r\n]") } else { p.find("^(~~~)[ \t]*[\r\n]") }; 118 | if m.is_match { 119 | p.pos = m.end - 1; 120 | p.finished_line = true; 121 | false 122 | } else { 123 | true 124 | } 125 | } 126 | 127 | fn close(self: Box<Self>, p: &mut Tokenizer) { 128 | p.add_match(p.pos - 3..p.pos, Comp::CodeBlock.sub()); 129 | } 130 | } 131 | 132 | struct ReferenceDefinition { 133 | indent: usize, 134 | } 135 | 136 | impl Container for ReferenceDefinition { 137 | fn content(&self) -> &'static str { 138 | "" 139 | } 140 | 141 | fn open(p: &mut Tokenizer, stack: &mut Vec<Box<dyn Container>>) -> bool 142 | where 143 | Self: Sized, 144 | { 145 | let m = p.find("^[[]([^\r\n]*)%]:[ \t]*(%S*)"); 146 | if !m.is_match { 147 | return false; 148 | } 149 | p.add_container(stack, ReferenceDefinition { indent: p.indent }); 150 | p.add_match(m.start..m.start, Comp::ReferenceDefinition.add()); 151 | p.add_match(m.start..m.start + m.cap1.len() + 2, Atom::ReferenceKey); 152 | if 
!m.cap2.is_empty() { 153 | p.add_match(m.end - m.cap2.len()..m.end, Atom::ReferenceValue); 154 | } 155 | p.pos = m.end; 156 | true 157 | } 158 | 159 | fn cont(&mut self, p: &mut Tokenizer) -> bool { 160 | if self.indent >= p.indent { 161 | return false; 162 | } 163 | let m = p.find("^(%S+)"); 164 | if m.is_match { 165 | p.add_match(m.cap1.start..m.cap1.end, Atom::ReferenceValue); 166 | p.pos = m.end; 167 | } 168 | true 169 | } 170 | 171 | fn close(self: Box<Self>, p: &mut Tokenizer) { 172 | p.add_match(p.pos..p.pos, Comp::ReferenceDefinition.sub()) 173 | } 174 | 175 | fn inline_parser(&mut self) -> Option<&mut inline::Tokenizer> { 176 | None 177 | } 178 | } 179 | 180 | impl Tokenizer { 181 | pub fn new(mut subject: String, opts: ParseOpts) -> Tokenizer { 182 | if !find(&subject, "[\r\n]$").is_match { 183 | subject.push('\n'); 184 | } 185 | let mut res = Tokenizer::default(); 186 | res.subject = subject; 187 | res.opts = opts; 188 | res 189 | } 190 | 191 | fn find(&self, pat: &'static str) -> PatMatch { 192 | find_at(&self.subject, pat, self.pos) 193 | } 194 | 195 | fn add_match(&mut self, range: Range<usize>, annot: impl Into<Annot>) { 196 | self.matches.push(Match::new(range, annot)) 197 | } 198 | 199 | fn add_container( 200 | &mut self, 201 | stack: &mut Vec<Box<dyn Container>>, 202 | container: impl Container + 'static, 203 | ) { 204 | let last_matched = self.last_matched_container; 205 | while stack.len() > last_matched 206 | || (stack.len() > 0 && stack.last().unwrap().content() != "block") 207 | { 208 | stack.pop().unwrap().close(self) 209 | } 210 | stack.push(Box::new(container)) 211 | } 212 | 213 | fn skip_space(&mut self) { 214 | let m = find_at(&self.subject, "[^ \t]", self.pos); 215 | if m.is_match { 216 | self.indent = m.start - self.startline; 217 | self.pos = m.start; 218 | } 219 | } 220 | 221 | fn get_eol(&mut self) { 222 | let mut m = find_at(&self.subject, "[\r]?[\n]", self.pos); 223 | if !m.is_match { 224 | (m.start, m.end) = (self.subject.len(), self.subject.len()); 225 | } 226 | self.starteol = m.start; 227 | self.endeol = m.end; 228 | } 229 | 230 | pub fn parse(&mut self) { 231 | let mut containers: Vec<Box<dyn Container>> = Vec::new(); 232 | 233 | let subjectlen = self.subject.len(); 234 | while self.pos < subjectlen { 235 | self.indent = 0; 236 | self.startline = self.pos; 237 | self.finished_line = false; 238 | self.get_eol(); 239 | 240 | // check open containers for continuation 241 | self.last_matched_container = 0; 242 | for idx in 0..containers.len() { 243 | // skip any indentation 244 | self.skip_space(); 245 | if containers[idx].cont(self) { 246 | self.last_matched_container = idx + 1 247 | } else { 248 | break; 249 | } 250 | } 251 | 252 | // if we hit a close fence, we can move to next line 253 | if self.finished_line { 254 | while containers.len() > self.last_matched_container { 255 | containers.pop().unwrap().close(self) 256 | } 257 | } 258 | 259 | if !self.finished_line { 260 | // check for new containers 261 | self.skip_space(); 262 | let mut is_blank = self.pos == self.starteol; 263 | 264 | let mut new_starts = false; 265 | let last_match = containers[..self.last_matched_container].first(); 266 | let mut check_starts = !is_blank 267 | && !matches!(last_match, Some(c) if c.content() != "block") 268 | && !self.find("^%a+%s").is_match; // optimization 269 | 270 | while check_starts { 271 | check_starts = false; 272 | for i in 1..CONTAINERS.len() { 273 | let open = CONTAINERS[i]; 274 | if open(self, &mut containers) { 275 | self.last_matched_container = containers.len(); 276 | if self.finished_line { 
check_starts = false 278 | } else { 279 | self.skip_space(); 280 | new_starts = true; 281 | check_starts = containers.last().unwrap().content() != "text" 282 | } 283 | break; 284 | } 285 | } 286 | } 287 | 288 | if !self.finished_line { 289 | // handle remaining content 290 | self.skip_space(); 291 | 292 | is_blank = self.pos == self.starteol; 293 | 294 | let is_lazy = !is_blank 295 | && !new_starts 296 | && self.last_matched_container < containers.len() 297 | && containers.last().unwrap().content() == "inline"; 298 | 299 | if !is_lazy && self.last_matched_container < containers.len() { 300 | while containers.len() > self.last_matched_container { 301 | containers.pop().unwrap().close(self); 302 | } 303 | } 304 | 305 | // add para by default if there's text 306 | if !matches!(containers.last(), Some(c) if c.content() != "block") { 307 | if is_blank { 308 | if !new_starts { 309 | // need to track these for tight/loose lists 310 | self.add_match(self.pos..self.endeol, Atom::Blankline); 311 | } 312 | } else { 313 | CONTAINERS[0](self, &mut containers); 314 | } 315 | } 316 | 317 | if let Some(tip) = containers.last_mut() { 318 | if let Some(tip_indent) = tip.restore_indent() { 319 | let mut startpos = self.pos; 320 | if self.indent > tip_indent { 321 | // get back the leading spaces we gobbled 322 | startpos = startpos - (self.indent - tip_indent) 323 | } 324 | self.add_match(startpos..self.endeol, Atom::Str) 325 | } else if let Some(inline_parser) = tip.inline_parser() { 326 | if !is_blank { 327 | inline_parser.feed(self.pos, self.endeol) 328 | } 329 | } 330 | } 331 | } 332 | } 333 | 334 | self.pos = self.endeol; 335 | } 336 | self.finish(containers) 337 | } 338 | 339 | fn finish(&mut self, mut containers: Vec>) { 340 | // close unmatched containers 341 | while let Some(cont) = containers.pop() { 342 | cont.close(self) 343 | } 344 | if self.opts.debug_matches { 345 | for m in &self.matches { 346 | let ms = format!( 347 | "{} {}-{}", 348 | m.a, 349 | m.range.start + 1, 350 | if m.range.is_empty() { m.range.end + 1 } else { m.range.end } 351 | ); 352 | format_to!( 353 | self.debug, 354 | "{ms:<20} {:?}\n", 355 | self.subject.get(m.range.clone()).unwrap_or_default() 356 | ); 357 | } 358 | } 359 | } 360 | } 361 | -------------------------------------------------------------------------------- /src/emoji.rs: -------------------------------------------------------------------------------- 1 | pub(crate) fn find_emoji(s: &str) -> Option<&'static str> { 2 | let idx = EMOJI_LIST.binary_search_by_key(&s, |&(k, _)| k).ok()?; 3 | Some(EMOJI_LIST[idx].1) 4 | } 5 | 6 | #[test] 7 | fn emoji_list_is_sorted() { 8 | let mut sorted = EMOJI_LIST.to_vec(); 9 | sorted.sort_by_key(|&(k, _)| k); 10 | if EMOJI_LIST != sorted { 11 | let mut buf = String::new(); 12 | for (k, v) in sorted { 13 | crate::format_to!(buf, r#"("{k}", "{v}"),"#); 14 | } 15 | std::fs::write("./emoji.sorted", &buf).unwrap(); 16 | panic!("unsorted emoji list, sorted version in: ./emoji.sorted") 17 | } 18 | } 19 | 20 | static EMOJI_LIST: &[(&str, &str)] = &[ 21 | ("+1", "👍"), 22 | ("-1", "👎"), 23 | ("100", "💯"), 24 | ("1234", "🔢"), 25 | ("1st_place_medal", "🥇"), 26 | ("2nd_place_medal", "🥈"), 27 | ("3rd_place_medal", "🥉"), 28 | ("8ball", "🎱"), 29 | ("a", "🅰️"), 30 | ("ab", "🆎"), 31 | ("abacus", "🧮"), 32 | ("abc", "🔤"), 33 | ("abcd", "🔡"), 34 | ("accept", "🉑"), 35 | ("accordion", "🪗"), 36 | ("adhesive_bandage", "🩹"), 37 | ("adult", "🧑"), 38 | ("aerial_tramway", "🚡"), 39 | ("afghanistan", "🇦🇫"), 40 | ("airplane", "✈️"), 41 | ("aland_islands", "🇦🇽"), 
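// NOTE: aliases must stay sorted, since `find_emoji` above does a `binary_search_by_key` over this table; the `emoji_list_is_sorted` test writes a sorted copy to ./emoji.sorted and fails if the order ever drifts.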
42 | ("alarm_clock", "⏰"), 43 | ("albania", "🇦🇱"), 44 | ("alembic", "⚗️"), 45 | ("algeria", "🇩🇿"), 46 | ("alien", "👽"), 47 | ("ambulance", "🚑"), 48 | ("american_samoa", "🇦🇸"), 49 | ("amphora", "🏺"), 50 | ("anatomical_heart", "🫀"), 51 | ("anchor", "⚓"), 52 | ("andorra", "🇦🇩"), 53 | ("angel", "👼"), 54 | ("anger", "💢"), 55 | ("angola", "🇦🇴"), 56 | ("angry", "😠"), 57 | ("anguilla", "🇦🇮"), 58 | ("anguished", "😧"), 59 | ("ant", "🐜"), 60 | ("antarctica", "🇦🇶"), 61 | ("antigua_barbuda", "🇦🇬"), 62 | ("apple", "🍎"), 63 | ("aquarius", "♒"), 64 | ("argentina", "🇦🇷"), 65 | ("aries", "♈"), 66 | ("armenia", "🇦🇲"), 67 | ("arrow_backward", "◀️"), 68 | ("arrow_double_down", "⏬"), 69 | ("arrow_double_up", "⏫"), 70 | ("arrow_down", "⬇️"), 71 | ("arrow_down_small", "🔽"), 72 | ("arrow_forward", "▶️"), 73 | ("arrow_heading_down", "⤵️"), 74 | ("arrow_heading_up", "⤴️"), 75 | ("arrow_left", "⬅️"), 76 | ("arrow_lower_left", "↙️"), 77 | ("arrow_lower_right", "↘️"), 78 | ("arrow_right", "➡️"), 79 | ("arrow_right_hook", "↪️"), 80 | ("arrow_up", "⬆️"), 81 | ("arrow_up_down", "↕️"), 82 | ("arrow_up_small", "🔼"), 83 | ("arrow_upper_left", "↖️"), 84 | ("arrow_upper_right", "↗️"), 85 | ("arrows_clockwise", "🔃"), 86 | ("arrows_counterclockwise", "🔄"), 87 | ("art", "🎨"), 88 | ("articulated_lorry", "🚛"), 89 | ("artificial_satellite", "🛰️"), 90 | ("artist", "🧑‍🎨"), 91 | ("aruba", "🇦🇼"), 92 | ("ascension_island", "🇦🇨"), 93 | ("asterisk", "*️⃣"), 94 | ("astonished", "😲"), 95 | ("astronaut", "🧑‍🚀"), 96 | ("athletic_shoe", "👟"), 97 | ("atm", "🏧"), 98 | ("atom_symbol", "⚛️"), 99 | ("australia", "🇦🇺"), 100 | ("austria", "🇦🇹"), 101 | ("auto_rickshaw", "🛺"), 102 | ("avocado", "🥑"), 103 | ("axe", "🪓"), 104 | ("azerbaijan", "🇦🇿"), 105 | ("b", "🅱️"), 106 | ("baby", "👶"), 107 | ("baby_bottle", "🍼"), 108 | ("baby_chick", "🐤"), 109 | ("baby_symbol", "🚼"), 110 | ("back", "🔙"), 111 | ("bacon", "🥓"), 112 | ("badger", "🦡"), 113 | ("badminton", "🏸"), 114 | ("bagel", "🥯"), 115 | ("baggage_claim", "🛄"), 116 | ("baguette_bread", "🥖"), 117 | ("bahamas", "🇧🇸"), 118 | ("bahrain", "🇧🇭"), 119 | ("balance_scale", "⚖️"), 120 | ("bald_man", "👨‍🦲"), 121 | ("bald_woman", "👩‍🦲"), 122 | ("ballet_shoes", "🩰"), 123 | ("balloon", "🎈"), 124 | ("ballot_box", "🗳️"), 125 | ("ballot_box_with_check", "☑️"), 126 | ("bamboo", "🎍"), 127 | ("banana", "🍌"), 128 | ("bangbang", "‼️"), 129 | ("bangladesh", "🇧🇩"), 130 | ("banjo", "🪕"), 131 | ("bank", "🏦"), 132 | ("bar_chart", "📊"), 133 | ("barbados", "🇧🇧"), 134 | ("barber", "💈"), 135 | ("baseball", "⚾"), 136 | ("basket", "🧺"), 137 | ("basketball", "🏀"), 138 | ("basketball_man", "⛹️‍♂️"), 139 | ("basketball_woman", "⛹️‍♀️"), 140 | ("bat", "🦇"), 141 | ("bath", "🛀"), 142 | ("bathtub", "🛁"), 143 | ("battery", "🔋"), 144 | ("beach_umbrella", "🏖️"), 145 | ("bear", "🐻"), 146 | ("bearded_person", "🧔"), 147 | ("beaver", "🦫"), 148 | ("bed", "🛏️"), 149 | ("bee", "🐝"), 150 | ("beer", "🍺"), 151 | ("beers", "🍻"), 152 | ("beetle", "🪲"), 153 | ("beginner", "🔰"), 154 | ("belarus", "🇧🇾"), 155 | ("belgium", "🇧🇪"), 156 | ("belize", "🇧🇿"), 157 | ("bell", "🔔"), 158 | ("bell_pepper", "🫑"), 159 | ("bellhop_bell", "🛎️"), 160 | ("benin", "🇧🇯"), 161 | ("bento", "🍱"), 162 | ("bermuda", "🇧🇲"), 163 | ("beverage_box", "🧃"), 164 | ("bhutan", "🇧🇹"), 165 | ("bicyclist", "🚴"), 166 | ("bike", "🚲"), 167 | ("biking_man", "🚴‍♂️"), 168 | ("biking_woman", "🚴‍♀️"), 169 | ("bikini", "👙"), 170 | ("billed_cap", "🧢"), 171 | ("biohazard", "☣️"), 172 | ("bird", "🐦"), 173 | ("birthday", "🎂"), 174 | ("bison", "🦬"), 175 | ("black_cat", "🐈‍⬛"), 176 | ("black_circle", "⚫"), 177 | 
("black_flag", "🏴"), 178 | ("black_heart", "🖤"), 179 | ("black_joker", "🃏"), 180 | ("black_large_square", "⬛"), 181 | ("black_medium_small_square", "◾"), 182 | ("black_medium_square", "◼️"), 183 | ("black_nib", "✒️"), 184 | ("black_small_square", "▪️"), 185 | ("black_square_button", "🔲"), 186 | ("blond_haired_man", "👱‍♂️"), 187 | ("blond_haired_person", "👱"), 188 | ("blond_haired_woman", "👱‍♀️"), 189 | ("blonde_woman", "👱‍♀️"), 190 | ("blossom", "🌼"), 191 | ("blowfish", "🐡"), 192 | ("blue_book", "📘"), 193 | ("blue_car", "🚙"), 194 | ("blue_heart", "💙"), 195 | ("blue_square", "🟦"), 196 | ("blueberries", "🫐"), 197 | ("blush", "😊"), 198 | ("boar", "🐗"), 199 | ("boat", "⛵"), 200 | ("bolivia", "🇧🇴"), 201 | ("bomb", "💣"), 202 | ("bone", "🦴"), 203 | ("book", "📖"), 204 | ("bookmark", "🔖"), 205 | ("bookmark_tabs", "📑"), 206 | ("books", "📚"), 207 | ("boom", "💥"), 208 | ("boomerang", "🪃"), 209 | ("boot", "👢"), 210 | ("bosnia_herzegovina", "🇧🇦"), 211 | ("botswana", "🇧🇼"), 212 | ("bouncing_ball_man", "⛹️‍♂️"), 213 | ("bouncing_ball_person", "⛹️"), 214 | ("bouncing_ball_woman", "⛹️‍♀️"), 215 | ("bouquet", "💐"), 216 | ("bouvet_island", "🇧🇻"), 217 | ("bow", "🙇"), 218 | ("bow_and_arrow", "🏹"), 219 | ("bowing_man", "🙇‍♂️"), 220 | ("bowing_woman", "🙇‍♀️"), 221 | ("bowl_with_spoon", "🥣"), 222 | ("bowling", "🎳"), 223 | ("boxing_glove", "🥊"), 224 | ("boy", "👦"), 225 | ("brain", "🧠"), 226 | ("brazil", "🇧🇷"), 227 | ("bread", "🍞"), 228 | ("breast_feeding", "🤱"), 229 | ("bricks", "🧱"), 230 | ("bride_with_veil", "👰‍♀️"), 231 | ("bridge_at_night", "🌉"), 232 | ("briefcase", "💼"), 233 | ("british_indian_ocean_territory", "🇮🇴"), 234 | ("british_virgin_islands", "🇻🇬"), 235 | ("broccoli", "🥦"), 236 | ("broken_heart", "💔"), 237 | ("broom", "🧹"), 238 | ("brown_circle", "🟤"), 239 | ("brown_heart", "🤎"), 240 | ("brown_square", "🟫"), 241 | ("brunei", "🇧🇳"), 242 | ("bubble_tea", "🧋"), 243 | ("bucket", "🪣"), 244 | ("bug", "🐛"), 245 | ("building_construction", "🏗️"), 246 | ("bulb", "💡"), 247 | ("bulgaria", "🇧🇬"), 248 | ("bullettrain_front", "🚅"), 249 | ("bullettrain_side", "🚄"), 250 | ("burkina_faso", "🇧🇫"), 251 | ("burrito", "🌯"), 252 | ("burundi", "🇧🇮"), 253 | ("bus", "🚌"), 254 | ("business_suit_levitating", "🕴️"), 255 | ("busstop", "🚏"), 256 | ("bust_in_silhouette", "👤"), 257 | ("busts_in_silhouette", "👥"), 258 | ("butter", "🧈"), 259 | ("butterfly", "🦋"), 260 | ("cactus", "🌵"), 261 | ("cake", "🍰"), 262 | ("calendar", "📆"), 263 | ("call_me_hand", "🤙"), 264 | ("calling", "📲"), 265 | ("cambodia", "🇰🇭"), 266 | ("camel", "🐫"), 267 | ("camera", "📷"), 268 | ("camera_flash", "📸"), 269 | ("cameroon", "🇨🇲"), 270 | ("camping", "🏕️"), 271 | ("canada", "🇨🇦"), 272 | ("canary_islands", "🇮🇨"), 273 | ("cancer", "♋"), 274 | ("candle", "🕯️"), 275 | ("candy", "🍬"), 276 | ("canned_food", "🥫"), 277 | ("canoe", "🛶"), 278 | ("cape_verde", "🇨🇻"), 279 | ("capital_abcd", "🔠"), 280 | ("capricorn", "♑"), 281 | ("car", "🚗"), 282 | ("card_file_box", "🗃️"), 283 | ("card_index", "📇"), 284 | ("card_index_dividers", "🗂️"), 285 | ("caribbean_netherlands", "🇧🇶"), 286 | ("carousel_horse", "🎠"), 287 | ("carpentry_saw", "🪚"), 288 | ("carrot", "🥕"), 289 | ("cartwheeling", "🤸"), 290 | ("cat", "🐱"), 291 | ("cat2", "🐈"), 292 | ("cayman_islands", "🇰🇾"), 293 | ("cd", "💿"), 294 | ("central_african_republic", "🇨🇫"), 295 | ("ceuta_melilla", "🇪🇦"), 296 | ("chad", "🇹🇩"), 297 | ("chains", "⛓️"), 298 | ("chair", "🪑"), 299 | ("champagne", "🍾"), 300 | ("chart", "💹"), 301 | ("chart_with_downwards_trend", "📉"), 302 | ("chart_with_upwards_trend", "📈"), 303 | ("checkered_flag", "🏁"), 
304 | ("cheese", "🧀"), 305 | ("cherries", "🍒"), 306 | ("cherry_blossom", "🌸"), 307 | ("chess_pawn", "♟️"), 308 | ("chestnut", "🌰"), 309 | ("chicken", "🐔"), 310 | ("child", "🧒"), 311 | ("children_crossing", "🚸"), 312 | ("chile", "🇨🇱"), 313 | ("chipmunk", "🐿️"), 314 | ("chocolate_bar", "🍫"), 315 | ("chopsticks", "🥢"), 316 | ("christmas_island", "🇨🇽"), 317 | ("christmas_tree", "🎄"), 318 | ("church", "⛪"), 319 | ("cinema", "🎦"), 320 | ("circus_tent", "🎪"), 321 | ("city_sunrise", "🌇"), 322 | ("city_sunset", "🌆"), 323 | ("cityscape", "🏙️"), 324 | ("cl", "🆑"), 325 | ("clamp", "🗜️"), 326 | ("clap", "👏"), 327 | ("clapper", "🎬"), 328 | ("classical_building", "🏛️"), 329 | ("climbing", "🧗"), 330 | ("climbing_man", "🧗‍♂️"), 331 | ("climbing_woman", "🧗‍♀️"), 332 | ("clinking_glasses", "🥂"), 333 | ("clipboard", "📋"), 334 | ("clipperton_island", "🇨🇵"), 335 | ("clock1", "🕐"), 336 | ("clock10", "🕙"), 337 | ("clock1030", "🕥"), 338 | ("clock11", "🕚"), 339 | ("clock1130", "🕦"), 340 | ("clock12", "🕛"), 341 | ("clock1230", "🕧"), 342 | ("clock130", "🕜"), 343 | ("clock2", "🕑"), 344 | ("clock230", "🕝"), 345 | ("clock3", "🕒"), 346 | ("clock330", "🕞"), 347 | ("clock4", "🕓"), 348 | ("clock430", "🕟"), 349 | ("clock5", "🕔"), 350 | ("clock530", "🕠"), 351 | ("clock6", "🕕"), 352 | ("clock630", "🕡"), 353 | ("clock7", "🕖"), 354 | ("clock730", "🕢"), 355 | ("clock8", "🕗"), 356 | ("clock830", "🕣"), 357 | ("clock9", "🕘"), 358 | ("clock930", "🕤"), 359 | ("closed_book", "📕"), 360 | ("closed_lock_with_key", "🔐"), 361 | ("closed_umbrella", "🌂"), 362 | ("cloud", "☁️"), 363 | ("cloud_with_lightning", "🌩️"), 364 | ("cloud_with_lightning_and_rain", "⛈️"), 365 | ("cloud_with_rain", "🌧️"), 366 | ("cloud_with_snow", "🌨️"), 367 | ("clown_face", "🤡"), 368 | ("clubs", "♣️"), 369 | ("cn", "🇨🇳"), 370 | ("coat", "🧥"), 371 | ("cockroach", "🪳"), 372 | ("cocktail", "🍸"), 373 | ("coconut", "🥥"), 374 | ("cocos_islands", "🇨🇨"), 375 | ("coffee", "☕"), 376 | ("coffin", "⚰️"), 377 | ("coin", "🪙"), 378 | ("cold_face", "🥶"), 379 | ("cold_sweat", "😰"), 380 | ("collision", "💥"), 381 | ("colombia", "🇨🇴"), 382 | ("comet", "☄️"), 383 | ("comoros", "🇰🇲"), 384 | ("compass", "🧭"), 385 | ("computer", "💻"), 386 | ("computer_mouse", "🖱️"), 387 | ("confetti_ball", "🎊"), 388 | ("confounded", "😖"), 389 | ("confused", "😕"), 390 | ("congo_brazzaville", "🇨🇬"), 391 | ("congo_kinshasa", "🇨🇩"), 392 | ("congratulations", "㊗️"), 393 | ("construction", "🚧"), 394 | ("construction_worker", "👷"), 395 | ("construction_worker_man", "👷‍♂️"), 396 | ("construction_worker_woman", "👷‍♀️"), 397 | ("control_knobs", "🎛️"), 398 | ("convenience_store", "🏪"), 399 | ("cook", "🧑‍🍳"), 400 | ("cook_islands", "🇨🇰"), 401 | ("cookie", "🍪"), 402 | ("cool", "🆒"), 403 | ("cop", "👮"), 404 | ("copyright", "©️"), 405 | ("corn", "🌽"), 406 | ("costa_rica", "🇨🇷"), 407 | ("cote_divoire", "🇨🇮"), 408 | ("couch_and_lamp", "🛋️"), 409 | ("couple", "👫"), 410 | ("couple_with_heart", "💑"), 411 | ("couple_with_heart_man_man", "👨‍❤️‍👨"), 412 | ("couple_with_heart_woman_man", "👩‍❤️‍👨"), 413 | ("couple_with_heart_woman_woman", "👩‍❤️‍👩"), 414 | ("couplekiss", "💏"), 415 | ("couplekiss_man_man", "👨‍❤️‍💋‍👨"), 416 | ("couplekiss_man_woman", "👩‍❤️‍💋‍👨"), 417 | ("couplekiss_woman_woman", "👩‍❤️‍💋‍👩"), 418 | ("cow", "🐮"), 419 | ("cow2", "🐄"), 420 | ("cowboy_hat_face", "🤠"), 421 | ("crab", "🦀"), 422 | ("crayon", "🖍️"), 423 | ("credit_card", "💳"), 424 | ("crescent_moon", "🌙"), 425 | ("cricket", "🦗"), 426 | ("cricket_game", "🏏"), 427 | ("croatia", "🇭🇷"), 428 | ("crocodile", "🐊"), 429 | ("croissant", "🥐"), 430 | ("crossed_fingers", 
"🤞"), 431 | ("crossed_flags", "🎌"), 432 | ("crossed_swords", "⚔️"), 433 | ("crown", "👑"), 434 | ("cry", "😢"), 435 | ("crying_cat_face", "😿"), 436 | ("crystal_ball", "🔮"), 437 | ("cuba", "🇨🇺"), 438 | ("cucumber", "🥒"), 439 | ("cup_with_straw", "🥤"), 440 | ("cupcake", "🧁"), 441 | ("cupid", "💘"), 442 | ("curacao", "🇨🇼"), 443 | ("curling_stone", "🥌"), 444 | ("curly_haired_man", "👨‍🦱"), 445 | ("curly_haired_woman", "👩‍🦱"), 446 | ("curly_loop", "➰"), 447 | ("currency_exchange", "💱"), 448 | ("curry", "🍛"), 449 | ("cursing_face", "🤬"), 450 | ("custard", "🍮"), 451 | ("customs", "🛃"), 452 | ("cut_of_meat", "🥩"), 453 | ("cyclone", "🌀"), 454 | ("cyprus", "🇨🇾"), 455 | ("czech_republic", "🇨🇿"), 456 | ("dagger", "🗡️"), 457 | ("dancer", "💃"), 458 | ("dancers", "👯"), 459 | ("dancing_men", "👯‍♂️"), 460 | ("dancing_women", "👯‍♀️"), 461 | ("dango", "🍡"), 462 | ("dark_sunglasses", "🕶️"), 463 | ("dart", "🎯"), 464 | ("dash", "💨"), 465 | ("date", "📅"), 466 | ("de", "🇩🇪"), 467 | ("deaf_man", "🧏‍♂️"), 468 | ("deaf_person", "🧏"), 469 | ("deaf_woman", "🧏‍♀️"), 470 | ("deciduous_tree", "🌳"), 471 | ("deer", "🦌"), 472 | ("denmark", "🇩🇰"), 473 | ("department_store", "🏬"), 474 | ("derelict_house", "🏚️"), 475 | ("desert", "🏜️"), 476 | ("desert_island", "🏝️"), 477 | ("desktop_computer", "🖥️"), 478 | ("detective", "🕵️"), 479 | ("diamond_shape_with_a_dot_inside", "💠"), 480 | ("diamonds", "♦️"), 481 | ("diego_garcia", "🇩🇬"), 482 | ("disappointed", "😞"), 483 | ("disappointed_relieved", "😥"), 484 | ("disguised_face", "🥸"), 485 | ("diving_mask", "🤿"), 486 | ("diya_lamp", "🪔"), 487 | ("dizzy", "💫"), 488 | ("dizzy_face", "😵"), 489 | ("djibouti", "🇩🇯"), 490 | ("dna", "🧬"), 491 | ("do_not_litter", "🚯"), 492 | ("dodo", "🦤"), 493 | ("dog", "🐶"), 494 | ("dog2", "🐕"), 495 | ("dollar", "💵"), 496 | ("dolls", "🎎"), 497 | ("dolphin", "🐬"), 498 | ("dominica", "🇩🇲"), 499 | ("dominican_republic", "🇩🇴"), 500 | ("door", "🚪"), 501 | ("doughnut", "🍩"), 502 | ("dove", "🕊️"), 503 | ("dragon", "🐉"), 504 | ("dragon_face", "🐲"), 505 | ("dress", "👗"), 506 | ("dromedary_camel", "🐪"), 507 | ("drooling_face", "🤤"), 508 | ("drop_of_blood", "🩸"), 509 | ("droplet", "💧"), 510 | ("drum", "🥁"), 511 | ("duck", "🦆"), 512 | ("dumpling", "🥟"), 513 | ("dvd", "📀"), 514 | ("e-mail", "📧"), 515 | ("eagle", "🦅"), 516 | ("ear", "👂"), 517 | ("ear_of_rice", "🌾"), 518 | ("ear_with_hearing_aid", "🦻"), 519 | ("earth_africa", "🌍"), 520 | ("earth_americas", "🌎"), 521 | ("earth_asia", "🌏"), 522 | ("ecuador", "🇪🇨"), 523 | ("egg", "🥚"), 524 | ("eggplant", "🍆"), 525 | ("egypt", "🇪🇬"), 526 | ("eight", "8️⃣"), 527 | ("eight_pointed_black_star", "✴️"), 528 | ("eight_spoked_asterisk", "✳️"), 529 | ("eject_button", "⏏️"), 530 | ("el_salvador", "🇸🇻"), 531 | ("electric_plug", "🔌"), 532 | ("elephant", "🐘"), 533 | ("elevator", "🛗"), 534 | ("elf", "🧝"), 535 | ("elf_man", "🧝‍♂️"), 536 | ("elf_woman", "🧝‍♀️"), 537 | ("email", "📧"), 538 | ("end", "🔚"), 539 | ("england", "🏴󠁧󠁢󠁥󠁮󠁧󠁿"), 540 | ("envelope", "✉️"), 541 | ("envelope_with_arrow", "📩"), 542 | ("equatorial_guinea", "🇬🇶"), 543 | ("eritrea", "🇪🇷"), 544 | ("es", "🇪🇸"), 545 | ("estonia", "🇪🇪"), 546 | ("ethiopia", "🇪🇹"), 547 | ("eu", "🇪🇺"), 548 | ("euro", "💶"), 549 | ("european_castle", "🏰"), 550 | ("european_post_office", "🏤"), 551 | ("european_union", "🇪🇺"), 552 | ("evergreen_tree", "🌲"), 553 | ("exclamation", "❗"), 554 | ("exploding_head", "🤯"), 555 | ("expressionless", "😑"), 556 | ("eye", "👁️"), 557 | ("eye_speech_bubble", "👁️‍🗨️"), 558 | ("eyeglasses", "👓"), 559 | ("eyes", "👀"), 560 | ("face_exhaling", "😮‍💨"), 561 | ("face_in_clouds", 
"😶‍🌫️"), 562 | ("face_with_head_bandage", "🤕"), 563 | ("face_with_spiral_eyes", "😵‍💫"), 564 | ("face_with_thermometer", "🤒"), 565 | ("facepalm", "🤦"), 566 | ("facepunch", "👊"), 567 | ("factory", "🏭"), 568 | ("factory_worker", "🧑‍🏭"), 569 | ("fairy", "🧚"), 570 | ("fairy_man", "🧚‍♂️"), 571 | ("fairy_woman", "🧚‍♀️"), 572 | ("falafel", "🧆"), 573 | ("falkland_islands", "🇫🇰"), 574 | ("fallen_leaf", "🍂"), 575 | ("family", "👪"), 576 | ("family_man_boy", "👨‍👦"), 577 | ("family_man_boy_boy", "👨‍👦‍👦"), 578 | ("family_man_girl", "👨‍👧"), 579 | ("family_man_girl_boy", "👨‍👧‍👦"), 580 | ("family_man_girl_girl", "👨‍👧‍👧"), 581 | ("family_man_man_boy", "👨‍👨‍👦"), 582 | ("family_man_man_boy_boy", "👨‍👨‍👦‍👦"), 583 | ("family_man_man_girl", "👨‍👨‍👧"), 584 | ("family_man_man_girl_boy", "👨‍👨‍👧‍👦"), 585 | ("family_man_man_girl_girl", "👨‍👨‍👧‍👧"), 586 | ("family_man_woman_boy", "👨‍👩‍👦"), 587 | ("family_man_woman_boy_boy", "👨‍👩‍👦‍👦"), 588 | ("family_man_woman_girl", "👨‍👩‍👧"), 589 | ("family_man_woman_girl_boy", "👨‍👩‍👧‍👦"), 590 | ("family_man_woman_girl_girl", "👨‍👩‍👧‍👧"), 591 | ("family_woman_boy", "👩‍👦"), 592 | ("family_woman_boy_boy", "👩‍👦‍👦"), 593 | ("family_woman_girl", "👩‍👧"), 594 | ("family_woman_girl_boy", "👩‍👧‍👦"), 595 | ("family_woman_girl_girl", "👩‍👧‍👧"), 596 | ("family_woman_woman_boy", "👩‍👩‍👦"), 597 | ("family_woman_woman_boy_boy", "👩‍👩‍👦‍👦"), 598 | ("family_woman_woman_girl", "👩‍👩‍👧"), 599 | ("family_woman_woman_girl_boy", "👩‍👩‍👧‍👦"), 600 | ("family_woman_woman_girl_girl", "👩‍👩‍👧‍👧"), 601 | ("farmer", "🧑‍🌾"), 602 | ("faroe_islands", "🇫🇴"), 603 | ("fast_forward", "⏩"), 604 | ("fax", "📠"), 605 | ("fearful", "😨"), 606 | ("feather", "🪶"), 607 | ("feet", "🐾"), 608 | ("female_detective", "🕵️‍♀️"), 609 | ("female_sign", "♀️"), 610 | ("ferris_wheel", "🎡"), 611 | ("ferry", "⛴️"), 612 | ("field_hockey", "🏑"), 613 | ("fiji", "🇫🇯"), 614 | ("file_cabinet", "🗄️"), 615 | ("file_folder", "📁"), 616 | ("film_projector", "📽️"), 617 | ("film_strip", "🎞️"), 618 | ("finland", "🇫🇮"), 619 | ("fire", "🔥"), 620 | ("fire_engine", "🚒"), 621 | ("fire_extinguisher", "🧯"), 622 | ("firecracker", "🧨"), 623 | ("firefighter", "🧑‍🚒"), 624 | ("fireworks", "🎆"), 625 | ("first_quarter_moon", "🌓"), 626 | ("first_quarter_moon_with_face", "🌛"), 627 | ("fish", "🐟"), 628 | ("fish_cake", "🍥"), 629 | ("fishing_pole_and_fish", "🎣"), 630 | ("fist", "✊"), 631 | ("fist_left", "🤛"), 632 | ("fist_oncoming", "👊"), 633 | ("fist_raised", "✊"), 634 | ("fist_right", "🤜"), 635 | ("five", "5️⃣"), 636 | ("flags", "🎏"), 637 | ("flamingo", "🦩"), 638 | ("flashlight", "🔦"), 639 | ("flat_shoe", "🥿"), 640 | ("flatbread", "🫓"), 641 | ("fleur_de_lis", "⚜️"), 642 | ("flight_arrival", "🛬"), 643 | ("flight_departure", "🛫"), 644 | ("flipper", "🐬"), 645 | ("floppy_disk", "💾"), 646 | ("flower_playing_cards", "🎴"), 647 | ("flushed", "😳"), 648 | ("fly", "🪰"), 649 | ("flying_disc", "🥏"), 650 | ("flying_saucer", "🛸"), 651 | ("fog", "🌫️"), 652 | ("foggy", "🌁"), 653 | ("fondue", "🫕"), 654 | ("foot", "🦶"), 655 | ("football", "🏈"), 656 | ("footprints", "👣"), 657 | ("fork_and_knife", "🍴"), 658 | ("fortune_cookie", "🥠"), 659 | ("fountain", "⛲"), 660 | ("fountain_pen", "🖋️"), 661 | ("four", "4️⃣"), 662 | ("four_leaf_clover", "🍀"), 663 | ("fox_face", "🦊"), 664 | ("fr", "🇫🇷"), 665 | ("framed_picture", "🖼️"), 666 | ("free", "🆓"), 667 | ("french_guiana", "🇬🇫"), 668 | ("french_polynesia", "🇵🇫"), 669 | ("french_southern_territories", "🇹🇫"), 670 | ("fried_egg", "🍳"), 671 | ("fried_shrimp", "🍤"), 672 | ("fries", "🍟"), 673 | ("frog", "🐸"), 674 | ("frowning", "😦"), 675 | ("frowning_face", "☹️"), 676 | 
("frowning_man", "🙍‍♂️"), 677 | ("frowning_person", "🙍"), 678 | ("frowning_woman", "🙍‍♀️"), 679 | ("fu", "🖕"), 680 | ("fuelpump", "⛽"), 681 | ("full_moon", "🌕"), 682 | ("full_moon_with_face", "🌝"), 683 | ("funeral_urn", "⚱️"), 684 | ("gabon", "🇬🇦"), 685 | ("gambia", "🇬🇲"), 686 | ("game_die", "🎲"), 687 | ("garlic", "🧄"), 688 | ("gb", "🇬🇧"), 689 | ("gear", "⚙️"), 690 | ("gem", "💎"), 691 | ("gemini", "♊"), 692 | ("genie", "🧞"), 693 | ("genie_man", "🧞‍♂️"), 694 | ("genie_woman", "🧞‍♀️"), 695 | ("georgia", "🇬🇪"), 696 | ("ghana", "🇬🇭"), 697 | ("ghost", "👻"), 698 | ("gibraltar", "🇬🇮"), 699 | ("gift", "🎁"), 700 | ("gift_heart", "💝"), 701 | ("giraffe", "🦒"), 702 | ("girl", "👧"), 703 | ("globe_with_meridians", "🌐"), 704 | ("gloves", "🧤"), 705 | ("goal_net", "🥅"), 706 | ("goat", "🐐"), 707 | ("goggles", "🥽"), 708 | ("golf", "⛳"), 709 | ("golfing", "🏌️"), 710 | ("golfing_man", "🏌️‍♂️"), 711 | ("golfing_woman", "🏌️‍♀️"), 712 | ("gorilla", "🦍"), 713 | ("grapes", "🍇"), 714 | ("greece", "🇬🇷"), 715 | ("green_apple", "🍏"), 716 | ("green_book", "📗"), 717 | ("green_circle", "🟢"), 718 | ("green_heart", "💚"), 719 | ("green_salad", "🥗"), 720 | ("green_square", "🟩"), 721 | ("greenland", "🇬🇱"), 722 | ("grenada", "🇬🇩"), 723 | ("grey_exclamation", "❕"), 724 | ("grey_question", "❔"), 725 | ("grimacing", "😬"), 726 | ("grin", "😁"), 727 | ("grinning", "😀"), 728 | ("guadeloupe", "🇬🇵"), 729 | ("guam", "🇬🇺"), 730 | ("guard", "💂"), 731 | ("guardsman", "💂‍♂️"), 732 | ("guardswoman", "💂‍♀️"), 733 | ("guatemala", "🇬🇹"), 734 | ("guernsey", "🇬🇬"), 735 | ("guide_dog", "🦮"), 736 | ("guinea", "🇬🇳"), 737 | ("guinea_bissau", "🇬🇼"), 738 | ("guitar", "🎸"), 739 | ("gun", "🔫"), 740 | ("guyana", "🇬🇾"), 741 | ("haircut", "💇"), 742 | ("haircut_man", "💇‍♂️"), 743 | ("haircut_woman", "💇‍♀️"), 744 | ("haiti", "🇭🇹"), 745 | ("hamburger", "🍔"), 746 | ("hammer", "🔨"), 747 | ("hammer_and_pick", "⚒️"), 748 | ("hammer_and_wrench", "🛠️"), 749 | ("hamster", "🐹"), 750 | ("hand", "✋"), 751 | ("hand_over_mouth", "🤭"), 752 | ("handbag", "👜"), 753 | ("handball_person", "🤾"), 754 | ("handshake", "🤝"), 755 | ("hankey", "💩"), 756 | ("hash", "#️⃣"), 757 | ("hatched_chick", "🐥"), 758 | ("hatching_chick", "🐣"), 759 | ("headphones", "🎧"), 760 | ("headstone", "🪦"), 761 | ("health_worker", "🧑‍⚕️"), 762 | ("hear_no_evil", "🙉"), 763 | ("heard_mcdonald_islands", "🇭🇲"), 764 | ("heart", "❤️"), 765 | ("heart_decoration", "💟"), 766 | ("heart_eyes", "😍"), 767 | ("heart_eyes_cat", "😻"), 768 | ("heart_on_fire", "❤️‍🔥"), 769 | ("heartbeat", "💓"), 770 | ("heartpulse", "💗"), 771 | ("hearts", "♥️"), 772 | ("heavy_check_mark", "✔️"), 773 | ("heavy_division_sign", "➗"), 774 | ("heavy_dollar_sign", "💲"), 775 | ("heavy_exclamation_mark", "❗"), 776 | ("heavy_heart_exclamation", "❣️"), 777 | ("heavy_minus_sign", "➖"), 778 | ("heavy_multiplication_x", "✖️"), 779 | ("heavy_plus_sign", "➕"), 780 | ("hedgehog", "🦔"), 781 | ("helicopter", "🚁"), 782 | ("herb", "🌿"), 783 | ("hibiscus", "🌺"), 784 | ("high_brightness", "🔆"), 785 | ("high_heel", "👠"), 786 | ("hiking_boot", "🥾"), 787 | ("hindu_temple", "🛕"), 788 | ("hippopotamus", "🦛"), 789 | ("hocho", "🔪"), 790 | ("hole", "🕳️"), 791 | ("honduras", "🇭🇳"), 792 | ("honey_pot", "🍯"), 793 | ("honeybee", "🐝"), 794 | ("hong_kong", "🇭🇰"), 795 | ("hook", "🪝"), 796 | ("horse", "🐴"), 797 | ("horse_racing", "🏇"), 798 | ("hospital", "🏥"), 799 | ("hot_face", "🥵"), 800 | ("hot_pepper", "🌶️"), 801 | ("hotdog", "🌭"), 802 | ("hotel", "🏨"), 803 | ("hotsprings", "♨️"), 804 | ("hourglass", "⌛"), 805 | ("hourglass_flowing_sand", "⏳"), 806 | ("house", "🏠"), 807 | 
("house_with_garden", "🏡"), 808 | ("houses", "🏘️"), 809 | ("hugs", "🤗"), 810 | ("hungary", "🇭🇺"), 811 | ("hushed", "😯"), 812 | ("hut", "🛖"), 813 | ("ice_cream", "🍨"), 814 | ("ice_cube", "🧊"), 815 | ("ice_hockey", "🏒"), 816 | ("ice_skate", "⛸️"), 817 | ("icecream", "🍦"), 818 | ("iceland", "🇮🇸"), 819 | ("id", "🆔"), 820 | ("ideograph_advantage", "🉐"), 821 | ("imp", "👿"), 822 | ("inbox_tray", "📥"), 823 | ("incoming_envelope", "📨"), 824 | ("india", "🇮🇳"), 825 | ("indonesia", "🇮🇩"), 826 | ("infinity", "♾️"), 827 | ("information_desk_person", "💁"), 828 | ("information_source", "ℹ️"), 829 | ("innocent", "😇"), 830 | ("interrobang", "⁉️"), 831 | ("iphone", "📱"), 832 | ("iran", "🇮🇷"), 833 | ("iraq", "🇮🇶"), 834 | ("ireland", "🇮🇪"), 835 | ("isle_of_man", "🇮🇲"), 836 | ("israel", "🇮🇱"), 837 | ("it", "🇮🇹"), 838 | ("izakaya_lantern", "🏮"), 839 | ("jack_o_lantern", "🎃"), 840 | ("jamaica", "🇯🇲"), 841 | ("japan", "🗾"), 842 | ("japanese_castle", "🏯"), 843 | ("japanese_goblin", "👺"), 844 | ("japanese_ogre", "👹"), 845 | ("jeans", "👖"), 846 | ("jersey", "🇯🇪"), 847 | ("jigsaw", "🧩"), 848 | ("jordan", "🇯🇴"), 849 | ("joy", "😂"), 850 | ("joy_cat", "😹"), 851 | ("joystick", "🕹️"), 852 | ("jp", "🇯🇵"), 853 | ("judge", "🧑‍⚖️"), 854 | ("juggling_person", "🤹"), 855 | ("kaaba", "🕋"), 856 | ("kangaroo", "🦘"), 857 | ("kazakhstan", "🇰🇿"), 858 | ("kenya", "🇰🇪"), 859 | ("key", "🔑"), 860 | ("keyboard", "⌨️"), 861 | ("keycap_ten", "🔟"), 862 | ("kick_scooter", "🛴"), 863 | ("kimono", "👘"), 864 | ("kiribati", "🇰🇮"), 865 | ("kiss", "💋"), 866 | ("kissing", "😗"), 867 | ("kissing_cat", "😽"), 868 | ("kissing_closed_eyes", "😚"), 869 | ("kissing_heart", "😘"), 870 | ("kissing_smiling_eyes", "😙"), 871 | ("kite", "🪁"), 872 | ("kiwi_fruit", "🥝"), 873 | ("kneeling_man", "🧎‍♂️"), 874 | ("kneeling_person", "🧎"), 875 | ("kneeling_woman", "🧎‍♀️"), 876 | ("knife", "🔪"), 877 | ("knot", "🪢"), 878 | ("koala", "🐨"), 879 | ("koko", "🈁"), 880 | ("kosovo", "🇽🇰"), 881 | ("kr", "🇰🇷"), 882 | ("kuwait", "🇰🇼"), 883 | ("kyrgyzstan", "🇰🇬"), 884 | ("lab_coat", "🥼"), 885 | ("label", "🏷️"), 886 | ("lacrosse", "🥍"), 887 | ("ladder", "🪜"), 888 | ("lady_beetle", "🐞"), 889 | ("lantern", "🏮"), 890 | ("laos", "🇱🇦"), 891 | ("large_blue_circle", "🔵"), 892 | ("large_blue_diamond", "🔷"), 893 | ("large_orange_diamond", "🔶"), 894 | ("last_quarter_moon", "🌗"), 895 | ("last_quarter_moon_with_face", "🌜"), 896 | ("latin_cross", "✝️"), 897 | ("latvia", "🇱🇻"), 898 | ("laughing", "😆"), 899 | ("leafy_green", "🥬"), 900 | ("leaves", "🍃"), 901 | ("lebanon", "🇱🇧"), 902 | ("ledger", "📒"), 903 | ("left_luggage", "🛅"), 904 | ("left_right_arrow", "↔️"), 905 | ("left_speech_bubble", "🗨️"), 906 | ("leftwards_arrow_with_hook", "↩️"), 907 | ("leg", "🦵"), 908 | ("lemon", "🍋"), 909 | ("leo", "♌"), 910 | ("leopard", "🐆"), 911 | ("lesotho", "🇱🇸"), 912 | ("level_slider", "🎚️"), 913 | ("liberia", "🇱🇷"), 914 | ("libra", "♎"), 915 | ("libya", "🇱🇾"), 916 | ("liechtenstein", "🇱🇮"), 917 | ("light_rail", "🚈"), 918 | ("link", "🔗"), 919 | ("lion", "🦁"), 920 | ("lips", "👄"), 921 | ("lipstick", "💄"), 922 | ("lithuania", "🇱🇹"), 923 | ("lizard", "🦎"), 924 | ("llama", "🦙"), 925 | ("lobster", "🦞"), 926 | ("lock", "🔒"), 927 | ("lock_with_ink_pen", "🔏"), 928 | ("lollipop", "🍭"), 929 | ("long_drum", "🪘"), 930 | ("loop", "➿"), 931 | ("lotion_bottle", "🧴"), 932 | ("lotus_position", "🧘"), 933 | ("lotus_position_man", "🧘‍♂️"), 934 | ("lotus_position_woman", "🧘‍♀️"), 935 | ("loud_sound", "🔊"), 936 | ("loudspeaker", "📢"), 937 | ("love_hotel", "🏩"), 938 | ("love_letter", "💌"), 939 | ("love_you_gesture", "🤟"), 940 | 
("low_brightness", "🔅"), 941 | ("luggage", "🧳"), 942 | ("lungs", "🫁"), 943 | ("luxembourg", "🇱🇺"), 944 | ("lying_face", "🤥"), 945 | ("m", "Ⓜ️"), 946 | ("macau", "🇲🇴"), 947 | ("macedonia", "🇲🇰"), 948 | ("madagascar", "🇲🇬"), 949 | ("mag", "🔍"), 950 | ("mag_right", "🔎"), 951 | ("mage", "🧙"), 952 | ("mage_man", "🧙‍♂️"), 953 | ("mage_woman", "🧙‍♀️"), 954 | ("magic_wand", "🪄"), 955 | ("magnet", "🧲"), 956 | ("mahjong", "🀄"), 957 | ("mailbox", "📫"), 958 | ("mailbox_closed", "📪"), 959 | ("mailbox_with_mail", "📬"), 960 | ("mailbox_with_no_mail", "📭"), 961 | ("malawi", "🇲🇼"), 962 | ("malaysia", "🇲🇾"), 963 | ("maldives", "🇲🇻"), 964 | ("male_detective", "🕵️‍♂️"), 965 | ("male_sign", "♂️"), 966 | ("mali", "🇲🇱"), 967 | ("malta", "🇲🇹"), 968 | ("mammoth", "🦣"), 969 | ("man", "👨"), 970 | ("man_artist", "👨‍🎨"), 971 | ("man_astronaut", "👨‍🚀"), 972 | ("man_beard", "🧔‍♂️"), 973 | ("man_cartwheeling", "🤸‍♂️"), 974 | ("man_cook", "👨‍🍳"), 975 | ("man_dancing", "🕺"), 976 | ("man_facepalming", "🤦‍♂️"), 977 | ("man_factory_worker", "👨‍🏭"), 978 | ("man_farmer", "👨‍🌾"), 979 | ("man_feeding_baby", "👨‍🍼"), 980 | ("man_firefighter", "👨‍🚒"), 981 | ("man_health_worker", "👨‍⚕️"), 982 | ("man_in_manual_wheelchair", "👨‍🦽"), 983 | ("man_in_motorized_wheelchair", "👨‍🦼"), 984 | ("man_in_tuxedo", "🤵‍♂️"), 985 | ("man_judge", "👨‍⚖️"), 986 | ("man_juggling", "🤹‍♂️"), 987 | ("man_mechanic", "👨‍🔧"), 988 | ("man_office_worker", "👨‍💼"), 989 | ("man_pilot", "👨‍✈️"), 990 | ("man_playing_handball", "🤾‍♂️"), 991 | ("man_playing_water_polo", "🤽‍♂️"), 992 | ("man_scientist", "👨‍🔬"), 993 | ("man_shrugging", "🤷‍♂️"), 994 | ("man_singer", "👨‍🎤"), 995 | ("man_student", "👨‍🎓"), 996 | ("man_teacher", "👨‍🏫"), 997 | ("man_technologist", "👨‍💻"), 998 | ("man_with_gua_pi_mao", "👲"), 999 | ("man_with_probing_cane", "👨‍🦯"), 1000 | ("man_with_turban", "👳‍♂️"), 1001 | ("man_with_veil", "👰‍♂️"), 1002 | ("mandarin", "🍊"), 1003 | ("mango", "🥭"), 1004 | ("mans_shoe", "👞"), 1005 | ("mantelpiece_clock", "🕰️"), 1006 | ("manual_wheelchair", "🦽"), 1007 | ("maple_leaf", "🍁"), 1008 | ("marshall_islands", "🇲🇭"), 1009 | ("martial_arts_uniform", "🥋"), 1010 | ("martinique", "🇲🇶"), 1011 | ("mask", "😷"), 1012 | ("massage", "💆"), 1013 | ("massage_man", "💆‍♂️"), 1014 | ("massage_woman", "💆‍♀️"), 1015 | ("mate", "🧉"), 1016 | ("mauritania", "🇲🇷"), 1017 | ("mauritius", "🇲🇺"), 1018 | ("mayotte", "🇾🇹"), 1019 | ("meat_on_bone", "🍖"), 1020 | ("mechanic", "🧑‍🔧"), 1021 | ("mechanical_arm", "🦾"), 1022 | ("mechanical_leg", "🦿"), 1023 | ("medal_military", "🎖️"), 1024 | ("medal_sports", "🏅"), 1025 | ("medical_symbol", "⚕️"), 1026 | ("mega", "📣"), 1027 | ("melon", "🍈"), 1028 | ("memo", "📝"), 1029 | ("men_wrestling", "🤼‍♂️"), 1030 | ("mending_heart", "❤️‍🩹"), 1031 | ("menorah", "🕎"), 1032 | ("mens", "🚹"), 1033 | ("mermaid", "🧜‍♀️"), 1034 | ("merman", "🧜‍♂️"), 1035 | ("merperson", "🧜"), 1036 | ("metal", "🤘"), 1037 | ("metro", "🚇"), 1038 | ("mexico", "🇲🇽"), 1039 | ("microbe", "🦠"), 1040 | ("micronesia", "🇫🇲"), 1041 | ("microphone", "🎤"), 1042 | ("microscope", "🔬"), 1043 | ("middle_finger", "🖕"), 1044 | ("military_helmet", "🪖"), 1045 | ("milk_glass", "🥛"), 1046 | ("milky_way", "🌌"), 1047 | ("minibus", "🚐"), 1048 | ("minidisc", "💽"), 1049 | ("mirror", "🪞"), 1050 | ("mobile_phone_off", "📴"), 1051 | ("moldova", "🇲🇩"), 1052 | ("monaco", "🇲🇨"), 1053 | ("money_mouth_face", "🤑"), 1054 | ("money_with_wings", "💸"), 1055 | ("moneybag", "💰"), 1056 | ("mongolia", "🇲🇳"), 1057 | ("monkey", "🐒"), 1058 | ("monkey_face", "🐵"), 1059 | ("monocle_face", "🧐"), 1060 | ("monorail", "🚝"), 1061 | ("montenegro", "🇲🇪"), 
1062 | ("montserrat", "🇲🇸"), 1063 | ("moon", "🌔"), 1064 | ("moon_cake", "🥮"), 1065 | ("morocco", "🇲🇦"), 1066 | ("mortar_board", "🎓"), 1067 | ("mosque", "🕌"), 1068 | ("mosquito", "🦟"), 1069 | ("motor_boat", "🛥️"), 1070 | ("motor_scooter", "🛵"), 1071 | ("motorcycle", "🏍️"), 1072 | ("motorized_wheelchair", "🦼"), 1073 | ("motorway", "🛣️"), 1074 | ("mount_fuji", "🗻"), 1075 | ("mountain", "⛰️"), 1076 | ("mountain_bicyclist", "🚵"), 1077 | ("mountain_biking_man", "🚵‍♂️"), 1078 | ("mountain_biking_woman", "🚵‍♀️"), 1079 | ("mountain_cableway", "🚠"), 1080 | ("mountain_railway", "🚞"), 1081 | ("mountain_snow", "🏔️"), 1082 | ("mouse", "🐭"), 1083 | ("mouse2", "🐁"), 1084 | ("mouse_trap", "🪤"), 1085 | ("movie_camera", "🎥"), 1086 | ("moyai", "🗿"), 1087 | ("mozambique", "🇲🇿"), 1088 | ("mrs_claus", "🤶"), 1089 | ("muscle", "💪"), 1090 | ("mushroom", "🍄"), 1091 | ("musical_keyboard", "🎹"), 1092 | ("musical_note", "🎵"), 1093 | ("musical_score", "🎼"), 1094 | ("mute", "🔇"), 1095 | ("mx_claus", "🧑‍🎄"), 1096 | ("myanmar", "🇲🇲"), 1097 | ("nail_care", "💅"), 1098 | ("name_badge", "📛"), 1099 | ("namibia", "🇳🇦"), 1100 | ("national_park", "🏞️"), 1101 | ("nauru", "🇳🇷"), 1102 | ("nauseated_face", "🤢"), 1103 | ("nazar_amulet", "🧿"), 1104 | ("necktie", "👔"), 1105 | ("negative_squared_cross_mark", "❎"), 1106 | ("nepal", "🇳🇵"), 1107 | ("nerd_face", "🤓"), 1108 | ("nesting_dolls", "🪆"), 1109 | ("netherlands", "🇳🇱"), 1110 | ("neutral_face", "😐"), 1111 | ("new", "🆕"), 1112 | ("new_caledonia", "🇳🇨"), 1113 | ("new_moon", "🌑"), 1114 | ("new_moon_with_face", "🌚"), 1115 | ("new_zealand", "🇳🇿"), 1116 | ("newspaper", "📰"), 1117 | ("newspaper_roll", "🗞️"), 1118 | ("next_track_button", "⏭️"), 1119 | ("ng", "🆖"), 1120 | ("ng_man", "🙅‍♂️"), 1121 | ("ng_woman", "🙅‍♀️"), 1122 | ("nicaragua", "🇳🇮"), 1123 | ("niger", "🇳🇪"), 1124 | ("nigeria", "🇳🇬"), 1125 | ("night_with_stars", "🌃"), 1126 | ("nine", "9️⃣"), 1127 | ("ninja", "🥷"), 1128 | ("niue", "🇳🇺"), 1129 | ("no_bell", "🔕"), 1130 | ("no_bicycles", "🚳"), 1131 | ("no_entry", "⛔"), 1132 | ("no_entry_sign", "🚫"), 1133 | ("no_good", "🙅"), 1134 | ("no_good_man", "🙅‍♂️"), 1135 | ("no_good_woman", "🙅‍♀️"), 1136 | ("no_mobile_phones", "📵"), 1137 | ("no_mouth", "😶"), 1138 | ("no_pedestrians", "🚷"), 1139 | ("no_smoking", "🚭"), 1140 | ("non-potable_water", "🚱"), 1141 | ("norfolk_island", "🇳🇫"), 1142 | ("north_korea", "🇰🇵"), 1143 | ("northern_mariana_islands", "🇲🇵"), 1144 | ("norway", "🇳🇴"), 1145 | ("nose", "👃"), 1146 | ("notebook", "📓"), 1147 | ("notebook_with_decorative_cover", "📔"), 1148 | ("notes", "🎶"), 1149 | ("nut_and_bolt", "🔩"), 1150 | ("o", "⭕"), 1151 | ("o2", "🅾️"), 1152 | ("ocean", "🌊"), 1153 | ("octopus", "🐙"), 1154 | ("oden", "🍢"), 1155 | ("office", "🏢"), 1156 | ("office_worker", "🧑‍💼"), 1157 | ("oil_drum", "🛢️"), 1158 | ("ok", "🆗"), 1159 | ("ok_hand", "👌"), 1160 | ("ok_man", "🙆‍♂️"), 1161 | ("ok_person", "🙆"), 1162 | ("ok_woman", "🙆‍♀️"), 1163 | ("old_key", "🗝️"), 1164 | ("older_adult", "🧓"), 1165 | ("older_man", "👴"), 1166 | ("older_woman", "👵"), 1167 | ("olive", "🫒"), 1168 | ("om", "🕉️"), 1169 | ("oman", "🇴🇲"), 1170 | ("on", "🔛"), 1171 | ("oncoming_automobile", "🚘"), 1172 | ("oncoming_bus", "🚍"), 1173 | ("oncoming_police_car", "🚔"), 1174 | ("oncoming_taxi", "🚖"), 1175 | ("one", "1️⃣"), 1176 | ("one_piece_swimsuit", "🩱"), 1177 | ("onion", "🧅"), 1178 | ("open_book", "📖"), 1179 | ("open_file_folder", "📂"), 1180 | ("open_hands", "👐"), 1181 | ("open_mouth", "😮"), 1182 | ("open_umbrella", "☂️"), 1183 | ("ophiuchus", "⛎"), 1184 | ("orange", "🍊"), 1185 | ("orange_book", "📙"), 1186 | ("orange_circle", 
"🟠"), 1187 | ("orange_heart", "🧡"), 1188 | ("orange_square", "🟧"), 1189 | ("orangutan", "🦧"), 1190 | ("orthodox_cross", "☦️"), 1191 | ("otter", "🦦"), 1192 | ("outbox_tray", "📤"), 1193 | ("owl", "🦉"), 1194 | ("ox", "🐂"), 1195 | ("oyster", "🦪"), 1196 | ("package", "📦"), 1197 | ("page_facing_up", "📄"), 1198 | ("page_with_curl", "📃"), 1199 | ("pager", "📟"), 1200 | ("paintbrush", "🖌️"), 1201 | ("pakistan", "🇵🇰"), 1202 | ("palau", "🇵🇼"), 1203 | ("palestinian_territories", "🇵🇸"), 1204 | ("palm_tree", "🌴"), 1205 | ("palms_up_together", "🤲"), 1206 | ("panama", "🇵🇦"), 1207 | ("pancakes", "🥞"), 1208 | ("panda_face", "🐼"), 1209 | ("paperclip", "📎"), 1210 | ("paperclips", "🖇️"), 1211 | ("papua_new_guinea", "🇵🇬"), 1212 | ("parachute", "🪂"), 1213 | ("paraguay", "🇵🇾"), 1214 | ("parasol_on_ground", "⛱️"), 1215 | ("parking", "🅿️"), 1216 | ("parrot", "🦜"), 1217 | ("part_alternation_mark", "〽️"), 1218 | ("partly_sunny", "⛅"), 1219 | ("partying_face", "🥳"), 1220 | ("passenger_ship", "🛳️"), 1221 | ("passport_control", "🛂"), 1222 | ("pause_button", "⏸️"), 1223 | ("paw_prints", "🐾"), 1224 | ("peace_symbol", "☮️"), 1225 | ("peach", "🍑"), 1226 | ("peacock", "🦚"), 1227 | ("peanuts", "🥜"), 1228 | ("pear", "🍐"), 1229 | ("pen", "🖊️"), 1230 | ("pencil", "📝"), 1231 | ("pencil2", "✏️"), 1232 | ("penguin", "🐧"), 1233 | ("pensive", "😔"), 1234 | ("people_holding_hands", "🧑‍🤝‍🧑"), 1235 | ("people_hugging", "🫂"), 1236 | ("performing_arts", "🎭"), 1237 | ("persevere", "😣"), 1238 | ("person_bald", "🧑‍🦲"), 1239 | ("person_curly_hair", "🧑‍🦱"), 1240 | ("person_feeding_baby", "🧑‍🍼"), 1241 | ("person_fencing", "🤺"), 1242 | ("person_in_manual_wheelchair", "🧑‍🦽"), 1243 | ("person_in_motorized_wheelchair", "🧑‍🦼"), 1244 | ("person_in_tuxedo", "🤵"), 1245 | ("person_red_hair", "🧑‍🦰"), 1246 | ("person_white_hair", "🧑‍🦳"), 1247 | ("person_with_probing_cane", "🧑‍🦯"), 1248 | ("person_with_turban", "👳"), 1249 | ("person_with_veil", "👰"), 1250 | ("peru", "🇵🇪"), 1251 | ("petri_dish", "🧫"), 1252 | ("philippines", "🇵🇭"), 1253 | ("phone", "☎️"), 1254 | ("pick", "⛏️"), 1255 | ("pickup_truck", "🛻"), 1256 | ("pie", "🥧"), 1257 | ("pig", "🐷"), 1258 | ("pig2", "🐖"), 1259 | ("pig_nose", "🐽"), 1260 | ("pill", "💊"), 1261 | ("pilot", "🧑‍✈️"), 1262 | ("pinata", "🪅"), 1263 | ("pinched_fingers", "🤌"), 1264 | ("pinching_hand", "🤏"), 1265 | ("pineapple", "🍍"), 1266 | ("ping_pong", "🏓"), 1267 | ("pirate_flag", "🏴‍☠️"), 1268 | ("pisces", "♓"), 1269 | ("pitcairn_islands", "🇵🇳"), 1270 | ("pizza", "🍕"), 1271 | ("placard", "🪧"), 1272 | ("place_of_worship", "🛐"), 1273 | ("plate_with_cutlery", "🍽️"), 1274 | ("play_or_pause_button", "⏯️"), 1275 | ("pleading_face", "🥺"), 1276 | ("plunger", "🪠"), 1277 | ("point_down", "👇"), 1278 | ("point_left", "👈"), 1279 | ("point_right", "👉"), 1280 | ("point_up", "☝️"), 1281 | ("point_up_2", "👆"), 1282 | ("poland", "🇵🇱"), 1283 | ("polar_bear", "🐻‍❄️"), 1284 | ("police_car", "🚓"), 1285 | ("police_officer", "👮"), 1286 | ("policeman", "👮‍♂️"), 1287 | ("policewoman", "👮‍♀️"), 1288 | ("poodle", "🐩"), 1289 | ("poop", "💩"), 1290 | ("popcorn", "🍿"), 1291 | ("portugal", "🇵🇹"), 1292 | ("post_office", "🏣"), 1293 | ("postal_horn", "📯"), 1294 | ("postbox", "📮"), 1295 | ("potable_water", "🚰"), 1296 | ("potato", "🥔"), 1297 | ("potted_plant", "🪴"), 1298 | ("pouch", "👝"), 1299 | ("poultry_leg", "🍗"), 1300 | ("pound", "💷"), 1301 | ("pout", "😡"), 1302 | ("pouting_cat", "😾"), 1303 | ("pouting_face", "🙎"), 1304 | ("pouting_man", "🙎‍♂️"), 1305 | ("pouting_woman", "🙎‍♀️"), 1306 | ("pray", "🙏"), 1307 | ("prayer_beads", "📿"), 1308 | ("pregnant_woman", "🤰"), 1309 | 
("pretzel", "🥨"), 1310 | ("previous_track_button", "⏮️"), 1311 | ("prince", "🤴"), 1312 | ("princess", "👸"), 1313 | ("printer", "🖨️"), 1314 | ("probing_cane", "🦯"), 1315 | ("puerto_rico", "🇵🇷"), 1316 | ("punch", "👊"), 1317 | ("purple_circle", "🟣"), 1318 | ("purple_heart", "💜"), 1319 | ("purple_square", "🟪"), 1320 | ("purse", "👛"), 1321 | ("pushpin", "📌"), 1322 | ("put_litter_in_its_place", "🚮"), 1323 | ("qatar", "🇶🇦"), 1324 | ("question", "❓"), 1325 | ("rabbit", "🐰"), 1326 | ("rabbit2", "🐇"), 1327 | ("raccoon", "🦝"), 1328 | ("racehorse", "🐎"), 1329 | ("racing_car", "🏎️"), 1330 | ("radio", "📻"), 1331 | ("radio_button", "🔘"), 1332 | ("radioactive", "☢️"), 1333 | ("rage", "😡"), 1334 | ("railway_car", "🚃"), 1335 | ("railway_track", "🛤️"), 1336 | ("rainbow", "🌈"), 1337 | ("rainbow_flag", "🏳️‍🌈"), 1338 | ("raised_back_of_hand", "🤚"), 1339 | ("raised_eyebrow", "🤨"), 1340 | ("raised_hand", "✋"), 1341 | ("raised_hand_with_fingers_splayed", "🖐️"), 1342 | ("raised_hands", "🙌"), 1343 | ("raising_hand", "🙋"), 1344 | ("raising_hand_man", "🙋‍♂️"), 1345 | ("raising_hand_woman", "🙋‍♀️"), 1346 | ("ram", "🐏"), 1347 | ("ramen", "🍜"), 1348 | ("rat", "🐀"), 1349 | ("razor", "🪒"), 1350 | ("receipt", "🧾"), 1351 | ("record_button", "⏺️"), 1352 | ("recycle", "♻️"), 1353 | ("red_car", "🚗"), 1354 | ("red_circle", "🔴"), 1355 | ("red_envelope", "🧧"), 1356 | ("red_haired_man", "👨‍🦰"), 1357 | ("red_haired_woman", "👩‍🦰"), 1358 | ("red_square", "🟥"), 1359 | ("registered", "®️"), 1360 | ("relaxed", "☺️"), 1361 | ("relieved", "😌"), 1362 | ("reminder_ribbon", "🎗️"), 1363 | ("repeat", "🔁"), 1364 | ("repeat_one", "🔂"), 1365 | ("rescue_worker_helmet", "⛑️"), 1366 | ("restroom", "🚻"), 1367 | ("reunion", "🇷🇪"), 1368 | ("revolving_hearts", "💞"), 1369 | ("rewind", "⏪"), 1370 | ("rhinoceros", "🦏"), 1371 | ("ribbon", "🎀"), 1372 | ("rice", "🍚"), 1373 | ("rice_ball", "🍙"), 1374 | ("rice_cracker", "🍘"), 1375 | ("rice_scene", "🎑"), 1376 | ("right_anger_bubble", "🗯️"), 1377 | ("ring", "💍"), 1378 | ("ringed_planet", "🪐"), 1379 | ("robot", "🤖"), 1380 | ("rock", "🪨"), 1381 | ("rocket", "🚀"), 1382 | ("rofl", "🤣"), 1383 | ("roll_eyes", "🙄"), 1384 | ("roll_of_paper", "🧻"), 1385 | ("roller_coaster", "🎢"), 1386 | ("roller_skate", "🛼"), 1387 | ("romania", "🇷🇴"), 1388 | ("rooster", "🐓"), 1389 | ("rose", "🌹"), 1390 | ("rosette", "🏵️"), 1391 | ("rotating_light", "🚨"), 1392 | ("round_pushpin", "📍"), 1393 | ("rowboat", "🚣"), 1394 | ("rowing_man", "🚣‍♂️"), 1395 | ("rowing_woman", "🚣‍♀️"), 1396 | ("ru", "🇷🇺"), 1397 | ("rugby_football", "🏉"), 1398 | ("runner", "🏃"), 1399 | ("running", "🏃"), 1400 | ("running_man", "🏃‍♂️"), 1401 | ("running_shirt_with_sash", "🎽"), 1402 | ("running_woman", "🏃‍♀️"), 1403 | ("rwanda", "🇷🇼"), 1404 | ("sa", "🈂️"), 1405 | ("safety_pin", "🧷"), 1406 | ("safety_vest", "🦺"), 1407 | ("sagittarius", "♐"), 1408 | ("sailboat", "⛵"), 1409 | ("sake", "🍶"), 1410 | ("salt", "🧂"), 1411 | ("samoa", "🇼🇸"), 1412 | ("san_marino", "🇸🇲"), 1413 | ("sandal", "👡"), 1414 | ("sandwich", "🥪"), 1415 | ("santa", "🎅"), 1416 | ("sao_tome_principe", "🇸🇹"), 1417 | ("sari", "🥻"), 1418 | ("sassy_man", "💁‍♂️"), 1419 | ("sassy_woman", "💁‍♀️"), 1420 | ("satellite", "📡"), 1421 | ("satisfied", "😆"), 1422 | ("saudi_arabia", "🇸🇦"), 1423 | ("sauna_man", "🧖‍♂️"), 1424 | ("sauna_person", "🧖"), 1425 | ("sauna_woman", "🧖‍♀️"), 1426 | ("sauropod", "🦕"), 1427 | ("saxophone", "🎷"), 1428 | ("scarf", "🧣"), 1429 | ("school", "🏫"), 1430 | ("school_satchel", "🎒"), 1431 | ("scientist", "🧑‍🔬"), 1432 | ("scissors", "✂️"), 1433 | ("scorpion", "🦂"), 1434 | ("scorpius", "♏"), 1435 | 
("scotland", "🏴󠁧󠁢󠁳󠁣󠁴󠁿"), 1436 | ("scream", "😱"), 1437 | ("scream_cat", "🙀"), 1438 | ("screwdriver", "🪛"), 1439 | ("scroll", "📜"), 1440 | ("seal", "🦭"), 1441 | ("seat", "💺"), 1442 | ("secret", "㊙️"), 1443 | ("see_no_evil", "🙈"), 1444 | ("seedling", "🌱"), 1445 | ("selfie", "🤳"), 1446 | ("senegal", "🇸🇳"), 1447 | ("serbia", "🇷🇸"), 1448 | ("service_dog", "🐕‍🦺"), 1449 | ("seven", "7️⃣"), 1450 | ("sewing_needle", "🪡"), 1451 | ("seychelles", "🇸🇨"), 1452 | ("shallow_pan_of_food", "🥘"), 1453 | ("shamrock", "☘️"), 1454 | ("shark", "🦈"), 1455 | ("shaved_ice", "🍧"), 1456 | ("sheep", "🐑"), 1457 | ("shell", "🐚"), 1458 | ("shield", "🛡️"), 1459 | ("shinto_shrine", "⛩️"), 1460 | ("ship", "🚢"), 1461 | ("shirt", "👕"), 1462 | ("shit", "💩"), 1463 | ("shoe", "👞"), 1464 | ("shopping", "🛍️"), 1465 | ("shopping_cart", "🛒"), 1466 | ("shorts", "🩳"), 1467 | ("shower", "🚿"), 1468 | ("shrimp", "🦐"), 1469 | ("shrug", "🤷"), 1470 | ("shushing_face", "🤫"), 1471 | ("sierra_leone", "🇸🇱"), 1472 | ("signal_strength", "📶"), 1473 | ("singapore", "🇸🇬"), 1474 | ("singer", "🧑‍🎤"), 1475 | ("sint_maarten", "🇸🇽"), 1476 | ("six", "6️⃣"), 1477 | ("six_pointed_star", "🔯"), 1478 | ("skateboard", "🛹"), 1479 | ("ski", "🎿"), 1480 | ("skier", "⛷️"), 1481 | ("skull", "💀"), 1482 | ("skull_and_crossbones", "☠️"), 1483 | ("skunk", "🦨"), 1484 | ("sled", "🛷"), 1485 | ("sleeping", "😴"), 1486 | ("sleeping_bed", "🛌"), 1487 | ("sleepy", "😪"), 1488 | ("slightly_frowning_face", "🙁"), 1489 | ("slightly_smiling_face", "🙂"), 1490 | ("slot_machine", "🎰"), 1491 | ("sloth", "🦥"), 1492 | ("slovakia", "🇸🇰"), 1493 | ("slovenia", "🇸🇮"), 1494 | ("small_airplane", "🛩️"), 1495 | ("small_blue_diamond", "🔹"), 1496 | ("small_orange_diamond", "🔸"), 1497 | ("small_red_triangle", "🔺"), 1498 | ("small_red_triangle_down", "🔻"), 1499 | ("smile", "😄"), 1500 | ("smile_cat", "😸"), 1501 | ("smiley", "😃"), 1502 | ("smiley_cat", "😺"), 1503 | ("smiling_face_with_tear", "🥲"), 1504 | ("smiling_face_with_three_hearts", "🥰"), 1505 | ("smiling_imp", "😈"), 1506 | ("smirk", "😏"), 1507 | ("smirk_cat", "😼"), 1508 | ("smoking", "🚬"), 1509 | ("snail", "🐌"), 1510 | ("snake", "🐍"), 1511 | ("sneezing_face", "🤧"), 1512 | ("snowboarder", "🏂"), 1513 | ("snowflake", "❄️"), 1514 | ("snowman", "⛄"), 1515 | ("snowman_with_snow", "☃️"), 1516 | ("soap", "🧼"), 1517 | ("sob", "😭"), 1518 | ("soccer", "⚽"), 1519 | ("socks", "🧦"), 1520 | ("softball", "🥎"), 1521 | ("solomon_islands", "🇸🇧"), 1522 | ("somalia", "🇸🇴"), 1523 | ("soon", "🔜"), 1524 | ("sos", "🆘"), 1525 | ("sound", "🔉"), 1526 | ("south_africa", "🇿🇦"), 1527 | ("south_georgia_south_sandwich_islands", "🇬🇸"), 1528 | ("south_sudan", "🇸🇸"), 1529 | ("space_invader", "👾"), 1530 | ("spades", "♠️"), 1531 | ("spaghetti", "🍝"), 1532 | ("sparkle", "❇️"), 1533 | ("sparkler", "🎇"), 1534 | ("sparkles", "✨"), 1535 | ("sparkling_heart", "💖"), 1536 | ("speak_no_evil", "🙊"), 1537 | ("speaker", "🔈"), 1538 | ("speaking_head", "🗣️"), 1539 | ("speech_balloon", "💬"), 1540 | ("speedboat", "🚤"), 1541 | ("spider", "🕷️"), 1542 | ("spider_web", "🕸️"), 1543 | ("spiral_calendar", "🗓️"), 1544 | ("spiral_notepad", "🗒️"), 1545 | ("sponge", "🧽"), 1546 | ("spoon", "🥄"), 1547 | ("squid", "🦑"), 1548 | ("sri_lanka", "🇱🇰"), 1549 | ("st_barthelemy", "🇧🇱"), 1550 | ("st_helena", "🇸🇭"), 1551 | ("st_kitts_nevis", "🇰🇳"), 1552 | ("st_lucia", "🇱🇨"), 1553 | ("st_martin", "🇲🇫"), 1554 | ("st_pierre_miquelon", "🇵🇲"), 1555 | ("st_vincent_grenadines", "🇻🇨"), 1556 | ("stadium", "🏟️"), 1557 | ("standing_man", "🧍‍♂️"), 1558 | ("standing_person", "🧍"), 1559 | ("standing_woman", "🧍‍♀️"), 1560 | ("star", "⭐"), 
1561 | ("star2", "🌟"), 1562 | ("star_and_crescent", "☪️"), 1563 | ("star_of_david", "✡️"), 1564 | ("star_struck", "🤩"), 1565 | ("stars", "🌠"), 1566 | ("station", "🚉"), 1567 | ("statue_of_liberty", "🗽"), 1568 | ("steam_locomotive", "🚂"), 1569 | ("stethoscope", "🩺"), 1570 | ("stew", "🍲"), 1571 | ("stop_button", "⏹️"), 1572 | ("stop_sign", "🛑"), 1573 | ("stopwatch", "⏱️"), 1574 | ("straight_ruler", "📏"), 1575 | ("strawberry", "🍓"), 1576 | ("stuck_out_tongue", "😛"), 1577 | ("stuck_out_tongue_closed_eyes", "😝"), 1578 | ("stuck_out_tongue_winking_eye", "😜"), 1579 | ("student", "🧑‍🎓"), 1580 | ("studio_microphone", "🎙️"), 1581 | ("stuffed_flatbread", "🥙"), 1582 | ("sudan", "🇸🇩"), 1583 | ("sun_behind_large_cloud", "🌥️"), 1584 | ("sun_behind_rain_cloud", "🌦️"), 1585 | ("sun_behind_small_cloud", "🌤️"), 1586 | ("sun_with_face", "🌞"), 1587 | ("sunflower", "🌻"), 1588 | ("sunglasses", "😎"), 1589 | ("sunny", "☀️"), 1590 | ("sunrise", "🌅"), 1591 | ("sunrise_over_mountains", "🌄"), 1592 | ("superhero", "🦸"), 1593 | ("superhero_man", "🦸‍♂️"), 1594 | ("superhero_woman", "🦸‍♀️"), 1595 | ("supervillain", "🦹"), 1596 | ("supervillain_man", "🦹‍♂️"), 1597 | ("supervillain_woman", "🦹‍♀️"), 1598 | ("surfer", "🏄"), 1599 | ("surfing_man", "🏄‍♂️"), 1600 | ("surfing_woman", "🏄‍♀️"), 1601 | ("suriname", "🇸🇷"), 1602 | ("sushi", "🍣"), 1603 | ("suspension_railway", "🚟"), 1604 | ("svalbard_jan_mayen", "🇸🇯"), 1605 | ("swan", "🦢"), 1606 | ("swaziland", "🇸🇿"), 1607 | ("sweat", "😓"), 1608 | ("sweat_drops", "💦"), 1609 | ("sweat_smile", "😅"), 1610 | ("sweden", "🇸🇪"), 1611 | ("sweet_potato", "🍠"), 1612 | ("swim_brief", "🩲"), 1613 | ("swimmer", "🏊"), 1614 | ("swimming_man", "🏊‍♂️"), 1615 | ("swimming_woman", "🏊‍♀️"), 1616 | ("switzerland", "🇨🇭"), 1617 | ("symbols", "🔣"), 1618 | ("synagogue", "🕍"), 1619 | ("syria", "🇸🇾"), 1620 | ("syringe", "💉"), 1621 | ("t-rex", "🦖"), 1622 | ("taco", "🌮"), 1623 | ("tada", "🎉"), 1624 | ("taiwan", "🇹🇼"), 1625 | ("tajikistan", "🇹🇯"), 1626 | ("takeout_box", "🥡"), 1627 | ("tamale", "🫔"), 1628 | ("tanabata_tree", "🎋"), 1629 | ("tangerine", "🍊"), 1630 | ("tanzania", "🇹🇿"), 1631 | ("taurus", "♉"), 1632 | ("taxi", "🚕"), 1633 | ("tea", "🍵"), 1634 | ("teacher", "🧑‍🏫"), 1635 | ("teapot", "🫖"), 1636 | ("technologist", "🧑‍💻"), 1637 | ("teddy_bear", "🧸"), 1638 | ("telephone", "☎️"), 1639 | ("telephone_receiver", "📞"), 1640 | ("telescope", "🔭"), 1641 | ("tennis", "🎾"), 1642 | ("tent", "⛺"), 1643 | ("test_tube", "🧪"), 1644 | ("thailand", "🇹🇭"), 1645 | ("thermometer", "🌡️"), 1646 | ("thinking", "🤔"), 1647 | ("thong_sandal", "🩴"), 1648 | ("thought_balloon", "💭"), 1649 | ("thread", "🧵"), 1650 | ("three", "3️⃣"), 1651 | ("thumbsdown", "👎"), 1652 | ("thumbsup", "👍"), 1653 | ("ticket", "🎫"), 1654 | ("tickets", "🎟️"), 1655 | ("tiger", "🐯"), 1656 | ("tiger2", "🐅"), 1657 | ("timer_clock", "⏲️"), 1658 | ("timor_leste", "🇹🇱"), 1659 | ("tipping_hand_man", "💁‍♂️"), 1660 | ("tipping_hand_person", "💁"), 1661 | ("tipping_hand_woman", "💁‍♀️"), 1662 | ("tired_face", "😫"), 1663 | ("tm", "™️"), 1664 | ("togo", "🇹🇬"), 1665 | ("toilet", "🚽"), 1666 | ("tokelau", "🇹🇰"), 1667 | ("tokyo_tower", "🗼"), 1668 | ("tomato", "🍅"), 1669 | ("tonga", "🇹🇴"), 1670 | ("tongue", "👅"), 1671 | ("toolbox", "🧰"), 1672 | ("tooth", "🦷"), 1673 | ("toothbrush", "🪥"), 1674 | ("top", "🔝"), 1675 | ("tophat", "🎩"), 1676 | ("tornado", "🌪️"), 1677 | ("tr", "🇹🇷"), 1678 | ("trackball", "🖲️"), 1679 | ("tractor", "🚜"), 1680 | ("traffic_light", "🚥"), 1681 | ("train", "🚋"), 1682 | ("train2", "🚆"), 1683 | ("tram", "🚊"), 1684 | ("transgender_flag", "🏳️‍⚧️"), 1685 | 
("transgender_symbol", "⚧️"), 1686 | ("triangular_flag_on_post", "🚩"), 1687 | ("triangular_ruler", "📐"), 1688 | ("trident", "🔱"), 1689 | ("trinidad_tobago", "🇹🇹"), 1690 | ("tristan_da_cunha", "🇹🇦"), 1691 | ("triumph", "😤"), 1692 | ("trolleybus", "🚎"), 1693 | ("trophy", "🏆"), 1694 | ("tropical_drink", "🍹"), 1695 | ("tropical_fish", "🐠"), 1696 | ("truck", "🚚"), 1697 | ("trumpet", "🎺"), 1698 | ("tshirt", "👕"), 1699 | ("tulip", "🌷"), 1700 | ("tumbler_glass", "🥃"), 1701 | ("tunisia", "🇹🇳"), 1702 | ("turkey", "🦃"), 1703 | ("turkmenistan", "🇹🇲"), 1704 | ("turks_caicos_islands", "🇹🇨"), 1705 | ("turtle", "🐢"), 1706 | ("tuvalu", "🇹🇻"), 1707 | ("tv", "📺"), 1708 | ("twisted_rightwards_arrows", "🔀"), 1709 | ("two", "2️⃣"), 1710 | ("two_hearts", "💕"), 1711 | ("two_men_holding_hands", "👬"), 1712 | ("two_women_holding_hands", "👭"), 1713 | ("u5272", "🈹"), 1714 | ("u5408", "🈴"), 1715 | ("u55b6", "🈺"), 1716 | ("u6307", "🈯"), 1717 | ("u6708", "🈷️"), 1718 | ("u6709", "🈶"), 1719 | ("u6e80", "🈵"), 1720 | ("u7121", "🈚"), 1721 | ("u7533", "🈸"), 1722 | ("u7981", "🈲"), 1723 | ("u7a7a", "🈳"), 1724 | ("uganda", "🇺🇬"), 1725 | ("uk", "🇬🇧"), 1726 | ("ukraine", "🇺🇦"), 1727 | ("umbrella", "☔"), 1728 | ("unamused", "😒"), 1729 | ("underage", "🔞"), 1730 | ("unicorn", "🦄"), 1731 | ("united_arab_emirates", "🇦🇪"), 1732 | ("united_nations", "🇺🇳"), 1733 | ("unlock", "🔓"), 1734 | ("up", "🆙"), 1735 | ("upside_down_face", "🙃"), 1736 | ("uruguay", "🇺🇾"), 1737 | ("us", "🇺🇸"), 1738 | ("us_outlying_islands", "🇺🇲"), 1739 | ("us_virgin_islands", "🇻🇮"), 1740 | ("uzbekistan", "🇺🇿"), 1741 | ("v", "✌️"), 1742 | ("vampire", "🧛"), 1743 | ("vampire_man", "🧛‍♂️"), 1744 | ("vampire_woman", "🧛‍♀️"), 1745 | ("vanuatu", "🇻🇺"), 1746 | ("vatican_city", "🇻🇦"), 1747 | ("venezuela", "🇻🇪"), 1748 | ("vertical_traffic_light", "🚦"), 1749 | ("vhs", "📼"), 1750 | ("vibration_mode", "📳"), 1751 | ("video_camera", "📹"), 1752 | ("video_game", "🎮"), 1753 | ("vietnam", "🇻🇳"), 1754 | ("violin", "🎻"), 1755 | ("virgo", "♍"), 1756 | ("volcano", "🌋"), 1757 | ("volleyball", "🏐"), 1758 | ("vomiting_face", "🤮"), 1759 | ("vs", "🆚"), 1760 | ("vulcan_salute", "🖖"), 1761 | ("waffle", "🧇"), 1762 | ("wales", "🏴󠁧󠁢󠁷󠁬󠁳󠁿"), 1763 | ("walking", "🚶"), 1764 | ("walking_man", "🚶‍♂️"), 1765 | ("walking_woman", "🚶‍♀️"), 1766 | ("wallis_futuna", "🇼🇫"), 1767 | ("waning_crescent_moon", "🌘"), 1768 | ("waning_gibbous_moon", "🌖"), 1769 | ("warning", "⚠️"), 1770 | ("wastebasket", "🗑️"), 1771 | ("watch", "⌚"), 1772 | ("water_buffalo", "🐃"), 1773 | ("water_polo", "🤽"), 1774 | ("watermelon", "🍉"), 1775 | ("wave", "👋"), 1776 | ("wavy_dash", "〰️"), 1777 | ("waxing_crescent_moon", "🌒"), 1778 | ("waxing_gibbous_moon", "🌔"), 1779 | ("wc", "🚾"), 1780 | ("weary", "😩"), 1781 | ("wedding", "💒"), 1782 | ("weight_lifting", "🏋️"), 1783 | ("weight_lifting_man", "🏋️‍♂️"), 1784 | ("weight_lifting_woman", "🏋️‍♀️"), 1785 | ("western_sahara", "🇪🇭"), 1786 | ("whale", "🐳"), 1787 | ("whale2", "🐋"), 1788 | ("wheel_of_dharma", "☸️"), 1789 | ("wheelchair", "♿"), 1790 | ("white_check_mark", "✅"), 1791 | ("white_circle", "⚪"), 1792 | ("white_flag", "🏳️"), 1793 | ("white_flower", "💮"), 1794 | ("white_haired_man", "👨‍🦳"), 1795 | ("white_haired_woman", "👩‍🦳"), 1796 | ("white_heart", "🤍"), 1797 | ("white_large_square", "⬜"), 1798 | ("white_medium_small_square", "◽"), 1799 | ("white_medium_square", "◻️"), 1800 | ("white_small_square", "▫️"), 1801 | ("white_square_button", "🔳"), 1802 | ("wilted_flower", "🥀"), 1803 | ("wind_chime", "🎐"), 1804 | ("wind_face", "🌬️"), 1805 | ("window", "🪟"), 1806 | ("wine_glass", "🍷"), 1807 | ("wink", 
"😉"), 1808 | ("wolf", "🐺"), 1809 | ("woman", "👩"), 1810 | ("woman_artist", "👩‍🎨"), 1811 | ("woman_astronaut", "👩‍🚀"), 1812 | ("woman_beard", "🧔‍♀️"), 1813 | ("woman_cartwheeling", "🤸‍♀️"), 1814 | ("woman_cook", "👩‍🍳"), 1815 | ("woman_dancing", "💃"), 1816 | ("woman_facepalming", "🤦‍♀️"), 1817 | ("woman_factory_worker", "👩‍🏭"), 1818 | ("woman_farmer", "👩‍🌾"), 1819 | ("woman_feeding_baby", "👩‍🍼"), 1820 | ("woman_firefighter", "👩‍🚒"), 1821 | ("woman_health_worker", "👩‍⚕️"), 1822 | ("woman_in_manual_wheelchair", "👩‍🦽"), 1823 | ("woman_in_motorized_wheelchair", "👩‍🦼"), 1824 | ("woman_in_tuxedo", "🤵‍♀️"), 1825 | ("woman_judge", "👩‍⚖️"), 1826 | ("woman_juggling", "🤹‍♀️"), 1827 | ("woman_mechanic", "👩‍🔧"), 1828 | ("woman_office_worker", "👩‍💼"), 1829 | ("woman_pilot", "👩‍✈️"), 1830 | ("woman_playing_handball", "🤾‍♀️"), 1831 | ("woman_playing_water_polo", "🤽‍♀️"), 1832 | ("woman_scientist", "👩‍🔬"), 1833 | ("woman_shrugging", "🤷‍♀️"), 1834 | ("woman_singer", "👩‍🎤"), 1835 | ("woman_student", "👩‍🎓"), 1836 | ("woman_teacher", "👩‍🏫"), 1837 | ("woman_technologist", "👩‍💻"), 1838 | ("woman_with_headscarf", "🧕"), 1839 | ("woman_with_probing_cane", "👩‍🦯"), 1840 | ("woman_with_turban", "👳‍♀️"), 1841 | ("woman_with_veil", "👰‍♀️"), 1842 | ("womans_clothes", "👚"), 1843 | ("womans_hat", "👒"), 1844 | ("women_wrestling", "🤼‍♀️"), 1845 | ("womens", "🚺"), 1846 | ("wood", "🪵"), 1847 | ("woozy_face", "🥴"), 1848 | ("world_map", "🗺️"), 1849 | ("worm", "🪱"), 1850 | ("worried", "😟"), 1851 | ("wrench", "🔧"), 1852 | ("wrestling", "🤼"), 1853 | ("writing_hand", "✍️"), 1854 | ("x", "❌"), 1855 | ("yarn", "🧶"), 1856 | ("yawning_face", "🥱"), 1857 | ("yellow_circle", "🟡"), 1858 | ("yellow_heart", "💛"), 1859 | ("yellow_square", "🟨"), 1860 | ("yemen", "🇾🇪"), 1861 | ("yen", "💴"), 1862 | ("yin_yang", "☯️"), 1863 | ("yo_yo", "🪀"), 1864 | ("yum", "😋"), 1865 | ("zambia", "🇿🇲"), 1866 | ("zany_face", "🤪"), 1867 | ("zap", "⚡"), 1868 | ("zebra", "🦓"), 1869 | ("zero", "0️⃣"), 1870 | ("zimbabwe", "🇿🇼"), 1871 | ("zipper_mouth_face", "🤐"), 1872 | ("zombie", "🧟"), 1873 | ("zombie_man", "🧟‍♂️"), 1874 | ("zombie_woman", "🧟‍♀️"), 1875 | ("zzz", "💤"), 1876 | ]; 1877 | -------------------------------------------------------------------------------- /src/html.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | 3 | use crate::{ 4 | ast::{self, Attrs, Tag}, 5 | tree::get_string_content, 6 | Document, HtmlOpts, 7 | }; 8 | 9 | pub(crate) fn convert(opts: &HtmlOpts, doc: &Document) -> String { 10 | let refs = &doc.references; 11 | let mut ctx = Ctx { opts, refs, res: String::new() }; 12 | ctx.render_doc(doc); 13 | ctx.res 14 | } 15 | 16 | struct Ctx<'a> { 17 | #[allow(unused)] 18 | opts: &'a HtmlOpts, 19 | refs: &'a BTreeMap, 20 | res: String, 21 | } 22 | impl<'a> Ctx<'a> { 23 | fn render_doc(&mut self, doc: &Document) { 24 | for child in &doc.children { 25 | self.render(child) 26 | } 27 | } 28 | fn render(&mut self, tag: &Tag) { 29 | match tag { 30 | Tag::Heading(_) => todo!(), 31 | Tag::Para(para) => { 32 | self.render_tag("p", ¶.attrs); 33 | self.render_children(¶.children); 34 | self.out("
</p>"); 35 | self.out("\n") 36 | } 37 | Tag::Link(link) => { 38 | let mut attrs = Attrs::new(); 39 | let dest = self.resolve_reference(link.destination.as_deref(), link.reference.as_deref()); 40 | if let Some(dest) = dest { 41 | attrs.insert("href".to_string(), dest); 42 | } 43 | self.render_tag("a", &attrs); 44 | self.render_children(&link.children); 45 | self.out("</a>"); 46 | } 47 | Tag::Image(image) => { 48 | let mut attrs = Attrs::new(); 49 | let alt_text = get_string_content(&image.children); 50 | if !alt_text.is_empty() { 51 | attrs.insert("alt".to_string(), alt_text); 52 | } 53 | let dest = self.resolve_reference(image.destination.as_deref(), image.reference.as_deref()); 54 | if let Some(dest) = dest { 55 | attrs.insert("src".to_string(), dest); 56 | } 57 | self.render_tag("img", &attrs) 58 | } 59 | Tag::CodeBlock(code_block) => { 60 | self.render_tag("pre", &code_block.attrs); 61 | let mut attrs = Attrs::default(); 62 | if let Some(lang) = &code_block.lang { 63 | attrs.insert("class".to_string(), format!("language-{lang}")); 64 | } 65 | self.render_tag("code", &attrs); 66 | self.out_escape_html(&code_block.text); 67 | self.out("</code></pre>\n"); 68 | } 69 | Tag::Strong(strong) => { 70 | self.render_tag("strong", &strong.attrs); 71 | self.render_children(&strong.children); 72 | self.out("</strong>"); 73 | } 74 | Tag::Emph(emph) => { 75 | self.render_tag("em", &emph.attrs); 76 | self.render_children(&emph.children); 77 | self.out("</em>"); 78 | } 79 | Tag::DoubleQuoted(double_quoted) => { 80 | self.out("“"); 81 | self.render_children(&double_quoted.children); 82 | self.out("”"); 83 | } 84 | Tag::SoftBreak(_) => self.out("\n"), 85 | Tag::Url(url) => { 86 | let mut attrs = Attrs::new(); 87 | attrs.insert("href".to_string(), url.destination.clone()); 88 | self.render_tag("a", &attrs); 89 | self.out_escape_html(&url.destination); 90 | self.out("</a>"); 91 | } 92 | Tag::Str(str) => { 93 | if str.attrs.is_empty() { 94 | self.out_escape_html(&str.text); 95 | } else { 96 | self.render_tag("span", &str.attrs); 97 | self.out_escape_html(&str.text); 98 | self.out("</span>") 99 | } 100 | } 101 | Tag::Emoji(emoji) => { 102 | if let Some(emoji) = crate::emoji::find_emoji(&emoji.alias) { 103 | self.out(emoji); 104 | } else { 105 | self.out(&format!(":{}:", emoji.alias)); 106 | } 107 | } 108 | Tag::Verbatim(verbatim) => { 109 | self.render_tag("code", &verbatim.attrs); 110 | self.out_escape_html(&verbatim.text); 111 | self.out("</code>"); 112 | } 113 | Tag::Span(span) => { 114 | self.render_tag("span", &span.attrs); 115 | self.render_children(&span.children); 116 | self.out("</span>"); 117 | } 118 | Tag::Insert(insert) => { 119 | self.render_tag("ins", &insert.attrs); 120 | self.render_children(&insert.children); 121 | self.out("</ins>"); 122 | } 123 | Tag::Delete(delete) => { 124 | self.render_tag("del", &delete.attrs); 125 | self.render_children(&delete.children); 126 | self.out("</del>"); 127 | } 128 | Tag::Mark(mark) => { 129 | self.render_tag("mark", &mark.attrs); 130 | self.render_children(&mark.children); 131 | self.out("</mark>"); 132 | } 133 | Tag::Superscript(superscript) => { 134 | self.render_tag("sup", &superscript.attrs); 135 | self.render_children(&superscript.children); 136 | self.out("</sup>"); 137 | } 138 | Tag::Subscript(subscript) => { 139 | self.render_tag("sub", &subscript.attrs); 140 | self.render_children(&subscript.children); 141 | self.out("</sub>"); 142 | } 143 | Tag::EmDash(_) => self.out("—"), 144 | Tag::EnDash(_) => self.out("–"), 145 | } 146 | } 147 | 148 | fn render_children(&mut self, children: &[Tag]) { 149 | for child in children { 150 |
self.render(child) 151 | } 152 | } 153 | 154 | fn render_tag(&mut self, tag_name: &str, attrs: &Attrs) { 155 | self.out("<"); 156 | self.out(tag_name); 157 | for (k, v) in attrs { 158 | self.out(" "); 159 | self.out(k); 160 | self.out("="); 161 | self.out(&format!("{v:?}")); 162 | } 163 | self.out(">"); 164 | } 165 | 166 | fn resolve_reference( 167 | &self, 168 | destination: Option<&str>, 169 | reference: Option<&str>, 170 | ) -> Option<String> { 171 | if let Some(destination) = destination { 172 | return Some(destination.to_string()); 173 | } 174 | if let Some(reference) = reference { 175 | if let Some(reference_definition) = self.refs.get(reference) { 176 | return Some(reference_definition.destination.clone()); 177 | } 178 | } 179 | None 180 | } 181 | 182 | fn out(&mut self, s: &str) { 183 | self.res.push_str(s) 184 | } 185 | fn out_escape_html(&mut self, s: &str) { 186 | self.res.push_str(s) 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /src/inline.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::{BTreeMap, HashMap}, 3 | ops::Range, 4 | }; 5 | 6 | use crate::{ 7 | annot::{Annot, Atom, Comp}, 8 | attribute, 9 | patterns::{find_at, is_space, PatMatch}, 10 | Match, ParseOpts, 11 | }; 12 | 13 | #[derive(Default)] 14 | pub struct Tokenizer { 15 | opts: ParseOpts, 16 | subject: String, 17 | matches: BTreeMap<usize, Match>, 18 | openers: HashMap<u8, Vec<Opener>>, 19 | verbatim: usize, 20 | verbatim_type: Comp, 21 | destination: bool, 22 | firstpos: usize, 23 | lastpos: usize, 24 | allow_attributes: bool, 25 | attribute_tokenizer: Option<attribute::Tokenizer>, 26 | attribute_start: usize, 27 | } 28 | 29 | #[derive(Debug, Clone)] 30 | struct Opener { 31 | range: Range<usize>, 32 | annot: &'static str, 33 | sub_range: Range<usize>, 34 | } 35 | 36 | impl Opener { 37 | fn new(range: Range<usize>) -> Opener { 38 | Opener { range, annot: "", sub_range: 0..0 } 39 | } 40 | } 41 | 42 | // allow up to 3 captures...
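For orientation, a small hypothetical example (not from the repo; the subject string, offsets, and test name are made up) of what the Lua-pattern wrapper `find_at` from src/patterns.rs returns — the `bounded_find` helper that follows simply discards a match that runs past `endpos`:

    #[test]
    fn find_at_sketch() {
        // "^`+" matches a run of backticks anchored at the given start offset.
        let m = crate::patterns::find_at("a `code` span", "^`+", 2);
        assert!(m.is_match);
        // byte range of the single matched backtick, reported in whole-subject offsets
        assert_eq!((m.start, m.end), (2, 3));
    }
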
43 | fn bounded_find(subj: &str, patt: &'static str, startpos: usize, endpos: usize) -> PatMatch { 44 | let mut m = find_at(subj, patt, startpos); 45 | if m.end > endpos { 46 | m = PatMatch::default() 47 | } 48 | m 49 | } 50 | 51 | impl Tokenizer { 52 | pub fn new(subject: String, opts: ParseOpts) -> Tokenizer { 53 | let mut res = Tokenizer::default(); 54 | res.allow_attributes = true; 55 | res.subject = subject; 56 | res.opts = opts; 57 | res 58 | } 59 | 60 | fn add_match(&mut self, range: Range<usize>, annotation: impl Into<Annot>) { 61 | let m = Match::new(range.clone(), annotation); 62 | self.matches.insert(range.start, m); 63 | } 64 | 65 | fn add_opener(&mut self, name: u8, opener: Opener) { 66 | self.openers.entry(name).or_default().push(opener) 67 | } 68 | 69 | fn clear_openers(&mut self, startpos: usize, endpos: usize) { 70 | for v in self.openers.values_mut() { 71 | v.retain(|it| !(startpos <= it.range.start && it.range.end <= endpos)) 72 | } 73 | } 74 | 75 | fn str_matches(&mut self, startpos: usize, endpos: usize) { 76 | for i in startpos..endpos { 77 | if let Some(m) = self.matches.get_mut(&i) { 78 | if m.is_not(Atom::Str) && m.is_not(Atom::Escape) { 79 | m.a = Atom::Str.into(); 80 | } 81 | } 82 | } 83 | } 84 | 85 | fn between_matched(&mut self, pos: usize, c: u8, annotation: Comp, defaultmatch: Atom) -> usize { 86 | self.between_matched_impl( 87 | pos, 88 | c, 89 | annotation, 90 | defaultmatch, 91 | Option::<fn(&str, usize) -> PatMatch>::None, 92 | ) 93 | } 94 | 95 | fn between_matched_with_open_test( 96 | &mut self, 97 | pos: usize, 98 | c: u8, 99 | annotation: Comp, 100 | defaultmatch: Atom, 101 | open_test: impl FnOnce(&str, usize) -> PatMatch, 102 | ) -> usize { 103 | self.between_matched_impl(pos, c, annotation, defaultmatch, Some(open_test)) 104 | } 105 | 106 | fn between_matched_impl( 107 | &mut self, 108 | pos: usize, 109 | c: u8, 110 | annotation: Comp, 111 | mut defaultmatch: Atom, 112 | opentest: Option<impl FnOnce(&str, usize) -> PatMatch>, 113 | ) -> usize { 114 | debug_assert!(self.subject[pos..].as_bytes().starts_with(&[c])); 115 | 116 | let mut can_open = find_at(&self.subject, "^%S", pos + 1).is_match; 117 | let mut can_close = !self.subject[..pos].ends_with(is_space); 118 | let has_open_marker = 119 | pos != 0 && self.matches.get(&(pos - 1)).map_or(false, |it| it.is(Atom::OpenMarker)); 120 | let has_close_marker = self.subject.as_bytes()[pos + 1] == b'}'; 121 | let mut startopener = pos; 122 | let mut endcloser = pos + 1; 123 | 124 | if let Some(opentest) = opentest { 125 | can_open = can_open && opentest(&self.subject, pos).is_match; 126 | } 127 | 128 | // allow explicit open/close markers to override: 129 | if has_open_marker { 130 | can_open = true; 131 | can_close = false; 132 | startopener = pos - 1; 133 | } 134 | if !has_open_marker && has_close_marker { 135 | can_close = true; 136 | can_open = false; 137 | endcloser = pos + 2; 138 | } 139 | 140 | if has_open_marker && defaultmatch.is_right_atom() { 141 | defaultmatch = defaultmatch.corresponding_left_atom(); 142 | } else if has_close_marker && defaultmatch.is_left_atom() { 143 | defaultmatch = defaultmatch.corresponding_right_atom(); 144 | } 145 | 146 | let openers = self.openers.entry(c).or_default(); 147 | if can_close && openers.len() > 0 { 148 | // check openers for a match 149 | let opener = openers.last().unwrap().clone(); 150 | if opener.range.end != pos { 151 | // exclude empty emph 152 | self.clear_openers(opener.range.start, pos + 1); 153 | self.add_match(opener.range.clone(), Annot::Add(annotation)); 154 | self.add_match(pos..endcloser,
Annot::Sub(annotation)); 155 | return endcloser; 156 | } 157 | } 158 | // if we get here, we didn't match an opener 159 | if can_open { 160 | self.add_opener(c, Opener::new(startopener..pos + 1)); 161 | self.add_match(startopener..pos + 1, defaultmatch); 162 | pos + 1 163 | } else { 164 | self.add_match(startopener..endcloser, defaultmatch); 165 | endcloser 166 | } 167 | } 168 | 169 | fn matchers(&mut self, c: u8, pos: usize, endpos: usize) -> Option { 170 | match c { 171 | b'`' => { 172 | let m = bounded_find(&self.subject, "^`*", pos, endpos); 173 | if !m.is_match { 174 | return None; 175 | } 176 | // TODO: display/inline math 177 | 178 | self.add_match(pos..m.end, Annot::Add(Comp::Verbatim)); 179 | self.verbatim_type = Comp::Verbatim; 180 | 181 | self.verbatim = m.end - pos; 182 | return Some(m.end); 183 | } 184 | b'\\' => { 185 | let m = bounded_find(&self.subject, "^[ \t]*\r?\n", pos + 1, endpos); 186 | self.add_match(pos..pos + 1, Atom::Escape); 187 | 188 | if m.is_match { 189 | // see f there were preceding spaces 190 | if let Some((_, mm)) = self.matches.iter().rev().next() { 191 | let sp = mm.range.start; 192 | let mut ep = mm.range.end; 193 | if mm.is(Atom::Str) { 194 | while self.subject.as_bytes()[ep] == b' ' || self.subject.as_bytes()[ep] == b'\t' { 195 | ep = ep - 1 196 | } 197 | if sp == ep { 198 | self.matches.remove(&sp); 199 | } else { 200 | self.add_match(sp..ep, Atom::Str) 201 | } 202 | } 203 | } 204 | self.add_match(pos + 1..m.end, Atom::Hardbreak); 205 | return Some(m.end); 206 | } else { 207 | let m = bounded_find(&self.subject, "^[%p ]", pos + 1, endpos); 208 | if !m.is_match { 209 | self.add_match(pos..pos + 1, Atom::Str); 210 | return Some(pos + 1); 211 | } else { 212 | self.add_match(pos..pos + 1, Atom::Escape); 213 | if find_at(&self.subject, "^ ", pos + 1).is_match { 214 | self.add_match(pos + 1..m.end, Atom::Nbsp) 215 | } else { 216 | self.add_match(pos + 1..m.end, Atom::Str) 217 | } 218 | return Some(m.end); 219 | } 220 | } 221 | } 222 | b'<' => { 223 | let url = bounded_find(&self.subject, "^%<[^<>%s]+%>", pos, endpos); 224 | if url.is_match { 225 | let is_url = bounded_find(&self.subject, "^%a+:", pos + 1, url.end).is_match; 226 | let is_email = bounded_find(&self.subject, "^[^:]+%@", pos + 1, url.end).is_match; 227 | if is_email { 228 | self.add_match(url.start..url.start + 1, Comp::Email.add()); 229 | self.add_match(url.start + 1..url.end - 1, Atom::Str); 230 | self.add_match(url.end - 1..url.end, Comp::Email.sub()); 231 | return Some(url.end); 232 | } else if is_url { 233 | self.add_match(url.start..url.start + 1, Comp::Url.add()); 234 | self.add_match(url.start + 1..url.end - 1, Atom::Str); 235 | self.add_match(url.end - 1..url.end, Comp::Url.sub()); 236 | return Some(url.end); 237 | } 238 | } 239 | return None; 240 | } 241 | b'~' => Some(self.between_matched(pos, b'~', Comp::Subscript, Atom::Str)), 242 | b'^' => Some(self.between_matched(pos, b'^', Comp::Superscript, Atom::Str)), 243 | b'[' => { 244 | let m = bounded_find(&self.subject, "^%^([^]]+)%]", pos + 1, endpos); 245 | if m.is_match { 246 | self.add_match(pos..m.end, Atom::FootnoteReference); 247 | return Some(m.end); 248 | } else { 249 | self.add_opener(b'[', Opener::new(pos..pos + 1)); 250 | self.add_match(pos..pos + 1, Atom::Str); 251 | return Some(pos + 1); 252 | } 253 | } 254 | b']' => { 255 | let openers = self.openers.entry(b'[').or_default(); 256 | if openers.len() > 0 { 257 | let opener = openers.last_mut().unwrap(); 258 | if opener.annot == "reference_link" { 259 | let opener = 
opener.clone(); 260 | // found a reference link 261 | // add the matches 262 | let is_image = self.subject[..opener.range.start].ends_with('!') 263 | && !self.subject[..opener.range.start].ends_with("[]"); 264 | if is_image { 265 | self.add_match(opener.range.start - 1..opener.range.start, Atom::ImageMarker); 266 | self.add_match(opener.range.clone(), Comp::Imagetext.add()); 267 | self.add_match(opener.sub_range.clone(), Comp::Imagetext.sub()); 268 | } else { 269 | self.add_match(opener.range.clone(), Comp::Linktext.add()); 270 | self.add_match(opener.sub_range.clone(), Comp::Linktext.sub()); 271 | } 272 | self.add_match(opener.sub_range.end - 1..opener.sub_range.end, Comp::Reference.add()); 273 | self.add_match(pos..pos, Comp::Reference.sub()); 274 | // convert all matches to str 275 | self.str_matches(opener.sub_range.end, pos); 276 | // remove from openers 277 | self.clear_openers(opener.range.start, pos); 278 | return Some(pos + 1); 279 | } else if bounded_find(&self.subject, "^[%[]", pos + 1, endpos).is_match { 280 | opener.annot = "reference_link"; 281 | opener.sub_range.start = pos; // intermediate ] 282 | opener.sub_range.end = pos + 2; // intermediate [ 283 | self.add_match(pos..pos + 2, Atom::Str); 284 | return Some(pos + 2); 285 | } else if bounded_find(&self.subject, "^[(]", pos + 1, endpos).is_match { 286 | opener.annot = "explicit_link"; 287 | opener.sub_range.start = pos; // intermediate ] 288 | opener.sub_range.end = pos + 2; // intermediate ( 289 | self.openers.remove(&b'('); // clear ( openers 290 | self.destination = true; 291 | self.add_match(pos..pos + 2, Atom::Str); 292 | return Some(pos + 2); 293 | } else if bounded_find(&self.subject, "^%{", pos + 1, endpos).is_match { 294 | let opener = opener.clone(); 295 | // assume this is attributes, bracketed span 296 | self.add_match(opener.range.clone(), Comp::Span.add()); 297 | self.add_match(pos..pos + 1, Comp::Span.sub()); 298 | // remove any openers between [ and ] 299 | self.clear_openers(opener.range.start, pos); 300 | return Some(pos + 1); 301 | } 302 | } 303 | return None; 304 | } 305 | b'(' => { 306 | if !self.destination { 307 | return None; 308 | } 309 | self.add_opener(b'(', Opener::new(pos..pos + 1)); 310 | self.add_match(pos..pos + 1, Atom::Str); 311 | return Some(pos + 1); 312 | } 313 | b')' => { 314 | if !self.destination { 315 | return None; 316 | } 317 | let parens = self.openers.entry(b'(').or_default(); 318 | if parens.len() > 0 { 319 | // TODO? 
320 | parens.pop(); 321 | self.add_match(pos..pos + 1, Atom::Str); 322 | return Some(pos + 1); 323 | } else { 324 | let openers = &self.openers.entry(b'[').or_default().clone(); 325 | if let Some(opener) = openers.last().cloned() { 326 | if opener.annot == "explicit_link" { 327 | let (startdest, enddest) = (opener.sub_range.end - 1, pos); 328 | // we have inline link 329 | let is_image = self.subject[..opener.range.start].ends_with('!') 330 | && !self.subject[..opener.range.start].ends_with("[]"); 331 | if is_image { 332 | self.add_match(opener.range.start - 1..opener.range.start, Atom::ImageMarker); 333 | self.add_match(opener.range.clone(), Comp::Imagetext.add()); 334 | self.add_match(opener.sub_range.clone(), Comp::Imagetext.sub()); 335 | } else { 336 | self.add_match(opener.range.clone(), Comp::Linktext.add()); 337 | self.add_match(opener.sub_range.clone(), Comp::Linktext.sub()); 338 | } 339 | self.add_match(startdest..startdest + 1, Comp::Destination.add()); 340 | self.add_match(enddest..enddest + 1, Comp::Destination.sub()); 341 | self.destination = false; 342 | // convert all matches to str 343 | self.str_matches(opener.sub_range.end + 1, pos); 344 | // remove from openers 345 | self.clear_openers(opener.range.start, pos); 346 | return Some(enddest + 1); 347 | } 348 | } 349 | return None; 350 | } 351 | } 352 | b'_' => Some(self.between_matched(pos, b'_', Comp::Emph, Atom::Str)), 353 | b'*' => Some(self.between_matched(pos, b'*', Comp::Strong, Atom::Str)), 354 | b'{' => { 355 | if self.subject[pos + 1..endpos].starts_with(|c: char| "_*~^+='\"-".contains(c)) { 356 | self.add_match(pos..pos + 1, Atom::OpenMarker); 357 | return Some(pos + 1); 358 | } else if self.allow_attributes { 359 | self.attribute_tokenizer = Some(attribute::Tokenizer::new(self.subject.clone())); 360 | self.attribute_start = pos; 361 | return Some(pos); 362 | } else { 363 | // disabling allow_attributes only lasts 364 | // for one potential attribute start {, and then is re-enabled 365 | self.allow_attributes = true; 366 | self.add_match(pos..pos + 1, Atom::Str); 367 | return Some(pos + 1); 368 | } 369 | } 370 | b':' => { 371 | let m = bounded_find(&self.subject, "^%:[%w_+-]+%:", pos, endpos); 372 | if m.is_match { 373 | self.add_match(m.start..m.end, Atom::Emoji); 374 | return Some(m.end); 375 | } else { 376 | self.add_match(pos..pos + 1, Atom::Str); 377 | return Some(pos + 1); 378 | } 379 | } 380 | b'+' => Some(self.between_matched_with_open_test( 381 | pos, 382 | b'+', 383 | Comp::Insert, 384 | Atom::Str, 385 | |subject, pos| { 386 | find_at(subject, "^%{", pos - 1).or_else(|| find_at(subject, "^%}", pos + 1)) 387 | }, 388 | )), 389 | b'=' => Some(self.between_matched_with_open_test( 390 | pos, 391 | b'=', 392 | Comp::Mark, 393 | Atom::Str, 394 | |subject, pos| { 395 | find_at(subject, "^%{", pos - 1).or_else(|| find_at(subject, "^%}", pos + 1)) 396 | }, 397 | )), 398 | b'\'' => todo!(), 399 | b'"' => Some(self.between_matched(pos, b'"', Comp::DoubleQuoted, Atom::LeftDoubleQuote)), 400 | b'-' => { 401 | let subject = &self.subject[..]; 402 | if subject.as_bytes().get(pos - 1) == Some(&b'{') 403 | || subject.as_bytes().get(pos + 1) == Some(&b'}') 404 | { 405 | return Some(self.between_matched_with_open_test( 406 | pos, 407 | b'-', 408 | Comp::Delete, 409 | Atom::Str, 410 | |subject, pos| { 411 | find_at(subject, "^%{", pos - 1).or_else(|| find_at(subject, "^%}", pos + 1)) 412 | }, 413 | )); 414 | } 415 | 416 | let ep = find_at(subject, "^%-*", pos).end.min(endpos); 417 | let mut hyphens = ep - pos; 418 | if 
subject.as_bytes().get(ep) == Some(&b'}') { 419 | // last hyphen is close del 420 | hyphens -= 1; 421 | } 422 | if hyphens == 0 { 423 | self.add_match(pos..pos + 2, Atom::Str); 424 | return Some(pos + 2); 425 | } 426 | let mut pos = pos; 427 | let all_em = hyphens % 3 == 0; 428 | let all_en = hyphens % 2 == 0; 429 | while hyphens > 0 { 430 | if all_em { 431 | self.add_match(pos..pos + 3, Atom::EmDash); 432 | pos += 3; 433 | hyphens -= 3; 434 | } else if all_en { 435 | self.add_match(pos..pos + 2, Atom::EnDash); 436 | pos += 2; 437 | hyphens -= 2; 438 | } else if hyphens >= 3 && (hyphens % 2 != 0 || hyphens > 4) { 439 | self.add_match(pos..pos + 3, Atom::EmDash); 440 | pos += 3; 441 | hyphens -= 3; 442 | } else if hyphens >= 2 { 443 | self.add_match(pos..pos + 2, Atom::EnDash); 444 | pos += 2; 445 | hyphens -= 2; 446 | } else { 447 | self.add_match(pos..pos + 1, Atom::Str); 448 | pos += 1; 449 | hyphens -= 1; 450 | } 451 | } 452 | Some(pos) 453 | } 454 | b'.' => { 455 | if bounded_find(&self.subject, "^%.%.", pos + 1, endpos).is_match { 456 | self.add_match(pos..pos + 3, Atom::Ellipses); 457 | return Some(pos + 3); 458 | } 459 | return None; 460 | } 461 | _ => return None, 462 | } 463 | } 464 | 465 | fn single_char(&mut self, pos: usize) -> usize { 466 | self.add_match(pos..pos + 1, Atom::Str); 467 | pos + 1 468 | } 469 | 470 | // Feed a slice to the parser, updating state. 471 | pub fn feed(&mut self, spos: usize, endpos: usize) { 472 | let special = "[%]%[\\`{}_*()!<>~^:=+$\r\n'\".-]"; 473 | let subject = self.subject.clone(); 474 | if spos < self.firstpos { 475 | self.firstpos = spos 476 | } 477 | if endpos > self.lastpos { 478 | self.lastpos = endpos 479 | } 480 | let mut pos = spos; 481 | while pos < endpos { 482 | if let Some(mut attribute_tokenizer) = self.attribute_tokenizer.take() { 483 | let sp = pos; 484 | let m = bounded_find(&self.subject, special, pos, endpos); 485 | let ep2 = if m.is_match { m.start } else { endpos }; 486 | let (status, ep) = attribute_tokenizer.feed(sp, ep2); 487 | match status { 488 | attribute::Status::Done => { 489 | let attribute_start = self.attribute_start; 490 | // add attribute matches 491 | self.add_match(attribute_start..attribute_start + 1, Comp::Attributes.add()); 492 | self.add_match(ep..ep + 1, Comp::Attributes.sub()); 493 | let attr_matches = attribute_tokenizer.get_matches(); 494 | for m in attr_matches { 495 | self.add_match(m.range, m.a); 496 | } 497 | self.attribute_tokenizer = None; 498 | self.attribute_start = !0; 499 | pos = ep + 1; 500 | } 501 | attribute::Status::Fail => { 502 | pos = self.attribute_start; 503 | self.allow_attributes = false; 504 | self.attribute_tokenizer = None; 505 | self.attribute_start = !0; 506 | } 507 | attribute::Status::Continue => { 508 | self.attribute_tokenizer = Some(attribute_tokenizer); 509 | pos = ep 510 | } 511 | } 512 | } else { 513 | // find next interesting character: 514 | let newpos = bounded_find(&subject, special, pos, endpos).or(endpos); 515 | if newpos > pos { 516 | self.add_match(pos..newpos, Atom::Str); 517 | pos = newpos; 518 | if pos > endpos { 519 | break; // otherwise, fall through: 520 | } 521 | } 522 | // if we get here, then newpos = pos, 523 | // i.e. 
we have something interesting at pos 524 | let c = subject.as_bytes()[pos]; 525 | if c == b'\r' || c == b'\n' { 526 | if c == b'\r' && bounded_find(&subject, "^[%n]", pos + 1, endpos).is_match { 527 | self.add_match(pos..pos + 2, Atom::Softbreak); 528 | pos = pos + 2 529 | } else { 530 | self.add_match(pos..pos + 1, Atom::Softbreak); 531 | pos = pos + 1 532 | } 533 | } else if self.verbatim > 0 { 534 | if c == b'`' { 535 | let m = bounded_find(&subject, "^`+", pos, endpos); 536 | if m.is_match && m.end - pos == self.verbatim { 537 | // TODO: Check for raw attributes 538 | self.add_match(pos..m.end, self.verbatim_type.sub()); 539 | pos = m.end; 540 | self.verbatim = 0; 541 | self.verbatim_type = Comp::default(); 542 | } else { 543 | let endchar = m.end_or(endpos); 544 | self.add_match(pos..endchar, Atom::Str); 545 | pos = endchar 546 | } 547 | } else { 548 | self.add_match(pos..pos + 1, Atom::Str); 549 | pos = pos + 1 550 | } 551 | } else { 552 | pos = self.matchers(c, pos, endpos).unwrap_or_else(|| self.single_char(pos)) 553 | } 554 | } 555 | } 556 | } 557 | 558 | pub(crate) fn get_matches(&mut self) -> Vec<Match> { 559 | let mut sorted: Vec<Match> = Vec::new(); 560 | let mut m_last = Match::new(0..0, Atom::Ellipses); // TODO 561 | for i in self.firstpos..=self.lastpos { 562 | if let Some(m) = self.matches.get(&i) { 563 | if m.is(Atom::Str) && m_last.is(Atom::Str) && m_last.range.end == m.range.start { 564 | (*sorted.last_mut().unwrap()).range.end = m.range.end; 565 | m_last.range.end = m.range.end; 566 | } else { 567 | sorted.push(m.clone()); 568 | m_last = m.clone() 569 | } 570 | } 571 | } 572 | if sorted.len() > 0 { 573 | if sorted.last().unwrap().is(Atom::Softbreak) { 574 | // remove final softbreak 575 | sorted.pop(); 576 | } 577 | if self.verbatim > 0 { 578 | // unclosed verbatim 579 | let e = sorted.last().unwrap().range.end; 580 | sorted.push(Match::new(e..e, self.verbatim_type.sub())) 581 | } 582 | } 583 | sorted 584 | } 585 | } 586 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // TODO: re-export everything.
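Before the module declarations continue, a minimal usage sketch of the public API this file exposes (illustrative only, not part of lib.rs; the function name and input string are made up, and it assumes the crate is consumed as the `djot` dependency, the way main.rs drives it):

    fn render_sketch() -> (String, String) {
        // Parse Djot source into a Document, then render it two ways.
        let doc = djot::Document::parse("Hello, _world_!");
        (doc.to_html(), doc.to_json())
    }
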
2 | pub mod ast; 3 | 4 | mod annot; 5 | mod patterns; 6 | mod block; 7 | mod inline; 8 | mod attribute; 9 | mod tree; 10 | mod emoji; 11 | mod html; 12 | #[cfg(test)] 13 | mod sourcegen; 14 | 15 | use std::{collections::BTreeMap, ops::Range}; 16 | 17 | use crate::annot::Annot; 18 | 19 | #[derive(Debug, Default, Clone)] 20 | pub struct Document { 21 | pub children: Vec<ast::Tag>, 22 | pub references: BTreeMap, 23 | pub debug: String, 24 | } 25 | 26 | #[derive(Default, Clone)] 27 | pub struct ParseOpts { 28 | pub debug_matches: bool, 29 | } 30 | 31 | #[derive(Default, Clone)] 32 | pub struct HtmlOpts {} 33 | 34 | impl Document { 35 | pub fn parse(text: &str) -> Document { 36 | Document::parse_opts(ParseOpts::default(), text) 37 | } 38 | 39 | pub fn parse_opts(opts: ParseOpts, text: &str) -> Document { 40 | let mut p = block::Tokenizer::new(text.to_string(), opts); 41 | p.parse(); 42 | tree::build(p) 43 | } 44 | 45 | pub fn to_html(&self) -> String { 46 | self.to_html_opts(&HtmlOpts::default()) 47 | } 48 | 49 | pub fn to_html_opts(&self, opts: &HtmlOpts) -> String { 50 | html::convert(opts, self) 51 | } 52 | 53 | pub fn to_json(&self) -> String { 54 | #[derive(serde::Serialize)] 55 | struct DocRepr<'a> { 56 | tag: &'static str, 57 | children: &'a [ast::Tag], 58 | references: &'a BTreeMap, 59 | } 60 | serde_json::to_string_pretty(&DocRepr { 61 | tag: "doc", 62 | children: self.children.as_slice(), 63 | references: &self.references, 64 | }) 65 | .unwrap() 66 | } 67 | } 68 | 69 | #[derive(Debug, Clone)] 70 | struct Match { 71 | range: Range<usize>, 72 | a: Annot, 73 | } 74 | 75 | impl Match { 76 | fn new(range: Range<usize>, a: impl Into<Annot>) -> Match { 77 | Match { range, a: a.into() } 78 | } 79 | fn is(&self, annot: impl Into<Annot>) -> bool { 80 | self.a == annot.into() 81 | } 82 | fn is_not(&self, annot: impl Into<Annot>) -> bool { 83 | !self.is(annot) 84 | } 85 | } 86 | 87 | /// Appends formatted string to a `String`. 88 | macro_rules! _format_to { 89 | ($buf:expr) => (); 90 | ($buf:expr, $lit:literal $($arg:tt)*) => { 91 | { use ::std::fmt::Write as _; let _ = ::std::write!($buf, $lit $($arg)*); } 92 | }; 93 | } 94 | pub(crate) use _format_to as format_to; 95 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use anyhow::Context; 4 | use lexopt::{Arg::Long, Arg::Short, Arg::Value}; 5 | 6 | fn main() -> anyhow::Result<()> { 7 | let mut matches = false; 8 | let mut ast = false; 9 | let mut files = Vec::new(); 10 | 11 | let mut parser = lexopt::Parser::from_env(); 12 | while let Some(arg) = parser.next()?
{ 13 | match arg { 14 | Short('m') | Long("matches") => matches = true, 15 | Short('a') | Long("ast") => ast = true, 16 | Value(val) => files.push(val), 17 | _ => Err(arg.unexpected())?, 18 | } 19 | } 20 | 21 | let mut inputs = Vec::new(); 22 | if files.is_empty() { 23 | let content = std::io::read_to_string(std::io::stdin()).context("failed to read stdin")?; 24 | inputs.push(content) 25 | } else { 26 | for file in files { 27 | let path = PathBuf::from(file); 28 | let content = std::fs::read_to_string(&path) 29 | .with_context(|| format!("failed to read {}", path.display()))?; 30 | inputs.push(content) 31 | } 32 | } 33 | 34 | let opts = djot::ParseOpts { debug_matches: matches }; 35 | for content in inputs { 36 | let doc = djot::Document::parse_opts(opts.clone(), &content); 37 | if matches { 38 | println!("{}", doc.debug) 39 | } else if ast { 40 | println!("{}", doc.to_json()) 41 | } else { 42 | println!("{}", doc.to_html()) 43 | } 44 | } 45 | 46 | Ok(()) 47 | } 48 | -------------------------------------------------------------------------------- /src/patterns.rs: -------------------------------------------------------------------------------- 1 | use std::ops::Range; 2 | 3 | #[derive(Debug, Default)] 4 | pub struct PatMatch { 5 | pub is_match: bool, 6 | pub start: usize, 7 | pub end: usize, 8 | pub cap1: Range<usize>, 9 | pub cap2: Range<usize>, 10 | } 11 | 12 | impl PatMatch { 13 | pub(crate) fn or(&self, endpos: usize) -> usize { 14 | if self.is_match { 15 | self.start 16 | } else { 17 | endpos 18 | } 19 | } 20 | 21 | pub(crate) fn end_or(&self, endpos: usize) -> usize { 22 | if self.is_match { 23 | self.end 24 | } else { 25 | endpos 26 | } 27 | } 28 | 29 | pub(crate) fn or_else(self, f: impl FnOnce() -> Self) -> Self { 30 | if self.is_match { 31 | self 32 | } else { 33 | f() 34 | } 35 | } 36 | } 37 | 38 | pub fn find(subject: &str, pat: &'static str) -> PatMatch { 39 | find_at(subject, pat, 0) 40 | } 41 | 42 | pub fn find_at(subject: &str, pat: &'static str, start: usize) -> PatMatch { 43 | let mut pat = lua_patterns::LuaPattern::new(pat); 44 | let is_match = pat.matches(&subject[start..]); 45 | let range = pat.range(); 46 | let cap1 = pat.capture(1); 47 | let cap2 = pat.capture(2); 48 | PatMatch { 49 | start: range.start + start, 50 | end: range.end + start, 51 | is_match, 52 | cap1: cap1.start + start..cap1.end + start, 53 | cap2: cap2.start + start..cap2.end + start, 54 | } 55 | } 56 | 57 | pub(crate) fn is_space(c: char) -> bool { 58 | " \n\t".contains(c) 59 | } 60 | -------------------------------------------------------------------------------- /src/sourcegen.rs: -------------------------------------------------------------------------------- 1 | //!
Generates matches and ast structures 2 | mod annot; 3 | mod ast; 4 | 5 | use std::path::Path; 6 | 7 | fn camel_case(ident: &str) -> String { 8 | ident 9 | .split('_') 10 | .flat_map(|word| { 11 | word.chars().next().map(|it| it.to_ascii_uppercase()).into_iter().chain(word.chars().skip(1)) 12 | }) 13 | .collect() 14 | } 15 | 16 | fn ensure_content(path: &str, content: &str) { 17 | let base = Path::new(env!("CARGO_MANIFEST_DIR")); 18 | let path = base.join(path); 19 | let old = std::fs::read_to_string(&path).unwrap_or_default(); 20 | if normalize(&old) == normalize(content) { 21 | return; 22 | } 23 | std::fs::write(&path, content) 24 | .unwrap_or_else(|err| panic!("can't write {}: {err}", path.display())); 25 | } 26 | 27 | fn normalize(s: &str) -> String { 28 | s.split_ascii_whitespace().flat_map(|it| it.split(',')).collect() 29 | } 30 | -------------------------------------------------------------------------------- /src/sourcegen/annot.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | format_to, 3 | sourcegen::{camel_case, ensure_content}, 4 | }; 5 | 6 | const ANNOTATIONS: &str = " 7 | verbatim 8 | email 9 | url 10 | subscript 11 | superscript 12 | para 13 | code_block 14 | imagetext 15 | linktext 16 | reference 17 | destination 18 | emph 19 | strong 20 | span 21 | double_quoted 22 | reference_definition 23 | insert 24 | delete 25 | mark 26 | attributes 27 | 28 | str 29 | escape 30 | hardbreak 31 | nbsp 32 | blankline 33 | image_marker 34 | left_double_quote 35 | right_double_quote 36 | ellipses 37 | softbreak 38 | footnote_reference 39 | open_marker 40 | emoji 41 | reference_key 42 | reference_value 43 | code_language 44 | em_dash 45 | en_dash 46 | id 47 | key 48 | value 49 | class 50 | "; 51 | 52 | #[test] 53 | fn generate_annotations() { 54 | let (composites, atoms) = ANNOTATIONS.trim().split_once("\n\n").unwrap(); 55 | 56 | let mut buf = "\ 57 | use std::fmt; 58 | " 59 | .to_string(); 60 | 61 | emit_comp(&mut buf, composites); 62 | emit_atom(&mut buf, atoms); 63 | ensure_content("src/annot/generated.rs", &buf); 64 | } 65 | 66 | fn emit_comp(buf: &mut String, composites: &str) { 67 | format_to!( 68 | buf, 69 | "\ 70 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 71 | pub(crate) enum Comp {{ 72 | " 73 | ); 74 | for ident in composites.lines() { 75 | format_to!(buf, " {},\n", camel_case(ident)) 76 | } 77 | format_to!(buf, "}}\n"); 78 | 79 | let mut display_arms = String::new(); 80 | for ident in composites.lines() { 81 | format_to!(display_arms, " Comp::{} => \"{ident}\",\n", camel_case(ident)) 82 | } 83 | 84 | format_to!( 85 | buf, 86 | " 87 | impl fmt::Display for Comp {{ 88 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {{ 89 | f.write_str(match self {{ 90 | {display_arms} 91 | }}) 92 | }} 93 | }} 94 | " 95 | ); 96 | } 97 | 98 | fn emit_atom(buf: &mut String, atoms: &str) { 99 | let mut variants = String::new(); 100 | for ident in atoms.lines() { 101 | format_to!(variants, " {},\n", camel_case(ident)) 102 | } 103 | 104 | format_to!( 105 | buf, 106 | " 107 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 108 | pub(crate) enum Atom {{ 109 | {variants} 110 | }} 111 | " 112 | ); 113 | 114 | let mut left_atoms = String::new(); 115 | let mut right_atoms = String::new(); 116 | let mut ltr = String::new(); 117 | let mut rtl = String::new(); 118 | for ident in atoms.lines() { 119 | if ident.starts_with("left_") { 120 | format_to!(left_atoms, " | Atom::{}", camel_case(ident)); 121 | 
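// For each `left_*` atom, derive the matching `right_*` name and record both directions of the mapping used by the generated `corresponding_left_atom`/`corresponding_right_atom` methods below.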
let rident = &ident.replace("left", "right"); 122 | format_to!(ltr, "Atom::{} => Atom::{},\n", camel_case(ident), camel_case(rident)); 123 | format_to!(rtl, "Atom::{} => Atom::{},\n", camel_case(rident), camel_case(ident)); 124 | } 125 | if ident.starts_with("right_") { 126 | format_to!(right_atoms, " | Atom::{}", camel_case(ident)) 127 | } 128 | } 129 | 130 | format_to!( 131 | buf, 132 | " 133 | impl Atom {{ 134 | pub(crate) fn is_left_atom(self) -> bool {{ 135 | matches!(self, {left_atoms}) 136 | }} 137 | pub(crate) fn is_right_atom(self) -> bool {{ 138 | matches!(self, {right_atoms}) 139 | }} 140 | pub(crate) fn corresponding_left_atom(self) -> Atom {{ 141 | match self {{ 142 | {rtl} 143 | _ => self 144 | }} 145 | }} 146 | pub(crate) fn corresponding_right_atom(self) -> Atom {{ 147 | match self {{ 148 | {ltr} 149 | _ => self 150 | }} 151 | }} 152 | }} 153 | " 154 | ); 155 | 156 | let mut display_arms = String::new(); 157 | for ident in atoms.lines() { 158 | format_to!(display_arms, " Atom::{} => \"{ident}\",\n", camel_case(ident)) 159 | } 160 | 161 | format_to!( 162 | buf, 163 | " 164 | impl fmt::Display for Atom {{ 165 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {{ 166 | f.write_str(match self {{ 167 | {display_arms} 168 | }}) 169 | }} 170 | }} 171 | " 172 | ); 173 | } 174 | -------------------------------------------------------------------------------- /src/sourcegen/ast.rs: -------------------------------------------------------------------------------- 1 | use crate::{format_to, sourcegen::camel_case}; 2 | 3 | use crate::sourcegen::ensure_content; 4 | 5 | const TAGS: &str = " 6 | heading level: u32 7 | para 8 | link destination: Option<String>, reference: Option<String> 9 | image destination: Option<String>, reference: Option<String> 10 | code_block lang: Option<String>, text: String 11 | strong 12 | emph 13 | insert 14 | delete 15 | mark 16 | superscript 17 | subscript 18 | span 19 | double_quoted 20 | url destination: String 21 | 22 | soft_break 23 | em_dash 24 | en_dash 25 | verbatim text: String 26 | str text: String 27 | emoji alias: String 28 | "; 29 | 30 | #[test] 31 | fn generate_annotations() { 32 | let (composites, atoms) = TAGS.trim().split_once("\n\n").unwrap(); 33 | 34 | let mut buf = format!("use super::Attrs;\n"); 35 | emit_ast_comp(&mut buf, composites); 36 | emit_ast_atom(&mut buf, atoms); 37 | emit_ast_tag(&mut buf, composites, atoms); 38 | ensure_content("src/ast/generated.rs", &buf); 39 | } 40 | 41 | fn emit_ast_comp(buf: &mut String, composites: &str) { 42 | for comp in composites.lines() { 43 | let (ident, fields) = comp.split_once(" ").unwrap_or((comp, "")); 44 | let fields = if fields.is_empty() { 45 | String::new() 46 | } else { 47 | fields.split(", ").map(|it| format!("pub {it},\n")).collect::<String>() 48 | }; 49 | 50 | format_to! {buf, " 51 | #[derive(Debug, Default, Clone, serde::Serialize)] 52 | pub struct {} {{ 53 | #[serde(skip_serializing_if = \"Attrs::is_empty\")] 54 | pub attrs: Attrs, 55 | pub children: Vec<Tag>, 56 | {fields} 57 | }} 58 | ", camel_case(ident)} 59 | } 60 | } 61 | 62 | fn emit_ast_atom(buf: &mut String, atoms: &str) { 63 | for atom in atoms.lines() { 64 | let (ident, fields) = atom.split_once(" ").unwrap_or((atom, "")); 65 | let fields = if fields.is_empty() { 66 | String::new() 67 | } else { 68 | fields.split(", ").map(|it| format!("pub {it},\n")).collect::<String>() 69 | }; 70 | format_to!
{buf, " 71 | #[derive(Debug, Default, Clone, serde::Serialize)] 72 | pub struct {} {{ 73 | #[serde(skip_serializing_if = \"Attrs::is_empty\")] 74 | pub attrs: Attrs, 75 | {fields} 76 | }} 77 | ", camel_case(ident)} 78 | } 79 | } 80 | 81 | fn emit_ast_tag(buf: &mut String, composites: &str, atoms: &str) { 82 | let mut variants = String::new(); 83 | for comp in composites.lines() { 84 | let ident = comp.split_once(" ").map_or(comp, |it| it.0); 85 | let camel = camel_case(ident); 86 | format_to!(variants, " {camel}({camel}),\n"); 87 | } 88 | for atom in atoms.lines() { 89 | let ident = atom.split_once(" ").map_or(atom, |it| it.0); 90 | let camel = camel_case(ident); 91 | format_to!(variants, " {camel}({camel}),\n"); 92 | } 93 | format_to!( 94 | buf, 95 | " 96 | #[derive(Debug, Clone, serde::Serialize)] 97 | #[serde(tag = \"tag\", rename_all = \"snake_case\")] 98 | pub enum Tag {{ {variants} }} 99 | " 100 | ) 101 | } 102 | -------------------------------------------------------------------------------- /src/tree.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | 3 | use crate::{ 4 | annot::{Annot, Atom, Comp}, 5 | ast::{ 6 | Attrs, CodeBlock, Delete, DoubleQuoted, Emoji, Emph, Image, Insert, Link, Mark, Para, 7 | ReferenceDefinition, SoftBreak, Span, Str, Strong, Subscript, Superscript, Tag, Url, Verbatim, 8 | }, 9 | block, 10 | patterns::find, 11 | Document, Match, 12 | }; 13 | 14 | pub(crate) fn build(p: block::Tokenizer) -> Document { 15 | let mut ctx = Ctx { subject: p.subject, matches: p.matches, idx: 0, references: BTreeMap::new() }; 16 | let mut doc = ctx.get_doc(); 17 | doc.debug = p.debug; 18 | doc.references = ctx.references; 19 | doc 20 | } 21 | 22 | struct Ctx { 23 | subject: String, 24 | matches: Vec<Match>, 25 | references: BTreeMap<String, ReferenceDefinition>, 26 | idx: usize, 27 | } 28 | 29 | impl Ctx { 30 | fn get_doc(&mut self) -> Document { 31 | let mut res = Document::default(); 32 | while self.idx < self.matches.len() { 33 | self.get_tag(&mut res.children) 34 | } 35 | res 36 | } 37 | 38 | fn get_tag(&mut self, acc: &mut Vec<Tag>) { 39 | self.skip_trivia(); 40 | let m = self.matches[self.idx].clone(); 41 | self.idx += 1; 42 | let res = match m.a { 43 | Annot::Add(comp) => match comp { 44 | Comp::CodeBlock => Tag::CodeBlock(self.get_code_block()), 45 | Comp::Para => Tag::Para(self.get_para()), 46 | Comp::Verbatim => Tag::Verbatim(self.get_verbatim()), 47 | Comp::Strong => Tag::Strong(self.get_strong()), 48 | Comp::Emph => Tag::Emph(self.get_emph()), 49 | Comp::Insert => Tag::Insert(self.get_insert()), 50 | Comp::Delete => Tag::Delete(self.get_delete()), 51 | Comp::Mark => Tag::Mark(self.get_mark()), 52 | Comp::Subscript => Tag::Subscript(self.get_subscript()), 53 | Comp::Superscript => Tag::Superscript(self.get_superscript()), 54 | Comp::DoubleQuoted => Tag::DoubleQuoted(self.get_double_quoted()), 55 | Comp::Linktext => Tag::Link(self.get_link()), 56 | Comp::Imagetext => Tag::Image(self.get_image()), 57 | Comp::Url => Tag::Url(self.get_url()), 58 | Comp::Attributes => todo!(), 59 | Comp::Span => Tag::Span(self.get_span()), 60 | Comp::ReferenceDefinition => { 61 | self.get_reference_definition(); 62 | return; 63 | } 64 | _ => todo!("{comp:?}"), 65 | }, 66 | Annot::Sub(sub) => unreachable!("-{sub}"), 67 | Annot::Atom(atom) => match atom { 68 | Atom::Str => { 69 | let mut text = self.subject[m.range].to_string(); 70 | let attrs = self.get_attrs(); 71 | if !attrs.is_empty() { 72 | if let Some(idx) = text.rfind(|it: char|
it.is_ascii_whitespace()) { 73 | acc.push(Tag::Str(Str { attrs: Attrs::new(), text: text[..idx + 1].to_string() })); 74 | text.drain(..idx + 1); 75 | } 76 | } 77 | Tag::Str(Str { attrs, text }) 78 | } 79 | Atom::Emoji => { 80 | let mut res = Emoji::default(); 81 | res.alias = self.subject[m.range.start + 1..m.range.end - 1].to_string(); 82 | Tag::Emoji(res) 83 | } 84 | Atom::Softbreak => Tag::SoftBreak(SoftBreak::default()), 85 | Atom::Class | Atom::Id => return, 86 | _ => todo!("{atom:?}"), 87 | }, 88 | }; 89 | acc.push(res) 90 | } 91 | 92 | fn get_code_block(&mut self) -> CodeBlock { 93 | let mut res = CodeBlock::default(); 94 | let m = self.matches[self.idx].clone(); 95 | if m.is(Atom::CodeLanguage) { 96 | res.lang = Some(self.subject[m.range].to_string()); 97 | self.idx += 1; 98 | } 99 | res.text = self.get_text_until(Comp::CodeBlock); 100 | res 101 | } 102 | 103 | fn get_para(&mut self) -> Para { 104 | let mut res = Para::default(); 105 | res.children = self.get_tags_until(Comp::Para); 106 | res 107 | } 108 | 109 | fn get_verbatim(&mut self) -> Verbatim { 110 | let mut res = Verbatim::default(); 111 | res.text = self.get_text_until(Comp::Verbatim); 112 | if find(res.text.as_str(), "^ +`").is_match { 113 | res.text.remove(0); 114 | } 115 | if find(res.text.as_str(), "` +$").is_match { 116 | res.text.pop(); 117 | } 118 | res 119 | } 120 | 121 | fn get_strong(&mut self) -> Strong { 122 | let mut res = Strong::default(); 123 | res.children = self.get_tags_until(Comp::Strong); 124 | res 125 | } 126 | 127 | fn get_emph(&mut self) -> Emph { 128 | let mut res = Emph::default(); 129 | res.children = self.get_tags_until(Comp::Emph); 130 | res 131 | } 132 | 133 | fn get_insert(&mut self) -> Insert { 134 | let mut res = Insert::default(); 135 | res.children = self.get_tags_until(Comp::Insert); 136 | res 137 | } 138 | 139 | fn get_delete(&mut self) -> Delete { 140 | let mut res = Delete::default(); 141 | res.children = self.get_tags_until(Comp::Delete); 142 | res 143 | } 144 | 145 | fn get_mark(&mut self) -> Mark { 146 | let mut res = Mark::default(); 147 | res.children = self.get_tags_until(Comp::Mark); 148 | res 149 | } 150 | 151 | fn get_subscript(&mut self) -> Subscript { 152 | let mut res = Subscript::default(); 153 | res.children = self.get_tags_until(Comp::Subscript); 154 | res 155 | } 156 | 157 | fn get_superscript(&mut self) -> Superscript { 158 | let mut res = Superscript::default(); 159 | res.children = self.get_tags_until(Comp::Superscript); 160 | res 161 | } 162 | 163 | fn get_double_quoted(&mut self) -> DoubleQuoted { 164 | let mut res = DoubleQuoted::default(); 165 | res.children = self.get_tags_until(Comp::DoubleQuoted); 166 | res 167 | } 168 | 169 | fn get_link(&mut self) -> Link { 170 | let mut res = Link::default(); 171 | res.children = self.get_tags_until(Comp::Linktext); 172 | match self.get_dest() { 173 | LinkDest::Dest(dest) => res.destination = Some(dest), 174 | LinkDest::Ref(r) => res.reference = Some(r), 175 | LinkDest::AutoRef => res.reference = Some(get_string_content(&res.children)), 176 | } 177 | res 178 | } 179 | 180 | fn get_image(&mut self) -> Image { 181 | let mut res = Image::default(); 182 | res.children = self.get_tags_until(Comp::Imagetext); 183 | match self.get_dest() { 184 | LinkDest::Dest(dest) => res.destination = Some(dest), 185 | LinkDest::Ref(r) => res.reference = Some(r), 186 | LinkDest::AutoRef => res.reference = Some(get_string_content(&res.children)), 187 | } 188 | res 189 | } 190 | 191 | fn get_dest(&mut self) -> LinkDest { 192 | let m = 
self.matches[self.idx].clone(); 193 | self.idx += 1; 194 | if m.is(Comp::Destination.add()) { 195 | let dest = self.get_text_until(Comp::Destination); 196 | LinkDest::Dest(dest.replace('\n', "")) 197 | } else { 198 | let r = self.get_text_until(Comp::Reference); 199 | if r.is_empty() { 200 | LinkDest::AutoRef 201 | } else { 202 | LinkDest::Ref(r.replace('\n', " ")) 203 | } 204 | } 205 | } 206 | 207 | fn get_url(&mut self) -> Url { 208 | let mut res = Url::default(); 209 | res.destination = self.get_text_until(Comp::Url); 210 | res 211 | } 212 | 213 | fn get_span(&mut self) -> Span { 214 | let mut res = Span::default(); 215 | res.children = self.get_tags_until(Comp::Span); 216 | res.attrs = self.get_attrs(); 217 | res 218 | } 219 | 220 | fn get_attrs(&mut self) -> Attrs { 221 | if !self.matches[self.idx].is(Comp::Attributes.add()) { 222 | return Attrs::new(); 223 | } 224 | self.idx += 1; 225 | let mut res = Attrs::new(); 226 | loop { 227 | let m = self.matches[self.idx].clone(); 228 | self.idx += 1; 229 | if m.is(Comp::Attributes.sub()) { 230 | break; 231 | } 232 | if m.is(Atom::Class) { 233 | match res.entry("class".to_string()) { 234 | indexmap::map::Entry::Occupied(mut it) => { 235 | it.insert(format!("{} {}", it.get(), &self.subject[m.range.clone()])); 236 | } 237 | indexmap::map::Entry::Vacant(it) => { 238 | it.insert(self.subject[m.range.clone()].to_string()); 239 | } 240 | } 241 | } else if m.is(Atom::Id) { 242 | res.insert("id".to_string(), self.subject[m.range].to_string()); 243 | } else if m.is(Atom::Key) { 244 | let key = self.subject[m.range].to_string(); 245 | let m = self.matches[self.idx].clone(); 246 | self.idx += 1; 247 | let value = self.subject[m.range].to_string(); 248 | res.insert(key, value); 249 | } 250 | } 251 | res 252 | } 253 | 254 | fn get_reference_definition(&mut self) { 255 | let mut res = ReferenceDefinition::default(); 256 | let key = self.matches[self.idx].clone(); 257 | self.idx += 1; 258 | loop { 259 | let m = self.matches[self.idx].clone(); 260 | if !m.is(Atom::ReferenceValue) { 261 | break; 262 | } 263 | self.idx += 1; 264 | res.destination.push_str(&self.subject[m.range]); 265 | } 266 | assert!(self.matches[self.idx].is(Comp::ReferenceDefinition.sub())); 267 | self.idx += 1; 268 | self.references.insert(self.subject[key.range.start + 1..key.range.end - 1].to_string(), res); 269 | } 270 | 271 | fn get_tags_until(&mut self, comp: Comp) -> Vec<Tag> { 272 | let mut res = vec![]; 273 | while !self.matches[self.idx].is(comp.sub()) { 274 | self.get_tag(&mut res) 275 | } 276 | self.idx += 1; 277 | res 278 | } 279 | 280 | fn get_text_until(&mut self, comp: Comp) -> String { 281 | let mut res = String::new(); 282 | loop { 283 | let m = self.matches[self.idx].clone(); 284 | self.idx += 1; 285 | if m.is(comp.sub()) { 286 | break; 287 | } 288 | res.push_str(&self.subject[m.range]); 289 | } 290 | res 291 | } 292 | 293 | fn skip_trivia(&mut self) { 294 | while self.idx < self.matches.len() { 295 | let m = self.matches[self.idx].clone(); 296 | if !(m.is(Atom::Blankline) || m.is(Atom::ImageMarker) || m.is(Atom::Escape)) { 297 | break; 298 | } 299 | self.idx += 1; 300 | continue; 301 | } 302 | } 303 | } 304 | 305 | pub(crate) fn get_string_content(tags: &[Tag]) -> String { 306 | let mut res = String::new(); 307 | for tag in tags { 308 | match tag { 309 | Tag::SoftBreak(_) => res.push('\n'), 310 | Tag::Str(str) => res.push_str(&str.text), 311 | Tag::Emph(emph) => res.push_str(&get_string_content(&emph.children)), 312 | _ => (), 313 | } 314 | } 315 | res 316 | } 317 | 318 |
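// How a link or image target was specified: an explicit destination, a named reference, or an automatic reference derived from the bracketed text (see `get_link`/`get_image` above).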
enum LinkDest { 319 | Dest(String), 320 | Ref(String), 321 | AutoRef, 322 | } 323 | -------------------------------------------------------------------------------- /tests/data/attributes.test: -------------------------------------------------------------------------------- 1 | An inline attribute applies to the preceding element, which might 2 | be complex (span, emphasis, link) or a simple word (defined as a 3 | sequence of non-ASCII-whitespace characters). 4 | ``` 5 | foo привет{.ru} 6 | . 7 |

foo привет

8 | ``` 9 | 10 | ``` 11 | (some text){.attr} 12 | . 13 |

(some text)

14 | ``` 15 | 16 | ``` 17 | [some text]{.attr} 18 | . 19 |

some text

20 | ``` 21 | 22 | Ensure that emphasis that starts before the attribute can still close, 23 | even if the attribute contains a potential closer. 24 | 25 | ``` 26 | a *b{#id key="*"}* 27 | . 28 |

a b

29 | ``` 30 | 31 | ``` 32 | a *b{#id key="*"}o 33 | . 34 |

a *bo

35 | ``` 36 | 37 | Don't allow attributes to start when we're parsing a potential 38 | attribute. 39 | 40 | ``` 41 | hi{key="{#hi"} 42 | . 43 |

hi{key=“{#hi”

44 | ``` 45 | 46 | ``` 47 | hi\{key="abc{#hi}" 48 | . 49 |

hi{key=“abc

50 | ``` 51 | STOP 52 | ``` 53 | hi{key="\{#hi"} 54 | . 55 |

hi

56 | ``` 57 | 58 | Line break: 59 | 60 | ``` 61 | hi{#id .class 62 | key="value"} 63 | . 64 |

hi

65 | ``` 66 | 67 | Here there is nothing for the attribute to attach to: 68 | 69 | ``` 70 | {#id} at beginning 71 | . 72 |

at beginning

73 | ``` 74 | 75 | ``` 76 | After {#id} space 77 | {.class} 78 | . 79 |

After space 80 |

81 | ``` 82 | 83 | Block attributes come before the block, on a line by themselves. 84 | 85 | ``` 86 | {#id .class} 87 | A paragraph 88 | . 89 |

A paragraph

90 | ``` 91 | 92 | Use indentation if you need to continue the attributes over a line break. 93 | 94 | ``` 95 | {#id .class 96 | style="color:red"} 97 | A paragraph 98 | . 99 |

A paragraph

100 | ``` 101 | 102 | If the attribute block can't be parsed as attributes, it will be 103 | parsed as a regular paragraph: 104 | 105 | ``` 106 | {#id .cla*ss* 107 | . 108 |

{#id .class

109 | ``` 110 | 111 | You can use consecutive attribute blocks. 112 | In case of conflict, later values take precedence over earlier ones, 113 | but classes accumulate: 114 | 115 | ``` 116 | {#id} 117 | {key=val} 118 | {.foo .bar} 119 | {key=val2} 120 | {.baz} 121 | {#id2} 122 | Okay 123 | . 124 |

Okay

125 | ``` 126 | 127 | Attributes on different kinds of blocks: 128 | 129 | ``` 130 | {#id} 131 | > Block quote 132 | . 133 |
134 |

Block quote

135 |
136 | ``` 137 | 138 | ``` 139 | {#id} 140 | # Heading 141 | . 142 |
143 |

Heading

144 |
145 | ``` 146 | 147 | ``` 148 | {.blue} 149 | - - - - - 150 | . 151 |
152 | ``` 153 | 154 | ```` 155 | {highlight=3} 156 | ``` ruby 157 | x = 3 158 | ``` 159 | . 160 |
x = 3
161 | 
162 | ```` 163 | 164 | ``` 165 | {.special} 166 | 1. one 167 | 2. two 168 | . 169 |
    170 |
  1. 171 | one 172 |
  2. 173 |
  3. 174 | two 175 |
  4. 176 |
177 | ``` 178 | 179 | ``` 180 | > {.foo} 181 | > > {.bar} 182 | > > nested 183 | . 184 |
185 |
186 |

nested

187 |
188 |
189 | ``` 190 | 191 | Comments start at a `%` character 192 | (not in quotes) and end with another `%`. 193 | These can be used to comment up an attribute 194 | list or without any real attributes. 195 | 196 | ``` 197 | foo{#ident % this is a comment % .class} 198 | . 199 |

foo

200 | ``` 201 | 202 | In block-level comment, subsequent lines must 203 | be indented, as with attributes: 204 | 205 | ``` 206 | {% This is a comment before a 207 | block-level item. %} 208 | Paragraph. 209 | . 210 |

Paragraph.

211 | ``` 212 | 213 | Inline attributes can be empty: 214 | 215 | ``` 216 | hi{} 217 | . 218 |

hi

219 | ``` 220 | 221 | Block attributes can be empty: 222 | 223 | ``` 224 | {} 225 | hi 226 | . 227 |

hi

228 | ``` 229 | -------------------------------------------------------------------------------- /tests/data/code_blocks.test: -------------------------------------------------------------------------------- 1 | 2 | ``` 3 | ~~~ 4 | code 5 | block 6 | ~~~ 7 | . 8 |
code
 9 |   block
10 | 
11 | ``` 12 | 13 | ```` 14 | ``` python 15 | x = y + 3 16 | ``` 17 | . 18 |
x = y + 3
19 | 
20 | ```` 21 | 22 | ```` 23 | ``` python 24 | if true: 25 | x = 3 26 | ``` 27 | . 28 |
if true:
29 |   x = 3
30 | 
31 | ```` 32 | 33 | ```` 34 | ``` not a code block ``` 35 | . 36 |

not a code block

37 | ```` 38 | 39 | ```` 40 | ``` not a code block 41 | . 42 |

not a code block

43 | ```` 44 | 45 | ```` 46 | ``` 47 | hi 48 | ``` 49 | ``` 50 | two 51 | ``` 52 | . 53 |
hi
54 | 
55 |
two
56 | 
57 | ```` 58 | 59 | Empty code block: 60 | 61 | ```` 62 | ``` 63 | ``` 64 | . 65 |
66 | ```` 67 | -------------------------------------------------------------------------------- /tests/data/emoji.test: -------------------------------------------------------------------------------- 1 | ``` 2 | :+1: :scream: 3 | . 4 |

👍 😱

5 | ``` 6 | 7 | ``` 8 | This is a :nonexistent: emoji. 9 | . 10 |

This is a :nonexistent: emoji.

11 | ``` 12 | 13 | ``` 14 | :ice:scream: 15 | . 16 |

:ice:scream:

17 | ``` 18 | -------------------------------------------------------------------------------- /tests/data/emphasis.test: -------------------------------------------------------------------------------- 1 | ``` 2 | *foo bar* 3 | . 4 |

foo bar

5 | ``` 6 | 7 | ``` 8 | a* foo bar* 9 | . 10 |

a* foo bar*

11 | ``` 12 | 13 | ``` 14 | *foo bar * 15 | . 16 |

*foo bar *

17 | ``` 18 | 19 | Unicode spaces don't block emphasis. 20 | 21 | ``` 22 | * a * 23 | . 24 |

 a 

25 | ``` 26 | 27 | Intraword: 28 | 29 | ``` 30 | foo*bar*baz 31 | . 32 |

foobarbaz

33 | ``` 34 | 35 | ``` 36 | _foo bar_ 37 | . 38 |

foo bar

39 | ``` 40 | 41 | ``` 42 | _ foo bar_ 43 | . 44 |

_ foo bar_

45 | ``` 46 | 47 | ``` 48 | _foo bar _ 49 | . 50 |

_foo bar _

51 | ``` 52 | 53 | Unicode spaces don't block emphasis. 54 | 55 | ``` 56 | _ a _ 57 | . 58 |

 a 

59 | ``` 60 | 61 | Intraword: 62 | 63 | ``` 64 | foo_bar_baz 65 | . 66 |

foobarbaz

67 | ``` 68 | 69 | ``` 70 | aa_"bb"_cc 71 | . 72 |

aa“bb”cc

73 | ``` 74 | 75 | ``` 76 | *foo_ 77 | . 78 |

*foo_

79 | ``` 80 | 81 | ``` 82 | _foo* 83 | . 84 |

_foo*

85 | ``` 86 | 87 | A line ending counts as whitespace: 88 | 89 | ``` 90 | _foo bar 91 | _ 92 | . 93 |

_foo bar 94 | _

95 | ``` 96 | 97 | So does a tab: 98 | 99 | ``` 100 | _ a_ 101 | . 102 |

_ a_

103 | ``` 104 | 105 | This one is different from commonmark: 106 | 107 | ``` 108 | _(_foo_)_ 109 | . 110 |

(foo)

111 | ``` 112 | 113 | But you can force the second `_` to be an opener 114 | using the marker `{`. 115 | 116 | ``` 117 | _({_foo_})_ 118 | . 119 |

(foo)

120 | ``` 121 | 122 | ``` 123 | _(*foo*)_ 124 | . 125 |

(foo)

126 | ``` 127 | 128 | Overlapping scopes (first to close wins): 129 | 130 | ``` 131 | _foo *bar_ baz* 132 | . 133 |

foo *bar baz*

134 | ``` 135 | 136 | Over line break: 137 | 138 | ``` 139 | _foo 140 | bar_ 141 | . 142 |

foo 143 | bar

144 | ``` 145 | 146 | Inline content allowed: 147 | 148 | ``` 149 | *foo [link](url) `*`* 150 | . 151 |

foo link *

152 | ``` 153 | 154 | Can't emph an underscore: 155 | 156 | ``` 157 | ___ 158 | . 159 |

___

160 | ``` 161 | 162 | Unless you escape it: 163 | 164 | ``` 165 | _\__ 166 | . 167 |

_

168 | ``` 169 | 170 | No empty emph: 171 | 172 | ``` 173 | __ 174 | . 175 |

__

176 | ``` 177 | 178 | ``` 179 | _}b_ 180 | . 181 |

_}b_

182 | ``` 183 | 184 | ``` 185 | _\}b_ 186 | . 187 |

}b

188 | ``` 189 | 190 | ``` 191 | _ab\_c_ 192 | . 193 |

ab_c

194 | ``` 195 | 196 | ``` 197 | *****a***** 198 | . 199 |

a

200 | ``` 201 | 202 | ``` 203 | _[bar_](url) 204 | . 205 |

[bar](url)

206 | ``` 207 | 208 | ``` 209 | \_[bar_](url) 210 | . 211 |

_bar_

212 | ``` 213 | 214 | Code takes precedence: 215 | 216 | ``` 217 | _`a_`b 218 | . 219 |

_a_b

220 | ``` 221 | 222 | Autolinks take precedence: 223 | 224 | ``` 225 | _ 226 | . 227 |

_http://example.com/a_b

228 | ``` 229 | -------------------------------------------------------------------------------- /tests/data/hello_world.test: -------------------------------------------------------------------------------- 1 | ``` 2 | Hello, world! 3 | . 4 |

Hello, world!

5 | ``` 6 | -------------------------------------------------------------------------------- /tests/data/insert_delete_mark.test: -------------------------------------------------------------------------------- 1 | ``` 2 | This is {-deleted 3 | _text_-}. The braces are -required-. 4 | And they must be in the -}right order{-. 5 | . 6 |

This is deleted 7 | text. The braces are -required-. 8 | And they must be in the -}right order{-.

9 | ``` 10 | 11 | ``` 12 | {+ Inserted text +} 13 | . 14 |

Inserted text

15 | ``` 16 | 17 | Interaction with smart: 18 | 19 | ``` 20 | {--hello--} 21 | . 22 |

-hello-

23 | ``` 24 | 25 | ``` 26 | This is {=marked *text*=}. 27 | . 28 |

This is marked text.

29 | ``` 30 | -------------------------------------------------------------------------------- /tests/data/links_and_images.test: -------------------------------------------------------------------------------- 1 | 2 | ``` 3 | [basic _link_][a_b_] 4 | 5 | [a_b_]: url 6 | . 7 |

basic link

8 | ``` 9 | 10 | ``` 11 | ![basic _image_][a_b_] 12 | 13 | [a_b_]: url 14 | . 15 |

basic image

16 | ``` 17 | 18 | ``` 19 | [link][] 20 | 21 | [link]: url 22 | . 23 |

link

24 | ``` 25 | 26 | ``` 27 | [link][] 28 | 29 | [link]: 30 | url 31 | . 32 |

link

33 | ``` 34 | 35 | The URL can be split over multiple lines: 36 | 37 | ``` 38 | [link][] 39 | 40 | [link]: 41 | url 42 | andurl 43 | . 44 |

link

45 | ``` 46 | 47 | ``` 48 | [link](url 49 | andurl) 50 | . 51 |

link

52 | ``` 53 | 54 | ``` 55 | [link][] 56 | 57 | [link]: 58 | [link2]: url 59 | . 60 |

link

61 | ``` 62 | 63 | ``` 64 | [link][] 65 | [link][link2] 66 | 67 | [link2]: 68 | url2 69 | [link]: 70 | url 71 | . 72 |

link 73 | link

74 | ``` 75 | 76 | ``` 77 | [link][a and 78 | b] 79 | 80 | [a and b]: url 81 | . 82 |

link

83 | ``` 84 | 85 | If the reference isn't found, we get an empty link. 86 | 87 | ``` 88 | [link][a and 89 | b] 90 | . 91 |

link

92 | ``` 93 | 94 | Reference definitions can't have line breaks in the key: 95 | 96 | ``` 97 | [link][a and 98 | b] 99 | 100 | [a and 101 | b]: url 102 | . 103 |

link

104 |

[a and 105 | b]: url

106 | ``` 107 | 108 | No case normalization is done on reference definitions: 109 | 110 | ``` 111 | [Link][] 112 | 113 | [link]: /url 114 | . 115 |

Link

116 | ``` 117 | STOP 118 | Attributes on reference definitions get transferred to 119 | the link: 120 | 121 | ``` 122 | {title=foo} 123 | [ref]: /url 124 | 125 | [ref][] 126 | . 127 |

ref

128 | ``` 129 | 130 | Attributes on the link override those on references: 131 | 132 | ``` 133 | {title=foo} 134 | [ref]: /url 135 | 136 | [ref][]{title=bar} 137 | . 138 |

ref

139 | ``` 140 | 141 | ``` 142 | [link _and_ link][] 143 | 144 | [link and link]: url 145 | . 146 |

link and link

147 | ``` 148 | 149 | ``` 150 | ![basic _image_](url) 151 | . 152 |

basic image

153 | ``` 154 | 155 | ``` 156 | [![image](img.jpg)](url) 157 | . 158 |

image

159 | ``` 160 | 161 | ``` 162 | [unclosed](hello *a 163 | b* 164 | . 165 |

[unclosed](hello a 166 | b

167 | ``` 168 | 169 | Note that soft breaks are ignored, so long URLs 170 | can be split over multiple lines: 171 | ``` 172 | [closed](hello *a 173 | b*) 174 | . 175 |

closed

176 | ``` 177 | 178 | Here the strong takes precedence over the link because it 179 | starts first: 180 | ``` 181 | *[closed](hello*) 182 | . 183 |

[closed](hello)

184 | ``` 185 | 186 | Avoid this with a backslash escape: 187 | ``` 188 | *[closed](hello\*) 189 | . 190 |

*closed

191 | ``` 192 | 193 | Link in link? 194 | ``` 195 | [[foo](bar)](baz) 196 | . 197 |

foo

198 | ``` 199 | 200 | Link in image? 201 | ``` 202 | ![[link](url)](img) 203 | . 204 |

link

205 | ``` 206 | 207 | Image in link? 208 | ``` 209 | [![image](img)](url) 210 | . 211 |

image

212 | ``` 213 | 214 | Autolinks: 215 | ``` 216 | 217 | 218 | . 219 |

http://example.com/foo 220 | me@example.com

221 | ``` 222 | 223 | Openers inside `[..](` or `[..][` or `[..]{` can't match 224 | outside them, even if the construction doesn't turn out to be 225 | a link or span or image. 226 | 227 | ``` 228 | [x_y](x_y) 229 | . 230 |

x_y

231 | ``` 232 | 233 | ``` 234 | [x_y](x_ 235 | . 236 |

[x_y](x_

237 | ``` 238 | 239 | ``` 240 | [x_y]{.bar_} 241 | . 242 |

x_y

243 | ``` 244 | -------------------------------------------------------------------------------- /tests/data/para.test: -------------------------------------------------------------------------------- 1 | ``` 2 | hi 3 | there 4 | . 5 |

hi 6 | there

7 | ``` 8 | -------------------------------------------------------------------------------- /tests/data/regression.test: -------------------------------------------------------------------------------- 1 | Issue #104: 2 | 3 | ``` 4 | {1--} 5 | 6 | {1-} 7 | . 8 |

{1--}

9 |

{1-}

10 | ``` 11 | 12 | Issue #106: 13 | 14 | ``` 15 | 16 | |`| 17 | . 18 |

||

19 | ``` 20 | 21 | ``` [matches] 22 | 23 | |`|x 24 | . 25 | blankline 1-1 26 | +para 2-2 27 | str 2-2 28 | +verbatim 3-3 29 | str 4-5 30 | -verbatim 5-5 31 | -para 6-6 32 | ``` 33 | 34 | -------------------------------------------------------------------------------- /tests/data/super_subscript.test: -------------------------------------------------------------------------------- 1 | ``` 2 | H~2~O 3 | . 4 |

H2O

5 | ``` 6 | 7 | ``` 8 | mc^2^ 9 | . 10 |

mc2

11 | ``` 12 | 13 | ``` 14 | test^of superscript ~with subscript~^ 15 | . 16 |

testof superscript with subscript

17 | ``` 18 | 19 | ``` 20 | H{~2 ~}O 21 | . 22 |

H2 O

23 | ``` 24 | -------------------------------------------------------------------------------- /tests/data/verbatim.test: -------------------------------------------------------------------------------- 1 | 2 | ``` 3 | Some `code` 4 | . 5 |

Some code

6 | ``` 7 | 8 | ``` 9 | Some `code 10 | with a line break` 11 | . 12 |

Some code 13 | with a line break

14 | ``` 15 | 16 | ``` 17 | Special characters: `*hi*` 18 | . 19 |

Special characters: *hi*

20 | ``` 21 | 22 | ``` 23 | *foo`*` 24 | . 25 |

*foo*

26 | ``` 27 | 28 | ``` 29 | `````a`a``a```a````a``````a````` 30 | . 31 |

a`a``a```a````a``````a

32 | ``` 33 | 34 | ``` 35 | ` ``a`` ` 36 | . 37 |

``a``

38 | ``` 39 | 40 | Implicitly closed by end of paragraph: 41 | 42 | ``` 43 | ` a 44 | c 45 | . 46 |

a 47 | c

48 | ``` 49 | -------------------------------------------------------------------------------- /tests/spec.rs: -------------------------------------------------------------------------------- 1 | use std::{fs, path::PathBuf}; 2 | 3 | #[allow(unused)] 4 | fn to_ref_html(source: &str, matches: bool) -> String { 5 | let sh = xshell::Shell::new().unwrap(); 6 | if !sh.path_exists("ref") { 7 | xshell::cmd!(sh, "git clone https://github.com/jgm/djot ref").run().unwrap(); 8 | } 9 | sh.change_dir("ref"); 10 | let matches = if matches { Some("-m") } else { None }; 11 | let mut html = xshell::cmd!(sh, "lua ./bin/main.lua {matches...}").stdin(source).read().unwrap(); 12 | if cfg!(windows) { 13 | html = html.replace("\r\n", "\n"); 14 | } 15 | html.push('\n'); 16 | html 17 | } 18 | 19 | struct TestOpts { 20 | debug_ast: bool, 21 | ref_matches: bool, 22 | parse: djot::ParseOpts, 23 | } 24 | 25 | #[test] 26 | fn spec_tests() { 27 | let opts = 28 | TestOpts { debug_ast: true, ref_matches: true, parse: djot::ParseOpts { debug_matches: true } }; 29 | 30 | let mut last_fail = LastFail::load(); 31 | let sh = xshell::Shell::new().unwrap(); 32 | let mut total = 0; 33 | for path in sh.read_dir("./tests/data").unwrap() { 34 | if path.extension().unwrap_or_default() == "test" { 35 | let file_stem = path.file_stem().unwrap_or_default().to_str().unwrap_or_default(); 36 | let source = fs::read_to_string(&path).unwrap(); 37 | for (i, test_case) in parse_test(source.as_str()).into_iter().enumerate() { 38 | if last_fail.skip(file_stem, i) { 39 | continue; 40 | } 41 | let mut debug = String::new(); 42 | let doc = djot::Document::parse_opts(opts.parse.clone(), &test_case.djot); 43 | debug.push_str(&doc.debug); 44 | if opts.debug_ast { 45 | debug.push_str(&doc.to_json()); 46 | } 47 | let got = doc.to_html(); 48 | let want = test_case.html.as_str(); 49 | let ref_html = to_ref_html(&test_case.djot, false); 50 | if opts.ref_matches { 51 | debug.push_str(&format!("Ref Matches:\n{}-----", to_ref_html(&test_case.djot, true))); 52 | } 53 | if want != ref_html.as_str() { 54 | panic!( 55 | "\nReference mismatch in {}\nRef:\n{ref_html}-----\nWant:\n{want}-----\n", 56 | file_stem 57 | ) 58 | } 59 | if got.as_str() != want { 60 | let mut msg = format!( 61 | "\nMismatch in {}\nSource:\n{}-----\nWant:\n{want}-----\nGot:\n{got}-----\n", 62 | file_stem, test_case.djot, 63 | ); 64 | if !debug.is_empty() { 65 | msg = format!("{msg}Debug:\n{debug}-----\n") 66 | } 67 | panic!("{msg}") 68 | } 69 | last_fail.test_ok(); 70 | total += 1; 71 | } 72 | } 73 | } 74 | eprintln!("total tests: {total}"); 75 | } 76 | 77 | #[derive(Debug, Default)] 78 | struct TestCase { 79 | djot: String, 80 | html: String, 81 | } 82 | 83 | #[derive(Debug)] 84 | enum ParseState { 85 | Init, 86 | Djot(TestCase, usize), 87 | Html(TestCase, usize), 88 | } 89 | 90 | fn parse_test(source: &str) -> Vec<TestCase> { 91 | let mut res = Vec::new(); 92 | let mut state = ParseState::Init; 93 | for line in source.lines() { 94 | state = match state { 95 | ParseState::Init if line == "STOP" => { 96 | break; 97 | } 98 | ParseState::Init => match parse_fence(line) { 99 | Some(fence) => ParseState::Djot(TestCase::default(), fence), 100 | None => ParseState::Init, 101 | }, 102 | ParseState::Djot(mut test_case, test_case_fence) => { 103 | if line == "."
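/* Within a fenced test case, a line consisting of a single "." separates the djot input from the expected HTML output. */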
{ 104 | ParseState::Html(test_case, test_case_fence) 105 | } else { 106 | test_case.djot.push_str(line); 107 | test_case.djot.push('\n'); 108 | ParseState::Djot(test_case, test_case_fence) 109 | } 110 | } 111 | ParseState::Html(mut test_case, test_case_fence) => match parse_fence(line) { 112 | Some(fence) if fence == test_case_fence => { 113 | res.push(test_case); 114 | ParseState::Init 115 | } 116 | _ => { 117 | test_case.html.push_str(line); 118 | test_case.html.push('\n'); 119 | ParseState::Html(test_case, test_case_fence) 120 | } 121 | }, 122 | }; 123 | } 124 | 125 | res 126 | } 127 | 128 | fn parse_fence(line: &str) -> Option<usize> { 129 | if line.bytes().all(|it| it == b'`') && line.len() > 0 { 130 | Some(line.len()) 131 | } else { 132 | None 133 | } 134 | } 135 | 136 | struct LastFail { 137 | loaded: Option<(String, usize)>, 138 | current: Option<(String, usize)>, 139 | } 140 | 141 | impl LastFail { 142 | fn load() -> LastFail { 143 | let mut loaded = None; 144 | if let Ok(text) = fs::read_to_string(fail_file()) { 145 | let (name, pos) = text.split_once(':').unwrap_or_else(|| panic!("bad fail file {text:?}")); 146 | let idx = pos.parse::<usize>().unwrap_or_else(|_| panic!("bad fail file {text:?}")); 147 | eprintln!("loaded fail {name}:{idx}"); 148 | loaded = Some((name.to_string(), idx)) 149 | } 150 | LastFail { loaded, current: None } 151 | } 152 | fn skip(&mut self, name: &str, pos: usize) -> bool { 153 | self.current = Some((name.to_string(), pos)); 154 | if let Some(loaded) = &self.loaded { 155 | return !(loaded.0 == name && loaded.1 == pos); 156 | } 157 | false 158 | } 159 | fn test_ok(&mut self) { 160 | if let Some((name, pos)) = &self.loaded { 161 | eprintln!("{}:{} is now ok!", name, pos); 162 | let _ = fs::remove_file(&fail_file()); 163 | self.loaded = None; 164 | } 165 | self.current = None 166 | } 167 | } 168 | 169 | impl Drop for LastFail { 170 | fn drop(&mut self) { 171 | if let Some((name, pos)) = &self.current { 172 | eprintln!("saved fail {name}:{pos}"); 173 | let _ = fs::write(fail_file(), format!("{name}:{pos}")); 174 | } 175 | } 176 | } 177 | 178 | fn fail_file() -> PathBuf { 179 | PathBuf::from(env!("CARGO_TARGET_TMPDIR")).join("fail") 180 | } 181 | -------------------------------------------------------------------------------- /tests/tidy.rs: -------------------------------------------------------------------------------- 1 | use xshell::{cmd, Shell}; 2 | 3 | #[test] 4 | fn test_formatting() { 5 | let sh = Shell::new().unwrap(); 6 | cmd!(sh, "cargo fmt -- --check").run().unwrap() 7 | } 8 | --------------------------------------------------------------------------------
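For reference, a minimal usage sketch of the public API defined in `src/lib.rs` above. This is not a file from the repository; it assumes the library is consumed as the `djot` dependency named in `Cargo.toml` and uses only items shown in the listing (`Document::parse`, `Document::parse_opts`, `ParseOpts`, `to_html`, `to_json`, and the public `debug` field).

```rust
// Illustrative only: drives the djot crate the same way src/main.rs does.

fn render(input: &str) -> (String, String) {
    // Parse with default options, then render the same document two ways.
    let doc = djot::Document::parse(input);
    (doc.to_html(), doc.to_json())
}

fn render_match_debug(input: &str) -> String {
    // Equivalent to the CLI's `--matches` flag: keep the tokenizer's debug output.
    let opts = djot::ParseOpts { debug_matches: true };
    djot::Document::parse_opts(opts, input).debug
}

fn main() {
    let (html, json) = render("Hello, *world*!");
    println!("{html}");
    println!("{json}");
    println!("{}", render_match_debug("Hello, *world*!"));
}
```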