├── .github └── workflows │ └── ci.yaml ├── .gitignore ├── Cargo.toml ├── README.md ├── bors.toml ├── lua-patterns ├── .gitignore ├── Cargo.toml ├── LICENSE.txt ├── examples │ ├── errors.rs │ ├── iter.rs │ ├── multiple_captures.rs │ ├── range.rs │ └── strings.rs ├── readme.md └── src │ ├── errors.rs │ ├── lib.rs │ └── luapat.rs ├── rustfmt.toml ├── src ├── annot.rs ├── annot │ └── generated.rs ├── ast.rs ├── ast │ └── generated.rs ├── attribute.rs ├── block.rs ├── emoji.rs ├── html.rs ├── inline.rs ├── lib.rs ├── main.rs ├── patterns.rs ├── sourcegen.rs ├── sourcegen │ ├── annot.rs │ └── ast.rs └── tree.rs └── tests ├── data ├── attributes.test ├── code_blocks.test ├── emoji.test ├── emphasis.test ├── hello_world.test ├── insert_delete_mark.test ├── links_and_images.test ├── para.test ├── regression.test ├── super_subscript.test └── verbatim.test ├── spec.rs └── tidy.rs
/.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | push: 5 | branches: ["master", "staging", "trying"] 6 | 7 | env: 8 | CARGO_INCREMENTAL: 0 9 | CARGO_NET_RETRY: 10 10 | CI: 1 11 | RUST_BACKTRACE: short 12 | RUSTFLAGS: -D warnings 13 | RUSTUP_MAX_RETRIES: 10 14 | 15 | jobs: 16 | test: 17 | name: Rust 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - uses: Swatinem/rust-cache@6720f05bc48b77f96918929a9019fb2203ff71f8 23 | - run: rustup update --no-self-update stable 24 | - run: sudo apt-get install lua5.3 25 | - run: cargo test 26 | --------------------------------------------------------------------------------
/.gitignore: -------------------------------------------------------------------------------- 1 | /.vscode 2 | /target 3 | /Cargo.lock 4 | /ref 5 | --------------------------------------------------------------------------------
/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "djot" 3 | version = "0.1.0" 4 | license = "MIT OR Apache-2.0" 5 | authors = ["Aleksey Kladov "] 6 | edition = "2021" 7 | 8 | [dependencies] 9 | anyhow = "1.0.66" 10 | indexmap = { version = "1.9.1", features = ["serde"] } 11 | lexopt = "0.2.1" 12 | lua-patterns = { path = "lua-patterns" } 13 | serde = { version = "1.0.147", features = ["derive"] } 14 | serde_json = "1.0.87" 15 | 16 | [dev-dependencies] 17 | xshell = "0.2.0" 18 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # djot-rs 2 | 3 | UPDATE: 4 | 5 | This didn't go particularly far; check out 6 | 7 | 8 | 9 | instead. 10 | 11 | 12 | An experimental Rust implementation of the [Djot](https://djot.net) light markup 13 | language. 14 | 15 | ## Design Rules 16 | 17 | Djot is in development; these are the _current_ design rules: 18 | 19 | 1. 100% compatibility with the reference Lua implementation, bugs and all. We 20 | don't want to fork a language which barely exists. 21 | 2. Reasonable source compatibility with the reference Lua implementation. We 22 | want to make it easy to incorporate changes, though we don't necessarily want 23 | to bend Rust into Lua. 24 | 25 | Currently this is very incomplete; feel free to submit PRs to fill in the blank 26 | spaces, just try to stay close to the original code. 27 | 28 | There are some tests; run them with `cargo test`.
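A minimal sketch of the kind of data-driven test `cargo test` runs (the real harness presumably lives in `tests/spec.rs`; the test name, the assertions, and the way fixtures are loaded below are illustrative assumptions, not the actual code):

```rust
// Sketch only: walk tests/data and load every `.test` fixture.
// How a fixture is split into Djot input and expected output is up to
// the real harness and is deliberately not shown here.
#[test]
fn fixtures_are_readable() {
    let mut found = 0;
    for entry in std::fs::read_dir("tests/data").unwrap() {
        let path = entry.unwrap().path();
        if path.extension().and_then(|e| e.to_str()) == Some("test") {
            let text = std::fs::read_to_string(&path).unwrap();
            assert!(!text.is_empty(), "empty fixture: {}", path.display());
            found += 1;
        }
    }
    assert!(found > 0, "no .test fixtures found");
}
```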
We are using the same test suite as 29 | the upstream project (see `.test` files in `tests/data`) 30 | 31 | ## Aspirations 32 | 33 | * "Easy", obvious API -- no streaming parsing, no allocation minimization, just 34 | gives you a full ast 35 | * core + alloc. We don't need OS. Getting rid of the allocator would be nice, but not for this library. 36 | * in general, leave pulldown-djot to someone else (or to the next iteration of this library) 37 | * djot.ts module for convenience 38 | * typescript extensible visitor API for rendering: `./djot.ts intput.adoc --template slides.ts` 39 | `ast.to_html({ code_block: (tag) => { ... }})`. 40 | 41 | ## See Also 42 | 43 | * https://git.sr.ht/~kmaasrud/djr a pulldown-cmark inspired parser 44 | -------------------------------------------------------------------------------- /bors.toml: -------------------------------------------------------------------------------- 1 | status = [ "Rust" ] 2 | delete_merged_branches = true 3 | -------------------------------------------------------------------------------- /lua-patterns/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | scratch 4 | -------------------------------------------------------------------------------- /lua-patterns/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lua-patterns" 3 | version = "0.3.0" 4 | authors = ["steve donovan "] 5 | description = "Binding to Lua String Patterns" 6 | license = "MIT" 7 | repository = "https://github.com/stevedonovan/lua-patterns" 8 | documentation = "https://docs.rs/lua-patterns" 9 | 10 | keywords = ["string","matching","lua"] 11 | 12 | categories = ["parsing","api-bindings"] 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /lua-patterns/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright © 2017 Steve Donovan 2 | 3 | Copyright © 1994–2017 Lua.org, PUC-Rio. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), 7 | to deal in the Software without restriction, including without 8 | limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, 10 | and to permit persons to whom the Software is furnished to do so, 11 | subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included 14 | in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS 19 | OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lua-patterns/examples/errors.rs: -------------------------------------------------------------------------------- 1 | extern crate lua_patterns; 2 | use lua_patterns::errors::PatternError; 3 | 4 | fn main() { 5 | let bad = [ 6 | ("bonzo %","malformed pattern (ends with '%')"), 7 | ("bonzo (dog%(","unfinished capture"), 8 | ("alles [%a%[","malformed pattern (missing ']')"), 9 | ("bonzo (dog (cat)","unfinished capture"), 10 | ("frodo %f[%A","malformed pattern (missing ']')"), 11 | ("frodo (1) (2(3)%2)%1","invalid capture index %2"), 12 | ]; 13 | 14 | fn error(s: &str) -> PatternError { 15 | PatternError(s.into()) 16 | } 17 | 18 | for p in bad.iter() { 19 | let res = lua_patterns::LuaPattern::new_try(p.0); 20 | if let Err(e) = res { 21 | assert_eq!(e, error(p.1)); 22 | } else { 23 | println!("'{}' was fine",p.0); 24 | } 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /lua-patterns/examples/iter.rs: -------------------------------------------------------------------------------- 1 | extern crate lua_patterns as lp; 2 | 3 | 4 | 5 | fn main() { 6 | 7 | //~ let mut m = lp::LuaPattern::new("hello%"); 8 | //~ m.matches("hello"); 9 | //~ println!("ok"); 10 | 11 | let mut m = lp::LuaPattern::new("(%a+)"); 12 | let mut iter = m.gmatch("one two three"); 13 | assert_eq!(iter.next(), Some("one")); 14 | assert_eq!(iter.next(), Some("two")); 15 | assert_eq!(iter.next(), Some("three")); 16 | assert_eq!(iter.next(), None); 17 | 18 | let mut m = lp::LuaPattern::new("%S+"); 19 | let split: Vec<_> = m.gmatch("dog cat leopard wolf").collect(); 20 | assert_eq!(split,&["dog","cat","leopard","wolf"]); 21 | 22 | let mut m = lp::LuaPattern::new("%s*(%S+)%s*=%s*(.-);"); 23 | let cc = m.captures(" hello= bonzo dog;"); 24 | assert_eq!(cc[0], " hello= bonzo dog;"); 25 | assert_eq!(cc[1],"hello"); 26 | assert_eq!(cc[2],"bonzo dog"); 27 | 28 | for cc in m.gmatch_captures("hello=bonzo dog; bye=cat;") { 29 | println!("'{}'='{}'",cc.get(1),cc.get(2)); 30 | } 31 | 32 | let mut m = lp::LuaPattern::new("%$(%S+)"); 33 | let res = m.gsub_with("hello $dolly you're so $fine", 34 | |cc| cc.get(1).to_uppercase() 35 | ); 36 | assert_eq!(res,"hello DOLLY you're so FINE"); 37 | 38 | let mut m = lp::LuaPattern::new("(%S+)%s*=%s*([^;]+);"); 39 | let res = m.gsub_with("alpha=bonzo; beta=felix;", 40 | |cc| format!("{}:'{}',", cc.get(1), cc.get(2)) 41 | ); 42 | assert_eq!(res, "alpha:'bonzo', beta:'felix',"); 43 | 44 | 45 | 46 | } 47 | -------------------------------------------------------------------------------- /lua-patterns/examples/multiple_captures.rs: -------------------------------------------------------------------------------- 1 | extern crate lua_patterns as lp; 2 | 3 | fn main() { 4 | let mut p = lp::LuaPattern::new("%s*(%d+)%s+(%S+)"); 5 | if let Some((int,rest)) = p.match_maybe_2(" 233 hello dolly") { 6 | assert_eq!(int,"233"); 7 | assert_eq!(rest,"hello"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /lua-patterns/examples/range.rs: -------------------------------------------------------------------------------- 1 | extern crate lua_patterns; 2 | use lua_patterns::{LuaPattern,LuaPatternBuilder}; 3 | 4 | fn main() { 5 | let mut m = LuaPattern::new("(%a+) one"); 6 | let text = " hello one two"; 7 | assert!(m.matches(text)); 8 | assert_eq!(m.capture(1),1..6); 9 | assert_eq!(m.capture(0),1..10); 10 | 11 | let v = m.captures(text); 12 | 
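// captures() allocates a Vec: index 0 is the whole match, the rest are the explicit captures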
assert_eq!(v, &["hello one","hello"]); 13 | 14 | let mut v = Vec::new(); 15 | assert!(m.capture_into(text,&mut v)); 16 | assert_eq!(v, &["hello one","hello"]); 17 | 18 | let bytes = &[0xFF,0xEE,0x0,0xDE,0x24,0x24,0xBE,0x0,0x0]; 19 | 20 | let patt = LuaPatternBuilder::new() 21 | .bytes_as_hex("DE24") 22 | .text("+") 23 | .bytes(&[0xBE]) 24 | .build(); 25 | 26 | let mut m = LuaPattern::from_bytes(&patt); 27 | assert!(m.matches_bytes(bytes)); 28 | assert_eq!(&bytes[m.capture(0)], &[0xDE,0x24,0x24,0xBE]); 29 | 30 | let mut m = LuaPattern::new("(%S+)%s*=%s*(%S+);%s*"); 31 | let res = m.gsub("a=2; b=3; c = 4;","'%2':%1 "); 32 | println!("{}",res); 33 | 34 | let mut m = LuaPattern::new("%s+"); 35 | let res = m.gsub("hello dolly you're so fine",""); 36 | println!("{}",res); 37 | 38 | } 39 | -------------------------------------------------------------------------------- /lua-patterns/examples/strings.rs: -------------------------------------------------------------------------------- 1 | // Similar to the strings(1) utility 2 | // We print any sequences involving four or more ASCII letters 3 | extern crate lua_patterns; 4 | use lua_patterns::LuaPattern; 5 | 6 | use std::env; 7 | use std::str; 8 | use std::fs::File; 9 | use std::io::prelude::*; 10 | 11 | fn main() { 12 | let file = env::args().skip(1).next().expect("provide a binary file"); 13 | let mut f = File::open(&file).expect("can't open file"); 14 | let mut buf = Vec::new(); 15 | f.read_to_end(&mut buf).expect("can't read file"); 16 | 17 | let mut words = LuaPattern::new("%a%a%a%a+"); 18 | for w in words.gmatch_bytes(&buf) { 19 | println!("{}",str::from_utf8(w).unwrap()); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /lua-patterns/readme.md: -------------------------------------------------------------------------------- 1 | ## Lua string patterns in Rust 2 | 3 | [Lua string patterns](https://www.lua.org/pil/20.2.html) are a powerful 4 | yet lightweight alternative to full regular expressions. They are not 5 | regexps, since there is no alternation (the `|` operator), but this 6 | is not usually a problem. In fact, full regexps become _too powerful_ and 7 | power can be dangerous or just plain confusing. 8 | This is why OpenBSD's httpd has [Lua patterns](http://man.openbsd.org/patterns.7). 9 | The decision to use `%` as the escape rather than the traditional `\` is refreshing. 10 | In the Rust context, `lua-patterns` is a very lightweight dependency, if you 11 | don't need the full power of the `regex` crate. 12 | 13 | This library reuses the original source from Lua 5.2 - only 14 | 400 lines of battle-tested C. I originally did this for a similar project to bring 15 | [these patterns to C++](https::/github.com/stevedonovan/rx-cpp). 16 | 17 | More information can be found on [the Lua wiki](http://lua-users.org/wiki/PatternsTutorial). 18 | The cool thing is that Lua is a 300KB download, if you want to test patterns out 19 | without going through Rust. 20 | 21 | I've organized the Rust interface much as the original Lua library, 'match', 22 | 'gmatch' and 'gsub', but made these methods of a `LuaPattern` struct. 
This is 23 | for two main reasons: 24 | 25 | - although string patterns are not compiled, they can be validated upfront 26 | - after a match, the struct contains the results 27 | 28 | ```rust 29 | extern crate lua_patterns; 30 | use lua_patterns::LuaPattern; 31 | 32 | let mut m = LuaPattern::new("one"); 33 | let text = "hello one two"; 34 | assert!(m.matches(text)); 35 | let r = m.range(); 36 | assert_eq!(r.start, 6); 37 | assert_eq!(r.end, 9); 38 | ``` 39 | This not in itself impressive, since it can be done with the string `find` 40 | method. (`new` will panic if you feed it a bad pattern, so use `new_try` if 41 | you want more control.) 42 | 43 | Once we start using patterns it gets more exciting, especially 44 | with _captures_: 45 | 46 | ```rust 47 | let mut m = LuaPattern::new("(%a+) one"); 48 | let text = " hello one two"; 49 | assert!(m.matches(text)); 50 | assert_eq!(m.capture(0),1..10); // "hello one" 51 | assert_eq!(m.capture(1),1..6); // "hello" 52 | ``` 53 | Lua patterns (like regexps) are not anchored by default, so this finds 54 | the first match and works from there. The 0 capture always exists 55 | (the full match) and here the 1 capture just picks up the first word. 56 | 57 | > There is an obvious limitation: "%a" refers specifically to a single byte 58 | > representing a letter according to the C locale. Lua people will often 59 | > look for 'sequence of non-spaces' ("%S+"), etc - that is, identify maybe-UTF-8 60 | > sequences using surronding punctuation or spaces. 61 | 62 | If you want your captures as strings, then there are several options. If there's 63 | just one, then `match_maybe` is useful: 64 | 65 | ```rust 66 | let mut m = LuaPattern::new("OK%s+(%d+)"); 67 | let res = m.match_maybe("and that's OK 400 to you"); 68 | assert_eq!(res, Some("400")); 69 | ``` 70 | You can grab them as a vector (it will be empty if the match fails.) 71 | 72 | ```rust 73 | let mut m = LuaPattern::new("(%a+) one"); 74 | let text = " hello one two"; 75 | let v = m.captures(text); 76 | assert_eq!(v, &["hello one","hello"]); 77 | ``` 78 | This will create a vector. You can avoid excessive allocations with `capture_into`: 79 | 80 | ```rust 81 | let mut v = Vec::new(); 82 | if m.capture_into(text,&mut v) { 83 | assert_eq!(v, &["hello one","hello"]); 84 | } 85 | ``` 86 | Imagine that this is happening in a loop - the vector is only allocated the first 87 | time it is filled, and thereafter there are no allocations. It's a convenient 88 | method if you are checking text against several patterns, and is actually 89 | more ergonomic than using Lua's `string.match`. (Personally I prefer 90 | to use those marvelous things called "if statements" rather than elaborate 91 | regular expressions.) 92 | 93 | The `gmatch` method creates an interator over all matched strings. 94 | 95 | ```rust 96 | let mut m = lp::LuaPattern::new("%S+"); 97 | let split: Vec<_> = m.gmatch("dog cat leopard wolf ").collect(); 98 | assert_eq!(split,&["dog","cat","leopard","wolf"]); 99 | ``` 100 | A single match is returned; if the pattern has no captures, you get the full match, 101 | otherwise you get the first match. So "(%S+)" would give you the same result. 102 | 103 | A more general version is `gmatch_captures` which creates a _streaming_ iterator 104 | over captures. You have to be a little careful with this one; in particular, you 105 | will get nonsense if you try to `collect` on the return captures: don't try to 106 | keep these values. 
107 | It is fine to collect from an expression involving the `get` method however! 108 | 109 | ```rust 110 | let mut m = lua_patterns::LuaPattern::new("(%S)%S+"); 111 | let split: Vec<_> = m.gmatch_captures("dog cat leopard wolf") 112 | .map(|cc| cc.get(1)).collect(); 113 | assert_eq!(split,&["d","c","l","w"]); 114 | ``` 115 | 116 | Text substitution is an old favourite of mine, so here's `gsub_with`: 117 | 118 | ```rust 119 | let mut m = lp::LuaPattern::new("%$(%S+)"); 120 | let res = m.gsub_with("hello $dolly you're so $fine", 121 | |cc| cc.get(1).to_uppercase() 122 | ); 123 | assert_eq!(res,"hello DOLLY you're so FINE"); 124 | ``` 125 | The closure is passed a `Closures` object and the captures are accessed 126 | using the `get` method; it returns a `String`. 127 | 128 | The second form of `gsub` is convenient when you have a replacement 129 | string, which may contain closure references. (To add a literal "%" escape 130 | it like so "%%") 131 | 132 | ```rust 133 | let mut m = LuaPattern::new("%s+"); 134 | let res = m.gsub("hello dolly you're so fine",""); 135 | assert_eq!(res, "hellodollyyou'resofine"); 136 | 137 | let mut m = LuaPattern::new("(%S+)%s*=%s*(%S+);%s*"); 138 | let res = m.gsub("a=2; b=3; c = 4;", "'%2':%1 "); 139 | assert_eq!(res, "'2':a '3':b '4':c "); 140 | ``` 141 | The third form of `string.gsub` in Lua does lookup with a table - that is, a map. 142 | But for maps you really want to handle the 'not found' case in some special way: 143 | 144 | ```rust 145 | let mut map = HashMap::new(); 146 | // updating old lines for the 21st Century 147 | map.insert("dolly", "baby"); 148 | map.insert("fine", "cool"); 149 | map.insert("good-looking", "pretty"); 150 | 151 | let mut m = LuaPattern::new("%$%((.-)%)"); 152 | let res = m.gsub_with("hello $(dolly) you're so $(fine) and $(good-looking)", 153 | |cc| map.get(cc.get(1)).unwrap_or(&"?").to_string() 154 | ); 155 | assert_eq!(res,"hello baby you're so cool and pretty"); 156 | ``` 157 | 158 | (The ".-" pattern means 'match as little as possible' - often called 'lazy' 159 | matching.) 160 | 161 | This is equivalent to a replace string "%1:'%2'": 162 | 163 | ```rust 164 | let mut m = lp::LuaPattern::new("(%S+)%s*=%s*([^;]+);"); 165 | let res = m.gsub_with("alpha=bonzo; beta=felix;", 166 | |cc| format!("{}:'{}',", cc.get(1), cc.get(2)) 167 | ); 168 | assert_eq!(res, "alpha:'bonzo', beta:'felix',"); 169 | ``` 170 | Having a byte-oriented pattern matcher can be useful. For instance, this 171 | is basically the old `strings` utility - we read all of a 'binary' file into 172 | a vector of bytes, and then use `gmatch_bytes` to iterate over all `&[u8]` 173 | matches corresponding to two or more adjacent ASCII letters: 174 | 175 | ```rust 176 | let mut words = LuaPattern::new("%a%a+"); 177 | for w in words.gmatch_bytes(&buf) { 178 | println!("{}",std::str::from_utf8(w).unwrap()); 179 | } 180 | ``` 181 | The pattern itself may be arbitrary bytes - Lua 'string' matching does 182 | not care about embedded nul bytes: 183 | 184 | ```rust 185 | let patt = &[0xDE,0x00,b'+',0xBE]; 186 | let bytes = &[0xFF,0xEE,0x0,0xDE,0x0,0x0,0xBE,0x0,0x0]; 187 | 188 | let mut m = LuaPattern::from_bytes(patt); 189 | assert!(m.matches_bytes(bytes)); 190 | assert_eq!(&bytes[m.capture(0)], &[0xDE,0x00,0x00,0xBE]); 191 | ``` 192 | The problem here is that it's not obvious when our 'arbitrary' bytes 193 | include one of the special matching characters like `$` (which is 0x24) 194 | and so on. 
Hence there is `LuaPatternBuilder`: 195 | 196 | ```rust 197 | let bytes = &[0xFF,0xEE,0x0,0xDE,0x24,0x24,0xBE,0x0,0x0]; 198 | 199 | let patt = LuaPatternBuilder::new() 200 | .bytes_as_hex("DE24") // less tedious than a byte slice 201 | .text("+") // unescaped 202 | .bytes(&[0xBE]) // byte slice 203 | .build(); 204 | 205 | let mut m = LuaPattern::from_bytes(&patt); 206 | // picks up "DE2424BE" 207 | ``` 208 | > Static verification: this version attempts to verify string patterns. If you 209 | > want errors, use `new_try` and `from_bytes_try`, otherwise the constructors panic. 210 | > If a match panics after successful verification, it is a __BUG__ - please 211 | > report the offending pattern. 212 | 213 | -------------------------------------------------------------------------------- /lua-patterns/src/errors.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::error::Error; 3 | 4 | /// Error type returned by _try methods 5 | #[derive(Debug,PartialEq)] 6 | pub struct PatternError(pub String); 7 | 8 | impl fmt::Display for PatternError { 9 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 10 | write!(f,"{}",self.0) 11 | } 12 | } 13 | 14 | impl Error for PatternError { 15 | fn description(&self) -> &str { 16 | &self.0 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /lua-patterns/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! This is a Rust binding to [Lua string patterns](https://www.lua.org/pil/20.2.html), 2 | //! using the original code from Lua 5.2. 3 | //! 4 | //! Although not regular expressions (they lack alternation) they are a powerful 5 | //! and lightweight way to process text. Please note that they are not 6 | //! UTF-8-aware, and in fact can process arbitrary binary data. 7 | //! 8 | //! `LuaPattern` can be created from a string _or_ a byte slice, and has 9 | //! methods which are similar to the original Lua API. Please see 10 | //! [the README](https://github.com/stevedonovan/lua-patterns/blob/master/readme.md) 11 | //! for more discussion. 12 | //! 13 | //! [LuaPattern](struct.LuaPattern.html) implements the public API. 14 | //! 15 | //! ## Examples 16 | //! 17 | //! ```rust 18 | //! extern crate lua_patterns; 19 | //! let mut m = lua_patterns::LuaPattern::new("one"); 20 | //! let text = "hello one two"; 21 | //! assert!(m.matches(text)); 22 | //! let r = m.range(); 23 | //! assert_eq!(r.start, 6); 24 | //! assert_eq!(r.end, 9); 25 | //! ``` 26 | //! 27 | //! Collecting captures from a match: 28 | //! 29 | //! ```rust 30 | //! extern crate lua_patterns; 31 | //! let text = " hello one"; 32 | //! let mut m = lua_patterns::LuaPattern::new("(%S+) one"); 33 | //! 34 | //! // allocates a vector of captures 35 | //! let v = m.captures(text); 36 | //! assert_eq!(v, &["hello one","hello"]); 37 | //! let mut v = Vec::new(); 38 | //! // writes captures into preallocated vector 39 | //! if m.capture_into(text,&mut v) { 40 | //! assert_eq!(v, &["hello one","hello"]); 41 | //! } 42 | //! 
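//! // on a failed match, capture_into returns false and leaves the vector empty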
``` 43 | 44 | use std::ops; 45 | 46 | pub mod errors; 47 | use errors::*; 48 | mod luapat; 49 | use luapat::*; 50 | 51 | 52 | /// Represents a Lua string pattern and the results of a match 53 | pub struct LuaPattern<'a> { 54 | patt: &'a [u8], 55 | matches: [LuaMatch; LUA_MAXCAPTURES], 56 | n_match: usize 57 | } 58 | 59 | impl <'a> LuaPattern<'a> { 60 | /// Maybe create a new Lua pattern from a slice of bytes 61 | pub fn from_bytes_try (bytes: &'a [u8]) -> Result,PatternError> { 62 | str_check(bytes)?; 63 | let matches = [LuaMatch{start: 0, end: 0}; LUA_MAXCAPTURES]; 64 | Ok(LuaPattern{patt: bytes, matches: matches, n_match: 0}) 65 | } 66 | 67 | /// Maybe create a new Lua pattern from a string 68 | pub fn new_try(patt: &'a str) -> Result,PatternError> { 69 | LuaPattern::from_bytes_try(patt.as_bytes()) 70 | } 71 | 72 | /// Create a new Lua pattern from a string, panicking if bad 73 | pub fn new(patt: &'a str) -> LuaPattern<'a> { 74 | LuaPattern::new_try(patt).expect("bad pattern") 75 | } 76 | 77 | /// Create a new Lua pattern from a slice of bytes, panicking if bad 78 | pub fn from_bytes (bytes: &'a [u8]) -> LuaPattern<'a> { 79 | LuaPattern::from_bytes_try(bytes).expect("bad pattern") 80 | } 81 | 82 | /// Match a slice of bytes with a pattern 83 | /// 84 | /// ``` 85 | /// let patt = &[0xFE,0xEE,b'+',0xED]; 86 | /// let mut m = lua_patterns::LuaPattern::from_bytes(patt); 87 | /// let bytes = &[0x00,0x01,0xFE,0xEE,0xEE,0xED,0xEF]; 88 | /// assert!(m.matches_bytes(bytes)); 89 | /// assert_eq!(&bytes[m.range()], &[0xFE,0xEE,0xEE,0xED]); 90 | /// ``` 91 | pub fn matches_bytes(&mut self, s: &[u8]) -> bool { 92 | self.n_match = str_match(s,self.patt,&mut self.matches).expect("Should not fail - report as bug"); 93 | self.n_match > 0 94 | } 95 | 96 | /// Match a string with a pattern 97 | /// 98 | /// ``` 99 | /// let mut m = lua_patterns::LuaPattern::new("(%a+) one"); 100 | /// let text = " hello one two"; 101 | /// assert!(m.matches(text)); 102 | /// ``` 103 | pub fn matches(&mut self, text: &str) -> bool { 104 | self.matches_bytes(text.as_bytes()) 105 | } 106 | 107 | /// Match a string, returning first capture if successful 108 | /// 109 | /// ``` 110 | /// let mut m = lua_patterns::LuaPattern::new("OK%s+(%d+)"); 111 | /// let res = m.match_maybe("and that's OK 400 to you"); 112 | /// assert_eq!(res, Some("400")); 113 | /// ``` 114 | pub fn match_maybe<'t>(&mut self, text: &'t str) -> Option<&'t str> { 115 | if self.matches(text) { 116 | Some(&text[self.first_capture()]) 117 | } else { 118 | None 119 | } 120 | } 121 | 122 | /// Match a string, returning first two explicit captures if successful 123 | /// 124 | /// ``` 125 | /// let mut p = lua_patterns::LuaPattern::new("%s*(%d+)%s+(%S+)"); 126 | /// let (int,rest) = p.match_maybe_2(" 233 hello dolly").unwrap(); 127 | /// assert_eq!(int,"233"); 128 | /// assert_eq!(rest,"hello"); 129 | /// ``` 130 | pub fn match_maybe_2<'t>(&mut self, text: &'t str) -> Option<(&'t str,&'t str)> { 131 | if self.matches(text) { 132 | let cc = self.match_captures(text); 133 | if cc.num_matches() != 3 { return None; } 134 | Some((cc.get(1),cc.get(2))) 135 | } else { 136 | None 137 | } 138 | } 139 | 140 | /// Match a string, returning first three explicit captures if successful 141 | /// 142 | /// ``` 143 | /// let mut p = lua_patterns::LuaPattern::new("(%d+)/(%d+)/(%d+)"); 144 | /// let (y,m,d) = p.match_maybe_3("2017/11/10").unwrap(); 145 | /// assert_eq!(y,"2017"); 146 | /// assert_eq!(m,"11"); 147 | /// assert_eq!(d,"10"); 148 | /// ``` 149 | pub fn 
match_maybe_3<'t>(&mut self, text: &'t str) -> Option<(&'t str,&'t str,&'t str)> { 150 | if self.matches(text) { 151 | let cc = self.match_captures(text); 152 | if cc.num_matches() != 4 { return None; } 153 | Some((cc.get(1),cc.get(2),cc.get(3))) 154 | } else { 155 | None 156 | } 157 | } 158 | 159 | /// Match and collect all captures as a vector of string slices 160 | /// 161 | /// ``` 162 | /// let mut m = lua_patterns::LuaPattern::new("(one).+"); 163 | /// assert_eq!(m.captures(" one two"), &["one two","one"]); 164 | /// ``` 165 | pub fn captures<'b>(&mut self, text: &'b str) -> Vec<&'b str> { 166 | let mut res = Vec::new(); 167 | self.capture_into(text, &mut res); 168 | res 169 | } 170 | 171 | /// A convenient way to access the captures with no allocation 172 | /// 173 | /// ```rust 174 | /// let text = " hello one"; 175 | /// let mut m = lua_patterns::LuaPattern::new("(%S+) one"); 176 | /// if m.matches(text) { 177 | /// let cc = m.match_captures(text); 178 | /// assert_eq!(cc.get(0), "hello one"); 179 | /// assert_eq!(cc.get(1), "hello"); 180 | /// } 181 | /// ``` 182 | pub fn match_captures<'b,'c>(&'c self, text: &'b str) -> Captures<'a,'b,'c> { 183 | Captures {m: self, text: text} 184 | } 185 | 186 | /// Match and collect all captures into the provided vector. 187 | /// 188 | /// ```rust 189 | /// let text = " hello one"; 190 | /// let mut m = lua_patterns::LuaPattern::new("(%S+) one"); 191 | /// let mut v = Vec::new(); 192 | /// if m.capture_into(text,&mut v) { 193 | /// assert_eq!(v, &["hello one","hello"]); 194 | /// } 195 | /// ``` 196 | pub fn capture_into<'b>(&mut self, text: &'b str, vec: &mut Vec<&'b str>) -> bool { 197 | self.matches(text); 198 | vec.clear(); 199 | for i in 0..self.n_match { 200 | vec.push(&text[self.capture(i)]); 201 | } 202 | self.n_match > 0 203 | } 204 | 205 | /// The full match (same as `capture(0)`) 206 | pub fn range(&self) -> ops::Range { 207 | self.capture(0) 208 | } 209 | 210 | /// Get the nth capture of the match. 211 | /// 212 | /// ``` 213 | /// let mut m = lua_patterns::LuaPattern::new("(%a+) one"); 214 | /// let text = " hello one two"; 215 | /// assert!(m.matches(text)); 216 | /// assert_eq!(m.capture(0),1..10); 217 | /// assert_eq!(m.capture(1),1..6); 218 | /// ``` 219 | pub fn capture(&self, i: usize) -> ops::Range { 220 | ops::Range{ 221 | start: self.matches[i].start as usize, 222 | end: self.matches[i].end as usize 223 | } 224 | } 225 | 226 | /// Get the 'first' capture of the match 227 | /// 228 | /// If there are no matches, this is the same as `range`, 229 | /// otherwise it's `capture(1)` 230 | pub fn first_capture(&self) -> ops::Range { 231 | let idx = if self.n_match > 1 {1} else {0}; 232 | self.capture(idx) 233 | } 234 | 235 | /// An iterator over all matches in a string. 236 | /// 237 | /// The matches are returned as string slices; if there are no 238 | /// captures the full match is used, otherwise the first capture. 239 | /// That is, this example will also work with the pattern "(%S+)". 240 | /// 241 | /// ``` 242 | /// let mut m = lua_patterns::LuaPattern::new("%S+"); 243 | /// let split: Vec<_> = m.gmatch("dog cat leopard wolf").collect(); 244 | /// assert_eq!(split,&["dog","cat","leopard","wolf"]); 245 | /// ``` 246 | pub fn gmatch<'b,'c>(&'c mut self, text: &'b str) -> GMatch<'a,'b,'c> { 247 | GMatch{m: self, text: text} 248 | } 249 | 250 | /// An iterator over all captures in a string. 
251 | /// 252 | /// The matches are returned as captures; this is a _streaming_ 253 | /// iterator, so don't try to collect the captures directly; extract 254 | /// the string slices using `get`. 255 | /// 256 | /// ``` 257 | /// let mut m = lua_patterns::LuaPattern::new("(%S)%S+"); 258 | /// let split: Vec<_> = m.gmatch_captures("dog cat leopard wolf") 259 | /// .map(|cc| cc.get(1)).collect(); 260 | /// assert_eq!(split,&["d","c","l","w"]); 261 | /// ``` 262 | pub fn gmatch_captures<'b,'c>(&'c mut self, text: &'b str) -> GMatchCaptures<'a,'b,'c> { 263 | GMatchCaptures{m: self, text: text} 264 | } 265 | 266 | /// An iterator over all matches in a slice of bytes. 267 | /// 268 | /// ``` 269 | /// let bytes = &[0xAA,0x01,0x01,0x03,0xBB,0x01,0x01,0x01]; 270 | /// let patt = &[0x01,b'+']; 271 | /// let mut m = lua_patterns::LuaPattern::from_bytes(patt); 272 | /// let mut iter = m.gmatch_bytes(bytes); 273 | /// assert_eq!(iter.next().unwrap(), &[0x01,0x01]); 274 | /// assert_eq!(iter.next().unwrap(), &[0x01,0x01,0x01]); 275 | /// assert_eq!(iter.next(), None); 276 | /// ``` 277 | pub fn gmatch_bytes<'b>(&'a mut self, bytes: &'b [u8]) -> GMatchBytes<'a,'b> { 278 | GMatchBytes{m: self, bytes: bytes} 279 | } 280 | 281 | /// Globally substitute all matches with a replacement 282 | /// provided by a function of the captures. 283 | /// 284 | /// ``` 285 | /// let mut m = lua_patterns::LuaPattern::new("%$(%S+)"); 286 | /// let res = m.gsub_with("hello $dolly you're so $fine!", 287 | /// |cc| cc.get(1).to_uppercase() 288 | /// ); 289 | /// assert_eq!(res, "hello DOLLY you're so FINE!"); 290 | /// ``` 291 | pub fn gsub_with (&mut self, text: &str, lookup: F) -> String 292 | where F: Fn(Captures)-> String { 293 | let mut slice = text; 294 | let mut res = String::new(); 295 | while self.matches(slice) { 296 | // full range of match 297 | let all = self.range(); 298 | // append everything up to match 299 | res.push_str(&slice[0..all.start]); 300 | let captures = Captures{m: self, text: slice}; 301 | let repl = lookup(captures); 302 | res.push_str(&repl); 303 | slice = &slice[all.end..]; 304 | } 305 | res.push_str(slice); 306 | res 307 | } 308 | 309 | /// Globally substitute all matches with a replacement string 310 | /// 311 | /// This string _may_ have capture references ("%0",..). Use "%%" 312 | /// to represent "%". Plain strings like "" work just fine ;) 313 | /// 314 | /// ``` 315 | /// let mut m = lua_patterns::LuaPattern::new("(%S+)%s*=%s*(%S+);%s*"); 316 | /// let res = m.gsub("a=2; b=3; c = 4;", "'%2':%1 "); 317 | /// assert_eq!(res,"'2':a '3':b '4':c "); 318 | /// ``` 319 | pub fn gsub (&mut self, text: &str, repl: &str) -> String { 320 | let repl = generate_gsub_patterns(repl); 321 | let mut slice = text; 322 | let mut res = String::new(); 323 | while self.matches(slice) { 324 | let all = self.range(); 325 | res.push_str(&slice[0..all.start]); 326 | let captures = Captures{m: self, text: slice}; 327 | for r in &repl { 328 | match *r { 329 | Subst::Text(ref s) => res.push_str(&s), 330 | Subst::Capture(i) => res.push_str(captures.get(i)) 331 | } 332 | } 333 | slice = &slice[all.end..]; 334 | } 335 | res.push_str(slice); 336 | res 337 | } 338 | 339 | /// Globally substitute all _byte_ matches with a replacement 340 | /// provided by a function of the captures. 
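/// The captures are raw `&[u8]` slices and the closure returns the replacement bytes as a `Vec<u8>`.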
341 | /// 342 | /// ``` 343 | /// let bytes = &[0xAA,0x01,0x02,0x03,0xBB]; 344 | /// let patt = &[0x01,0x02]; 345 | /// let mut m = lua_patterns::LuaPattern::from_bytes(patt); 346 | /// let res = m.gsub_bytes_with(bytes,|cc| vec![0xFF]); 347 | /// assert_eq!(res, &[0xAA,0xFF,0x03,0xBB]); 348 | /// ``` 349 | pub fn gsub_bytes_with (&mut self, bytes: &[u8], lookup: F) -> Vec 350 | where F: Fn(ByteCaptures)-> Vec { 351 | let mut slice = bytes; 352 | let mut res = Vec::new(); 353 | while self.matches_bytes(slice) { 354 | let all = self.range(); 355 | let capture = &slice[0..all.start]; 356 | res.extend_from_slice(capture); 357 | let captures = ByteCaptures{m: self, bytes: slice}; 358 | let repl = lookup(captures); 359 | res.extend(repl); 360 | slice = &slice[all.end..]; 361 | } 362 | res.extend_from_slice(slice); 363 | res 364 | } 365 | 366 | } 367 | 368 | #[derive(Debug)] 369 | pub enum Subst { 370 | Text(String), 371 | Capture(usize) 372 | } 373 | 374 | impl Subst { 375 | fn new_text(text: &str) -> Subst { 376 | Subst::Text(text.to_string()) 377 | } 378 | } 379 | 380 | pub fn generate_gsub_patterns(repl: &str) -> Vec { 381 | let mut m = LuaPattern::new("%%([%%%d])"); 382 | let mut res = Vec::new(); 383 | let mut slice = repl; 384 | while m.matches(slice) { 385 | let all = m.range(); 386 | let before = &slice[0..all.start]; 387 | if before != "" { 388 | res.push(Subst::new_text(before)); 389 | } 390 | let capture = &slice[m.capture(1)]; 391 | if capture == "%" { // escaped literal '%' 392 | res.push(Subst::new_text("%")); 393 | } else { // has to be a digit 394 | let index: usize = capture.parse().unwrap(); 395 | res.push(Subst::Capture(index)); 396 | } 397 | slice = &slice[all.end..]; 398 | } 399 | res.push(Subst::new_text(slice)); 400 | res 401 | } 402 | 403 | pub struct Substitute { 404 | repl: Vec 405 | } 406 | 407 | impl Substitute { 408 | pub fn new(repl: &str) -> Substitute { 409 | Substitute{ 410 | repl: generate_gsub_patterns(repl) 411 | } 412 | } 413 | 414 | pub fn subst(&self, patt: &LuaPattern, text: &str) -> String { 415 | let mut res = String::new(); 416 | let captures = patt.match_captures(text); 417 | for r in &self.repl { 418 | match *r { 419 | Subst::Text(ref s) => res.push_str(&s), 420 | Subst::Capture(i) => res.push_str(captures.get(i)) 421 | } 422 | } 423 | res 424 | } 425 | 426 | } 427 | 428 | 429 | 430 | /// Low-overhead convenient access to string match captures 431 | // note: there are three borrows going on here. 
432 | // The lifetime 'a is for the _pattern_, the lifetime 'b is 433 | // for the _source string_, and 'c is for the reference to LuaPattern 434 | // And the LuaPattern reference cannot live longer than the pattern reference 435 | pub struct Captures<'a,'b,'c> where 'a: 'c { 436 | m: &'c LuaPattern<'a>, 437 | text: &'b str 438 | } 439 | 440 | impl <'a,'b,'c> Captures<'a,'b,'c> { 441 | /// get the capture as a string slice 442 | pub fn get(&self, i: usize) -> &'b str { 443 | &self.text[self.m.capture(i)] 444 | } 445 | 446 | /// number of matches 447 | pub fn num_matches(&self) -> usize { 448 | self.m.n_match 449 | } 450 | } 451 | 452 | 453 | /// Low-overhead convenient access to byte match captures 454 | pub struct ByteCaptures<'a,'b> { 455 | m: &'a LuaPattern<'a>, 456 | bytes: &'b [u8] 457 | } 458 | 459 | impl <'a,'b> ByteCaptures<'a,'b> { 460 | /// get the capture as a byte slice 461 | pub fn get(&self, i: usize) -> &'b [u8] { 462 | &self.bytes[self.m.capture(i)] 463 | } 464 | 465 | /// number of matches 466 | pub fn num_matches(&self) -> usize { 467 | self.m.n_match 468 | } 469 | } 470 | 471 | /// Iterator for all string slices from `gmatch` 472 | // note lifetimes as for Captures above! 473 | pub struct GMatch<'a,'b,'c> where 'a: 'c { 474 | m: &'c mut LuaPattern<'a>, 475 | text: &'b str 476 | } 477 | 478 | impl <'a,'b,'c>Iterator for GMatch<'a,'b,'c> { 479 | type Item = &'b str; 480 | 481 | fn next(&mut self) -> Option { 482 | if ! self.m.matches(self.text) { 483 | None 484 | } else { 485 | let slice = &self.text[self.m.first_capture()]; 486 | self.text = &self.text[self.m.range().end..]; 487 | Some(slice) 488 | } 489 | } 490 | 491 | } 492 | 493 | /// Unsafe version of Captures, needed for gmatch_captures 494 | // It's unsafe because the lifetime only depends on the original 495 | // text, not the borrowed matches. 496 | pub struct CapturesUnsafe<'b>{ 497 | matches: *const LuaMatch, 498 | text: &'b str 499 | } 500 | 501 | impl <'b> CapturesUnsafe<'b> { 502 | /// get the capture as a string slice 503 | pub fn get(&self, i: usize) -> &'b str { 504 | unsafe { 505 | let p = self.matches.offset(i as isize); 506 | let range = 507 | ops::Range{ 508 | start: (*p).start as usize, 509 | end: (*p).end as usize 510 | }; 511 | &self.text[range] 512 | } 513 | } 514 | } 515 | 516 | /// Streaming iterator for all captures from `gmatch_captures` 517 | // lifetimes as for Captures above! 518 | // 'a is pattern, 'b is text, 'c is ref to LuaPattern 519 | pub struct GMatchCaptures<'a,'b,'c> where 'a: 'c { 520 | m: &'c mut LuaPattern<'a>, 521 | text: &'b str 522 | } 523 | 524 | impl <'a,'b,'c> Iterator for GMatchCaptures<'a,'b,'c> where 'a: 'c { 525 | type Item = CapturesUnsafe<'b>; 526 | 527 | fn next(&mut self) -> Option { 528 | if ! self.m.matches(self.text) { 529 | None 530 | } else { 531 | let split = self.text.split_at(self.m.range().end); 532 | self.text = split.1; 533 | let match_ptr: *const LuaMatch = self.m.matches.as_ptr(); 534 | Some(CapturesUnsafe{matches: match_ptr, text: split.0}) 535 | } 536 | } 537 | 538 | } 539 | 540 | /// Iterator for all byte slices from `gmatch_bytes` 541 | pub struct GMatchBytes<'a,'b> { 542 | m: &'a mut LuaPattern<'a>, 543 | bytes: &'b [u8] 544 | } 545 | 546 | impl <'a,'b>Iterator for GMatchBytes<'a,'b> { 547 | type Item = &'b [u8]; 548 | 549 | fn next(&mut self) -> Option { 550 | if ! 
self.m.matches_bytes(self.bytes) { 551 | None 552 | } else { 553 | let slice = &self.bytes[self.m.first_capture()]; 554 | self.bytes = &self.bytes[self.m.range().end..]; 555 | Some(slice) 556 | } 557 | } 558 | 559 | } 560 | 561 | /// Build a byte Lua pattern, optionally escaping 'magic' characters 562 | pub struct LuaPatternBuilder { 563 | bytes: Vec 564 | } 565 | 566 | impl LuaPatternBuilder { 567 | /// Create a new Lua pattern builder 568 | pub fn new() -> LuaPatternBuilder { 569 | LuaPatternBuilder{bytes: Vec::new()} 570 | } 571 | 572 | /// Add unescaped characters from a string 573 | /// 574 | /// ``` 575 | /// let patt = lua_patterns::LuaPatternBuilder::new() 576 | /// .text("(boo)") 577 | /// .build(); 578 | /// assert_eq!(std::str::from_utf8(&patt).unwrap(), "(boo)"); 579 | /// ``` 580 | pub fn text(&mut self, s: &str) -> &mut Self { 581 | self.bytes.extend_from_slice(s.as_bytes()); 582 | self 583 | } 584 | 585 | /// Add unescaped characters from lines 586 | /// 587 | /// This looks for first non-whitespace run in each line, 588 | /// useful for spreading patterns out and commmenting them. 589 | /// Works with patterns that use '%s' religiously! 590 | /// 591 | /// ``` 592 | /// let patt = lua_patterns::LuaPatternBuilder::new() 593 | /// .text_lines(" 594 | /// hello-dolly 595 | /// you-are-fine # comment 596 | /// cool 597 | /// ") 598 | /// .build(); 599 | /// assert_eq!(std::str::from_utf8(&patt).unwrap(), 600 | /// "hello-dollyyou-are-finecool"); 601 | /// ``` 602 | pub fn text_lines(&mut self, lines: &str) -> &mut Self { 603 | let mut text = String::new(); 604 | for line in lines.lines() { 605 | if let Some(first) = line.split_whitespace().next() { 606 | text.push_str(first); 607 | } 608 | } 609 | self.text(&text) 610 | } 611 | 612 | /// Add escaped bytes from a slice 613 | /// 614 | /// ``` 615 | /// let patt = lua_patterns::LuaPatternBuilder::new() 616 | /// .text("^") 617 | /// .bytes(b"^") // magic character! 618 | /// .build(); 619 | /// assert_eq!(std::str::from_utf8(&patt).unwrap(), "^%^"); 620 | /// ``` 621 | pub fn bytes(&mut self, b: &[u8]) -> &mut Self { 622 | let mut m = LuaPattern::new("[%-%.%+%[%]%(%)%$%^%%%?%*]"); 623 | let bb = m.gsub_bytes_with(b,|cc| { 624 | let mut res = Vec::new(); 625 | res.push(b'%'); 626 | res.push(cc.get(0)[0]); 627 | res 628 | }); 629 | self.bytes.extend(bb); 630 | self 631 | } 632 | 633 | /// Add escaped bytes from hex string 634 | /// 635 | /// This consists of adjacent pairs of hex digits. 
636 | /// 637 | /// ``` 638 | /// let patt = lua_patterns::LuaPatternBuilder::new() 639 | /// .text("^") 640 | /// .bytes_as_hex("5E") // which is ASCII '^' 641 | /// .build(); 642 | /// assert_eq!(std::str::from_utf8(&patt).unwrap(), "^%^"); 643 | /// ``` 644 | pub fn bytes_as_hex(&mut self, bs: &str) -> &mut Self { 645 | let bb = LuaPatternBuilder::hex_to_bytes(bs); 646 | self.bytes(&bb) 647 | } 648 | 649 | /// Create the pattern 650 | pub fn build(&mut self) -> Vec { 651 | let mut v = Vec::new(); 652 | std::mem::swap(&mut self.bytes, &mut v); 653 | v 654 | } 655 | 656 | /// Utility to create a vector of bytes from a hex string 657 | /// 658 | /// ``` 659 | /// let bb = lua_patterns::LuaPatternBuilder::hex_to_bytes("AEFE00FE"); 660 | /// assert_eq!(bb, &[0xAE,0xFE,0x00,0xFE]); 661 | /// ``` 662 | pub fn hex_to_bytes(s: &str) -> Vec { 663 | let mut m = LuaPattern::new("%x%x"); 664 | m.gmatch(s).map(|pair| u8::from_str_radix(pair,16).unwrap()).collect() 665 | } 666 | 667 | /// Utility to create a hex string from a slice of bytes 668 | /// 669 | /// ``` 670 | /// let hex = lua_patterns::LuaPatternBuilder::bytes_to_hex(&[0xAE,0xFE,0x00,0xFE]); 671 | /// assert_eq!(hex,"AEFE00FE"); 672 | /// 673 | /// ``` 674 | pub fn bytes_to_hex(s: &[u8]) -> String { 675 | s.iter().map(|b| format!("{:02X}",b)).collect() 676 | } 677 | 678 | } 679 | 680 | #[cfg(test)] 681 | mod tests { 682 | use super::*; 683 | 684 | #[test] 685 | fn captures_and_matching() { 686 | let mut m = LuaPattern::new("(one).+"); 687 | assert_eq!(m.captures(" one two"), &["one two","one"]); 688 | let empty: &[&str] = &[]; 689 | assert_eq!(m.captures("four"), empty); 690 | 691 | assert_eq!(m.matches("one dog"), true); 692 | assert_eq!(m.matches("dog one "), true); 693 | assert_eq!(m.matches("dog one"), false); 694 | 695 | let text = "one dog"; 696 | let mut m = LuaPattern::new("^(%a+)"); 697 | assert_eq!(m.matches(text), true); 698 | assert_eq!(&text[m.capture(1)], "one"); 699 | assert_eq!(m.matches(" one dog"), false); 700 | 701 | // captures without allocation 702 | m.matches(text); 703 | let captures = m.match_captures(text); 704 | assert_eq!(captures.get(0), "one"); 705 | assert_eq!(captures.get(1), "one"); 706 | 707 | let mut m = LuaPattern::new("(%S+)%s*=%s*(.+)"); 708 | 709 | // captures as Vec 710 | let cc = m.captures(" hello= bonzo dog"); 711 | assert_eq!(cc[0], "hello= bonzo dog"); 712 | assert_eq!(cc[1], "hello"); 713 | assert_eq!(cc[2], "bonzo dog"); 714 | 715 | } 716 | 717 | #[test] 718 | fn multiple_captures() { 719 | let mut p = LuaPattern::new("%s*(%d+)%s+(%S+)"); 720 | let (int,rest) = p.match_maybe_2(" 233 hello dolly").unwrap(); 721 | assert_eq!(int,"233"); 722 | assert_eq!(rest,"hello"); 723 | } 724 | 725 | #[test] 726 | fn gmatch() { 727 | let mut m = LuaPattern::new("%a+"); 728 | let mut iter = m.gmatch("one two three"); 729 | assert_eq!(iter.next(), Some("one")); 730 | assert_eq!(iter.next(), Some("two")); 731 | assert_eq!(iter.next(), Some("three")); 732 | assert_eq!(iter.next(), None); 733 | 734 | let mut m = LuaPattern::new("(%a+)"); 735 | let mut iter = m.gmatch("one two three"); 736 | assert_eq!(iter.next(), Some("one")); 737 | assert_eq!(iter.next(), Some("two")); 738 | assert_eq!(iter.next(), Some("three")); 739 | assert_eq!(iter.next(), None); 740 | 741 | let mut m = LuaPattern::new("(%a+)"); 742 | let mut iter = m.gmatch_captures("one two three"); 743 | assert_eq!(iter.next().unwrap().get(1), "one"); 744 | assert_eq!(iter.next().unwrap().get(1), "two"); 745 | assert_eq!(iter.next().unwrap().get(1), 
"three"); 746 | } 747 | 748 | #[test] 749 | fn gsub() { 750 | use std::collections::HashMap; 751 | 752 | let mut m = LuaPattern::new("%$(%S+)"); 753 | let res = m.gsub_with("hello $dolly you're so $fine!", 754 | |cc| cc.get(1).to_uppercase() 755 | ); 756 | assert_eq!(res, "hello DOLLY you're so FINE!"); 757 | 758 | let mut map = HashMap::new(); 759 | map.insert("dolly", "baby"); 760 | map.insert("fine", "cool"); 761 | map.insert("good-looking", "pretty"); 762 | 763 | let mut m = LuaPattern::new("%$%((.-)%)"); 764 | let res = m.gsub_with("hello $(dolly) you're so $(fine) and $(good-looking)", 765 | |cc| map.get(cc.get(1)).unwrap_or(&"?").to_string() 766 | ); 767 | assert_eq!(res, "hello baby you're so cool and pretty"); 768 | 769 | let mut m = LuaPattern::new("%s+"); 770 | let res = m.gsub("hello dolly you're so fine",""); 771 | assert_eq!(res, "hellodollyyou'resofine"); 772 | 773 | let mut m = LuaPattern::new("(%S+)%s*=%s*(%S+);%s*"); 774 | let res = m.gsub("a=2; b=3; c = 4;", "'%2':%1 "); 775 | assert_eq!(res,"'2':a '3':b '4':c "); 776 | } 777 | 778 | #[test] 779 | fn bad_patterns() { 780 | let bad = [ 781 | ("bonzo %","malformed pattern (ends with '%')"), 782 | ("bonzo (dog%(","unfinished capture"), 783 | ("alles [%a%[","malformed pattern (missing ']')"), 784 | ("bonzo (dog (cat)","unfinished capture"), 785 | ("frodo %f[%A","malformed pattern (missing ']')"), 786 | ("frodo (1) (2(3)%2)%1","invalid capture index %2"), 787 | ]; 788 | for p in bad.iter() { 789 | let res = LuaPattern::new_try(p.0); 790 | if let Err(e) = res { 791 | assert_eq!(e, PatternError(p.1.into())); 792 | } else { 793 | panic!("false positive"); 794 | } 795 | } 796 | } 797 | } 798 | -------------------------------------------------------------------------------- /lua-patterns/src/luapat.rs: -------------------------------------------------------------------------------- 1 | // translation of Lua 5.2 string pattern code 2 | 3 | use errors::*; 4 | use std::ptr::null; 5 | 6 | pub const LUA_MAXCAPTURES: usize = 32; 7 | /* maximum recursion depth for 'match' */ 8 | const MAXCCALLS: usize = 200; 9 | 10 | const L_ESC: u8 = b'%'; 11 | 12 | fn add(p: CPtr, count: usize) -> CPtr { 13 | unsafe {p.offset(count as isize)} 14 | } 15 | 16 | fn sub(p: CPtr, count: usize) -> CPtr { 17 | unsafe {p.offset(-(count as isize))} 18 | } 19 | 20 | fn next(p: CPtr) -> CPtr { 21 | add(p, 1) 22 | } 23 | 24 | fn at(p: CPtr) -> u8 { 25 | unsafe { *p } 26 | } 27 | 28 | fn diff(p1: CPtr, p2: CPtr) -> usize { 29 | let d = (p1 as isize).wrapping_sub(p2 as isize); 30 | d as usize 31 | } 32 | 33 | #[derive(Copy,Clone,Debug)] 34 | pub struct LuaMatch { 35 | pub start: usize, 36 | pub end: usize, 37 | } 38 | 39 | #[derive(Copy,Clone)] 40 | enum CapLen { 41 | Len(usize), 42 | Unfinished, 43 | Position, 44 | } 45 | 46 | impl CapLen { 47 | fn is_unfinished(&self) -> bool { 48 | match *self { 49 | CapLen::Unfinished => true, 50 | _ => false 51 | } 52 | } 53 | 54 | fn size(&self) -> Result { 55 | match *self { 56 | CapLen::Len(size) => Ok(size), 57 | _ => error("capture was unfinished or positional") 58 | } 59 | } 60 | 61 | } 62 | 63 | type CPtr = *const u8; 64 | 65 | #[derive(Copy,Clone)] 66 | struct Capture { 67 | init: CPtr, 68 | len: CapLen, 69 | } 70 | 71 | impl Capture { 72 | fn is_unfinished(&self) -> bool { 73 | self.len.is_unfinished() 74 | } 75 | } 76 | 77 | use std::result; 78 | 79 | type Result = result::Result; 80 | 81 | fn error(msg: &str) -> Result { 82 | Err(PatternError(msg.into())) 83 | } 84 | 85 | struct MatchState { 86 | matchdepth: usize, 
/* control for recursive depth (to avoid stack overflow) */ 87 | src_init: CPtr, /* init of source string */ 88 | src_end: CPtr, /* end ('\0') of source string */ 89 | p_end: CPtr, /* end ('\0') of pattern */ 90 | level: usize, /* total number of captures (finished or unfinished) */ 91 | capture: [Capture; LUA_MAXCAPTURES], 92 | } 93 | 94 | impl MatchState { 95 | fn new(s: CPtr, se: CPtr, pe: CPtr) -> MatchState { 96 | MatchState { 97 | matchdepth: MAXCCALLS, 98 | src_init: s, 99 | src_end: se, 100 | p_end: pe, 101 | level: 0, 102 | capture: [Capture{init: null(), len: CapLen::Len(0) }; LUA_MAXCAPTURES], 103 | } 104 | } 105 | 106 | fn check_capture(&self, l: usize) -> Result { 107 | let l = l as i8 - b'1' as i8; 108 | if l < 0 || l as usize >= self.level || self.capture[l as usize].is_unfinished() { 109 | return error(&format!("invalid capture index %{}", l + 1)); 110 | } 111 | Ok(l as usize) 112 | } 113 | 114 | fn capture_to_close(&self) -> Result { 115 | let mut level = (self.level - 1) as isize; 116 | while level >= 0 { 117 | if self.capture[level as usize].is_unfinished() { 118 | return Ok(level as usize); 119 | } 120 | level -= 1; 121 | } 122 | error("invalid pattern capture") 123 | } 124 | 125 | fn classend (&self, p: CPtr) -> Result { 126 | let ch = at(p); 127 | let mut next_p = next(p); 128 | Ok(match ch { 129 | L_ESC => { 130 | if next_p == self.p_end { 131 | return error("malformed pattern (ends with '%')"); 132 | } 133 | next(next_p) 134 | }, 135 | b'[' => { 136 | if at(next_p) == b'^' { 137 | next_p = next(next_p); 138 | } 139 | while at(next_p) != b']' { 140 | if next_p == self.p_end { 141 | return error("malformed pattern (missing ']')"); 142 | } 143 | let ch = at(next_p); 144 | next_p = next(next_p); 145 | if ch == L_ESC && p < self.p_end { 146 | next_p = next(next_p); /* skip escapes (e.g. `%]') */ 147 | } 148 | } 149 | next(next_p) 150 | }, 151 | _ => next_p 152 | }) 153 | } 154 | 155 | } 156 | 157 | fn match_class (ch: u8, class: u8) -> bool { 158 | let res = match class.to_ascii_lowercase() { 159 | b'a' => ch.is_ascii_alphabetic(), 160 | b'c' => ch.is_ascii_control(), 161 | b'd' => ch.is_ascii_digit(), 162 | b'g' => ch.is_ascii_graphic(), 163 | b'l' => ch.is_ascii_lowercase(), 164 | b'p' => ch.is_ascii_punctuation(), 165 | b's' => ch.is_ascii_whitespace(), 166 | b'u' => ch.is_ascii_uppercase(), 167 | b'w' => ch.is_ascii_alphanumeric(), 168 | b'x' => ch.is_ascii_hexdigit(), 169 | lc => return lc == ch, 170 | }; 171 | if class.is_ascii_lowercase() { res } else {! res} 172 | } 173 | 174 | 175 | fn matchbracketclass (c: u8, p: CPtr, ec: CPtr) -> bool { 176 | let mut p = p; 177 | // [^ inverts match 178 | let sig = if at(next(p)) == b'^' { 179 | p = next(p); 180 | false 181 | } else { 182 | true 183 | }; 184 | p = next(p); 185 | while p < ec { 186 | if at(p) == L_ESC { // e.g %s 187 | p = next(p); 188 | if match_class(c, at(p)) { 189 | return sig; 190 | } 191 | } else 192 | // e.g a-z 193 | if at(next(p)) == b'-' && add(p,2) < ec { 194 | let lastc = at(p); 195 | p = add(p,2); 196 | if lastc <= c && c <= at(p) { 197 | return sig; 198 | } 199 | } else 200 | if at(p) == c { 201 | return sig; 202 | } 203 | p = next(p); 204 | } 205 | return ! sig; 206 | } 207 | 208 | impl MatchState { 209 | 210 | fn singlematch (&self, s: CPtr, p: CPtr, ep: CPtr) -> bool { 211 | if s >= self.src_end { 212 | return false; 213 | } 214 | let c = at(s); 215 | let pc = at(p); 216 | match pc { 217 | b'.' 
=> true, /* matches any char */ 218 | L_ESC => match_class(c, at(next(p))), 219 | b'[' => matchbracketclass(c, p, sub(ep,1)), 220 | _ => c == pc 221 | } 222 | } 223 | 224 | fn matchbalance (&self, s: CPtr, p: CPtr) -> Result { 225 | if p >= sub(self.p_end,1) { 226 | return error("malformed pattern (missing arguments to '%b')"); 227 | } 228 | if at(s) != at(p) { 229 | return Ok(null()); 230 | } 231 | // e.g. %b() 232 | let b = at(p); 233 | let e = at(next(p)); 234 | let mut cont = 1; 235 | let mut s = next(s); 236 | while s < self.src_end { 237 | let ch = at(s); 238 | if ch == e { 239 | cont -= 1; 240 | if cont == 0 { 241 | return Ok(next(s)); 242 | } 243 | } else 244 | if ch == b { 245 | cont += 1; 246 | } 247 | s = next(s); 248 | } 249 | Ok(null()) /* string ends out of balance */ 250 | } 251 | 252 | fn max_expand(&mut self, s: CPtr, p: CPtr, ep: CPtr) -> Result { 253 | let mut i = 0isize; /* counts maximum expand for item */ 254 | while self.singlematch(add(s,i as usize),p,ep) { 255 | i += 1; 256 | } 257 | /* keeps trying to match with the maximum repetitions */ 258 | while i >= 0 { 259 | let res = self.patt_match(add(s,i as usize),next(ep))?; 260 | if ! res.is_null() { 261 | return Ok(res); 262 | } 263 | i -= 1; /* else didn't match; reduce 1 repetition to try again */ 264 | } 265 | Ok(null()) 266 | } 267 | 268 | fn min_expand(&mut self, s: CPtr, p: CPtr, ep: CPtr) -> Result { 269 | let mut s = s; 270 | loop { 271 | let res = self.patt_match(s,next(ep))?; 272 | if ! res.is_null() { 273 | return Ok(res); 274 | } else 275 | if self.singlematch(s, p, ep) { 276 | s = next(s); 277 | } else { 278 | return Ok(null()); 279 | } 280 | } 281 | } 282 | 283 | fn start_capture(&mut self, s: CPtr, p: CPtr, what: CapLen) -> Result { 284 | let level = self.level; 285 | if level >= LUA_MAXCAPTURES { 286 | return error("too many captures"); 287 | } 288 | self.capture[level].init = s; 289 | self.capture[level].len = what; 290 | self.level = level + 1; 291 | let res = self.patt_match(s, p)?; 292 | if res.is_null() { /* match failed? */ 293 | self.level -= 1; /* undo capture */ 294 | } 295 | Ok(res) 296 | } 297 | 298 | fn end_capture(&mut self, s: CPtr, p: CPtr) -> Result { 299 | let l = self.capture_to_close()?; 300 | self.capture[l].len = CapLen::Len(diff(s,self.capture[l].init)); /* close capture */ 301 | let res = self.patt_match(s, p)?; 302 | if res.is_null() { /* match failed? */ 303 | self.capture[l].len = CapLen::Unfinished; 304 | } 305 | Ok(res) 306 | } 307 | 308 | fn match_capture(&mut self, s: CPtr, l: usize) -> Result { 309 | let l = self.check_capture(l)?; 310 | let len = self.capture[l].len.size()?; 311 | if diff(self.src_end, s) >= len { 312 | unsafe {s.copy_to_nonoverlapping(self.capture[l].init as *mut u8, len);} 313 | return Ok(add(s,len)); 314 | } 315 | Ok(null()) 316 | } 317 | 318 | 319 | fn patt_match(&mut self, s: CPtr, p: CPtr) -> Result { 320 | let mut s = s; 321 | let mut p = p; 322 | self.matchdepth -= 1; 323 | if self.matchdepth == 0 { 324 | return error("pattern too complex"); 325 | } 326 | 327 | if p == self.p_end { /* end of pattern? */ 328 | self.matchdepth += 1; 329 | return Ok(s); 330 | } 331 | match at(p) { 332 | b'(' => { /* start capture */ 333 | if at(next(p)) == b')' { /* position capture? 
*/ 334 | s = self.start_capture(s, add(p,2), CapLen::Position)?; 335 | } else { 336 | s = self.start_capture(s, next(p), CapLen::Unfinished)?; 337 | } 338 | }, 339 | b')' => { /* end capture */ 340 | s = self.end_capture(s, next(p))?; 341 | }, 342 | b'$' => { 343 | if next(p) != self.p_end { /* is the `$' the last char in pattern? */ 344 | /* no; go to default */ 345 | return self.patt_default_match(s, p); 346 | } 347 | s = if s == self.src_end {s} else {null()}; /* check end of string */ 348 | } 349 | L_ESC => { /* escaped sequences not in the format class[*+?-]? */ 350 | match at(next(p)) { 351 | b'b' => { /* balanced string? */ 352 | s = self.matchbalance(s, add(p,2))?; 353 | if ! s.is_null() { 354 | // e.g, after %b() 355 | return self.patt_match(s, add(p,4)); 356 | } 357 | }, 358 | b'f' => { /* frontier? */ 359 | p = add(p,2); 360 | if at(p) != b'[' { 361 | return error("missing '[' after '%f' in pattern"); 362 | } 363 | let ep = self.classend(p)?; /* points to what is next */ 364 | let previous = if s == self.src_init {b'\0'} else {at(sub(s,1))}; 365 | let epl = sub(ep,1); 366 | if ! matchbracketclass(previous,p,epl) 367 | && matchbracketclass(at(s),p,epl) { 368 | return self.patt_match(s, ep); 369 | } 370 | s = null(); /* match failed */ 371 | }, 372 | b'0'..=b'9' => { /* capture results (%0-%9)? */ 373 | s = self.match_capture(s,at(next(p)) as usize)?; 374 | if ! s.is_null() { 375 | return self.patt_match(s, add(p,2)); 376 | } 377 | }, 378 | _ => return self.patt_default_match(s, p) 379 | } 380 | 381 | }, 382 | _ => return self.patt_default_match(s, p) 383 | 384 | } 385 | self.matchdepth += 1; 386 | Ok(s) 387 | } 388 | 389 | fn patt_default_match(&mut self, s: CPtr, p: CPtr) -> Result { 390 | let mut s = s; 391 | /* pattern class plus optional suffix */ 392 | let ep = self.classend(p)?; /* points to optional suffix */ 393 | let epc = if ep == self.p_end { 0 } else { at(ep) }; 394 | /* does not match at least once? */ 395 | if ! self.singlematch(s, p, ep) { 396 | if epc == b'*' || epc == b'?' || epc == b'-' { /* accept empty? */ 397 | return self.patt_match(s, next(ep)); 398 | } else { /* '+' or no suffix */ 399 | s = null(); /* fail */ 400 | } 401 | } else { /* matched once */ 402 | match at(ep) { /* handle optional suffix */ 403 | b'?' => { 404 | let res = self.patt_match(next(s),next(ep))?; 405 | if ! res.is_null() { 406 | s = res; 407 | } else { 408 | return self.patt_match(s, next(ep)); 409 | } 410 | }, 411 | b'+' => { /* 1 or more repetitions */ 412 | s = next(s); 413 | s = self.max_expand(s, p, ep)?; 414 | }, 415 | b'*' => { /* 0 or more repetitions */ 416 | s = self.max_expand(s, p, ep)?; 417 | }, 418 | b'-' => { /* 0 or more repetitions (minimum) */ 419 | s = self.min_expand(s, p, ep)? 
; 420 | }, 421 | _ => { /* no suffix */ 422 | return self.patt_match(next(s),ep); 423 | } 424 | } 425 | } 426 | self.matchdepth += 1; 427 | Ok(s) 428 | } 429 | 430 | fn push_onecapture(&mut self, i: usize, s: CPtr, e: CPtr, mm: &mut [LuaMatch]) -> Result<()> { 431 | if i >= self.level { 432 | if i == 0 { /* ms->level == 0, too */ 433 | mm[0].start = 0; 434 | mm[0].end = diff(e,s); 435 | Ok(()) 436 | } else { 437 | return error("invalid capture index"); 438 | } 439 | } else { 440 | let init = self.capture[i].init; 441 | match self.capture[i].len { 442 | CapLen::Unfinished => error("unfinished capture"), 443 | CapLen::Position => { 444 | mm[i].start = diff(init,next(self.src_init)); 445 | mm[i].end = mm[i].start; 446 | Ok(()) 447 | }, 448 | CapLen::Len(l) => { 449 | mm[i].start = diff(init,self.src_init); 450 | mm[i].end = mm[i].start + l; 451 | Ok(()) 452 | } 453 | } 454 | } 455 | 456 | } 457 | 458 | fn push_captures(&mut self, s: CPtr, e: CPtr, mm: &mut [LuaMatch]) -> Result { 459 | let nlevels = if self.level == 0 && ! s.is_null() {1} else {self.level}; 460 | for i in 0..nlevels { 461 | self.push_onecapture(i, s, e, mm)?; 462 | } 463 | Ok(nlevels) /* number of strings pushed */ 464 | } 465 | 466 | pub fn str_match_check(&mut self, p: CPtr) -> Result<()> { 467 | let mut level_stack = [0; LUA_MAXCAPTURES]; 468 | let mut stack_idx = 0; 469 | let mut p = p; 470 | while p < self.p_end { 471 | let ch = at(p); 472 | p = next(p); 473 | match ch { 474 | L_ESC => { 475 | //p = next(p); 476 | let c = at(p); 477 | match c { 478 | b'b' => { 479 | p = next(p); 480 | if p >= self.p_end { 481 | return error("malformed pattern (missing arguments to '%b')"); 482 | } 483 | }, 484 | b'f' => { 485 | p = next(p); 486 | if at(p) != b'[' { 487 | return error("missing '[' after '%f' in pattern"); 488 | } 489 | p = sub(p,1); // so we see [...] 
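// Backing up one byte here lets the outer scan loop reach the '[' again, so the b'[' arm below validates the frontier's bracket class like any other character class.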
490 | }, 491 | b'0' ..= b'9' => { 492 | let l = (c as i8) - (b'1' as i8); 493 | println!("level {}", self.level); 494 | if l < 0 || l as usize >= self.level || self.capture[l as usize].is_unfinished() { 495 | return error(&format!("invalid capture index %{}", l + 1)); 496 | } 497 | p = sub(p,1); 498 | }, 499 | _ => {} 500 | } 501 | }, 502 | b'[' => { 503 | while at(p) != b']' { 504 | if p == self.p_end { 505 | return error("malformed pattern (missing ']')"); 506 | } 507 | if at(p) == L_ESC && p < self.p_end { 508 | p = next(p); 509 | } 510 | p = next(p); 511 | } 512 | }, 513 | b'(' => { 514 | if at(p) != b')' { // not a position capture 515 | level_stack[stack_idx] = self.level; 516 | stack_idx += 1; 517 | self.capture[self.level].len = CapLen::Unfinished; 518 | self.level += 1; 519 | if self.level >= LUA_MAXCAPTURES { 520 | return error("too many captures"); 521 | } 522 | } else { 523 | p = next(p); 524 | } 525 | }, 526 | b')' => { 527 | if stack_idx == 0 { 528 | return error("no open capture"); 529 | } 530 | stack_idx -= 1; 531 | self.capture[level_stack[stack_idx]].len = CapLen::Position; 532 | }, 533 | _ => {} 534 | } 535 | } 536 | if stack_idx > 0 { 537 | return error("unfinished capture"); 538 | } 539 | Ok(()) 540 | } 541 | } 542 | 543 | pub fn str_match(s: &[u8], p: &[u8], mm: &mut [LuaMatch]) -> Result { 544 | let mut lp = p.len(); 545 | let mut p = p.as_ptr(); 546 | let ls = s.len(); 547 | let s = s.as_ptr(); 548 | let mut s1 = s; 549 | let anchor = at(p) == b'^'; 550 | if anchor { 551 | p = next(p); 552 | lp -= 1; /* skip anchor character */ 553 | } 554 | 555 | let mut ms = MatchState::new(s,add(s,ls),add(p,lp)); 556 | loop { 557 | let res = ms.patt_match(s1, p)?; 558 | if ! res.is_null() { 559 | mm[0].start = diff(s1,s); /* start */ 560 | mm[0].end = diff(res,s); /* end */ 561 | return Ok(ms.push_captures(null(),null(),&mut mm[1..])? + 1); 562 | } 563 | s1 = next(s1); 564 | if ! (s1 < ms.src_end && ! anchor) { 565 | break; 566 | } 567 | } 568 | Ok(0) 569 | } 570 | 571 | pub fn str_check(p: &[u8]) -> Result<()> { 572 | let mut lp = p.len(); 573 | let mut p = p.as_ptr(); 574 | let anchor = at(p) == b'^'; 575 | if anchor { 576 | p = next(p); 577 | lp -= 1; /* skip anchor character */ 578 | } 579 | let mut ms = MatchState::new(null(),null(),add(p,lp)); 580 | if at(sub(ms.p_end,1)) == b'%' { 581 | return error("malformed pattern (ends with '%')"); 582 | } 583 | ms.str_match_check(p)?; 584 | Ok(()) 585 | } 586 | 587 | /* 588 | fn check(s: &[u8], p: &[u8]) { 589 | if let Err(e) = str_check(p) { 590 | println!("check error {}",e); 591 | return; 592 | } 593 | 594 | let mut matches = [LuaMatch{start: 0, end: 0}; 10]; 595 | match str_match(s, p, &mut matches) { 596 | Ok(n) => { 597 | println!("ok {} matches", n); 598 | for i in 0..n { 599 | println!("match {:?} {:?}", 600 | matches[i], 601 | String::from_utf8(s[matches[i].start .. 
matches[i].end].to_vec()) 602 | ); 603 | } 604 | }, 605 | Err(e) => { 606 | println!("error: {}", e) 607 | } 608 | } 609 | } 610 | 611 | 612 | 613 | fn main() { 614 | let mut args = std::env::args().skip(1); 615 | let pat = args.next().unwrap(); 616 | let s = args.next().unwrap(); 617 | check(s.as_bytes(), pat.as_bytes()); 618 | 619 | //~ check(b"hello",b"%a"); 620 | //~ check(b"0hello",b"%a+"); 621 | //~ check(b"hello",b"%l(%a)"); 622 | //check(b"hello",b"he(l+)"); 623 | //check(b"k {and {so}}",b"k%s+(%b{})"); 624 | } 625 | */ 626 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | reorder_modules = false 2 | use_small_heuristics = "Max" 3 | tab_spaces = 2 4 | -------------------------------------------------------------------------------- /src/annot.rs: -------------------------------------------------------------------------------- 1 | mod generated; 2 | 3 | use std::fmt; 4 | 5 | pub(crate) use self::generated::*; 6 | 7 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 8 | pub(crate) enum Annot { 9 | Atom(Atom), 10 | Add(Comp), 11 | Sub(Comp), 12 | } 13 | 14 | impl PartialEq<Atom> for Annot { 15 | fn eq(&self, other: &Atom) -> bool { 16 | match self { 17 | Annot::Atom(it) => it == other, 18 | _ => false, 19 | } 20 | } 21 | } 22 | 23 | impl From<Atom> for Annot { 24 | fn from(value: Atom) -> Annot { 25 | Annot::Atom(value) 26 | } 27 | } 28 | 29 | impl fmt::Display for Annot { 30 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 31 | match self { 32 | Annot::Atom(it) => write!(f, "{it}"), 33 | Annot::Add(it) => write!(f, "+{it}"), 34 | Annot::Sub(it) => write!(f, "-{it}"), 35 | } 36 | } 37 | } 38 | 39 | impl Comp { 40 | pub(crate) fn add(self) -> Annot { 41 | Annot::Add(self) 42 | } 43 | pub(crate) fn sub(self) -> Annot { 44 | Annot::Sub(self) 45 | } 46 | } 47 | 48 | impl Default for Comp { 49 | fn default() -> Self { 50 | Comp::Para 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/annot/generated.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 3 | pub(crate) enum Comp { 4 | Verbatim, 5 | Email, 6 | Url, 7 | Subscript, 8 | Superscript, 9 | Para, 10 | CodeBlock, 11 | Imagetext, 12 | Linktext, 13 | Reference, 14 | Destination, 15 | Emph, 16 | Strong, 17 | Span, 18 | DoubleQuoted, 19 | ReferenceDefinition, 20 | Insert, 21 | Delete, 22 | Mark, 23 | Attributes, 24 | } 25 | 26 | impl fmt::Display for Comp { 27 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 28 | f.write_str(match self { 29 | Comp::Verbatim => "verbatim", 30 | Comp::Email => "email", 31 | Comp::Url => "url", 32 | Comp::Subscript => "subscript", 33 | Comp::Superscript => "superscript", 34 | Comp::Para => "para", 35 | Comp::CodeBlock => "code_block", 36 | Comp::Imagetext => "imagetext", 37 | Comp::Linktext => "linktext", 38 | Comp::Reference => "reference", 39 | Comp::Destination => "destination", 40 | Comp::Emph => "emph", 41 | Comp::Strong => "strong", 42 | Comp::Span => "span", 43 | Comp::DoubleQuoted => "double_quoted", 44 | Comp::ReferenceDefinition => "reference_definition", 45 | Comp::Insert => "insert", 46 | Comp::Delete => "delete", 47 | Comp::Mark => "mark", 48 | Comp::Attributes => "attributes", 49 | }) 50 | } 51 | } 52 | 53 | #[derive(Debug, Clone, Copy, PartialEq, Eq, 
PartialOrd, Ord, Hash)] 54 | pub(crate) enum Atom { 55 | Str, 56 | Escape, 57 | Hardbreak, 58 | Nbsp, 59 | Blankline, 60 | ImageMarker, 61 | LeftDoubleQuote, 62 | RightDoubleQuote, 63 | Ellipses, 64 | Softbreak, 65 | FootnoteReference, 66 | OpenMarker, 67 | Emoji, 68 | ReferenceKey, 69 | ReferenceValue, 70 | CodeLanguage, 71 | EmDash, 72 | EnDash, 73 | Id, 74 | Key, 75 | Value, 76 | Class, 77 | } 78 | 79 | impl Atom { 80 | pub(crate) fn is_left_atom(self) -> bool { 81 | matches!(self, | Atom::LeftDoubleQuote) 82 | } 83 | pub(crate) fn is_right_atom(self) -> bool { 84 | matches!(self, | Atom::RightDoubleQuote) 85 | } 86 | pub(crate) fn corresponding_left_atom(self) -> Atom { 87 | match self { 88 | Atom::RightDoubleQuote => Atom::LeftDoubleQuote, 89 | 90 | _ => self, 91 | } 92 | } 93 | pub(crate) fn corresponding_right_atom(self) -> Atom { 94 | match self { 95 | Atom::LeftDoubleQuote => Atom::RightDoubleQuote, 96 | 97 | _ => self, 98 | } 99 | } 100 | } 101 | 102 | impl fmt::Display for Atom { 103 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 104 | f.write_str(match self { 105 | Atom::Str => "str", 106 | Atom::Escape => "escape", 107 | Atom::Hardbreak => "hardbreak", 108 | Atom::Nbsp => "nbsp", 109 | Atom::Blankline => "blankline", 110 | Atom::ImageMarker => "image_marker", 111 | Atom::LeftDoubleQuote => "left_double_quote", 112 | Atom::RightDoubleQuote => "right_double_quote", 113 | Atom::Ellipses => "ellipses", 114 | Atom::Softbreak => "softbreak", 115 | Atom::FootnoteReference => "footnote_reference", 116 | Atom::OpenMarker => "open_marker", 117 | Atom::Emoji => "emoji", 118 | Atom::ReferenceKey => "reference_key", 119 | Atom::ReferenceValue => "reference_value", 120 | Atom::CodeLanguage => "code_language", 121 | Atom::EmDash => "em_dash", 122 | Atom::EnDash => "en_dash", 123 | Atom::Id => "id", 124 | Atom::Key => "key", 125 | Atom::Value => "value", 126 | Atom::Class => "class", 127 | }) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/ast.rs: -------------------------------------------------------------------------------- 1 | mod generated; 2 | 3 | use indexmap::IndexMap; 4 | 5 | pub use self::generated::*; 6 | 7 | pub type Attrs = IndexMap<String, String>; 8 | 9 | #[derive(Debug, Default, Clone, serde::Serialize)] 10 | pub struct ReferenceDefinition { 11 | #[serde(skip_serializing_if = "Attrs::is_empty")] 12 | pub attrs: Attrs, 13 | pub destination: String, 14 | } 15 | -------------------------------------------------------------------------------- /src/ast/generated.rs: -------------------------------------------------------------------------------- 1 | use super::Attrs; 2 | 3 | #[derive(Debug, Default, Clone, serde::Serialize)] 4 | pub struct Heading { 5 | #[serde(skip_serializing_if = "Attrs::is_empty")] 6 | pub attrs: Attrs, 7 | pub children: Vec<Tag>, 8 | pub level: u32, 9 | } 10 | 11 | #[derive(Debug, Default, Clone, serde::Serialize)] 12 | pub struct Para { 13 | #[serde(skip_serializing_if = "Attrs::is_empty")] 14 | pub attrs: Attrs, 15 | pub children: Vec<Tag>, 16 | } 17 | 18 | #[derive(Debug, Default, Clone, serde::Serialize)] 19 | pub struct Link { 20 | #[serde(skip_serializing_if = "Attrs::is_empty")] 21 | pub attrs: Attrs, 22 | pub children: Vec<Tag>, 23 | pub destination: Option<String>, 24 | pub reference: Option<String>, 25 | } 26 | 27 | #[derive(Debug, Default, Clone, serde::Serialize)] 28 | pub struct Image { 29 | #[serde(skip_serializing_if = "Attrs::is_empty")] 30 | pub attrs: Attrs, 31 | pub children: Vec<Tag>, 32 | pub destination: Option<String>, 
pub reference: Option<String>, 34 | } 35 | 36 | #[derive(Debug, Default, Clone, serde::Serialize)] 37 | pub struct CodeBlock { 38 | #[serde(skip_serializing_if = "Attrs::is_empty")] 39 | pub attrs: Attrs, 40 | pub children: Vec<Tag>, 41 | pub lang: Option<String>, 42 | pub text: String, 43 | } 44 | 45 | #[derive(Debug, Default, Clone, serde::Serialize)] 46 | pub struct Strong { 47 | #[serde(skip_serializing_if = "Attrs::is_empty")] 48 | pub attrs: Attrs, 49 | pub children: Vec<Tag>, 50 | } 51 | 52 | #[derive(Debug, Default, Clone, serde::Serialize)] 53 | pub struct Emph { 54 | #[serde(skip_serializing_if = "Attrs::is_empty")] 55 | pub attrs: Attrs, 56 | pub children: Vec<Tag>, 57 | } 58 | 59 | #[derive(Debug, Default, Clone, serde::Serialize)] 60 | pub struct Insert { 61 | #[serde(skip_serializing_if = "Attrs::is_empty")] 62 | pub attrs: Attrs, 63 | pub children: Vec<Tag>, 64 | } 65 | 66 | #[derive(Debug, Default, Clone, serde::Serialize)] 67 | pub struct Delete { 68 | #[serde(skip_serializing_if = "Attrs::is_empty")] 69 | pub attrs: Attrs, 70 | pub children: Vec<Tag>, 71 | } 72 | 73 | #[derive(Debug, Default, Clone, serde::Serialize)] 74 | pub struct Mark { 75 | #[serde(skip_serializing_if = "Attrs::is_empty")] 76 | pub attrs: Attrs, 77 | pub children: Vec<Tag>, 78 | } 79 | 80 | #[derive(Debug, Default, Clone, serde::Serialize)] 81 | pub struct Superscript { 82 | #[serde(skip_serializing_if = "Attrs::is_empty")] 83 | pub attrs: Attrs, 84 | pub children: Vec<Tag>, 85 | } 86 | 87 | #[derive(Debug, Default, Clone, serde::Serialize)] 88 | pub struct Subscript { 89 | #[serde(skip_serializing_if = "Attrs::is_empty")] 90 | pub attrs: Attrs, 91 | pub children: Vec<Tag>, 92 | } 93 | 94 | #[derive(Debug, Default, Clone, serde::Serialize)] 95 | pub struct Span { 96 | #[serde(skip_serializing_if = "Attrs::is_empty")] 97 | pub attrs: Attrs, 98 | pub children: Vec<Tag>, 99 | } 100 | 101 | #[derive(Debug, Default, Clone, serde::Serialize)] 102 | pub struct DoubleQuoted { 103 | #[serde(skip_serializing_if = "Attrs::is_empty")] 104 | pub attrs: Attrs, 105 | pub children: Vec<Tag>, 106 | } 107 | 108 | #[derive(Debug, Default, Clone, serde::Serialize)] 109 | pub struct Url { 110 | #[serde(skip_serializing_if = "Attrs::is_empty")] 111 | pub attrs: Attrs, 112 | pub children: Vec<Tag>, 113 | pub destination: String, 114 | } 115 | 116 | #[derive(Debug, Default, Clone, serde::Serialize)] 117 | pub struct SoftBreak { 118 | #[serde(skip_serializing_if = "Attrs::is_empty")] 119 | pub attrs: Attrs, 120 | } 121 | 122 | #[derive(Debug, Default, Clone, serde::Serialize)] 123 | pub struct EmDash { 124 | #[serde(skip_serializing_if = "Attrs::is_empty")] 125 | pub attrs: Attrs, 126 | } 127 | 128 | #[derive(Debug, Default, Clone, serde::Serialize)] 129 | pub struct EnDash { 130 | #[serde(skip_serializing_if = "Attrs::is_empty")] 131 | pub attrs: Attrs, 132 | } 133 | 134 | #[derive(Debug, Default, Clone, serde::Serialize)] 135 | pub struct Verbatim { 136 | #[serde(skip_serializing_if = "Attrs::is_empty")] 137 | pub attrs: Attrs, 138 | pub text: String, 139 | } 140 | 141 | #[derive(Debug, Default, Clone, serde::Serialize)] 142 | pub struct Str { 143 | #[serde(skip_serializing_if = "Attrs::is_empty")] 144 | pub attrs: Attrs, 145 | pub text: String, 146 | } 147 | 148 | #[derive(Debug, Default, Clone, serde::Serialize)] 149 | pub struct Emoji { 150 | #[serde(skip_serializing_if = "Attrs::is_empty")] 151 | pub attrs: Attrs, 152 | pub alias: String, 153 | } 154 | 155 | #[derive(Debug, Clone, serde::Serialize)] 156 | #[serde(tag = "tag", rename_all = "snake_case")] 157 | pub enum Tag { 158 | 
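// `Tag` is the single AST node type: each variant wraps one of the structs above. With serde's internally-tagged representation (`tag = "tag"`, snake_case names) a node should serialize as e.g. `{"tag": "para", "children": [...]}`, with an empty `attrs` map omitted by `skip_serializing_if`.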
Heading(Heading), 159 | Para(Para), 160 | Link(Link), 161 | Image(Image), 162 | CodeBlock(CodeBlock), 163 | Strong(Strong), 164 | Emph(Emph), 165 | Insert(Insert), 166 | Delete(Delete), 167 | Mark(Mark), 168 | Superscript(Superscript), 169 | Subscript(Subscript), 170 | Span(Span), 171 | DoubleQuoted(DoubleQuoted), 172 | Url(Url), 173 | SoftBreak(SoftBreak), 174 | EmDash(EmDash), 175 | EnDash(EnDash), 176 | Verbatim(Verbatim), 177 | Str(Str), 178 | Emoji(Emoji), 179 | } 180 | -------------------------------------------------------------------------------- /src/attribute.rs: -------------------------------------------------------------------------------- 1 | use std::ops::Range; 2 | 3 | use crate::{ 4 | annot::{Annot, Atom}, 5 | patterns::find_at, 6 | Match, 7 | }; 8 | 9 | #[derive(Default)] 10 | pub(crate) struct Tokenizer { 11 | subject: String, 12 | state: State, 13 | begin: usize, 14 | lastpos: usize, 15 | matches: Vec<Match>, 16 | } 17 | 18 | #[derive(Default)] 19 | enum State { 20 | Scanning, 21 | ScanningId, 22 | ScanningClass, 23 | ScanningKey, 24 | ScanningValue, 25 | ScanningBareValue, 26 | ScanningQuotedValue, 27 | ScanningEscaped, 28 | ScanningComment, 29 | Fail, 30 | Done, 31 | #[default] 32 | Start, 33 | } 34 | 35 | pub(crate) enum Status { 36 | Done, 37 | Fail, 38 | Continue, 39 | } 40 | 41 | impl Tokenizer { 42 | pub(crate) fn new(subject: String) -> Tokenizer { 43 | let mut res = Tokenizer::default(); 44 | res.subject = subject; 45 | res 46 | } 47 | 48 | fn add_match(&mut self, range: Range<usize>, annot: impl Into<Annot>) { 49 | self.matches.push(Match::new(range, annot)) 50 | } 51 | 52 | pub(crate) fn get_matches(&mut self) -> Vec<Match> { 53 | std::mem::take(&mut self.matches) 54 | } 55 | 56 | // Feed tokenizer a slice of text from the subject, between 57 | // startpos and endpos inclusive. Return status, position, 58 | // where status is either "done" (position should point to 59 | // final '}'), "fail" (position should point to first character 60 | // that could not be tokenized), or "continue" (position should 61 | // point to last character parsed). 62 | pub(crate) fn feed(&mut self, startpos: usize, endpos: usize) -> (Status, usize) { 63 | let mut pos = startpos; 64 | while pos <= endpos { 65 | self.state = self.step(pos); 66 | match self.state { 67 | State::Done => return (Status::Done, pos), 68 | State::Fail => { 69 | self.lastpos = pos + 1; 70 | return (Status::Fail, pos); 71 | } 72 | _ => { 73 | self.lastpos = pos + 1; 74 | pos = pos + 1 75 | } 76 | } 77 | } 78 | (Status::Continue, pos) 79 | } 80 | 81 | fn step(&mut self, pos: usize) -> State { 82 | match self.state { 83 | State::Start => { 84 | if find_at(&self.subject, "^{", pos).is_match { 85 | State::Scanning 86 | } else { 87 | State::Fail 88 | } 89 | } 90 | State::Fail => State::Fail, 91 | State::Done => State::Done, 92 | State::Scanning => match self.subject.as_bytes()[pos] { 93 | b' ' | b'\t' | b'\n' | b'\r' => State::Scanning, 94 | b'}' => State::Done, 95 | b'#' => { 96 | self.begin = pos; 97 | State::ScanningId 98 | } 99 | b'%' => { 100 | self.begin = pos; 101 | State::ScanningComment 102 | } 103 | b'.' 
=> { 104 | self.begin = pos; 105 | State::ScanningClass 106 | } 107 | _ => { 108 | if find_at(&self.subject, "^[%a%d_:-]", pos).is_match { 109 | self.begin = pos; 110 | State::ScanningKey 111 | } else { 112 | State::Fail 113 | } 114 | } 115 | }, 116 | State::ScanningComment => { 117 | if self.subject.as_bytes()[pos] == b'%' { 118 | State::Scanning 119 | } else { 120 | State::ScanningComment 121 | } 122 | } 123 | State::ScanningId => self.step_ident(pos, Atom::Id, State::ScanningId), 124 | State::ScanningClass => self.step_ident(pos, Atom::Class, State::ScanningClass), 125 | State::ScanningKey => { 126 | let c = self.subject.as_bytes()[pos]; 127 | if c == b'=' { 128 | self.add_match(self.begin..self.lastpos, Atom::Key); 129 | self.begin = !0; 130 | State::ScanningValue 131 | } else if find_at(&self.subject, "^[%a%d_:-]", pos).is_match { 132 | State::ScanningKey 133 | } else { 134 | State::Fail 135 | } 136 | } 137 | State::ScanningValue => { 138 | let c = self.subject.as_bytes()[pos]; 139 | if c == b'"' { 140 | self.begin = pos; 141 | State::ScanningQuotedValue 142 | } else if find_at(&self.subject, "^[%a%d_:-]", pos).is_match { 143 | self.begin = pos; 144 | State::ScanningBareValue 145 | } else { 146 | State::Fail 147 | } 148 | } 149 | State::ScanningBareValue => { 150 | let c = self.subject.as_bytes()[pos]; 151 | if find_at(&self.subject, "^[%a%d_:-]", pos).is_match { 152 | State::ScanningBareValue 153 | } else if c == b'}' { 154 | self.add_match(self.begin..self.lastpos, Atom::Value); 155 | self.begin = !0; 156 | State::Done 157 | } else if find_at(&self.subject, "^%s", pos).is_match { 158 | self.add_match(self.begin..self.lastpos, Atom::Value); 159 | self.begin = !0; 160 | State::Scanning 161 | } else { 162 | State::Fail 163 | } 164 | } 165 | State::ScanningEscaped => State::ScanningQuotedValue, 166 | State::ScanningQuotedValue => { 167 | let c = self.subject.as_bytes()[pos]; 168 | match c { 169 | b'"' => { 170 | self.add_match(self.begin + 1..self.lastpos, Atom::Value); 171 | self.begin = !0; 172 | State::Scanning 173 | } 174 | b'\\' => State::ScanningEscaped, 175 | b'{' | b'}' => State::Fail, 176 | b'\n' => { 177 | self.add_match(self.begin + 1..self.lastpos, Atom::Value); 178 | State::ScanningQuotedValue 179 | } 180 | _ => State::ScanningQuotedValue, 181 | } 182 | } 183 | } 184 | } 185 | 186 | fn step_ident(&mut self, pos: usize, atom: Atom, state: State) -> State { 187 | let c = self.subject.as_bytes()[pos]; 188 | match c { 189 | b'_' | b'-' | b':' => state, 190 | b'}' => { 191 | if self.lastpos > self.begin + 1 { 192 | self.add_match(self.begin + 1..self.lastpos, atom) 193 | } 194 | self.begin = !0; 195 | State::Done 196 | } 197 | _ => { 198 | if find_at(&self.subject, "^[^%s%p]", pos).is_match { 199 | state 200 | } else if find_at(&self.subject, "^%s", pos).is_match { 201 | if self.lastpos > self.begin { 202 | self.add_match(self.begin + 1..self.lastpos, atom) 203 | } 204 | self.begin = !0; 205 | State::Scanning 206 | } else { 207 | State::Fail 208 | } 209 | } 210 | } 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /src/block.rs: -------------------------------------------------------------------------------- 1 | use std::ops::Range; 2 | 3 | use crate::{ 4 | annot::{Annot, Atom, Comp}, 5 | format_to, inline, 6 | patterns::{find, find_at, PatMatch}, 7 | Match, ParseOpts, 8 | }; 9 | 10 | #[derive(Default)] 11 | pub struct Tokenizer { 12 | pub subject: String, 13 | indent: usize, 14 | startline: usize, 15 | starteol: usize, 16 | 
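// `starteol`/`endeol` hold the byte range of the current line's terminator (matched as "[\r]?[\n]"); `get_eol` refreshes them for every line that `parse` visits.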
endeol: usize, 17 | pub(crate) matches: Vec<Match>, 18 | pos: usize, 19 | last_matched_container: usize, 20 | opts: ParseOpts, 21 | finished_line: bool, 22 | 23 | pub(crate) debug: String, 24 | } 25 | 26 | trait Container { 27 | fn content(&self) -> &'static str; 28 | fn inline_parser(&mut self) -> Option<&mut inline::Tokenizer> { 29 | None 30 | } 31 | fn restore_indent(&self) -> Option<usize> { 32 | None 33 | } 34 | fn open(p: &mut Tokenizer, stack: &mut Vec<Box<dyn Container>>) -> bool 35 | where 36 | Self: Sized; 37 | fn cont(&mut self, p: &mut Tokenizer) -> bool; 38 | fn close(self: Box<Self>, p: &mut Tokenizer); 39 | } 40 | 41 | const CONTAINERS: &[fn(&mut Tokenizer, &mut Vec<Box<dyn Container>>) -> bool] = 42 | &[Para::open, CodeBlock::open, ReferenceDefinition::open]; 43 | 44 | struct Para { 45 | inline_parser: inline::Tokenizer, 46 | } 47 | 48 | impl Container for Para { 49 | fn content(&self) -> &'static str { 50 | "inline" 51 | } 52 | fn inline_parser(&mut self) -> Option<&mut inline::Tokenizer> { 53 | Some(&mut self.inline_parser) 54 | } 55 | fn open(p: &mut Tokenizer, stack: &mut Vec<Box<dyn Container>>) -> bool 56 | where 57 | Self: Sized, 58 | { 59 | p.add_container( 60 | stack, 61 | Para { inline_parser: inline::Tokenizer::new(p.subject.clone(), p.opts.clone()) }, 62 | ); 63 | p.add_match(p.pos..p.pos, Comp::Para.add()); 64 | true 65 | } 66 | 67 | fn cont(&mut self, p: &mut Tokenizer) -> bool { 68 | p.find("^%S").is_match 69 | } 70 | 71 | fn close(mut self: Box<Self>, p: &mut Tokenizer) { 72 | p.matches.extend(self.inline_parser.get_matches()); 73 | p.add_match(p.pos - 1..p.pos - 1, Comp::Para.sub()) 74 | } 75 | } 76 | 77 | struct CodeBlock { 78 | border: char, 79 | indent: usize, 80 | } 81 | 82 | impl Container for CodeBlock { 83 | fn content(&self) -> &'static str { 84 | "text" 85 | } 86 | fn restore_indent(&self) -> Option<usize> { 87 | Some(self.indent) 88 | } 89 | fn open(p: &mut Tokenizer, stack: &mut Vec<Box<dyn Container>>) -> bool 90 | where 91 | Self: Sized, 92 | { 93 | let mut border = '`'; 94 | let mut m = p.find("^```([ \t]*)([^%s`]*)[ \t]*[\r\n]"); 95 | if !m.is_match { 96 | border = '~'; 97 | m = p.find("^~~~([ \t]*)([^%s`]*)[ \t]*[\r\n]"); 98 | } 99 | if !m.is_match { 100 | return false; 101 | } 102 | p.add_container(stack, CodeBlock { border, indent: p.indent }); 103 | let lang = m.cap2; 104 | 105 | p.add_match(p.pos..p.pos + 3, Comp::CodeBlock.add()); 106 | if !lang.is_empty() { 107 | p.add_match(lang.start..lang.end, Atom::CodeLanguage) 108 | } 109 | 110 | p.pos = p.pos + 2; 111 | p.finished_line = true; 112 | true 113 | } 114 | 115 | fn cont(&mut self, p: &mut Tokenizer) -> bool { 116 | let m = 117 | if self.border == '`' { p.find("^(```)[ \t]*[\r\n]") } else { p.find("^(~~~)[ \t]*[\r\n]") }; 118 | if m.is_match { 119 | p.pos = m.end - 1; 120 | p.finished_line = true; 121 | false 122 | } else { 123 | true 124 | } 125 | } 126 | 127 | fn close(self: Box<Self>, p: &mut Tokenizer) { 128 | p.add_match(p.pos - 3..p.pos, Comp::CodeBlock.sub()); 129 | } 130 | } 131 | 132 | struct ReferenceDefinition { 133 | indent: usize, 134 | } 135 | 136 | impl Container for ReferenceDefinition { 137 | fn content(&self) -> &'static str { 138 | "" 139 | } 140 | 141 | fn open(p: &mut Tokenizer, stack: &mut Vec<Box<dyn Container>>) -> bool 142 | where 143 | Self: Sized, 144 | { 145 | let m = p.find("^[[]([^\r\n]*)%]:[ \t]*(%S*)"); 146 | if !m.is_match { 147 | return false; 148 | } 149 | p.add_container(stack, ReferenceDefinition { indent: p.indent }); 150 | p.add_match(m.start..m.start, Comp::ReferenceDefinition.add()); 151 | p.add_match(m.start..m.start + m.cap1.len() + 2, Atom::ReferenceKey); 152 | if 
!m.cap2.is_empty() { 153 | p.add_match(m.end - m.cap2.len()..m.end, Atom::ReferenceValue); 154 | } 155 | p.pos = m.end; 156 | true 157 | } 158 | 159 | fn cont(&mut self, p: &mut Tokenizer) -> bool { 160 | if self.indent >= p.indent { 161 | return false; 162 | } 163 | let m = p.find("^(%S+)"); 164 | if m.is_match { 165 | p.add_match(m.cap1.start..m.cap1.end, Atom::ReferenceValue); 166 | p.pos = m.end; 167 | } 168 | true 169 | } 170 | 171 | fn close(self: Box<Self>, p: &mut Tokenizer) { 172 | p.add_match(p.pos..p.pos, Comp::ReferenceDefinition.sub()) 173 | } 174 | 175 | fn inline_parser(&mut self) -> Option<&mut inline::Tokenizer> { 176 | None 177 | } 178 | } 179 | 180 | impl Tokenizer { 181 | pub fn new(mut subject: String, opts: ParseOpts) -> Tokenizer { 182 | if !find(&subject, "[\r\n]$").is_match { 183 | subject.push('\n'); 184 | } 185 | let mut res = Tokenizer::default(); 186 | res.subject = subject; 187 | res.opts = opts; 188 | res 189 | } 190 | 191 | fn find(&self, pat: &'static str) -> PatMatch { 192 | find_at(&self.subject, pat, self.pos) 193 | } 194 | 195 | fn add_match(&mut self, range: Range<usize>, annot: impl Into<Annot>) { 196 | self.matches.push(Match::new(range, annot)) 197 | } 198 | 199 | fn add_container( 200 | &mut self, 201 | stack: &mut Vec<Box<dyn Container>>, 202 | container: impl Container + 'static, 203 | ) { 204 | let last_matched = self.last_matched_container; 205 | while stack.len() > last_matched 206 | || (stack.len() > 0 && stack.last().unwrap().content() != "block") 207 | { 208 | stack.pop().unwrap().close(self) 209 | } 210 | stack.push(Box::new(container)) 211 | } 212 | 213 | fn skip_space(&mut self) { 214 | let m = find_at(&self.subject, "[^ \t]", self.pos); 215 | if m.is_match { 216 | self.indent = m.start - self.startline; 217 | self.pos = m.start; 218 | } 219 | } 220 | 221 | fn get_eol(&mut self) { 222 | let mut m = find_at(&self.subject, "[\r]?[\n]", self.pos); 223 | if !m.is_match { 224 | (m.start, m.end) = (self.subject.len(), self.subject.len()); 225 | } 226 | self.starteol = m.start; 227 | self.endeol = m.end; 228 | } 229 | 230 | pub fn parse(&mut self) { 231 | let mut containers: Vec<Box<dyn Container>> = Vec::new(); 232 | 233 | let subjectlen = self.subject.len(); 234 | while self.pos < subjectlen { 235 | self.indent = 0; 236 | self.startline = self.pos; 237 | self.finished_line = false; 238 | self.get_eol(); 239 | 240 | // check open containers for continuation 241 | self.last_matched_container = 0; 242 | for idx in 0..containers.len() { 243 | // skip any indentation 244 | self.skip_space(); 245 | if containers[idx].cont(self) { 246 | self.last_matched_container = idx + 1 247 | } else { 248 | break; 249 | } 250 | } 251 | 252 | // if we hit a close fence, we can move to next line 253 | if self.finished_line { 254 | while containers.len() > self.last_matched_container { 255 | containers.pop().unwrap().close(self) 256 | } 257 | } 258 | 259 | if !self.finished_line { 260 | // check for new containers 261 | self.skip_space(); 262 | let mut is_blank = self.pos == self.starteol; 263 | 264 | let mut new_starts = false; 265 | let last_match = containers[..self.last_matched_container].first(); 266 | let mut check_starts = !is_blank 267 | && !matches!(last_match, Some(c) if c.content() != "block") 268 | && !self.find("^%a+%s").is_match; // optimization 269 | 270 | while check_starts { 271 | check_starts = false; 272 | for i in 1..CONTAINERS.len() { 273 | let open = CONTAINERS[i]; 274 | if open(self, &mut containers) { 275 | self.last_matched_container = containers.len(); 276 | if self.finished_line { 
check_starts = false 278 | } else { 279 | self.skip_space(); 280 | new_starts = true; 281 | check_starts = containers.last().unwrap().content() != "text" 282 | } 283 | break; 284 | } 285 | } 286 | } 287 | 288 | if !self.finished_line { 289 | // handle remaining content 290 | self.skip_space(); 291 | 292 | is_blank = self.pos == self.starteol; 293 | 294 | let is_lazy = !is_blank 295 | && !new_starts 296 | && self.last_matched_container < containers.len() 297 | && containers.last().unwrap().content() == "inline"; 298 | 299 | if !is_lazy && self.last_matched_container < containers.len() { 300 | while containers.len() > self.last_matched_container { 301 | containers.pop().unwrap().close(self); 302 | } 303 | } 304 | 305 | // add para by default if there's text 306 | if !matches!(containers.last(), Some(c) if c.content() != "block") { 307 | if is_blank { 308 | if !new_starts { 309 | // need to track these for tight/loose lists 310 | self.add_match(self.pos..self.endeol, Atom::Blankline); 311 | } 312 | } else { 313 | CONTAINERS[0](self, &mut containers); 314 | } 315 | } 316 | 317 | if let Some(tip) = containers.last_mut() { 318 | if let Some(tip_indent) = tip.restore_indent() { 319 | let mut startpos = self.pos; 320 | if self.indent > tip_indent { 321 | // get back the leading spaces we gobbled 322 | startpos = startpos - (self.indent - tip_indent) 323 | } 324 | self.add_match(startpos..self.endeol, Atom::Str) 325 | } else if let Some(inline_parser) = tip.inline_parser() { 326 | if !is_blank { 327 | inline_parser.feed(self.pos, self.endeol) 328 | } 329 | } 330 | } 331 | } 332 | } 333 | 334 | self.pos = self.endeol; 335 | } 336 | self.finish(containers) 337 | } 338 | 339 | fn finish(&mut self, mut containers: Vec>) { 340 | // close unmatched containers 341 | while let Some(cont) = containers.pop() { 342 | cont.close(self) 343 | } 344 | if self.opts.debug_matches { 345 | for m in &self.matches { 346 | let ms = format!( 347 | "{} {}-{}", 348 | m.a, 349 | m.range.start + 1, 350 | if m.range.is_empty() { m.range.end + 1 } else { m.range.end } 351 | ); 352 | format_to!( 353 | self.debug, 354 | "{ms:<20} {:?}\n", 355 | self.subject.get(m.range.clone()).unwrap_or_default() 356 | ); 357 | } 358 | } 359 | } 360 | } 361 | -------------------------------------------------------------------------------- /src/emoji.rs: -------------------------------------------------------------------------------- 1 | pub(crate) fn find_emoji(s: &str) -> Option<&'static str> { 2 | let idx = EMOJI_LIST.binary_search_by_key(&s, |&(k, _)| k).ok()?; 3 | Some(EMOJI_LIST[idx].1) 4 | } 5 | 6 | #[test] 7 | fn emoji_list_is_sorted() { 8 | let mut sorted = EMOJI_LIST.to_vec(); 9 | sorted.sort_by_key(|&(k, _)| k); 10 | if EMOJI_LIST != sorted { 11 | let mut buf = String::new(); 12 | for (k, v) in sorted { 13 | crate::format_to!(buf, r#"("{k}", "{v}"),"#); 14 | } 15 | std::fs::write("./emoji.sorted", &buf).unwrap(); 16 | panic!("unsorted emoji list, sorted version in: ./emoji.sorted") 17 | } 18 | } 19 | 20 | static EMOJI_LIST: &[(&str, &str)] = &[ 21 | ("+1", "👍"), 22 | ("-1", "👎"), 23 | ("100", "💯"), 24 | ("1234", "🔢"), 25 | ("1st_place_medal", "🥇"), 26 | ("2nd_place_medal", "🥈"), 27 | ("3rd_place_medal", "🥉"), 28 | ("8ball", "🎱"), 29 | ("a", "🅰️"), 30 | ("ab", "🆎"), 31 | ("abacus", "🧮"), 32 | ("abc", "🔤"), 33 | ("abcd", "🔡"), 34 | ("accept", "🉑"), 35 | ("accordion", "🪗"), 36 | ("adhesive_bandage", "🩹"), 37 | ("adult", "🧑"), 38 | ("aerial_tramway", "🚡"), 39 | ("afghanistan", "🇦🇫"), 40 | ("airplane", "✈️"), 41 | ("aland_islands", "🇦🇽"), 
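// NOTE: aliases must stay sorted, since `find_emoji` above does a `binary_search_by_key` over this table; the `emoji_list_is_sorted` test writes a sorted copy to ./emoji.sorted and fails if the order ever drifts.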
42 | ("alarm_clock", "⏰"), 43 | ("albania", "🇦🇱"), 44 | ("alembic", "⚗️"), 45 | ("algeria", "🇩🇿"), 46 | ("alien", "👽"), 47 | ("ambulance", "🚑"), 48 | ("american_samoa", "🇦🇸"), 49 | ("amphora", "🏺"), 50 | ("anatomical_heart", "🫀"), 51 | ("anchor", "⚓"), 52 | ("andorra", "🇦🇩"), 53 | ("angel", "👼"), 54 | ("anger", "💢"), 55 | ("angola", "🇦🇴"), 56 | ("angry", "😠"), 57 | ("anguilla", "🇦🇮"), 58 | ("anguished", "😧"), 59 | ("ant", "🐜"), 60 | ("antarctica", "🇦🇶"), 61 | ("antigua_barbuda", "🇦🇬"), 62 | ("apple", "🍎"), 63 | ("aquarius", "♒"), 64 | ("argentina", "🇦🇷"), 65 | ("aries", "♈"), 66 | ("armenia", "🇦🇲"), 67 | ("arrow_backward", "◀️"), 68 | ("arrow_double_down", "⏬"), 69 | ("arrow_double_up", "⏫"), 70 | ("arrow_down", "⬇️"), 71 | ("arrow_down_small", "🔽"), 72 | ("arrow_forward", "▶️"), 73 | ("arrow_heading_down", "⤵️"), 74 | ("arrow_heading_up", "⤴️"), 75 | ("arrow_left", "⬅️"), 76 | ("arrow_lower_left", "↙️"), 77 | ("arrow_lower_right", "↘️"), 78 | ("arrow_right", "➡️"), 79 | ("arrow_right_hook", "↪️"), 80 | ("arrow_up", "⬆️"), 81 | ("arrow_up_down", "↕️"), 82 | ("arrow_up_small", "🔼"), 83 | ("arrow_upper_left", "↖️"), 84 | ("arrow_upper_right", "↗️"), 85 | ("arrows_clockwise", "🔃"), 86 | ("arrows_counterclockwise", "🔄"), 87 | ("art", "🎨"), 88 | ("articulated_lorry", "🚛"), 89 | ("artificial_satellite", "🛰️"), 90 | ("artist", "🧑‍🎨"), 91 | ("aruba", "🇦🇼"), 92 | ("ascension_island", "🇦🇨"), 93 | ("asterisk", "*️⃣"), 94 | ("astonished", "😲"), 95 | ("astronaut", "🧑‍🚀"), 96 | ("athletic_shoe", "👟"), 97 | ("atm", "🏧"), 98 | ("atom_symbol", "⚛️"), 99 | ("australia", "🇦🇺"), 100 | ("austria", "🇦🇹"), 101 | ("auto_rickshaw", "🛺"), 102 | ("avocado", "🥑"), 103 | ("axe", "🪓"), 104 | ("azerbaijan", "🇦🇿"), 105 | ("b", "🅱️"), 106 | ("baby", "👶"), 107 | ("baby_bottle", "🍼"), 108 | ("baby_chick", "🐤"), 109 | ("baby_symbol", "🚼"), 110 | ("back", "🔙"), 111 | ("bacon", "🥓"), 112 | ("badger", "🦡"), 113 | ("badminton", "🏸"), 114 | ("bagel", "🥯"), 115 | ("baggage_claim", "🛄"), 116 | ("baguette_bread", "🥖"), 117 | ("bahamas", "🇧🇸"), 118 | ("bahrain", "🇧🇭"), 119 | ("balance_scale", "⚖️"), 120 | ("bald_man", "👨‍🦲"), 121 | ("bald_woman", "👩‍🦲"), 122 | ("ballet_shoes", "🩰"), 123 | ("balloon", "🎈"), 124 | ("ballot_box", "🗳️"), 125 | ("ballot_box_with_check", "☑️"), 126 | ("bamboo", "🎍"), 127 | ("banana", "🍌"), 128 | ("bangbang", "‼️"), 129 | ("bangladesh", "🇧🇩"), 130 | ("banjo", "🪕"), 131 | ("bank", "🏦"), 132 | ("bar_chart", "📊"), 133 | ("barbados", "🇧🇧"), 134 | ("barber", "💈"), 135 | ("baseball", "⚾"), 136 | ("basket", "🧺"), 137 | ("basketball", "🏀"), 138 | ("basketball_man", "⛹️‍♂️"), 139 | ("basketball_woman", "⛹️‍♀️"), 140 | ("bat", "🦇"), 141 | ("bath", "🛀"), 142 | ("bathtub", "🛁"), 143 | ("battery", "🔋"), 144 | ("beach_umbrella", "🏖️"), 145 | ("bear", "🐻"), 146 | ("bearded_person", "🧔"), 147 | ("beaver", "🦫"), 148 | ("bed", "🛏️"), 149 | ("bee", "🐝"), 150 | ("beer", "🍺"), 151 | ("beers", "🍻"), 152 | ("beetle", "🪲"), 153 | ("beginner", "🔰"), 154 | ("belarus", "🇧🇾"), 155 | ("belgium", "🇧🇪"), 156 | ("belize", "🇧🇿"), 157 | ("bell", "🔔"), 158 | ("bell_pepper", "🫑"), 159 | ("bellhop_bell", "🛎️"), 160 | ("benin", "🇧🇯"), 161 | ("bento", "🍱"), 162 | ("bermuda", "🇧🇲"), 163 | ("beverage_box", "🧃"), 164 | ("bhutan", "🇧🇹"), 165 | ("bicyclist", "🚴"), 166 | ("bike", "🚲"), 167 | ("biking_man", "🚴‍♂️"), 168 | ("biking_woman", "🚴‍♀️"), 169 | ("bikini", "👙"), 170 | ("billed_cap", "🧢"), 171 | ("biohazard", "☣️"), 172 | ("bird", "🐦"), 173 | ("birthday", "🎂"), 174 | ("bison", "🦬"), 175 | ("black_cat", "🐈‍⬛"), 176 | ("black_circle", "⚫"), 177 | 
("black_flag", "🏴"), 178 | ("black_heart", "🖤"), 179 | ("black_joker", "🃏"), 180 | ("black_large_square", "⬛"), 181 | ("black_medium_small_square", "◾"), 182 | ("black_medium_square", "◼️"), 183 | ("black_nib", "✒️"), 184 | ("black_small_square", "▪️"), 185 | ("black_square_button", "🔲"), 186 | ("blond_haired_man", "👱‍♂️"), 187 | ("blond_haired_person", "👱"), 188 | ("blond_haired_woman", "👱‍♀️"), 189 | ("blonde_woman", "👱‍♀️"), 190 | ("blossom", "🌼"), 191 | ("blowfish", "🐡"), 192 | ("blue_book", "📘"), 193 | ("blue_car", "🚙"), 194 | ("blue_heart", "💙"), 195 | ("blue_square", "🟦"), 196 | ("blueberries", "🫐"), 197 | ("blush", "😊"), 198 | ("boar", "🐗"), 199 | ("boat", "⛵"), 200 | ("bolivia", "🇧🇴"), 201 | ("bomb", "💣"), 202 | ("bone", "🦴"), 203 | ("book", "📖"), 204 | ("bookmark", "🔖"), 205 | ("bookmark_tabs", "📑"), 206 | ("books", "📚"), 207 | ("boom", "💥"), 208 | ("boomerang", "🪃"), 209 | ("boot", "👢"), 210 | ("bosnia_herzegovina", "🇧🇦"), 211 | ("botswana", "🇧🇼"), 212 | ("bouncing_ball_man", "⛹️‍♂️"), 213 | ("bouncing_ball_person", "⛹️"), 214 | ("bouncing_ball_woman", "⛹️‍♀️"), 215 | ("bouquet", "💐"), 216 | ("bouvet_island", "🇧🇻"), 217 | ("bow", "🙇"), 218 | ("bow_and_arrow", "🏹"), 219 | ("bowing_man", "🙇‍♂️"), 220 | ("bowing_woman", "🙇‍♀️"), 221 | ("bowl_with_spoon", "🥣"), 222 | ("bowling", "🎳"), 223 | ("boxing_glove", "🥊"), 224 | ("boy", "👦"), 225 | ("brain", "🧠"), 226 | ("brazil", "🇧🇷"), 227 | ("bread", "🍞"), 228 | ("breast_feeding", "🤱"), 229 | ("bricks", "🧱"), 230 | ("bride_with_veil", "👰‍♀️"), 231 | ("bridge_at_night", "🌉"), 232 | ("briefcase", "💼"), 233 | ("british_indian_ocean_territory", "🇮🇴"), 234 | ("british_virgin_islands", "🇻🇬"), 235 | ("broccoli", "🥦"), 236 | ("broken_heart", "💔"), 237 | ("broom", "🧹"), 238 | ("brown_circle", "🟤"), 239 | ("brown_heart", "🤎"), 240 | ("brown_square", "🟫"), 241 | ("brunei", "🇧🇳"), 242 | ("bubble_tea", "🧋"), 243 | ("bucket", "🪣"), 244 | ("bug", "🐛"), 245 | ("building_construction", "🏗️"), 246 | ("bulb", "💡"), 247 | ("bulgaria", "🇧🇬"), 248 | ("bullettrain_front", "🚅"), 249 | ("bullettrain_side", "🚄"), 250 | ("burkina_faso", "🇧🇫"), 251 | ("burrito", "🌯"), 252 | ("burundi", "🇧🇮"), 253 | ("bus", "🚌"), 254 | ("business_suit_levitating", "🕴️"), 255 | ("busstop", "🚏"), 256 | ("bust_in_silhouette", "👤"), 257 | ("busts_in_silhouette", "👥"), 258 | ("butter", "🧈"), 259 | ("butterfly", "🦋"), 260 | ("cactus", "🌵"), 261 | ("cake", "🍰"), 262 | ("calendar", "📆"), 263 | ("call_me_hand", "🤙"), 264 | ("calling", "📲"), 265 | ("cambodia", "🇰🇭"), 266 | ("camel", "🐫"), 267 | ("camera", "📷"), 268 | ("camera_flash", "📸"), 269 | ("cameroon", "🇨🇲"), 270 | ("camping", "🏕️"), 271 | ("canada", "🇨🇦"), 272 | ("canary_islands", "🇮🇨"), 273 | ("cancer", "♋"), 274 | ("candle", "🕯️"), 275 | ("candy", "🍬"), 276 | ("canned_food", "🥫"), 277 | ("canoe", "🛶"), 278 | ("cape_verde", "🇨🇻"), 279 | ("capital_abcd", "🔠"), 280 | ("capricorn", "♑"), 281 | ("car", "🚗"), 282 | ("card_file_box", "🗃️"), 283 | ("card_index", "📇"), 284 | ("card_index_dividers", "🗂️"), 285 | ("caribbean_netherlands", "🇧🇶"), 286 | ("carousel_horse", "🎠"), 287 | ("carpentry_saw", "🪚"), 288 | ("carrot", "🥕"), 289 | ("cartwheeling", "🤸"), 290 | ("cat", "🐱"), 291 | ("cat2", "🐈"), 292 | ("cayman_islands", "🇰🇾"), 293 | ("cd", "💿"), 294 | ("central_african_republic", "🇨🇫"), 295 | ("ceuta_melilla", "🇪🇦"), 296 | ("chad", "🇹🇩"), 297 | ("chains", "⛓️"), 298 | ("chair", "🪑"), 299 | ("champagne", "🍾"), 300 | ("chart", "💹"), 301 | ("chart_with_downwards_trend", "📉"), 302 | ("chart_with_upwards_trend", "📈"), 303 | ("checkered_flag", "🏁"), 
304 | ("cheese", "🧀"), 305 | ("cherries", "🍒"), 306 | ("cherry_blossom", "🌸"), 307 | ("chess_pawn", "♟️"), 308 | ("chestnut", "🌰"), 309 | ("chicken", "🐔"), 310 | ("child", "🧒"), 311 | ("children_crossing", "🚸"), 312 | ("chile", "🇨🇱"), 313 | ("chipmunk", "🐿️"), 314 | ("chocolate_bar", "🍫"), 315 | ("chopsticks", "🥢"), 316 | ("christmas_island", "🇨🇽"), 317 | ("christmas_tree", "🎄"), 318 | ("church", "⛪"), 319 | ("cinema", "🎦"), 320 | ("circus_tent", "🎪"), 321 | ("city_sunrise", "🌇"), 322 | ("city_sunset", "🌆"), 323 | ("cityscape", "🏙️"), 324 | ("cl", "🆑"), 325 | ("clamp", "🗜️"), 326 | ("clap", "👏"), 327 | ("clapper", "🎬"), 328 | ("classical_building", "🏛️"), 329 | ("climbing", "🧗"), 330 | ("climbing_man", "🧗‍♂️"), 331 | ("climbing_woman", "🧗‍♀️"), 332 | ("clinking_glasses", "🥂"), 333 | ("clipboard", "📋"), 334 | ("clipperton_island", "🇨🇵"), 335 | ("clock1", "🕐"), 336 | ("clock10", "🕙"), 337 | ("clock1030", "🕥"), 338 | ("clock11", "🕚"), 339 | ("clock1130", "🕦"), 340 | ("clock12", "🕛"), 341 | ("clock1230", "🕧"), 342 | ("clock130", "🕜"), 343 | ("clock2", "🕑"), 344 | ("clock230", "🕝"), 345 | ("clock3", "🕒"), 346 | ("clock330", "🕞"), 347 | ("clock4", "🕓"), 348 | ("clock430", "🕟"), 349 | ("clock5", "🕔"), 350 | ("clock530", "🕠"), 351 | ("clock6", "🕕"), 352 | ("clock630", "🕡"), 353 | ("clock7", "🕖"), 354 | ("clock730", "🕢"), 355 | ("clock8", "🕗"), 356 | ("clock830", "🕣"), 357 | ("clock9", "🕘"), 358 | ("clock930", "🕤"), 359 | ("closed_book", "📕"), 360 | ("closed_lock_with_key", "🔐"), 361 | ("closed_umbrella", "🌂"), 362 | ("cloud", "☁️"), 363 | ("cloud_with_lightning", "🌩️"), 364 | ("cloud_with_lightning_and_rain", "⛈️"), 365 | ("cloud_with_rain", "🌧️"), 366 | ("cloud_with_snow", "🌨️"), 367 | ("clown_face", "🤡"), 368 | ("clubs", "♣️"), 369 | ("cn", "🇨🇳"), 370 | ("coat", "🧥"), 371 | ("cockroach", "🪳"), 372 | ("cocktail", "🍸"), 373 | ("coconut", "🥥"), 374 | ("cocos_islands", "🇨🇨"), 375 | ("coffee", "☕"), 376 | ("coffin", "⚰️"), 377 | ("coin", "🪙"), 378 | ("cold_face", "🥶"), 379 | ("cold_sweat", "😰"), 380 | ("collision", "💥"), 381 | ("colombia", "🇨🇴"), 382 | ("comet", "☄️"), 383 | ("comoros", "🇰🇲"), 384 | ("compass", "🧭"), 385 | ("computer", "💻"), 386 | ("computer_mouse", "🖱️"), 387 | ("confetti_ball", "🎊"), 388 | ("confounded", "😖"), 389 | ("confused", "😕"), 390 | ("congo_brazzaville", "🇨🇬"), 391 | ("congo_kinshasa", "🇨🇩"), 392 | ("congratulations", "㊗️"), 393 | ("construction", "🚧"), 394 | ("construction_worker", "👷"), 395 | ("construction_worker_man", "👷‍♂️"), 396 | ("construction_worker_woman", "👷‍♀️"), 397 | ("control_knobs", "🎛️"), 398 | ("convenience_store", "🏪"), 399 | ("cook", "🧑‍🍳"), 400 | ("cook_islands", "🇨🇰"), 401 | ("cookie", "🍪"), 402 | ("cool", "🆒"), 403 | ("cop", "👮"), 404 | ("copyright", "©️"), 405 | ("corn", "🌽"), 406 | ("costa_rica", "🇨🇷"), 407 | ("cote_divoire", "🇨🇮"), 408 | ("couch_and_lamp", "🛋️"), 409 | ("couple", "👫"), 410 | ("couple_with_heart", "💑"), 411 | ("couple_with_heart_man_man", "👨‍❤️‍👨"), 412 | ("couple_with_heart_woman_man", "👩‍❤️‍👨"), 413 | ("couple_with_heart_woman_woman", "👩‍❤️‍👩"), 414 | ("couplekiss", "💏"), 415 | ("couplekiss_man_man", "👨‍❤️‍💋‍👨"), 416 | ("couplekiss_man_woman", "👩‍❤️‍💋‍👨"), 417 | ("couplekiss_woman_woman", "👩‍❤️‍💋‍👩"), 418 | ("cow", "🐮"), 419 | ("cow2", "🐄"), 420 | ("cowboy_hat_face", "🤠"), 421 | ("crab", "🦀"), 422 | ("crayon", "🖍️"), 423 | ("credit_card", "💳"), 424 | ("crescent_moon", "🌙"), 425 | ("cricket", "🦗"), 426 | ("cricket_game", "🏏"), 427 | ("croatia", "🇭🇷"), 428 | ("crocodile", "🐊"), 429 | ("croissant", "🥐"), 430 | ("crossed_fingers", 
"🤞"), 431 | ("crossed_flags", "🎌"), 432 | ("crossed_swords", "⚔️"), 433 | ("crown", "👑"), 434 | ("cry", "😢"), 435 | ("crying_cat_face", "😿"), 436 | ("crystal_ball", "🔮"), 437 | ("cuba", "🇨🇺"), 438 | ("cucumber", "🥒"), 439 | ("cup_with_straw", "🥤"), 440 | ("cupcake", "🧁"), 441 | ("cupid", "💘"), 442 | ("curacao", "🇨🇼"), 443 | ("curling_stone", "🥌"), 444 | ("curly_haired_man", "👨‍🦱"), 445 | ("curly_haired_woman", "👩‍🦱"), 446 | ("curly_loop", "➰"), 447 | ("currency_exchange", "💱"), 448 | ("curry", "🍛"), 449 | ("cursing_face", "🤬"), 450 | ("custard", "🍮"), 451 | ("customs", "🛃"), 452 | ("cut_of_meat", "🥩"), 453 | ("cyclone", "🌀"), 454 | ("cyprus", "🇨🇾"), 455 | ("czech_republic", "🇨🇿"), 456 | ("dagger", "🗡️"), 457 | ("dancer", "💃"), 458 | ("dancers", "👯"), 459 | ("dancing_men", "👯‍♂️"), 460 | ("dancing_women", "👯‍♀️"), 461 | ("dango", "🍡"), 462 | ("dark_sunglasses", "🕶️"), 463 | ("dart", "🎯"), 464 | ("dash", "💨"), 465 | ("date", "📅"), 466 | ("de", "🇩🇪"), 467 | ("deaf_man", "🧏‍♂️"), 468 | ("deaf_person", "🧏"), 469 | ("deaf_woman", "🧏‍♀️"), 470 | ("deciduous_tree", "🌳"), 471 | ("deer", "🦌"), 472 | ("denmark", "🇩🇰"), 473 | ("department_store", "🏬"), 474 | ("derelict_house", "🏚️"), 475 | ("desert", "🏜️"), 476 | ("desert_island", "🏝️"), 477 | ("desktop_computer", "🖥️"), 478 | ("detective", "🕵️"), 479 | ("diamond_shape_with_a_dot_inside", "💠"), 480 | ("diamonds", "♦️"), 481 | ("diego_garcia", "🇩🇬"), 482 | ("disappointed", "😞"), 483 | ("disappointed_relieved", "😥"), 484 | ("disguised_face", "🥸"), 485 | ("diving_mask", "🤿"), 486 | ("diya_lamp", "🪔"), 487 | ("dizzy", "💫"), 488 | ("dizzy_face", "😵"), 489 | ("djibouti", "🇩🇯"), 490 | ("dna", "🧬"), 491 | ("do_not_litter", "🚯"), 492 | ("dodo", "🦤"), 493 | ("dog", "🐶"), 494 | ("dog2", "🐕"), 495 | ("dollar", "💵"), 496 | ("dolls", "🎎"), 497 | ("dolphin", "🐬"), 498 | ("dominica", "🇩🇲"), 499 | ("dominican_republic", "🇩🇴"), 500 | ("door", "🚪"), 501 | ("doughnut", "🍩"), 502 | ("dove", "🕊️"), 503 | ("dragon", "🐉"), 504 | ("dragon_face", "🐲"), 505 | ("dress", "👗"), 506 | ("dromedary_camel", "🐪"), 507 | ("drooling_face", "🤤"), 508 | ("drop_of_blood", "🩸"), 509 | ("droplet", "💧"), 510 | ("drum", "🥁"), 511 | ("duck", "🦆"), 512 | ("dumpling", "🥟"), 513 | ("dvd", "📀"), 514 | ("e-mail", "📧"), 515 | ("eagle", "🦅"), 516 | ("ear", "👂"), 517 | ("ear_of_rice", "🌾"), 518 | ("ear_with_hearing_aid", "🦻"), 519 | ("earth_africa", "🌍"), 520 | ("earth_americas", "🌎"), 521 | ("earth_asia", "🌏"), 522 | ("ecuador", "🇪🇨"), 523 | ("egg", "🥚"), 524 | ("eggplant", "🍆"), 525 | ("egypt", "🇪🇬"), 526 | ("eight", "8️⃣"), 527 | ("eight_pointed_black_star", "✴️"), 528 | ("eight_spoked_asterisk", "✳️"), 529 | ("eject_button", "⏏️"), 530 | ("el_salvador", "🇸🇻"), 531 | ("electric_plug", "🔌"), 532 | ("elephant", "🐘"), 533 | ("elevator", "🛗"), 534 | ("elf", "🧝"), 535 | ("elf_man", "🧝‍♂️"), 536 | ("elf_woman", "🧝‍♀️"), 537 | ("email", "📧"), 538 | ("end", "🔚"), 539 | ("england", "🏴󠁧󠁢󠁥󠁮󠁧󠁿"), 540 | ("envelope", "✉️"), 541 | ("envelope_with_arrow", "📩"), 542 | ("equatorial_guinea", "🇬🇶"), 543 | ("eritrea", "🇪🇷"), 544 | ("es", "🇪🇸"), 545 | ("estonia", "🇪🇪"), 546 | ("ethiopia", "🇪🇹"), 547 | ("eu", "🇪🇺"), 548 | ("euro", "💶"), 549 | ("european_castle", "🏰"), 550 | ("european_post_office", "🏤"), 551 | ("european_union", "🇪🇺"), 552 | ("evergreen_tree", "🌲"), 553 | ("exclamation", "❗"), 554 | ("exploding_head", "🤯"), 555 | ("expressionless", "😑"), 556 | ("eye", "👁️"), 557 | ("eye_speech_bubble", "👁️‍🗨️"), 558 | ("eyeglasses", "👓"), 559 | ("eyes", "👀"), 560 | ("face_exhaling", "😮‍💨"), 561 | ("face_in_clouds", 
"😶‍🌫️"), 562 | ("face_with_head_bandage", "🤕"), 563 | ("face_with_spiral_eyes", "😵‍💫"), 564 | ("face_with_thermometer", "🤒"), 565 | ("facepalm", "🤦"), 566 | ("facepunch", "👊"), 567 | ("factory", "🏭"), 568 | ("factory_worker", "🧑‍🏭"), 569 | ("fairy", "🧚"), 570 | ("fairy_man", "🧚‍♂️"), 571 | ("fairy_woman", "🧚‍♀️"), 572 | ("falafel", "🧆"), 573 | ("falkland_islands", "🇫🇰"), 574 | ("fallen_leaf", "🍂"), 575 | ("family", "👪"), 576 | ("family_man_boy", "👨‍👦"), 577 | ("family_man_boy_boy", "👨‍👦‍👦"), 578 | ("family_man_girl", "👨‍👧"), 579 | ("family_man_girl_boy", "👨‍👧‍👦"), 580 | ("family_man_girl_girl", "👨‍👧‍👧"), 581 | ("family_man_man_boy", "👨‍👨‍👦"), 582 | ("family_man_man_boy_boy", "👨‍👨‍👦‍👦"), 583 | ("family_man_man_girl", "👨‍👨‍👧"), 584 | ("family_man_man_girl_boy", "👨‍👨‍👧‍👦"), 585 | ("family_man_man_girl_girl", "👨‍👨‍👧‍👧"), 586 | ("family_man_woman_boy", "👨‍👩‍👦"), 587 | ("family_man_woman_boy_boy", "👨‍👩‍👦‍👦"), 588 | ("family_man_woman_girl", "👨‍👩‍👧"), 589 | ("family_man_woman_girl_boy", "👨‍👩‍👧‍👦"), 590 | ("family_man_woman_girl_girl", "👨‍👩‍👧‍👧"), 591 | ("family_woman_boy", "👩‍👦"), 592 | ("family_woman_boy_boy", "👩‍👦‍👦"), 593 | ("family_woman_girl", "👩‍👧"), 594 | ("family_woman_girl_boy", "👩‍👧‍👦"), 595 | ("family_woman_girl_girl", "👩‍👧‍👧"), 596 | ("family_woman_woman_boy", "👩‍👩‍👦"), 597 | ("family_woman_woman_boy_boy", "👩‍👩‍👦‍👦"), 598 | ("family_woman_woman_girl", "👩‍👩‍👧"), 599 | ("family_woman_woman_girl_boy", "👩‍👩‍👧‍👦"), 600 | ("family_woman_woman_girl_girl", "👩‍👩‍👧‍👧"), 601 | ("farmer", "🧑‍🌾"), 602 | ("faroe_islands", "🇫🇴"), 603 | ("fast_forward", "⏩"), 604 | ("fax", "📠"), 605 | ("fearful", "😨"), 606 | ("feather", "🪶"), 607 | ("feet", "🐾"), 608 | ("female_detective", "🕵️‍♀️"), 609 | ("female_sign", "♀️"), 610 | ("ferris_wheel", "🎡"), 611 | ("ferry", "⛴️"), 612 | ("field_hockey", "🏑"), 613 | ("fiji", "🇫🇯"), 614 | ("file_cabinet", "🗄️"), 615 | ("file_folder", "📁"), 616 | ("film_projector", "📽️"), 617 | ("film_strip", "🎞️"), 618 | ("finland", "🇫🇮"), 619 | ("fire", "🔥"), 620 | ("fire_engine", "🚒"), 621 | ("fire_extinguisher", "🧯"), 622 | ("firecracker", "🧨"), 623 | ("firefighter", "🧑‍🚒"), 624 | ("fireworks", "🎆"), 625 | ("first_quarter_moon", "🌓"), 626 | ("first_quarter_moon_with_face", "🌛"), 627 | ("fish", "🐟"), 628 | ("fish_cake", "🍥"), 629 | ("fishing_pole_and_fish", "🎣"), 630 | ("fist", "✊"), 631 | ("fist_left", "🤛"), 632 | ("fist_oncoming", "👊"), 633 | ("fist_raised", "✊"), 634 | ("fist_right", "🤜"), 635 | ("five", "5️⃣"), 636 | ("flags", "🎏"), 637 | ("flamingo", "🦩"), 638 | ("flashlight", "🔦"), 639 | ("flat_shoe", "🥿"), 640 | ("flatbread", "🫓"), 641 | ("fleur_de_lis", "⚜️"), 642 | ("flight_arrival", "🛬"), 643 | ("flight_departure", "🛫"), 644 | ("flipper", "🐬"), 645 | ("floppy_disk", "💾"), 646 | ("flower_playing_cards", "🎴"), 647 | ("flushed", "😳"), 648 | ("fly", "🪰"), 649 | ("flying_disc", "🥏"), 650 | ("flying_saucer", "🛸"), 651 | ("fog", "🌫️"), 652 | ("foggy", "🌁"), 653 | ("fondue", "🫕"), 654 | ("foot", "🦶"), 655 | ("football", "🏈"), 656 | ("footprints", "👣"), 657 | ("fork_and_knife", "🍴"), 658 | ("fortune_cookie", "🥠"), 659 | ("fountain", "⛲"), 660 | ("fountain_pen", "🖋️"), 661 | ("four", "4️⃣"), 662 | ("four_leaf_clover", "🍀"), 663 | ("fox_face", "🦊"), 664 | ("fr", "🇫🇷"), 665 | ("framed_picture", "🖼️"), 666 | ("free", "🆓"), 667 | ("french_guiana", "🇬🇫"), 668 | ("french_polynesia", "🇵🇫"), 669 | ("french_southern_territories", "🇹🇫"), 670 | ("fried_egg", "🍳"), 671 | ("fried_shrimp", "🍤"), 672 | ("fries", "🍟"), 673 | ("frog", "🐸"), 674 | ("frowning", "😦"), 675 | ("frowning_face", "☹️"), 676 | 
("frowning_man", "🙍‍♂️"), 677 | ("frowning_person", "🙍"), 678 | ("frowning_woman", "🙍‍♀️"), 679 | ("fu", "🖕"), 680 | ("fuelpump", "⛽"), 681 | ("full_moon", "🌕"), 682 | ("full_moon_with_face", "🌝"), 683 | ("funeral_urn", "⚱️"), 684 | ("gabon", "🇬🇦"), 685 | ("gambia", "🇬🇲"), 686 | ("game_die", "🎲"), 687 | ("garlic", "🧄"), 688 | ("gb", "🇬🇧"), 689 | ("gear", "⚙️"), 690 | ("gem", "💎"), 691 | ("gemini", "♊"), 692 | ("genie", "🧞"), 693 | ("genie_man", "🧞‍♂️"), 694 | ("genie_woman", "🧞‍♀️"), 695 | ("georgia", "🇬🇪"), 696 | ("ghana", "🇬🇭"), 697 | ("ghost", "👻"), 698 | ("gibraltar", "🇬🇮"), 699 | ("gift", "🎁"), 700 | ("gift_heart", "💝"), 701 | ("giraffe", "🦒"), 702 | ("girl", "👧"), 703 | ("globe_with_meridians", "🌐"), 704 | ("gloves", "🧤"), 705 | ("goal_net", "🥅"), 706 | ("goat", "🐐"), 707 | ("goggles", "🥽"), 708 | ("golf", "⛳"), 709 | ("golfing", "🏌️"), 710 | ("golfing_man", "🏌️‍♂️"), 711 | ("golfing_woman", "🏌️‍♀️"), 712 | ("gorilla", "🦍"), 713 | ("grapes", "🍇"), 714 | ("greece", "🇬🇷"), 715 | ("green_apple", "🍏"), 716 | ("green_book", "📗"), 717 | ("green_circle", "🟢"), 718 | ("green_heart", "💚"), 719 | ("green_salad", "🥗"), 720 | ("green_square", "🟩"), 721 | ("greenland", "🇬🇱"), 722 | ("grenada", "🇬🇩"), 723 | ("grey_exclamation", "❕"), 724 | ("grey_question", "❔"), 725 | ("grimacing", "😬"), 726 | ("grin", "😁"), 727 | ("grinning", "😀"), 728 | ("guadeloupe", "🇬🇵"), 729 | ("guam", "🇬🇺"), 730 | ("guard", "💂"), 731 | ("guardsman", "💂‍♂️"), 732 | ("guardswoman", "💂‍♀️"), 733 | ("guatemala", "🇬🇹"), 734 | ("guernsey", "🇬🇬"), 735 | ("guide_dog", "🦮"), 736 | ("guinea", "🇬🇳"), 737 | ("guinea_bissau", "🇬🇼"), 738 | ("guitar", "🎸"), 739 | ("gun", "🔫"), 740 | ("guyana", "🇬🇾"), 741 | ("haircut", "💇"), 742 | ("haircut_man", "💇‍♂️"), 743 | ("haircut_woman", "💇‍♀️"), 744 | ("haiti", "🇭🇹"), 745 | ("hamburger", "🍔"), 746 | ("hammer", "🔨"), 747 | ("hammer_and_pick", "⚒️"), 748 | ("hammer_and_wrench", "🛠️"), 749 | ("hamster", "🐹"), 750 | ("hand", "✋"), 751 | ("hand_over_mouth", "🤭"), 752 | ("handbag", "👜"), 753 | ("handball_person", "🤾"), 754 | ("handshake", "🤝"), 755 | ("hankey", "💩"), 756 | ("hash", "#️⃣"), 757 | ("hatched_chick", "🐥"), 758 | ("hatching_chick", "🐣"), 759 | ("headphones", "🎧"), 760 | ("headstone", "🪦"), 761 | ("health_worker", "🧑‍⚕️"), 762 | ("hear_no_evil", "🙉"), 763 | ("heard_mcdonald_islands", "🇭🇲"), 764 | ("heart", "❤️"), 765 | ("heart_decoration", "💟"), 766 | ("heart_eyes", "😍"), 767 | ("heart_eyes_cat", "😻"), 768 | ("heart_on_fire", "❤️‍🔥"), 769 | ("heartbeat", "💓"), 770 | ("heartpulse", "💗"), 771 | ("hearts", "♥️"), 772 | ("heavy_check_mark", "✔️"), 773 | ("heavy_division_sign", "➗"), 774 | ("heavy_dollar_sign", "💲"), 775 | ("heavy_exclamation_mark", "❗"), 776 | ("heavy_heart_exclamation", "❣️"), 777 | ("heavy_minus_sign", "➖"), 778 | ("heavy_multiplication_x", "✖️"), 779 | ("heavy_plus_sign", "➕"), 780 | ("hedgehog", "🦔"), 781 | ("helicopter", "🚁"), 782 | ("herb", "🌿"), 783 | ("hibiscus", "🌺"), 784 | ("high_brightness", "🔆"), 785 | ("high_heel", "👠"), 786 | ("hiking_boot", "🥾"), 787 | ("hindu_temple", "🛕"), 788 | ("hippopotamus", "🦛"), 789 | ("hocho", "🔪"), 790 | ("hole", "🕳️"), 791 | ("honduras", "🇭🇳"), 792 | ("honey_pot", "🍯"), 793 | ("honeybee", "🐝"), 794 | ("hong_kong", "🇭🇰"), 795 | ("hook", "🪝"), 796 | ("horse", "🐴"), 797 | ("horse_racing", "🏇"), 798 | ("hospital", "🏥"), 799 | ("hot_face", "🥵"), 800 | ("hot_pepper", "🌶️"), 801 | ("hotdog", "🌭"), 802 | ("hotel", "🏨"), 803 | ("hotsprings", "♨️"), 804 | ("hourglass", "⌛"), 805 | ("hourglass_flowing_sand", "⏳"), 806 | ("house", "🏠"), 807 | 
("house_with_garden", "🏡"), 808 | ("houses", "🏘️"), 809 | ("hugs", "🤗"), 810 | ("hungary", "🇭🇺"), 811 | ("hushed", "😯"), 812 | ("hut", "🛖"), 813 | ("ice_cream", "🍨"), 814 | ("ice_cube", "🧊"), 815 | ("ice_hockey", "🏒"), 816 | ("ice_skate", "⛸️"), 817 | ("icecream", "🍦"), 818 | ("iceland", "🇮🇸"), 819 | ("id", "🆔"), 820 | ("ideograph_advantage", "🉐"), 821 | ("imp", "👿"), 822 | ("inbox_tray", "📥"), 823 | ("incoming_envelope", "📨"), 824 | ("india", "🇮🇳"), 825 | ("indonesia", "🇮🇩"), 826 | ("infinity", "♾️"), 827 | ("information_desk_person", "💁"), 828 | ("information_source", "ℹ️"), 829 | ("innocent", "😇"), 830 | ("interrobang", "⁉️"), 831 | ("iphone", "📱"), 832 | ("iran", "🇮🇷"), 833 | ("iraq", "🇮🇶"), 834 | ("ireland", "🇮🇪"), 835 | ("isle_of_man", "🇮🇲"), 836 | ("israel", "🇮🇱"), 837 | ("it", "🇮🇹"), 838 | ("izakaya_lantern", "🏮"), 839 | ("jack_o_lantern", "🎃"), 840 | ("jamaica", "🇯🇲"), 841 | ("japan", "🗾"), 842 | ("japanese_castle", "🏯"), 843 | ("japanese_goblin", "👺"), 844 | ("japanese_ogre", "👹"), 845 | ("jeans", "👖"), 846 | ("jersey", "🇯🇪"), 847 | ("jigsaw", "🧩"), 848 | ("jordan", "🇯🇴"), 849 | ("joy", "😂"), 850 | ("joy_cat", "😹"), 851 | ("joystick", "🕹️"), 852 | ("jp", "🇯🇵"), 853 | ("judge", "🧑‍⚖️"), 854 | ("juggling_person", "🤹"), 855 | ("kaaba", "🕋"), 856 | ("kangaroo", "🦘"), 857 | ("kazakhstan", "🇰🇿"), 858 | ("kenya", "🇰🇪"), 859 | ("key", "🔑"), 860 | ("keyboard", "⌨️"), 861 | ("keycap_ten", "🔟"), 862 | ("kick_scooter", "🛴"), 863 | ("kimono", "👘"), 864 | ("kiribati", "🇰🇮"), 865 | ("kiss", "💋"), 866 | ("kissing", "😗"), 867 | ("kissing_cat", "😽"), 868 | ("kissing_closed_eyes", "😚"), 869 | ("kissing_heart", "😘"), 870 | ("kissing_smiling_eyes", "😙"), 871 | ("kite", "🪁"), 872 | ("kiwi_fruit", "🥝"), 873 | ("kneeling_man", "🧎‍♂️"), 874 | ("kneeling_person", "🧎"), 875 | ("kneeling_woman", "🧎‍♀️"), 876 | ("knife", "🔪"), 877 | ("knot", "🪢"), 878 | ("koala", "🐨"), 879 | ("koko", "🈁"), 880 | ("kosovo", "🇽🇰"), 881 | ("kr", "🇰🇷"), 882 | ("kuwait", "🇰🇼"), 883 | ("kyrgyzstan", "🇰🇬"), 884 | ("lab_coat", "🥼"), 885 | ("label", "🏷️"), 886 | ("lacrosse", "🥍"), 887 | ("ladder", "🪜"), 888 | ("lady_beetle", "🐞"), 889 | ("lantern", "🏮"), 890 | ("laos", "🇱🇦"), 891 | ("large_blue_circle", "🔵"), 892 | ("large_blue_diamond", "🔷"), 893 | ("large_orange_diamond", "🔶"), 894 | ("last_quarter_moon", "🌗"), 895 | ("last_quarter_moon_with_face", "🌜"), 896 | ("latin_cross", "✝️"), 897 | ("latvia", "🇱🇻"), 898 | ("laughing", "😆"), 899 | ("leafy_green", "🥬"), 900 | ("leaves", "🍃"), 901 | ("lebanon", "🇱🇧"), 902 | ("ledger", "📒"), 903 | ("left_luggage", "🛅"), 904 | ("left_right_arrow", "↔️"), 905 | ("left_speech_bubble", "🗨️"), 906 | ("leftwards_arrow_with_hook", "↩️"), 907 | ("leg", "🦵"), 908 | ("lemon", "🍋"), 909 | ("leo", "♌"), 910 | ("leopard", "🐆"), 911 | ("lesotho", "🇱🇸"), 912 | ("level_slider", "🎚️"), 913 | ("liberia", "🇱🇷"), 914 | ("libra", "♎"), 915 | ("libya", "🇱🇾"), 916 | ("liechtenstein", "🇱🇮"), 917 | ("light_rail", "🚈"), 918 | ("link", "🔗"), 919 | ("lion", "🦁"), 920 | ("lips", "👄"), 921 | ("lipstick", "💄"), 922 | ("lithuania", "🇱🇹"), 923 | ("lizard", "🦎"), 924 | ("llama", "🦙"), 925 | ("lobster", "🦞"), 926 | ("lock", "🔒"), 927 | ("lock_with_ink_pen", "🔏"), 928 | ("lollipop", "🍭"), 929 | ("long_drum", "🪘"), 930 | ("loop", "➿"), 931 | ("lotion_bottle", "🧴"), 932 | ("lotus_position", "🧘"), 933 | ("lotus_position_man", "🧘‍♂️"), 934 | ("lotus_position_woman", "🧘‍♀️"), 935 | ("loud_sound", "🔊"), 936 | ("loudspeaker", "📢"), 937 | ("love_hotel", "🏩"), 938 | ("love_letter", "💌"), 939 | ("love_you_gesture", "🤟"), 940 | 
("low_brightness", "🔅"), 941 | ("luggage", "🧳"), 942 | ("lungs", "🫁"), 943 | ("luxembourg", "🇱🇺"), 944 | ("lying_face", "🤥"), 945 | ("m", "Ⓜ️"), 946 | ("macau", "🇲🇴"), 947 | ("macedonia", "🇲🇰"), 948 | ("madagascar", "🇲🇬"), 949 | ("mag", "🔍"), 950 | ("mag_right", "🔎"), 951 | ("mage", "🧙"), 952 | ("mage_man", "🧙‍♂️"), 953 | ("mage_woman", "🧙‍♀️"), 954 | ("magic_wand", "🪄"), 955 | ("magnet", "🧲"), 956 | ("mahjong", "🀄"), 957 | ("mailbox", "📫"), 958 | ("mailbox_closed", "📪"), 959 | ("mailbox_with_mail", "📬"), 960 | ("mailbox_with_no_mail", "📭"), 961 | ("malawi", "🇲🇼"), 962 | ("malaysia", "🇲🇾"), 963 | ("maldives", "🇲🇻"), 964 | ("male_detective", "🕵️‍♂️"), 965 | ("male_sign", "♂️"), 966 | ("mali", "🇲🇱"), 967 | ("malta", "🇲🇹"), 968 | ("mammoth", "🦣"), 969 | ("man", "👨"), 970 | ("man_artist", "👨‍🎨"), 971 | ("man_astronaut", "👨‍🚀"), 972 | ("man_beard", "🧔‍♂️"), 973 | ("man_cartwheeling", "🤸‍♂️"), 974 | ("man_cook", "👨‍🍳"), 975 | ("man_dancing", "🕺"), 976 | ("man_facepalming", "🤦‍♂️"), 977 | ("man_factory_worker", "👨‍🏭"), 978 | ("man_farmer", "👨‍🌾"), 979 | ("man_feeding_baby", "👨‍🍼"), 980 | ("man_firefighter", "👨‍🚒"), 981 | ("man_health_worker", "👨‍⚕️"), 982 | ("man_in_manual_wheelchair", "👨‍🦽"), 983 | ("man_in_motorized_wheelchair", "👨‍🦼"), 984 | ("man_in_tuxedo", "🤵‍♂️"), 985 | ("man_judge", "👨‍⚖️"), 986 | ("man_juggling", "🤹‍♂️"), 987 | ("man_mechanic", "👨‍🔧"), 988 | ("man_office_worker", "👨‍💼"), 989 | ("man_pilot", "👨‍✈️"), 990 | ("man_playing_handball", "🤾‍♂️"), 991 | ("man_playing_water_polo", "🤽‍♂️"), 992 | ("man_scientist", "👨‍🔬"), 993 | ("man_shrugging", "🤷‍♂️"), 994 | ("man_singer", "👨‍🎤"), 995 | ("man_student", "👨‍🎓"), 996 | ("man_teacher", "👨‍🏫"), 997 | ("man_technologist", "👨‍💻"), 998 | ("man_with_gua_pi_mao", "👲"), 999 | ("man_with_probing_cane", "👨‍🦯"), 1000 | ("man_with_turban", "👳‍♂️"), 1001 | ("man_with_veil", "👰‍♂️"), 1002 | ("mandarin", "🍊"), 1003 | ("mango", "🥭"), 1004 | ("mans_shoe", "👞"), 1005 | ("mantelpiece_clock", "🕰️"), 1006 | ("manual_wheelchair", "🦽"), 1007 | ("maple_leaf", "🍁"), 1008 | ("marshall_islands", "🇲🇭"), 1009 | ("martial_arts_uniform", "🥋"), 1010 | ("martinique", "🇲🇶"), 1011 | ("mask", "😷"), 1012 | ("massage", "💆"), 1013 | ("massage_man", "💆‍♂️"), 1014 | ("massage_woman", "💆‍♀️"), 1015 | ("mate", "🧉"), 1016 | ("mauritania", "🇲🇷"), 1017 | ("mauritius", "🇲🇺"), 1018 | ("mayotte", "🇾🇹"), 1019 | ("meat_on_bone", "🍖"), 1020 | ("mechanic", "🧑‍🔧"), 1021 | ("mechanical_arm", "🦾"), 1022 | ("mechanical_leg", "🦿"), 1023 | ("medal_military", "🎖️"), 1024 | ("medal_sports", "🏅"), 1025 | ("medical_symbol", "⚕️"), 1026 | ("mega", "📣"), 1027 | ("melon", "🍈"), 1028 | ("memo", "📝"), 1029 | ("men_wrestling", "🤼‍♂️"), 1030 | ("mending_heart", "❤️‍🩹"), 1031 | ("menorah", "🕎"), 1032 | ("mens", "🚹"), 1033 | ("mermaid", "🧜‍♀️"), 1034 | ("merman", "🧜‍♂️"), 1035 | ("merperson", "🧜"), 1036 | ("metal", "🤘"), 1037 | ("metro", "🚇"), 1038 | ("mexico", "🇲🇽"), 1039 | ("microbe", "🦠"), 1040 | ("micronesia", "🇫🇲"), 1041 | ("microphone", "🎤"), 1042 | ("microscope", "🔬"), 1043 | ("middle_finger", "🖕"), 1044 | ("military_helmet", "🪖"), 1045 | ("milk_glass", "🥛"), 1046 | ("milky_way", "🌌"), 1047 | ("minibus", "🚐"), 1048 | ("minidisc", "💽"), 1049 | ("mirror", "🪞"), 1050 | ("mobile_phone_off", "📴"), 1051 | ("moldova", "🇲🇩"), 1052 | ("monaco", "🇲🇨"), 1053 | ("money_mouth_face", "🤑"), 1054 | ("money_with_wings", "💸"), 1055 | ("moneybag", "💰"), 1056 | ("mongolia", "🇲🇳"), 1057 | ("monkey", "🐒"), 1058 | ("monkey_face", "🐵"), 1059 | ("monocle_face", "🧐"), 1060 | ("monorail", "🚝"), 1061 | ("montenegro", "🇲🇪"), 
1062 | ("montserrat", "🇲🇸"), 1063 | ("moon", "🌔"), 1064 | ("moon_cake", "🥮"), 1065 | ("morocco", "🇲🇦"), 1066 | ("mortar_board", "🎓"), 1067 | ("mosque", "🕌"), 1068 | ("mosquito", "🦟"), 1069 | ("motor_boat", "🛥️"), 1070 | ("motor_scooter", "🛵"), 1071 | ("motorcycle", "🏍️"), 1072 | ("motorized_wheelchair", "🦼"), 1073 | ("motorway", "🛣️"), 1074 | ("mount_fuji", "🗻"), 1075 | ("mountain", "⛰️"), 1076 | ("mountain_bicyclist", "🚵"), 1077 | ("mountain_biking_man", "🚵‍♂️"), 1078 | ("mountain_biking_woman", "🚵‍♀️"), 1079 | ("mountain_cableway", "🚠"), 1080 | ("mountain_railway", "🚞"), 1081 | ("mountain_snow", "🏔️"), 1082 | ("mouse", "🐭"), 1083 | ("mouse2", "🐁"), 1084 | ("mouse_trap", "🪤"), 1085 | ("movie_camera", "🎥"), 1086 | ("moyai", "🗿"), 1087 | ("mozambique", "🇲🇿"), 1088 | ("mrs_claus", "🤶"), 1089 | ("muscle", "💪"), 1090 | ("mushroom", "🍄"), 1091 | ("musical_keyboard", "🎹"), 1092 | ("musical_note", "🎵"), 1093 | ("musical_score", "🎼"), 1094 | ("mute", "🔇"), 1095 | ("mx_claus", "🧑‍🎄"), 1096 | ("myanmar", "🇲🇲"), 1097 | ("nail_care", "💅"), 1098 | ("name_badge", "📛"), 1099 | ("namibia", "🇳🇦"), 1100 | ("national_park", "🏞️"), 1101 | ("nauru", "🇳🇷"), 1102 | ("nauseated_face", "🤢"), 1103 | ("nazar_amulet", "🧿"), 1104 | ("necktie", "👔"), 1105 | ("negative_squared_cross_mark", "❎"), 1106 | ("nepal", "🇳🇵"), 1107 | ("nerd_face", "🤓"), 1108 | ("nesting_dolls", "🪆"), 1109 | ("netherlands", "🇳🇱"), 1110 | ("neutral_face", "😐"), 1111 | ("new", "🆕"), 1112 | ("new_caledonia", "🇳🇨"), 1113 | ("new_moon", "🌑"), 1114 | ("new_moon_with_face", "🌚"), 1115 | ("new_zealand", "🇳🇿"), 1116 | ("newspaper", "📰"), 1117 | ("newspaper_roll", "🗞️"), 1118 | ("next_track_button", "⏭️"), 1119 | ("ng", "🆖"), 1120 | ("ng_man", "🙅‍♂️"), 1121 | ("ng_woman", "🙅‍♀️"), 1122 | ("nicaragua", "🇳🇮"), 1123 | ("niger", "🇳🇪"), 1124 | ("nigeria", "🇳🇬"), 1125 | ("night_with_stars", "🌃"), 1126 | ("nine", "9️⃣"), 1127 | ("ninja", "🥷"), 1128 | ("niue", "🇳🇺"), 1129 | ("no_bell", "🔕"), 1130 | ("no_bicycles", "🚳"), 1131 | ("no_entry", "⛔"), 1132 | ("no_entry_sign", "🚫"), 1133 | ("no_good", "🙅"), 1134 | ("no_good_man", "🙅‍♂️"), 1135 | ("no_good_woman", "🙅‍♀️"), 1136 | ("no_mobile_phones", "📵"), 1137 | ("no_mouth", "😶"), 1138 | ("no_pedestrians", "🚷"), 1139 | ("no_smoking", "🚭"), 1140 | ("non-potable_water", "🚱"), 1141 | ("norfolk_island", "🇳🇫"), 1142 | ("north_korea", "🇰🇵"), 1143 | ("northern_mariana_islands", "🇲🇵"), 1144 | ("norway", "🇳🇴"), 1145 | ("nose", "👃"), 1146 | ("notebook", "📓"), 1147 | ("notebook_with_decorative_cover", "📔"), 1148 | ("notes", "🎶"), 1149 | ("nut_and_bolt", "🔩"), 1150 | ("o", "⭕"), 1151 | ("o2", "🅾️"), 1152 | ("ocean", "🌊"), 1153 | ("octopus", "🐙"), 1154 | ("oden", "🍢"), 1155 | ("office", "🏢"), 1156 | ("office_worker", "🧑‍💼"), 1157 | ("oil_drum", "🛢️"), 1158 | ("ok", "🆗"), 1159 | ("ok_hand", "👌"), 1160 | ("ok_man", "🙆‍♂️"), 1161 | ("ok_person", "🙆"), 1162 | ("ok_woman", "🙆‍♀️"), 1163 | ("old_key", "🗝️"), 1164 | ("older_adult", "🧓"), 1165 | ("older_man", "👴"), 1166 | ("older_woman", "👵"), 1167 | ("olive", "🫒"), 1168 | ("om", "🕉️"), 1169 | ("oman", "🇴🇲"), 1170 | ("on", "🔛"), 1171 | ("oncoming_automobile", "🚘"), 1172 | ("oncoming_bus", "🚍"), 1173 | ("oncoming_police_car", "🚔"), 1174 | ("oncoming_taxi", "🚖"), 1175 | ("one", "1️⃣"), 1176 | ("one_piece_swimsuit", "🩱"), 1177 | ("onion", "🧅"), 1178 | ("open_book", "📖"), 1179 | ("open_file_folder", "📂"), 1180 | ("open_hands", "👐"), 1181 | ("open_mouth", "😮"), 1182 | ("open_umbrella", "☂️"), 1183 | ("ophiuchus", "⛎"), 1184 | ("orange", "🍊"), 1185 | ("orange_book", "📙"), 1186 | ("orange_circle", 
"🟠"), 1187 | ("orange_heart", "🧡"), 1188 | ("orange_square", "🟧"), 1189 | ("orangutan", "🦧"), 1190 | ("orthodox_cross", "☦️"), 1191 | ("otter", "🦦"), 1192 | ("outbox_tray", "📤"), 1193 | ("owl", "🦉"), 1194 | ("ox", "🐂"), 1195 | ("oyster", "🦪"), 1196 | ("package", "📦"), 1197 | ("page_facing_up", "📄"), 1198 | ("page_with_curl", "📃"), 1199 | ("pager", "📟"), 1200 | ("paintbrush", "🖌️"), 1201 | ("pakistan", "🇵🇰"), 1202 | ("palau", "🇵🇼"), 1203 | ("palestinian_territories", "🇵🇸"), 1204 | ("palm_tree", "🌴"), 1205 | ("palms_up_together", "🤲"), 1206 | ("panama", "🇵🇦"), 1207 | ("pancakes", "🥞"), 1208 | ("panda_face", "🐼"), 1209 | ("paperclip", "📎"), 1210 | ("paperclips", "🖇️"), 1211 | ("papua_new_guinea", "🇵🇬"), 1212 | ("parachute", "🪂"), 1213 | ("paraguay", "🇵🇾"), 1214 | ("parasol_on_ground", "⛱️"), 1215 | ("parking", "🅿️"), 1216 | ("parrot", "🦜"), 1217 | ("part_alternation_mark", "〽️"), 1218 | ("partly_sunny", "⛅"), 1219 | ("partying_face", "🥳"), 1220 | ("passenger_ship", "🛳️"), 1221 | ("passport_control", "🛂"), 1222 | ("pause_button", "⏸️"), 1223 | ("paw_prints", "🐾"), 1224 | ("peace_symbol", "☮️"), 1225 | ("peach", "🍑"), 1226 | ("peacock", "🦚"), 1227 | ("peanuts", "🥜"), 1228 | ("pear", "🍐"), 1229 | ("pen", "🖊️"), 1230 | ("pencil", "📝"), 1231 | ("pencil2", "✏️"), 1232 | ("penguin", "🐧"), 1233 | ("pensive", "😔"), 1234 | ("people_holding_hands", "🧑‍🤝‍🧑"), 1235 | ("people_hugging", "🫂"), 1236 | ("performing_arts", "🎭"), 1237 | ("persevere", "😣"), 1238 | ("person_bald", "🧑‍🦲"), 1239 | ("person_curly_hair", "🧑‍🦱"), 1240 | ("person_feeding_baby", "🧑‍🍼"), 1241 | ("person_fencing", "🤺"), 1242 | ("person_in_manual_wheelchair", "🧑‍🦽"), 1243 | ("person_in_motorized_wheelchair", "🧑‍🦼"), 1244 | ("person_in_tuxedo", "🤵"), 1245 | ("person_red_hair", "🧑‍🦰"), 1246 | ("person_white_hair", "🧑‍🦳"), 1247 | ("person_with_probing_cane", "🧑‍🦯"), 1248 | ("person_with_turban", "👳"), 1249 | ("person_with_veil", "👰"), 1250 | ("peru", "🇵🇪"), 1251 | ("petri_dish", "🧫"), 1252 | ("philippines", "🇵🇭"), 1253 | ("phone", "☎️"), 1254 | ("pick", "⛏️"), 1255 | ("pickup_truck", "🛻"), 1256 | ("pie", "🥧"), 1257 | ("pig", "🐷"), 1258 | ("pig2", "🐖"), 1259 | ("pig_nose", "🐽"), 1260 | ("pill", "💊"), 1261 | ("pilot", "🧑‍✈️"), 1262 | ("pinata", "🪅"), 1263 | ("pinched_fingers", "🤌"), 1264 | ("pinching_hand", "🤏"), 1265 | ("pineapple", "🍍"), 1266 | ("ping_pong", "🏓"), 1267 | ("pirate_flag", "🏴‍☠️"), 1268 | ("pisces", "♓"), 1269 | ("pitcairn_islands", "🇵🇳"), 1270 | ("pizza", "🍕"), 1271 | ("placard", "🪧"), 1272 | ("place_of_worship", "🛐"), 1273 | ("plate_with_cutlery", "🍽️"), 1274 | ("play_or_pause_button", "⏯️"), 1275 | ("pleading_face", "🥺"), 1276 | ("plunger", "🪠"), 1277 | ("point_down", "👇"), 1278 | ("point_left", "👈"), 1279 | ("point_right", "👉"), 1280 | ("point_up", "☝️"), 1281 | ("point_up_2", "👆"), 1282 | ("poland", "🇵🇱"), 1283 | ("polar_bear", "🐻‍❄️"), 1284 | ("police_car", "🚓"), 1285 | ("police_officer", "👮"), 1286 | ("policeman", "👮‍♂️"), 1287 | ("policewoman", "👮‍♀️"), 1288 | ("poodle", "🐩"), 1289 | ("poop", "💩"), 1290 | ("popcorn", "🍿"), 1291 | ("portugal", "🇵🇹"), 1292 | ("post_office", "🏣"), 1293 | ("postal_horn", "📯"), 1294 | ("postbox", "📮"), 1295 | ("potable_water", "🚰"), 1296 | ("potato", "🥔"), 1297 | ("potted_plant", "🪴"), 1298 | ("pouch", "👝"), 1299 | ("poultry_leg", "🍗"), 1300 | ("pound", "💷"), 1301 | ("pout", "😡"), 1302 | ("pouting_cat", "😾"), 1303 | ("pouting_face", "🙎"), 1304 | ("pouting_man", "🙎‍♂️"), 1305 | ("pouting_woman", "🙎‍♀️"), 1306 | ("pray", "🙏"), 1307 | ("prayer_beads", "📿"), 1308 | ("pregnant_woman", "🤰"), 1309 | 
("pretzel", "🥨"), 1310 | ("previous_track_button", "⏮️"), 1311 | ("prince", "🤴"), 1312 | ("princess", "👸"), 1313 | ("printer", "🖨️"), 1314 | ("probing_cane", "🦯"), 1315 | ("puerto_rico", "🇵🇷"), 1316 | ("punch", "👊"), 1317 | ("purple_circle", "🟣"), 1318 | ("purple_heart", "💜"), 1319 | ("purple_square", "🟪"), 1320 | ("purse", "👛"), 1321 | ("pushpin", "📌"), 1322 | ("put_litter_in_its_place", "🚮"), 1323 | ("qatar", "🇶🇦"), 1324 | ("question", "❓"), 1325 | ("rabbit", "🐰"), 1326 | ("rabbit2", "🐇"), 1327 | ("raccoon", "🦝"), 1328 | ("racehorse", "🐎"), 1329 | ("racing_car", "🏎️"), 1330 | ("radio", "📻"), 1331 | ("radio_button", "🔘"), 1332 | ("radioactive", "☢️"), 1333 | ("rage", "😡"), 1334 | ("railway_car", "🚃"), 1335 | ("railway_track", "🛤️"), 1336 | ("rainbow", "🌈"), 1337 | ("rainbow_flag", "🏳️‍🌈"), 1338 | ("raised_back_of_hand", "🤚"), 1339 | ("raised_eyebrow", "🤨"), 1340 | ("raised_hand", "✋"), 1341 | ("raised_hand_with_fingers_splayed", "🖐️"), 1342 | ("raised_hands", "🙌"), 1343 | ("raising_hand", "🙋"), 1344 | ("raising_hand_man", "🙋‍♂️"), 1345 | ("raising_hand_woman", "🙋‍♀️"), 1346 | ("ram", "🐏"), 1347 | ("ramen", "🍜"), 1348 | ("rat", "🐀"), 1349 | ("razor", "🪒"), 1350 | ("receipt", "🧾"), 1351 | ("record_button", "⏺️"), 1352 | ("recycle", "♻️"), 1353 | ("red_car", "🚗"), 1354 | ("red_circle", "🔴"), 1355 | ("red_envelope", "🧧"), 1356 | ("red_haired_man", "👨‍🦰"), 1357 | ("red_haired_woman", "👩‍🦰"), 1358 | ("red_square", "🟥"), 1359 | ("registered", "®️"), 1360 | ("relaxed", "☺️"), 1361 | ("relieved", "😌"), 1362 | ("reminder_ribbon", "🎗️"), 1363 | ("repeat", "🔁"), 1364 | ("repeat_one", "🔂"), 1365 | ("rescue_worker_helmet", "⛑️"), 1366 | ("restroom", "🚻"), 1367 | ("reunion", "🇷🇪"), 1368 | ("revolving_hearts", "💞"), 1369 | ("rewind", "⏪"), 1370 | ("rhinoceros", "🦏"), 1371 | ("ribbon", "🎀"), 1372 | ("rice", "🍚"), 1373 | ("rice_ball", "🍙"), 1374 | ("rice_cracker", "🍘"), 1375 | ("rice_scene", "🎑"), 1376 | ("right_anger_bubble", "🗯️"), 1377 | ("ring", "💍"), 1378 | ("ringed_planet", "🪐"), 1379 | ("robot", "🤖"), 1380 | ("rock", "🪨"), 1381 | ("rocket", "🚀"), 1382 | ("rofl", "🤣"), 1383 | ("roll_eyes", "🙄"), 1384 | ("roll_of_paper", "🧻"), 1385 | ("roller_coaster", "🎢"), 1386 | ("roller_skate", "🛼"), 1387 | ("romania", "🇷🇴"), 1388 | ("rooster", "🐓"), 1389 | ("rose", "🌹"), 1390 | ("rosette", "🏵️"), 1391 | ("rotating_light", "🚨"), 1392 | ("round_pushpin", "📍"), 1393 | ("rowboat", "🚣"), 1394 | ("rowing_man", "🚣‍♂️"), 1395 | ("rowing_woman", "🚣‍♀️"), 1396 | ("ru", "🇷🇺"), 1397 | ("rugby_football", "🏉"), 1398 | ("runner", "🏃"), 1399 | ("running", "🏃"), 1400 | ("running_man", "🏃‍♂️"), 1401 | ("running_shirt_with_sash", "🎽"), 1402 | ("running_woman", "🏃‍♀️"), 1403 | ("rwanda", "🇷🇼"), 1404 | ("sa", "🈂️"), 1405 | ("safety_pin", "🧷"), 1406 | ("safety_vest", "🦺"), 1407 | ("sagittarius", "♐"), 1408 | ("sailboat", "⛵"), 1409 | ("sake", "🍶"), 1410 | ("salt", "🧂"), 1411 | ("samoa", "🇼🇸"), 1412 | ("san_marino", "🇸🇲"), 1413 | ("sandal", "👡"), 1414 | ("sandwich", "🥪"), 1415 | ("santa", "🎅"), 1416 | ("sao_tome_principe", "🇸🇹"), 1417 | ("sari", "🥻"), 1418 | ("sassy_man", "💁‍♂️"), 1419 | ("sassy_woman", "💁‍♀️"), 1420 | ("satellite", "📡"), 1421 | ("satisfied", "😆"), 1422 | ("saudi_arabia", "🇸🇦"), 1423 | ("sauna_man", "🧖‍♂️"), 1424 | ("sauna_person", "🧖"), 1425 | ("sauna_woman", "🧖‍♀️"), 1426 | ("sauropod", "🦕"), 1427 | ("saxophone", "🎷"), 1428 | ("scarf", "🧣"), 1429 | ("school", "🏫"), 1430 | ("school_satchel", "🎒"), 1431 | ("scientist", "🧑‍🔬"), 1432 | ("scissors", "✂️"), 1433 | ("scorpion", "🦂"), 1434 | ("scorpius", "♏"), 1435 | 
("scotland", "🏴󠁧󠁢󠁳󠁣󠁴󠁿"), 1436 | ("scream", "😱"), 1437 | ("scream_cat", "🙀"), 1438 | ("screwdriver", "🪛"), 1439 | ("scroll", "📜"), 1440 | ("seal", "🦭"), 1441 | ("seat", "💺"), 1442 | ("secret", "㊙️"), 1443 | ("see_no_evil", "🙈"), 1444 | ("seedling", "🌱"), 1445 | ("selfie", "🤳"), 1446 | ("senegal", "🇸🇳"), 1447 | ("serbia", "🇷🇸"), 1448 | ("service_dog", "🐕‍🦺"), 1449 | ("seven", "7️⃣"), 1450 | ("sewing_needle", "🪡"), 1451 | ("seychelles", "🇸🇨"), 1452 | ("shallow_pan_of_food", "🥘"), 1453 | ("shamrock", "☘️"), 1454 | ("shark", "🦈"), 1455 | ("shaved_ice", "🍧"), 1456 | ("sheep", "🐑"), 1457 | ("shell", "🐚"), 1458 | ("shield", "🛡️"), 1459 | ("shinto_shrine", "⛩️"), 1460 | ("ship", "🚢"), 1461 | ("shirt", "👕"), 1462 | ("shit", "💩"), 1463 | ("shoe", "👞"), 1464 | ("shopping", "🛍️"), 1465 | ("shopping_cart", "🛒"), 1466 | ("shorts", "🩳"), 1467 | ("shower", "🚿"), 1468 | ("shrimp", "🦐"), 1469 | ("shrug", "🤷"), 1470 | ("shushing_face", "🤫"), 1471 | ("sierra_leone", "🇸🇱"), 1472 | ("signal_strength", "📶"), 1473 | ("singapore", "🇸🇬"), 1474 | ("singer", "🧑‍🎤"), 1475 | ("sint_maarten", "🇸🇽"), 1476 | ("six", "6️⃣"), 1477 | ("six_pointed_star", "🔯"), 1478 | ("skateboard", "🛹"), 1479 | ("ski", "🎿"), 1480 | ("skier", "⛷️"), 1481 | ("skull", "💀"), 1482 | ("skull_and_crossbones", "☠️"), 1483 | ("skunk", "🦨"), 1484 | ("sled", "🛷"), 1485 | ("sleeping", "😴"), 1486 | ("sleeping_bed", "🛌"), 1487 | ("sleepy", "😪"), 1488 | ("slightly_frowning_face", "🙁"), 1489 | ("slightly_smiling_face", "🙂"), 1490 | ("slot_machine", "🎰"), 1491 | ("sloth", "🦥"), 1492 | ("slovakia", "🇸🇰"), 1493 | ("slovenia", "🇸🇮"), 1494 | ("small_airplane", "🛩️"), 1495 | ("small_blue_diamond", "🔹"), 1496 | ("small_orange_diamond", "🔸"), 1497 | ("small_red_triangle", "🔺"), 1498 | ("small_red_triangle_down", "🔻"), 1499 | ("smile", "😄"), 1500 | ("smile_cat", "😸"), 1501 | ("smiley", "😃"), 1502 | ("smiley_cat", "😺"), 1503 | ("smiling_face_with_tear", "🥲"), 1504 | ("smiling_face_with_three_hearts", "🥰"), 1505 | ("smiling_imp", "😈"), 1506 | ("smirk", "😏"), 1507 | ("smirk_cat", "😼"), 1508 | ("smoking", "🚬"), 1509 | ("snail", "🐌"), 1510 | ("snake", "🐍"), 1511 | ("sneezing_face", "🤧"), 1512 | ("snowboarder", "🏂"), 1513 | ("snowflake", "❄️"), 1514 | ("snowman", "⛄"), 1515 | ("snowman_with_snow", "☃️"), 1516 | ("soap", "🧼"), 1517 | ("sob", "😭"), 1518 | ("soccer", "⚽"), 1519 | ("socks", "🧦"), 1520 | ("softball", "🥎"), 1521 | ("solomon_islands", "🇸🇧"), 1522 | ("somalia", "🇸🇴"), 1523 | ("soon", "🔜"), 1524 | ("sos", "🆘"), 1525 | ("sound", "🔉"), 1526 | ("south_africa", "🇿🇦"), 1527 | ("south_georgia_south_sandwich_islands", "🇬🇸"), 1528 | ("south_sudan", "🇸🇸"), 1529 | ("space_invader", "👾"), 1530 | ("spades", "♠️"), 1531 | ("spaghetti", "🍝"), 1532 | ("sparkle", "❇️"), 1533 | ("sparkler", "🎇"), 1534 | ("sparkles", "✨"), 1535 | ("sparkling_heart", "💖"), 1536 | ("speak_no_evil", "🙊"), 1537 | ("speaker", "🔈"), 1538 | ("speaking_head", "🗣️"), 1539 | ("speech_balloon", "💬"), 1540 | ("speedboat", "🚤"), 1541 | ("spider", "🕷️"), 1542 | ("spider_web", "🕸️"), 1543 | ("spiral_calendar", "🗓️"), 1544 | ("spiral_notepad", "🗒️"), 1545 | ("sponge", "🧽"), 1546 | ("spoon", "🥄"), 1547 | ("squid", "🦑"), 1548 | ("sri_lanka", "🇱🇰"), 1549 | ("st_barthelemy", "🇧🇱"), 1550 | ("st_helena", "🇸🇭"), 1551 | ("st_kitts_nevis", "🇰🇳"), 1552 | ("st_lucia", "🇱🇨"), 1553 | ("st_martin", "🇲🇫"), 1554 | ("st_pierre_miquelon", "🇵🇲"), 1555 | ("st_vincent_grenadines", "🇻🇨"), 1556 | ("stadium", "🏟️"), 1557 | ("standing_man", "🧍‍♂️"), 1558 | ("standing_person", "🧍"), 1559 | ("standing_woman", "🧍‍♀️"), 1560 | ("star", "⭐"), 
1561 | ("star2", "🌟"), 1562 | ("star_and_crescent", "☪️"), 1563 | ("star_of_david", "✡️"), 1564 | ("star_struck", "🤩"), 1565 | ("stars", "🌠"), 1566 | ("station", "🚉"), 1567 | ("statue_of_liberty", "🗽"), 1568 | ("steam_locomotive", "🚂"), 1569 | ("stethoscope", "🩺"), 1570 | ("stew", "🍲"), 1571 | ("stop_button", "⏹️"), 1572 | ("stop_sign", "🛑"), 1573 | ("stopwatch", "⏱️"), 1574 | ("straight_ruler", "📏"), 1575 | ("strawberry", "🍓"), 1576 | ("stuck_out_tongue", "😛"), 1577 | ("stuck_out_tongue_closed_eyes", "😝"), 1578 | ("stuck_out_tongue_winking_eye", "😜"), 1579 | ("student", "🧑‍🎓"), 1580 | ("studio_microphone", "🎙️"), 1581 | ("stuffed_flatbread", "🥙"), 1582 | ("sudan", "🇸🇩"), 1583 | ("sun_behind_large_cloud", "🌥️"), 1584 | ("sun_behind_rain_cloud", "🌦️"), 1585 | ("sun_behind_small_cloud", "🌤️"), 1586 | ("sun_with_face", "🌞"), 1587 | ("sunflower", "🌻"), 1588 | ("sunglasses", "😎"), 1589 | ("sunny", "☀️"), 1590 | ("sunrise", "🌅"), 1591 | ("sunrise_over_mountains", "🌄"), 1592 | ("superhero", "🦸"), 1593 | ("superhero_man", "🦸‍♂️"), 1594 | ("superhero_woman", "🦸‍♀️"), 1595 | ("supervillain", "🦹"), 1596 | ("supervillain_man", "🦹‍♂️"), 1597 | ("supervillain_woman", "🦹‍♀️"), 1598 | ("surfer", "🏄"), 1599 | ("surfing_man", "🏄‍♂️"), 1600 | ("surfing_woman", "🏄‍♀️"), 1601 | ("suriname", "🇸🇷"), 1602 | ("sushi", "🍣"), 1603 | ("suspension_railway", "🚟"), 1604 | ("svalbard_jan_mayen", "🇸🇯"), 1605 | ("swan", "🦢"), 1606 | ("swaziland", "🇸🇿"), 1607 | ("sweat", "😓"), 1608 | ("sweat_drops", "💦"), 1609 | ("sweat_smile", "😅"), 1610 | ("sweden", "🇸🇪"), 1611 | ("sweet_potato", "🍠"), 1612 | ("swim_brief", "🩲"), 1613 | ("swimmer", "🏊"), 1614 | ("swimming_man", "🏊‍♂️"), 1615 | ("swimming_woman", "🏊‍♀️"), 1616 | ("switzerland", "🇨🇭"), 1617 | ("symbols", "🔣"), 1618 | ("synagogue", "🕍"), 1619 | ("syria", "🇸🇾"), 1620 | ("syringe", "💉"), 1621 | ("t-rex", "🦖"), 1622 | ("taco", "🌮"), 1623 | ("tada", "🎉"), 1624 | ("taiwan", "🇹🇼"), 1625 | ("tajikistan", "🇹🇯"), 1626 | ("takeout_box", "🥡"), 1627 | ("tamale", "🫔"), 1628 | ("tanabata_tree", "🎋"), 1629 | ("tangerine", "🍊"), 1630 | ("tanzania", "🇹🇿"), 1631 | ("taurus", "♉"), 1632 | ("taxi", "🚕"), 1633 | ("tea", "🍵"), 1634 | ("teacher", "🧑‍🏫"), 1635 | ("teapot", "🫖"), 1636 | ("technologist", "🧑‍💻"), 1637 | ("teddy_bear", "🧸"), 1638 | ("telephone", "☎️"), 1639 | ("telephone_receiver", "📞"), 1640 | ("telescope", "🔭"), 1641 | ("tennis", "🎾"), 1642 | ("tent", "⛺"), 1643 | ("test_tube", "🧪"), 1644 | ("thailand", "🇹🇭"), 1645 | ("thermometer", "🌡️"), 1646 | ("thinking", "🤔"), 1647 | ("thong_sandal", "🩴"), 1648 | ("thought_balloon", "💭"), 1649 | ("thread", "🧵"), 1650 | ("three", "3️⃣"), 1651 | ("thumbsdown", "👎"), 1652 | ("thumbsup", "👍"), 1653 | ("ticket", "🎫"), 1654 | ("tickets", "🎟️"), 1655 | ("tiger", "🐯"), 1656 | ("tiger2", "🐅"), 1657 | ("timer_clock", "⏲️"), 1658 | ("timor_leste", "🇹🇱"), 1659 | ("tipping_hand_man", "💁‍♂️"), 1660 | ("tipping_hand_person", "💁"), 1661 | ("tipping_hand_woman", "💁‍♀️"), 1662 | ("tired_face", "😫"), 1663 | ("tm", "™️"), 1664 | ("togo", "🇹🇬"), 1665 | ("toilet", "🚽"), 1666 | ("tokelau", "🇹🇰"), 1667 | ("tokyo_tower", "🗼"), 1668 | ("tomato", "🍅"), 1669 | ("tonga", "🇹🇴"), 1670 | ("tongue", "👅"), 1671 | ("toolbox", "🧰"), 1672 | ("tooth", "🦷"), 1673 | ("toothbrush", "🪥"), 1674 | ("top", "🔝"), 1675 | ("tophat", "🎩"), 1676 | ("tornado", "🌪️"), 1677 | ("tr", "🇹🇷"), 1678 | ("trackball", "🖲️"), 1679 | ("tractor", "🚜"), 1680 | ("traffic_light", "🚥"), 1681 | ("train", "🚋"), 1682 | ("train2", "🚆"), 1683 | ("tram", "🚊"), 1684 | ("transgender_flag", "🏳️‍⚧️"), 1685 | 
("transgender_symbol", "⚧️"), 1686 | ("triangular_flag_on_post", "🚩"), 1687 | ("triangular_ruler", "📐"), 1688 | ("trident", "🔱"), 1689 | ("trinidad_tobago", "🇹🇹"), 1690 | ("tristan_da_cunha", "🇹🇦"), 1691 | ("triumph", "😤"), 1692 | ("trolleybus", "🚎"), 1693 | ("trophy", "🏆"), 1694 | ("tropical_drink", "🍹"), 1695 | ("tropical_fish", "🐠"), 1696 | ("truck", "🚚"), 1697 | ("trumpet", "🎺"), 1698 | ("tshirt", "👕"), 1699 | ("tulip", "🌷"), 1700 | ("tumbler_glass", "🥃"), 1701 | ("tunisia", "🇹🇳"), 1702 | ("turkey", "🦃"), 1703 | ("turkmenistan", "🇹🇲"), 1704 | ("turks_caicos_islands", "🇹🇨"), 1705 | ("turtle", "🐢"), 1706 | ("tuvalu", "🇹🇻"), 1707 | ("tv", "📺"), 1708 | ("twisted_rightwards_arrows", "🔀"), 1709 | ("two", "2️⃣"), 1710 | ("two_hearts", "💕"), 1711 | ("two_men_holding_hands", "👬"), 1712 | ("two_women_holding_hands", "👭"), 1713 | ("u5272", "🈹"), 1714 | ("u5408", "🈴"), 1715 | ("u55b6", "🈺"), 1716 | ("u6307", "🈯"), 1717 | ("u6708", "🈷️"), 1718 | ("u6709", "🈶"), 1719 | ("u6e80", "🈵"), 1720 | ("u7121", "🈚"), 1721 | ("u7533", "🈸"), 1722 | ("u7981", "🈲"), 1723 | ("u7a7a", "🈳"), 1724 | ("uganda", "🇺🇬"), 1725 | ("uk", "🇬🇧"), 1726 | ("ukraine", "🇺🇦"), 1727 | ("umbrella", "☔"), 1728 | ("unamused", "😒"), 1729 | ("underage", "🔞"), 1730 | ("unicorn", "🦄"), 1731 | ("united_arab_emirates", "🇦🇪"), 1732 | ("united_nations", "🇺🇳"), 1733 | ("unlock", "🔓"), 1734 | ("up", "🆙"), 1735 | ("upside_down_face", "🙃"), 1736 | ("uruguay", "🇺🇾"), 1737 | ("us", "🇺🇸"), 1738 | ("us_outlying_islands", "🇺🇲"), 1739 | ("us_virgin_islands", "🇻🇮"), 1740 | ("uzbekistan", "🇺🇿"), 1741 | ("v", "✌️"), 1742 | ("vampire", "🧛"), 1743 | ("vampire_man", "🧛‍♂️"), 1744 | ("vampire_woman", "🧛‍♀️"), 1745 | ("vanuatu", "🇻🇺"), 1746 | ("vatican_city", "🇻🇦"), 1747 | ("venezuela", "🇻🇪"), 1748 | ("vertical_traffic_light", "🚦"), 1749 | ("vhs", "📼"), 1750 | ("vibration_mode", "📳"), 1751 | ("video_camera", "📹"), 1752 | ("video_game", "🎮"), 1753 | ("vietnam", "🇻🇳"), 1754 | ("violin", "🎻"), 1755 | ("virgo", "♍"), 1756 | ("volcano", "🌋"), 1757 | ("volleyball", "🏐"), 1758 | ("vomiting_face", "🤮"), 1759 | ("vs", "🆚"), 1760 | ("vulcan_salute", "🖖"), 1761 | ("waffle", "🧇"), 1762 | ("wales", "🏴󠁧󠁢󠁷󠁬󠁳󠁿"), 1763 | ("walking", "🚶"), 1764 | ("walking_man", "🚶‍♂️"), 1765 | ("walking_woman", "🚶‍♀️"), 1766 | ("wallis_futuna", "🇼🇫"), 1767 | ("waning_crescent_moon", "🌘"), 1768 | ("waning_gibbous_moon", "🌖"), 1769 | ("warning", "⚠️"), 1770 | ("wastebasket", "🗑️"), 1771 | ("watch", "⌚"), 1772 | ("water_buffalo", "🐃"), 1773 | ("water_polo", "🤽"), 1774 | ("watermelon", "🍉"), 1775 | ("wave", "👋"), 1776 | ("wavy_dash", "〰️"), 1777 | ("waxing_crescent_moon", "🌒"), 1778 | ("waxing_gibbous_moon", "🌔"), 1779 | ("wc", "🚾"), 1780 | ("weary", "😩"), 1781 | ("wedding", "💒"), 1782 | ("weight_lifting", "🏋️"), 1783 | ("weight_lifting_man", "🏋️‍♂️"), 1784 | ("weight_lifting_woman", "🏋️‍♀️"), 1785 | ("western_sahara", "🇪🇭"), 1786 | ("whale", "🐳"), 1787 | ("whale2", "🐋"), 1788 | ("wheel_of_dharma", "☸️"), 1789 | ("wheelchair", "♿"), 1790 | ("white_check_mark", "✅"), 1791 | ("white_circle", "⚪"), 1792 | ("white_flag", "🏳️"), 1793 | ("white_flower", "💮"), 1794 | ("white_haired_man", "👨‍🦳"), 1795 | ("white_haired_woman", "👩‍🦳"), 1796 | ("white_heart", "🤍"), 1797 | ("white_large_square", "⬜"), 1798 | ("white_medium_small_square", "◽"), 1799 | ("white_medium_square", "◻️"), 1800 | ("white_small_square", "▫️"), 1801 | ("white_square_button", "🔳"), 1802 | ("wilted_flower", "🥀"), 1803 | ("wind_chime", "🎐"), 1804 | ("wind_face", "🌬️"), 1805 | ("window", "🪟"), 1806 | ("wine_glass", "🍷"), 1807 | ("wink", 
"😉"), 1808 | ("wolf", "🐺"), 1809 | ("woman", "👩"), 1810 | ("woman_artist", "👩‍🎨"), 1811 | ("woman_astronaut", "👩‍🚀"), 1812 | ("woman_beard", "🧔‍♀️"), 1813 | ("woman_cartwheeling", "🤸‍♀️"), 1814 | ("woman_cook", "👩‍🍳"), 1815 | ("woman_dancing", "💃"), 1816 | ("woman_facepalming", "🤦‍♀️"), 1817 | ("woman_factory_worker", "👩‍🏭"), 1818 | ("woman_farmer", "👩‍🌾"), 1819 | ("woman_feeding_baby", "👩‍🍼"), 1820 | ("woman_firefighter", "👩‍🚒"), 1821 | ("woman_health_worker", "👩‍⚕️"), 1822 | ("woman_in_manual_wheelchair", "👩‍🦽"), 1823 | ("woman_in_motorized_wheelchair", "👩‍🦼"), 1824 | ("woman_in_tuxedo", "🤵‍♀️"), 1825 | ("woman_judge", "👩‍⚖️"), 1826 | ("woman_juggling", "🤹‍♀️"), 1827 | ("woman_mechanic", "👩‍🔧"), 1828 | ("woman_office_worker", "👩‍💼"), 1829 | ("woman_pilot", "👩‍✈️"), 1830 | ("woman_playing_handball", "🤾‍♀️"), 1831 | ("woman_playing_water_polo", "🤽‍♀️"), 1832 | ("woman_scientist", "👩‍🔬"), 1833 | ("woman_shrugging", "🤷‍♀️"), 1834 | ("woman_singer", "👩‍🎤"), 1835 | ("woman_student", "👩‍🎓"), 1836 | ("woman_teacher", "👩‍🏫"), 1837 | ("woman_technologist", "👩‍💻"), 1838 | ("woman_with_headscarf", "🧕"), 1839 | ("woman_with_probing_cane", "👩‍🦯"), 1840 | ("woman_with_turban", "👳‍♀️"), 1841 | ("woman_with_veil", "👰‍♀️"), 1842 | ("womans_clothes", "👚"), 1843 | ("womans_hat", "👒"), 1844 | ("women_wrestling", "🤼‍♀️"), 1845 | ("womens", "🚺"), 1846 | ("wood", "🪵"), 1847 | ("woozy_face", "🥴"), 1848 | ("world_map", "🗺️"), 1849 | ("worm", "🪱"), 1850 | ("worried", "😟"), 1851 | ("wrench", "🔧"), 1852 | ("wrestling", "🤼"), 1853 | ("writing_hand", "✍️"), 1854 | ("x", "❌"), 1855 | ("yarn", "🧶"), 1856 | ("yawning_face", "🥱"), 1857 | ("yellow_circle", "🟡"), 1858 | ("yellow_heart", "💛"), 1859 | ("yellow_square", "🟨"), 1860 | ("yemen", "🇾🇪"), 1861 | ("yen", "💴"), 1862 | ("yin_yang", "☯️"), 1863 | ("yo_yo", "🪀"), 1864 | ("yum", "😋"), 1865 | ("zambia", "🇿🇲"), 1866 | ("zany_face", "🤪"), 1867 | ("zap", "⚡"), 1868 | ("zebra", "🦓"), 1869 | ("zero", "0️⃣"), 1870 | ("zimbabwe", "🇿🇼"), 1871 | ("zipper_mouth_face", "🤐"), 1872 | ("zombie", "🧟"), 1873 | ("zombie_man", "🧟‍♂️"), 1874 | ("zombie_woman", "🧟‍♀️"), 1875 | ("zzz", "💤"), 1876 | ]; 1877 | -------------------------------------------------------------------------------- /src/html.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | 3 | use crate::{ 4 | ast::{self, Attrs, Tag}, 5 | tree::get_string_content, 6 | Document, HtmlOpts, 7 | }; 8 | 9 | pub(crate) fn convert(opts: &HtmlOpts, doc: &Document) -> String { 10 | let refs = &doc.references; 11 | let mut ctx = Ctx { opts, refs, res: String::new() }; 12 | ctx.render_doc(doc); 13 | ctx.res 14 | } 15 | 16 | struct Ctx<'a> { 17 | #[allow(unused)] 18 | opts: &'a HtmlOpts, 19 | refs: &'a BTreeMap, 20 | res: String, 21 | } 22 | impl<'a> Ctx<'a> { 23 | fn render_doc(&mut self, doc: &Document) { 24 | for child in &doc.children { 25 | self.render(child) 26 | } 27 | } 28 | fn render(&mut self, tag: &Tag) { 29 | match tag { 30 | Tag::Heading(_) => todo!(), 31 | Tag::Para(para) => { 32 | self.render_tag("p", ¶.attrs); 33 | self.render_children(¶.children); 34 | self.out("
</p>"); 35 | self.out("\n") 36 | } 37 | Tag::Link(link) => { 38 | let mut attrs = Attrs::new(); 39 | let dest = self.resolve_reference(link.destination.as_deref(), link.reference.as_deref()); 40 | if let Some(dest) = dest { 41 | attrs.insert("href".to_string(), dest); 42 | } 43 | self.render_tag("a", &attrs); 44 | self.render_children(&link.children); 45 | self.out("</a>"); 46 | } 47 | Tag::Image(image) => { 48 | let mut attrs = Attrs::new(); 49 | let alt_text = get_string_content(&image.children); 50 | if !alt_text.is_empty() { 51 | attrs.insert("alt".to_string(), alt_text); 52 | } 53 | let dest = self.resolve_reference(image.destination.as_deref(), image.reference.as_deref()); 54 | if let Some(dest) = dest { 55 | attrs.insert("src".to_string(), dest); 56 | } 57 | self.render_tag("img", &attrs) 58 | } 59 | Tag::CodeBlock(code_block) => { 60 | self.render_tag("pre", &code_block.attrs); 61 | let mut attrs = Attrs::default(); 62 | if let Some(lang) = &code_block.lang { 63 | attrs.insert("class".to_string(), format!("language-{lang}")); 64 | } 65 | self.render_tag("code", &attrs); 66 | self.out_escape_html(&code_block.text); 67 | self.out("</code></pre>\n"); 68 | } 69 | Tag::Strong(strong) => { 70 | self.render_tag("strong", &strong.attrs); 71 | self.render_children(&strong.children); 72 | self.out("</strong>"); 73 | } 74 | Tag::Emph(emph) => { 75 | self.render_tag("em", &emph.attrs); 76 | self.render_children(&emph.children); 77 | self.out("</em>"); 78 | } 79 | Tag::DoubleQuoted(double_quoted) => { 80 | self.out("“"); 81 | self.render_children(&double_quoted.children); 82 | self.out("”"); 83 | } 84 | Tag::SoftBreak(_) => self.out("\n"), 85 | Tag::Url(url) => { 86 | let mut attrs = Attrs::new(); 87 | attrs.insert("href".to_string(), url.destination.clone()); 88 | self.render_tag("a", &attrs); 89 | self.out_escape_html(&url.destination); 90 | self.out("</a>"); 91 | } 92 | Tag::Str(str) => { 93 | if str.attrs.is_empty() { 94 | self.out_escape_html(&str.text); 95 | } else { 96 | self.render_tag("span", &str.attrs); 97 | self.out_escape_html(&str.text); 98 | self.out("</span>") 99 | } 100 | } 101 | Tag::Emoji(emoji) => { 102 | if let Some(emoji) = crate::emoji::find_emoji(&emoji.alias) { 103 | self.out(emoji); 104 | } else { 105 | self.out(&format!(":{}:", emoji.alias)); 106 | } 107 | } 108 | Tag::Verbatim(verbatim) => { 109 | self.render_tag("code", &verbatim.attrs); 110 | self.out_escape_html(&verbatim.text); 111 | self.out("</code>"); 112 | } 113 | Tag::Span(span) => { 114 | self.render_tag("span", &span.attrs); 115 | self.render_children(&span.children); 116 | self.out("</span>"); 117 | } 118 | Tag::Insert(insert) => { 119 | self.render_tag("ins", &insert.attrs); 120 | self.render_children(&insert.children); 121 | self.out("</ins>"); 122 | } 123 | Tag::Delete(delete) => { 124 | self.render_tag("del", &delete.attrs); 125 | self.render_children(&delete.children); 126 | self.out("</del>"); 127 | } 128 | Tag::Mark(mark) => { 129 | self.render_tag("mark", &mark.attrs); 130 | self.render_children(&mark.children); 131 | self.out("</mark>"); 132 | } 133 | Tag::Superscript(superscript) => { 134 | self.render_tag("sup", &superscript.attrs); 135 | self.render_children(&superscript.children); 136 | self.out("</sup>"); 137 | } 138 | Tag::Subscript(subscript) => { 139 | self.render_tag("sub", &subscript.attrs); 140 | self.render_children(&subscript.children); 141 | self.out("</sub>"); 142 | } 143 | Tag::EmDash(_) => self.out("—"), 144 | Tag::EnDash(_) => self.out("–"), 145 | } 146 | } 147 | 148 | fn render_children(&mut self, children: &[Tag]) { 149 | for child in children { 150 |
self.render(child) 151 | } 152 | } 153 | 154 | fn render_tag(&mut self, tag_name: &str, attrs: &Attrs) { 155 | self.out("<"); 156 | self.out(tag_name); 157 | for (k, v) in attrs { 158 | self.out(" "); 159 | self.out(k); 160 | self.out("="); 161 | self.out(&format!("{v:?}")); 162 | } 163 | self.out(">"); 164 | } 165 | 166 | fn resolve_reference( 167 | &self, 168 | destination: Option<&str>, 169 | reference: Option<&str>, 170 | ) -> Option<String> { 171 | if let Some(destination) = destination { 172 | return Some(destination.to_string()); 173 | } 174 | if let Some(reference) = reference { 175 | if let Some(reference_definition) = self.refs.get(reference) { 176 | return Some(reference_definition.destination.clone()); 177 | } 178 | } 179 | None 180 | } 181 | 182 | fn out(&mut self, s: &str) { 183 | self.res.push_str(s) 184 | } 185 | fn out_escape_html(&mut self, s: &str) { 186 | self.res.push_str(s) 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /src/inline.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::{BTreeMap, HashMap}, 3 | ops::Range, 4 | }; 5 | 6 | use crate::{ 7 | annot::{Annot, Atom, Comp}, 8 | attribute, 9 | patterns::{find_at, is_space, PatMatch}, 10 | Match, ParseOpts, 11 | }; 12 | 13 | #[derive(Default)] 14 | pub struct Tokenizer { 15 | opts: ParseOpts, 16 | subject: String, 17 | matches: BTreeMap<usize, Match>, 18 | openers: HashMap<u8, Vec<Opener>>, 19 | verbatim: usize, 20 | verbatim_type: Comp, 21 | destination: bool, 22 | firstpos: usize, 23 | lastpos: usize, 24 | allow_attributes: bool, 25 | attribute_tokenizer: Option<attribute::Tokenizer>, 26 | attribute_start: usize, 27 | } 28 | 29 | #[derive(Debug, Clone)] 30 | struct Opener { 31 | range: Range<usize>, 32 | annot: &'static str, 33 | sub_range: Range<usize>, 34 | } 35 | 36 | impl Opener { 37 | fn new(range: Range<usize>) -> Opener { 38 | Opener { range, annot: "", sub_range: 0..0 } 39 | } 40 | } 41 | 42 | // allow up to 3 captures...
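For orientation, a small hypothetical example (not from the repo; the subject string, offsets, and test name are made up) of what the Lua-pattern wrapper `find_at` from src/patterns.rs returns — the `bounded_find` helper that follows simply discards a match that runs past `endpos`:

    #[test]
    fn find_at_sketch() {
        // "^`+" matches a run of backticks anchored at the given start offset.
        let m = crate::patterns::find_at("a `code` span", "^`+", 2);
        assert!(m.is_match);
        // byte range of the single matched backtick, reported in whole-subject offsets
        assert_eq!((m.start, m.end), (2, 3));
    }
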
43 | fn bounded_find(subj: &str, patt: &'static str, startpos: usize, endpos: usize) -> PatMatch { 44 | let mut m = find_at(subj, patt, startpos); 45 | if m.end > endpos { 46 | m = PatMatch::default() 47 | } 48 | m 49 | } 50 | 51 | impl Tokenizer { 52 | pub fn new(subject: String, opts: ParseOpts) -> Tokenizer { 53 | let mut res = Tokenizer::default(); 54 | res.allow_attributes = true; 55 | res.subject = subject; 56 | res.opts = opts; 57 | res 58 | } 59 | 60 | fn add_match(&mut self, range: Range<usize>, annotation: impl Into<Annot>) { 61 | let m = Match::new(range.clone(), annotation); 62 | self.matches.insert(range.start, m); 63 | } 64 | 65 | fn add_opener(&mut self, name: u8, opener: Opener) { 66 | self.openers.entry(name).or_default().push(opener) 67 | } 68 | 69 | fn clear_openers(&mut self, startpos: usize, endpos: usize) { 70 | for v in self.openers.values_mut() { 71 | v.retain(|it| !(startpos <= it.range.start && it.range.end <= endpos)) 72 | } 73 | } 74 | 75 | fn str_matches(&mut self, startpos: usize, endpos: usize) { 76 | for i in startpos..endpos { 77 | if let Some(m) = self.matches.get_mut(&i) { 78 | if m.is_not(Atom::Str) && m.is_not(Atom::Escape) { 79 | m.a = Atom::Str.into(); 80 | } 81 | } 82 | } 83 | } 84 | 85 | fn between_matched(&mut self, pos: usize, c: u8, annotation: Comp, defaultmatch: Atom) -> usize { 86 | self.between_matched_impl( 87 | pos, 88 | c, 89 | annotation, 90 | defaultmatch, 91 | Option::<fn(&str, usize) -> PatMatch>::None, 92 | ) 93 | } 94 | 95 | fn between_matched_with_open_test( 96 | &mut self, 97 | pos: usize, 98 | c: u8, 99 | annotation: Comp, 100 | defaultmatch: Atom, 101 | open_test: impl FnOnce(&str, usize) -> PatMatch, 102 | ) -> usize { 103 | self.between_matched_impl(pos, c, annotation, defaultmatch, Some(open_test)) 104 | } 105 | 106 | fn between_matched_impl( 107 | &mut self, 108 | pos: usize, 109 | c: u8, 110 | annotation: Comp, 111 | mut defaultmatch: Atom, 112 | opentest: Option<impl FnOnce(&str, usize) -> PatMatch>, 113 | ) -> usize { 114 | debug_assert!(self.subject[pos..].as_bytes().starts_with(&[c])); 115 | 116 | let mut can_open = find_at(&self.subject, "^%S", pos + 1).is_match; 117 | let mut can_close = !self.subject[..pos].ends_with(is_space); 118 | let has_open_marker = 119 | pos != 0 && self.matches.get(&(pos - 1)).map_or(false, |it| it.is(Atom::OpenMarker)); 120 | let has_close_marker = self.subject.as_bytes()[pos + 1] == b'}'; 121 | let mut startopener = pos; 122 | let mut endcloser = pos + 1; 123 | 124 | if let Some(opentest) = opentest { 125 | can_open = can_open && opentest(&self.subject, pos).is_match; 126 | } 127 | 128 | // allow explicit open/close markers to override: 129 | if has_open_marker { 130 | can_open = true; 131 | can_close = false; 132 | startopener = pos - 1; 133 | } 134 | if !has_open_marker && has_close_marker { 135 | can_close = true; 136 | can_open = false; 137 | endcloser = pos + 2; 138 | } 139 | 140 | if has_open_marker && defaultmatch.is_right_atom() { 141 | defaultmatch = defaultmatch.corresponding_left_atom(); 142 | } else if has_close_marker && defaultmatch.is_left_atom() { 143 | defaultmatch = defaultmatch.corresponding_right_atom(); 144 | } 145 | 146 | let openers = self.openers.entry(c).or_default(); 147 | if can_close && openers.len() > 0 { 148 | // check openers for a match 149 | let opener = openers.last().unwrap().clone(); 150 | if opener.range.end != pos { 151 | // exclude empty emph 152 | self.clear_openers(opener.range.start, pos + 1); 153 | self.add_match(opener.range.clone(), Annot::Add(annotation)); 154 | self.add_match(pos..endcloser,
Annot::Sub(annotation)); 155 | return endcloser; 156 | } 157 | } 158 | // if we get here, we didn't match an opener 159 | if can_open { 160 | self.add_opener(c, Opener::new(startopener..pos + 1)); 161 | self.add_match(startopener..pos + 1, defaultmatch); 162 | pos + 1 163 | } else { 164 | self.add_match(startopener..endcloser, defaultmatch); 165 | endcloser 166 | } 167 | } 168 | 169 | fn matchers(&mut self, c: u8, pos: usize, endpos: usize) -> Option { 170 | match c { 171 | b'`' => { 172 | let m = bounded_find(&self.subject, "^`*", pos, endpos); 173 | if !m.is_match { 174 | return None; 175 | } 176 | // TODO: display/inline math 177 | 178 | self.add_match(pos..m.end, Annot::Add(Comp::Verbatim)); 179 | self.verbatim_type = Comp::Verbatim; 180 | 181 | self.verbatim = m.end - pos; 182 | return Some(m.end); 183 | } 184 | b'\\' => { 185 | let m = bounded_find(&self.subject, "^[ \t]*\r?\n", pos + 1, endpos); 186 | self.add_match(pos..pos + 1, Atom::Escape); 187 | 188 | if m.is_match { 189 | // see f there were preceding spaces 190 | if let Some((_, mm)) = self.matches.iter().rev().next() { 191 | let sp = mm.range.start; 192 | let mut ep = mm.range.end; 193 | if mm.is(Atom::Str) { 194 | while self.subject.as_bytes()[ep] == b' ' || self.subject.as_bytes()[ep] == b'\t' { 195 | ep = ep - 1 196 | } 197 | if sp == ep { 198 | self.matches.remove(&sp); 199 | } else { 200 | self.add_match(sp..ep, Atom::Str) 201 | } 202 | } 203 | } 204 | self.add_match(pos + 1..m.end, Atom::Hardbreak); 205 | return Some(m.end); 206 | } else { 207 | let m = bounded_find(&self.subject, "^[%p ]", pos + 1, endpos); 208 | if !m.is_match { 209 | self.add_match(pos..pos + 1, Atom::Str); 210 | return Some(pos + 1); 211 | } else { 212 | self.add_match(pos..pos + 1, Atom::Escape); 213 | if find_at(&self.subject, "^ ", pos + 1).is_match { 214 | self.add_match(pos + 1..m.end, Atom::Nbsp) 215 | } else { 216 | self.add_match(pos + 1..m.end, Atom::Str) 217 | } 218 | return Some(m.end); 219 | } 220 | } 221 | } 222 | b'<' => { 223 | let url = bounded_find(&self.subject, "^%<[^<>%s]+%>", pos, endpos); 224 | if url.is_match { 225 | let is_url = bounded_find(&self.subject, "^%a+:", pos + 1, url.end).is_match; 226 | let is_email = bounded_find(&self.subject, "^[^:]+%@", pos + 1, url.end).is_match; 227 | if is_email { 228 | self.add_match(url.start..url.start + 1, Comp::Email.add()); 229 | self.add_match(url.start + 1..url.end - 1, Atom::Str); 230 | self.add_match(url.end - 1..url.end, Comp::Email.sub()); 231 | return Some(url.end); 232 | } else if is_url { 233 | self.add_match(url.start..url.start + 1, Comp::Url.add()); 234 | self.add_match(url.start + 1..url.end - 1, Atom::Str); 235 | self.add_match(url.end - 1..url.end, Comp::Url.sub()); 236 | return Some(url.end); 237 | } 238 | } 239 | return None; 240 | } 241 | b'~' => Some(self.between_matched(pos, b'~', Comp::Subscript, Atom::Str)), 242 | b'^' => Some(self.between_matched(pos, b'^', Comp::Superscript, Atom::Str)), 243 | b'[' => { 244 | let m = bounded_find(&self.subject, "^%^([^]]+)%]", pos + 1, endpos); 245 | if m.is_match { 246 | self.add_match(pos..m.end, Atom::FootnoteReference); 247 | return Some(m.end); 248 | } else { 249 | self.add_opener(b'[', Opener::new(pos..pos + 1)); 250 | self.add_match(pos..pos + 1, Atom::Str); 251 | return Some(pos + 1); 252 | } 253 | } 254 | b']' => { 255 | let openers = self.openers.entry(b'[').or_default(); 256 | if openers.len() > 0 { 257 | let opener = openers.last_mut().unwrap(); 258 | if opener.annot == "reference_link" { 259 | let opener = 
opener.clone(); 260 | // found a reference link 261 | // add the matches 262 | let is_image = self.subject[..opener.range.start].ends_with('!') 263 | && !self.subject[..opener.range.start].ends_with("[]"); 264 | if is_image { 265 | self.add_match(opener.range.start - 1..opener.range.start, Atom::ImageMarker); 266 | self.add_match(opener.range.clone(), Comp::Imagetext.add()); 267 | self.add_match(opener.sub_range.clone(), Comp::Imagetext.sub()); 268 | } else { 269 | self.add_match(opener.range.clone(), Comp::Linktext.add()); 270 | self.add_match(opener.sub_range.clone(), Comp::Linktext.sub()); 271 | } 272 | self.add_match(opener.sub_range.end - 1..opener.sub_range.end, Comp::Reference.add()); 273 | self.add_match(pos..pos, Comp::Reference.sub()); 274 | // convert all matches to str 275 | self.str_matches(opener.sub_range.end, pos); 276 | // remove from openers 277 | self.clear_openers(opener.range.start, pos); 278 | return Some(pos + 1); 279 | } else if bounded_find(&self.subject, "^[%[]", pos + 1, endpos).is_match { 280 | opener.annot = "reference_link"; 281 | opener.sub_range.start = pos; // intermediate ] 282 | opener.sub_range.end = pos + 2; // intermediate [ 283 | self.add_match(pos..pos + 2, Atom::Str); 284 | return Some(pos + 2); 285 | } else if bounded_find(&self.subject, "^[(]", pos + 1, endpos).is_match { 286 | opener.annot = "explicit_link"; 287 | opener.sub_range.start = pos; // intermediate ] 288 | opener.sub_range.end = pos + 2; // intermediate ( 289 | self.openers.remove(&b'('); // clear ( openers 290 | self.destination = true; 291 | self.add_match(pos..pos + 2, Atom::Str); 292 | return Some(pos + 2); 293 | } else if bounded_find(&self.subject, "^%{", pos + 1, endpos).is_match { 294 | let opener = opener.clone(); 295 | // assume this is attributes, bracketed span 296 | self.add_match(opener.range.clone(), Comp::Span.add()); 297 | self.add_match(pos..pos + 1, Comp::Span.sub()); 298 | // remove any openers between [ and ] 299 | self.clear_openers(opener.range.start, pos); 300 | return Some(pos + 1); 301 | } 302 | } 303 | return None; 304 | } 305 | b'(' => { 306 | if !self.destination { 307 | return None; 308 | } 309 | self.add_opener(b'(', Opener::new(pos..pos + 1)); 310 | self.add_match(pos..pos + 1, Atom::Str); 311 | return Some(pos + 1); 312 | } 313 | b')' => { 314 | if !self.destination { 315 | return None; 316 | } 317 | let parens = self.openers.entry(b'(').or_default(); 318 | if parens.len() > 0 { 319 | // TODO? 
320 | parens.pop(); 321 | self.add_match(pos..pos + 1, Atom::Str); 322 | return Some(pos + 1); 323 | } else { 324 | let openers = &self.openers.entry(b'[').or_default().clone(); 325 | if let Some(opener) = openers.last().cloned() { 326 | if opener.annot == "explicit_link" { 327 | let (startdest, enddest) = (opener.sub_range.end - 1, pos); 328 | // we have inline link 329 | let is_image = self.subject[..opener.range.start].ends_with('!') 330 | && !self.subject[..opener.range.start].ends_with("[]"); 331 | if is_image { 332 | self.add_match(opener.range.start - 1..opener.range.start, Atom::ImageMarker); 333 | self.add_match(opener.range.clone(), Comp::Imagetext.add()); 334 | self.add_match(opener.sub_range.clone(), Comp::Imagetext.sub()); 335 | } else { 336 | self.add_match(opener.range.clone(), Comp::Linktext.add()); 337 | self.add_match(opener.sub_range.clone(), Comp::Linktext.sub()); 338 | } 339 | self.add_match(startdest..startdest + 1, Comp::Destination.add()); 340 | self.add_match(enddest..enddest + 1, Comp::Destination.sub()); 341 | self.destination = false; 342 | // convert all matches to str 343 | self.str_matches(opener.sub_range.end + 1, pos); 344 | // remove from openers 345 | self.clear_openers(opener.range.start, pos); 346 | return Some(enddest + 1); 347 | } 348 | } 349 | return None; 350 | } 351 | } 352 | b'_' => Some(self.between_matched(pos, b'_', Comp::Emph, Atom::Str)), 353 | b'*' => Some(self.between_matched(pos, b'*', Comp::Strong, Atom::Str)), 354 | b'{' => { 355 | if self.subject[pos + 1..endpos].starts_with(|c: char| "_*~^+='\"-".contains(c)) { 356 | self.add_match(pos..pos + 1, Atom::OpenMarker); 357 | return Some(pos + 1); 358 | } else if self.allow_attributes { 359 | self.attribute_tokenizer = Some(attribute::Tokenizer::new(self.subject.clone())); 360 | self.attribute_start = pos; 361 | return Some(pos); 362 | } else { 363 | // disabling allow_attributes only lasts 364 | // for one potential attribute start {, and then is re-enabled 365 | self.allow_attributes = true; 366 | self.add_match(pos..pos + 1, Atom::Str); 367 | return Some(pos + 1); 368 | } 369 | } 370 | b':' => { 371 | let m = bounded_find(&self.subject, "^%:[%w_+-]+%:", pos, endpos); 372 | if m.is_match { 373 | self.add_match(m.start..m.end, Atom::Emoji); 374 | return Some(m.end); 375 | } else { 376 | self.add_match(pos..pos + 1, Atom::Str); 377 | return Some(pos + 1); 378 | } 379 | } 380 | b'+' => Some(self.between_matched_with_open_test( 381 | pos, 382 | b'+', 383 | Comp::Insert, 384 | Atom::Str, 385 | |subject, pos| { 386 | find_at(subject, "^%{", pos - 1).or_else(|| find_at(subject, "^%}", pos + 1)) 387 | }, 388 | )), 389 | b'=' => Some(self.between_matched_with_open_test( 390 | pos, 391 | b'=', 392 | Comp::Mark, 393 | Atom::Str, 394 | |subject, pos| { 395 | find_at(subject, "^%{", pos - 1).or_else(|| find_at(subject, "^%}", pos + 1)) 396 | }, 397 | )), 398 | b'\'' => todo!(), 399 | b'"' => Some(self.between_matched(pos, b'"', Comp::DoubleQuoted, Atom::LeftDoubleQuote)), 400 | b'-' => { 401 | let subject = &self.subject[..]; 402 | if subject.as_bytes().get(pos - 1) == Some(&b'{') 403 | || subject.as_bytes().get(pos + 1) == Some(&b'}') 404 | { 405 | return Some(self.between_matched_with_open_test( 406 | pos, 407 | b'-', 408 | Comp::Delete, 409 | Atom::Str, 410 | |subject, pos| { 411 | find_at(subject, "^%{", pos - 1).or_else(|| find_at(subject, "^%}", pos + 1)) 412 | }, 413 | )); 414 | } 415 | 416 | let ep = find_at(subject, "^%-*", pos).end.min(endpos); 417 | let mut hyphens = ep - pos; 418 | if 
subject.as_bytes().get(ep) == Some(&b'}') { 419 | // last hyphen is close del 420 | hyphens -= 1; 421 | } 422 | if hyphens == 0 { 423 | self.add_match(pos..pos + 2, Atom::Str); 424 | return Some(pos + 2); 425 | } 426 | let mut pos = pos; 427 | let all_em = hyphens % 3 == 0; 428 | let all_en = hyphens % 2 == 0; 429 | while hyphens > 0 { 430 | if all_em { 431 | self.add_match(pos..pos + 3, Atom::EmDash); 432 | pos += 3; 433 | hyphens -= 3; 434 | } else if all_en { 435 | self.add_match(pos..pos + 2, Atom::EnDash); 436 | pos += 2; 437 | hyphens -= 2; 438 | } else if hyphens >= 3 && (hyphens % 2 != 0 || hyphens > 4) { 439 | self.add_match(pos..pos + 3, Atom::EmDash); 440 | pos += 3; 441 | hyphens -= 3; 442 | } else if hyphens >= 2 { 443 | self.add_match(pos..pos + 2, Atom::EnDash); 444 | pos += 2; 445 | hyphens -= 2; 446 | } else { 447 | self.add_match(pos..pos + 1, Atom::Str); 448 | pos += 1; 449 | hyphens -= 1; 450 | } 451 | } 452 | Some(pos) 453 | } 454 | b'.' => { 455 | if bounded_find(&self.subject, "^%.%.", pos + 1, endpos).is_match { 456 | self.add_match(pos..pos + 3, Atom::Ellipses); 457 | return Some(pos + 3); 458 | } 459 | return None; 460 | } 461 | _ => return None, 462 | } 463 | } 464 | 465 | fn single_char(&mut self, pos: usize) -> usize { 466 | self.add_match(pos..pos + 1, Atom::Str); 467 | pos + 1 468 | } 469 | 470 | // Feed a slice to the parser, updating state. 471 | pub fn feed(&mut self, spos: usize, endpos: usize) { 472 | let special = "[%]%[\\`{}_*()!<>~^:=+$\r\n'\".-]"; 473 | let subject = self.subject.clone(); 474 | if spos < self.firstpos { 475 | self.firstpos = spos 476 | } 477 | if endpos > self.lastpos { 478 | self.lastpos = endpos 479 | } 480 | let mut pos = spos; 481 | while pos < endpos { 482 | if let Some(mut attribute_tokenizer) = self.attribute_tokenizer.take() { 483 | let sp = pos; 484 | let m = bounded_find(&self.subject, special, pos, endpos); 485 | let ep2 = if m.is_match { m.start } else { endpos }; 486 | let (status, ep) = attribute_tokenizer.feed(sp, ep2); 487 | match status { 488 | attribute::Status::Done => { 489 | let attribute_start = self.attribute_start; 490 | // add attribute matches 491 | self.add_match(attribute_start..attribute_start + 1, Comp::Attributes.add()); 492 | self.add_match(ep..ep + 1, Comp::Attributes.sub()); 493 | let attr_matches = attribute_tokenizer.get_matches(); 494 | for m in attr_matches { 495 | self.add_match(m.range, m.a); 496 | } 497 | self.attribute_tokenizer = None; 498 | self.attribute_start = !0; 499 | pos = ep + 1; 500 | } 501 | attribute::Status::Fail => { 502 | pos = self.attribute_start; 503 | self.allow_attributes = false; 504 | self.attribute_tokenizer = None; 505 | self.attribute_start = !0; 506 | } 507 | attribute::Status::Continue => { 508 | self.attribute_tokenizer = Some(attribute_tokenizer); 509 | pos = ep 510 | } 511 | } 512 | } else { 513 | // find next interesting character: 514 | let newpos = bounded_find(&subject, special, pos, endpos).or(endpos); 515 | if newpos > pos { 516 | self.add_match(pos..newpos, Atom::Str); 517 | pos = newpos; 518 | if pos > endpos { 519 | break; // otherwise, fall through: 520 | } 521 | } 522 | // if we get here, then newpos = pos, 523 | // i.e. 
we have something interesting at pos 524 | let c = subject.as_bytes()[pos]; 525 | if c == b'\r' || c == b'\n' { 526 | if c == b'\r' && bounded_find(&subject, "^[%n]", pos + 1, endpos).is_match { 527 | self.add_match(pos..pos + 2, Atom::Softbreak); 528 | pos = pos + 2 529 | } else { 530 | self.add_match(pos..pos + 1, Atom::Softbreak); 531 | pos = pos + 1 532 | } 533 | } else if self.verbatim > 0 { 534 | if c == b'`' { 535 | let m = bounded_find(&subject, "^`+", pos, endpos); 536 | if m.is_match && m.end - pos == self.verbatim { 537 | // TODO: Check for raw attributes 538 | self.add_match(pos..m.end, self.verbatim_type.sub()); 539 | pos = m.end; 540 | self.verbatim = 0; 541 | self.verbatim_type = Comp::default(); 542 | } else { 543 | let endchar = m.end_or(endpos); 544 | self.add_match(pos..endchar, Atom::Str); 545 | pos = endchar 546 | } 547 | } else { 548 | self.add_match(pos..pos + 1, Atom::Str); 549 | pos = pos + 1 550 | } 551 | } else { 552 | pos = self.matchers(c, pos, endpos).unwrap_or_else(|| self.single_char(pos)) 553 | } 554 | } 555 | } 556 | } 557 | 558 | pub(crate) fn get_matches(&mut self) -> Vec<Match> { 559 | let mut sorted: Vec<Match> = Vec::new(); 560 | let mut m_last = Match::new(0..0, Atom::Ellipses); // TODO 561 | for i in self.firstpos..=self.lastpos { 562 | if let Some(m) = self.matches.get(&i) { 563 | if m.is(Atom::Str) && m_last.is(Atom::Str) && m_last.range.end == m.range.start { 564 | (*sorted.last_mut().unwrap()).range.end = m.range.end; 565 | m_last.range.end = m.range.end; 566 | } else { 567 | sorted.push(m.clone()); 568 | m_last = m.clone() 569 | } 570 | } 571 | } 572 | if sorted.len() > 0 { 573 | if sorted.last().unwrap().is(Atom::Softbreak) { 574 | // remove final softbreak 575 | sorted.pop(); 576 | } 577 | if self.verbatim > 0 { 578 | // unclosed verbatim 579 | let e = sorted.last().unwrap().range.end; 580 | sorted.push(Match::new(e..e, self.verbatim_type.sub())) 581 | } 582 | } 583 | sorted 584 | } 585 | } 586 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // TODO: re-export everything.
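Before the module declarations continue, a minimal usage sketch of the public API this file exposes (illustrative only, not part of lib.rs; the function name and input string are made up, and it assumes the crate is consumed as the `djot` dependency, the way main.rs drives it):

    fn render_sketch() -> (String, String) {
        // Parse Djot source into a Document, then render it two ways.
        let doc = djot::Document::parse("Hello, _world_!");
        (doc.to_html(), doc.to_json())
    }
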
2 | pub mod ast; 3 | 4 | mod annot; 5 | mod patterns; 6 | mod block; 7 | mod inline; 8 | mod attribute; 9 | mod tree; 10 | mod emoji; 11 | mod html; 12 | #[cfg(test)] 13 | mod sourcegen; 14 | 15 | use std::{collections::BTreeMap, ops::Range}; 16 | 17 | use crate::annot::Annot; 18 | 19 | #[derive(Debug, Default, Clone)] 20 | pub struct Document { 21 | pub children: Vec<ast::Tag>, 22 | pub references: BTreeMap, 23 | pub debug: String, 24 | } 25 | 26 | #[derive(Default, Clone)] 27 | pub struct ParseOpts { 28 | pub debug_matches: bool, 29 | } 30 | 31 | #[derive(Default, Clone)] 32 | pub struct HtmlOpts {} 33 | 34 | impl Document { 35 | pub fn parse(text: &str) -> Document { 36 | Document::parse_opts(ParseOpts::default(), text) 37 | } 38 | 39 | pub fn parse_opts(opts: ParseOpts, text: &str) -> Document { 40 | let mut p = block::Tokenizer::new(text.to_string(), opts); 41 | p.parse(); 42 | tree::build(p) 43 | } 44 | 45 | pub fn to_html(&self) -> String { 46 | self.to_html_opts(&HtmlOpts::default()) 47 | } 48 | 49 | pub fn to_html_opts(&self, opts: &HtmlOpts) -> String { 50 | html::convert(opts, self) 51 | } 52 | 53 | pub fn to_json(&self) -> String { 54 | #[derive(serde::Serialize)] 55 | struct DocRepr<'a> { 56 | tag: &'static str, 57 | children: &'a [ast::Tag], 58 | references: &'a BTreeMap, 59 | } 60 | serde_json::to_string_pretty(&DocRepr { 61 | tag: "doc", 62 | children: self.children.as_slice(), 63 | references: &self.references, 64 | }) 65 | .unwrap() 66 | } 67 | } 68 | 69 | #[derive(Debug, Clone)] 70 | struct Match { 71 | range: Range<usize>, 72 | a: Annot, 73 | } 74 | 75 | impl Match { 76 | fn new(range: Range<usize>, a: impl Into<Annot>) -> Match { 77 | Match { range, a: a.into() } 78 | } 79 | fn is(&self, annot: impl Into<Annot>) -> bool { 80 | self.a == annot.into() 81 | } 82 | fn is_not(&self, annot: impl Into<Annot>) -> bool { 83 | !self.is(annot) 84 | } 85 | } 86 | 87 | /// Appends formatted string to a `String`. 88 | macro_rules! _format_to { 89 | ($buf:expr) => (); 90 | ($buf:expr, $lit:literal $($arg:tt)*) => { 91 | { use ::std::fmt::Write as _; let _ = ::std::write!($buf, $lit $($arg)*); } 92 | }; 93 | } 94 | pub(crate) use _format_to as format_to; 95 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use anyhow::Context; 4 | use lexopt::{Arg::Long, Arg::Short, Arg::Value}; 5 | 6 | fn main() -> anyhow::Result<()> { 7 | let mut matches = false; 8 | let mut ast = false; 9 | let mut files = Vec::new(); 10 | 11 | let mut parser = lexopt::Parser::from_env(); 12 | while let Some(arg) = parser.next()?
{ 13 | match arg { 14 | Short('m') | Long("matches") => matches = true, 15 | Short('a') | Long("ast") => ast = true, 16 | Value(val) => files.push(val), 17 | _ => Err(arg.unexpected())?, 18 | } 19 | } 20 | 21 | let mut inputs = Vec::new(); 22 | if files.is_empty() { 23 | let content = std::io::read_to_string(std::io::stdin()).context("failed to read stdin")?; 24 | inputs.push(content) 25 | } else { 26 | for file in files { 27 | let path = PathBuf::from(file); 28 | let content = std::fs::read_to_string(&path) 29 | .with_context(|| format!("failed to read {}", path.display()))?; 30 | inputs.push(content) 31 | } 32 | } 33 | 34 | let opts = djot::ParseOpts { debug_matches: matches }; 35 | for content in inputs { 36 | let doc = djot::Document::parse_opts(opts.clone(), &content); 37 | if matches { 38 | println!("{}", doc.debug) 39 | } else if ast { 40 | println!("{}", doc.to_json()) 41 | } else { 42 | println!("{}", doc.to_html()) 43 | } 44 | } 45 | 46 | Ok(()) 47 | } 48 | -------------------------------------------------------------------------------- /src/patterns.rs: -------------------------------------------------------------------------------- 1 | use std::ops::Range; 2 | 3 | #[derive(Debug, Default)] 4 | pub struct PatMatch { 5 | pub is_match: bool, 6 | pub start: usize, 7 | pub end: usize, 8 | pub cap1: Range<usize>, 9 | pub cap2: Range<usize>, 10 | } 11 | 12 | impl PatMatch { 13 | pub(crate) fn or(&self, endpos: usize) -> usize { 14 | if self.is_match { 15 | self.start 16 | } else { 17 | endpos 18 | } 19 | } 20 | 21 | pub(crate) fn end_or(&self, endpos: usize) -> usize { 22 | if self.is_match { 23 | self.end 24 | } else { 25 | endpos 26 | } 27 | } 28 | 29 | pub(crate) fn or_else(self, f: impl FnOnce() -> Self) -> Self { 30 | if self.is_match { 31 | self 32 | } else { 33 | f() 34 | } 35 | } 36 | } 37 | 38 | pub fn find(subject: &str, pat: &'static str) -> PatMatch { 39 | find_at(subject, pat, 0) 40 | } 41 | 42 | pub fn find_at(subject: &str, pat: &'static str, start: usize) -> PatMatch { 43 | let mut pat = lua_patterns::LuaPattern::new(pat); 44 | let is_match = pat.matches(&subject[start..]); 45 | let range = pat.range(); 46 | let cap1 = pat.capture(1); 47 | let cap2 = pat.capture(2); 48 | PatMatch { 49 | start: range.start + start, 50 | end: range.end + start, 51 | is_match, 52 | cap1: cap1.start + start..cap1.end + start, 53 | cap2: cap2.start + start..cap2.end + start, 54 | } 55 | } 56 | 57 | pub(crate) fn is_space(c: char) -> bool { 58 | " \n\t".contains(c) 59 | } 60 | -------------------------------------------------------------------------------- /src/sourcegen.rs: -------------------------------------------------------------------------------- 1 | //!
Generates matches and ast structures 2 | mod annot; 3 | mod ast; 4 | 5 | use std::path::Path; 6 | 7 | fn camel_case(ident: &str) -> String { 8 | ident 9 | .split('_') 10 | .flat_map(|word| { 11 | word.chars().next().map(|it| it.to_ascii_uppercase()).into_iter().chain(word.chars().skip(1)) 12 | }) 13 | .collect() 14 | } 15 | 16 | fn ensure_content(path: &str, content: &str) { 17 | let base = Path::new(env!("CARGO_MANIFEST_DIR")); 18 | let path = base.join(path); 19 | let old = std::fs::read_to_string(&path).unwrap_or_default(); 20 | if normalize(&old) == normalize(content) { 21 | return; 22 | } 23 | std::fs::write(&path, content) 24 | .unwrap_or_else(|err| panic!("can't write {}: {err}", path.display())); 25 | } 26 | 27 | fn normalize(s: &str) -> String { 28 | s.split_ascii_whitespace().flat_map(|it| it.split(',')).collect() 29 | } 30 | -------------------------------------------------------------------------------- /src/sourcegen/annot.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | format_to, 3 | sourcegen::{camel_case, ensure_content}, 4 | }; 5 | 6 | const ANNOTATIONS: &str = " 7 | verbatim 8 | email 9 | url 10 | subscript 11 | superscript 12 | para 13 | code_block 14 | imagetext 15 | linktext 16 | reference 17 | destination 18 | emph 19 | strong 20 | span 21 | double_quoted 22 | reference_definition 23 | insert 24 | delete 25 | mark 26 | attributes 27 | 28 | str 29 | escape 30 | hardbreak 31 | nbsp 32 | blankline 33 | image_marker 34 | left_double_quote 35 | right_double_quote 36 | ellipses 37 | softbreak 38 | footnote_reference 39 | open_marker 40 | emoji 41 | reference_key 42 | reference_value 43 | code_language 44 | em_dash 45 | en_dash 46 | id 47 | key 48 | value 49 | class 50 | "; 51 | 52 | #[test] 53 | fn generate_annotations() { 54 | let (composites, atoms) = ANNOTATIONS.trim().split_once("\n\n").unwrap(); 55 | 56 | let mut buf = "\ 57 | use std::fmt; 58 | " 59 | .to_string(); 60 | 61 | emit_comp(&mut buf, composites); 62 | emit_atom(&mut buf, atoms); 63 | ensure_content("src/annot/generated.rs", &buf); 64 | } 65 | 66 | fn emit_comp(buf: &mut String, composites: &str) { 67 | format_to!( 68 | buf, 69 | "\ 70 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 71 | pub(crate) enum Comp {{ 72 | " 73 | ); 74 | for ident in composites.lines() { 75 | format_to!(buf, " {},\n", camel_case(ident)) 76 | } 77 | format_to!(buf, "}}\n"); 78 | 79 | let mut display_arms = String::new(); 80 | for ident in composites.lines() { 81 | format_to!(display_arms, " Comp::{} => \"{ident}\",\n", camel_case(ident)) 82 | } 83 | 84 | format_to!( 85 | buf, 86 | " 87 | impl fmt::Display for Comp {{ 88 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {{ 89 | f.write_str(match self {{ 90 | {display_arms} 91 | }}) 92 | }} 93 | }} 94 | " 95 | ); 96 | } 97 | 98 | fn emit_atom(buf: &mut String, atoms: &str) { 99 | let mut variants = String::new(); 100 | for ident in atoms.lines() { 101 | format_to!(variants, " {},\n", camel_case(ident)) 102 | } 103 | 104 | format_to!( 105 | buf, 106 | " 107 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 108 | pub(crate) enum Atom {{ 109 | {variants} 110 | }} 111 | " 112 | ); 113 | 114 | let mut left_atoms = String::new(); 115 | let mut right_atoms = String::new(); 116 | let mut ltr = String::new(); 117 | let mut rtl = String::new(); 118 | for ident in atoms.lines() { 119 | if ident.starts_with("left_") { 120 | format_to!(left_atoms, " | Atom::{}", camel_case(ident)); 121 | 
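// For each `left_*` atom, derive the matching `right_*` name and record both directions of the mapping used by the generated `corresponding_left_atom`/`corresponding_right_atom` methods below.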
let rident = &ident.replace("left", "right"); 122 | format_to!(ltr, "Atom::{} => Atom::{},\n", camel_case(ident), camel_case(rident)); 123 | format_to!(rtl, "Atom::{} => Atom::{},\n", camel_case(rident), camel_case(ident)); 124 | } 125 | if ident.starts_with("right_") { 126 | format_to!(right_atoms, " | Atom::{}", camel_case(ident)) 127 | } 128 | } 129 | 130 | format_to!( 131 | buf, 132 | " 133 | impl Atom {{ 134 | pub(crate) fn is_left_atom(self) -> bool {{ 135 | matches!(self, {left_atoms}) 136 | }} 137 | pub(crate) fn is_right_atom(self) -> bool {{ 138 | matches!(self, {right_atoms}) 139 | }} 140 | pub(crate) fn corresponding_left_atom(self) -> Atom {{ 141 | match self {{ 142 | {rtl} 143 | _ => self 144 | }} 145 | }} 146 | pub(crate) fn corresponding_right_atom(self) -> Atom {{ 147 | match self {{ 148 | {ltr} 149 | _ => self 150 | }} 151 | }} 152 | }} 153 | " 154 | ); 155 | 156 | let mut display_arms = String::new(); 157 | for ident in atoms.lines() { 158 | format_to!(display_arms, " Atom::{} => \"{ident}\",\n", camel_case(ident)) 159 | } 160 | 161 | format_to!( 162 | buf, 163 | " 164 | impl fmt::Display for Atom {{ 165 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {{ 166 | f.write_str(match self {{ 167 | {display_arms} 168 | }}) 169 | }} 170 | }} 171 | " 172 | ); 173 | } 174 | -------------------------------------------------------------------------------- /src/sourcegen/ast.rs: -------------------------------------------------------------------------------- 1 | use crate::{format_to, sourcegen::camel_case}; 2 | 3 | use crate::sourcegen::ensure_content; 4 | 5 | const TAGS: &str = " 6 | heading level: u32 7 | para 8 | link destination: Option<String>, reference: Option<String> 9 | image destination: Option<String>, reference: Option<String> 10 | code_block lang: Option<String>, text: String 11 | strong 12 | emph 13 | insert 14 | delete 15 | mark 16 | superscript 17 | subscript 18 | span 19 | double_quoted 20 | url destination: String 21 | 22 | soft_break 23 | em_dash 24 | en_dash 25 | verbatim text: String 26 | str text: String 27 | emoji alias: String 28 | "; 29 | 30 | #[test] 31 | fn generate_annotations() { 32 | let (composites, atoms) = TAGS.trim().split_once("\n\n").unwrap(); 33 | 34 | let mut buf = format!("use super::Attrs;\n"); 35 | emit_ast_comp(&mut buf, composites); 36 | emit_ast_atom(&mut buf, atoms); 37 | emit_ast_tag(&mut buf, composites, atoms); 38 | ensure_content("src/ast/generated.rs", &buf); 39 | } 40 | 41 | fn emit_ast_comp(buf: &mut String, composites: &str) { 42 | for comp in composites.lines() { 43 | let (ident, fields) = comp.split_once(" ").unwrap_or((comp, "")); 44 | let fields = if fields.is_empty() { 45 | String::new() 46 | } else { 47 | fields.split(", ").map(|it| format!("pub {it},\n")).collect::<String>() 48 | }; 49 | 50 | format_to! {buf, " 51 | #[derive(Debug, Default, Clone, serde::Serialize)] 52 | pub struct {} {{ 53 | #[serde(skip_serializing_if = \"Attrs::is_empty\")] 54 | pub attrs: Attrs, 55 | pub children: Vec<Tag>, 56 | {fields} 57 | }} 58 | ", camel_case(ident)} 59 | } 60 | } 61 | 62 | fn emit_ast_atom(buf: &mut String, atoms: &str) { 63 | for atom in atoms.lines() { 64 | let (ident, fields) = atom.split_once(" ").unwrap_or((atom, "")); 65 | let fields = if fields.is_empty() { 66 | String::new() 67 | } else { 68 | fields.split(", ").map(|it| format!("pub {it},\n")).collect::<String>() 69 | }; 70 | format_to!
{buf, " 71 | #[derive(Debug, Default, Clone, serde::Serialize)] 72 | pub struct {} {{ 73 | #[serde(skip_serializing_if = \"Attrs::is_empty\")] 74 | pub attrs: Attrs, 75 | {fields} 76 | }} 77 | ", camel_case(ident)} 78 | } 79 | } 80 | 81 | fn emit_ast_tag(buf: &mut String, composites: &str, atoms: &str) { 82 | let mut variants = String::new(); 83 | for comp in composites.lines() { 84 | let ident = comp.split_once(" ").map_or(comp, |it| it.0); 85 | let camel = camel_case(ident); 86 | format_to!(variants, " {camel}({camel}),\n"); 87 | } 88 | for atom in atoms.lines() { 89 | let ident = atom.split_once(" ").map_or(atom, |it| it.0); 90 | let camel = camel_case(ident); 91 | format_to!(variants, " {camel}({camel}),\n"); 92 | } 93 | format_to!( 94 | buf, 95 | " 96 | #[derive(Debug, Clone, serde::Serialize)] 97 | #[serde(tag = \"tag\", rename_all = \"snake_case\")] 98 | pub enum Tag {{ {variants} }} 99 | " 100 | ) 101 | } 102 | -------------------------------------------------------------------------------- /src/tree.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | 3 | use crate::{ 4 | annot::{Annot, Atom, Comp}, 5 | ast::{ 6 | Attrs, CodeBlock, Delete, DoubleQuoted, Emoji, Emph, Image, Insert, Link, Mark, Para, 7 | ReferenceDefinition, SoftBreak, Span, Str, Strong, Subscript, Superscript, Tag, Url, Verbatim, 8 | }, 9 | block, 10 | patterns::find, 11 | Document, Match, 12 | }; 13 | 14 | pub(crate) fn build(p: block::Tokenizer) -> Document { 15 | let mut ctx = Ctx { subject: p.subject, matches: p.matches, idx: 0, references: BTreeMap::new() }; 16 | let mut doc = ctx.get_doc(); 17 | doc.debug = p.debug; 18 | doc.references = ctx.references; 19 | doc 20 | } 21 | 22 | struct Ctx { 23 | subject: String, 24 | matches: Vec<Match>, 25 | references: BTreeMap<String, ReferenceDefinition>, 26 | idx: usize, 27 | } 28 | 29 | impl Ctx { 30 | fn get_doc(&mut self) -> Document { 31 | let mut res = Document::default(); 32 | while self.idx < self.matches.len() { 33 | self.get_tag(&mut res.children) 34 | } 35 | res 36 | } 37 | 38 | fn get_tag(&mut self, acc: &mut Vec<Tag>) { 39 | self.skip_trivia(); 40 | let m = self.matches[self.idx].clone(); 41 | self.idx += 1; 42 | let res = match m.a { 43 | Annot::Add(comp) => match comp { 44 | Comp::CodeBlock => Tag::CodeBlock(self.get_code_block()), 45 | Comp::Para => Tag::Para(self.get_para()), 46 | Comp::Verbatim => Tag::Verbatim(self.get_verbatim()), 47 | Comp::Strong => Tag::Strong(self.get_strong()), 48 | Comp::Emph => Tag::Emph(self.get_emph()), 49 | Comp::Insert => Tag::Insert(self.get_insert()), 50 | Comp::Delete => Tag::Delete(self.get_delete()), 51 | Comp::Mark => Tag::Mark(self.get_mark()), 52 | Comp::Subscript => Tag::Subscript(self.get_subscript()), 53 | Comp::Superscript => Tag::Superscript(self.get_superscript()), 54 | Comp::DoubleQuoted => Tag::DoubleQuoted(self.get_double_quoted()), 55 | Comp::Linktext => Tag::Link(self.get_link()), 56 | Comp::Imagetext => Tag::Image(self.get_image()), 57 | Comp::Url => Tag::Url(self.get_url()), 58 | Comp::Attributes => todo!(), 59 | Comp::Span => Tag::Span(self.get_span()), 60 | Comp::ReferenceDefinition => { 61 | self.get_reference_definition(); 62 | return; 63 | } 64 | _ => todo!("{comp:?}"), 65 | }, 66 | Annot::Sub(sub) => unreachable!("-{sub}"), 67 | Annot::Atom(atom) => match atom { 68 | Atom::Str => { 69 | let mut text = self.subject[m.range].to_string(); 70 | let attrs = self.get_attrs(); 71 | if !attrs.is_empty() { 72 | if let Some(idx) = text.rfind(|it: char|
it.is_ascii_whitespace()) { 73 | acc.push(Tag::Str(Str { attrs: Attrs::new(), text: text[..idx + 1].to_string() })); 74 | text.drain(..idx + 1); 75 | } 76 | } 77 | Tag::Str(Str { attrs, text }) 78 | } 79 | Atom::Emoji => { 80 | let mut res = Emoji::default(); 81 | res.alias = self.subject[m.range.start + 1..m.range.end - 1].to_string(); 82 | Tag::Emoji(res) 83 | } 84 | Atom::Softbreak => Tag::SoftBreak(SoftBreak::default()), 85 | Atom::Class | Atom::Id => return, 86 | _ => todo!("{atom:?}"), 87 | }, 88 | }; 89 | acc.push(res) 90 | } 91 | 92 | fn get_code_block(&mut self) -> CodeBlock { 93 | let mut res = CodeBlock::default(); 94 | let m = self.matches[self.idx].clone(); 95 | if m.is(Atom::CodeLanguage) { 96 | res.lang = Some(self.subject[m.range].to_string()); 97 | self.idx += 1; 98 | } 99 | res.text = self.get_text_until(Comp::CodeBlock); 100 | res 101 | } 102 | 103 | fn get_para(&mut self) -> Para { 104 | let mut res = Para::default(); 105 | res.children = self.get_tags_until(Comp::Para); 106 | res 107 | } 108 | 109 | fn get_verbatim(&mut self) -> Verbatim { 110 | let mut res = Verbatim::default(); 111 | res.text = self.get_text_until(Comp::Verbatim); 112 | if find(res.text.as_str(), "^ +`").is_match { 113 | res.text.remove(0); 114 | } 115 | if find(res.text.as_str(), "` +$").is_match { 116 | res.text.pop(); 117 | } 118 | res 119 | } 120 | 121 | fn get_strong(&mut self) -> Strong { 122 | let mut res = Strong::default(); 123 | res.children = self.get_tags_until(Comp::Strong); 124 | res 125 | } 126 | 127 | fn get_emph(&mut self) -> Emph { 128 | let mut res = Emph::default(); 129 | res.children = self.get_tags_until(Comp::Emph); 130 | res 131 | } 132 | 133 | fn get_insert(&mut self) -> Insert { 134 | let mut res = Insert::default(); 135 | res.children = self.get_tags_until(Comp::Insert); 136 | res 137 | } 138 | 139 | fn get_delete(&mut self) -> Delete { 140 | let mut res = Delete::default(); 141 | res.children = self.get_tags_until(Comp::Delete); 142 | res 143 | } 144 | 145 | fn get_mark(&mut self) -> Mark { 146 | let mut res = Mark::default(); 147 | res.children = self.get_tags_until(Comp::Mark); 148 | res 149 | } 150 | 151 | fn get_subscript(&mut self) -> Subscript { 152 | let mut res = Subscript::default(); 153 | res.children = self.get_tags_until(Comp::Subscript); 154 | res 155 | } 156 | 157 | fn get_superscript(&mut self) -> Superscript { 158 | let mut res = Superscript::default(); 159 | res.children = self.get_tags_until(Comp::Superscript); 160 | res 161 | } 162 | 163 | fn get_double_quoted(&mut self) -> DoubleQuoted { 164 | let mut res = DoubleQuoted::default(); 165 | res.children = self.get_tags_until(Comp::DoubleQuoted); 166 | res 167 | } 168 | 169 | fn get_link(&mut self) -> Link { 170 | let mut res = Link::default(); 171 | res.children = self.get_tags_until(Comp::Linktext); 172 | match self.get_dest() { 173 | LinkDest::Dest(dest) => res.destination = Some(dest), 174 | LinkDest::Ref(r) => res.reference = Some(r), 175 | LinkDest::AutoRef => res.reference = Some(get_string_content(&res.children)), 176 | } 177 | res 178 | } 179 | 180 | fn get_image(&mut self) -> Image { 181 | let mut res = Image::default(); 182 | res.children = self.get_tags_until(Comp::Imagetext); 183 | match self.get_dest() { 184 | LinkDest::Dest(dest) => res.destination = Some(dest), 185 | LinkDest::Ref(r) => res.reference = Some(r), 186 | LinkDest::AutoRef => res.reference = Some(get_string_content(&res.children)), 187 | } 188 | res 189 | } 190 | 191 | fn get_dest(&mut self) -> LinkDest { 192 | let m = 
self.matches[self.idx].clone(); 193 | self.idx += 1; 194 | if m.is(Comp::Destination.add()) { 195 | let dest = self.get_text_until(Comp::Destination); 196 | LinkDest::Dest(dest.replace('\n', "")) 197 | } else { 198 | let r = self.get_text_until(Comp::Reference); 199 | if r.is_empty() { 200 | LinkDest::AutoRef 201 | } else { 202 | LinkDest::Ref(r.replace('\n', " ")) 203 | } 204 | } 205 | } 206 | 207 | fn get_url(&mut self) -> Url { 208 | let mut res = Url::default(); 209 | res.destination = self.get_text_until(Comp::Url); 210 | res 211 | } 212 | 213 | fn get_span(&mut self) -> Span { 214 | let mut res = Span::default(); 215 | res.children = self.get_tags_until(Comp::Span); 216 | res.attrs = self.get_attrs(); 217 | res 218 | } 219 | 220 | fn get_attrs(&mut self) -> Attrs { 221 | if !self.matches[self.idx].is(Comp::Attributes.add()) { 222 | return Attrs::new(); 223 | } 224 | self.idx += 1; 225 | let mut res = Attrs::new(); 226 | loop { 227 | let m = self.matches[self.idx].clone(); 228 | self.idx += 1; 229 | if m.is(Comp::Attributes.sub()) { 230 | break; 231 | } 232 | if m.is(Atom::Class) { 233 | match res.entry("class".to_string()) { 234 | indexmap::map::Entry::Occupied(mut it) => { 235 | it.insert(format!("{} {}", it.get(), &self.subject[m.range.clone()])); 236 | } 237 | indexmap::map::Entry::Vacant(it) => { 238 | it.insert(self.subject[m.range.clone()].to_string()); 239 | } 240 | } 241 | } else if m.is(Atom::Id) { 242 | res.insert("id".to_string(), self.subject[m.range].to_string()); 243 | } else if m.is(Atom::Key) { 244 | let key = self.subject[m.range].to_string(); 245 | let m = self.matches[self.idx].clone(); 246 | self.idx += 1; 247 | let value = self.subject[m.range].to_string(); 248 | res.insert(key, value); 249 | } 250 | } 251 | res 252 | } 253 | 254 | fn get_reference_definition(&mut self) { 255 | let mut res = ReferenceDefinition::default(); 256 | let key = self.matches[self.idx].clone(); 257 | self.idx += 1; 258 | loop { 259 | let m = self.matches[self.idx].clone(); 260 | if !m.is(Atom::ReferenceValue) { 261 | break; 262 | } 263 | self.idx += 1; 264 | res.destination.push_str(&self.subject[m.range]); 265 | } 266 | assert!(self.matches[self.idx].is(Comp::ReferenceDefinition.sub())); 267 | self.idx += 1; 268 | self.references.insert(self.subject[key.range.start + 1..key.range.end - 1].to_string(), res); 269 | } 270 | 271 | fn get_tags_until(&mut self, comp: Comp) -> Vec<Tag> { 272 | let mut res = vec![]; 273 | while !self.matches[self.idx].is(comp.sub()) { 274 | self.get_tag(&mut res) 275 | } 276 | self.idx += 1; 277 | res 278 | } 279 | 280 | fn get_text_until(&mut self, comp: Comp) -> String { 281 | let mut res = String::new(); 282 | loop { 283 | let m = self.matches[self.idx].clone(); 284 | self.idx += 1; 285 | if m.is(comp.sub()) { 286 | break; 287 | } 288 | res.push_str(&self.subject[m.range]); 289 | } 290 | res 291 | } 292 | 293 | fn skip_trivia(&mut self) { 294 | while self.idx < self.matches.len() { 295 | let m = self.matches[self.idx].clone(); 296 | if !(m.is(Atom::Blankline) || m.is(Atom::ImageMarker) || m.is(Atom::Escape)) { 297 | break; 298 | } 299 | self.idx += 1; 300 | continue; 301 | } 302 | } 303 | } 304 | 305 | pub(crate) fn get_string_content(tags: &[Tag]) -> String { 306 | let mut res = String::new(); 307 | for tag in tags { 308 | match tag { 309 | Tag::SoftBreak(_) => res.push('\n'), 310 | Tag::Str(str) => res.push_str(&str.text), 311 | Tag::Emph(emph) => res.push_str(&get_string_content(&emph.children)), 312 | _ => (), 313 | } 314 | } 315 | res 316 | } 317 | 318 |
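// How a link or image target was specified: an explicit destination, a named reference, or an automatic reference derived from the bracketed text (see `get_link`/`get_image` above).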
enum LinkDest { 319 | Dest(String), 320 | Ref(String), 321 | AutoRef, 322 | } 323 | -------------------------------------------------------------------------------- /tests/data/attributes.test: -------------------------------------------------------------------------------- 1 | An inline attribute applies to the preceding element, which might 2 | be complex (span, emphasis, link) or a simple word (defined as a 3 | sequence of non-ASCII-whitespace characters). 4 | ``` 5 | foo привет{.ru} 6 | . 7 |

foo привет

8 | ``` 9 | 10 | ``` 11 | (some text){.attr} 12 | . 13 |

(some text)

14 | ``` 15 | 16 | ``` 17 | [some text]{.attr} 18 | . 19 |

some text

20 | ``` 21 | 22 | Ensure that emphasis that starts before the attribute can still close, 23 | even if the attribute contains a potential closer. 24 | 25 | ``` 26 | a *b{#id key="*"}* 27 | . 28 |

a b

29 | ``` 30 | 31 | ``` 32 | a *b{#id key="*"}o 33 | . 34 |

a *bo

35 | ``` 36 | 37 | Don't allow attributes to start when we're parsing a potential 38 | attribute. 39 | 40 | ``` 41 | hi{key="{#hi"} 42 | . 43 |

hi{key=“{#hi”

44 | ``` 45 | 46 | ``` 47 | hi\{key="abc{#hi}" 48 | . 49 |

hi{key=“abc

50 | ``` 51 | STOP 52 | ``` 53 | hi{key="\{#hi"} 54 | . 55 |

hi

56 | ``` 57 | 58 | Line break: 59 | 60 | ``` 61 | hi{#id .class 62 | key="value"} 63 | . 64 |

hi

65 | ``` 66 | 67 | Here there is nothing for the attribute to attach to: 68 | 69 | ``` 70 | {#id} at beginning 71 | . 72 |

at beginning

73 | ``` 74 | 75 | ``` 76 | After {#id} space 77 | {.class} 78 | . 79 |

After space 80 |

81 | ``` 82 | 83 | Block attributes come before the block, on a line by themselves. 84 | 85 | ``` 86 | {#id .class} 87 | A paragraph 88 | . 89 |

A paragraph

90 | ``` 91 | 92 | Use indentation if you need to continue the attributes over a line break. 93 | 94 | ``` 95 | {#id .class 96 | style="color:red"} 97 | A paragraph 98 | . 99 |

A paragraph

100 | ``` 101 | 102 | If the attribute block can't be parsed as attributes, it will be 103 | parsed as a regular paragraph: 104 | 105 | ``` 106 | {#id .cla*ss* 107 | . 108 |

{#id .class

109 | ``` 110 | 111 | You can use consecutive attribute blocks. 112 | In case of conflict, later values take precedence over earlier ones, 113 | but classes accumulate: 114 | 115 | ``` 116 | {#id} 117 | {key=val} 118 | {.foo .bar} 119 | {key=val2} 120 | {.baz} 121 | {#id2} 122 | Okay 123 | . 124 |

Okay

125 | ``` 126 | 127 | Attributes on different kinds of blocks: 128 | 129 | ``` 130 | {#id} 131 | > Block quote 132 | . 133 |
134 |

Block quote

135 |
136 | ``` 137 | 138 | ``` 139 | {#id} 140 | # Heading 141 | . 142 |
143 |

Heading

144 |
145 | ``` 146 | 147 | ``` 148 | {.blue} 149 | - - - - - 150 | . 151 |
152 | ``` 153 | 154 | ```` 155 | {highlight=3} 156 | ``` ruby 157 | x = 3 158 | ``` 159 | . 160 |
x = 3
161 | 
162 | ```` 163 | 164 | ``` 165 | {.special} 166 | 1. one 167 | 2. two 168 | . 169 |
    170 |
  1. 171 | one 172 |
  2. 173 |
  3. 174 | two 175 |
  4. 176 |
177 | ``` 178 | 179 | ``` 180 | > {.foo} 181 | > > {.bar} 182 | > > nested 183 | . 184 |
185 |
186 |

nested

187 |
188 |
189 | ``` 190 | 191 | Comments start at a `%` character 192 | (not in quotes) and end with another `%`. 193 | These can be used to comment up an attribute 194 | list or without any real attributes. 195 | 196 | ``` 197 | foo{#ident % this is a comment % .class} 198 | . 199 |

foo

200 | ``` 201 | 202 | In block-level comment, subsequent lines must 203 | be indented, as with attributes: 204 | 205 | ``` 206 | {% This is a comment before a 207 | block-level item. %} 208 | Paragraph. 209 | . 210 |

Paragraph.

211 | ``` 212 | 213 | Inline attributes can be empty: 214 | 215 | ``` 216 | hi{} 217 | . 218 |

hi

219 | ``` 220 | 221 | Block attributes can be empty: 222 | 223 | ``` 224 | {} 225 | hi 226 | . 227 |

hi

228 | ``` 229 | -------------------------------------------------------------------------------- /tests/data/code_blocks.test: -------------------------------------------------------------------------------- 1 | 2 | ``` 3 | ~~~ 4 | code 5 | block 6 | ~~~ 7 | . 8 |
code
 9 |   block
10 | 
11 | ``` 12 | 13 | ```` 14 | ``` python 15 | x = y + 3 16 | ``` 17 | . 18 |
x = y + 3
19 | 
20 | ```` 21 | 22 | ```` 23 | ``` python 24 | if true: 25 | x = 3 26 | ``` 27 | . 28 |
if true:
29 |   x = 3
30 | 
31 | ```` 32 | 33 | ```` 34 | ``` not a code block ``` 35 | . 36 |

not a code block

37 | ```` 38 | 39 | ```` 40 | ``` not a code block 41 | . 42 |

not a code block

43 | ```` 44 | 45 | ```` 46 | ``` 47 | hi 48 | ``` 49 | ``` 50 | two 51 | ``` 52 | . 53 |
hi
54 | 
55 |
two
56 | 
57 | ```` 58 | 59 | Empty code block: 60 | 61 | ```` 62 | ``` 63 | ``` 64 | . 65 |
66 | ```` 67 | -------------------------------------------------------------------------------- /tests/data/emoji.test: -------------------------------------------------------------------------------- 1 | ``` 2 | :+1: :scream: 3 | . 4 |

👍 😱

5 | ``` 6 | 7 | ``` 8 | This is a :nonexistent: emoji. 9 | . 10 |

This is a :nonexistent: emoji.

11 | ``` 12 | 13 | ``` 14 | :ice:scream: 15 | . 16 |

:ice:scream:

17 | ``` 18 | -------------------------------------------------------------------------------- /tests/data/emphasis.test: -------------------------------------------------------------------------------- 1 | ``` 2 | *foo bar* 3 | . 4 |

foo bar

5 | ``` 6 | 7 | ``` 8 | a* foo bar* 9 | . 10 |

a* foo bar*

11 | ``` 12 | 13 | ``` 14 | *foo bar * 15 | . 16 |

*foo bar *

17 | ``` 18 | 19 | Unicode spaces don't block emphasis. 20 | 21 | ``` 22 | * a * 23 | . 24 |

 a 

25 | ``` 26 | 27 | Intraword: 28 | 29 | ``` 30 | foo*bar*baz 31 | . 32 |

foobarbaz

33 | ``` 34 | 35 | ``` 36 | _foo bar_ 37 | . 38 |

foo bar

39 | ``` 40 | 41 | ``` 42 | _ foo bar_ 43 | . 44 |

_ foo bar_

45 | ``` 46 | 47 | ``` 48 | _foo bar _ 49 | . 50 |

_foo bar _

51 | ``` 52 | 53 | Unicode spaces don't block emphasis. 54 | 55 | ``` 56 | _ a _ 57 | . 58 |

 a 

59 | ``` 60 | 61 | Intraword: 62 | 63 | ``` 64 | foo_bar_baz 65 | . 66 |

foobarbaz

67 | ``` 68 | 69 | ``` 70 | aa_"bb"_cc 71 | . 72 |

aa“bb”cc

73 | ``` 74 | 75 | ``` 76 | *foo_ 77 | . 78 |

*foo_

79 | ``` 80 | 81 | ``` 82 | _foo* 83 | . 84 |

_foo*

85 | ``` 86 | 87 | A line ending counts as whitespace: 88 | 89 | ``` 90 | _foo bar 91 | _ 92 | . 93 |

_foo bar 94 | _

95 | ``` 96 | 97 | So does a tab: 98 | 99 | ``` 100 | _ a_ 101 | . 102 |

_ a_

103 | ``` 104 | 105 | This one is different from commonmark: 106 | 107 | ``` 108 | _(_foo_)_ 109 | . 110 |

(foo)

111 | ``` 112 | 113 | But you can force the second `_` to be an opener 114 | using the marker `{`. 115 | 116 | ``` 117 | _({_foo_})_ 118 | . 119 |

(foo)

120 | ``` 121 | 122 | ``` 123 | _(*foo*)_ 124 | . 125 |

(foo)

126 | ``` 127 | 128 | Overlapping scopes (first to close wins): 129 | 130 | ``` 131 | _foo *bar_ baz* 132 | . 133 |

foo *bar baz*

134 | ``` 135 | 136 | Over line break: 137 | 138 | ``` 139 | _foo 140 | bar_ 141 | . 142 |

foo 143 | bar

144 | ``` 145 | 146 | Inline content allowed: 147 | 148 | ``` 149 | *foo [link](url) `*`* 150 | . 151 |

foo link *

152 | ``` 153 | 154 | Can't emph an underscore: 155 | 156 | ``` 157 | ___ 158 | . 159 |

___

160 | ``` 161 | 162 | Unless you escape it: 163 | 164 | ``` 165 | _\__ 166 | . 167 |

_

168 | ``` 169 | 170 | No empty emph: 171 | 172 | ``` 173 | __ 174 | . 175 |

__

176 | ``` 177 | 178 | ``` 179 | _}b_ 180 | . 181 |

_}b_

182 | ``` 183 | 184 | ``` 185 | _\}b_ 186 | . 187 |

}b

188 | ``` 189 | 190 | ``` 191 | _ab\_c_ 192 | . 193 |

ab_c

194 | ``` 195 | 196 | ``` 197 | *****a***** 198 | . 199 |

a

200 | ``` 201 | 202 | ``` 203 | _[bar_](url) 204 | . 205 |

[bar](url)

206 | ``` 207 | 208 | ``` 209 | \_[bar_](url) 210 | . 211 |

_bar_

212 | ``` 213 | 214 | Code takes precedence: 215 | 216 | ``` 217 | _`a_`b 218 | . 219 |

_a_b

220 | ``` 221 | 222 | Autolinks take precedence: 223 | 224 | ``` 225 | _ 226 | . 227 |

_http://example.com/a_b

228 | ``` 229 | -------------------------------------------------------------------------------- /tests/data/hello_world.test: -------------------------------------------------------------------------------- 1 | ``` 2 | Hello, world! 3 | . 4 |

Hello, world!

5 | ``` 6 | -------------------------------------------------------------------------------- /tests/data/insert_delete_mark.test: -------------------------------------------------------------------------------- 1 | ``` 2 | This is {-deleted 3 | _text_-}. The braces are -required-. 4 | And they must be in the -}right order{-. 5 | . 6 |

This is deleted 7 | text. The braces are -required-. 8 | And they must be in the -}right order{-.

9 | ``` 10 | 11 | ``` 12 | {+ Inserted text +} 13 | . 14 |

Inserted text

15 | ``` 16 | 17 | Interaction with smart: 18 | 19 | ``` 20 | {--hello--} 21 | . 22 |

-hello-

23 | ``` 24 | 25 | ``` 26 | This is {=marked *text*=}. 27 | . 28 |

This is marked text.

29 | ``` 30 | -------------------------------------------------------------------------------- /tests/data/links_and_images.test: -------------------------------------------------------------------------------- 1 | 2 | ``` 3 | [basic _link_][a_b_] 4 | 5 | [a_b_]: url 6 | . 7 |

basic link

8 | ``` 9 | 10 | ``` 11 | ![basic _image_][a_b_] 12 | 13 | [a_b_]: url 14 | . 15 |

basic image

16 | ``` 17 | 18 | ``` 19 | [link][] 20 | 21 | [link]: url 22 | . 23 |

link

24 | ``` 25 | 26 | ``` 27 | [link][] 28 | 29 | [link]: 30 | url 31 | . 32 |

link

33 | ``` 34 | 35 | The URL can be split over multiple lines: 36 | 37 | ``` 38 | [link][] 39 | 40 | [link]: 41 | url 42 | andurl 43 | . 44 |

link

45 | ``` 46 | 47 | ``` 48 | [link](url 49 | andurl) 50 | . 51 |

link

52 | ``` 53 | 54 | ``` 55 | [link][] 56 | 57 | [link]: 58 | [link2]: url 59 | . 60 |

link

61 | ``` 62 | 63 | ``` 64 | [link][] 65 | [link][link2] 66 | 67 | [link2]: 68 | url2 69 | [link]: 70 | url 71 | . 72 |

link 73 | link

74 | ``` 75 | 76 | ``` 77 | [link][a and 78 | b] 79 | 80 | [a and b]: url 81 | . 82 |

link

83 | ``` 84 | 85 | If the reference isn't found, we get an empty link. 86 | 87 | ``` 88 | [link][a and 89 | b] 90 | . 91 |

link

92 | ``` 93 | 94 | Reference definitions can't have line breaks in the key: 95 | 96 | ``` 97 | [link][a and 98 | b] 99 | 100 | [a and 101 | b]: url 102 | . 103 |

link

104 |

[a and 105 | b]: url

106 | ``` 107 | 108 | No case normalization is done on reference definitions: 109 | 110 | ``` 111 | [Link][] 112 | 113 | [link]: /url 114 | . 115 |

Link

116 | ``` 117 | STOP 118 | Attributes on reference definitions get transferred to 119 | the link: 120 | 121 | ``` 122 | {title=foo} 123 | [ref]: /url 124 | 125 | [ref][] 126 | . 127 |

ref

128 | ``` 129 | 130 | Attributes on the link override those on references: 131 | 132 | ``` 133 | {title=foo} 134 | [ref]: /url 135 | 136 | [ref][]{title=bar} 137 | . 138 |

ref

139 | ``` 140 | 141 | ``` 142 | [link _and_ link][] 143 | 144 | [link and link]: url 145 | . 146 |

link and link

147 | ``` 148 | 149 | ``` 150 | ![basic _image_](url) 151 | . 152 |

basic image

153 | ``` 154 | 155 | ``` 156 | [![image](img.jpg)](url) 157 | . 158 |

image

159 | ``` 160 | 161 | ``` 162 | [unclosed](hello *a 163 | b* 164 | . 165 |

[unclosed](hello a 166 | b

167 | ``` 168 | 169 | Note that soft breaks are ignored, so long URLs 170 | can be split over multiple lines: 171 | ``` 172 | [closed](hello *a 173 | b*) 174 | . 175 |

closed

176 | ``` 177 | 178 | Here the strong takes precedence over the link because it 179 | starts first: 180 | ``` 181 | *[closed](hello*) 182 | . 183 |

[closed](hello)

184 | ``` 185 | 186 | Avoid this with a backslash escape: 187 | ``` 188 | *[closed](hello\*) 189 | . 190 |

*closed

191 | ``` 192 | 193 | Link in link? 194 | ``` 195 | [[foo](bar)](baz) 196 | . 197 |

foo

198 | ``` 199 | 200 | Link in image? 201 | ``` 202 | ![[link](url)](img) 203 | . 204 |

link

205 | ``` 206 | 207 | Image in link? 208 | ``` 209 | [![image](img)](url) 210 | . 211 |

image

212 | ``` 213 | 214 | Autolinks: 215 | ``` 216 | 217 | 218 | . 219 |

http://example.com/foo 220 | me@example.com

221 | ``` 222 | 223 | Openers inside `[..](` or `[..][` or `[..]{` can't match 224 | outside them, even if the construction doesn't turn out to be 225 | a link or span or image. 226 | 227 | ``` 228 | [x_y](x_y) 229 | . 230 |

x_y

231 | ``` 232 | 233 | ``` 234 | [x_y](x_ 235 | . 236 |

[x_y](x_

237 | ``` 238 | 239 | ``` 240 | [x_y]{.bar_} 241 | . 242 |

x_y

243 | ``` 244 | -------------------------------------------------------------------------------- /tests/data/para.test: -------------------------------------------------------------------------------- 1 | ``` 2 | hi 3 | there 4 | . 5 |

hi 6 | there

7 | ``` 8 | -------------------------------------------------------------------------------- /tests/data/regression.test: -------------------------------------------------------------------------------- 1 | Issue #104: 2 | 3 | ``` 4 | {1--} 5 | 6 | {1-} 7 | . 8 |

{1--}

9 |

{1-}

10 | ``` 11 | 12 | Issue #106: 13 | 14 | ``` 15 | 16 | |`| 17 | . 18 |

||

19 | ``` 20 | 21 | ``` [matches] 22 | 23 | |`|x 24 | . 25 | blankline 1-1 26 | +para 2-2 27 | str 2-2 28 | +verbatim 3-3 29 | str 4-5 30 | -verbatim 5-5 31 | -para 6-6 32 | ``` 33 | 34 | -------------------------------------------------------------------------------- /tests/data/super_subscript.test: -------------------------------------------------------------------------------- 1 | ``` 2 | H~2~O 3 | . 4 |

H2O

5 | ``` 6 | 7 | ``` 8 | mc^2^ 9 | . 10 |

mc2

11 | ``` 12 | 13 | ``` 14 | test^of superscript ~with subscript~^ 15 | . 16 |

testof superscript with subscript

17 | ``` 18 | 19 | ``` 20 | H{~2 ~}O 21 | . 22 |

H2 O

23 | ``` 24 | -------------------------------------------------------------------------------- /tests/data/verbatim.test: -------------------------------------------------------------------------------- 1 | 2 | ``` 3 | Some `code` 4 | . 5 |

Some code

6 | ``` 7 | 8 | ``` 9 | Some `code 10 | with a line break` 11 | . 12 |

Some code 13 | with a line break

14 | ``` 15 | 16 | ``` 17 | Special characters: `*hi*` 18 | . 19 |

Special characters: *hi*

20 | ``` 21 | 22 | ``` 23 | *foo`*` 24 | . 25 |

*foo*

26 | ``` 27 | 28 | ``` 29 | `````a`a``a```a````a``````a````` 30 | . 31 |

a`a``a```a````a``````a

32 | ``` 33 | 34 | ``` 35 | ` ``a`` ` 36 | . 37 |

``a``

38 | ``` 39 | 40 | Implicitly closed by end of paragraph: 41 | 42 | ``` 43 | ` a 44 | c 45 | . 46 |

a 47 | c

48 | ``` 49 | -------------------------------------------------------------------------------- /tests/spec.rs: -------------------------------------------------------------------------------- 1 | use std::{fs, path::PathBuf}; 2 | 3 | #[allow(unused)] 4 | fn to_ref_html(source: &str, matches: bool) -> String { 5 | let sh = xshell::Shell::new().unwrap(); 6 | if !sh.path_exists("ref") { 7 | xshell::cmd!(sh, "git clone https://github.com/jgm/djot ref").run().unwrap(); 8 | } 9 | sh.change_dir("ref"); 10 | let matches = if matches { Some("-m") } else { None }; 11 | let mut html = xshell::cmd!(sh, "lua ./bin/main.lua {matches...}").stdin(source).read().unwrap(); 12 | if cfg!(windows) { 13 | html = html.replace("\r\n", "\n"); 14 | } 15 | html.push('\n'); 16 | html 17 | } 18 | 19 | struct TestOpts { 20 | debug_ast: bool, 21 | ref_matches: bool, 22 | parse: djot::ParseOpts, 23 | } 24 | 25 | #[test] 26 | fn spec_tests() { 27 | let opts = 28 | TestOpts { debug_ast: true, ref_matches: true, parse: djot::ParseOpts { debug_matches: true } }; 29 | 30 | let mut last_fail = LastFail::load(); 31 | let sh = xshell::Shell::new().unwrap(); 32 | let mut total = 0; 33 | for path in sh.read_dir("./tests/data").unwrap() { 34 | if path.extension().unwrap_or_default() == "test" { 35 | let file_stem = path.file_stem().unwrap_or_default().to_str().unwrap_or_default(); 36 | let source = fs::read_to_string(&path).unwrap(); 37 | for (i, test_case) in parse_test(source.as_str()).into_iter().enumerate() { 38 | if last_fail.skip(file_stem, i) { 39 | continue; 40 | } 41 | let mut debug = String::new(); 42 | let doc = djot::Document::parse_opts(opts.parse.clone(), &test_case.djot); 43 | debug.push_str(&doc.debug); 44 | if opts.debug_ast { 45 | debug.push_str(&doc.to_json()); 46 | } 47 | let got = doc.to_html(); 48 | let want = test_case.html.as_str(); 49 | let ref_html = to_ref_html(&test_case.djot, false); 50 | if opts.ref_matches { 51 | debug.push_str(&format!("Ref Matches:\n{}-----", to_ref_html(&test_case.djot, true))); 52 | } 53 | if want != ref_html.as_str() { 54 | panic!( 55 | "\nReference mismatch in {}\nRef:\n{ref_html}-----\nWant:\n{want}-----\n", 56 | file_stem 57 | ) 58 | } 59 | if got.as_str() != want { 60 | let mut msg = format!( 61 | "\nMismatch in {}\nSource:\n{}-----\nWant:\n{want}-----\nGot:\n{got}-----\n", 62 | file_stem, test_case.djot, 63 | ); 64 | if !debug.is_empty() { 65 | msg = format!("{msg}Debug:\n{debug}-----\n") 66 | } 67 | panic!("{msg}") 68 | } 69 | last_fail.test_ok(); 70 | total += 1; 71 | } 72 | } 73 | } 74 | eprintln!("total tests: {total}"); 75 | } 76 | 77 | #[derive(Debug, Default)] 78 | struct TestCase { 79 | djot: String, 80 | html: String, 81 | } 82 | 83 | #[derive(Debug)] 84 | enum ParseState { 85 | Init, 86 | Djot(TestCase, usize), 87 | Html(TestCase, usize), 88 | } 89 | 90 | fn parse_test(source: &str) -> Vec<TestCase> { 91 | let mut res = Vec::new(); 92 | let mut state = ParseState::Init; 93 | for line in source.lines() { 94 | state = match state { 95 | ParseState::Init if line == "STOP" => { 96 | break; 97 | } 98 | ParseState::Init => match parse_fence(line) { 99 | Some(fence) => ParseState::Djot(TestCase::default(), fence), 100 | None => ParseState::Init, 101 | }, 102 | ParseState::Djot(mut test_case, test_case_fence) => { 103 | if line == "."
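/* Within a fenced test case, a line consisting of a single "." separates the djot input from the expected HTML output. */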
{ 104 | ParseState::Html(test_case, test_case_fence) 105 | } else { 106 | test_case.djot.push_str(line); 107 | test_case.djot.push('\n'); 108 | ParseState::Djot(test_case, test_case_fence) 109 | } 110 | } 111 | ParseState::Html(mut test_case, test_case_fence) => match parse_fence(line) { 112 | Some(fence) if fence == test_case_fence => { 113 | res.push(test_case); 114 | ParseState::Init 115 | } 116 | _ => { 117 | test_case.html.push_str(line); 118 | test_case.html.push('\n'); 119 | ParseState::Html(test_case, test_case_fence) 120 | } 121 | }, 122 | }; 123 | } 124 | 125 | res 126 | } 127 | 128 | fn parse_fence(line: &str) -> Option<usize> { 129 | if line.bytes().all(|it| it == b'`') && line.len() > 0 { 130 | Some(line.len()) 131 | } else { 132 | None 133 | } 134 | } 135 | 136 | struct LastFail { 137 | loaded: Option<(String, usize)>, 138 | current: Option<(String, usize)>, 139 | } 140 | 141 | impl LastFail { 142 | fn load() -> LastFail { 143 | let mut loaded = None; 144 | if let Ok(text) = fs::read_to_string(fail_file()) { 145 | let (name, pos) = text.split_once(':').unwrap_or_else(|| panic!("bad fail file {text:?}")); 146 | let idx = pos.parse::<usize>().unwrap_or_else(|_| panic!("bad fail file {text:?}")); 147 | eprintln!("loaded fail {name}:{idx}"); 148 | loaded = Some((name.to_string(), idx)) 149 | } 150 | LastFail { loaded, current: None } 151 | } 152 | fn skip(&mut self, name: &str, pos: usize) -> bool { 153 | self.current = Some((name.to_string(), pos)); 154 | if let Some(loaded) = &self.loaded { 155 | return !(loaded.0 == name && loaded.1 == pos); 156 | } 157 | false 158 | } 159 | fn test_ok(&mut self) { 160 | if let Some((name, pos)) = &self.loaded { 161 | eprintln!("{}:{} is now ok!", name, pos); 162 | let _ = fs::remove_file(&fail_file()); 163 | self.loaded = None; 164 | } 165 | self.current = None 166 | } 167 | } 168 | 169 | impl Drop for LastFail { 170 | fn drop(&mut self) { 171 | if let Some((name, pos)) = &self.current { 172 | eprintln!("saved fail {name}:{pos}"); 173 | let _ = fs::write(fail_file(), format!("{name}:{pos}")); 174 | } 175 | } 176 | } 177 | 178 | fn fail_file() -> PathBuf { 179 | PathBuf::from(env!("CARGO_TARGET_TMPDIR")).join("fail") 180 | } 181 | -------------------------------------------------------------------------------- /tests/tidy.rs: -------------------------------------------------------------------------------- 1 | use xshell::{cmd, Shell}; 2 | 3 | #[test] 4 | fn test_formatting() { 5 | let sh = Shell::new().unwrap(); 6 | cmd!(sh, "cargo fmt -- --check").run().unwrap() 7 | } 8 | --------------------------------------------------------------------------------
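For reference, a minimal usage sketch of the public API defined in `src/lib.rs` above. This is not a file from the repository; it assumes the library is consumed as the `djot` dependency named in `Cargo.toml` and uses only items shown in the listing (`Document::parse`, `Document::parse_opts`, `ParseOpts`, `to_html`, `to_json`, and the public `debug` field).

```rust
// Illustrative only: drives the djot crate the same way src/main.rs does.

fn render(input: &str) -> (String, String) {
    // Parse with default options, then render the same document two ways.
    let doc = djot::Document::parse(input);
    (doc.to_html(), doc.to_json())
}

fn render_match_debug(input: &str) -> String {
    // Equivalent to the CLI's `--matches` flag: keep the tokenizer's debug output.
    let opts = djot::ParseOpts { debug_matches: true };
    djot::Document::parse_opts(opts, input).debug
}

fn main() {
    let (html, json) = render("Hello, *world*!");
    println!("{html}");
    println!("{json}");
    println!("{}", render_match_debug("Hello, *world*!"));
}
```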