├── .github
│   └── workflows
│       └── ci.yaml
├── .gitignore
├── Cargo.toml
├── README.md
├── bors.toml
├── lua-patterns
│   ├── .gitignore
│   ├── Cargo.toml
│   ├── LICENSE.txt
│   ├── examples
│   │   ├── errors.rs
│   │   ├── iter.rs
│   │   ├── multiple_captures.rs
│   │   ├── range.rs
│   │   └── strings.rs
│   ├── readme.md
│   └── src
│       ├── errors.rs
│       ├── lib.rs
│       └── luapat.rs
├── rustfmt.toml
├── src
│   ├── annot.rs
│   ├── annot
│   │   └── generated.rs
│   ├── ast.rs
│   ├── ast
│   │   └── generated.rs
│   ├── attribute.rs
│   ├── block.rs
│   ├── emoji.rs
│   ├── html.rs
│   ├── inline.rs
│   ├── lib.rs
│   ├── main.rs
│   ├── patterns.rs
│   ├── sourcegen.rs
│   ├── sourcegen
│   │   ├── annot.rs
│   │   └── ast.rs
│   └── tree.rs
└── tests
    ├── data
    │   ├── attributes.test
    │   ├── code_blocks.test
    │   ├── emoji.test
    │   ├── emphasis.test
    │   ├── hello_world.test
    │   ├── insert_delete_mark.test
    │   ├── links_and_images.test
    │   ├── para.test
    │   ├── regression.test
    │   ├── super_subscript.test
    │   └── verbatim.test
    ├── spec.rs
    └── tidy.rs
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on:
3 |   pull_request:
4 |   push:
5 |     branches: ["master", "staging", "trying"]
6 |
7 | env:
8 |   CARGO_INCREMENTAL: 0
9 |   CARGO_NET_RETRY: 10
10 |   CI: 1
11 |   RUST_BACKTRACE: short
12 |   RUSTFLAGS: -D warnings
13 |   RUSTUP_MAX_RETRIES: 10
14 |
15 | jobs:
16 |   test:
17 |     name: Rust
18 |     runs-on: ubuntu-latest
19 |
20 |     steps:
21 |       - uses: actions/checkout@v2
22 |       - uses: Swatinem/rust-cache@6720f05bc48b77f96918929a9019fb2203ff71f8
23 |       - run: rustup update --no-self-update stable
24 |       - run: sudo apt-get install lua5.3
25 |       - run: cargo test
26 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.vscode
2 | /target
3 | /Cargo.lock
4 | /ref
5 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "djot"
3 | version = "0.1.0"
4 | license = "MIT OR Apache-2.0"
5 | authors = ["Aleksey Kladov "]
6 | edition = "2021"
7 |
8 | [dependencies]
9 | anyhow = "1.0.66"
10 | indexmap = { version = "1.9.1", features = ["serde"] }
11 | lexopt = "0.2.1"
12 | lua-patterns = { path = "lua-patterns" }
13 | serde = { version = "1.0.147", features = ["derive"] }
14 | serde_json = "1.0.87"
15 |
16 | [dev-dependencies]
17 | xshell = "0.2.0"
18 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # djot-rs
2 |
3 | UPDATE:
4 |
5 | This didn't go particularly far, check out
6 |
7 |
8 |
9 | instead.
10 |
11 |
12 | An experimental Rust implementation of the [Djot](https://djot.net) light markup
13 | language.
14 |
15 | ## Design Rules
16 |
17 | Djot is in development; these are the _current_ design rules:
18 |
19 | 1. 100% compatibility with the reference Lua implementation, bugs and all. We
20 | don't want to fork a language which barely exists.
21 | 2. Reasonable source compatibility with the reference Lua implementation. We
22 | want to make it easy to incorporate changes, though we don't necessarily want
23 | to bend Rust to be Lua.
24 |
25 | Currently this is very incomplete; feel free to submit PRs to fill in the blank
26 | spaces, just try to stay close to the original code.
27 |
28 | There are some tests, run with `cargo test`. We are using the same test suite as
29 | the upstream project (see `.test` files in `tests/data`).
30 |
31 | ## Aspirations
32 |
33 | * "Easy", obvious API -- no streaming parsing, no allocation minimization, just
34 | gives you a full AST
35 | * core + alloc. We don't need an OS. Getting rid of the allocator would be nice, but not for this library.
36 | * in general, leave pulldown-djot to someone else (or to the next iteration of this library)
37 | * djot.ts module for convenience
38 | * TypeScript extensible visitor API for rendering: `./djot.ts input.adoc --template slides.ts`
39 | `ast.to_html({ code_block: (tag) => { ... }})`.
40 |
41 | ## See Also
42 |
43 | * https://git.sr.ht/~kmaasrud/djr, a pulldown-cmark-inspired parser
44 |
--------------------------------------------------------------------------------
/bors.toml:
--------------------------------------------------------------------------------
1 | status = [ "Rust" ]
2 | delete_merged_branches = true
3 |
--------------------------------------------------------------------------------
/lua-patterns/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | Cargo.lock
3 | scratch
4 |
--------------------------------------------------------------------------------
/lua-patterns/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "lua-patterns"
3 | version = "0.3.0"
4 | authors = ["steve donovan "]
5 | description = "Binding to Lua String Patterns"
6 | license = "MIT"
7 | repository = "https://github.com/stevedonovan/lua-patterns"
8 | documentation = "https://docs.rs/lua-patterns"
9 |
10 | keywords = ["string","matching","lua"]
11 |
12 | categories = ["parsing","api-bindings"]
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/lua-patterns/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright © 2017 Steve Donovan
2 |
3 | Copyright © 1994–2017 Lua.org, PUC-Rio.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"),
7 | to deal in the Software without restriction, including without
8 | limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software,
10 | and to permit persons to whom the Software is furnished to do so,
11 | subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included
14 | in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
19 | OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/lua-patterns/examples/errors.rs:
--------------------------------------------------------------------------------
1 | extern crate lua_patterns;
2 | use lua_patterns::errors::PatternError;
3 |
4 | fn main() {
5 | let bad = [
6 | ("bonzo %","malformed pattern (ends with '%')"),
7 | ("bonzo (dog%(","unfinished capture"),
8 | ("alles [%a%[","malformed pattern (missing ']')"),
9 | ("bonzo (dog (cat)","unfinished capture"),
10 | ("frodo %f[%A","malformed pattern (missing ']')"),
11 | ("frodo (1) (2(3)%2)%1","invalid capture index %2"),
12 | ];
13 |
14 | fn error(s: &str) -> PatternError {
15 | PatternError(s.into())
16 | }
17 |
18 | for p in bad.iter() {
19 | let res = lua_patterns::LuaPattern::new_try(p.0);
20 | if let Err(e) = res {
21 | assert_eq!(e, error(p.1));
22 | } else {
23 | println!("'{}' was fine",p.0);
24 | }
25 | }
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/lua-patterns/examples/iter.rs:
--------------------------------------------------------------------------------
1 | extern crate lua_patterns as lp;
2 |
3 |
4 |
5 | fn main() {
6 |
7 | //~ let mut m = lp::LuaPattern::new("hello%");
8 | //~ m.matches("hello");
9 | //~ println!("ok");
10 |
11 | let mut m = lp::LuaPattern::new("(%a+)");
12 | let mut iter = m.gmatch("one two three");
13 | assert_eq!(iter.next(), Some("one"));
14 | assert_eq!(iter.next(), Some("two"));
15 | assert_eq!(iter.next(), Some("three"));
16 | assert_eq!(iter.next(), None);
17 |
18 | let mut m = lp::LuaPattern::new("%S+");
19 | let split: Vec<_> = m.gmatch("dog cat leopard wolf").collect();
20 | assert_eq!(split,&["dog","cat","leopard","wolf"]);
21 |
22 | let mut m = lp::LuaPattern::new("%s*(%S+)%s*=%s*(.-);");
23 | let cc = m.captures(" hello= bonzo dog;");
24 | assert_eq!(cc[0], " hello= bonzo dog;");
25 | assert_eq!(cc[1],"hello");
26 | assert_eq!(cc[2],"bonzo dog");
27 |
28 | for cc in m.gmatch_captures("hello=bonzo dog; bye=cat;") {
29 | println!("'{}'='{}'",cc.get(1),cc.get(2));
30 | }
31 |
32 | let mut m = lp::LuaPattern::new("%$(%S+)");
33 | let res = m.gsub_with("hello $dolly you're so $fine",
34 | |cc| cc.get(1).to_uppercase()
35 | );
36 | assert_eq!(res,"hello DOLLY you're so FINE");
37 |
38 | let mut m = lp::LuaPattern::new("(%S+)%s*=%s*([^;]+);");
39 | let res = m.gsub_with("alpha=bonzo; beta=felix;",
40 | |cc| format!("{}:'{}',", cc.get(1), cc.get(2))
41 | );
42 | assert_eq!(res, "alpha:'bonzo', beta:'felix',");
43 |
44 |
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/lua-patterns/examples/multiple_captures.rs:
--------------------------------------------------------------------------------
1 | extern crate lua_patterns as lp;
2 |
3 | fn main() {
4 | let mut p = lp::LuaPattern::new("%s*(%d+)%s+(%S+)");
5 | if let Some((int,rest)) = p.match_maybe_2(" 233 hello dolly") {
6 | assert_eq!(int,"233");
7 | assert_eq!(rest,"hello");
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/lua-patterns/examples/range.rs:
--------------------------------------------------------------------------------
1 | extern crate lua_patterns;
2 | use lua_patterns::{LuaPattern,LuaPatternBuilder};
3 |
4 | fn main() {
5 | let mut m = LuaPattern::new("(%a+) one");
6 | let text = " hello one two";
7 | assert!(m.matches(text));
8 | assert_eq!(m.capture(1),1..6);
9 | assert_eq!(m.capture(0),1..10);
10 |
11 | let v = m.captures(text);
12 | assert_eq!(v, &["hello one","hello"]);
13 |
14 | let mut v = Vec::new();
15 | assert!(m.capture_into(text,&mut v));
16 | assert_eq!(v, &["hello one","hello"]);
17 |
18 | let bytes = &[0xFF,0xEE,0x0,0xDE,0x24,0x24,0xBE,0x0,0x0];
19 |
20 | let patt = LuaPatternBuilder::new()
21 | .bytes_as_hex("DE24")
22 | .text("+")
23 | .bytes(&[0xBE])
24 | .build();
25 |
26 | let mut m = LuaPattern::from_bytes(&patt);
27 | assert!(m.matches_bytes(bytes));
28 | assert_eq!(&bytes[m.capture(0)], &[0xDE,0x24,0x24,0xBE]);
29 |
30 | let mut m = LuaPattern::new("(%S+)%s*=%s*(%S+);%s*");
31 | let res = m.gsub("a=2; b=3; c = 4;","'%2':%1 ");
32 | println!("{}",res);
33 |
34 | let mut m = LuaPattern::new("%s+");
35 | let res = m.gsub("hello dolly you're so fine","");
36 | println!("{}",res);
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/lua-patterns/examples/strings.rs:
--------------------------------------------------------------------------------
1 | // Similar to the strings(1) utility
2 | // We print any sequences involving four or more ASCII letters
3 | extern crate lua_patterns;
4 | use lua_patterns::LuaPattern;
5 |
6 | use std::env;
7 | use std::str;
8 | use std::fs::File;
9 | use std::io::prelude::*;
10 |
11 | fn main() {
12 | let file = env::args().skip(1).next().expect("provide a binary file");
13 | let mut f = File::open(&file).expect("can't open file");
14 | let mut buf = Vec::new();
15 | f.read_to_end(&mut buf).expect("can't read file");
16 |
17 | let mut words = LuaPattern::new("%a%a%a%a+");
18 | for w in words.gmatch_bytes(&buf) {
19 | println!("{}",str::from_utf8(w).unwrap());
20 | }
21 |
22 | }
23 |
--------------------------------------------------------------------------------
/lua-patterns/readme.md:
--------------------------------------------------------------------------------
1 | ## Lua string patterns in Rust
2 |
3 | [Lua string patterns](https://www.lua.org/pil/20.2.html) are a powerful
4 | yet lightweight alternative to full regular expressions. They are not
5 | regexps, since there is no alternation (the `|` operator), but this
6 | is not usually a problem. In fact, full regexps become _too powerful_ and
7 | power can be dangerous or just plain confusing.
8 | This is why OpenBSD's httpd has [Lua patterns](http://man.openbsd.org/patterns.7).
9 | The decision to use `%` as the escape rather than the traditional `\` is refreshing.
10 | In the Rust context, `lua-patterns` is a very lightweight dependency, if you
11 | don't need the full power of the `regex` crate.
12 |
13 | This library reuses the original source from Lua 5.2 - only
14 | 400 lines of battle-tested C. I originally did this for a similar project to bring
15 | [these patterns to C++](https://github.com/stevedonovan/rx-cpp).
16 |
17 | More information can be found on [the Lua wiki](http://lua-users.org/wiki/PatternsTutorial).
18 | The cool thing is that Lua is a 300KB download, if you want to test patterns out
19 | without going through Rust.
20 |
21 | I've organized the Rust interface much like the original Lua library, with 'match',
22 | 'gmatch' and 'gsub', but made these methods of a `LuaPattern` struct. This is
23 | for two main reasons:
24 |
25 | - although string patterns are not compiled, they can be validated upfront
26 | - after a match, the struct contains the results
27 |
28 | ```rust
29 | extern crate lua_patterns;
30 | use lua_patterns::LuaPattern;
31 |
32 | let mut m = LuaPattern::new("one");
33 | let text = "hello one two";
34 | assert!(m.matches(text));
35 | let r = m.range();
36 | assert_eq!(r.start, 6);
37 | assert_eq!(r.end, 9);
38 | ```
39 | This is not in itself impressive, since it can be done with the string `find`
40 | method. (`new` will panic if you feed it a bad pattern, so use `new_try` if
41 | you want more control.)
42 |
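If you would rather get an error than a panic, a minimal sketch of the fallible
constructor looks like this (the error message is the one used by this crate's
own tests):

```rust
// `new_try` returns a Result instead of panicking on a bad pattern
match LuaPattern::new_try("bonzo %") {
    Ok(_) => unreachable!("pattern unexpectedly compiled"),
    Err(e) => assert_eq!(e.to_string(), "malformed pattern (ends with '%')"),
}
```
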
43 | Once we start using patterns it gets more exciting, especially
44 | with _captures_:
45 |
46 | ```rust
47 | let mut m = LuaPattern::new("(%a+) one");
48 | let text = " hello one two";
49 | assert!(m.matches(text));
50 | assert_eq!(m.capture(0),1..10); // "hello one"
51 | assert_eq!(m.capture(1),1..6); // "hello"
52 | ```
53 | Lua patterns (like regexps) are not anchored by default, so this finds
54 | the first match and works from there. The 0 capture always exists
55 | (the full match) and here the 1 capture just picks up the first word.
56 |
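If you do want the match anchored at the start, a leading `^` works just as in
Lua (this mirrors the crate's own tests):

```rust
let mut m = LuaPattern::new("^(%a+)");
assert!(m.matches("one dog"));
assert!(!m.matches(" one dog"));
```
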
57 | > There is an obvious limitation: "%a" refers specifically to a single byte
58 | > representing a letter according to the C locale. Lua people will often
59 | > look for 'sequence of non-spaces' ("%S+"), etc - that is, identify maybe-UTF-8
60 | > sequences using surrounding punctuation or spaces.
61 |
62 | If you want your captures as strings, then there are several options. If there's
63 | just one, then `match_maybe` is useful:
64 |
65 | ```rust
66 | let mut m = LuaPattern::new("OK%s+(%d+)");
67 | let res = m.match_maybe("and that's OK 400 to you");
68 | assert_eq!(res, Some("400"));
69 | ```
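For two or three captures there are `match_maybe_2` and `match_maybe_3`, which
return tuples of slices (this is the `examples/multiple_captures.rs` example
from this repository):

```rust
let mut p = LuaPattern::new("%s*(%d+)%s+(%S+)");
if let Some((int, rest)) = p.match_maybe_2(" 233 hello dolly") {
    assert_eq!(int, "233");
    assert_eq!(rest, "hello");
}
```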
70 | You can grab them as a vector (it will be empty if the match fails).
71 |
72 | ```rust
73 | let mut m = LuaPattern::new("(%a+) one");
74 | let text = " hello one two";
75 | let v = m.captures(text);
76 | assert_eq!(v, &["hello one","hello"]);
77 | ```
78 | This will create a vector. You can avoid excessive allocations with `capture_into`:
79 |
80 | ```rust
81 | let mut v = Vec::new();
82 | if m.capture_into(text,&mut v) {
83 | assert_eq!(v, &["hello one","hello"]);
84 | }
85 | ```
86 | Imagine that this is happening in a loop - the vector is only allocated the first
87 | time it is filled, and thereafter there are no allocations. It's a convenient
88 | method if you are checking text against several patterns, and is actually
89 | more ergonomic than using Lua's `string.match`. (Personally I prefer
90 | to use those marvelous things called "if statements" rather than elaborate
91 | regular expressions.)
92 |
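For instance, a small sketch (not from the original examples) of that style,
dispatching on whichever pattern matches while reusing one vector:

```rust
let mut date = LuaPattern::new("(%d+)/(%d+)/(%d+)");
let mut assign = LuaPattern::new("(%S+)%s*=%s*(%S+)");
let mut v = Vec::new();
for line in ["2017/11/10", "answer = 42"] {
    if date.capture_into(line, &mut v) {
        println!("date: {:?}", &v[1..]);       // ["2017", "11", "10"]
    } else if assign.capture_into(line, &mut v) {
        println!("assignment: {:?}", &v[1..]); // ["answer", "42"]
    }
}
```
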
93 | The `gmatch` method creates an iterator over all matched strings.
94 |
95 | ```rust
96 | let mut m = lp::LuaPattern::new("%S+");
97 | let split: Vec<_> = m.gmatch("dog cat leopard wolf ").collect();
98 | assert_eq!(split,&["dog","cat","leopard","wolf"]);
99 | ```
100 | A single match is returned; if the pattern has no captures, you get the full match,
101 | otherwise you get the first match. So "(%S+)" would give you the same result.
102 |
103 | A more general version is `gmatch_captures` which creates a _streaming_ iterator
104 | over captures. You have to be a little careful with this one; in particular, you
105 | will get nonsense if you try to `collect` on the returned captures: don't try to
106 | keep these values.
107 | It is fine to collect from an expression involving the `get` method however!
108 |
109 | ```rust
110 | let mut m = lua_patterns::LuaPattern::new("(%S)%S+");
111 | let split: Vec<_> = m.gmatch_captures("dog cat leopard wolf")
112 | .map(|cc| cc.get(1)).collect();
113 | assert_eq!(split,&["d","c","l","w"]);
114 | ```
115 |
116 | Text substitution is an old favourite of mine, so here's `gsub_with`:
117 |
118 | ```rust
119 | let mut m = lp::LuaPattern::new("%$(%S+)");
120 | let res = m.gsub_with("hello $dolly you're so $fine",
121 | |cc| cc.get(1).to_uppercase()
122 | );
123 | assert_eq!(res,"hello DOLLY you're so FINE");
124 | ```
125 | The closure is passed a `Captures` object and the captures are accessed
126 | using the `get` method; it returns a `String`.
127 |
128 | The second form of `gsub` is convenient when you have a replacement
129 | string, which may contain capture references. (To add a literal "%", escape
130 | it like so: "%%".)
131 |
132 | ```rust
133 | let mut m = LuaPattern::new("%s+");
134 | let res = m.gsub("hello dolly you're so fine","");
135 | assert_eq!(res, "hellodollyyou'resofine");
136 |
137 | let mut m = LuaPattern::new("(%S+)%s*=%s*(%S+);%s*");
138 | let res = m.gsub("a=2; b=3; c = 4;", "'%2':%1 ");
139 | assert_eq!(res, "'2':a '3':b '4':c ");
140 | ```
141 | The third form of `string.gsub` in Lua does lookup with a table - that is, a map.
142 | But for maps you really want to handle the 'not found' case in some special way:
143 |
144 | ```rust
145 | let mut map = HashMap::new();
146 | // updating old lines for the 21st Century
147 | map.insert("dolly", "baby");
148 | map.insert("fine", "cool");
149 | map.insert("good-looking", "pretty");
150 |
151 | let mut m = LuaPattern::new("%$%((.-)%)");
152 | let res = m.gsub_with("hello $(dolly) you're so $(fine) and $(good-looking)",
153 | |cc| map.get(cc.get(1)).unwrap_or(&"?").to_string()
154 | );
155 | assert_eq!(res,"hello baby you're so cool and pretty");
156 | ```
157 |
158 | (The ".-" pattern means 'match as little as possible' - often called 'lazy'
159 | matching.)
160 |
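A quick illustrative sketch (not from the original examples) of how lazy `.-`
differs from greedy `.*`:

```rust
let mut lazy = LuaPattern::new("<(.-)>");
assert_eq!(lazy.match_maybe("<a> <b>"), Some("a"));

let mut greedy = LuaPattern::new("<(.*)>");
assert_eq!(greedy.match_maybe("<a> <b>"), Some("a> <b"));
```
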
161 | This is equivalent to a replacement string "%1:'%2',":
162 |
163 | ```rust
164 | let mut m = lp::LuaPattern::new("(%S+)%s*=%s*([^;]+);");
165 | let res = m.gsub_with("alpha=bonzo; beta=felix;",
166 | |cc| format!("{}:'{}',", cc.get(1), cc.get(2))
167 | );
168 | assert_eq!(res, "alpha:'bonzo', beta:'felix',");
169 | ```
170 | Having a byte-oriented pattern matcher can be useful. For instance, this
171 | is basically the old `strings` utility - we read all of a 'binary' file into
172 | a vector of bytes, and then use `gmatch_bytes` to iterate over all `&[u8]`
173 | matches corresponding to two or more adjacent ASCII letters:
174 |
175 | ```rust
176 | let mut words = LuaPattern::new("%a%a+");
177 | for w in words.gmatch_bytes(&buf) {
178 | println!("{}",std::str::from_utf8(w).unwrap());
179 | }
180 | ```
181 | The pattern itself may be arbitrary bytes - Lua 'string' matching does
182 | not care about embedded nul bytes:
183 |
184 | ```rust
185 | let patt = &[0xDE,0x00,b'+',0xBE];
186 | let bytes = &[0xFF,0xEE,0x0,0xDE,0x0,0x0,0xBE,0x0,0x0];
187 |
188 | let mut m = LuaPattern::from_bytes(patt);
189 | assert!(m.matches_bytes(bytes));
190 | assert_eq!(&bytes[m.capture(0)], &[0xDE,0x00,0x00,0xBE]);
191 | ```
192 | The problem here is that it's not obvious when our 'arbitrary' bytes
193 | include one of the special matching characters like `$` (which is 0x24)
194 | and so on. Hence there is `LuaPatternBuilder`:
195 |
196 | ```rust
197 | let bytes = &[0xFF,0xEE,0x0,0xDE,0x24,0x24,0xBE,0x0,0x0];
198 |
199 | let patt = LuaPatternBuilder::new()
200 | .bytes_as_hex("DE24") // less tedious than a byte slice
201 | .text("+") // unescaped
202 | .bytes(&[0xBE]) // byte slice
203 | .build();
204 |
205 | let mut m = LuaPattern::from_bytes(&patt);
206 | // picks up "DE2424BE"
207 | ```
208 | > Static verification: this version attempts to verify string patterns. If you
209 | > want errors, use `new_try` and `from_bytes_try`, otherwise the constructors panic.
210 | > If a match panics after successful verification, it is a __BUG__ - please
211 | > report the offending pattern.
212 |
213 |
--------------------------------------------------------------------------------
/lua-patterns/src/errors.rs:
--------------------------------------------------------------------------------
1 | use std::fmt;
2 | use std::error::Error;
3 |
4 | /// Error type returned by _try methods
5 | #[derive(Debug,PartialEq)]
6 | pub struct PatternError(pub String);
7 |
8 | impl fmt::Display for PatternError {
9 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
10 | write!(f,"{}",self.0)
11 | }
12 | }
13 |
14 | impl Error for PatternError {
15 | fn description(&self) -> &str {
16 | &self.0
17 | }
18 | }
19 |
20 |
--------------------------------------------------------------------------------
/lua-patterns/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! This is a Rust binding to [Lua string patterns](https://www.lua.org/pil/20.2.html),
2 | //! using the original code from Lua 5.2.
3 | //!
4 | //! Although not regular expressions (they lack alternation) they are a powerful
5 | //! and lightweight way to process text. Please note that they are not
6 | //! UTF-8-aware, and in fact can process arbitrary binary data.
7 | //!
8 | //! `LuaPattern` can be created from a string _or_ a byte slice, and has
9 | //! methods which are similar to the original Lua API. Please see
10 | //! [the README](https://github.com/stevedonovan/lua-patterns/blob/master/readme.md)
11 | //! for more discussion.
12 | //!
13 | //! [LuaPattern](struct.LuaPattern.html) implements the public API.
14 | //!
15 | //! ## Examples
16 | //!
17 | //! ```rust
18 | //! extern crate lua_patterns;
19 | //! let mut m = lua_patterns::LuaPattern::new("one");
20 | //! let text = "hello one two";
21 | //! assert!(m.matches(text));
22 | //! let r = m.range();
23 | //! assert_eq!(r.start, 6);
24 | //! assert_eq!(r.end, 9);
25 | //! ```
26 | //!
27 | //! Collecting captures from a match:
28 | //!
29 | //! ```rust
30 | //! extern crate lua_patterns;
31 | //! let text = " hello one";
32 | //! let mut m = lua_patterns::LuaPattern::new("(%S+) one");
33 | //!
34 | //! // allocates a vector of captures
35 | //! let v = m.captures(text);
36 | //! assert_eq!(v, &["hello one","hello"]);
37 | //! let mut v = Vec::new();
38 | //! // writes captures into preallocated vector
39 | //! if m.capture_into(text,&mut v) {
40 | //! assert_eq!(v, &["hello one","hello"]);
41 | //! }
42 | //! ```
43 |
44 | use std::ops;
45 |
46 | pub mod errors;
47 | use errors::*;
48 | mod luapat;
49 | use luapat::*;
50 |
51 |
52 | /// Represents a Lua string pattern and the results of a match
53 | pub struct LuaPattern<'a> {
54 | patt: &'a [u8],
55 | matches: [LuaMatch; LUA_MAXCAPTURES],
56 | n_match: usize
57 | }
58 |
59 | impl <'a> LuaPattern<'a> {
60 | /// Maybe create a new Lua pattern from a slice of bytes
61 | pub fn from_bytes_try (bytes: &'a [u8]) -> Result<LuaPattern<'a>,PatternError> {
62 | str_check(bytes)?;
63 | let matches = [LuaMatch{start: 0, end: 0}; LUA_MAXCAPTURES];
64 | Ok(LuaPattern{patt: bytes, matches: matches, n_match: 0})
65 | }
66 |
67 | /// Maybe create a new Lua pattern from a string
68 | pub fn new_try(patt: &'a str) -> Result<LuaPattern<'a>,PatternError> {
69 | LuaPattern::from_bytes_try(patt.as_bytes())
70 | }
71 |
72 | /// Create a new Lua pattern from a string, panicking if bad
73 | pub fn new(patt: &'a str) -> LuaPattern<'a> {
74 | LuaPattern::new_try(patt).expect("bad pattern")
75 | }
76 |
77 | /// Create a new Lua pattern from a slice of bytes, panicking if bad
78 | pub fn from_bytes (bytes: &'a [u8]) -> LuaPattern<'a> {
79 | LuaPattern::from_bytes_try(bytes).expect("bad pattern")
80 | }
81 |
82 | /// Match a slice of bytes with a pattern
83 | ///
84 | /// ```
85 | /// let patt = &[0xFE,0xEE,b'+',0xED];
86 | /// let mut m = lua_patterns::LuaPattern::from_bytes(patt);
87 | /// let bytes = &[0x00,0x01,0xFE,0xEE,0xEE,0xED,0xEF];
88 | /// assert!(m.matches_bytes(bytes));
89 | /// assert_eq!(&bytes[m.range()], &[0xFE,0xEE,0xEE,0xED]);
90 | /// ```
91 | pub fn matches_bytes(&mut self, s: &[u8]) -> bool {
92 | self.n_match = str_match(s,self.patt,&mut self.matches).expect("Should not fail - report as bug");
93 | self.n_match > 0
94 | }
95 |
96 | /// Match a string with a pattern
97 | ///
98 | /// ```
99 | /// let mut m = lua_patterns::LuaPattern::new("(%a+) one");
100 | /// let text = " hello one two";
101 | /// assert!(m.matches(text));
102 | /// ```
103 | pub fn matches(&mut self, text: &str) -> bool {
104 | self.matches_bytes(text.as_bytes())
105 | }
106 |
107 | /// Match a string, returning first capture if successful
108 | ///
109 | /// ```
110 | /// let mut m = lua_patterns::LuaPattern::new("OK%s+(%d+)");
111 | /// let res = m.match_maybe("and that's OK 400 to you");
112 | /// assert_eq!(res, Some("400"));
113 | /// ```
114 | pub fn match_maybe<'t>(&mut self, text: &'t str) -> Option<&'t str> {
115 | if self.matches(text) {
116 | Some(&text[self.first_capture()])
117 | } else {
118 | None
119 | }
120 | }
121 |
122 | /// Match a string, returning first two explicit captures if successful
123 | ///
124 | /// ```
125 | /// let mut p = lua_patterns::LuaPattern::new("%s*(%d+)%s+(%S+)");
126 | /// let (int,rest) = p.match_maybe_2(" 233 hello dolly").unwrap();
127 | /// assert_eq!(int,"233");
128 | /// assert_eq!(rest,"hello");
129 | /// ```
130 | pub fn match_maybe_2<'t>(&mut self, text: &'t str) -> Option<(&'t str,&'t str)> {
131 | if self.matches(text) {
132 | let cc = self.match_captures(text);
133 | if cc.num_matches() != 3 { return None; }
134 | Some((cc.get(1),cc.get(2)))
135 | } else {
136 | None
137 | }
138 | }
139 |
140 | /// Match a string, returning first three explicit captures if successful
141 | ///
142 | /// ```
143 | /// let mut p = lua_patterns::LuaPattern::new("(%d+)/(%d+)/(%d+)");
144 | /// let (y,m,d) = p.match_maybe_3("2017/11/10").unwrap();
145 | /// assert_eq!(y,"2017");
146 | /// assert_eq!(m,"11");
147 | /// assert_eq!(d,"10");
148 | /// ```
149 | pub fn match_maybe_3<'t>(&mut self, text: &'t str) -> Option<(&'t str,&'t str,&'t str)> {
150 | if self.matches(text) {
151 | let cc = self.match_captures(text);
152 | if cc.num_matches() != 4 { return None; }
153 | Some((cc.get(1),cc.get(2),cc.get(3)))
154 | } else {
155 | None
156 | }
157 | }
158 |
159 | /// Match and collect all captures as a vector of string slices
160 | ///
161 | /// ```
162 | /// let mut m = lua_patterns::LuaPattern::new("(one).+");
163 | /// assert_eq!(m.captures(" one two"), &["one two","one"]);
164 | /// ```
165 | pub fn captures<'b>(&mut self, text: &'b str) -> Vec<&'b str> {
166 | let mut res = Vec::new();
167 | self.capture_into(text, &mut res);
168 | res
169 | }
170 |
171 | /// A convenient way to access the captures with no allocation
172 | ///
173 | /// ```rust
174 | /// let text = " hello one";
175 | /// let mut m = lua_patterns::LuaPattern::new("(%S+) one");
176 | /// if m.matches(text) {
177 | /// let cc = m.match_captures(text);
178 | /// assert_eq!(cc.get(0), "hello one");
179 | /// assert_eq!(cc.get(1), "hello");
180 | /// }
181 | /// ```
182 | pub fn match_captures<'b,'c>(&'c self, text: &'b str) -> Captures<'a,'b,'c> {
183 | Captures {m: self, text: text}
184 | }
185 |
186 | /// Match and collect all captures into the provided vector.
187 | ///
188 | /// ```rust
189 | /// let text = " hello one";
190 | /// let mut m = lua_patterns::LuaPattern::new("(%S+) one");
191 | /// let mut v = Vec::new();
192 | /// if m.capture_into(text,&mut v) {
193 | /// assert_eq!(v, &["hello one","hello"]);
194 | /// }
195 | /// ```
196 | pub fn capture_into<'b>(&mut self, text: &'b str, vec: &mut Vec<&'b str>) -> bool {
197 | self.matches(text);
198 | vec.clear();
199 | for i in 0..self.n_match {
200 | vec.push(&text[self.capture(i)]);
201 | }
202 | self.n_match > 0
203 | }
204 |
205 | /// The full match (same as `capture(0)`)
206 | pub fn range(&self) -> ops::Range<usize> {
207 | self.capture(0)
208 | }
209 |
210 | /// Get the nth capture of the match.
211 | ///
212 | /// ```
213 | /// let mut m = lua_patterns::LuaPattern::new("(%a+) one");
214 | /// let text = " hello one two";
215 | /// assert!(m.matches(text));
216 | /// assert_eq!(m.capture(0),1..10);
217 | /// assert_eq!(m.capture(1),1..6);
218 | /// ```
219 | pub fn capture(&self, i: usize) -> ops::Range<usize> {
220 | ops::Range{
221 | start: self.matches[i].start as usize,
222 | end: self.matches[i].end as usize
223 | }
224 | }
225 |
226 | /// Get the 'first' capture of the match
227 | ///
228 | /// If there are no matches, this is the same as `range`,
229 | /// otherwise it's `capture(1)`
230 | pub fn first_capture(&self) -> ops::Range<usize> {
231 | let idx = if self.n_match > 1 {1} else {0};
232 | self.capture(idx)
233 | }
234 |
235 | /// An iterator over all matches in a string.
236 | ///
237 | /// The matches are returned as string slices; if there are no
238 | /// captures the full match is used, otherwise the first capture.
239 | /// That is, this example will also work with the pattern "(%S+)".
240 | ///
241 | /// ```
242 | /// let mut m = lua_patterns::LuaPattern::new("%S+");
243 | /// let split: Vec<_> = m.gmatch("dog cat leopard wolf").collect();
244 | /// assert_eq!(split,&["dog","cat","leopard","wolf"]);
245 | /// ```
246 | pub fn gmatch<'b,'c>(&'c mut self, text: &'b str) -> GMatch<'a,'b,'c> {
247 | GMatch{m: self, text: text}
248 | }
249 |
250 | /// An iterator over all captures in a string.
251 | ///
252 | /// The matches are returned as captures; this is a _streaming_
253 | /// iterator, so don't try to collect the captures directly; extract
254 | /// the string slices using `get`.
255 | ///
256 | /// ```
257 | /// let mut m = lua_patterns::LuaPattern::new("(%S)%S+");
258 | /// let split: Vec<_> = m.gmatch_captures("dog cat leopard wolf")
259 | /// .map(|cc| cc.get(1)).collect();
260 | /// assert_eq!(split,&["d","c","l","w"]);
261 | /// ```
262 | pub fn gmatch_captures<'b,'c>(&'c mut self, text: &'b str) -> GMatchCaptures<'a,'b,'c> {
263 | GMatchCaptures{m: self, text: text}
264 | }
265 |
266 | /// An iterator over all matches in a slice of bytes.
267 | ///
268 | /// ```
269 | /// let bytes = &[0xAA,0x01,0x01,0x03,0xBB,0x01,0x01,0x01];
270 | /// let patt = &[0x01,b'+'];
271 | /// let mut m = lua_patterns::LuaPattern::from_bytes(patt);
272 | /// let mut iter = m.gmatch_bytes(bytes);
273 | /// assert_eq!(iter.next().unwrap(), &[0x01,0x01]);
274 | /// assert_eq!(iter.next().unwrap(), &[0x01,0x01,0x01]);
275 | /// assert_eq!(iter.next(), None);
276 | /// ```
277 | pub fn gmatch_bytes<'b>(&'a mut self, bytes: &'b [u8]) -> GMatchBytes<'a,'b> {
278 | GMatchBytes{m: self, bytes: bytes}
279 | }
280 |
281 | /// Globally substitute all matches with a replacement
282 | /// provided by a function of the captures.
283 | ///
284 | /// ```
285 | /// let mut m = lua_patterns::LuaPattern::new("%$(%S+)");
286 | /// let res = m.gsub_with("hello $dolly you're so $fine!",
287 | /// |cc| cc.get(1).to_uppercase()
288 | /// );
289 | /// assert_eq!(res, "hello DOLLY you're so FINE!");
290 | /// ```
291 | pub fn gsub_with<F>(&mut self, text: &str, lookup: F) -> String
292 | where F: Fn(Captures)-> String {
293 | let mut slice = text;
294 | let mut res = String::new();
295 | while self.matches(slice) {
296 | // full range of match
297 | let all = self.range();
298 | // append everything up to match
299 | res.push_str(&slice[0..all.start]);
300 | let captures = Captures{m: self, text: slice};
301 | let repl = lookup(captures);
302 | res.push_str(&repl);
303 | slice = &slice[all.end..];
304 | }
305 | res.push_str(slice);
306 | res
307 | }
308 |
309 | /// Globally substitute all matches with a replacement string
310 | ///
311 | /// This string _may_ have capture references ("%0",..). Use "%%"
312 | /// to represent "%". Plain strings like "" work just fine ;)
313 | ///
314 | /// ```
315 | /// let mut m = lua_patterns::LuaPattern::new("(%S+)%s*=%s*(%S+);%s*");
316 | /// let res = m.gsub("a=2; b=3; c = 4;", "'%2':%1 ");
317 | /// assert_eq!(res,"'2':a '3':b '4':c ");
318 | /// ```
319 | pub fn gsub (&mut self, text: &str, repl: &str) -> String {
320 | let repl = generate_gsub_patterns(repl);
321 | let mut slice = text;
322 | let mut res = String::new();
323 | while self.matches(slice) {
324 | let all = self.range();
325 | res.push_str(&slice[0..all.start]);
326 | let captures = Captures{m: self, text: slice};
327 | for r in &repl {
328 | match *r {
329 | Subst::Text(ref s) => res.push_str(&s),
330 | Subst::Capture(i) => res.push_str(captures.get(i))
331 | }
332 | }
333 | slice = &slice[all.end..];
334 | }
335 | res.push_str(slice);
336 | res
337 | }
338 |
339 | /// Globally substitute all _byte_ matches with a replacement
340 | /// provided by a function of the captures.
341 | ///
342 | /// ```
343 | /// let bytes = &[0xAA,0x01,0x02,0x03,0xBB];
344 | /// let patt = &[0x01,0x02];
345 | /// let mut m = lua_patterns::LuaPattern::from_bytes(patt);
346 | /// let res = m.gsub_bytes_with(bytes,|cc| vec![0xFF]);
347 | /// assert_eq!(res, &[0xAA,0xFF,0x03,0xBB]);
348 | /// ```
349 | pub fn gsub_bytes_with<F>(&mut self, bytes: &[u8], lookup: F) -> Vec<u8>
350 | where F: Fn(ByteCaptures)-> Vec<u8> {
351 | let mut slice = bytes;
352 | let mut res = Vec::new();
353 | while self.matches_bytes(slice) {
354 | let all = self.range();
355 | let capture = &slice[0..all.start];
356 | res.extend_from_slice(capture);
357 | let captures = ByteCaptures{m: self, bytes: slice};
358 | let repl = lookup(captures);
359 | res.extend(repl);
360 | slice = &slice[all.end..];
361 | }
362 | res.extend_from_slice(slice);
363 | res
364 | }
365 |
366 | }
367 |
368 | #[derive(Debug)]
369 | pub enum Subst {
370 | Text(String),
371 | Capture(usize)
372 | }
373 |
374 | impl Subst {
375 | fn new_text(text: &str) -> Subst {
376 | Subst::Text(text.to_string())
377 | }
378 | }
379 |
380 | pub fn generate_gsub_patterns(repl: &str) -> Vec<Subst> {
381 | let mut m = LuaPattern::new("%%([%%%d])");
382 | let mut res = Vec::new();
383 | let mut slice = repl;
384 | while m.matches(slice) {
385 | let all = m.range();
386 | let before = &slice[0..all.start];
387 | if before != "" {
388 | res.push(Subst::new_text(before));
389 | }
390 | let capture = &slice[m.capture(1)];
391 | if capture == "%" { // escaped literal '%'
392 | res.push(Subst::new_text("%"));
393 | } else { // has to be a digit
394 | let index: usize = capture.parse().unwrap();
395 | res.push(Subst::Capture(index));
396 | }
397 | slice = &slice[all.end..];
398 | }
399 | res.push(Subst::new_text(slice));
400 | res
401 | }
402 |
403 | pub struct Substitute {
404 | repl: Vec<Subst>
405 | }
406 |
407 | impl Substitute {
408 | pub fn new(repl: &str) -> Substitute {
409 | Substitute{
410 | repl: generate_gsub_patterns(repl)
411 | }
412 | }
413 |
414 | pub fn subst(&self, patt: &LuaPattern, text: &str) -> String {
415 | let mut res = String::new();
416 | let captures = patt.match_captures(text);
417 | for r in &self.repl {
418 | match *r {
419 | Subst::Text(ref s) => res.push_str(&s),
420 | Subst::Capture(i) => res.push_str(captures.get(i))
421 | }
422 | }
423 | res
424 | }
425 |
426 | }
427 |
428 |
429 |
430 | /// Low-overhead convenient access to string match captures
431 | // note: there are three borrows going on here.
432 | // The lifetime 'a is for the _pattern_, the lifetime 'b is
433 | // for the _source string_, and 'c is for the reference to LuaPattern
434 | // And the LuaPattern reference cannot live longer than the pattern reference
435 | pub struct Captures<'a,'b,'c> where 'a: 'c {
436 | m: &'c LuaPattern<'a>,
437 | text: &'b str
438 | }
439 |
440 | impl <'a,'b,'c> Captures<'a,'b,'c> {
441 | /// get the capture as a string slice
442 | pub fn get(&self, i: usize) -> &'b str {
443 | &self.text[self.m.capture(i)]
444 | }
445 |
446 | /// number of matches
447 | pub fn num_matches(&self) -> usize {
448 | self.m.n_match
449 | }
450 | }
451 |
452 |
453 | /// Low-overhead convenient access to byte match captures
454 | pub struct ByteCaptures<'a,'b> {
455 | m: &'a LuaPattern<'a>,
456 | bytes: &'b [u8]
457 | }
458 |
459 | impl <'a,'b> ByteCaptures<'a,'b> {
460 | /// get the capture as a byte slice
461 | pub fn get(&self, i: usize) -> &'b [u8] {
462 | &self.bytes[self.m.capture(i)]
463 | }
464 |
465 | /// number of matches
466 | pub fn num_matches(&self) -> usize {
467 | self.m.n_match
468 | }
469 | }
470 |
471 | /// Iterator for all string slices from `gmatch`
472 | // note lifetimes as for Captures above!
473 | pub struct GMatch<'a,'b,'c> where 'a: 'c {
474 | m: &'c mut LuaPattern<'a>,
475 | text: &'b str
476 | }
477 |
478 | impl <'a,'b,'c>Iterator for GMatch<'a,'b,'c> {
479 | type Item = &'b str;
480 |
481 | fn next(&mut self) -> Option<Self::Item> {
482 | if ! self.m.matches(self.text) {
483 | None
484 | } else {
485 | let slice = &self.text[self.m.first_capture()];
486 | self.text = &self.text[self.m.range().end..];
487 | Some(slice)
488 | }
489 | }
490 |
491 | }
492 |
493 | /// Unsafe version of Captures, needed for gmatch_captures
494 | // It's unsafe because the lifetime only depends on the original
495 | // text, not the borrowed matches.
496 | pub struct CapturesUnsafe<'b>{
497 | matches: *const LuaMatch,
498 | text: &'b str
499 | }
500 |
501 | impl <'b> CapturesUnsafe<'b> {
502 | /// get the capture as a string slice
503 | pub fn get(&self, i: usize) -> &'b str {
504 | unsafe {
505 | let p = self.matches.offset(i as isize);
506 | let range =
507 | ops::Range{
508 | start: (*p).start as usize,
509 | end: (*p).end as usize
510 | };
511 | &self.text[range]
512 | }
513 | }
514 | }
515 |
516 | /// Streaming iterator for all captures from `gmatch_captures`
517 | // lifetimes as for Captures above!
518 | // 'a is pattern, 'b is text, 'c is ref to LuaPattern
519 | pub struct GMatchCaptures<'a,'b,'c> where 'a: 'c {
520 | m: &'c mut LuaPattern<'a>,
521 | text: &'b str
522 | }
523 |
524 | impl <'a,'b,'c> Iterator for GMatchCaptures<'a,'b,'c> where 'a: 'c {
525 | type Item = CapturesUnsafe<'b>;
526 |
527 | fn next(&mut self) -> Option<Self::Item> {
528 | if ! self.m.matches(self.text) {
529 | None
530 | } else {
531 | let split = self.text.split_at(self.m.range().end);
532 | self.text = split.1;
533 | let match_ptr: *const LuaMatch = self.m.matches.as_ptr();
534 | Some(CapturesUnsafe{matches: match_ptr, text: split.0})
535 | }
536 | }
537 |
538 | }
539 |
540 | /// Iterator for all byte slices from `gmatch_bytes`
541 | pub struct GMatchBytes<'a,'b> {
542 | m: &'a mut LuaPattern<'a>,
543 | bytes: &'b [u8]
544 | }
545 |
546 | impl <'a,'b>Iterator for GMatchBytes<'a,'b> {
547 | type Item = &'b [u8];
548 |
549 | fn next(&mut self) -> Option<Self::Item> {
550 | if ! self.m.matches_bytes(self.bytes) {
551 | None
552 | } else {
553 | let slice = &self.bytes[self.m.first_capture()];
554 | self.bytes = &self.bytes[self.m.range().end..];
555 | Some(slice)
556 | }
557 | }
558 |
559 | }
560 |
561 | /// Build a byte Lua pattern, optionally escaping 'magic' characters
562 | pub struct LuaPatternBuilder {
563 | bytes: Vec<u8>
564 | }
565 |
566 | impl LuaPatternBuilder {
567 | /// Create a new Lua pattern builder
568 | pub fn new() -> LuaPatternBuilder {
569 | LuaPatternBuilder{bytes: Vec::new()}
570 | }
571 |
572 | /// Add unescaped characters from a string
573 | ///
574 | /// ```
575 | /// let patt = lua_patterns::LuaPatternBuilder::new()
576 | /// .text("(boo)")
577 | /// .build();
578 | /// assert_eq!(std::str::from_utf8(&patt).unwrap(), "(boo)");
579 | /// ```
580 | pub fn text(&mut self, s: &str) -> &mut Self {
581 | self.bytes.extend_from_slice(s.as_bytes());
582 | self
583 | }
584 |
585 | /// Add unescaped characters from lines
586 | ///
587 | /// This looks for first non-whitespace run in each line,
588 | /// useful for spreading patterns out and commenting them.
589 | /// Works with patterns that use '%s' religiously!
590 | ///
591 | /// ```
592 | /// let patt = lua_patterns::LuaPatternBuilder::new()
593 | /// .text_lines("
594 | /// hello-dolly
595 | /// you-are-fine # comment
596 | /// cool
597 | /// ")
598 | /// .build();
599 | /// assert_eq!(std::str::from_utf8(&patt).unwrap(),
600 | /// "hello-dollyyou-are-finecool");
601 | /// ```
602 | pub fn text_lines(&mut self, lines: &str) -> &mut Self {
603 | let mut text = String::new();
604 | for line in lines.lines() {
605 | if let Some(first) = line.split_whitespace().next() {
606 | text.push_str(first);
607 | }
608 | }
609 | self.text(&text)
610 | }
611 |
612 | /// Add escaped bytes from a slice
613 | ///
614 | /// ```
615 | /// let patt = lua_patterns::LuaPatternBuilder::new()
616 | /// .text("^")
617 | /// .bytes(b"^") // magic character!
618 | /// .build();
619 | /// assert_eq!(std::str::from_utf8(&patt).unwrap(), "^%^");
620 | /// ```
621 | pub fn bytes(&mut self, b: &[u8]) -> &mut Self {
622 | let mut m = LuaPattern::new("[%-%.%+%[%]%(%)%$%^%%%?%*]");
623 | let bb = m.gsub_bytes_with(b,|cc| {
624 | let mut res = Vec::new();
625 | res.push(b'%');
626 | res.push(cc.get(0)[0]);
627 | res
628 | });
629 | self.bytes.extend(bb);
630 | self
631 | }
632 |
633 | /// Add escaped bytes from hex string
634 | ///
635 | /// This consists of adjacent pairs of hex digits.
636 | ///
637 | /// ```
638 | /// let patt = lua_patterns::LuaPatternBuilder::new()
639 | /// .text("^")
640 | /// .bytes_as_hex("5E") // which is ASCII '^'
641 | /// .build();
642 | /// assert_eq!(std::str::from_utf8(&patt).unwrap(), "^%^");
643 | /// ```
644 | pub fn bytes_as_hex(&mut self, bs: &str) -> &mut Self {
645 | let bb = LuaPatternBuilder::hex_to_bytes(bs);
646 | self.bytes(&bb)
647 | }
648 |
649 | /// Create the pattern
650 | pub fn build(&mut self) -> Vec<u8> {
651 | let mut v = Vec::new();
652 | std::mem::swap(&mut self.bytes, &mut v);
653 | v
654 | }
655 |
656 | /// Utility to create a vector of bytes from a hex string
657 | ///
658 | /// ```
659 | /// let bb = lua_patterns::LuaPatternBuilder::hex_to_bytes("AEFE00FE");
660 | /// assert_eq!(bb, &[0xAE,0xFE,0x00,0xFE]);
661 | /// ```
662 | pub fn hex_to_bytes(s: &str) -> Vec<u8> {
663 | let mut m = LuaPattern::new("%x%x");
664 | m.gmatch(s).map(|pair| u8::from_str_radix(pair,16).unwrap()).collect()
665 | }
666 |
667 | /// Utility to create a hex string from a slice of bytes
668 | ///
669 | /// ```
670 | /// let hex = lua_patterns::LuaPatternBuilder::bytes_to_hex(&[0xAE,0xFE,0x00,0xFE]);
671 | /// assert_eq!(hex,"AEFE00FE");
672 | ///
673 | /// ```
674 | pub fn bytes_to_hex(s: &[u8]) -> String {
675 | s.iter().map(|b| format!("{:02X}",b)).collect()
676 | }
677 |
678 | }
679 |
680 | #[cfg(test)]
681 | mod tests {
682 | use super::*;
683 |
684 | #[test]
685 | fn captures_and_matching() {
686 | let mut m = LuaPattern::new("(one).+");
687 | assert_eq!(m.captures(" one two"), &["one two","one"]);
688 | let empty: &[&str] = &[];
689 | assert_eq!(m.captures("four"), empty);
690 |
691 | assert_eq!(m.matches("one dog"), true);
692 | assert_eq!(m.matches("dog one "), true);
693 | assert_eq!(m.matches("dog one"), false);
694 |
695 | let text = "one dog";
696 | let mut m = LuaPattern::new("^(%a+)");
697 | assert_eq!(m.matches(text), true);
698 | assert_eq!(&text[m.capture(1)], "one");
699 | assert_eq!(m.matches(" one dog"), false);
700 |
701 | // captures without allocation
702 | m.matches(text);
703 | let captures = m.match_captures(text);
704 | assert_eq!(captures.get(0), "one");
705 | assert_eq!(captures.get(1), "one");
706 |
707 | let mut m = LuaPattern::new("(%S+)%s*=%s*(.+)");
708 |
709 | // captures as Vec
710 | let cc = m.captures(" hello= bonzo dog");
711 | assert_eq!(cc[0], "hello= bonzo dog");
712 | assert_eq!(cc[1], "hello");
713 | assert_eq!(cc[2], "bonzo dog");
714 |
715 | }
716 |
717 | #[test]
718 | fn multiple_captures() {
719 | let mut p = LuaPattern::new("%s*(%d+)%s+(%S+)");
720 | let (int,rest) = p.match_maybe_2(" 233 hello dolly").unwrap();
721 | assert_eq!(int,"233");
722 | assert_eq!(rest,"hello");
723 | }
724 |
725 | #[test]
726 | fn gmatch() {
727 | let mut m = LuaPattern::new("%a+");
728 | let mut iter = m.gmatch("one two three");
729 | assert_eq!(iter.next(), Some("one"));
730 | assert_eq!(iter.next(), Some("two"));
731 | assert_eq!(iter.next(), Some("three"));
732 | assert_eq!(iter.next(), None);
733 |
734 | let mut m = LuaPattern::new("(%a+)");
735 | let mut iter = m.gmatch("one two three");
736 | assert_eq!(iter.next(), Some("one"));
737 | assert_eq!(iter.next(), Some("two"));
738 | assert_eq!(iter.next(), Some("three"));
739 | assert_eq!(iter.next(), None);
740 |
741 | let mut m = LuaPattern::new("(%a+)");
742 | let mut iter = m.gmatch_captures("one two three");
743 | assert_eq!(iter.next().unwrap().get(1), "one");
744 | assert_eq!(iter.next().unwrap().get(1), "two");
745 | assert_eq!(iter.next().unwrap().get(1), "three");
746 | }
747 |
748 | #[test]
749 | fn gsub() {
750 | use std::collections::HashMap;
751 |
752 | let mut m = LuaPattern::new("%$(%S+)");
753 | let res = m.gsub_with("hello $dolly you're so $fine!",
754 | |cc| cc.get(1).to_uppercase()
755 | );
756 | assert_eq!(res, "hello DOLLY you're so FINE!");
757 |
758 | let mut map = HashMap::new();
759 | map.insert("dolly", "baby");
760 | map.insert("fine", "cool");
761 | map.insert("good-looking", "pretty");
762 |
763 | let mut m = LuaPattern::new("%$%((.-)%)");
764 | let res = m.gsub_with("hello $(dolly) you're so $(fine) and $(good-looking)",
765 | |cc| map.get(cc.get(1)).unwrap_or(&"?").to_string()
766 | );
767 | assert_eq!(res, "hello baby you're so cool and pretty");
768 |
769 | let mut m = LuaPattern::new("%s+");
770 | let res = m.gsub("hello dolly you're so fine","");
771 | assert_eq!(res, "hellodollyyou'resofine");
772 |
773 | let mut m = LuaPattern::new("(%S+)%s*=%s*(%S+);%s*");
774 | let res = m.gsub("a=2; b=3; c = 4;", "'%2':%1 ");
775 | assert_eq!(res,"'2':a '3':b '4':c ");
776 | }
777 |
778 | #[test]
779 | fn bad_patterns() {
780 | let bad = [
781 | ("bonzo %","malformed pattern (ends with '%')"),
782 | ("bonzo (dog%(","unfinished capture"),
783 | ("alles [%a%[","malformed pattern (missing ']')"),
784 | ("bonzo (dog (cat)","unfinished capture"),
785 | ("frodo %f[%A","malformed pattern (missing ']')"),
786 | ("frodo (1) (2(3)%2)%1","invalid capture index %2"),
787 | ];
788 | for p in bad.iter() {
789 | let res = LuaPattern::new_try(p.0);
790 | if let Err(e) = res {
791 | assert_eq!(e, PatternError(p.1.into()));
792 | } else {
793 | panic!("false positive");
794 | }
795 | }
796 | }
797 | }
798 |
--------------------------------------------------------------------------------
/lua-patterns/src/luapat.rs:
--------------------------------------------------------------------------------
1 | // translation of Lua 5.2 string pattern code
2 |
3 | use errors::*;
4 | use std::ptr::null;
5 |
6 | pub const LUA_MAXCAPTURES: usize = 32;
7 | /* maximum recursion depth for 'match' */
8 | const MAXCCALLS: usize = 200;
9 |
10 | const L_ESC: u8 = b'%';
11 |
12 | fn add(p: CPtr, count: usize) -> CPtr {
13 | unsafe {p.offset(count as isize)}
14 | }
15 |
16 | fn sub(p: CPtr, count: usize) -> CPtr {
17 | unsafe {p.offset(-(count as isize))}
18 | }
19 |
20 | fn next(p: CPtr) -> CPtr {
21 | add(p, 1)
22 | }
23 |
24 | fn at(p: CPtr) -> u8 {
25 | unsafe { *p }
26 | }
27 |
28 | fn diff(p1: CPtr, p2: CPtr) -> usize {
29 | let d = (p1 as isize).wrapping_sub(p2 as isize);
30 | d as usize
31 | }
32 |
33 | #[derive(Copy,Clone,Debug)]
34 | pub struct LuaMatch {
35 | pub start: usize,
36 | pub end: usize,
37 | }
38 |
39 | #[derive(Copy,Clone)]
40 | enum CapLen {
41 | Len(usize),
42 | Unfinished,
43 | Position,
44 | }
45 |
46 | impl CapLen {
47 | fn is_unfinished(&self) -> bool {
48 | match *self {
49 | CapLen::Unfinished => true,
50 | _ => false
51 | }
52 | }
53 |
54 | fn size(&self) -> Result<usize> {
55 | match *self {
56 | CapLen::Len(size) => Ok(size),
57 | _ => error("capture was unfinished or positional")
58 | }
59 | }
60 |
61 | }
62 |
63 | type CPtr = *const u8;
64 |
65 | #[derive(Copy,Clone)]
66 | struct Capture {
67 | init: CPtr,
68 | len: CapLen,
69 | }
70 |
71 | impl Capture {
72 | fn is_unfinished(&self) -> bool {
73 | self.len.is_unfinished()
74 | }
75 | }
76 |
77 | use std::result;
78 |
79 | type Result<T> = result::Result<T,PatternError>;
80 |
81 | fn error<T>(msg: &str) -> Result<T> {
82 | Err(PatternError(msg.into()))
83 | }
84 |
85 | struct MatchState {
86 | matchdepth: usize, /* control for recursive depth (to avoid stack overflow) */
87 | src_init: CPtr, /* init of source string */
88 | src_end: CPtr, /* end ('\0') of source string */
89 | p_end: CPtr, /* end ('\0') of pattern */
90 | level: usize, /* total number of captures (finished or unfinished) */
91 | capture: [Capture; LUA_MAXCAPTURES],
92 | }
93 |
94 | impl MatchState {
95 | fn new(s: CPtr, se: CPtr, pe: CPtr) -> MatchState {
96 | MatchState {
97 | matchdepth: MAXCCALLS,
98 | src_init: s,
99 | src_end: se,
100 | p_end: pe,
101 | level: 0,
102 | capture: [Capture{init: null(), len: CapLen::Len(0) }; LUA_MAXCAPTURES],
103 | }
104 | }
105 |
106 | fn check_capture(&self, l: usize) -> Result<usize> {
107 | let l = l as i8 - b'1' as i8;
108 | if l < 0 || l as usize >= self.level || self.capture[l as usize].is_unfinished() {
109 | return error(&format!("invalid capture index %{}", l + 1));
110 | }
111 | Ok(l as usize)
112 | }
113 |
114 | fn capture_to_close(&self) -> Result<usize> {
115 | let mut level = (self.level - 1) as isize;
116 | while level >= 0 {
117 | if self.capture[level as usize].is_unfinished() {
118 | return Ok(level as usize);
119 | }
120 | level -= 1;
121 | }
122 | error("invalid pattern capture")
123 | }
124 |
125 | fn classend (&self, p: CPtr) -> Result<CPtr> {
126 | let ch = at(p);
127 | let mut next_p = next(p);
128 | Ok(match ch {
129 | L_ESC => {
130 | if next_p == self.p_end {
131 | return error("malformed pattern (ends with '%')");
132 | }
133 | next(next_p)
134 | },
135 | b'[' => {
136 | if at(next_p) == b'^' {
137 | next_p = next(next_p);
138 | }
139 | while at(next_p) != b']' {
140 | if next_p == self.p_end {
141 | return error("malformed pattern (missing ']')");
142 | }
143 | let ch = at(next_p);
144 | next_p = next(next_p);
145 | if ch == L_ESC && p < self.p_end {
146 | next_p = next(next_p); /* skip escapes (e.g. `%]') */
147 | }
148 | }
149 | next(next_p)
150 | },
151 | _ => next_p
152 | })
153 | }
154 |
155 | }
156 |
157 | fn match_class (ch: u8, class: u8) -> bool {
158 | let res = match class.to_ascii_lowercase() {
159 | b'a' => ch.is_ascii_alphabetic(),
160 | b'c' => ch.is_ascii_control(),
161 | b'd' => ch.is_ascii_digit(),
162 | b'g' => ch.is_ascii_graphic(),
163 | b'l' => ch.is_ascii_lowercase(),
164 | b'p' => ch.is_ascii_punctuation(),
165 | b's' => ch.is_ascii_whitespace(),
166 | b'u' => ch.is_ascii_uppercase(),
167 | b'w' => ch.is_ascii_alphanumeric(),
168 | b'x' => ch.is_ascii_hexdigit(),
169 | lc => return lc == ch,
170 | };
171 | if class.is_ascii_lowercase() { res } else {! res}
172 | }
173 |
174 |
175 | fn matchbracketclass (c: u8, p: CPtr, ec: CPtr) -> bool {
176 | let mut p = p;
177 | // [^ inverts match
178 | let sig = if at(next(p)) == b'^' {
179 | p = next(p);
180 | false
181 | } else {
182 | true
183 | };
184 | p = next(p);
185 | while p < ec {
186 | if at(p) == L_ESC { // e.g %s
187 | p = next(p);
188 | if match_class(c, at(p)) {
189 | return sig;
190 | }
191 | } else
192 | // e.g a-z
193 | if at(next(p)) == b'-' && add(p,2) < ec {
194 | let lastc = at(p);
195 | p = add(p,2);
196 | if lastc <= c && c <= at(p) {
197 | return sig;
198 | }
199 | } else
200 | if at(p) == c {
201 | return sig;
202 | }
203 | p = next(p);
204 | }
205 | return ! sig;
206 | }
207 |
208 | impl MatchState {
209 |
210 | fn singlematch (&self, s: CPtr, p: CPtr, ep: CPtr) -> bool {
211 | if s >= self.src_end {
212 | return false;
213 | }
214 | let c = at(s);
215 | let pc = at(p);
216 | match pc {
217 | b'.' => true, /* matches any char */
218 | L_ESC => match_class(c, at(next(p))),
219 | b'[' => matchbracketclass(c, p, sub(ep,1)),
220 | _ => c == pc
221 | }
222 | }
223 |
224 | fn matchbalance (&self, s: CPtr, p: CPtr) -> Result<CPtr> {
225 | if p >= sub(self.p_end,1) {
226 | return error("malformed pattern (missing arguments to '%b')");
227 | }
228 | if at(s) != at(p) {
229 | return Ok(null());
230 | }
231 | // e.g. %b()
232 | let b = at(p);
233 | let e = at(next(p));
234 | let mut cont = 1;
235 | let mut s = next(s);
236 | while s < self.src_end {
237 | let ch = at(s);
238 | if ch == e {
239 | cont -= 1;
240 | if cont == 0 {
241 | return Ok(next(s));
242 | }
243 | } else
244 | if ch == b {
245 | cont += 1;
246 | }
247 | s = next(s);
248 | }
249 | Ok(null()) /* string ends out of balance */
250 | }
251 |
252 | fn max_expand(&mut self, s: CPtr, p: CPtr, ep: CPtr) -> Result<CPtr> {
253 | let mut i = 0isize; /* counts maximum expand for item */
254 | while self.singlematch(add(s,i as usize),p,ep) {
255 | i += 1;
256 | }
257 | /* keeps trying to match with the maximum repetitions */
258 | while i >= 0 {
259 | let res = self.patt_match(add(s,i as usize),next(ep))?;
260 | if ! res.is_null() {
261 | return Ok(res);
262 | }
263 | i -= 1; /* else didn't match; reduce 1 repetition to try again */
264 | }
265 | Ok(null())
266 | }
267 |
268 | fn min_expand(&mut self, s: CPtr, p: CPtr, ep: CPtr) -> Result<CPtr> {
269 | let mut s = s;
270 | loop {
271 | let res = self.patt_match(s,next(ep))?;
272 | if ! res.is_null() {
273 | return Ok(res);
274 | } else
275 | if self.singlematch(s, p, ep) {
276 | s = next(s);
277 | } else {
278 | return Ok(null());
279 | }
280 | }
281 | }
282 |
283 | fn start_capture(&mut self, s: CPtr, p: CPtr, what: CapLen) -> Result<CPtr> {
284 | let level = self.level;
285 | if level >= LUA_MAXCAPTURES {
286 | return error("too many captures");
287 | }
288 | self.capture[level].init = s;
289 | self.capture[level].len = what;
290 | self.level = level + 1;
291 | let res = self.patt_match(s, p)?;
292 | if res.is_null() { /* match failed? */
293 | self.level -= 1; /* undo capture */
294 | }
295 | Ok(res)
296 | }
297 |
298 | fn end_capture(&mut self, s: CPtr, p: CPtr) -> Result<CPtr> {
299 | let l = self.capture_to_close()?;
300 | self.capture[l].len = CapLen::Len(diff(s,self.capture[l].init)); /* close capture */
301 | let res = self.patt_match(s, p)?;
302 | if res.is_null() { /* match failed? */
303 | self.capture[l].len = CapLen::Unfinished;
304 | }
305 | Ok(res)
306 | }
307 |
308 | fn match_capture(&mut self, s: CPtr, l: usize) -> Result<CPtr> {
309 | let l = self.check_capture(l)?;
310 | let len = self.capture[l].len.size()?;
311 | if diff(self.src_end, s) >= len {
312 | unsafe {s.copy_to_nonoverlapping(self.capture[l].init as *mut u8, len);}
313 | return Ok(add(s,len));
314 | }
315 | Ok(null())
316 | }
317 |
318 |
319 | fn patt_match(&mut self, s: CPtr, p: CPtr) -> Result<CPtr> {
320 | let mut s = s;
321 | let mut p = p;
322 | self.matchdepth -= 1;
323 | if self.matchdepth == 0 {
324 | return error("pattern too complex");
325 | }
326 |
327 | if p == self.p_end { /* end of pattern? */
328 | self.matchdepth += 1;
329 | return Ok(s);
330 | }
331 | match at(p) {
332 | b'(' => { /* start capture */
333 | if at(next(p)) == b')' { /* position capture? */
334 | s = self.start_capture(s, add(p,2), CapLen::Position)?;
335 | } else {
336 | s = self.start_capture(s, next(p), CapLen::Unfinished)?;
337 | }
338 | },
339 | b')' => { /* end capture */
340 | s = self.end_capture(s, next(p))?;
341 | },
342 | b'$' => {
343 | if next(p) != self.p_end { /* is the `$' the last char in pattern? */
344 | /* no; go to default */
345 | return self.patt_default_match(s, p);
346 | }
347 | s = if s == self.src_end {s} else {null()}; /* check end of string */
348 | }
349 | L_ESC => { /* escaped sequences not in the format class[*+?-]? */
350 | match at(next(p)) {
351 | b'b' => { /* balanced string? */
352 | s = self.matchbalance(s, add(p,2))?;
353 | if ! s.is_null() {
354 | // e.g, after %b()
355 | return self.patt_match(s, add(p,4));
356 | }
357 | },
358 | b'f' => { /* frontier? */
359 | p = add(p,2);
360 | if at(p) != b'[' {
361 | return error("missing '[' after '%f' in pattern");
362 | }
363 | let ep = self.classend(p)?; /* points to what is next */
364 | let previous = if s == self.src_init {b'\0'} else {at(sub(s,1))};
365 | let epl = sub(ep,1);
366 | if ! matchbracketclass(previous,p,epl)
367 | && matchbracketclass(at(s),p,epl) {
368 | return self.patt_match(s, ep);
369 | }
370 | s = null(); /* match failed */
371 | },
372 | b'0'..=b'9' => { /* capture results (%0-%9)? */
373 | s = self.match_capture(s,at(next(p)) as usize)?;
374 | if ! s.is_null() {
375 | return self.patt_match(s, add(p,2));
376 | }
377 | },
378 | _ => return self.patt_default_match(s, p)
379 | }
380 |
381 | },
382 | _ => return self.patt_default_match(s, p)
383 |
384 | }
385 | self.matchdepth += 1;
386 | Ok(s)
387 | }
388 |
389 | fn patt_default_match(&mut self, s: CPtr, p: CPtr) -> Result<CPtr> {
390 | let mut s = s;
391 | /* pattern class plus optional suffix */
392 | let ep = self.classend(p)?; /* points to optional suffix */
393 | let epc = if ep == self.p_end { 0 } else { at(ep) };
394 | /* does not match at least once? */
395 | if ! self.singlematch(s, p, ep) {
396 | if epc == b'*' || epc == b'?' || epc == b'-' { /* accept empty? */
397 | return self.patt_match(s, next(ep));
398 | } else { /* '+' or no suffix */
399 | s = null(); /* fail */
400 | }
401 | } else { /* matched once */
402 | match at(ep) { /* handle optional suffix */
403 | b'?' => {
404 | let res = self.patt_match(next(s),next(ep))?;
405 | if ! res.is_null() {
406 | s = res;
407 | } else {
408 | return self.patt_match(s, next(ep));
409 | }
410 | },
411 | b'+' => { /* 1 or more repetitions */
412 | s = next(s);
413 | s = self.max_expand(s, p, ep)?;
414 | },
415 | b'*' => { /* 0 or more repetitions */
416 | s = self.max_expand(s, p, ep)?;
417 | },
418 | b'-' => { /* 0 or more repetitions (minimum) */
419 | s = self.min_expand(s, p, ep)? ;
420 | },
421 | _ => { /* no suffix */
422 | return self.patt_match(next(s),ep);
423 | }
424 | }
425 | }
426 | self.matchdepth += 1;
427 | Ok(s)
428 | }
429 |
430 | fn push_onecapture(&mut self, i: usize, s: CPtr, e: CPtr, mm: &mut [LuaMatch]) -> Result<()> {
431 | if i >= self.level {
432 | if i == 0 { /* ms->level == 0, too */
433 | mm[0].start = 0;
434 | mm[0].end = diff(e,s);
435 | Ok(())
436 | } else {
437 | return error("invalid capture index");
438 | }
439 | } else {
440 | let init = self.capture[i].init;
441 | match self.capture[i].len {
442 | CapLen::Unfinished => error("unfinished capture"),
443 | CapLen::Position => {
444 | mm[i].start = diff(init,next(self.src_init));
445 | mm[i].end = mm[i].start;
446 | Ok(())
447 | },
448 | CapLen::Len(l) => {
449 | mm[i].start = diff(init,self.src_init);
450 | mm[i].end = mm[i].start + l;
451 | Ok(())
452 | }
453 | }
454 | }
455 |
456 | }
457 |
458 | fn push_captures(&mut self, s: CPtr, e: CPtr, mm: &mut [LuaMatch]) -> Result<usize> {
459 | let nlevels = if self.level == 0 && ! s.is_null() {1} else {self.level};
460 | for i in 0..nlevels {
461 | self.push_onecapture(i, s, e, mm)?;
462 | }
463 | Ok(nlevels) /* number of strings pushed */
464 | }
465 |
466 | pub fn str_match_check(&mut self, p: CPtr) -> Result<()> {
467 | let mut level_stack = [0; LUA_MAXCAPTURES];
468 | let mut stack_idx = 0;
469 | let mut p = p;
470 | while p < self.p_end {
471 | let ch = at(p);
472 | p = next(p);
473 | match ch {
474 | L_ESC => {
475 | //p = next(p);
476 | let c = at(p);
477 | match c {
478 | b'b' => {
479 | p = next(p);
480 | if p >= self.p_end {
481 | return error("malformed pattern (missing arguments to '%b')");
482 | }
483 | },
484 | b'f' => {
485 | p = next(p);
486 | if at(p) != b'[' {
487 | return error("missing '[' after '%f' in pattern");
488 | }
489 | p = sub(p,1); // so we see [...]
490 | },
491 | b'0' ..= b'9' => {
492 | let l = (c as i8) - (b'1' as i8);
493 | println!("level {}", self.level);
494 | if l < 0 || l as usize >= self.level || self.capture[l as usize].is_unfinished() {
495 | return error(&format!("invalid capture index %{}", l + 1));
496 | }
497 | p = sub(p,1);
498 | },
499 | _ => {}
500 | }
501 | },
502 | b'[' => {
503 | while at(p) != b']' {
504 | if p == self.p_end {
505 | return error("malformed pattern (missing ']')");
506 | }
507 | if at(p) == L_ESC && p < self.p_end {
508 | p = next(p);
509 | }
510 | p = next(p);
511 | }
512 | },
513 | b'(' => {
514 | if at(p) != b')' { // not a position capture
515 | level_stack[stack_idx] = self.level;
516 | stack_idx += 1;
517 | self.capture[self.level].len = CapLen::Unfinished;
518 | self.level += 1;
519 | if self.level >= LUA_MAXCAPTURES {
520 | return error("too many captures");
521 | }
522 | } else {
523 | p = next(p);
524 | }
525 | },
526 | b')' => {
527 | if stack_idx == 0 {
528 | return error("no open capture");
529 | }
530 | stack_idx -= 1;
531 | self.capture[level_stack[stack_idx]].len = CapLen::Position;
532 | },
533 | _ => {}
534 | }
535 | }
536 | if stack_idx > 0 {
537 | return error("unfinished capture");
538 | }
539 | Ok(())
540 | }
541 | }
542 |
543 | pub fn str_match(s: &[u8], p: &[u8], mm: &mut [LuaMatch]) -> Result<usize> {
544 | let mut lp = p.len();
545 | let mut p = p.as_ptr();
546 | let ls = s.len();
547 | let s = s.as_ptr();
548 | let mut s1 = s;
549 | let anchor = at(p) == b'^';
550 | if anchor {
551 | p = next(p);
552 | lp -= 1; /* skip anchor character */
553 | }
554 |
555 | let mut ms = MatchState::new(s,add(s,ls),add(p,lp));
556 | loop {
557 | let res = ms.patt_match(s1, p)?;
558 | if ! res.is_null() {
559 | mm[0].start = diff(s1,s); /* start */
560 | mm[0].end = diff(res,s); /* end */
561 | return Ok(ms.push_captures(null(),null(),&mut mm[1..])? + 1);
562 | }
563 | s1 = next(s1);
564 | if ! (s1 < ms.src_end && ! anchor) {
565 | break;
566 | }
567 | }
568 | Ok(0)
569 | }
570 |
571 | pub fn str_check(p: &[u8]) -> Result<()> {
572 | let mut lp = p.len();
573 | let mut p = p.as_ptr();
574 | let anchor = at(p) == b'^';
575 | if anchor {
576 | p = next(p);
577 | lp -= 1; /* skip anchor character */
578 | }
579 | let mut ms = MatchState::new(null(),null(),add(p,lp));
580 | if at(sub(ms.p_end,1)) == b'%' {
581 | return error("malformed pattern (ends with '%')");
582 | }
583 | ms.str_match_check(p)?;
584 | Ok(())
585 | }
586 |
587 | /*
588 | fn check(s: &[u8], p: &[u8]) {
589 | if let Err(e) = str_check(p) {
590 | println!("check error {}",e);
591 | return;
592 | }
593 |
594 | let mut matches = [LuaMatch{start: 0, end: 0}; 10];
595 | match str_match(s, p, &mut matches) {
596 | Ok(n) => {
597 | println!("ok {} matches", n);
598 | for i in 0..n {
599 | println!("match {:?} {:?}",
600 | matches[i],
601 | String::from_utf8(s[matches[i].start .. matches[i].end].to_vec())
602 | );
603 | }
604 | },
605 | Err(e) => {
606 | println!("error: {}", e)
607 | }
608 | }
609 | }
610 |
611 |
612 |
613 | fn main() {
614 | let mut args = std::env::args().skip(1);
615 | let pat = args.next().unwrap();
616 | let s = args.next().unwrap();
617 | check(s.as_bytes(), pat.as_bytes());
618 |
619 | //~ check(b"hello",b"%a");
620 | //~ check(b"0hello",b"%a+");
621 | //~ check(b"hello",b"%l(%a)");
622 | //check(b"hello",b"he(l+)");
623 | //check(b"k {and {so}}",b"k%s+(%b{})");
624 | }
625 | */
626 |
--------------------------------------------------------------------------------
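A minimal usage sketch for lua-patterns/src/luapat.rs above, mirroring the commented-out `check` helper at the end of that file. The `demo` function name and the sample pattern are illustrative only; `str_check`, `str_match` and `LuaMatch` are the items defined in the listing.

  fn demo() {
    let subject = b"key = value";
    let pattern = b"^(%a+)%s*=%s*(%a+)";
    // validate the pattern first, since str_match assumes a well-formed pattern
    if str_check(pattern).is_ok() {
      let mut caps = [LuaMatch { start: 0, end: 0 }; 10];
      if let Ok(n) = str_match(subject, pattern, &mut caps) {
        // caps[0] is the whole match, caps[1..n] are the captures
        for m in &caps[..n] {
          println!("{:?}", std::str::from_utf8(&subject[m.start..m.end]));
        }
      }
    }
  }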
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | reorder_modules = false
2 | use_small_heuristics = "Max"
3 | tab_spaces = 2
4 |
--------------------------------------------------------------------------------
/src/annot.rs:
--------------------------------------------------------------------------------
1 | mod generated;
2 |
3 | use std::fmt;
4 |
5 | pub(crate) use self::generated::*;
6 |
7 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
8 | pub(crate) enum Annot {
9 | Atom(Atom),
10 | Add(Comp),
11 | Sub(Comp),
12 | }
13 |
14 | impl PartialEq<Atom> for Annot {
15 | fn eq(&self, other: &Atom) -> bool {
16 | match self {
17 | Annot::Atom(it) => it == other,
18 | _ => false,
19 | }
20 | }
21 | }
22 |
23 | impl From<Atom> for Annot {
24 | fn from(value: Atom) -> Annot {
25 | Annot::Atom(value)
26 | }
27 | }
28 |
29 | impl fmt::Display for Annot {
30 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
31 | match self {
32 | Annot::Atom(it) => write!(f, "{it}"),
33 | Annot::Add(it) => write!(f, "+{it}"),
34 | Annot::Sub(it) => write!(f, "-{it}"),
35 | }
36 | }
37 | }
38 |
39 | impl Comp {
40 | pub(crate) fn add(self) -> Annot {
41 | Annot::Add(self)
42 | }
43 | pub(crate) fn sub(self) -> Annot {
44 | Annot::Sub(self)
45 | }
46 | }
47 |
48 | impl Default for Comp {
49 | fn default() -> Self {
50 | Comp::Para
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
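A short sketch of how src/annot.rs above is used from inside the crate (the types are pub(crate)); the lines below only illustrate the Display and PartialEq<Atom> impls defined in that file:

  let open = Comp::Para.add();   // Annot::Add(Comp::Para)
  let close = Comp::Para.sub();  // Annot::Sub(Comp::Para)
  assert_eq!(open.to_string(), "+para");
  assert_eq!(close.to_string(), "-para");
  let a: Annot = Atom::Str.into();
  assert!(a == Atom::Str); // PartialEq<Atom> only matches the Atom variant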
/src/annot/generated.rs:
--------------------------------------------------------------------------------
1 | use std::fmt;
2 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
3 | pub(crate) enum Comp {
4 | Verbatim,
5 | Email,
6 | Url,
7 | Subscript,
8 | Superscript,
9 | Para,
10 | CodeBlock,
11 | Imagetext,
12 | Linktext,
13 | Reference,
14 | Destination,
15 | Emph,
16 | Strong,
17 | Span,
18 | DoubleQuoted,
19 | ReferenceDefinition,
20 | Insert,
21 | Delete,
22 | Mark,
23 | Attributes,
24 | }
25 |
26 | impl fmt::Display for Comp {
27 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
28 | f.write_str(match self {
29 | Comp::Verbatim => "verbatim",
30 | Comp::Email => "email",
31 | Comp::Url => "url",
32 | Comp::Subscript => "subscript",
33 | Comp::Superscript => "superscript",
34 | Comp::Para => "para",
35 | Comp::CodeBlock => "code_block",
36 | Comp::Imagetext => "imagetext",
37 | Comp::Linktext => "linktext",
38 | Comp::Reference => "reference",
39 | Comp::Destination => "destination",
40 | Comp::Emph => "emph",
41 | Comp::Strong => "strong",
42 | Comp::Span => "span",
43 | Comp::DoubleQuoted => "double_quoted",
44 | Comp::ReferenceDefinition => "reference_definition",
45 | Comp::Insert => "insert",
46 | Comp::Delete => "delete",
47 | Comp::Mark => "mark",
48 | Comp::Attributes => "attributes",
49 | })
50 | }
51 | }
52 |
53 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
54 | pub(crate) enum Atom {
55 | Str,
56 | Escape,
57 | Hardbreak,
58 | Nbsp,
59 | Blankline,
60 | ImageMarker,
61 | LeftDoubleQuote,
62 | RightDoubleQuote,
63 | Ellipses,
64 | Softbreak,
65 | FootnoteReference,
66 | OpenMarker,
67 | Emoji,
68 | ReferenceKey,
69 | ReferenceValue,
70 | CodeLanguage,
71 | EmDash,
72 | EnDash,
73 | Id,
74 | Key,
75 | Value,
76 | Class,
77 | }
78 |
79 | impl Atom {
80 | pub(crate) fn is_left_atom(self) -> bool {
81 | matches!(self, | Atom::LeftDoubleQuote)
82 | }
83 | pub(crate) fn is_right_atom(self) -> bool {
84 | matches!(self, | Atom::RightDoubleQuote)
85 | }
86 | pub(crate) fn corresponding_left_atom(self) -> Atom {
87 | match self {
88 | Atom::RightDoubleQuote => Atom::LeftDoubleQuote,
89 |
90 | _ => self,
91 | }
92 | }
93 | pub(crate) fn corresponding_right_atom(self) -> Atom {
94 | match self {
95 | Atom::LeftDoubleQuote => Atom::RightDoubleQuote,
96 |
97 | _ => self,
98 | }
99 | }
100 | }
101 |
102 | impl fmt::Display for Atom {
103 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
104 | f.write_str(match self {
105 | Atom::Str => "str",
106 | Atom::Escape => "escape",
107 | Atom::Hardbreak => "hardbreak",
108 | Atom::Nbsp => "nbsp",
109 | Atom::Blankline => "blankline",
110 | Atom::ImageMarker => "image_marker",
111 | Atom::LeftDoubleQuote => "left_double_quote",
112 | Atom::RightDoubleQuote => "right_double_quote",
113 | Atom::Ellipses => "ellipses",
114 | Atom::Softbreak => "softbreak",
115 | Atom::FootnoteReference => "footnote_reference",
116 | Atom::OpenMarker => "open_marker",
117 | Atom::Emoji => "emoji",
118 | Atom::ReferenceKey => "reference_key",
119 | Atom::ReferenceValue => "reference_value",
120 | Atom::CodeLanguage => "code_language",
121 | Atom::EmDash => "em_dash",
122 | Atom::EnDash => "en_dash",
123 | Atom::Id => "id",
124 | Atom::Key => "key",
125 | Atom::Value => "value",
126 | Atom::Class => "class",
127 | })
128 | }
129 | }
130 |
--------------------------------------------------------------------------------
/src/ast.rs:
--------------------------------------------------------------------------------
1 | mod generated;
2 |
3 | use indexmap::IndexMap;
4 |
5 | pub use self::generated::*;
6 |
7 | pub type Attrs = IndexMap<String, String>;
8 |
9 | #[derive(Debug, Default, Clone, serde::Serialize)]
10 | pub struct ReferenceDefinition {
11 | #[serde(skip_serializing_if = "Attrs::is_empty")]
12 | pub attrs: Attrs,
13 | pub destination: String,
14 | }
15 |
--------------------------------------------------------------------------------
/src/ast/generated.rs:
--------------------------------------------------------------------------------
1 | use super::Attrs;
2 |
3 | #[derive(Debug, Default, Clone, serde::Serialize)]
4 | pub struct Heading {
5 | #[serde(skip_serializing_if = "Attrs::is_empty")]
6 | pub attrs: Attrs,
7 | pub children: Vec<Tag>,
8 | pub level: u32,
9 | }
10 |
11 | #[derive(Debug, Default, Clone, serde::Serialize)]
12 | pub struct Para {
13 | #[serde(skip_serializing_if = "Attrs::is_empty")]
14 | pub attrs: Attrs,
15 | pub children: Vec<Tag>,
16 | }
17 |
18 | #[derive(Debug, Default, Clone, serde::Serialize)]
19 | pub struct Link {
20 | #[serde(skip_serializing_if = "Attrs::is_empty")]
21 | pub attrs: Attrs,
22 | pub children: Vec<Tag>,
23 | pub destination: Option<String>,
24 | pub reference: Option<String>,
25 | }
26 |
27 | #[derive(Debug, Default, Clone, serde::Serialize)]
28 | pub struct Image {
29 | #[serde(skip_serializing_if = "Attrs::is_empty")]
30 | pub attrs: Attrs,
31 | pub children: Vec<Tag>,
32 | pub destination: Option<String>,
33 | pub reference: Option<String>,
34 | }
35 |
36 | #[derive(Debug, Default, Clone, serde::Serialize)]
37 | pub struct CodeBlock {
38 | #[serde(skip_serializing_if = "Attrs::is_empty")]
39 | pub attrs: Attrs,
40 | pub children: Vec<Tag>,
41 | pub lang: Option<String>,
42 | pub text: String,
43 | }
44 |
45 | #[derive(Debug, Default, Clone, serde::Serialize)]
46 | pub struct Strong {
47 | #[serde(skip_serializing_if = "Attrs::is_empty")]
48 | pub attrs: Attrs,
49 | pub children: Vec<Tag>,
50 | }
51 |
52 | #[derive(Debug, Default, Clone, serde::Serialize)]
53 | pub struct Emph {
54 | #[serde(skip_serializing_if = "Attrs::is_empty")]
55 | pub attrs: Attrs,
56 | pub children: Vec<Tag>,
57 | }
58 |
59 | #[derive(Debug, Default, Clone, serde::Serialize)]
60 | pub struct Insert {
61 | #[serde(skip_serializing_if = "Attrs::is_empty")]
62 | pub attrs: Attrs,
63 | pub children: Vec<Tag>,
64 | }
65 |
66 | #[derive(Debug, Default, Clone, serde::Serialize)]
67 | pub struct Delete {
68 | #[serde(skip_serializing_if = "Attrs::is_empty")]
69 | pub attrs: Attrs,
70 | pub children: Vec<Tag>,
71 | }
72 |
73 | #[derive(Debug, Default, Clone, serde::Serialize)]
74 | pub struct Mark {
75 | #[serde(skip_serializing_if = "Attrs::is_empty")]
76 | pub attrs: Attrs,
77 | pub children: Vec<Tag>,
78 | }
79 |
80 | #[derive(Debug, Default, Clone, serde::Serialize)]
81 | pub struct Superscript {
82 | #[serde(skip_serializing_if = "Attrs::is_empty")]
83 | pub attrs: Attrs,
84 | pub children: Vec<Tag>,
85 | }
86 |
87 | #[derive(Debug, Default, Clone, serde::Serialize)]
88 | pub struct Subscript {
89 | #[serde(skip_serializing_if = "Attrs::is_empty")]
90 | pub attrs: Attrs,
91 | pub children: Vec<Tag>,
92 | }
93 |
94 | #[derive(Debug, Default, Clone, serde::Serialize)]
95 | pub struct Span {
96 | #[serde(skip_serializing_if = "Attrs::is_empty")]
97 | pub attrs: Attrs,
98 | pub children: Vec<Tag>,
99 | }
100 |
101 | #[derive(Debug, Default, Clone, serde::Serialize)]
102 | pub struct DoubleQuoted {
103 | #[serde(skip_serializing_if = "Attrs::is_empty")]
104 | pub attrs: Attrs,
105 | pub children: Vec<Tag>,
106 | }
107 |
108 | #[derive(Debug, Default, Clone, serde::Serialize)]
109 | pub struct Url {
110 | #[serde(skip_serializing_if = "Attrs::is_empty")]
111 | pub attrs: Attrs,
112 | pub children: Vec<Tag>,
113 | pub destination: String,
114 | }
115 |
116 | #[derive(Debug, Default, Clone, serde::Serialize)]
117 | pub struct SoftBreak {
118 | #[serde(skip_serializing_if = "Attrs::is_empty")]
119 | pub attrs: Attrs,
120 | }
121 |
122 | #[derive(Debug, Default, Clone, serde::Serialize)]
123 | pub struct EmDash {
124 | #[serde(skip_serializing_if = "Attrs::is_empty")]
125 | pub attrs: Attrs,
126 | }
127 |
128 | #[derive(Debug, Default, Clone, serde::Serialize)]
129 | pub struct EnDash {
130 | #[serde(skip_serializing_if = "Attrs::is_empty")]
131 | pub attrs: Attrs,
132 | }
133 |
134 | #[derive(Debug, Default, Clone, serde::Serialize)]
135 | pub struct Verbatim {
136 | #[serde(skip_serializing_if = "Attrs::is_empty")]
137 | pub attrs: Attrs,
138 | pub text: String,
139 | }
140 |
141 | #[derive(Debug, Default, Clone, serde::Serialize)]
142 | pub struct Str {
143 | #[serde(skip_serializing_if = "Attrs::is_empty")]
144 | pub attrs: Attrs,
145 | pub text: String,
146 | }
147 |
148 | #[derive(Debug, Default, Clone, serde::Serialize)]
149 | pub struct Emoji {
150 | #[serde(skip_serializing_if = "Attrs::is_empty")]
151 | pub attrs: Attrs,
152 | pub alias: String,
153 | }
154 |
155 | #[derive(Debug, Clone, serde::Serialize)]
156 | #[serde(tag = "tag", rename_all = "snake_case")]
157 | pub enum Tag {
158 | Heading(Heading),
159 | Para(Para),
160 | Link(Link),
161 | Image(Image),
162 | CodeBlock(CodeBlock),
163 | Strong(Strong),
164 | Emph(Emph),
165 | Insert(Insert),
166 | Delete(Delete),
167 | Mark(Mark),
168 | Superscript(Superscript),
169 | Subscript(Subscript),
170 | Span(Span),
171 | DoubleQuoted(DoubleQuoted),
172 | Url(Url),
173 | SoftBreak(SoftBreak),
174 | EmDash(EmDash),
175 | EnDash(EnDash),
176 | Verbatim(Verbatim),
177 | Str(Str),
178 | Emoji(Emoji),
179 | }
180 |
--------------------------------------------------------------------------------
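A sketch of the JSON shape implied by the serde attributes in src/ast/generated.rs above (internally tagged enum, snake_case tag, empty attrs skipped). The exact output string is an assumption here, not verified against the upstream test suite:

  let node = Tag::Str(Str { attrs: Attrs::default(), text: "hi".into() });
  // expected to print something like {"tag":"str","text":"hi"}
  println!("{}", serde_json::to_string(&node).unwrap());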
/src/attribute.rs:
--------------------------------------------------------------------------------
1 | use std::ops::Range;
2 |
3 | use crate::{
4 | annot::{Annot, Atom},
5 | patterns::find_at,
6 | Match,
7 | };
8 |
9 | #[derive(Default)]
10 | pub(crate) struct Tokenizer {
11 | subject: String,
12 | state: State,
13 | begin: usize,
14 | lastpos: usize,
15 | matches: Vec<Match>,
16 | }
17 |
18 | #[derive(Default)]
19 | enum State {
20 | Scanning,
21 | ScanningId,
22 | ScanningClass,
23 | ScanningKey,
24 | ScanningValue,
25 | ScanningBareValue,
26 | ScanningQuotedValue,
27 | ScanningEscaped,
28 | ScanningComment,
29 | Fail,
30 | Done,
31 | #[default]
32 | Start,
33 | }
34 |
35 | pub(crate) enum Status {
36 | Done,
37 | Fail,
38 | Continue,
39 | }
40 |
41 | impl Tokenizer {
42 | pub(crate) fn new(subject: String) -> Tokenizer {
43 | let mut res = Tokenizer::default();
44 | res.subject = subject;
45 | res
46 | }
47 |
48 | fn add_match(&mut self, range: Range<usize>, annot: impl Into<Annot>) {
49 | self.matches.push(Match::new(range, annot))
50 | }
51 |
52 | pub(crate) fn get_matches(&mut self) -> Vec<Match> {
53 | std::mem::take(&mut self.matches)
54 | }
55 |
56 | // Feed tokenizer a slice of text from the subject, between
57 | // startpos and endpos inclusive. Return status, position,
58 | // where status is either "done" (position should point to
59 | // final '}'), "fail" (position should point to first character
60 | // that could not be tokenized), or "continue" (position should
61 | // point to last character parsed).
62 | pub(crate) fn feed(&mut self, startpos: usize, endpos: usize) -> (Status, usize) {
63 | let mut pos = startpos;
64 | while pos <= endpos {
65 | self.state = self.step(pos);
66 | match self.state {
67 | State::Done => return (Status::Done, pos),
68 | State::Fail => {
69 | self.lastpos = pos + 1;
70 | return (Status::Fail, pos);
71 | }
72 | _ => {
73 | self.lastpos = pos + 1;
74 | pos = pos + 1
75 | }
76 | }
77 | }
78 | (Status::Continue, pos)
79 | }
80 |
81 | fn step(&mut self, pos: usize) -> State {
82 | match self.state {
83 | State::Start => {
84 | if find_at(&self.subject, "^{", pos).is_match {
85 | State::Scanning
86 | } else {
87 | State::Fail
88 | }
89 | }
90 | State::Fail => State::Fail,
91 | State::Done => State::Done,
92 | State::Scanning => match self.subject.as_bytes()[pos] {
93 | b' ' | b'\t' | b'\n' | b'\r' => State::Scanning,
94 | b'}' => State::Done,
95 | b'#' => {
96 | self.begin = pos;
97 | State::ScanningId
98 | }
99 | b'%' => {
100 | self.begin = pos;
101 | State::ScanningComment
102 | }
103 | b'.' => {
104 | self.begin = pos;
105 | State::ScanningClass
106 | }
107 | _ => {
108 | if find_at(&self.subject, "^[%a%d_:-]", pos).is_match {
109 | self.begin = pos;
110 | State::ScanningKey
111 | } else {
112 | State::Fail
113 | }
114 | }
115 | },
116 | State::ScanningComment => {
117 | if self.subject.as_bytes()[pos] == b'%' {
118 | State::Scanning
119 | } else {
120 | State::ScanningComment
121 | }
122 | }
123 | State::ScanningId => self.step_ident(pos, Atom::Id, State::ScanningId),
124 | State::ScanningClass => self.step_ident(pos, Atom::Class, State::ScanningClass),
125 | State::ScanningKey => {
126 | let c = self.subject.as_bytes()[pos];
127 | if c == b'=' {
128 | self.add_match(self.begin..self.lastpos, Atom::Key);
129 | self.begin = !0;
130 | State::ScanningValue
131 | } else if find_at(&self.subject, "^[%a%d_:-]", pos).is_match {
132 | State::ScanningKey
133 | } else {
134 | State::Fail
135 | }
136 | }
137 | State::ScanningValue => {
138 | let c = self.subject.as_bytes()[pos];
139 | if c == b'"' {
140 | self.begin = pos;
141 | State::ScanningQuotedValue
142 | } else if find_at(&self.subject, "^[%a%d_:-]", pos).is_match {
143 | self.begin = pos;
144 | State::ScanningBareValue
145 | } else {
146 | State::Fail
147 | }
148 | }
149 | State::ScanningBareValue => {
150 | let c = self.subject.as_bytes()[pos];
151 | if find_at(&self.subject, "^[%a%d_:-]", pos).is_match {
152 | State::ScanningBareValue
153 | } else if c == b'}' {
154 | self.add_match(self.begin..self.lastpos, Atom::Value);
155 | self.begin = !0;
156 | State::Done
157 | } else if find_at(&self.subject, "^%s", pos).is_match {
158 | self.add_match(self.begin..self.lastpos, Atom::Value);
159 | self.begin = !0;
160 | State::Scanning
161 | } else {
162 | State::Fail
163 | }
164 | }
165 | State::ScanningEscaped => State::ScanningQuotedValue,
166 | State::ScanningQuotedValue => {
167 | let c = self.subject.as_bytes()[pos];
168 | match c {
169 | b'"' => {
170 | self.add_match(self.begin + 1..self.lastpos, Atom::Value);
171 | self.begin = !0;
172 | State::Scanning
173 | }
174 | b'\\' => State::ScanningEscaped,
175 | b'{' | b'}' => State::Fail,
176 | b'\n' => {
177 | self.add_match(self.begin + 1..self.lastpos, Atom::Value);
178 | State::ScanningQuotedValue
179 | }
180 | _ => State::ScanningQuotedValue,
181 | }
182 | }
183 | }
184 | }
185 |
186 | fn step_ident(&mut self, pos: usize, atom: Atom, state: State) -> State {
187 | let c = self.subject.as_bytes()[pos];
188 | match c {
189 | b'_' | b'-' | b':' => state,
190 | b'}' => {
191 | if self.lastpos > self.begin + 1 {
192 | self.add_match(self.begin + 1..self.lastpos, atom)
193 | }
194 | self.begin = !0;
195 | State::Done
196 | }
197 | _ => {
198 | if find_at(&self.subject, "^[^%s%p]", pos).is_match {
199 | state
200 | } else if find_at(&self.subject, "^%s", pos).is_match {
201 | if self.lastpos > self.begin {
202 | self.add_match(self.begin + 1..self.lastpos, atom)
203 | }
204 | self.begin = !0;
205 | State::Scanning
206 | } else {
207 | State::Fail
208 | }
209 | }
210 | }
211 | }
212 | }
213 |
--------------------------------------------------------------------------------
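A sketch of driving the attribute tokenizer in src/attribute.rs above, following the comment on feed (crate-internal API; positions are byte offsets into the subject, and endpos is inclusive):

  let subject = String::from("{#id .cls key=\"val\"}");
  let end = subject.len() - 1;
  let mut tok = Tokenizer::new(subject);
  match tok.feed(0, end) {
    (Status::Done, _pos) => {
      // _pos points at the closing '}'; the matches cover the id/class/key/value spans
      let _matches = tok.get_matches();
    }
    (Status::Fail, _pos) => { /* _pos is the first byte that could not be tokenized */ }
    (Status::Continue, _) => { /* more input needed before the closing '}' */ }
  }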
/src/block.rs:
--------------------------------------------------------------------------------
1 | use std::ops::Range;
2 |
3 | use crate::{
4 | annot::{Annot, Atom, Comp},
5 | format_to, inline,
6 | patterns::{find, find_at, PatMatch},
7 | Match, ParseOpts,
8 | };
9 |
10 | #[derive(Default)]
11 | pub struct Tokenizer {
12 | pub subject: String,
13 | indent: usize,
14 | startline: usize,
15 | starteol: usize,
16 | endeol: usize,
17 | pub(crate) matches: Vec<Match>,
18 | pos: usize,
19 | last_matched_container: usize,
20 | opts: ParseOpts,
21 | finished_line: bool,
22 |
23 | pub(crate) debug: String,
24 | }
25 |
26 | trait Container {
27 | fn content(&self) -> &'static str;
28 | fn inline_parser(&mut self) -> Option<&mut inline::Tokenizer> {
29 | None
30 | }
31 | fn restore_indent(&self) -> Option<usize> {
32 | None
33 | }
34 | fn open(p: &mut Tokenizer, stack: &mut Vec<Box<dyn Container>>) -> bool
35 | where
36 | Self: Sized;
37 | fn cont(&mut self, p: &mut Tokenizer) -> bool;
38 | fn close(self: Box<Self>, p: &mut Tokenizer);
39 | }
40 |
41 | const CONTAINERS: &[fn(&mut Tokenizer, &mut Vec<Box<dyn Container>>) -> bool] =
42 | &[Para::open, CodeBlock::open, ReferenceDefinition::open];
43 |
44 | struct Para {
45 | inline_parser: inline::Tokenizer,
46 | }
47 |
48 | impl Container for Para {
49 | fn content(&self) -> &'static str {
50 | "inline"
51 | }
52 | fn inline_parser(&mut self) -> Option<&mut inline::Tokenizer> {
53 | Some(&mut self.inline_parser)
54 | }
55 | fn open(p: &mut Tokenizer, stack: &mut Vec<Box<dyn Container>>) -> bool
56 | where
57 | Self: Sized,
58 | {
59 | p.add_container(
60 | stack,
61 | Para { inline_parser: inline::Tokenizer::new(p.subject.clone(), p.opts.clone()) },
62 | );
63 | p.add_match(p.pos..p.pos, Comp::Para.add());
64 | true
65 | }
66 |
67 | fn cont(&mut self, p: &mut Tokenizer) -> bool {
68 | p.find("^%S").is_match
69 | }
70 |
71 | fn close(mut self: Box<Self>, p: &mut Tokenizer) {
72 | p.matches.extend(self.inline_parser.get_matches());
73 | p.add_match(p.pos - 1..p.pos - 1, Comp::Para.sub())
74 | }
75 | }
76 |
77 | struct CodeBlock {
78 | border: char,
79 | indent: usize,
80 | }
81 |
82 | impl Container for CodeBlock {
83 | fn content(&self) -> &'static str {
84 | "text"
85 | }
86 | fn restore_indent(&self) -> Option<usize> {
87 | Some(self.indent)
88 | }
89 | fn open(p: &mut Tokenizer, stack: &mut Vec<Box<dyn Container>>) -> bool
90 | where
91 | Self: Sized,
92 | {
93 | let mut border = '`';
94 | let mut m = p.find("^```([ \t]*)([^%s`]*)[ \t]*[\r\n]");
95 | if !m.is_match {
96 | border = '~';
97 | m = p.find("^~~~([ \t]*)([^%s`]*)[ \t]*[\r\n]");
98 | }
99 | if !m.is_match {
100 | return false;
101 | }
102 | p.add_container(stack, CodeBlock { border, indent: p.indent });
103 | let lang = m.cap2;
104 |
105 | p.add_match(p.pos..p.pos + 3, Comp::CodeBlock.add());
106 | if !lang.is_empty() {
107 | p.add_match(lang.start..lang.end, Atom::CodeLanguage)
108 | }
109 |
110 | p.pos = p.pos + 2;
111 | p.finished_line = true;
112 | true
113 | }
114 |
115 | fn cont(&mut self, p: &mut Tokenizer) -> bool {
116 | let m =
117 | if self.border == '`' { p.find("^(```)[ \t]*[\r\n]") } else { p.find("^(~~~)[ \t]*[\r\n]") };
118 | if m.is_match {
119 | p.pos = m.end - 1;
120 | p.finished_line = true;
121 | false
122 | } else {
123 | true
124 | }
125 | }
126 |
127 | fn close(self: Box<Self>, p: &mut Tokenizer) {
128 | p.add_match(p.pos - 3..p.pos, Comp::CodeBlock.sub());
129 | }
130 | }
131 |
132 | struct ReferenceDefinition {
133 | indent: usize,
134 | }
135 |
136 | impl Container for ReferenceDefinition {
137 | fn content(&self) -> &'static str {
138 | ""
139 | }
140 |
141 | fn open(p: &mut Tokenizer, stack: &mut Vec<Box<dyn Container>>) -> bool
142 | where
143 | Self: Sized,
144 | {
145 | let m = p.find("^[[]([^\r\n]*)%]:[ \t]*(%S*)");
146 | if !m.is_match {
147 | return false;
148 | }
149 | p.add_container(stack, ReferenceDefinition { indent: p.indent });
150 | p.add_match(m.start..m.start, Comp::ReferenceDefinition.add());
151 | p.add_match(m.start..m.start + m.cap1.len() + 2, Atom::ReferenceKey);
152 | if !m.cap2.is_empty() {
153 | p.add_match(m.end - m.cap2.len()..m.end, Atom::ReferenceValue);
154 | }
155 | p.pos = m.end;
156 | true
157 | }
158 |
159 | fn cont(&mut self, p: &mut Tokenizer) -> bool {
160 | if self.indent >= p.indent {
161 | return false;
162 | }
163 | let m = p.find("^(%S+)");
164 | if m.is_match {
165 | p.add_match(m.cap1.start..m.cap1.end, Atom::ReferenceValue);
166 | p.pos = m.end;
167 | }
168 | true
169 | }
170 |
171 | fn close(self: Box<Self>, p: &mut Tokenizer) {
172 | p.add_match(p.pos..p.pos, Comp::ReferenceDefinition.sub())
173 | }
174 |
175 | fn inline_parser(&mut self) -> Option<&mut inline::Tokenizer> {
176 | None
177 | }
178 | }
179 |
180 | impl Tokenizer {
181 | pub fn new(mut subject: String, opts: ParseOpts) -> Tokenizer {
182 | if !find(&subject, "[\r\n]$").is_match {
183 | subject.push('\n');
184 | }
185 | let mut res = Tokenizer::default();
186 | res.subject = subject;
187 | res.opts = opts;
188 | res
189 | }
190 |
191 | fn find(&self, pat: &'static str) -> PatMatch {
192 | find_at(&self.subject, pat, self.pos)
193 | }
194 |
195 | fn add_match(&mut self, range: Range<usize>, annot: impl Into<Annot>) {
196 | self.matches.push(Match::new(range, annot))
197 | }
198 |
199 | fn add_container(
200 | &mut self,
201 | stack: &mut Vec<Box<dyn Container>>,
202 | container: impl Container + 'static,
203 | ) {
204 | let last_matched = self.last_matched_container;
205 | while stack.len() > last_matched
206 | || (stack.len() > 0 && stack.last().unwrap().content() != "block")
207 | {
208 | stack.pop().unwrap().close(self)
209 | }
210 | stack.push(Box::new(container))
211 | }
212 |
213 | fn skip_space(&mut self) {
214 | let m = find_at(&self.subject, "[^ \t]", self.pos);
215 | if m.is_match {
216 | self.indent = m.start - self.startline;
217 | self.pos = m.start;
218 | }
219 | }
220 |
221 | fn get_eol(&mut self) {
222 | let mut m = find_at(&self.subject, "[\r]?[\n]", self.pos);
223 | if !m.is_match {
224 | (m.start, m.end) = (self.subject.len(), self.subject.len());
225 | }
226 | self.starteol = m.start;
227 | self.endeol = m.end;
228 | }
229 |
230 | pub fn parse(&mut self) {
231 | let mut containers: Vec<Box<dyn Container>> = Vec::new();
232 |
233 | let subjectlen = self.subject.len();
234 | while self.pos < subjectlen {
235 | self.indent = 0;
236 | self.startline = self.pos;
237 | self.finished_line = false;
238 | self.get_eol();
239 |
240 | // check open containers for continuation
241 | self.last_matched_container = 0;
242 | for idx in 0..containers.len() {
243 | // skip any indentation
244 | self.skip_space();
245 | if containers[idx].cont(self) {
246 | self.last_matched_container = idx + 1
247 | } else {
248 | break;
249 | }
250 | }
251 |
252 | // if we hit a close fence, we can move to next line
253 | if self.finished_line {
254 | while containers.len() > self.last_matched_container {
255 | containers.pop().unwrap().close(self)
256 | }
257 | }
258 |
259 | if !self.finished_line {
260 | // check for new containers
261 | self.skip_space();
262 | let mut is_blank = self.pos == self.starteol;
263 |
264 | let mut new_starts = false;
265 | let last_match = containers[..self.last_matched_container].first();
266 | let mut check_starts = !is_blank
267 | && !matches!(last_match, Some(c) if c.content() != "block")
268 | && !self.find("^%a+%s").is_match; // optimization
269 |
270 | while check_starts {
271 | check_starts = false;
272 | for i in 1..CONTAINERS.len() {
273 | let open = CONTAINERS[i];
274 | if open(self, &mut containers) {
275 | self.last_matched_container = containers.len();
276 | if self.finished_line {
277 | check_starts = false
278 | } else {
279 | self.skip_space();
280 | new_starts = true;
281 | check_starts = containers.last().unwrap().content() != "text"
282 | }
283 | break;
284 | }
285 | }
286 | }
287 |
288 | if !self.finished_line {
289 | // handle remaining content
290 | self.skip_space();
291 |
292 | is_blank = self.pos == self.starteol;
293 |
294 | let is_lazy = !is_blank
295 | && !new_starts
296 | && self.last_matched_container < containers.len()
297 | && containers.last().unwrap().content() == "inline";
298 |
299 | if !is_lazy && self.last_matched_container < containers.len() {
300 | while containers.len() > self.last_matched_container {
301 | containers.pop().unwrap().close(self);
302 | }
303 | }
304 |
305 | // add para by default if there's text
306 | if !matches!(containers.last(), Some(c) if c.content() != "block") {
307 | if is_blank {
308 | if !new_starts {
309 | // need to track these for tight/loose lists
310 | self.add_match(self.pos..self.endeol, Atom::Blankline);
311 | }
312 | } else {
313 | CONTAINERS[0](self, &mut containers);
314 | }
315 | }
316 |
317 | if let Some(tip) = containers.last_mut() {
318 | if let Some(tip_indent) = tip.restore_indent() {
319 | let mut startpos = self.pos;
320 | if self.indent > tip_indent {
321 | // get back the leading spaces we gobbled
322 | startpos = startpos - (self.indent - tip_indent)
323 | }
324 | self.add_match(startpos..self.endeol, Atom::Str)
325 | } else if let Some(inline_parser) = tip.inline_parser() {
326 | if !is_blank {
327 | inline_parser.feed(self.pos, self.endeol)
328 | }
329 | }
330 | }
331 | }
332 | }
333 |
334 | self.pos = self.endeol;
335 | }
336 | self.finish(containers)
337 | }
338 |
339 | fn finish(&mut self, mut containers: Vec<Box<dyn Container>>) {
340 | // close unmatched containers
341 | while let Some(cont) = containers.pop() {
342 | cont.close(self)
343 | }
344 | if self.opts.debug_matches {
345 | for m in &self.matches {
346 | let ms = format!(
347 | "{} {}-{}",
348 | m.a,
349 | m.range.start + 1,
350 | if m.range.is_empty() { m.range.end + 1 } else { m.range.end }
351 | );
352 | format_to!(
353 | self.debug,
354 | "{ms:<20} {:?}\n",
355 | self.subject.get(m.range.clone()).unwrap_or_default()
356 | );
357 | }
358 | }
359 | }
360 | }
361 |
--------------------------------------------------------------------------------
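A sketch of running the block-level pass in src/block.rs above on its own, from inside the crate (matches is pub(crate)); ParseOpts lives in lib.rs, which is not shown in this listing, and is assumed to implement Default because Tokenizer derives it:

  let mut p = block::Tokenizer::new("hello *world*\n".to_string(), ParseOpts::default());
  p.parse();
  for m in &p.matches {
    // each Match pairs a byte range in the subject with an Annot such as +para, str, -para
    println!("{} {:?}", m.a, m.range);
  }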
/src/emoji.rs:
--------------------------------------------------------------------------------
1 | pub(crate) fn find_emoji(s: &str) -> Option<&'static str> {
2 | let idx = EMOJI_LIST.binary_search_by_key(&s, |&(k, _)| k).ok()?;
3 | Some(EMOJI_LIST[idx].1)
4 | }
5 |
6 | #[test]
7 | fn emoji_list_is_sorted() {
8 | let mut sorted = EMOJI_LIST.to_vec();
9 | sorted.sort_by_key(|&(k, _)| k);
10 | if EMOJI_LIST != sorted {
11 | let mut buf = String::new();
12 | for (k, v) in sorted {
13 | crate::format_to!(buf, r#"("{k}", "{v}"),"#);
14 | }
15 | std::fs::write("./emoji.sorted", &buf).unwrap();
16 | panic!("unsorted emoji list, sorted version in: ./emoji.sorted")
17 | }
18 | }
19 |
20 | static EMOJI_LIST: &[(&str, &str)] = &[
21 | ("+1", "👍"),
22 | ("-1", "👎"),
23 | ("100", "💯"),
24 | ("1234", "🔢"),
25 | ("1st_place_medal", "🥇"),
26 | ("2nd_place_medal", "🥈"),
27 | ("3rd_place_medal", "🥉"),
28 | ("8ball", "🎱"),
29 | ("a", "🅰️"),
30 | ("ab", "🆎"),
31 | ("abacus", "🧮"),
32 | ("abc", "🔤"),
33 | ("abcd", "🔡"),
34 | ("accept", "🉑"),
35 | ("accordion", "🪗"),
36 | ("adhesive_bandage", "🩹"),
37 | ("adult", "🧑"),
38 | ("aerial_tramway", "🚡"),
39 | ("afghanistan", "🇦🇫"),
40 | ("airplane", "✈️"),
41 | ("aland_islands", "🇦🇽"),
42 | ("alarm_clock", "⏰"),
43 | ("albania", "🇦🇱"),
44 | ("alembic", "⚗️"),
45 | ("algeria", "🇩🇿"),
46 | ("alien", "👽"),
47 | ("ambulance", "🚑"),
48 | ("american_samoa", "🇦🇸"),
49 | ("amphora", "🏺"),
50 | ("anatomical_heart", "🫀"),
51 | ("anchor", "⚓"),
52 | ("andorra", "🇦🇩"),
53 | ("angel", "👼"),
54 | ("anger", "💢"),
55 | ("angola", "🇦🇴"),
56 | ("angry", "😠"),
57 | ("anguilla", "🇦🇮"),
58 | ("anguished", "😧"),
59 | ("ant", "🐜"),
60 | ("antarctica", "🇦🇶"),
61 | ("antigua_barbuda", "🇦🇬"),
62 | ("apple", "🍎"),
63 | ("aquarius", "♒"),
64 | ("argentina", "🇦🇷"),
65 | ("aries", "♈"),
66 | ("armenia", "🇦🇲"),
67 | ("arrow_backward", "◀️"),
68 | ("arrow_double_down", "⏬"),
69 | ("arrow_double_up", "⏫"),
70 | ("arrow_down", "⬇️"),
71 | ("arrow_down_small", "🔽"),
72 | ("arrow_forward", "▶️"),
73 | ("arrow_heading_down", "⤵️"),
74 | ("arrow_heading_up", "⤴️"),
75 | ("arrow_left", "⬅️"),
76 | ("arrow_lower_left", "↙️"),
77 | ("arrow_lower_right", "↘️"),
78 | ("arrow_right", "➡️"),
79 | ("arrow_right_hook", "↪️"),
80 | ("arrow_up", "⬆️"),
81 | ("arrow_up_down", "↕️"),
82 | ("arrow_up_small", "🔼"),
83 | ("arrow_upper_left", "↖️"),
84 | ("arrow_upper_right", "↗️"),
85 | ("arrows_clockwise", "🔃"),
86 | ("arrows_counterclockwise", "🔄"),
87 | ("art", "🎨"),
88 | ("articulated_lorry", "🚛"),
89 | ("artificial_satellite", "🛰️"),
90 | ("artist", "🧑🎨"),
91 | ("aruba", "🇦🇼"),
92 | ("ascension_island", "🇦🇨"),
93 | ("asterisk", "*️⃣"),
94 | ("astonished", "😲"),
95 | ("astronaut", "🧑🚀"),
96 | ("athletic_shoe", "👟"),
97 | ("atm", "🏧"),
98 | ("atom_symbol", "⚛️"),
99 | ("australia", "🇦🇺"),
100 | ("austria", "🇦🇹"),
101 | ("auto_rickshaw", "🛺"),
102 | ("avocado", "🥑"),
103 | ("axe", "🪓"),
104 | ("azerbaijan", "🇦🇿"),
105 | ("b", "🅱️"),
106 | ("baby", "👶"),
107 | ("baby_bottle", "🍼"),
108 | ("baby_chick", "🐤"),
109 | ("baby_symbol", "🚼"),
110 | ("back", "🔙"),
111 | ("bacon", "🥓"),
112 | ("badger", "🦡"),
113 | ("badminton", "🏸"),
114 | ("bagel", "🥯"),
115 | ("baggage_claim", "🛄"),
116 | ("baguette_bread", "🥖"),
117 | ("bahamas", "🇧🇸"),
118 | ("bahrain", "🇧🇭"),
119 | ("balance_scale", "⚖️"),
120 | ("bald_man", "👨🦲"),
121 | ("bald_woman", "👩🦲"),
122 | ("ballet_shoes", "🩰"),
123 | ("balloon", "🎈"),
124 | ("ballot_box", "🗳️"),
125 | ("ballot_box_with_check", "☑️"),
126 | ("bamboo", "🎍"),
127 | ("banana", "🍌"),
128 | ("bangbang", "‼️"),
129 | ("bangladesh", "🇧🇩"),
130 | ("banjo", "🪕"),
131 | ("bank", "🏦"),
132 | ("bar_chart", "📊"),
133 | ("barbados", "🇧🇧"),
134 | ("barber", "💈"),
135 | ("baseball", "⚾"),
136 | ("basket", "🧺"),
137 | ("basketball", "🏀"),
138 | ("basketball_man", "⛹️♂️"),
139 | ("basketball_woman", "⛹️♀️"),
140 | ("bat", "🦇"),
141 | ("bath", "🛀"),
142 | ("bathtub", "🛁"),
143 | ("battery", "🔋"),
144 | ("beach_umbrella", "🏖️"),
145 | ("bear", "🐻"),
146 | ("bearded_person", "🧔"),
147 | ("beaver", "🦫"),
148 | ("bed", "🛏️"),
149 | ("bee", "🐝"),
150 | ("beer", "🍺"),
151 | ("beers", "🍻"),
152 | ("beetle", "🪲"),
153 | ("beginner", "🔰"),
154 | ("belarus", "🇧🇾"),
155 | ("belgium", "🇧🇪"),
156 | ("belize", "🇧🇿"),
157 | ("bell", "🔔"),
158 | ("bell_pepper", "🫑"),
159 | ("bellhop_bell", "🛎️"),
160 | ("benin", "🇧🇯"),
161 | ("bento", "🍱"),
162 | ("bermuda", "🇧🇲"),
163 | ("beverage_box", "🧃"),
164 | ("bhutan", "🇧🇹"),
165 | ("bicyclist", "🚴"),
166 | ("bike", "🚲"),
167 | ("biking_man", "🚴♂️"),
168 | ("biking_woman", "🚴♀️"),
169 | ("bikini", "👙"),
170 | ("billed_cap", "🧢"),
171 | ("biohazard", "☣️"),
172 | ("bird", "🐦"),
173 | ("birthday", "🎂"),
174 | ("bison", "🦬"),
175 | ("black_cat", "🐈⬛"),
176 | ("black_circle", "⚫"),
177 | ("black_flag", "🏴"),
178 | ("black_heart", "🖤"),
179 | ("black_joker", "🃏"),
180 | ("black_large_square", "⬛"),
181 | ("black_medium_small_square", "◾"),
182 | ("black_medium_square", "◼️"),
183 | ("black_nib", "✒️"),
184 | ("black_small_square", "▪️"),
185 | ("black_square_button", "🔲"),
186 | ("blond_haired_man", "👱♂️"),
187 | ("blond_haired_person", "👱"),
188 | ("blond_haired_woman", "👱♀️"),
189 | ("blonde_woman", "👱♀️"),
190 | ("blossom", "🌼"),
191 | ("blowfish", "🐡"),
192 | ("blue_book", "📘"),
193 | ("blue_car", "🚙"),
194 | ("blue_heart", "💙"),
195 | ("blue_square", "🟦"),
196 | ("blueberries", "🫐"),
197 | ("blush", "😊"),
198 | ("boar", "🐗"),
199 | ("boat", "⛵"),
200 | ("bolivia", "🇧🇴"),
201 | ("bomb", "💣"),
202 | ("bone", "🦴"),
203 | ("book", "📖"),
204 | ("bookmark", "🔖"),
205 | ("bookmark_tabs", "📑"),
206 | ("books", "📚"),
207 | ("boom", "💥"),
208 | ("boomerang", "🪃"),
209 | ("boot", "👢"),
210 | ("bosnia_herzegovina", "🇧🇦"),
211 | ("botswana", "🇧🇼"),
212 | ("bouncing_ball_man", "⛹️♂️"),
213 | ("bouncing_ball_person", "⛹️"),
214 | ("bouncing_ball_woman", "⛹️♀️"),
215 | ("bouquet", "💐"),
216 | ("bouvet_island", "🇧🇻"),
217 | ("bow", "🙇"),
218 | ("bow_and_arrow", "🏹"),
219 | ("bowing_man", "🙇♂️"),
220 | ("bowing_woman", "🙇♀️"),
221 | ("bowl_with_spoon", "🥣"),
222 | ("bowling", "🎳"),
223 | ("boxing_glove", "🥊"),
224 | ("boy", "👦"),
225 | ("brain", "🧠"),
226 | ("brazil", "🇧🇷"),
227 | ("bread", "🍞"),
228 | ("breast_feeding", "🤱"),
229 | ("bricks", "🧱"),
230 | ("bride_with_veil", "👰♀️"),
231 | ("bridge_at_night", "🌉"),
232 | ("briefcase", "💼"),
233 | ("british_indian_ocean_territory", "🇮🇴"),
234 | ("british_virgin_islands", "🇻🇬"),
235 | ("broccoli", "🥦"),
236 | ("broken_heart", "💔"),
237 | ("broom", "🧹"),
238 | ("brown_circle", "🟤"),
239 | ("brown_heart", "🤎"),
240 | ("brown_square", "🟫"),
241 | ("brunei", "🇧🇳"),
242 | ("bubble_tea", "🧋"),
243 | ("bucket", "🪣"),
244 | ("bug", "🐛"),
245 | ("building_construction", "🏗️"),
246 | ("bulb", "💡"),
247 | ("bulgaria", "🇧🇬"),
248 | ("bullettrain_front", "🚅"),
249 | ("bullettrain_side", "🚄"),
250 | ("burkina_faso", "🇧🇫"),
251 | ("burrito", "🌯"),
252 | ("burundi", "🇧🇮"),
253 | ("bus", "🚌"),
254 | ("business_suit_levitating", "🕴️"),
255 | ("busstop", "🚏"),
256 | ("bust_in_silhouette", "👤"),
257 | ("busts_in_silhouette", "👥"),
258 | ("butter", "🧈"),
259 | ("butterfly", "🦋"),
260 | ("cactus", "🌵"),
261 | ("cake", "🍰"),
262 | ("calendar", "📆"),
263 | ("call_me_hand", "🤙"),
264 | ("calling", "📲"),
265 | ("cambodia", "🇰🇭"),
266 | ("camel", "🐫"),
267 | ("camera", "📷"),
268 | ("camera_flash", "📸"),
269 | ("cameroon", "🇨🇲"),
270 | ("camping", "🏕️"),
271 | ("canada", "🇨🇦"),
272 | ("canary_islands", "🇮🇨"),
273 | ("cancer", "♋"),
274 | ("candle", "🕯️"),
275 | ("candy", "🍬"),
276 | ("canned_food", "🥫"),
277 | ("canoe", "🛶"),
278 | ("cape_verde", "🇨🇻"),
279 | ("capital_abcd", "🔠"),
280 | ("capricorn", "♑"),
281 | ("car", "🚗"),
282 | ("card_file_box", "🗃️"),
283 | ("card_index", "📇"),
284 | ("card_index_dividers", "🗂️"),
285 | ("caribbean_netherlands", "🇧🇶"),
286 | ("carousel_horse", "🎠"),
287 | ("carpentry_saw", "🪚"),
288 | ("carrot", "🥕"),
289 | ("cartwheeling", "🤸"),
290 | ("cat", "🐱"),
291 | ("cat2", "🐈"),
292 | ("cayman_islands", "🇰🇾"),
293 | ("cd", "💿"),
294 | ("central_african_republic", "🇨🇫"),
295 | ("ceuta_melilla", "🇪🇦"),
296 | ("chad", "🇹🇩"),
297 | ("chains", "⛓️"),
298 | ("chair", "🪑"),
299 | ("champagne", "🍾"),
300 | ("chart", "💹"),
301 | ("chart_with_downwards_trend", "📉"),
302 | ("chart_with_upwards_trend", "📈"),
303 | ("checkered_flag", "🏁"),
304 | ("cheese", "🧀"),
305 | ("cherries", "🍒"),
306 | ("cherry_blossom", "🌸"),
307 | ("chess_pawn", "♟️"),
308 | ("chestnut", "🌰"),
309 | ("chicken", "🐔"),
310 | ("child", "🧒"),
311 | ("children_crossing", "🚸"),
312 | ("chile", "🇨🇱"),
313 | ("chipmunk", "🐿️"),
314 | ("chocolate_bar", "🍫"),
315 | ("chopsticks", "🥢"),
316 | ("christmas_island", "🇨🇽"),
317 | ("christmas_tree", "🎄"),
318 | ("church", "⛪"),
319 | ("cinema", "🎦"),
320 | ("circus_tent", "🎪"),
321 | ("city_sunrise", "🌇"),
322 | ("city_sunset", "🌆"),
323 | ("cityscape", "🏙️"),
324 | ("cl", "🆑"),
325 | ("clamp", "🗜️"),
326 | ("clap", "👏"),
327 | ("clapper", "🎬"),
328 | ("classical_building", "🏛️"),
329 | ("climbing", "🧗"),
330 | ("climbing_man", "🧗♂️"),
331 | ("climbing_woman", "🧗♀️"),
332 | ("clinking_glasses", "🥂"),
333 | ("clipboard", "📋"),
334 | ("clipperton_island", "🇨🇵"),
335 | ("clock1", "🕐"),
336 | ("clock10", "🕙"),
337 | ("clock1030", "🕥"),
338 | ("clock11", "🕚"),
339 | ("clock1130", "🕦"),
340 | ("clock12", "🕛"),
341 | ("clock1230", "🕧"),
342 | ("clock130", "🕜"),
343 | ("clock2", "🕑"),
344 | ("clock230", "🕝"),
345 | ("clock3", "🕒"),
346 | ("clock330", "🕞"),
347 | ("clock4", "🕓"),
348 | ("clock430", "🕟"),
349 | ("clock5", "🕔"),
350 | ("clock530", "🕠"),
351 | ("clock6", "🕕"),
352 | ("clock630", "🕡"),
353 | ("clock7", "🕖"),
354 | ("clock730", "🕢"),
355 | ("clock8", "🕗"),
356 | ("clock830", "🕣"),
357 | ("clock9", "🕘"),
358 | ("clock930", "🕤"),
359 | ("closed_book", "📕"),
360 | ("closed_lock_with_key", "🔐"),
361 | ("closed_umbrella", "🌂"),
362 | ("cloud", "☁️"),
363 | ("cloud_with_lightning", "🌩️"),
364 | ("cloud_with_lightning_and_rain", "⛈️"),
365 | ("cloud_with_rain", "🌧️"),
366 | ("cloud_with_snow", "🌨️"),
367 | ("clown_face", "🤡"),
368 | ("clubs", "♣️"),
369 | ("cn", "🇨🇳"),
370 | ("coat", "🧥"),
371 | ("cockroach", "🪳"),
372 | ("cocktail", "🍸"),
373 | ("coconut", "🥥"),
374 | ("cocos_islands", "🇨🇨"),
375 | ("coffee", "☕"),
376 | ("coffin", "⚰️"),
377 | ("coin", "🪙"),
378 | ("cold_face", "🥶"),
379 | ("cold_sweat", "😰"),
380 | ("collision", "💥"),
381 | ("colombia", "🇨🇴"),
382 | ("comet", "☄️"),
383 | ("comoros", "🇰🇲"),
384 | ("compass", "🧭"),
385 | ("computer", "💻"),
386 | ("computer_mouse", "🖱️"),
387 | ("confetti_ball", "🎊"),
388 | ("confounded", "😖"),
389 | ("confused", "😕"),
390 | ("congo_brazzaville", "🇨🇬"),
391 | ("congo_kinshasa", "🇨🇩"),
392 | ("congratulations", "㊗️"),
393 | ("construction", "🚧"),
394 | ("construction_worker", "👷"),
395 | ("construction_worker_man", "👷♂️"),
396 | ("construction_worker_woman", "👷♀️"),
397 | ("control_knobs", "🎛️"),
398 | ("convenience_store", "🏪"),
399 | ("cook", "🧑🍳"),
400 | ("cook_islands", "🇨🇰"),
401 | ("cookie", "🍪"),
402 | ("cool", "🆒"),
403 | ("cop", "👮"),
404 | ("copyright", "©️"),
405 | ("corn", "🌽"),
406 | ("costa_rica", "🇨🇷"),
407 | ("cote_divoire", "🇨🇮"),
408 | ("couch_and_lamp", "🛋️"),
409 | ("couple", "👫"),
410 | ("couple_with_heart", "💑"),
411 | ("couple_with_heart_man_man", "👨❤️👨"),
412 | ("couple_with_heart_woman_man", "👩❤️👨"),
413 | ("couple_with_heart_woman_woman", "👩❤️👩"),
414 | ("couplekiss", "💏"),
415 | ("couplekiss_man_man", "👨❤️💋👨"),
416 | ("couplekiss_man_woman", "👩❤️💋👨"),
417 | ("couplekiss_woman_woman", "👩❤️💋👩"),
418 | ("cow", "🐮"),
419 | ("cow2", "🐄"),
420 | ("cowboy_hat_face", "🤠"),
421 | ("crab", "🦀"),
422 | ("crayon", "🖍️"),
423 | ("credit_card", "💳"),
424 | ("crescent_moon", "🌙"),
425 | ("cricket", "🦗"),
426 | ("cricket_game", "🏏"),
427 | ("croatia", "🇭🇷"),
428 | ("crocodile", "🐊"),
429 | ("croissant", "🥐"),
430 | ("crossed_fingers", "🤞"),
431 | ("crossed_flags", "🎌"),
432 | ("crossed_swords", "⚔️"),
433 | ("crown", "👑"),
434 | ("cry", "😢"),
435 | ("crying_cat_face", "😿"),
436 | ("crystal_ball", "🔮"),
437 | ("cuba", "🇨🇺"),
438 | ("cucumber", "🥒"),
439 | ("cup_with_straw", "🥤"),
440 | ("cupcake", "🧁"),
441 | ("cupid", "💘"),
442 | ("curacao", "🇨🇼"),
443 | ("curling_stone", "🥌"),
444 | ("curly_haired_man", "👨🦱"),
445 | ("curly_haired_woman", "👩🦱"),
446 | ("curly_loop", "➰"),
447 | ("currency_exchange", "💱"),
448 | ("curry", "🍛"),
449 | ("cursing_face", "🤬"),
450 | ("custard", "🍮"),
451 | ("customs", "🛃"),
452 | ("cut_of_meat", "🥩"),
453 | ("cyclone", "🌀"),
454 | ("cyprus", "🇨🇾"),
455 | ("czech_republic", "🇨🇿"),
456 | ("dagger", "🗡️"),
457 | ("dancer", "💃"),
458 | ("dancers", "👯"),
459 | ("dancing_men", "👯♂️"),
460 | ("dancing_women", "👯♀️"),
461 | ("dango", "🍡"),
462 | ("dark_sunglasses", "🕶️"),
463 | ("dart", "🎯"),
464 | ("dash", "💨"),
465 | ("date", "📅"),
466 | ("de", "🇩🇪"),
467 | ("deaf_man", "🧏♂️"),
468 | ("deaf_person", "🧏"),
469 | ("deaf_woman", "🧏♀️"),
470 | ("deciduous_tree", "🌳"),
471 | ("deer", "🦌"),
472 | ("denmark", "🇩🇰"),
473 | ("department_store", "🏬"),
474 | ("derelict_house", "🏚️"),
475 | ("desert", "🏜️"),
476 | ("desert_island", "🏝️"),
477 | ("desktop_computer", "🖥️"),
478 | ("detective", "🕵️"),
479 | ("diamond_shape_with_a_dot_inside", "💠"),
480 | ("diamonds", "♦️"),
481 | ("diego_garcia", "🇩🇬"),
482 | ("disappointed", "😞"),
483 | ("disappointed_relieved", "😥"),
484 | ("disguised_face", "🥸"),
485 | ("diving_mask", "🤿"),
486 | ("diya_lamp", "🪔"),
487 | ("dizzy", "💫"),
488 | ("dizzy_face", "😵"),
489 | ("djibouti", "🇩🇯"),
490 | ("dna", "🧬"),
491 | ("do_not_litter", "🚯"),
492 | ("dodo", "🦤"),
493 | ("dog", "🐶"),
494 | ("dog2", "🐕"),
495 | ("dollar", "💵"),
496 | ("dolls", "🎎"),
497 | ("dolphin", "🐬"),
498 | ("dominica", "🇩🇲"),
499 | ("dominican_republic", "🇩🇴"),
500 | ("door", "🚪"),
501 | ("doughnut", "🍩"),
502 | ("dove", "🕊️"),
503 | ("dragon", "🐉"),
504 | ("dragon_face", "🐲"),
505 | ("dress", "👗"),
506 | ("dromedary_camel", "🐪"),
507 | ("drooling_face", "🤤"),
508 | ("drop_of_blood", "🩸"),
509 | ("droplet", "💧"),
510 | ("drum", "🥁"),
511 | ("duck", "🦆"),
512 | ("dumpling", "🥟"),
513 | ("dvd", "📀"),
514 | ("e-mail", "📧"),
515 | ("eagle", "🦅"),
516 | ("ear", "👂"),
517 | ("ear_of_rice", "🌾"),
518 | ("ear_with_hearing_aid", "🦻"),
519 | ("earth_africa", "🌍"),
520 | ("earth_americas", "🌎"),
521 | ("earth_asia", "🌏"),
522 | ("ecuador", "🇪🇨"),
523 | ("egg", "🥚"),
524 | ("eggplant", "🍆"),
525 | ("egypt", "🇪🇬"),
526 | ("eight", "8️⃣"),
527 | ("eight_pointed_black_star", "✴️"),
528 | ("eight_spoked_asterisk", "✳️"),
529 | ("eject_button", "⏏️"),
530 | ("el_salvador", "🇸🇻"),
531 | ("electric_plug", "🔌"),
532 | ("elephant", "🐘"),
533 | ("elevator", "🛗"),
534 | ("elf", "🧝"),
535 | ("elf_man", "🧝♂️"),
536 | ("elf_woman", "🧝♀️"),
537 | ("email", "📧"),
538 | ("end", "🔚"),
539 | ("england", "🏴"),
540 | ("envelope", "✉️"),
541 | ("envelope_with_arrow", "📩"),
542 | ("equatorial_guinea", "🇬🇶"),
543 | ("eritrea", "🇪🇷"),
544 | ("es", "🇪🇸"),
545 | ("estonia", "🇪🇪"),
546 | ("ethiopia", "🇪🇹"),
547 | ("eu", "🇪🇺"),
548 | ("euro", "💶"),
549 | ("european_castle", "🏰"),
550 | ("european_post_office", "🏤"),
551 | ("european_union", "🇪🇺"),
552 | ("evergreen_tree", "🌲"),
553 | ("exclamation", "❗"),
554 | ("exploding_head", "🤯"),
555 | ("expressionless", "😑"),
556 | ("eye", "👁️"),
557 | ("eye_speech_bubble", "👁️🗨️"),
558 | ("eyeglasses", "👓"),
559 | ("eyes", "👀"),
560 | ("face_exhaling", "😮💨"),
561 | ("face_in_clouds", "😶🌫️"),
562 | ("face_with_head_bandage", "🤕"),
563 | ("face_with_spiral_eyes", "😵💫"),
564 | ("face_with_thermometer", "🤒"),
565 | ("facepalm", "🤦"),
566 | ("facepunch", "👊"),
567 | ("factory", "🏭"),
568 | ("factory_worker", "🧑🏭"),
569 | ("fairy", "🧚"),
570 | ("fairy_man", "🧚♂️"),
571 | ("fairy_woman", "🧚♀️"),
572 | ("falafel", "🧆"),
573 | ("falkland_islands", "🇫🇰"),
574 | ("fallen_leaf", "🍂"),
575 | ("family", "👪"),
576 | ("family_man_boy", "👨👦"),
577 | ("family_man_boy_boy", "👨👦👦"),
578 | ("family_man_girl", "👨👧"),
579 | ("family_man_girl_boy", "👨👧👦"),
580 | ("family_man_girl_girl", "👨👧👧"),
581 | ("family_man_man_boy", "👨👨👦"),
582 | ("family_man_man_boy_boy", "👨👨👦👦"),
583 | ("family_man_man_girl", "👨👨👧"),
584 | ("family_man_man_girl_boy", "👨👨👧👦"),
585 | ("family_man_man_girl_girl", "👨👨👧👧"),
586 | ("family_man_woman_boy", "👨👩👦"),
587 | ("family_man_woman_boy_boy", "👨👩👦👦"),
588 | ("family_man_woman_girl", "👨👩👧"),
589 | ("family_man_woman_girl_boy", "👨👩👧👦"),
590 | ("family_man_woman_girl_girl", "👨👩👧👧"),
591 | ("family_woman_boy", "👩👦"),
592 | ("family_woman_boy_boy", "👩👦👦"),
593 | ("family_woman_girl", "👩👧"),
594 | ("family_woman_girl_boy", "👩👧👦"),
595 | ("family_woman_girl_girl", "👩👧👧"),
596 | ("family_woman_woman_boy", "👩👩👦"),
597 | ("family_woman_woman_boy_boy", "👩👩👦👦"),
598 | ("family_woman_woman_girl", "👩👩👧"),
599 | ("family_woman_woman_girl_boy", "👩👩👧👦"),
600 | ("family_woman_woman_girl_girl", "👩👩👧👧"),
601 | ("farmer", "🧑🌾"),
602 | ("faroe_islands", "🇫🇴"),
603 | ("fast_forward", "⏩"),
604 | ("fax", "📠"),
605 | ("fearful", "😨"),
606 | ("feather", "🪶"),
607 | ("feet", "🐾"),
608 | ("female_detective", "🕵️♀️"),
609 | ("female_sign", "♀️"),
610 | ("ferris_wheel", "🎡"),
611 | ("ferry", "⛴️"),
612 | ("field_hockey", "🏑"),
613 | ("fiji", "🇫🇯"),
614 | ("file_cabinet", "🗄️"),
615 | ("file_folder", "📁"),
616 | ("film_projector", "📽️"),
617 | ("film_strip", "🎞️"),
618 | ("finland", "🇫🇮"),
619 | ("fire", "🔥"),
620 | ("fire_engine", "🚒"),
621 | ("fire_extinguisher", "🧯"),
622 | ("firecracker", "🧨"),
623 | ("firefighter", "🧑🚒"),
624 | ("fireworks", "🎆"),
625 | ("first_quarter_moon", "🌓"),
626 | ("first_quarter_moon_with_face", "🌛"),
627 | ("fish", "🐟"),
628 | ("fish_cake", "🍥"),
629 | ("fishing_pole_and_fish", "🎣"),
630 | ("fist", "✊"),
631 | ("fist_left", "🤛"),
632 | ("fist_oncoming", "👊"),
633 | ("fist_raised", "✊"),
634 | ("fist_right", "🤜"),
635 | ("five", "5️⃣"),
636 | ("flags", "🎏"),
637 | ("flamingo", "🦩"),
638 | ("flashlight", "🔦"),
639 | ("flat_shoe", "🥿"),
640 | ("flatbread", "🫓"),
641 | ("fleur_de_lis", "⚜️"),
642 | ("flight_arrival", "🛬"),
643 | ("flight_departure", "🛫"),
644 | ("flipper", "🐬"),
645 | ("floppy_disk", "💾"),
646 | ("flower_playing_cards", "🎴"),
647 | ("flushed", "😳"),
648 | ("fly", "🪰"),
649 | ("flying_disc", "🥏"),
650 | ("flying_saucer", "🛸"),
651 | ("fog", "🌫️"),
652 | ("foggy", "🌁"),
653 | ("fondue", "🫕"),
654 | ("foot", "🦶"),
655 | ("football", "🏈"),
656 | ("footprints", "👣"),
657 | ("fork_and_knife", "🍴"),
658 | ("fortune_cookie", "🥠"),
659 | ("fountain", "⛲"),
660 | ("fountain_pen", "🖋️"),
661 | ("four", "4️⃣"),
662 | ("four_leaf_clover", "🍀"),
663 | ("fox_face", "🦊"),
664 | ("fr", "🇫🇷"),
665 | ("framed_picture", "🖼️"),
666 | ("free", "🆓"),
667 | ("french_guiana", "🇬🇫"),
668 | ("french_polynesia", "🇵🇫"),
669 | ("french_southern_territories", "🇹🇫"),
670 | ("fried_egg", "🍳"),
671 | ("fried_shrimp", "🍤"),
672 | ("fries", "🍟"),
673 | ("frog", "🐸"),
674 | ("frowning", "😦"),
675 | ("frowning_face", "☹️"),
676 | ("frowning_man", "🙍♂️"),
677 | ("frowning_person", "🙍"),
678 | ("frowning_woman", "🙍♀️"),
679 | ("fu", "🖕"),
680 | ("fuelpump", "⛽"),
681 | ("full_moon", "🌕"),
682 | ("full_moon_with_face", "🌝"),
683 | ("funeral_urn", "⚱️"),
684 | ("gabon", "🇬🇦"),
685 | ("gambia", "🇬🇲"),
686 | ("game_die", "🎲"),
687 | ("garlic", "🧄"),
688 | ("gb", "🇬🇧"),
689 | ("gear", "⚙️"),
690 | ("gem", "💎"),
691 | ("gemini", "♊"),
692 | ("genie", "🧞"),
693 | ("genie_man", "🧞♂️"),
694 | ("genie_woman", "🧞♀️"),
695 | ("georgia", "🇬🇪"),
696 | ("ghana", "🇬🇭"),
697 | ("ghost", "👻"),
698 | ("gibraltar", "🇬🇮"),
699 | ("gift", "🎁"),
700 | ("gift_heart", "💝"),
701 | ("giraffe", "🦒"),
702 | ("girl", "👧"),
703 | ("globe_with_meridians", "🌐"),
704 | ("gloves", "🧤"),
705 | ("goal_net", "🥅"),
706 | ("goat", "🐐"),
707 | ("goggles", "🥽"),
708 | ("golf", "⛳"),
709 | ("golfing", "🏌️"),
710 | ("golfing_man", "🏌️♂️"),
711 | ("golfing_woman", "🏌️♀️"),
712 | ("gorilla", "🦍"),
713 | ("grapes", "🍇"),
714 | ("greece", "🇬🇷"),
715 | ("green_apple", "🍏"),
716 | ("green_book", "📗"),
717 | ("green_circle", "🟢"),
718 | ("green_heart", "💚"),
719 | ("green_salad", "🥗"),
720 | ("green_square", "🟩"),
721 | ("greenland", "🇬🇱"),
722 | ("grenada", "🇬🇩"),
723 | ("grey_exclamation", "❕"),
724 | ("grey_question", "❔"),
725 | ("grimacing", "😬"),
726 | ("grin", "😁"),
727 | ("grinning", "😀"),
728 | ("guadeloupe", "🇬🇵"),
729 | ("guam", "🇬🇺"),
730 | ("guard", "💂"),
731 | ("guardsman", "💂♂️"),
732 | ("guardswoman", "💂♀️"),
733 | ("guatemala", "🇬🇹"),
734 | ("guernsey", "🇬🇬"),
735 | ("guide_dog", "🦮"),
736 | ("guinea", "🇬🇳"),
737 | ("guinea_bissau", "🇬🇼"),
738 | ("guitar", "🎸"),
739 | ("gun", "🔫"),
740 | ("guyana", "🇬🇾"),
741 | ("haircut", "💇"),
742 | ("haircut_man", "💇♂️"),
743 | ("haircut_woman", "💇♀️"),
744 | ("haiti", "🇭🇹"),
745 | ("hamburger", "🍔"),
746 | ("hammer", "🔨"),
747 | ("hammer_and_pick", "⚒️"),
748 | ("hammer_and_wrench", "🛠️"),
749 | ("hamster", "🐹"),
750 | ("hand", "✋"),
751 | ("hand_over_mouth", "🤭"),
752 | ("handbag", "👜"),
753 | ("handball_person", "🤾"),
754 | ("handshake", "🤝"),
755 | ("hankey", "💩"),
756 | ("hash", "#️⃣"),
757 | ("hatched_chick", "🐥"),
758 | ("hatching_chick", "🐣"),
759 | ("headphones", "🎧"),
760 | ("headstone", "🪦"),
761 | ("health_worker", "🧑⚕️"),
762 | ("hear_no_evil", "🙉"),
763 | ("heard_mcdonald_islands", "🇭🇲"),
764 | ("heart", "❤️"),
765 | ("heart_decoration", "💟"),
766 | ("heart_eyes", "😍"),
767 | ("heart_eyes_cat", "😻"),
768 | ("heart_on_fire", "❤️🔥"),
769 | ("heartbeat", "💓"),
770 | ("heartpulse", "💗"),
771 | ("hearts", "♥️"),
772 | ("heavy_check_mark", "✔️"),
773 | ("heavy_division_sign", "➗"),
774 | ("heavy_dollar_sign", "💲"),
775 | ("heavy_exclamation_mark", "❗"),
776 | ("heavy_heart_exclamation", "❣️"),
777 | ("heavy_minus_sign", "➖"),
778 | ("heavy_multiplication_x", "✖️"),
779 | ("heavy_plus_sign", "➕"),
780 | ("hedgehog", "🦔"),
781 | ("helicopter", "🚁"),
782 | ("herb", "🌿"),
783 | ("hibiscus", "🌺"),
784 | ("high_brightness", "🔆"),
785 | ("high_heel", "👠"),
786 | ("hiking_boot", "🥾"),
787 | ("hindu_temple", "🛕"),
788 | ("hippopotamus", "🦛"),
789 | ("hocho", "🔪"),
790 | ("hole", "🕳️"),
791 | ("honduras", "🇭🇳"),
792 | ("honey_pot", "🍯"),
793 | ("honeybee", "🐝"),
794 | ("hong_kong", "🇭🇰"),
795 | ("hook", "🪝"),
796 | ("horse", "🐴"),
797 | ("horse_racing", "🏇"),
798 | ("hospital", "🏥"),
799 | ("hot_face", "🥵"),
800 | ("hot_pepper", "🌶️"),
801 | ("hotdog", "🌭"),
802 | ("hotel", "🏨"),
803 | ("hotsprings", "♨️"),
804 | ("hourglass", "⌛"),
805 | ("hourglass_flowing_sand", "⏳"),
806 | ("house", "🏠"),
807 | ("house_with_garden", "🏡"),
808 | ("houses", "🏘️"),
809 | ("hugs", "🤗"),
810 | ("hungary", "🇭🇺"),
811 | ("hushed", "😯"),
812 | ("hut", "🛖"),
813 | ("ice_cream", "🍨"),
814 | ("ice_cube", "🧊"),
815 | ("ice_hockey", "🏒"),
816 | ("ice_skate", "⛸️"),
817 | ("icecream", "🍦"),
818 | ("iceland", "🇮🇸"),
819 | ("id", "🆔"),
820 | ("ideograph_advantage", "🉐"),
821 | ("imp", "👿"),
822 | ("inbox_tray", "📥"),
823 | ("incoming_envelope", "📨"),
824 | ("india", "🇮🇳"),
825 | ("indonesia", "🇮🇩"),
826 | ("infinity", "♾️"),
827 | ("information_desk_person", "💁"),
828 | ("information_source", "ℹ️"),
829 | ("innocent", "😇"),
830 | ("interrobang", "⁉️"),
831 | ("iphone", "📱"),
832 | ("iran", "🇮🇷"),
833 | ("iraq", "🇮🇶"),
834 | ("ireland", "🇮🇪"),
835 | ("isle_of_man", "🇮🇲"),
836 | ("israel", "🇮🇱"),
837 | ("it", "🇮🇹"),
838 | ("izakaya_lantern", "🏮"),
839 | ("jack_o_lantern", "🎃"),
840 | ("jamaica", "🇯🇲"),
841 | ("japan", "🗾"),
842 | ("japanese_castle", "🏯"),
843 | ("japanese_goblin", "👺"),
844 | ("japanese_ogre", "👹"),
845 | ("jeans", "👖"),
846 | ("jersey", "🇯🇪"),
847 | ("jigsaw", "🧩"),
848 | ("jordan", "🇯🇴"),
849 | ("joy", "😂"),
850 | ("joy_cat", "😹"),
851 | ("joystick", "🕹️"),
852 | ("jp", "🇯🇵"),
853 | ("judge", "🧑⚖️"),
854 | ("juggling_person", "🤹"),
855 | ("kaaba", "🕋"),
856 | ("kangaroo", "🦘"),
857 | ("kazakhstan", "🇰🇿"),
858 | ("kenya", "🇰🇪"),
859 | ("key", "🔑"),
860 | ("keyboard", "⌨️"),
861 | ("keycap_ten", "🔟"),
862 | ("kick_scooter", "🛴"),
863 | ("kimono", "👘"),
864 | ("kiribati", "🇰🇮"),
865 | ("kiss", "💋"),
866 | ("kissing", "😗"),
867 | ("kissing_cat", "😽"),
868 | ("kissing_closed_eyes", "😚"),
869 | ("kissing_heart", "😘"),
870 | ("kissing_smiling_eyes", "😙"),
871 | ("kite", "🪁"),
872 | ("kiwi_fruit", "🥝"),
873 | ("kneeling_man", "🧎♂️"),
874 | ("kneeling_person", "🧎"),
875 | ("kneeling_woman", "🧎♀️"),
876 | ("knife", "🔪"),
877 | ("knot", "🪢"),
878 | ("koala", "🐨"),
879 | ("koko", "🈁"),
880 | ("kosovo", "🇽🇰"),
881 | ("kr", "🇰🇷"),
882 | ("kuwait", "🇰🇼"),
883 | ("kyrgyzstan", "🇰🇬"),
884 | ("lab_coat", "🥼"),
885 | ("label", "🏷️"),
886 | ("lacrosse", "🥍"),
887 | ("ladder", "🪜"),
888 | ("lady_beetle", "🐞"),
889 | ("lantern", "🏮"),
890 | ("laos", "🇱🇦"),
891 | ("large_blue_circle", "🔵"),
892 | ("large_blue_diamond", "🔷"),
893 | ("large_orange_diamond", "🔶"),
894 | ("last_quarter_moon", "🌗"),
895 | ("last_quarter_moon_with_face", "🌜"),
896 | ("latin_cross", "✝️"),
897 | ("latvia", "🇱🇻"),
898 | ("laughing", "😆"),
899 | ("leafy_green", "🥬"),
900 | ("leaves", "🍃"),
901 | ("lebanon", "🇱🇧"),
902 | ("ledger", "📒"),
903 | ("left_luggage", "🛅"),
904 | ("left_right_arrow", "↔️"),
905 | ("left_speech_bubble", "🗨️"),
906 | ("leftwards_arrow_with_hook", "↩️"),
907 | ("leg", "🦵"),
908 | ("lemon", "🍋"),
909 | ("leo", "♌"),
910 | ("leopard", "🐆"),
911 | ("lesotho", "🇱🇸"),
912 | ("level_slider", "🎚️"),
913 | ("liberia", "🇱🇷"),
914 | ("libra", "♎"),
915 | ("libya", "🇱🇾"),
916 | ("liechtenstein", "🇱🇮"),
917 | ("light_rail", "🚈"),
918 | ("link", "🔗"),
919 | ("lion", "🦁"),
920 | ("lips", "👄"),
921 | ("lipstick", "💄"),
922 | ("lithuania", "🇱🇹"),
923 | ("lizard", "🦎"),
924 | ("llama", "🦙"),
925 | ("lobster", "🦞"),
926 | ("lock", "🔒"),
927 | ("lock_with_ink_pen", "🔏"),
928 | ("lollipop", "🍭"),
929 | ("long_drum", "🪘"),
930 | ("loop", "➿"),
931 | ("lotion_bottle", "🧴"),
932 | ("lotus_position", "🧘"),
933 | ("lotus_position_man", "🧘♂️"),
934 | ("lotus_position_woman", "🧘♀️"),
935 | ("loud_sound", "🔊"),
936 | ("loudspeaker", "📢"),
937 | ("love_hotel", "🏩"),
938 | ("love_letter", "💌"),
939 | ("love_you_gesture", "🤟"),
940 | ("low_brightness", "🔅"),
941 | ("luggage", "🧳"),
942 | ("lungs", "🫁"),
943 | ("luxembourg", "🇱🇺"),
944 | ("lying_face", "🤥"),
945 | ("m", "Ⓜ️"),
946 | ("macau", "🇲🇴"),
947 | ("macedonia", "🇲🇰"),
948 | ("madagascar", "🇲🇬"),
949 | ("mag", "🔍"),
950 | ("mag_right", "🔎"),
951 | ("mage", "🧙"),
952 | ("mage_man", "🧙♂️"),
953 | ("mage_woman", "🧙♀️"),
954 | ("magic_wand", "🪄"),
955 | ("magnet", "🧲"),
956 | ("mahjong", "🀄"),
957 | ("mailbox", "📫"),
958 | ("mailbox_closed", "📪"),
959 | ("mailbox_with_mail", "📬"),
960 | ("mailbox_with_no_mail", "📭"),
961 | ("malawi", "🇲🇼"),
962 | ("malaysia", "🇲🇾"),
963 | ("maldives", "🇲🇻"),
964 | ("male_detective", "🕵️♂️"),
965 | ("male_sign", "♂️"),
966 | ("mali", "🇲🇱"),
967 | ("malta", "🇲🇹"),
968 | ("mammoth", "🦣"),
969 | ("man", "👨"),
970 | ("man_artist", "👨🎨"),
971 | ("man_astronaut", "👨🚀"),
972 | ("man_beard", "🧔♂️"),
973 | ("man_cartwheeling", "🤸♂️"),
974 | ("man_cook", "👨🍳"),
975 | ("man_dancing", "🕺"),
976 | ("man_facepalming", "🤦♂️"),
977 | ("man_factory_worker", "👨🏭"),
978 | ("man_farmer", "👨🌾"),
979 | ("man_feeding_baby", "👨🍼"),
980 | ("man_firefighter", "👨🚒"),
981 | ("man_health_worker", "👨⚕️"),
982 | ("man_in_manual_wheelchair", "👨🦽"),
983 | ("man_in_motorized_wheelchair", "👨🦼"),
984 | ("man_in_tuxedo", "🤵♂️"),
985 | ("man_judge", "👨⚖️"),
986 | ("man_juggling", "🤹♂️"),
987 | ("man_mechanic", "👨🔧"),
988 | ("man_office_worker", "👨💼"),
989 | ("man_pilot", "👨✈️"),
990 | ("man_playing_handball", "🤾♂️"),
991 | ("man_playing_water_polo", "🤽♂️"),
992 | ("man_scientist", "👨🔬"),
993 | ("man_shrugging", "🤷♂️"),
994 | ("man_singer", "👨🎤"),
995 | ("man_student", "👨🎓"),
996 | ("man_teacher", "👨🏫"),
997 | ("man_technologist", "👨💻"),
998 | ("man_with_gua_pi_mao", "👲"),
999 | ("man_with_probing_cane", "👨🦯"),
1000 | ("man_with_turban", "👳♂️"),
1001 | ("man_with_veil", "👰♂️"),
1002 | ("mandarin", "🍊"),
1003 | ("mango", "🥭"),
1004 | ("mans_shoe", "👞"),
1005 | ("mantelpiece_clock", "🕰️"),
1006 | ("manual_wheelchair", "🦽"),
1007 | ("maple_leaf", "🍁"),
1008 | ("marshall_islands", "🇲🇭"),
1009 | ("martial_arts_uniform", "🥋"),
1010 | ("martinique", "🇲🇶"),
1011 | ("mask", "😷"),
1012 | ("massage", "💆"),
1013 | ("massage_man", "💆♂️"),
1014 | ("massage_woman", "💆♀️"),
1015 | ("mate", "🧉"),
1016 | ("mauritania", "🇲🇷"),
1017 | ("mauritius", "🇲🇺"),
1018 | ("mayotte", "🇾🇹"),
1019 | ("meat_on_bone", "🍖"),
1020 | ("mechanic", "🧑🔧"),
1021 | ("mechanical_arm", "🦾"),
1022 | ("mechanical_leg", "🦿"),
1023 | ("medal_military", "🎖️"),
1024 | ("medal_sports", "🏅"),
1025 | ("medical_symbol", "⚕️"),
1026 | ("mega", "📣"),
1027 | ("melon", "🍈"),
1028 | ("memo", "📝"),
1029 | ("men_wrestling", "🤼♂️"),
1030 | ("mending_heart", "❤️🩹"),
1031 | ("menorah", "🕎"),
1032 | ("mens", "🚹"),
1033 | ("mermaid", "🧜♀️"),
1034 | ("merman", "🧜♂️"),
1035 | ("merperson", "🧜"),
1036 | ("metal", "🤘"),
1037 | ("metro", "🚇"),
1038 | ("mexico", "🇲🇽"),
1039 | ("microbe", "🦠"),
1040 | ("micronesia", "🇫🇲"),
1041 | ("microphone", "🎤"),
1042 | ("microscope", "🔬"),
1043 | ("middle_finger", "🖕"),
1044 | ("military_helmet", "🪖"),
1045 | ("milk_glass", "🥛"),
1046 | ("milky_way", "🌌"),
1047 | ("minibus", "🚐"),
1048 | ("minidisc", "💽"),
1049 | ("mirror", "🪞"),
1050 | ("mobile_phone_off", "📴"),
1051 | ("moldova", "🇲🇩"),
1052 | ("monaco", "🇲🇨"),
1053 | ("money_mouth_face", "🤑"),
1054 | ("money_with_wings", "💸"),
1055 | ("moneybag", "💰"),
1056 | ("mongolia", "🇲🇳"),
1057 | ("monkey", "🐒"),
1058 | ("monkey_face", "🐵"),
1059 | ("monocle_face", "🧐"),
1060 | ("monorail", "🚝"),
1061 | ("montenegro", "🇲🇪"),
1062 | ("montserrat", "🇲🇸"),
1063 | ("moon", "🌔"),
1064 | ("moon_cake", "🥮"),
1065 | ("morocco", "🇲🇦"),
1066 | ("mortar_board", "🎓"),
1067 | ("mosque", "🕌"),
1068 | ("mosquito", "🦟"),
1069 | ("motor_boat", "🛥️"),
1070 | ("motor_scooter", "🛵"),
1071 | ("motorcycle", "🏍️"),
1072 | ("motorized_wheelchair", "🦼"),
1073 | ("motorway", "🛣️"),
1074 | ("mount_fuji", "🗻"),
1075 | ("mountain", "⛰️"),
1076 | ("mountain_bicyclist", "🚵"),
1077 | ("mountain_biking_man", "🚵♂️"),
1078 | ("mountain_biking_woman", "🚵♀️"),
1079 | ("mountain_cableway", "🚠"),
1080 | ("mountain_railway", "🚞"),
1081 | ("mountain_snow", "🏔️"),
1082 | ("mouse", "🐭"),
1083 | ("mouse2", "🐁"),
1084 | ("mouse_trap", "🪤"),
1085 | ("movie_camera", "🎥"),
1086 | ("moyai", "🗿"),
1087 | ("mozambique", "🇲🇿"),
1088 | ("mrs_claus", "🤶"),
1089 | ("muscle", "💪"),
1090 | ("mushroom", "🍄"),
1091 | ("musical_keyboard", "🎹"),
1092 | ("musical_note", "🎵"),
1093 | ("musical_score", "🎼"),
1094 | ("mute", "🔇"),
1095 | ("mx_claus", "🧑🎄"),
1096 | ("myanmar", "🇲🇲"),
1097 | ("nail_care", "💅"),
1098 | ("name_badge", "📛"),
1099 | ("namibia", "🇳🇦"),
1100 | ("national_park", "🏞️"),
1101 | ("nauru", "🇳🇷"),
1102 | ("nauseated_face", "🤢"),
1103 | ("nazar_amulet", "🧿"),
1104 | ("necktie", "👔"),
1105 | ("negative_squared_cross_mark", "❎"),
1106 | ("nepal", "🇳🇵"),
1107 | ("nerd_face", "🤓"),
1108 | ("nesting_dolls", "🪆"),
1109 | ("netherlands", "🇳🇱"),
1110 | ("neutral_face", "😐"),
1111 | ("new", "🆕"),
1112 | ("new_caledonia", "🇳🇨"),
1113 | ("new_moon", "🌑"),
1114 | ("new_moon_with_face", "🌚"),
1115 | ("new_zealand", "🇳🇿"),
1116 | ("newspaper", "📰"),
1117 | ("newspaper_roll", "🗞️"),
1118 | ("next_track_button", "⏭️"),
1119 | ("ng", "🆖"),
1120 | ("ng_man", "🙅♂️"),
1121 | ("ng_woman", "🙅♀️"),
1122 | ("nicaragua", "🇳🇮"),
1123 | ("niger", "🇳🇪"),
1124 | ("nigeria", "🇳🇬"),
1125 | ("night_with_stars", "🌃"),
1126 | ("nine", "9️⃣"),
1127 | ("ninja", "🥷"),
1128 | ("niue", "🇳🇺"),
1129 | ("no_bell", "🔕"),
1130 | ("no_bicycles", "🚳"),
1131 | ("no_entry", "⛔"),
1132 | ("no_entry_sign", "🚫"),
1133 | ("no_good", "🙅"),
1134 | ("no_good_man", "🙅♂️"),
1135 | ("no_good_woman", "🙅♀️"),
1136 | ("no_mobile_phones", "📵"),
1137 | ("no_mouth", "😶"),
1138 | ("no_pedestrians", "🚷"),
1139 | ("no_smoking", "🚭"),
1140 | ("non-potable_water", "🚱"),
1141 | ("norfolk_island", "🇳🇫"),
1142 | ("north_korea", "🇰🇵"),
1143 | ("northern_mariana_islands", "🇲🇵"),
1144 | ("norway", "🇳🇴"),
1145 | ("nose", "👃"),
1146 | ("notebook", "📓"),
1147 | ("notebook_with_decorative_cover", "📔"),
1148 | ("notes", "🎶"),
1149 | ("nut_and_bolt", "🔩"),
1150 | ("o", "⭕"),
1151 | ("o2", "🅾️"),
1152 | ("ocean", "🌊"),
1153 | ("octopus", "🐙"),
1154 | ("oden", "🍢"),
1155 | ("office", "🏢"),
1156 | ("office_worker", "🧑💼"),
1157 | ("oil_drum", "🛢️"),
1158 | ("ok", "🆗"),
1159 | ("ok_hand", "👌"),
1160 | ("ok_man", "🙆♂️"),
1161 | ("ok_person", "🙆"),
1162 | ("ok_woman", "🙆♀️"),
1163 | ("old_key", "🗝️"),
1164 | ("older_adult", "🧓"),
1165 | ("older_man", "👴"),
1166 | ("older_woman", "👵"),
1167 | ("olive", "🫒"),
1168 | ("om", "🕉️"),
1169 | ("oman", "🇴🇲"),
1170 | ("on", "🔛"),
1171 | ("oncoming_automobile", "🚘"),
1172 | ("oncoming_bus", "🚍"),
1173 | ("oncoming_police_car", "🚔"),
1174 | ("oncoming_taxi", "🚖"),
1175 | ("one", "1️⃣"),
1176 | ("one_piece_swimsuit", "🩱"),
1177 | ("onion", "🧅"),
1178 | ("open_book", "📖"),
1179 | ("open_file_folder", "📂"),
1180 | ("open_hands", "👐"),
1181 | ("open_mouth", "😮"),
1182 | ("open_umbrella", "☂️"),
1183 | ("ophiuchus", "⛎"),
1184 | ("orange", "🍊"),
1185 | ("orange_book", "📙"),
1186 | ("orange_circle", "🟠"),
1187 | ("orange_heart", "🧡"),
1188 | ("orange_square", "🟧"),
1189 | ("orangutan", "🦧"),
1190 | ("orthodox_cross", "☦️"),
1191 | ("otter", "🦦"),
1192 | ("outbox_tray", "📤"),
1193 | ("owl", "🦉"),
1194 | ("ox", "🐂"),
1195 | ("oyster", "🦪"),
1196 | ("package", "📦"),
1197 | ("page_facing_up", "📄"),
1198 | ("page_with_curl", "📃"),
1199 | ("pager", "📟"),
1200 | ("paintbrush", "🖌️"),
1201 | ("pakistan", "🇵🇰"),
1202 | ("palau", "🇵🇼"),
1203 | ("palestinian_territories", "🇵🇸"),
1204 | ("palm_tree", "🌴"),
1205 | ("palms_up_together", "🤲"),
1206 | ("panama", "🇵🇦"),
1207 | ("pancakes", "🥞"),
1208 | ("panda_face", "🐼"),
1209 | ("paperclip", "📎"),
1210 | ("paperclips", "🖇️"),
1211 | ("papua_new_guinea", "🇵🇬"),
1212 | ("parachute", "🪂"),
1213 | ("paraguay", "🇵🇾"),
1214 | ("parasol_on_ground", "⛱️"),
1215 | ("parking", "🅿️"),
1216 | ("parrot", "🦜"),
1217 | ("part_alternation_mark", "〽️"),
1218 | ("partly_sunny", "⛅"),
1219 | ("partying_face", "🥳"),
1220 | ("passenger_ship", "🛳️"),
1221 | ("passport_control", "🛂"),
1222 | ("pause_button", "⏸️"),
1223 | ("paw_prints", "🐾"),
1224 | ("peace_symbol", "☮️"),
1225 | ("peach", "🍑"),
1226 | ("peacock", "🦚"),
1227 | ("peanuts", "🥜"),
1228 | ("pear", "🍐"),
1229 | ("pen", "🖊️"),
1230 | ("pencil", "📝"),
1231 | ("pencil2", "✏️"),
1232 | ("penguin", "🐧"),
1233 | ("pensive", "😔"),
1234 | ("people_holding_hands", "🧑🤝🧑"),
1235 | ("people_hugging", "🫂"),
1236 | ("performing_arts", "🎭"),
1237 | ("persevere", "😣"),
1238 | ("person_bald", "🧑🦲"),
1239 | ("person_curly_hair", "🧑🦱"),
1240 | ("person_feeding_baby", "🧑🍼"),
1241 | ("person_fencing", "🤺"),
1242 | ("person_in_manual_wheelchair", "🧑🦽"),
1243 | ("person_in_motorized_wheelchair", "🧑🦼"),
1244 | ("person_in_tuxedo", "🤵"),
1245 | ("person_red_hair", "🧑🦰"),
1246 | ("person_white_hair", "🧑🦳"),
1247 | ("person_with_probing_cane", "🧑🦯"),
1248 | ("person_with_turban", "👳"),
1249 | ("person_with_veil", "👰"),
1250 | ("peru", "🇵🇪"),
1251 | ("petri_dish", "🧫"),
1252 | ("philippines", "🇵🇭"),
1253 | ("phone", "☎️"),
1254 | ("pick", "⛏️"),
1255 | ("pickup_truck", "🛻"),
1256 | ("pie", "🥧"),
1257 | ("pig", "🐷"),
1258 | ("pig2", "🐖"),
1259 | ("pig_nose", "🐽"),
1260 | ("pill", "💊"),
1261 | ("pilot", "🧑✈️"),
1262 | ("pinata", "🪅"),
1263 | ("pinched_fingers", "🤌"),
1264 | ("pinching_hand", "🤏"),
1265 | ("pineapple", "🍍"),
1266 | ("ping_pong", "🏓"),
1267 | ("pirate_flag", "🏴☠️"),
1268 | ("pisces", "♓"),
1269 | ("pitcairn_islands", "🇵🇳"),
1270 | ("pizza", "🍕"),
1271 | ("placard", "🪧"),
1272 | ("place_of_worship", "🛐"),
1273 | ("plate_with_cutlery", "🍽️"),
1274 | ("play_or_pause_button", "⏯️"),
1275 | ("pleading_face", "🥺"),
1276 | ("plunger", "🪠"),
1277 | ("point_down", "👇"),
1278 | ("point_left", "👈"),
1279 | ("point_right", "👉"),
1280 | ("point_up", "☝️"),
1281 | ("point_up_2", "👆"),
1282 | ("poland", "🇵🇱"),
1283 | ("polar_bear", "🐻❄️"),
1284 | ("police_car", "🚓"),
1285 | ("police_officer", "👮"),
1286 | ("policeman", "👮♂️"),
1287 | ("policewoman", "👮♀️"),
1288 | ("poodle", "🐩"),
1289 | ("poop", "💩"),
1290 | ("popcorn", "🍿"),
1291 | ("portugal", "🇵🇹"),
1292 | ("post_office", "🏣"),
1293 | ("postal_horn", "📯"),
1294 | ("postbox", "📮"),
1295 | ("potable_water", "🚰"),
1296 | ("potato", "🥔"),
1297 | ("potted_plant", "🪴"),
1298 | ("pouch", "👝"),
1299 | ("poultry_leg", "🍗"),
1300 | ("pound", "💷"),
1301 | ("pout", "😡"),
1302 | ("pouting_cat", "😾"),
1303 | ("pouting_face", "🙎"),
1304 | ("pouting_man", "🙎♂️"),
1305 | ("pouting_woman", "🙎♀️"),
1306 | ("pray", "🙏"),
1307 | ("prayer_beads", "📿"),
1308 | ("pregnant_woman", "🤰"),
1309 | ("pretzel", "🥨"),
1310 | ("previous_track_button", "⏮️"),
1311 | ("prince", "🤴"),
1312 | ("princess", "👸"),
1313 | ("printer", "🖨️"),
1314 | ("probing_cane", "🦯"),
1315 | ("puerto_rico", "🇵🇷"),
1316 | ("punch", "👊"),
1317 | ("purple_circle", "🟣"),
1318 | ("purple_heart", "💜"),
1319 | ("purple_square", "🟪"),
1320 | ("purse", "👛"),
1321 | ("pushpin", "📌"),
1322 | ("put_litter_in_its_place", "🚮"),
1323 | ("qatar", "🇶🇦"),
1324 | ("question", "❓"),
1325 | ("rabbit", "🐰"),
1326 | ("rabbit2", "🐇"),
1327 | ("raccoon", "🦝"),
1328 | ("racehorse", "🐎"),
1329 | ("racing_car", "🏎️"),
1330 | ("radio", "📻"),
1331 | ("radio_button", "🔘"),
1332 | ("radioactive", "☢️"),
1333 | ("rage", "😡"),
1334 | ("railway_car", "🚃"),
1335 | ("railway_track", "🛤️"),
1336 | ("rainbow", "🌈"),
1337 | ("rainbow_flag", "🏳️🌈"),
1338 | ("raised_back_of_hand", "🤚"),
1339 | ("raised_eyebrow", "🤨"),
1340 | ("raised_hand", "✋"),
1341 | ("raised_hand_with_fingers_splayed", "🖐️"),
1342 | ("raised_hands", "🙌"),
1343 | ("raising_hand", "🙋"),
1344 | ("raising_hand_man", "🙋♂️"),
1345 | ("raising_hand_woman", "🙋♀️"),
1346 | ("ram", "🐏"),
1347 | ("ramen", "🍜"),
1348 | ("rat", "🐀"),
1349 | ("razor", "🪒"),
1350 | ("receipt", "🧾"),
1351 | ("record_button", "⏺️"),
1352 | ("recycle", "♻️"),
1353 | ("red_car", "🚗"),
1354 | ("red_circle", "🔴"),
1355 | ("red_envelope", "🧧"),
1356 | ("red_haired_man", "👨🦰"),
1357 | ("red_haired_woman", "👩🦰"),
1358 | ("red_square", "🟥"),
1359 | ("registered", "®️"),
1360 | ("relaxed", "☺️"),
1361 | ("relieved", "😌"),
1362 | ("reminder_ribbon", "🎗️"),
1363 | ("repeat", "🔁"),
1364 | ("repeat_one", "🔂"),
1365 | ("rescue_worker_helmet", "⛑️"),
1366 | ("restroom", "🚻"),
1367 | ("reunion", "🇷🇪"),
1368 | ("revolving_hearts", "💞"),
1369 | ("rewind", "⏪"),
1370 | ("rhinoceros", "🦏"),
1371 | ("ribbon", "🎀"),
1372 | ("rice", "🍚"),
1373 | ("rice_ball", "🍙"),
1374 | ("rice_cracker", "🍘"),
1375 | ("rice_scene", "🎑"),
1376 | ("right_anger_bubble", "🗯️"),
1377 | ("ring", "💍"),
1378 | ("ringed_planet", "🪐"),
1379 | ("robot", "🤖"),
1380 | ("rock", "🪨"),
1381 | ("rocket", "🚀"),
1382 | ("rofl", "🤣"),
1383 | ("roll_eyes", "🙄"),
1384 | ("roll_of_paper", "🧻"),
1385 | ("roller_coaster", "🎢"),
1386 | ("roller_skate", "🛼"),
1387 | ("romania", "🇷🇴"),
1388 | ("rooster", "🐓"),
1389 | ("rose", "🌹"),
1390 | ("rosette", "🏵️"),
1391 | ("rotating_light", "🚨"),
1392 | ("round_pushpin", "📍"),
1393 | ("rowboat", "🚣"),
1394 | ("rowing_man", "🚣♂️"),
1395 | ("rowing_woman", "🚣♀️"),
1396 | ("ru", "🇷🇺"),
1397 | ("rugby_football", "🏉"),
1398 | ("runner", "🏃"),
1399 | ("running", "🏃"),
1400 | ("running_man", "🏃♂️"),
1401 | ("running_shirt_with_sash", "🎽"),
1402 | ("running_woman", "🏃♀️"),
1403 | ("rwanda", "🇷🇼"),
1404 | ("sa", "🈂️"),
1405 | ("safety_pin", "🧷"),
1406 | ("safety_vest", "🦺"),
1407 | ("sagittarius", "♐"),
1408 | ("sailboat", "⛵"),
1409 | ("sake", "🍶"),
1410 | ("salt", "🧂"),
1411 | ("samoa", "🇼🇸"),
1412 | ("san_marino", "🇸🇲"),
1413 | ("sandal", "👡"),
1414 | ("sandwich", "🥪"),
1415 | ("santa", "🎅"),
1416 | ("sao_tome_principe", "🇸🇹"),
1417 | ("sari", "🥻"),
1418 | ("sassy_man", "💁♂️"),
1419 | ("sassy_woman", "💁♀️"),
1420 | ("satellite", "📡"),
1421 | ("satisfied", "😆"),
1422 | ("saudi_arabia", "🇸🇦"),
1423 | ("sauna_man", "🧖♂️"),
1424 | ("sauna_person", "🧖"),
1425 | ("sauna_woman", "🧖♀️"),
1426 | ("sauropod", "🦕"),
1427 | ("saxophone", "🎷"),
1428 | ("scarf", "🧣"),
1429 | ("school", "🏫"),
1430 | ("school_satchel", "🎒"),
1431 | ("scientist", "🧑🔬"),
1432 | ("scissors", "✂️"),
1433 | ("scorpion", "🦂"),
1434 | ("scorpius", "♏"),
1435 | ("scotland", "🏴"),
1436 | ("scream", "😱"),
1437 | ("scream_cat", "🙀"),
1438 | ("screwdriver", "🪛"),
1439 | ("scroll", "📜"),
1440 | ("seal", "🦭"),
1441 | ("seat", "💺"),
1442 | ("secret", "㊙️"),
1443 | ("see_no_evil", "🙈"),
1444 | ("seedling", "🌱"),
1445 | ("selfie", "🤳"),
1446 | ("senegal", "🇸🇳"),
1447 | ("serbia", "🇷🇸"),
1448 | ("service_dog", "🐕🦺"),
1449 | ("seven", "7️⃣"),
1450 | ("sewing_needle", "🪡"),
1451 | ("seychelles", "🇸🇨"),
1452 | ("shallow_pan_of_food", "🥘"),
1453 | ("shamrock", "☘️"),
1454 | ("shark", "🦈"),
1455 | ("shaved_ice", "🍧"),
1456 | ("sheep", "🐑"),
1457 | ("shell", "🐚"),
1458 | ("shield", "🛡️"),
1459 | ("shinto_shrine", "⛩️"),
1460 | ("ship", "🚢"),
1461 | ("shirt", "👕"),
1462 | ("shit", "💩"),
1463 | ("shoe", "👞"),
1464 | ("shopping", "🛍️"),
1465 | ("shopping_cart", "🛒"),
1466 | ("shorts", "🩳"),
1467 | ("shower", "🚿"),
1468 | ("shrimp", "🦐"),
1469 | ("shrug", "🤷"),
1470 | ("shushing_face", "🤫"),
1471 | ("sierra_leone", "🇸🇱"),
1472 | ("signal_strength", "📶"),
1473 | ("singapore", "🇸🇬"),
1474 | ("singer", "🧑🎤"),
1475 | ("sint_maarten", "🇸🇽"),
1476 | ("six", "6️⃣"),
1477 | ("six_pointed_star", "🔯"),
1478 | ("skateboard", "🛹"),
1479 | ("ski", "🎿"),
1480 | ("skier", "⛷️"),
1481 | ("skull", "💀"),
1482 | ("skull_and_crossbones", "☠️"),
1483 | ("skunk", "🦨"),
1484 | ("sled", "🛷"),
1485 | ("sleeping", "😴"),
1486 | ("sleeping_bed", "🛌"),
1487 | ("sleepy", "😪"),
1488 | ("slightly_frowning_face", "🙁"),
1489 | ("slightly_smiling_face", "🙂"),
1490 | ("slot_machine", "🎰"),
1491 | ("sloth", "🦥"),
1492 | ("slovakia", "🇸🇰"),
1493 | ("slovenia", "🇸🇮"),
1494 | ("small_airplane", "🛩️"),
1495 | ("small_blue_diamond", "🔹"),
1496 | ("small_orange_diamond", "🔸"),
1497 | ("small_red_triangle", "🔺"),
1498 | ("small_red_triangle_down", "🔻"),
1499 | ("smile", "😄"),
1500 | ("smile_cat", "😸"),
1501 | ("smiley", "😃"),
1502 | ("smiley_cat", "😺"),
1503 | ("smiling_face_with_tear", "🥲"),
1504 | ("smiling_face_with_three_hearts", "🥰"),
1505 | ("smiling_imp", "😈"),
1506 | ("smirk", "😏"),
1507 | ("smirk_cat", "😼"),
1508 | ("smoking", "🚬"),
1509 | ("snail", "🐌"),
1510 | ("snake", "🐍"),
1511 | ("sneezing_face", "🤧"),
1512 | ("snowboarder", "🏂"),
1513 | ("snowflake", "❄️"),
1514 | ("snowman", "⛄"),
1515 | ("snowman_with_snow", "☃️"),
1516 | ("soap", "🧼"),
1517 | ("sob", "😭"),
1518 | ("soccer", "⚽"),
1519 | ("socks", "🧦"),
1520 | ("softball", "🥎"),
1521 | ("solomon_islands", "🇸🇧"),
1522 | ("somalia", "🇸🇴"),
1523 | ("soon", "🔜"),
1524 | ("sos", "🆘"),
1525 | ("sound", "🔉"),
1526 | ("south_africa", "🇿🇦"),
1527 | ("south_georgia_south_sandwich_islands", "🇬🇸"),
1528 | ("south_sudan", "🇸🇸"),
1529 | ("space_invader", "👾"),
1530 | ("spades", "♠️"),
1531 | ("spaghetti", "🍝"),
1532 | ("sparkle", "❇️"),
1533 | ("sparkler", "🎇"),
1534 | ("sparkles", "✨"),
1535 | ("sparkling_heart", "💖"),
1536 | ("speak_no_evil", "🙊"),
1537 | ("speaker", "🔈"),
1538 | ("speaking_head", "🗣️"),
1539 | ("speech_balloon", "💬"),
1540 | ("speedboat", "🚤"),
1541 | ("spider", "🕷️"),
1542 | ("spider_web", "🕸️"),
1543 | ("spiral_calendar", "🗓️"),
1544 | ("spiral_notepad", "🗒️"),
1545 | ("sponge", "🧽"),
1546 | ("spoon", "🥄"),
1547 | ("squid", "🦑"),
1548 | ("sri_lanka", "🇱🇰"),
1549 | ("st_barthelemy", "🇧🇱"),
1550 | ("st_helena", "🇸🇭"),
1551 | ("st_kitts_nevis", "🇰🇳"),
1552 | ("st_lucia", "🇱🇨"),
1553 | ("st_martin", "🇲🇫"),
1554 | ("st_pierre_miquelon", "🇵🇲"),
1555 | ("st_vincent_grenadines", "🇻🇨"),
1556 | ("stadium", "🏟️"),
1557 | ("standing_man", "🧍♂️"),
1558 | ("standing_person", "🧍"),
1559 | ("standing_woman", "🧍♀️"),
1560 | ("star", "⭐"),
1561 | ("star2", "🌟"),
1562 | ("star_and_crescent", "☪️"),
1563 | ("star_of_david", "✡️"),
1564 | ("star_struck", "🤩"),
1565 | ("stars", "🌠"),
1566 | ("station", "🚉"),
1567 | ("statue_of_liberty", "🗽"),
1568 | ("steam_locomotive", "🚂"),
1569 | ("stethoscope", "🩺"),
1570 | ("stew", "🍲"),
1571 | ("stop_button", "⏹️"),
1572 | ("stop_sign", "🛑"),
1573 | ("stopwatch", "⏱️"),
1574 | ("straight_ruler", "📏"),
1575 | ("strawberry", "🍓"),
1576 | ("stuck_out_tongue", "😛"),
1577 | ("stuck_out_tongue_closed_eyes", "😝"),
1578 | ("stuck_out_tongue_winking_eye", "😜"),
1579 | ("student", "🧑🎓"),
1580 | ("studio_microphone", "🎙️"),
1581 | ("stuffed_flatbread", "🥙"),
1582 | ("sudan", "🇸🇩"),
1583 | ("sun_behind_large_cloud", "🌥️"),
1584 | ("sun_behind_rain_cloud", "🌦️"),
1585 | ("sun_behind_small_cloud", "🌤️"),
1586 | ("sun_with_face", "🌞"),
1587 | ("sunflower", "🌻"),
1588 | ("sunglasses", "😎"),
1589 | ("sunny", "☀️"),
1590 | ("sunrise", "🌅"),
1591 | ("sunrise_over_mountains", "🌄"),
1592 | ("superhero", "🦸"),
1593 | ("superhero_man", "🦸♂️"),
1594 | ("superhero_woman", "🦸♀️"),
1595 | ("supervillain", "🦹"),
1596 | ("supervillain_man", "🦹♂️"),
1597 | ("supervillain_woman", "🦹♀️"),
1598 | ("surfer", "🏄"),
1599 | ("surfing_man", "🏄♂️"),
1600 | ("surfing_woman", "🏄♀️"),
1601 | ("suriname", "🇸🇷"),
1602 | ("sushi", "🍣"),
1603 | ("suspension_railway", "🚟"),
1604 | ("svalbard_jan_mayen", "🇸🇯"),
1605 | ("swan", "🦢"),
1606 | ("swaziland", "🇸🇿"),
1607 | ("sweat", "😓"),
1608 | ("sweat_drops", "💦"),
1609 | ("sweat_smile", "😅"),
1610 | ("sweden", "🇸🇪"),
1611 | ("sweet_potato", "🍠"),
1612 | ("swim_brief", "🩲"),
1613 | ("swimmer", "🏊"),
1614 | ("swimming_man", "🏊♂️"),
1615 | ("swimming_woman", "🏊♀️"),
1616 | ("switzerland", "🇨🇭"),
1617 | ("symbols", "🔣"),
1618 | ("synagogue", "🕍"),
1619 | ("syria", "🇸🇾"),
1620 | ("syringe", "💉"),
1621 | ("t-rex", "🦖"),
1622 | ("taco", "🌮"),
1623 | ("tada", "🎉"),
1624 | ("taiwan", "🇹🇼"),
1625 | ("tajikistan", "🇹🇯"),
1626 | ("takeout_box", "🥡"),
1627 | ("tamale", "🫔"),
1628 | ("tanabata_tree", "🎋"),
1629 | ("tangerine", "🍊"),
1630 | ("tanzania", "🇹🇿"),
1631 | ("taurus", "♉"),
1632 | ("taxi", "🚕"),
1633 | ("tea", "🍵"),
1634 | ("teacher", "🧑🏫"),
1635 | ("teapot", "🫖"),
1636 | ("technologist", "🧑💻"),
1637 | ("teddy_bear", "🧸"),
1638 | ("telephone", "☎️"),
1639 | ("telephone_receiver", "📞"),
1640 | ("telescope", "🔭"),
1641 | ("tennis", "🎾"),
1642 | ("tent", "⛺"),
1643 | ("test_tube", "🧪"),
1644 | ("thailand", "🇹🇭"),
1645 | ("thermometer", "🌡️"),
1646 | ("thinking", "🤔"),
1647 | ("thong_sandal", "🩴"),
1648 | ("thought_balloon", "💭"),
1649 | ("thread", "🧵"),
1650 | ("three", "3️⃣"),
1651 | ("thumbsdown", "👎"),
1652 | ("thumbsup", "👍"),
1653 | ("ticket", "🎫"),
1654 | ("tickets", "🎟️"),
1655 | ("tiger", "🐯"),
1656 | ("tiger2", "🐅"),
1657 | ("timer_clock", "⏲️"),
1658 | ("timor_leste", "🇹🇱"),
1659 | ("tipping_hand_man", "💁♂️"),
1660 | ("tipping_hand_person", "💁"),
1661 | ("tipping_hand_woman", "💁♀️"),
1662 | ("tired_face", "😫"),
1663 | ("tm", "™️"),
1664 | ("togo", "🇹🇬"),
1665 | ("toilet", "🚽"),
1666 | ("tokelau", "🇹🇰"),
1667 | ("tokyo_tower", "🗼"),
1668 | ("tomato", "🍅"),
1669 | ("tonga", "🇹🇴"),
1670 | ("tongue", "👅"),
1671 | ("toolbox", "🧰"),
1672 | ("tooth", "🦷"),
1673 | ("toothbrush", "🪥"),
1674 | ("top", "🔝"),
1675 | ("tophat", "🎩"),
1676 | ("tornado", "🌪️"),
1677 | ("tr", "🇹🇷"),
1678 | ("trackball", "🖲️"),
1679 | ("tractor", "🚜"),
1680 | ("traffic_light", "🚥"),
1681 | ("train", "🚋"),
1682 | ("train2", "🚆"),
1683 | ("tram", "🚊"),
1684 | ("transgender_flag", "🏳️⚧️"),
1685 | ("transgender_symbol", "⚧️"),
1686 | ("triangular_flag_on_post", "🚩"),
1687 | ("triangular_ruler", "📐"),
1688 | ("trident", "🔱"),
1689 | ("trinidad_tobago", "🇹🇹"),
1690 | ("tristan_da_cunha", "🇹🇦"),
1691 | ("triumph", "😤"),
1692 | ("trolleybus", "🚎"),
1693 | ("trophy", "🏆"),
1694 | ("tropical_drink", "🍹"),
1695 | ("tropical_fish", "🐠"),
1696 | ("truck", "🚚"),
1697 | ("trumpet", "🎺"),
1698 | ("tshirt", "👕"),
1699 | ("tulip", "🌷"),
1700 | ("tumbler_glass", "🥃"),
1701 | ("tunisia", "🇹🇳"),
1702 | ("turkey", "🦃"),
1703 | ("turkmenistan", "🇹🇲"),
1704 | ("turks_caicos_islands", "🇹🇨"),
1705 | ("turtle", "🐢"),
1706 | ("tuvalu", "🇹🇻"),
1707 | ("tv", "📺"),
1708 | ("twisted_rightwards_arrows", "🔀"),
1709 | ("two", "2️⃣"),
1710 | ("two_hearts", "💕"),
1711 | ("two_men_holding_hands", "👬"),
1712 | ("two_women_holding_hands", "👭"),
1713 | ("u5272", "🈹"),
1714 | ("u5408", "🈴"),
1715 | ("u55b6", "🈺"),
1716 | ("u6307", "🈯"),
1717 | ("u6708", "🈷️"),
1718 | ("u6709", "🈶"),
1719 | ("u6e80", "🈵"),
1720 | ("u7121", "🈚"),
1721 | ("u7533", "🈸"),
1722 | ("u7981", "🈲"),
1723 | ("u7a7a", "🈳"),
1724 | ("uganda", "🇺🇬"),
1725 | ("uk", "🇬🇧"),
1726 | ("ukraine", "🇺🇦"),
1727 | ("umbrella", "☔"),
1728 | ("unamused", "😒"),
1729 | ("underage", "🔞"),
1730 | ("unicorn", "🦄"),
1731 | ("united_arab_emirates", "🇦🇪"),
1732 | ("united_nations", "🇺🇳"),
1733 | ("unlock", "🔓"),
1734 | ("up", "🆙"),
1735 | ("upside_down_face", "🙃"),
1736 | ("uruguay", "🇺🇾"),
1737 | ("us", "🇺🇸"),
1738 | ("us_outlying_islands", "🇺🇲"),
1739 | ("us_virgin_islands", "🇻🇮"),
1740 | ("uzbekistan", "🇺🇿"),
1741 | ("v", "✌️"),
1742 | ("vampire", "🧛"),
1743 | ("vampire_man", "🧛♂️"),
1744 | ("vampire_woman", "🧛♀️"),
1745 | ("vanuatu", "🇻🇺"),
1746 | ("vatican_city", "🇻🇦"),
1747 | ("venezuela", "🇻🇪"),
1748 | ("vertical_traffic_light", "🚦"),
1749 | ("vhs", "📼"),
1750 | ("vibration_mode", "📳"),
1751 | ("video_camera", "📹"),
1752 | ("video_game", "🎮"),
1753 | ("vietnam", "🇻🇳"),
1754 | ("violin", "🎻"),
1755 | ("virgo", "♍"),
1756 | ("volcano", "🌋"),
1757 | ("volleyball", "🏐"),
1758 | ("vomiting_face", "🤮"),
1759 | ("vs", "🆚"),
1760 | ("vulcan_salute", "🖖"),
1761 | ("waffle", "🧇"),
1762 | ("wales", "🏴"),
1763 | ("walking", "🚶"),
1764 | ("walking_man", "🚶♂️"),
1765 | ("walking_woman", "🚶♀️"),
1766 | ("wallis_futuna", "🇼🇫"),
1767 | ("waning_crescent_moon", "🌘"),
1768 | ("waning_gibbous_moon", "🌖"),
1769 | ("warning", "⚠️"),
1770 | ("wastebasket", "🗑️"),
1771 | ("watch", "⌚"),
1772 | ("water_buffalo", "🐃"),
1773 | ("water_polo", "🤽"),
1774 | ("watermelon", "🍉"),
1775 | ("wave", "👋"),
1776 | ("wavy_dash", "〰️"),
1777 | ("waxing_crescent_moon", "🌒"),
1778 | ("waxing_gibbous_moon", "🌔"),
1779 | ("wc", "🚾"),
1780 | ("weary", "😩"),
1781 | ("wedding", "💒"),
1782 | ("weight_lifting", "🏋️"),
1783 | ("weight_lifting_man", "🏋️♂️"),
1784 | ("weight_lifting_woman", "🏋️♀️"),
1785 | ("western_sahara", "🇪🇭"),
1786 | ("whale", "🐳"),
1787 | ("whale2", "🐋"),
1788 | ("wheel_of_dharma", "☸️"),
1789 | ("wheelchair", "♿"),
1790 | ("white_check_mark", "✅"),
1791 | ("white_circle", "⚪"),
1792 | ("white_flag", "🏳️"),
1793 | ("white_flower", "💮"),
1794 | ("white_haired_man", "👨🦳"),
1795 | ("white_haired_woman", "👩🦳"),
1796 | ("white_heart", "🤍"),
1797 | ("white_large_square", "⬜"),
1798 | ("white_medium_small_square", "◽"),
1799 | ("white_medium_square", "◻️"),
1800 | ("white_small_square", "▫️"),
1801 | ("white_square_button", "🔳"),
1802 | ("wilted_flower", "🥀"),
1803 | ("wind_chime", "🎐"),
1804 | ("wind_face", "🌬️"),
1805 | ("window", "🪟"),
1806 | ("wine_glass", "🍷"),
1807 | ("wink", "😉"),
1808 | ("wolf", "🐺"),
1809 | ("woman", "👩"),
1810 | ("woman_artist", "👩🎨"),
1811 | ("woman_astronaut", "👩🚀"),
1812 | ("woman_beard", "🧔♀️"),
1813 | ("woman_cartwheeling", "🤸♀️"),
1814 | ("woman_cook", "👩🍳"),
1815 | ("woman_dancing", "💃"),
1816 | ("woman_facepalming", "🤦♀️"),
1817 | ("woman_factory_worker", "👩🏭"),
1818 | ("woman_farmer", "👩🌾"),
1819 | ("woman_feeding_baby", "👩🍼"),
1820 | ("woman_firefighter", "👩🚒"),
1821 | ("woman_health_worker", "👩⚕️"),
1822 | ("woman_in_manual_wheelchair", "👩🦽"),
1823 | ("woman_in_motorized_wheelchair", "👩🦼"),
1824 | ("woman_in_tuxedo", "🤵♀️"),
1825 | ("woman_judge", "👩⚖️"),
1826 | ("woman_juggling", "🤹♀️"),
1827 | ("woman_mechanic", "👩🔧"),
1828 | ("woman_office_worker", "👩💼"),
1829 | ("woman_pilot", "👩✈️"),
1830 | ("woman_playing_handball", "🤾♀️"),
1831 | ("woman_playing_water_polo", "🤽♀️"),
1832 | ("woman_scientist", "👩🔬"),
1833 | ("woman_shrugging", "🤷♀️"),
1834 | ("woman_singer", "👩🎤"),
1835 | ("woman_student", "👩🎓"),
1836 | ("woman_teacher", "👩🏫"),
1837 | ("woman_technologist", "👩💻"),
1838 | ("woman_with_headscarf", "🧕"),
1839 | ("woman_with_probing_cane", "👩🦯"),
1840 | ("woman_with_turban", "👳♀️"),
1841 | ("woman_with_veil", "👰♀️"),
1842 | ("womans_clothes", "👚"),
1843 | ("womans_hat", "👒"),
1844 | ("women_wrestling", "🤼♀️"),
1845 | ("womens", "🚺"),
1846 | ("wood", "🪵"),
1847 | ("woozy_face", "🥴"),
1848 | ("world_map", "🗺️"),
1849 | ("worm", "🪱"),
1850 | ("worried", "😟"),
1851 | ("wrench", "🔧"),
1852 | ("wrestling", "🤼"),
1853 | ("writing_hand", "✍️"),
1854 | ("x", "❌"),
1855 | ("yarn", "🧶"),
1856 | ("yawning_face", "🥱"),
1857 | ("yellow_circle", "🟡"),
1858 | ("yellow_heart", "💛"),
1859 | ("yellow_square", "🟨"),
1860 | ("yemen", "🇾🇪"),
1861 | ("yen", "💴"),
1862 | ("yin_yang", "☯️"),
1863 | ("yo_yo", "🪀"),
1864 | ("yum", "😋"),
1865 | ("zambia", "🇿🇲"),
1866 | ("zany_face", "🤪"),
1867 | ("zap", "⚡"),
1868 | ("zebra", "🦓"),
1869 | ("zero", "0️⃣"),
1870 | ("zimbabwe", "🇿🇼"),
1871 | ("zipper_mouth_face", "🤐"),
1872 | ("zombie", "🧟"),
1873 | ("zombie_man", "🧟♂️"),
1874 | ("zombie_woman", "🧟♀️"),
1875 | ("zzz", "💤"),
1876 | ];
1877 |
--------------------------------------------------------------------------------
/src/html.rs:
--------------------------------------------------------------------------------
1 | use std::collections::BTreeMap;
2 |
3 | use crate::{
4 | ast::{self, Attrs, Tag},
5 | tree::get_string_content,
6 | Document, HtmlOpts,
7 | };
8 |
9 | pub(crate) fn convert(opts: &HtmlOpts, doc: &Document) -> String {
10 | let refs = &doc.references;
11 | let mut ctx = Ctx { opts, refs, res: String::new() };
12 | ctx.render_doc(doc);
13 | ctx.res
14 | }
15 |
16 | struct Ctx<'a> {
17 | #[allow(unused)]
18 | opts: &'a HtmlOpts,
19 | refs: &'a BTreeMap<String, ast::Reference>,
20 | res: String,
21 | }
22 | impl<'a> Ctx<'a> {
23 | fn render_doc(&mut self, doc: &Document) {
24 | for child in &doc.children {
25 | self.render(child)
26 | }
27 | }
28 | fn render(&mut self, tag: &Tag) {
29 | match tag {
30 | Tag::Heading(_) => todo!(),
31 | Tag::Para(para) => {
32 | self.render_tag("p", &para.attrs);
33 | self.render_children(&para.children);
34 | self.out("
");
35 | self.out("\n")
36 | }
37 | Tag::Link(link) => {
38 | let mut attrs = Attrs::new();
39 | let dest = self.resolve_reference(link.destination.as_deref(), link.reference.as_deref());
40 | if let Some(dest) = dest {
41 | attrs.insert("href".to_string(), dest);
42 | }
43 | self.render_tag("a", &attrs);
44 | self.render_children(&link.children);
45 | self.out("</a>");
46 | }
47 | Tag::Image(image) => {
48 | let mut attrs = Attrs::new();
49 | let alt_text = get_string_content(&image.children);
50 | if !alt_text.is_empty() {
51 | attrs.insert("alt".to_string(), alt_text);
52 | }
53 | let dest = self.resolve_reference(image.destination.as_deref(), image.reference.as_deref());
54 | if let Some(dest) = dest {
55 | attrs.insert("src".to_string(), dest);
56 | }
57 | self.render_tag("img", &attrs)
58 | }
59 | Tag::CodeBlock(code_block) => {
60 | self.render_tag("pre", &code_block.attrs);
61 | let mut attrs = Attrs::default();
62 | if let Some(lang) = &code_block.lang {
63 | attrs.insert("class".to_string(), format!("language-{lang}"));
64 | }
65 | self.render_tag("code", &attrs);
66 | self.out_escape_html(&code_block.text);
67 | self.out("</code></pre>\n");
68 | }
69 | Tag::Strong(strong) => {
70 | self.render_tag("strong", &strong.attrs);
71 | self.render_children(&strong.children);
72 | self.out("</strong>");
73 | }
74 | Tag::Emph(emph) => {
75 | self.render_tag("em", &emph.attrs);
76 | self.render_children(&emph.children);
77 | self.out("</em>");
78 | }
79 | Tag::DoubleQuoted(double_quoted) => {
80 | self.out("“");
81 | self.render_children(&double_quoted.children);
82 | self.out("”");
83 | }
84 | Tag::SoftBreak(_) => self.out("\n"),
85 | Tag::Url(url) => {
86 | let mut attrs = Attrs::new();
87 | attrs.insert("href".to_string(), url.destination.clone());
88 | self.render_tag("a", &attrs);
89 | self.out_escape_html(&url.destination);
90 | self.out("</a>");
91 | }
92 | Tag::Str(str) => {
93 | if str.attrs.is_empty() {
94 | self.out_escape_html(&str.text);
95 | } else {
96 | self.render_tag("span", &str.attrs);
97 | self.out_escape_html(&str.text);
98 | self.out("</span>")
99 | }
100 | }
101 | Tag::Emoji(emoji) => {
102 | if let Some(emoji) = crate::emoji::find_emoji(&emoji.alias) {
103 | self.out(emoji);
104 | } else {
105 | self.out(&format!(":{}:", emoji.alias));
106 | }
107 | }
108 | Tag::Verbatim(verbatim) => {
109 | self.render_tag("code", &verbatim.attrs);
110 | self.out_escape_html(&verbatim.text);
111 | self.out("</code>");
112 | }
113 | Tag::Span(span) => {
114 | self.render_tag("span", &span.attrs);
115 | self.render_children(&span.children);
116 | self.out("</span>");
117 | }
118 | Tag::Insert(insert) => {
119 | self.render_tag("ins", &insert.attrs);
120 | self.render_children(&insert.children);
121 | self.out("</ins>");
122 | }
123 | Tag::Delete(delete) => {
124 | self.render_tag("del", &delete.attrs);
125 | self.render_children(&delete.children);
126 | self.out("</del>");
127 | }
128 | Tag::Mark(mark) => {
129 | self.render_tag("mark", &mark.attrs);
130 | self.render_children(&mark.children);
131 | self.out("</mark>");
132 | }
133 | Tag::Superscript(superscript) => {
134 | self.render_tag("sup", &superscript.attrs);
135 | self.render_children(&superscript.children);
136 | self.out("</sup>");
137 | }
138 | Tag::Subscript(subscript) => {
139 | self.render_tag("sub", &subscript.attrs);
140 | self.render_children(&subscript.children);
141 | self.out("</sub>");
142 | }
143 | Tag::EmDash(_) => self.out("—"),
144 | Tag::EnDash(_) => self.out("–"),
145 | }
146 | }
147 |
148 | fn render_children(&mut self, children: &[Tag]) {
149 | for child in children {
150 | self.render(child)
151 | }
152 | }
153 |
154 | fn render_tag(&mut self, tag_name: &str, attrs: &Attrs) {
155 | self.out("<");
156 | self.out(tag_name);
157 | for (k, v) in attrs {
158 | self.out(" ");
159 | self.out(k);
160 | self.out("=");
161 | self.out(&format!("{v:?}"));
162 | }
163 | self.out(">");
164 | }
165 |
166 | fn resolve_reference(
167 | &self,
168 | destination: Option<&str>,
169 | reference: Option<&str>,
170 | ) -> Option<String> {
171 | if let Some(destination) = destination {
172 | return Some(destination.to_string());
173 | }
174 | if let Some(reference) = reference {
175 | if let Some(reference_definition) = self.refs.get(reference) {
176 | return Some(reference_definition.destination.clone());
177 | }
178 | }
179 | None
180 | }
181 |
182 | fn out(&mut self, s: &str) {
183 | self.res.push_str(s)
184 | }
185 | fn out_escape_html(&mut self, s: &str) {
186 | self.res.push_str(s) // NOTE: no HTML escaping is performed here yet
187 | }
188 | }
189 |
--------------------------------------------------------------------------------
/src/inline.rs:
--------------------------------------------------------------------------------
1 | use std::{
2 | collections::{BTreeMap, HashMap},
3 | ops::Range,
4 | };
5 |
6 | use crate::{
7 | annot::{Annot, Atom, Comp},
8 | attribute,
9 | patterns::{find_at, is_space, PatMatch},
10 | Match, ParseOpts,
11 | };
12 |
13 | #[derive(Default)]
14 | pub struct Tokenizer {
15 | opts: ParseOpts,
16 | subject: String,
17 | matches: BTreeMap<usize, Match>,
18 | openers: HashMap<u8, Vec<Opener>>,
19 | verbatim: usize,
20 | verbatim_type: Comp,
21 | destination: bool,
22 | firstpos: usize,
23 | lastpos: usize,
24 | allow_attributes: bool,
25 | attribute_tokenizer: Option<attribute::Tokenizer>,
26 | attribute_start: usize,
27 | }
28 |
29 | #[derive(Debug, Clone)]
30 | struct Opener {
31 | range: Range<usize>,
32 | annot: &'static str,
33 | sub_range: Range<usize>,
34 | }
35 |
36 | impl Opener {
37 | fn new(range: Range<usize>) -> Opener {
38 | Opener { range, annot: "", sub_range: 0..0 }
39 | }
40 | }
41 |
42 | // allow up to 3 captures...
43 | fn bounded_find(subj: &str, patt: &'static str, startpos: usize, endpos: usize) -> PatMatch {
44 | let mut m = find_at(subj, patt, startpos);
45 | if m.end > endpos {
46 | m = PatMatch::default()
47 | }
48 | m
49 | }
50 |
51 | impl Tokenizer {
52 | pub fn new(subject: String, opts: ParseOpts) -> Tokenizer {
53 | let mut res = Tokenizer::default();
54 | res.allow_attributes = true;
55 | res.subject = subject;
56 | res.opts = opts;
57 | res
58 | }
59 |
60 | fn add_match(&mut self, range: Range<usize>, annotation: impl Into<Annot>) {
61 | let m = Match::new(range.clone(), annotation);
62 | self.matches.insert(range.start, m);
63 | }
64 |
65 | fn add_opener(&mut self, name: u8, opener: Opener) {
66 | self.openers.entry(name).or_default().push(opener)
67 | }
68 |
69 | fn clear_openers(&mut self, startpos: usize, endpos: usize) {
70 | for v in self.openers.values_mut() {
71 | v.retain(|it| !(startpos <= it.range.start && it.range.end <= endpos))
72 | }
73 | }
74 |
75 | fn str_matches(&mut self, startpos: usize, endpos: usize) {
76 | for i in startpos..endpos {
77 | if let Some(m) = self.matches.get_mut(&i) {
78 | if m.is_not(Atom::Str) && m.is_not(Atom::Escape) {
79 | m.a = Atom::Str.into();
80 | }
81 | }
82 | }
83 | }
84 |
85 | fn between_matched(&mut self, pos: usize, c: u8, annotation: Comp, defaultmatch: Atom) -> usize {
86 | self.between_matched_impl(
87 | pos,
88 | c,
89 | annotation,
90 | defaultmatch,
91 | Option::<fn(&str, usize) -> PatMatch>::None,
92 | )
93 | }
94 |
95 | fn between_matched_with_open_test(
96 | &mut self,
97 | pos: usize,
98 | c: u8,
99 | annotation: Comp,
100 | defaultmatch: Atom,
101 | open_test: impl FnOnce(&str, usize) -> PatMatch,
102 | ) -> usize {
103 | self.between_matched_impl(pos, c, annotation, defaultmatch, Some(open_test))
104 | }
105 |
106 | fn between_matched_impl(
107 | &mut self,
108 | pos: usize,
109 | c: u8,
110 | annotation: Comp,
111 | mut defaultmatch: Atom,
112 | opentest: Option<impl FnOnce(&str, usize) -> PatMatch>,
113 | ) -> usize {
114 | debug_assert!(self.subject[pos..].as_bytes().starts_with(&[c]));
115 |
116 | let mut can_open = find_at(&self.subject, "^%S", pos + 1).is_match;
117 | let mut can_close = !self.subject[..pos].ends_with(is_space);
118 | let has_open_marker =
119 | pos != 0 && self.matches.get(&(pos - 1)).map_or(false, |it| it.is(Atom::OpenMarker));
120 | let has_close_marker = self.subject.as_bytes()[pos + 1] == b'}';
121 | let mut startopener = pos;
122 | let mut endcloser = pos + 1;
123 |
124 | if let Some(opentest) = opentest {
125 | can_open = can_open && opentest(&self.subject, pos).is_match;
126 | }
127 |
128 | // allow explicit open/close markers to override:
129 | if has_open_marker {
130 | can_open = true;
131 | can_close = false;
132 | startopener = pos - 1;
133 | }
134 | if !has_open_marker && has_close_marker {
135 | can_close = true;
136 | can_open = false;
137 | endcloser = pos + 2;
138 | }
139 |
140 | if has_open_marker && defaultmatch.is_right_atom() {
141 | defaultmatch = defaultmatch.corresponding_left_atom();
142 | } else if has_close_marker && defaultmatch.is_left_atom() {
143 | defaultmatch = defaultmatch.corresponding_right_atom();
144 | }
145 |
146 | let openers = self.openers.entry(c).or_default();
147 | if can_close && openers.len() > 0 {
148 | // check openers for a match
149 | let opener = openers.last().unwrap().clone();
150 | if opener.range.end != pos {
151 | // exclude empty emph
152 | self.clear_openers(opener.range.start, pos + 1);
153 | self.add_match(opener.range.clone(), Annot::Add(annotation));
154 | self.add_match(pos..endcloser, Annot::Sub(annotation));
155 | return endcloser;
156 | }
157 | }
158 | // if we get here, we didn't match an opener
159 | if can_open {
160 | self.add_opener(c, Opener::new(startopener..pos + 1));
161 | self.add_match(startopener..pos + 1, defaultmatch);
162 | pos + 1
163 | } else {
164 | self.add_match(startopener..endcloser, defaultmatch);
165 | endcloser
166 | }
167 | }
168 |
169 | fn matchers(&mut self, c: u8, pos: usize, endpos: usize) -> Option<usize> {
170 | match c {
171 | b'`' => {
172 | let m = bounded_find(&self.subject, "^`*", pos, endpos);
173 | if !m.is_match {
174 | return None;
175 | }
176 | // TODO: display/inline math
177 |
178 | self.add_match(pos..m.end, Annot::Add(Comp::Verbatim));
179 | self.verbatim_type = Comp::Verbatim;
180 |
181 | self.verbatim = m.end - pos;
182 | return Some(m.end);
183 | }
184 | b'\\' => {
185 | let m = bounded_find(&self.subject, "^[ \t]*\r?\n", pos + 1, endpos);
186 | self.add_match(pos..pos + 1, Atom::Escape);
187 |
188 | if m.is_match {
189 | // see if there were preceding spaces
190 | if let Some((_, mm)) = self.matches.iter().rev().next() {
191 | let sp = mm.range.start;
192 | let mut ep = mm.range.end;
193 | if mm.is(Atom::Str) {
194 | while self.subject.as_bytes()[ep] == b' ' || self.subject.as_bytes()[ep] == b'\t' {
195 | ep = ep - 1
196 | }
197 | if sp == ep {
198 | self.matches.remove(&sp);
199 | } else {
200 | self.add_match(sp..ep, Atom::Str)
201 | }
202 | }
203 | }
204 | self.add_match(pos + 1..m.end, Atom::Hardbreak);
205 | return Some(m.end);
206 | } else {
207 | let m = bounded_find(&self.subject, "^[%p ]", pos + 1, endpos);
208 | if !m.is_match {
209 | self.add_match(pos..pos + 1, Atom::Str);
210 | return Some(pos + 1);
211 | } else {
212 | self.add_match(pos..pos + 1, Atom::Escape);
213 | if find_at(&self.subject, "^ ", pos + 1).is_match {
214 | self.add_match(pos + 1..m.end, Atom::Nbsp)
215 | } else {
216 | self.add_match(pos + 1..m.end, Atom::Str)
217 | }
218 | return Some(m.end);
219 | }
220 | }
221 | }
222 | b'<' => {
223 | let url = bounded_find(&self.subject, "^%<[^<>%s]+%>", pos, endpos);
224 | if url.is_match {
225 | let is_url = bounded_find(&self.subject, "^%a+:", pos + 1, url.end).is_match;
226 | let is_email = bounded_find(&self.subject, "^[^:]+%@", pos + 1, url.end).is_match;
227 | if is_email {
228 | self.add_match(url.start..url.start + 1, Comp::Email.add());
229 | self.add_match(url.start + 1..url.end - 1, Atom::Str);
230 | self.add_match(url.end - 1..url.end, Comp::Email.sub());
231 | return Some(url.end);
232 | } else if is_url {
233 | self.add_match(url.start..url.start + 1, Comp::Url.add());
234 | self.add_match(url.start + 1..url.end - 1, Atom::Str);
235 | self.add_match(url.end - 1..url.end, Comp::Url.sub());
236 | return Some(url.end);
237 | }
238 | }
239 | return None;
240 | }
241 | b'~' => Some(self.between_matched(pos, b'~', Comp::Subscript, Atom::Str)),
242 | b'^' => Some(self.between_matched(pos, b'^', Comp::Superscript, Atom::Str)),
243 | b'[' => {
244 | let m = bounded_find(&self.subject, "^%^([^]]+)%]", pos + 1, endpos);
245 | if m.is_match {
246 | self.add_match(pos..m.end, Atom::FootnoteReference);
247 | return Some(m.end);
248 | } else {
249 | self.add_opener(b'[', Opener::new(pos..pos + 1));
250 | self.add_match(pos..pos + 1, Atom::Str);
251 | return Some(pos + 1);
252 | }
253 | }
254 | b']' => {
255 | let openers = self.openers.entry(b'[').or_default();
256 | if openers.len() > 0 {
257 | let opener = openers.last_mut().unwrap();
258 | if opener.annot == "reference_link" {
259 | let opener = opener.clone();
260 | // found a reference link
261 | // add the matches
262 | let is_image = self.subject[..opener.range.start].ends_with('!')
263 | && !self.subject[..opener.range.start].ends_with("[]");
264 | if is_image {
265 | self.add_match(opener.range.start - 1..opener.range.start, Atom::ImageMarker);
266 | self.add_match(opener.range.clone(), Comp::Imagetext.add());
267 | self.add_match(opener.sub_range.clone(), Comp::Imagetext.sub());
268 | } else {
269 | self.add_match(opener.range.clone(), Comp::Linktext.add());
270 | self.add_match(opener.sub_range.clone(), Comp::Linktext.sub());
271 | }
272 | self.add_match(opener.sub_range.end - 1..opener.sub_range.end, Comp::Reference.add());
273 | self.add_match(pos..pos, Comp::Reference.sub());
274 | // convert all matches to str
275 | self.str_matches(opener.sub_range.end, pos);
276 | // remove from openers
277 | self.clear_openers(opener.range.start, pos);
278 | return Some(pos + 1);
279 | } else if bounded_find(&self.subject, "^[%[]", pos + 1, endpos).is_match {
280 | opener.annot = "reference_link";
281 | opener.sub_range.start = pos; // intermediate ]
282 | opener.sub_range.end = pos + 2; // intermediate [
283 | self.add_match(pos..pos + 2, Atom::Str);
284 | return Some(pos + 2);
285 | } else if bounded_find(&self.subject, "^[(]", pos + 1, endpos).is_match {
286 | opener.annot = "explicit_link";
287 | opener.sub_range.start = pos; // intermediate ]
288 | opener.sub_range.end = pos + 2; // intermediate (
289 | self.openers.remove(&b'('); // clear ( openers
290 | self.destination = true;
291 | self.add_match(pos..pos + 2, Atom::Str);
292 | return Some(pos + 2);
293 | } else if bounded_find(&self.subject, "^%{", pos + 1, endpos).is_match {
294 | let opener = opener.clone();
295 | // assume this is attributes, bracketed span
296 | self.add_match(opener.range.clone(), Comp::Span.add());
297 | self.add_match(pos..pos + 1, Comp::Span.sub());
298 | // remove any openers between [ and ]
299 | self.clear_openers(opener.range.start, pos);
300 | return Some(pos + 1);
301 | }
302 | }
303 | return None;
304 | }
305 | b'(' => {
306 | if !self.destination {
307 | return None;
308 | }
309 | self.add_opener(b'(', Opener::new(pos..pos + 1));
310 | self.add_match(pos..pos + 1, Atom::Str);
311 | return Some(pos + 1);
312 | }
313 | b')' => {
314 | if !self.destination {
315 | return None;
316 | }
317 | let parens = self.openers.entry(b'(').or_default();
318 | if parens.len() > 0 {
319 | // TODO?
320 | parens.pop();
321 | self.add_match(pos..pos + 1, Atom::Str);
322 | return Some(pos + 1);
323 | } else {
324 | let openers = &self.openers.entry(b'[').or_default().clone();
325 | if let Some(opener) = openers.last().cloned() {
326 | if opener.annot == "explicit_link" {
327 | let (startdest, enddest) = (opener.sub_range.end - 1, pos);
328 | // we have inline link
329 | let is_image = self.subject[..opener.range.start].ends_with('!')
330 | && !self.subject[..opener.range.start].ends_with("[]");
331 | if is_image {
332 | self.add_match(opener.range.start - 1..opener.range.start, Atom::ImageMarker);
333 | self.add_match(opener.range.clone(), Comp::Imagetext.add());
334 | self.add_match(opener.sub_range.clone(), Comp::Imagetext.sub());
335 | } else {
336 | self.add_match(opener.range.clone(), Comp::Linktext.add());
337 | self.add_match(opener.sub_range.clone(), Comp::Linktext.sub());
338 | }
339 | self.add_match(startdest..startdest + 1, Comp::Destination.add());
340 | self.add_match(enddest..enddest + 1, Comp::Destination.sub());
341 | self.destination = false;
342 | // convert all matches to str
343 | self.str_matches(opener.sub_range.end + 1, pos);
344 | // remove from openers
345 | self.clear_openers(opener.range.start, pos);
346 | return Some(enddest + 1);
347 | }
348 | }
349 | return None;
350 | }
351 | }
352 | b'_' => Some(self.between_matched(pos, b'_', Comp::Emph, Atom::Str)),
353 | b'*' => Some(self.between_matched(pos, b'*', Comp::Strong, Atom::Str)),
354 | b'{' => {
355 | if self.subject[pos + 1..endpos].starts_with(|c: char| "_*~^+='\"-".contains(c)) {
356 | self.add_match(pos..pos + 1, Atom::OpenMarker);
357 | return Some(pos + 1);
358 | } else if self.allow_attributes {
359 | self.attribute_tokenizer = Some(attribute::Tokenizer::new(self.subject.clone()));
360 | self.attribute_start = pos;
361 | return Some(pos);
362 | } else {
363 | // disabling allow_attributes only lasts
364 | // for one potential attribute start {, and then is re-enabled
365 | self.allow_attributes = true;
366 | self.add_match(pos..pos + 1, Atom::Str);
367 | return Some(pos + 1);
368 | }
369 | }
370 | b':' => {
371 | let m = bounded_find(&self.subject, "^%:[%w_+-]+%:", pos, endpos);
372 | if m.is_match {
373 | self.add_match(m.start..m.end, Atom::Emoji);
374 | return Some(m.end);
375 | } else {
376 | self.add_match(pos..pos + 1, Atom::Str);
377 | return Some(pos + 1);
378 | }
379 | }
380 | b'+' => Some(self.between_matched_with_open_test(
381 | pos,
382 | b'+',
383 | Comp::Insert,
384 | Atom::Str,
385 | |subject, pos| {
386 | find_at(subject, "^%{", pos - 1).or_else(|| find_at(subject, "^%}", pos + 1))
387 | },
388 | )),
389 | b'=' => Some(self.between_matched_with_open_test(
390 | pos,
391 | b'=',
392 | Comp::Mark,
393 | Atom::Str,
394 | |subject, pos| {
395 | find_at(subject, "^%{", pos - 1).or_else(|| find_at(subject, "^%}", pos + 1))
396 | },
397 | )),
398 | b'\'' => todo!(),
399 | b'"' => Some(self.between_matched(pos, b'"', Comp::DoubleQuoted, Atom::LeftDoubleQuote)),
400 | b'-' => {
401 | let subject = &self.subject[..];
402 | if subject.as_bytes().get(pos - 1) == Some(&b'{')
403 | || subject.as_bytes().get(pos + 1) == Some(&b'}')
404 | {
405 | return Some(self.between_matched_with_open_test(
406 | pos,
407 | b'-',
408 | Comp::Delete,
409 | Atom::Str,
410 | |subject, pos| {
411 | find_at(subject, "^%{", pos - 1).or_else(|| find_at(subject, "^%}", pos + 1))
412 | },
413 | ));
414 | }
415 |
416 | let ep = find_at(subject, "^%-*", pos).end.min(endpos);
417 | let mut hyphens = ep - pos;
418 | if subject.as_bytes().get(ep) == Some(&b'}') {
419 | // last hyphen is close del
420 | hyphens -= 1;
421 | }
422 | if hyphens == 0 {
423 | self.add_match(pos..pos + 2, Atom::Str);
424 | return Some(pos + 2);
425 | }
426 | let mut pos = pos;
427 | let all_em = hyphens % 3 == 0;
428 | let all_en = hyphens % 2 == 0;
429 | while hyphens > 0 {
430 | if all_em {
431 | self.add_match(pos..pos + 3, Atom::EmDash);
432 | pos += 3;
433 | hyphens -= 3;
434 | } else if all_en {
435 | self.add_match(pos..pos + 2, Atom::EnDash);
436 | pos += 2;
437 | hyphens -= 2;
438 | } else if hyphens >= 3 && (hyphens % 2 != 0 || hyphens > 4) {
439 | self.add_match(pos..pos + 3, Atom::EmDash);
440 | pos += 3;
441 | hyphens -= 3;
442 | } else if hyphens >= 2 {
443 | self.add_match(pos..pos + 2, Atom::EnDash);
444 | pos += 2;
445 | hyphens -= 2;
446 | } else {
447 | self.add_match(pos..pos + 1, Atom::Str);
448 | pos += 1;
449 | hyphens -= 1;
450 | }
451 | }
452 | Some(pos)
453 | }
454 | b'.' => {
455 | if bounded_find(&self.subject, "^%.%.", pos + 1, endpos).is_match {
456 | self.add_match(pos..pos + 3, Atom::Ellipses);
457 | return Some(pos + 3);
458 | }
459 | return None;
460 | }
461 | _ => return None,
462 | }
463 | }
464 |
465 | fn single_char(&mut self, pos: usize) -> usize {
466 | self.add_match(pos..pos + 1, Atom::Str);
467 | pos + 1
468 | }
469 |
470 | // Feed a slice to the parser, updating state.
471 | pub fn feed(&mut self, spos: usize, endpos: usize) {
472 | let special = "[%]%[\\`{}_*()!<>~^:=+$\r\n'\".-]";
473 | let subject = self.subject.clone();
474 | if spos < self.firstpos {
475 | self.firstpos = spos
476 | }
477 | if endpos > self.lastpos {
478 | self.lastpos = endpos
479 | }
480 | let mut pos = spos;
481 | while pos < endpos {
482 | if let Some(mut attribute_tokenizer) = self.attribute_tokenizer.take() {
483 | let sp = pos;
484 | let m = bounded_find(&self.subject, special, pos, endpos);
485 | let ep2 = if m.is_match { m.start } else { endpos };
486 | let (status, ep) = attribute_tokenizer.feed(sp, ep2);
487 | match status {
488 | attribute::Status::Done => {
489 | let attribute_start = self.attribute_start;
490 | // add attribute matches
491 | self.add_match(attribute_start..attribute_start + 1, Comp::Attributes.add());
492 | self.add_match(ep..ep + 1, Comp::Attributes.sub());
493 | let attr_matches = attribute_tokenizer.get_matches();
494 | for m in attr_matches {
495 | self.add_match(m.range, m.a);
496 | }
497 | self.attribute_tokenizer = None;
498 | self.attribute_start = !0;
499 | pos = ep + 1;
500 | }
501 | attribute::Status::Fail => {
502 | pos = self.attribute_start;
503 | self.allow_attributes = false;
504 | self.attribute_tokenizer = None;
505 | self.attribute_start = !0;
506 | }
507 | attribute::Status::Continue => {
508 | self.attribute_tokenizer = Some(attribute_tokenizer);
509 | pos = ep
510 | }
511 | }
512 | } else {
513 | // find next interesting character:
514 | let newpos = bounded_find(&subject, special, pos, endpos).or(endpos);
515 | if newpos > pos {
516 | self.add_match(pos..newpos, Atom::Str);
517 | pos = newpos;
518 | if pos > endpos {
519 | break; // otherwise, fall through:
520 | }
521 | }
522 | // if we get here, then newpos = pos,
523 | // i.e. we have something interesting at pos
524 | let c = subject.as_bytes()[pos];
525 | if c == b'\r' || c == b'\n' {
526 | if c == b'\r' && bounded_find(&subject, "^[%n]", pos + 1, endpos).is_match {
527 | self.add_match(pos..pos + 2, Atom::Softbreak);
528 | pos = pos + 2
529 | } else {
530 | self.add_match(pos..pos + 1, Atom::Softbreak);
531 | pos = pos + 1
532 | }
533 | } else if self.verbatim > 0 {
534 | if c == b'`' {
535 | let m = bounded_find(&subject, "^`+", pos, endpos);
536 | if m.is_match && m.end - pos == self.verbatim {
537 | // TODO: Check for raw attributes
538 | self.add_match(pos..m.end, self.verbatim_type.sub());
539 | pos = m.end;
540 | self.verbatim = 0;
541 | self.verbatim_type = Comp::default();
542 | } else {
543 | let endchar = m.end_or(endpos);
544 | self.add_match(pos..endchar, Atom::Str);
545 | pos = endchar
546 | }
547 | } else {
548 | self.add_match(pos..pos + 1, Atom::Str);
549 | pos = pos + 1
550 | }
551 | } else {
552 | pos = self.matchers(c, pos, endpos).unwrap_or_else(|| self.single_char(pos))
553 | }
554 | }
555 | }
556 | }
557 |
558 | pub(crate) fn get_matches(&mut self) -> Vec<Match> {
559 | let mut sorted: Vec<Match> = Vec::new();
560 | let mut m_last = Match::new(0..0, Atom::Ellipses); // TODO
561 | for i in self.firstpos..=self.lastpos {
562 | if let Some(m) = self.matches.get(&i) {
563 | if m.is(Atom::Str) && m_last.is(Atom::Str) && m_last.range.end == m.range.start {
564 | (*sorted.last_mut().unwrap()).range.end = m.range.end;
565 | m_last.range.end = m.range.end;
566 | } else {
567 | sorted.push(m.clone());
568 | m_last = m.clone()
569 | }
570 | }
571 | }
572 | if sorted.len() > 0 {
573 | if sorted.last().unwrap().is(Atom::Softbreak) {
574 | // remove final softbreak
575 | sorted.pop();
576 | }
577 | if self.verbatim > 0 {
578 | // unclosed verbatim
579 | let e = sorted.last().unwrap().range.end;
580 | sorted.push(Match::new(e..e, self.verbatim_type.sub()))
581 | }
582 | }
583 | sorted
584 | }
585 | }
586 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | // TODO: re-export everything.
2 | pub mod ast;
3 |
4 | mod annot;
5 | mod patterns;
6 | mod block;
7 | mod inline;
8 | mod attribute;
9 | mod tree;
10 | mod emoji;
11 | mod html;
12 | #[cfg(test)]
13 | mod sourcegen;
14 |
15 | use std::{collections::BTreeMap, ops::Range};
16 |
17 | use crate::annot::Annot;
18 |
19 | #[derive(Debug, Default, Clone)]
20 | pub struct Document {
21 | pub children: Vec<ast::Tag>,
22 | pub references: BTreeMap<String, ast::Reference>,
23 | pub debug: String,
24 | }
25 |
26 | #[derive(Default, Clone)]
27 | pub struct ParseOpts {
28 | pub debug_matches: bool,
29 | }
30 |
31 | #[derive(Default, Clone)]
32 | pub struct HtmlOpts {}
33 |
34 | impl Document {
35 | pub fn parse(text: &str) -> Document {
36 | Document::parse_opts(ParseOpts::default(), text)
37 | }
38 |
39 | pub fn parse_opts(opts: ParseOpts, text: &str) -> Document {
40 | let mut p = block::Tokenizer::new(text.to_string(), opts);
41 | p.parse();
42 | tree::build(p)
43 | }
44 |
45 | pub fn to_html(&self) -> String {
46 | self.to_html_opts(&HtmlOpts::default())
47 | }
48 |
49 | pub fn to_html_opts(&self, opts: &HtmlOpts) -> String {
50 | html::convert(opts, self)
51 | }
52 |
53 | pub fn to_json(&self) -> String {
54 | #[derive(serde::Serialize)]
55 | struct DocRepr<'a> {
56 | tag: &'static str,
57 | children: &'a [ast::Tag],
58 | references: &'a BTreeMap<String, ast::Reference>,
59 | }
60 | serde_json::to_string_pretty(&DocRepr {
61 | tag: "doc",
62 | children: self.children.as_slice(),
63 | references: &self.references,
64 | })
65 | .unwrap()
66 | }
67 | }
68 |
69 | #[derive(Debug, Clone)]
70 | struct Match {
71 | range: Range<usize>,
72 | a: Annot,
73 | }
74 |
75 | impl Match {
76 | fn new(range: Range<usize>, a: impl Into<Annot>) -> Match {
77 | Match { range, a: a.into() }
78 | }
79 | fn is(&self, annot: impl Into<Annot>) -> bool {
80 | self.a == annot.into()
81 | }
82 | fn is_not(&self, annot: impl Into<Annot>) -> bool {
83 | !self.is(annot)
84 | }
85 | }
86 |
87 | /// Appends formatted string to a `String`.
88 | macro_rules! _format_to {
89 | ($buf:expr) => ();
90 | ($buf:expr, $lit:literal $($arg:tt)*) => {
91 | { use ::std::fmt::Write as _; let _ = ::std::write!($buf, $lit $($arg)*); }
92 | };
93 | }
94 | pub(crate) use _format_to as format_to;
95 |
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
1 | use std::path::PathBuf;
2 |
3 | use anyhow::Context;
4 | use lexopt::{Arg::Long, Arg::Short, Arg::Value};
5 |
6 | fn main() -> anyhow::Result<()> {
7 | let mut matches = false;
8 | let mut ast = false;
9 | let mut files = Vec::new();
10 |
11 | let mut parser = lexopt::Parser::from_env();
12 | while let Some(arg) = parser.next()? {
13 | match arg {
14 | Short('m') | Long("matches") => matches = true,
15 | Short('a') | Long("ast") => ast = true,
16 | Value(val) => files.push(val),
17 | _ => Err(arg.unexpected())?,
18 | }
19 | }
20 |
21 | let mut inputs = Vec::new();
22 | if files.is_empty() {
23 | let content = std::io::read_to_string(std::io::stdin()).context("failed to read stdin")?;
24 | inputs.push(content)
25 | } else {
26 | for file in files {
27 | let path = PathBuf::from(file);
28 | let content = std::fs::read_to_string(&path)
29 | .with_context(|| format!("failed to read {}", path.display()))?;
30 | inputs.push(content)
31 | }
32 | }
33 |
34 | let opts = djot::ParseOpts { debug_matches: matches };
35 | for content in inputs {
36 | let doc = djot::Document::parse_opts(opts.clone(), &content);
37 | if matches {
38 | println!("{}", doc.debug)
39 | } else if ast {
40 | println!("{}", doc.to_json())
41 | } else {
42 | println!("{}", doc.to_html())
43 | }
44 | }
45 |
46 | Ok(())
47 | }
48 |
--------------------------------------------------------------------------------
/src/patterns.rs:
--------------------------------------------------------------------------------
1 | use std::ops::Range;
2 |
3 | #[derive(Debug, Default)]
4 | pub struct PatMatch {
5 | pub is_match: bool,
6 | pub start: usize,
7 | pub end: usize,
8 | pub cap1: Range<usize>,
9 | pub cap2: Range<usize>,
10 | }
11 |
12 | impl PatMatch {
13 | pub(crate) fn or(&self, endpos: usize) -> usize {
14 | if self.is_match {
15 | self.start
16 | } else {
17 | endpos
18 | }
19 | }
20 |
21 | pub(crate) fn end_or(&self, endpos: usize) -> usize {
22 | if self.is_match {
23 | self.end
24 | } else {
25 | endpos
26 | }
27 | }
28 |
29 | pub(crate) fn or_else(self, f: impl FnOnce() -> Self) -> Self {
30 | if self.is_match {
31 | self
32 | } else {
33 | f()
34 | }
35 | }
36 | }
37 |
38 | pub fn find(subject: &str, pat: &'static str) -> PatMatch {
39 | find_at(subject, pat, 0)
40 | }
41 |
42 | pub fn find_at(subject: &str, pat: &'static str, start: usize) -> PatMatch {
43 | let mut pat = lua_patterns::LuaPattern::new(pat);
44 | let is_match = pat.matches(&subject[start..]);
45 | let range = pat.range();
46 | let cap1 = pat.capture(1);
47 | let cap2 = pat.capture(2);
48 | PatMatch {
49 | start: range.start + start,
50 | end: range.end + start,
51 | is_match,
52 | cap1: cap1.start + start..cap1.end + start,
53 | cap2: cap2.start + start..cap2.end + start,
54 | }
55 | }
56 |
57 | pub(crate) fn is_space(c: char) -> bool {
58 | " \n\t".contains(c)
59 | }
60 |
--------------------------------------------------------------------------------
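`find`/`find_at` above wrap `lua_patterns` and shift the reported ranges so they are byte offsets into the full subject; the `or`/`end_or`/`or_else` helpers give callers a fallback position. An in-crate sketch with a hypothetical pattern and input (`patterns` is a private module):

```rust
// Illustrative only: intended call shape of src/patterns.rs.
fn example() {
  let subject = "  hello";
  // Lua pattern "%S+": a run of non-whitespace; the search starts at byte offset 1.
  let m = crate::patterns::find_at(subject, "%S+", 1);
  if m.is_match {
    // start/end are offsets into the full `subject`, already shifted by `start`.
    assert_eq!(&subject[m.start..m.end], "hello");
  }
  // Fall back to the end of the subject when there is no match.
  let _pos = m.or(subject.len());
}
```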
/src/sourcegen.rs:
--------------------------------------------------------------------------------
1 | //! Generates matches and ast structures
2 | mod annot;
3 | mod ast;
4 |
5 | use std::path::Path;
6 |
7 | fn camel_case(ident: &str) -> String {
8 | ident
9 | .split('_')
10 | .flat_map(|word| {
11 | word.chars().next().map(|it| it.to_ascii_uppercase()).into_iter().chain(word.chars().skip(1))
12 | })
13 | .collect()
14 | }
15 |
16 | fn ensure_content(path: &str, content: &str) {
17 | let base = Path::new(env!("CARGO_MANIFEST_DIR"));
18 | let path = base.join(path);
19 | let old = std::fs::read_to_string(&path).unwrap_or_default();
20 | if normalize(&old) == normalize(content) {
21 | return;
22 | }
23 | std::fs::write(&path, content)
24 | .unwrap_or_else(|err| panic!("can't write {}: {err}", path.display()));
25 | }
26 |
27 | fn normalize(s: &str) -> String {
28 | s.split_ascii_whitespace().flat_map(|it| it.split(',')).collect()
29 | }
30 |
--------------------------------------------------------------------------------
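`camel_case` turns the snake_case idents from the spec strings below into Rust type names, and `ensure_content` rewrites a generated file only when it differs after `normalize` (whitespace and commas stripped). Expected behaviour, as an illustrative sketch rather than a test in the tree:

```rust
// Illustrative expectations for the helpers in src/sourcegen.rs.
fn expectations() {
  assert_eq!(camel_case("code_block"), "CodeBlock");
  assert_eq!(camel_case("reference_definition"), "ReferenceDefinition");
  // normalize drops whitespace and commas, so pure reformatting never rewrites the file.
  assert_eq!(normalize("Foo,\n    Bar,"), normalize("Foo, Bar,"));
}
```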
/src/sourcegen/annot.rs:
--------------------------------------------------------------------------------
1 | use crate::{
2 | format_to,
3 | sourcegen::{camel_case, ensure_content},
4 | };
5 |
6 | const ANNOTATIONS: &str = "
7 | verbatim
8 | email
9 | url
10 | subscript
11 | superscript
12 | para
13 | code_block
14 | imagetext
15 | linktext
16 | reference
17 | destination
18 | emph
19 | strong
20 | span
21 | double_quoted
22 | reference_definition
23 | insert
24 | delete
25 | mark
26 | attributes
27 |
28 | str
29 | escape
30 | hardbreak
31 | nbsp
32 | blankline
33 | image_marker
34 | left_double_quote
35 | right_double_quote
36 | ellipses
37 | softbreak
38 | footnote_reference
39 | open_marker
40 | emoji
41 | reference_key
42 | reference_value
43 | code_language
44 | em_dash
45 | en_dash
46 | id
47 | key
48 | value
49 | class
50 | ";
51 |
52 | #[test]
53 | fn generate_annotations() {
54 | let (composites, atoms) = ANNOTATIONS.trim().split_once("\n\n").unwrap();
55 |
56 | let mut buf = "\
57 | use std::fmt;
58 | "
59 | .to_string();
60 |
61 | emit_comp(&mut buf, composites);
62 | emit_atom(&mut buf, atoms);
63 | ensure_content("src/annot/generated.rs", &buf);
64 | }
65 |
66 | fn emit_comp(buf: &mut String, composites: &str) {
67 | format_to!(
68 | buf,
69 | "\
70 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
71 | pub(crate) enum Comp {{
72 | "
73 | );
74 | for ident in composites.lines() {
75 | format_to!(buf, " {},\n", camel_case(ident))
76 | }
77 | format_to!(buf, "}}\n");
78 |
79 | let mut display_arms = String::new();
80 | for ident in composites.lines() {
81 | format_to!(display_arms, " Comp::{} => \"{ident}\",\n", camel_case(ident))
82 | }
83 |
84 | format_to!(
85 | buf,
86 | "
87 | impl fmt::Display for Comp {{
88 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {{
89 | f.write_str(match self {{
90 | {display_arms}
91 | }})
92 | }}
93 | }}
94 | "
95 | );
96 | }
97 |
98 | fn emit_atom(buf: &mut String, atoms: &str) {
99 | let mut variants = String::new();
100 | for ident in atoms.lines() {
101 | format_to!(variants, " {},\n", camel_case(ident))
102 | }
103 |
104 | format_to!(
105 | buf,
106 | "
107 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
108 | pub(crate) enum Atom {{
109 | {variants}
110 | }}
111 | "
112 | );
113 |
114 | let mut left_atoms = String::new();
115 | let mut right_atoms = String::new();
116 | let mut ltr = String::new();
117 | let mut rtl = String::new();
118 | for ident in atoms.lines() {
119 | if ident.starts_with("left_") {
120 | format_to!(left_atoms, " | Atom::{}", camel_case(ident));
121 | let rident = &ident.replace("left", "right");
122 | format_to!(ltr, "Atom::{} => Atom::{},\n", camel_case(ident), camel_case(rident));
123 | format_to!(rtl, "Atom::{} => Atom::{},\n", camel_case(rident), camel_case(ident));
124 | }
125 | if ident.starts_with("right_") {
126 | format_to!(right_atoms, " | Atom::{}", camel_case(ident))
127 | }
128 | }
129 |
130 | format_to!(
131 | buf,
132 | "
133 | impl Atom {{
134 | pub(crate) fn is_left_atom(self) -> bool {{
135 | matches!(self, {left_atoms})
136 | }}
137 | pub(crate) fn is_right_atom(self) -> bool {{
138 | matches!(self, {right_atoms})
139 | }}
140 | pub(crate) fn corresponding_left_atom(self) -> Atom {{
141 | match self {{
142 | {rtl}
143 | _ => self
144 | }}
145 | }}
146 | pub(crate) fn corresponding_right_atom(self) -> Atom {{
147 | match self {{
148 | {ltr}
149 | _ => self
150 | }}
151 | }}
152 | }}
153 | "
154 | );
155 |
156 | let mut display_arms = String::new();
157 | for ident in atoms.lines() {
158 | format_to!(display_arms, " Atom::{} => \"{ident}\",\n", camel_case(ident))
159 | }
160 |
161 | format_to!(
162 | buf,
163 | "
164 | impl fmt::Display for Atom {{
165 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {{
166 | f.write_str(match self {{
167 | {display_arms}
168 | }})
169 | }}
170 | }}
171 | "
172 | );
173 | }
174 |
--------------------------------------------------------------------------------
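Running the `generate_annotations` test above (re)writes `src/annot/generated.rs`. Reconstructing from the templates, the generated file has roughly this shape (excerpt, not copied from the tree):

```rust
use std::fmt;

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub(crate) enum Comp {
  Verbatim,
  Email,
  Url,
  // ... one variant per ident before the blank line in ANNOTATIONS
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub(crate) enum Atom {
  Str,
  Escape,
  Hardbreak,
  // ... one variant per ident after the blank line
}

// plus Display impls mapping each variant back to its snake_case name, and the
// is_left_atom / is_right_atom / corresponding_{left,right}_atom helpers on Atom.
```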
/src/sourcegen/ast.rs:
--------------------------------------------------------------------------------
1 | use crate::{format_to, sourcegen::camel_case};
2 |
3 | use crate::sourcegen::ensure_content;
4 |
5 | const TAGS: &str = "
6 | heading level: u32
7 | para
8 | link destination: Option<String>, reference: Option<String>
9 | image destination: Option<String>, reference: Option<String>
10 | code_block lang: Option<String>, text: String
11 | strong
12 | emph
13 | insert
14 | delete
15 | mark
16 | superscript
17 | subscript
18 | span
19 | double_quoted
20 | url destination: String
21 |
22 | soft_break
23 | em_dash
24 | en_dash
25 | verbatim text: String
26 | str text: String
27 | emoji alias: String
28 | ";
29 |
30 | #[test]
31 | fn generate_annotations() {
32 | let (composites, atoms) = TAGS.trim().split_once("\n\n").unwrap();
33 |
34 | let mut buf = format!("use super::Attrs;\n");
35 | emit_ast_comp(&mut buf, composites);
36 | emit_ast_atom(&mut buf, atoms);
37 | emit_ast_tag(&mut buf, composites, atoms);
38 | ensure_content("src/ast/generated.rs", &buf);
39 | }
40 |
41 | fn emit_ast_comp(buf: &mut String, composites: &str) {
42 | for comp in composites.lines() {
43 | let (ident, fields) = comp.split_once(" ").unwrap_or((comp, ""));
44 | let fields = if fields.is_empty() {
45 | String::new()
46 | } else {
47 | fields.split(", ").map(|it| format!("pub {it},\n")).collect::<String>()
48 | };
49 |
50 | format_to! {buf, "
51 | #[derive(Debug, Default, Clone, serde::Serialize)]
52 | pub struct {} {{
53 | #[serde(skip_serializing_if = \"Attrs::is_empty\")]
54 | pub attrs: Attrs,
55 | pub children: Vec<Tag>,
56 | {fields}
57 | }}
58 | ", camel_case(ident)}
59 | }
60 | }
61 |
62 | fn emit_ast_atom(buf: &mut String, atoms: &str) {
63 | for atom in atoms.lines() {
64 | let (ident, fields) = atom.split_once(" ").unwrap_or((atom, ""));
65 | let fields = if fields.is_empty() {
66 | String::new()
67 | } else {
68 | fields.split(", ").map(|it| format!("pub {it},\n")).collect::<String>()
69 | };
70 | format_to! {buf, "
71 | #[derive(Debug, Default, Clone, serde::Serialize)]
72 | pub struct {} {{
73 | #[serde(skip_serializing_if = \"Attrs::is_empty\")]
74 | pub attrs: Attrs,
75 | {fields}
76 | }}
77 | ", camel_case(ident)}
78 | }
79 | }
80 |
81 | fn emit_ast_tag(buf: &mut String, composites: &str, atoms: &str) {
82 | let mut variants = String::new();
83 | for comp in composites.lines() {
84 | let ident = comp.split_once(" ").map_or(comp, |it| it.0);
85 | let camel = camel_case(ident);
86 | format_to!(variants, " {camel}({camel}),\n");
87 | }
88 | for atom in atoms.lines() {
89 | let ident = atom.split_once(" ").map_or(atom, |it| it.0);
90 | let camel = camel_case(ident);
91 | format_to!(variants, " {camel}({camel}),\n");
92 | }
93 | format_to!(
94 | buf,
95 | "
96 | #[derive(Debug, Clone, serde::Serialize)]
97 | #[serde(tag = \"tag\", rename_all = \"snake_case\")]
98 | pub enum Tag {{ {variants} }}
99 | "
100 | )
101 | }
102 |
--------------------------------------------------------------------------------
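Likewise, the `TAGS` spec drives `src/ast/generated.rs`. For the `heading level: u32` entry the emitted code is approximately the following (reconstructed from the templates above):

```rust
use super::Attrs;

#[derive(Debug, Default, Clone, serde::Serialize)]
pub struct Heading {
  #[serde(skip_serializing_if = "Attrs::is_empty")]
  pub attrs: Attrs,
  pub children: Vec<Tag>,
  pub level: u32,
}

// Atoms get the same struct minus `children`; every ident also becomes a Tag variant:
#[derive(Debug, Clone, serde::Serialize)]
#[serde(tag = "tag", rename_all = "snake_case")]
pub enum Tag {
  Heading(Heading),
  Para(Para),
  // ...
  Emoji(Emoji),
}
```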
/src/tree.rs:
--------------------------------------------------------------------------------
1 | use std::collections::BTreeMap;
2 |
3 | use crate::{
4 | annot::{Annot, Atom, Comp},
5 | ast::{
6 | Attrs, CodeBlock, Delete, DoubleQuoted, Emoji, Emph, Image, Insert, Link, Mark, Para,
7 | ReferenceDefinition, SoftBreak, Span, Str, Strong, Subscript, Superscript, Tag, Url, Verbatim,
8 | },
9 | block,
10 | patterns::find,
11 | Document, Match,
12 | };
13 |
14 | pub(crate) fn build(p: block::Tokenizer) -> Document {
15 | let mut ctx = Ctx { subject: p.subject, matches: p.matches, idx: 0, references: BTreeMap::new() };
16 | let mut doc = ctx.get_doc();
17 | doc.debug = p.debug;
18 | doc.references = ctx.references;
19 | doc
20 | }
21 |
22 | struct Ctx {
23 | subject: String,
24 | matches: Vec<Match>,
25 | references: BTreeMap<String, ReferenceDefinition>,
26 | idx: usize,
27 | }
28 |
29 | impl Ctx {
30 | fn get_doc(&mut self) -> Document {
31 | let mut res = Document::default();
32 | while self.idx < self.matches.len() {
33 | self.get_tag(&mut res.children)
34 | }
35 | res
36 | }
37 |
38 | fn get_tag(&mut self, acc: &mut Vec<Tag>) {
39 | self.skip_trivia();
40 | let m = self.matches[self.idx].clone();
41 | self.idx += 1;
42 | let res = match m.a {
43 | Annot::Add(comp) => match comp {
44 | Comp::CodeBlock => Tag::CodeBlock(self.get_code_block()),
45 | Comp::Para => Tag::Para(self.get_para()),
46 | Comp::Verbatim => Tag::Verbatim(self.get_verbatim()),
47 | Comp::Strong => Tag::Strong(self.get_strong()),
48 | Comp::Emph => Tag::Emph(self.get_emph()),
49 | Comp::Insert => Tag::Insert(self.get_insert()),
50 | Comp::Delete => Tag::Delete(self.get_delete()),
51 | Comp::Mark => Tag::Mark(self.get_mark()),
52 | Comp::Subscript => Tag::Subscript(self.get_subscript()),
53 | Comp::Superscript => Tag::Superscript(self.get_superscript()),
54 | Comp::DoubleQuoted => Tag::DoubleQuoted(self.get_double_quoted()),
55 | Comp::Linktext => Tag::Link(self.get_link()),
56 | Comp::Imagetext => Tag::Image(self.get_image()),
57 | Comp::Url => Tag::Url(self.get_url()),
58 | Comp::Attributes => todo!(),
59 | Comp::Span => Tag::Span(self.get_span()),
60 | Comp::ReferenceDefinition => {
61 | self.get_reference_definition();
62 | return;
63 | }
64 | _ => todo!("{comp:?}"),
65 | },
66 | Annot::Sub(sub) => unreachable!("-{sub}"),
67 | Annot::Atom(atom) => match atom {
68 | Atom::Str => {
69 | let mut text = self.subject[m.range].to_string();
70 | let attrs = self.get_attrs();
71 | if !attrs.is_empty() {
72 | if let Some(idx) = text.rfind(|it: char| it.is_ascii_whitespace()) {
73 | acc.push(Tag::Str(Str { attrs: Attrs::new(), text: text[..idx + 1].to_string() }));
74 | text.drain(..idx + 1);
75 | }
76 | }
77 | Tag::Str(Str { attrs, text })
78 | }
79 | Atom::Emoji => {
80 | let mut res = Emoji::default();
81 | res.alias = self.subject[m.range.start + 1..m.range.end - 1].to_string();
82 | Tag::Emoji(res)
83 | }
84 | Atom::Softbreak => Tag::SoftBreak(SoftBreak::default()),
85 | Atom::Class | Atom::Id => return,
86 | _ => todo!("{atom:?}"),
87 | },
88 | };
89 | acc.push(res)
90 | }
91 |
92 | fn get_code_block(&mut self) -> CodeBlock {
93 | let mut res = CodeBlock::default();
94 | let m = self.matches[self.idx].clone();
95 | if m.is(Atom::CodeLanguage) {
96 | res.lang = Some(self.subject[m.range].to_string());
97 | self.idx += 1;
98 | }
99 | res.text = self.get_text_until(Comp::CodeBlock);
100 | res
101 | }
102 |
103 | fn get_para(&mut self) -> Para {
104 | let mut res = Para::default();
105 | res.children = self.get_tags_until(Comp::Para);
106 | res
107 | }
108 |
109 | fn get_verbatim(&mut self) -> Verbatim {
110 | let mut res = Verbatim::default();
111 | res.text = self.get_text_until(Comp::Verbatim);
112 | if find(res.text.as_str(), "^ +`").is_match {
113 | res.text.remove(0);
114 | }
115 | if find(res.text.as_str(), "` +$").is_match {
116 | res.text.pop();
117 | }
118 | res
119 | }
120 |
121 | fn get_strong(&mut self) -> Strong {
122 | let mut res = Strong::default();
123 | res.children = self.get_tags_until(Comp::Strong);
124 | res
125 | }
126 |
127 | fn get_emph(&mut self) -> Emph {
128 | let mut res = Emph::default();
129 | res.children = self.get_tags_until(Comp::Emph);
130 | res
131 | }
132 |
133 | fn get_insert(&mut self) -> Insert {
134 | let mut res = Insert::default();
135 | res.children = self.get_tags_until(Comp::Insert);
136 | res
137 | }
138 |
139 | fn get_delete(&mut self) -> Delete {
140 | let mut res = Delete::default();
141 | res.children = self.get_tags_until(Comp::Delete);
142 | res
143 | }
144 |
145 | fn get_mark(&mut self) -> Mark {
146 | let mut res = Mark::default();
147 | res.children = self.get_tags_until(Comp::Mark);
148 | res
149 | }
150 |
151 | fn get_subscript(&mut self) -> Subscript {
152 | let mut res = Subscript::default();
153 | res.children = self.get_tags_until(Comp::Subscript);
154 | res
155 | }
156 |
157 | fn get_superscript(&mut self) -> Superscript {
158 | let mut res = Superscript::default();
159 | res.children = self.get_tags_until(Comp::Superscript);
160 | res
161 | }
162 |
163 | fn get_double_quoted(&mut self) -> DoubleQuoted {
164 | let mut res = DoubleQuoted::default();
165 | res.children = self.get_tags_until(Comp::DoubleQuoted);
166 | res
167 | }
168 |
169 | fn get_link(&mut self) -> Link {
170 | let mut res = Link::default();
171 | res.children = self.get_tags_until(Comp::Linktext);
172 | match self.get_dest() {
173 | LinkDest::Dest(dest) => res.destination = Some(dest),
174 | LinkDest::Ref(r) => res.reference = Some(r),
175 | LinkDest::AutoRef => res.reference = Some(get_string_content(&res.children)),
176 | }
177 | res
178 | }
179 |
180 | fn get_image(&mut self) -> Image {
181 | let mut res = Image::default();
182 | res.children = self.get_tags_until(Comp::Imagetext);
183 | match self.get_dest() {
184 | LinkDest::Dest(dest) => res.destination = Some(dest),
185 | LinkDest::Ref(r) => res.reference = Some(r),
186 | LinkDest::AutoRef => res.reference = Some(get_string_content(&res.children)),
187 | }
188 | res
189 | }
190 |
191 | fn get_dest(&mut self) -> LinkDest {
192 | let m = self.matches[self.idx].clone();
193 | self.idx += 1;
194 | if m.is(Comp::Destination.add()) {
195 | let dest = self.get_text_until(Comp::Destination);
196 | LinkDest::Dest(dest.replace('\n', ""))
197 | } else {
198 | let r = self.get_text_until(Comp::Reference);
199 | if r.is_empty() {
200 | LinkDest::AutoRef
201 | } else {
202 | LinkDest::Ref(r.replace('\n', " "))
203 | }
204 | }
205 | }
206 |
207 | fn get_url(&mut self) -> Url {
208 | let mut res = Url::default();
209 | res.destination = self.get_text_until(Comp::Url);
210 | res
211 | }
212 |
213 | fn get_span(&mut self) -> Span {
214 | let mut res = Span::default();
215 | res.children = self.get_tags_until(Comp::Span);
216 | res.attrs = self.get_attrs();
217 | res
218 | }
219 |
220 | fn get_attrs(&mut self) -> Attrs {
221 | if !self.matches[self.idx].is(Comp::Attributes.add()) {
222 | return Attrs::new();
223 | }
224 | self.idx += 1;
225 | let mut res = Attrs::new();
226 | loop {
227 | let m = self.matches[self.idx].clone();
228 | self.idx += 1;
229 | if m.is(Comp::Attributes.sub()) {
230 | break;
231 | }
232 | if m.is(Atom::Class) {
233 | match res.entry("class".to_string()) {
234 | indexmap::map::Entry::Occupied(mut it) => {
235 | it.insert(format!("{} {}", it.get(), &self.subject[m.range.clone()]));
236 | }
237 | indexmap::map::Entry::Vacant(it) => {
238 | it.insert(self.subject[m.range.clone()].to_string());
239 | }
240 | }
241 | } else if m.is(Atom::Id) {
242 | res.insert("id".to_string(), self.subject[m.range].to_string());
243 | } else if m.is(Atom::Key) {
244 | let key = self.subject[m.range].to_string();
245 | let m = self.matches[self.idx].clone();
246 | self.idx += 1;
247 | let value = self.subject[m.range].to_string();
248 | res.insert(key, value);
249 | }
250 | }
251 | res
252 | }
253 |
254 | fn get_reference_definition(&mut self) {
255 | let mut res = ReferenceDefinition::default();
256 | let key = self.matches[self.idx].clone();
257 | self.idx += 1;
258 | loop {
259 | let m = self.matches[self.idx].clone();
260 | if !m.is(Atom::ReferenceValue) {
261 | break;
262 | }
263 | self.idx += 1;
264 | res.destination.push_str(&self.subject[m.range]);
265 | }
266 | assert!(self.matches[self.idx].is(Comp::ReferenceDefinition.sub()));
267 | self.idx += 1;
268 | self.references.insert(self.subject[key.range.start + 1..key.range.end - 1].to_string(), res);
269 | }
270 |
271 | fn get_tags_until(&mut self, comp: Comp) -> Vec<Tag> {
272 | let mut res = vec![];
273 | while !self.matches[self.idx].is(comp.sub()) {
274 | self.get_tag(&mut res)
275 | }
276 | self.idx += 1;
277 | res
278 | }
279 |
280 | fn get_text_until(&mut self, comp: Comp) -> String {
281 | let mut res = String::new();
282 | loop {
283 | let m = self.matches[self.idx].clone();
284 | self.idx += 1;
285 | if m.is(comp.sub()) {
286 | break;
287 | }
288 | res.push_str(&self.subject[m.range]);
289 | }
290 | res
291 | }
292 |
293 | fn skip_trivia(&mut self) {
294 | while self.idx < self.matches.len() {
295 | let m = self.matches[self.idx].clone();
296 | if !(m.is(Atom::Blankline) || m.is(Atom::ImageMarker) || m.is(Atom::Escape)) {
297 | break;
298 | }
299 | self.idx += 1;
300 | continue;
301 | }
302 | }
303 | }
304 |
305 | pub(crate) fn get_string_content(tags: &[Tag]) -> String {
306 | let mut res = String::new();
307 | for tag in tags {
308 | match tag {
309 | Tag::SoftBreak(_) => res.push('\n'),
310 | Tag::Str(str) => res.push_str(&str.text),
311 | Tag::Emph(emph) => res.push_str(&get_string_content(&emph.children)),
312 | _ => (),
313 | }
314 | }
315 | res
316 | }
317 |
318 | enum LinkDest {
319 | Dest(String),
320 | Ref(String),
321 | AutoRef,
322 | }
323 |
--------------------------------------------------------------------------------
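`tree::build` above consumes the flat `Match` stream produced by the tokenizers and folds `+comp … -comp` spans into nested `ast::Tag`s, hoisting reference definitions into `Document::references` instead of the tree. An illustrative trace, using the annotation names that also appear in the debug output in `tests/data/regression.test`:

```rust
// Illustrative: how Ctx::get_tag folds a match stream for the input "_hi_".
//
//   +para        -> get_para(), collecting children until -para
//     +emph      ->   get_emph(), collecting children until -emph
//       str      ->     Tag::Str(Str { text: "hi", .. })
//     -emph
//   -para        => Tag::Para { children: [Tag::Emph { children: [Tag::Str(..)] }] }
```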
/tests/data/attributes.test:
--------------------------------------------------------------------------------
1 | An inline attribute applies to the preceding element, which might
2 | be complex (span, emphasis, link) or a simple word (defined as a
3 | sequence of non-ASCII-whitespace characters).
4 | ```
5 | foo привет{.ru}
6 | .
7 | foo привет
8 | ```
9 |
10 | ```
11 | (some text){.attr}
12 | .
13 | (some text)
14 | ```
15 |
16 | ```
17 | [some text]{.attr}
18 | .
19 | some text
20 | ```
21 |
22 | Ensure that emphasis that starts before the attribute can still close,
23 | even if the attribute contains a potential closer.
24 |
25 | ```
26 | a *b{#id key="*"}*
27 | .
28 | a b
29 | ```
30 |
31 | ```
32 | a *b{#id key="*"}o
33 | .
34 | a *bo
35 | ```
36 |
37 | Don't allow attributes to start when we're parsing a potential
38 | attribute.
39 |
40 | ```
41 | hi{key="{#hi"}
42 | .
43 | hi{key=“{#hi”
44 | ```
45 |
46 | ```
47 | hi\{key="abc{#hi}"
48 | .
49 | hi{key=“abc”
50 | ```
51 | STOP
52 | ```
53 | hi{key="\{#hi"}
54 | .
55 | hi
56 | ```
57 |
58 | Line break:
59 |
60 | ```
61 | hi{#id .class
62 | key="value"}
63 | .
64 | hi
65 | ```
66 |
67 | Here there is nothing for the attribute to attach to:
68 |
69 | ```
70 | {#id} at beginning
71 | .
72 | at beginning
73 | ```
74 |
75 | ```
76 | After {#id} space
77 | {.class}
78 | .
79 | After space
80 |
81 | ```
82 |
83 | Block attributes come before the block, on a line by themselves.
84 |
85 | ```
86 | {#id .class}
87 | A paragraph
88 | .
89 | A paragraph
90 | ```
91 |
92 | Use indentation if you need to continue the attributes over a line break.
93 |
94 | ```
95 | {#id .class
96 | style="color:red"}
97 | A paragraph
98 | .
99 | A paragraph
100 | ```
101 |
102 | If the attribute block can't be parsed as attributes, it will be
103 | parsed as a regular paragraph:
104 |
105 | ```
106 | {#id .cla*ss*
107 | .
108 | {#id .class
109 | ```
110 |
111 | You can use consecutive attribute blocks.
112 | In case of conflict, later values take precedence over earlier ones,
113 | but classes accumulate:
114 |
115 | ```
116 | {#id}
117 | {key=val}
118 | {.foo .bar}
119 | {key=val2}
120 | {.baz}
121 | {#id2}
122 | Okay
123 | .
124 | Okay
125 | ```
126 |
127 | Attributes on different kinds of blocks:
128 |
129 | ```
130 | {#id}
131 | > Block quote
132 | .
133 |
134 | Block quote
135 |
136 | ```
137 |
138 | ```
139 | {#id}
140 | # Heading
141 | .
142 |
145 | ```
146 |
147 | ```
148 | {.blue}
149 | - - - - -
150 | .
151 |
152 | ```
153 |
154 | ````
155 | {highlight=3}
156 | ``` ruby
157 | x = 3
158 | ```
159 | .
160 | x = 3
161 |
162 | ````
163 |
164 | ```
165 | {.special}
166 | 1. one
167 | 2. two
168 | .
169 |
170 | -
171 | one
172 |
173 | -
174 | two
175 |
176 |
177 | ```
178 |
179 | ```
180 | > {.foo}
181 | > > {.bar}
182 | > > nested
183 | .
184 |
185 |
186 | nested
187 |
188 |
189 | ```
190 |
191 | Comments start at a `%` character
192 | (not in quotes) and end with another `%`.
193 | These can be used to comment out part of an attribute
194 | list, or on their own without any real attributes.
195 |
196 | ```
197 | foo{#ident % this is a comment % .class}
198 | .
199 | foo
200 | ```
201 |
202 | In a block-level comment, subsequent lines must
203 | be indented, as with attributes:
204 |
205 | ```
206 | {% This is a comment before a
207 | block-level item. %}
208 | Paragraph.
209 | .
210 | Paragraph.
211 | ```
212 |
213 | Inline attributes can be empty:
214 |
215 | ```
216 | hi{}
217 | .
218 | hi
219 | ```
220 |
221 | Block attributes can be empty:
222 |
223 | ```
224 | {}
225 | hi
226 | .
227 | hi
228 | ```
229 |
--------------------------------------------------------------------------------
/tests/data/code_blocks.test:
--------------------------------------------------------------------------------
1 |
2 | ```
3 | ~~~
4 | code
5 | block
6 | ~~~
7 | .
8 | code
9 | block
10 |
11 | ```
12 |
13 | ````
14 | ``` python
15 | x = y + 3
16 | ```
17 | .
18 | x = y + 3
19 |
20 | ````
21 |
22 | ````
23 | ``` python
24 | if true:
25 | x = 3
26 | ```
27 | .
28 | if true:
29 | x = 3
30 |
31 | ````
32 |
33 | ````
34 | ``` not a code block ```
35 | .
36 | not a code block
37 | ````
38 |
39 | ````
40 | ``` not a code block
41 | .
42 | not a code block
43 | ````
44 |
45 | ````
46 | ```
47 | hi
48 | ```
49 | ```
50 | two
51 | ```
52 | .
53 | hi
54 |
55 | two
56 |
57 | ````
58 |
59 | Empty code block:
60 |
61 | ````
62 | ```
63 | ```
64 | .
65 |
66 | ````
67 |
--------------------------------------------------------------------------------
/tests/data/emoji.test:
--------------------------------------------------------------------------------
1 | ```
2 | :+1: :scream:
3 | .
4 | 👍 😱
5 | ```
6 |
7 | ```
8 | This is a :nonexistent: emoji.
9 | .
10 | This is a :nonexistent: emoji.
11 | ```
12 |
13 | ```
14 | :ice:scream:
15 | .
16 | :ice:scream:
17 | ```
18 |
--------------------------------------------------------------------------------
/tests/data/emphasis.test:
--------------------------------------------------------------------------------
1 | ```
2 | *foo bar*
3 | .
4 | foo bar
5 | ```
6 |
7 | ```
8 | a* foo bar*
9 | .
10 | a* foo bar*
11 | ```
12 |
13 | ```
14 | *foo bar *
15 | .
16 | *foo bar *
17 | ```
18 |
19 | Unicode spaces don't block emphasis.
20 |
21 | ```
22 | * a *
23 | .
24 | a
25 | ```
26 |
27 | Intraword:
28 |
29 | ```
30 | foo*bar*baz
31 | .
32 | foobarbaz
33 | ```
34 |
35 | ```
36 | _foo bar_
37 | .
38 | foo bar
39 | ```
40 |
41 | ```
42 | _ foo bar_
43 | .
44 | _ foo bar_
45 | ```
46 |
47 | ```
48 | _foo bar _
49 | .
50 | _foo bar _
51 | ```
52 |
53 | Unicode spaces don't block emphasis.
54 |
55 | ```
56 | _ a _
57 | .
58 | a
59 | ```
60 |
61 | Intraword:
62 |
63 | ```
64 | foo_bar_baz
65 | .
66 | foobarbaz
67 | ```
68 |
69 | ```
70 | aa_"bb"_cc
71 | .
72 | aa“bb”cc
73 | ```
74 |
75 | ```
76 | *foo_
77 | .
78 | *foo_
79 | ```
80 |
81 | ```
82 | _foo*
83 | .
84 | _foo*
85 | ```
86 |
87 | A line ending counts as whitespace:
88 |
89 | ```
90 | _foo bar
91 | _
92 | .
93 | _foo bar
94 | _
95 | ```
96 |
97 | So does a tab:
98 |
99 | ```
100 | _ a_
101 | .
102 | _ a_
103 | ```
104 |
105 | This one is different from commonmark:
106 |
107 | ```
108 | _(_foo_)_
109 | .
110 | (foo)
111 | ```
112 |
113 | But you can force the second `_` to be an opener
114 | using the marker `{`.
115 |
116 | ```
117 | _({_foo_})_
118 | .
119 | (foo)
120 | ```
121 |
122 | ```
123 | _(*foo*)_
124 | .
125 | (foo)
126 | ```
127 |
128 | Overlapping scopes (first to close wins):
129 |
130 | ```
131 | _foo *bar_ baz*
132 | .
133 | foo *bar baz*
134 | ```
135 |
136 | Over line break:
137 |
138 | ```
139 | _foo
140 | bar_
141 | .
142 | foo
143 | bar
144 | ```
145 |
146 | Inline content allowed:
147 |
148 | ```
149 | *foo [link](url) `*`*
150 | .
151 | foo link *
152 | ```
153 |
154 | Can't emph an underscore:
155 |
156 | ```
157 | ___
158 | .
159 | ___
160 | ```
161 |
162 | Unless you escape it:
163 |
164 | ```
165 | _\__
166 | .
167 | _
168 | ```
169 |
170 | No empty emph:
171 |
172 | ```
173 | __
174 | .
175 | __
176 | ```
177 |
178 | ```
179 | _}b_
180 | .
181 | _}b_
182 | ```
183 |
184 | ```
185 | _\}b_
186 | .
187 | }b
188 | ```
189 |
190 | ```
191 | _ab\_c_
192 | .
193 | ab_c
194 | ```
195 |
196 | ```
197 | *****a*****
198 | .
199 | a
200 | ```
201 |
202 | ```
203 | _[bar_](url)
204 | .
205 | [bar](url)
206 | ```
207 |
208 | ```
209 | \_[bar_](url)
210 | .
211 | _bar_
212 | ```
213 |
214 | Code takes precedence:
215 |
216 | ```
217 | _`a_`b
218 | .
219 | _a_b
220 | ```
221 |
222 | Autolinks take precedence:
223 |
224 | ```
225 | _
226 | .
227 | _http://example.com/a_b
228 | ```
229 |
--------------------------------------------------------------------------------
/tests/data/hello_world.test:
--------------------------------------------------------------------------------
1 | ```
2 | Hello, world!
3 | .
4 | Hello, world!
5 | ```
6 |
--------------------------------------------------------------------------------
/tests/data/insert_delete_mark.test:
--------------------------------------------------------------------------------
1 | ```
2 | This is {-deleted
3 | _text_-}. The braces are -required-.
4 | And they must be in the -}right order{-.
5 | .
6 | This is deleted
7 | text. The braces are -required-.
8 | And they must be in the -}right order{-.
9 | ```
10 |
11 | ```
12 | {+ Inserted text +}
13 | .
14 | Inserted text
15 | ```
16 |
17 | Interaction with smart:
18 |
19 | ```
20 | {--hello--}
21 | .
22 | -hello-
23 | ```
24 |
25 | ```
26 | This is {=marked *text*=}.
27 | .
28 | This is marked text.
29 | ```
30 |
--------------------------------------------------------------------------------
/tests/data/links_and_images.test:
--------------------------------------------------------------------------------
1 |
2 | ```
3 | [basic _link_][a_b_]
4 |
5 | [a_b_]: url
6 | .
7 | basic link
8 | ```
9 |
10 | ```
11 | ![basic _image_][a_b_]
12 |
13 | [a_b_]: url
14 | .
15 | 
16 | ```
17 |
18 | ```
19 | [link][]
20 |
21 | [link]: url
22 | .
23 | link
24 | ```
25 |
26 | ```
27 | [link][]
28 |
29 | [link]:
30 | url
31 | .
32 | link
33 | ```
34 |
35 | The URL can be split over multiple lines:
36 |
37 | ```
38 | [link][]
39 |
40 | [link]:
41 | url
42 | andurl
43 | .
44 | link
45 | ```
46 |
47 | ```
48 | [link](url
49 | andurl)
50 | .
51 | link
52 | ```
53 |
54 | ```
55 | [link][]
56 |
57 | [link]:
58 | [link2]: url
59 | .
60 | link
61 | ```
62 |
63 | ```
64 | [link][]
65 | [link][link2]
66 |
67 | [link2]:
68 | url2
69 | [link]:
70 | url
71 | .
72 | link
73 | link
74 | ```
75 |
76 | ```
77 | [link][a and
78 | b]
79 |
80 | [a and b]: url
81 | .
82 | link
83 | ```
84 |
85 | If the reference isn't found, we get an empty link.
86 |
87 | ```
88 | [link][a and
89 | b]
90 | .
91 | link
92 | ```
93 |
94 | Reference definitions can't have line breaks in the key:
95 |
96 | ```
97 | [link][a and
98 | b]
99 |
100 | [a and
101 | b]: url
102 | .
103 | link
104 | [a and
105 | b]: url
106 | ```
107 |
108 | No case normalization is done on reference definitions:
109 |
110 | ```
111 | [Link][]
112 |
113 | [link]: /url
114 | .
115 | Link
116 | ```
117 | STOP
118 | Attributes on reference definitions get transferred to
119 | the link:
120 |
121 | ```
122 | {title=foo}
123 | [ref]: /url
124 |
125 | [ref][]
126 | .
127 | ref
128 | ```
129 |
130 | Attributes on the link override those on references:
131 |
132 | ```
133 | {title=foo}
134 | [ref]: /url
135 |
136 | [ref][]{title=bar}
137 | .
138 | ref
139 | ```
140 |
141 | ```
142 | [link _and_ link][]
143 |
144 | [link and link]: url
145 | .
146 | link and link
147 | ```
148 |
149 | ```
150 | 
151 | .
152 | 
153 | ```
154 |
155 | ```
156 | [](url)
157 | .
158 | 
159 | ```
160 |
161 | ```
162 | [unclosed](hello *a
163 | b*
164 | .
165 | [unclosed](hello a
166 | b
167 | ```
168 |
169 | Note that soft breaks are ignored, so long URLs
170 | can be split over multiple lines:
171 | ```
172 | [closed](hello *a
173 | b*)
174 | .
175 | closed
176 | ```
177 |
178 | Here the strong takes precedence over the link because it
179 | starts first:
180 | ```
181 | *[closed](hello*)
182 | .
183 | [closed](hello)
184 | ```
185 |
186 | Avoid this with a backslash escape:
187 | ```
188 | *[closed](hello\*)
189 | .
190 | *closed
191 | ```
192 |
193 | Link in link?
194 | ```
195 | [[foo](bar)](baz)
196 | .
197 | foo
198 | ```
199 |
200 | Link in image?
201 | ```
202 | ](img)
203 | .
204 | 
205 | ```
206 |
207 | Image in link?
208 | ```
209 | [](url)
210 | .
211 | 
212 | ```
213 |
214 | Autolinks:
215 | ```
216 |
217 |
218 | .
219 | http://example.com/foo
220 | me@example.com
221 | ```
222 |
223 | Openers inside `[..](` or `[..][` or `[..]{` can't match
224 | outside them, even if the construction doesn't turn out to be
225 | a link or span or image.
226 |
227 | ```
228 | [x_y](x_y)
229 | .
230 | x_y
231 | ```
232 |
233 | ```
234 | [x_y](x_
235 | .
236 | [x_y](x_
237 | ```
238 |
239 | ```
240 | [x_y]{.bar_}
241 | .
242 | x_y
243 | ```
244 |
--------------------------------------------------------------------------------
/tests/data/para.test:
--------------------------------------------------------------------------------
1 | ```
2 | hi
3 | there
4 | .
5 | hi
6 | there
7 | ```
8 |
--------------------------------------------------------------------------------
/tests/data/regression.test:
--------------------------------------------------------------------------------
1 | Issue #104:
2 |
3 | ```
4 | {1--}
5 |
6 | {1-}
7 | .
8 | {1--}
9 | {1-}
10 | ```
11 |
12 | Issue #106:
13 |
14 | ```
15 |
16 | |`|
17 | .
18 | ||
19 | ```
20 |
21 | ``` [matches]
22 |
23 | |`|x
24 | .
25 | blankline 1-1
26 | +para 2-2
27 | str 2-2
28 | +verbatim 3-3
29 | str 4-5
30 | -verbatim 5-5
31 | -para 6-6
32 | ```
33 |
34 |
--------------------------------------------------------------------------------
/tests/data/super_subscript.test:
--------------------------------------------------------------------------------
1 | ```
2 | H~2~O
3 | .
4 | H2O
5 | ```
6 |
7 | ```
8 | mc^2^
9 | .
10 | mc2
11 | ```
12 |
13 | ```
14 | test^of superscript ~with subscript~^
15 | .
16 | testof superscript with subscript
17 | ```
18 |
19 | ```
20 | H{~2 ~}O
21 | .
22 | H2 O
23 | ```
24 |
--------------------------------------------------------------------------------
/tests/data/verbatim.test:
--------------------------------------------------------------------------------
1 |
2 | ```
3 | Some `code`
4 | .
5 | Some code
6 | ```
7 |
8 | ```
9 | Some `code
10 | with a line break`
11 | .
12 | Some code
13 | with a line break
14 | ```
15 |
16 | ```
17 | Special characters: `*hi*`
18 | .
19 | Special characters: *hi*
20 | ```
21 |
22 | ```
23 | *foo`*`
24 | .
25 | *foo*
26 | ```
27 |
28 | ```
29 | `````a`a``a```a````a``````a`````
30 | .
31 | a`a``a```a````a``````a
32 | ```
33 |
34 | ```
35 | ` ``a`` `
36 | .
37 | ``a``
38 | ```
39 |
40 | Implicitly closed by end of paragraph:
41 |
42 | ```
43 | ` a
44 | c
45 | .
46 | a
47 | c
48 | ```
49 |
--------------------------------------------------------------------------------
/tests/spec.rs:
--------------------------------------------------------------------------------
1 | use std::{fs, path::PathBuf};
2 |
3 | #[allow(unused)]
4 | fn to_ref_html(source: &str, matches: bool) -> String {
5 | let sh = xshell::Shell::new().unwrap();
6 | if !sh.path_exists("ref") {
7 | xshell::cmd!(sh, "git clone https://github.com/jgm/djot ref").run().unwrap();
8 | }
9 | sh.change_dir("ref");
10 | let matches = if matches { Some("-m") } else { None };
11 | let mut html = xshell::cmd!(sh, "lua ./bin/main.lua {matches...}").stdin(source).read().unwrap();
12 | if cfg!(windows) {
13 | html = html.replace("\r\n", "\n");
14 | }
15 | html.push('\n');
16 | html
17 | }
18 |
19 | struct TestOpts {
20 | debug_ast: bool,
21 | ref_matches: bool,
22 | parse: djot::ParseOpts,
23 | }
24 |
25 | #[test]
26 | fn spec_tests() {
27 | let opts =
28 | TestOpts { debug_ast: true, ref_matches: true, parse: djot::ParseOpts { debug_matches: true } };
29 |
30 | let mut last_fail = LastFail::load();
31 | let sh = xshell::Shell::new().unwrap();
32 | let mut total = 0;
33 | for path in sh.read_dir("./tests/data").unwrap() {
34 | if path.extension().unwrap_or_default() == "test" {
35 | let file_stem = path.file_stem().unwrap_or_default().to_str().unwrap_or_default();
36 | let source = fs::read_to_string(&path).unwrap();
37 | for (i, test_case) in parse_test(source.as_str()).into_iter().enumerate() {
38 | if last_fail.skip(file_stem, i) {
39 | continue;
40 | }
41 | let mut debug = String::new();
42 | let doc = djot::Document::parse_opts(opts.parse.clone(), &test_case.djot);
43 | debug.push_str(&doc.debug);
44 | if opts.debug_ast {
45 | debug.push_str(&doc.to_json());
46 | }
47 | let got = doc.to_html();
48 | let want = test_case.html.as_str();
49 | let ref_html = to_ref_html(&test_case.djot, false);
50 | if opts.ref_matches {
51 | debug.push_str(&format!("Ref Matches:\n{}-----", to_ref_html(&test_case.djot, true)));
52 | }
53 | if want != ref_html.as_str() {
54 | panic!(
55 | "\nReference mismatch in {}\nRef:\n{ref_html}-----\nWant:\n{want}-----\n",
56 | file_stem
57 | )
58 | }
59 | if got.as_str() != want {
60 | let mut msg = format!(
61 | "\nMismatch in {}\nSource:\n{}-----\nWant:\n{want}-----\nGot:\n{got}-----\n",
62 | file_stem, test_case.djot,
63 | );
64 | if !debug.is_empty() {
65 | msg = format!("{msg}Debug:\n{debug}-----\n")
66 | }
67 | panic!("{msg}")
68 | }
69 | last_fail.test_ok();
70 | total += 1;
71 | }
72 | }
73 | }
74 | eprintln!("total tests: {total}");
75 | }
76 |
77 | #[derive(Debug, Default)]
78 | struct TestCase {
79 | djot: String,
80 | html: String,
81 | }
82 |
83 | #[derive(Debug)]
84 | enum ParseState {
85 | Init,
86 | Djot(TestCase, usize),
87 | Html(TestCase, usize),
88 | }
89 |
90 | fn parse_test(source: &str) -> Vec<TestCase> {
91 | let mut res = Vec::new();
92 | let mut state = ParseState::Init;
93 | for line in source.lines() {
94 | state = match state {
95 | ParseState::Init if line == "STOP" => {
96 | break;
97 | }
98 | ParseState::Init => match parse_fence(line) {
99 | Some(fence) => ParseState::Djot(TestCase::default(), fence),
100 | None => ParseState::Init,
101 | },
102 | ParseState::Djot(mut test_case, test_case_fence) => {
103 | if line == "." {
104 | ParseState::Html(test_case, test_case_fence)
105 | } else {
106 | test_case.djot.push_str(line);
107 | test_case.djot.push('\n');
108 | ParseState::Djot(test_case, test_case_fence)
109 | }
110 | }
111 | ParseState::Html(mut test_case, test_case_fence) => match parse_fence(line) {
112 | Some(fence) if fence == test_case_fence => {
113 | res.push(test_case);
114 | ParseState::Init
115 | }
116 | _ => {
117 | test_case.html.push_str(line);
118 | test_case.html.push('\n');
119 | ParseState::Html(test_case, test_case_fence)
120 | }
121 | },
122 | };
123 | }
124 |
125 | res
126 | }
127 |
128 | fn parse_fence(line: &str) -> Option<usize> {
129 | if line.bytes().all(|it| it == b'`') && line.len() > 0 {
130 | Some(line.len())
131 | } else {
132 | None
133 | }
134 | }
135 |
136 | struct LastFail {
137 | loaded: Option<(String, usize)>,
138 | current: Option<(String, usize)>,
139 | }
140 |
141 | impl LastFail {
142 | fn load() -> LastFail {
143 | let mut loaded = None;
144 | if let Ok(text) = fs::read_to_string(fail_file()) {
145 | let (name, pos) = text.split_once(':').unwrap_or_else(|| panic!("bad fail file {text:?}"));
146 | let idx = pos.parse::<usize>().unwrap_or_else(|_| panic!("bad fail file {text:?}"));
147 | eprintln!("loaded fail {name}:{idx}");
148 | loaded = Some((name.to_string(), idx))
149 | }
150 | LastFail { loaded, current: None }
151 | }
152 | fn skip(&mut self, name: &str, pos: usize) -> bool {
153 | self.current = Some((name.to_string(), pos));
154 | if let Some(loaded) = &self.loaded {
155 | return !(loaded.0 == name && loaded.1 == pos);
156 | }
157 | false
158 | }
159 | fn test_ok(&mut self) {
160 | if let Some((name, pos)) = &self.loaded {
161 | eprintln!("{}:{} is now ok!", name, pos);
162 | let _ = fs::remove_file(&fail_file());
163 | self.loaded = None;
164 | }
165 | self.current = None
166 | }
167 | }
168 |
169 | impl Drop for LastFail {
170 | fn drop(&mut self) {
171 | if let Some((name, pos)) = &self.current {
172 | eprintln!("saved fail {name}:{pos}");
173 | let _ = fs::write(fail_file(), format!("{name}:{pos}"));
174 | }
175 | }
176 | }
177 |
178 | fn fail_file() -> PathBuf {
179 | PathBuf::from(env!("CARGO_TARGET_TMPDIR")).join("fail")
180 | }
181 |
--------------------------------------------------------------------------------
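`parse_test` in `tests/spec.rs` above reads the `.test` files in `tests/data`: a backtick fence opens a case, the djot source runs until a line containing only `.`, the expected HTML runs until a fence of the same length, and a bare `STOP` line between cases skips the rest of the file. A sketch of what it extracts from a minimal case (illustrative, not a test in the tree):

````rust
// Illustrative: parse_test on a single fenced case.
fn example() {
  let source = "```\nhi\nthere\n.\n<p>hi\nthere</p>\n```\n";
  let cases = parse_test(source);
  assert_eq!(cases.len(), 1);
  assert_eq!(cases[0].djot, "hi\nthere\n");
  assert_eq!(cases[0].html, "<p>hi\nthere</p>\n");
}
````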
/tests/tidy.rs:
--------------------------------------------------------------------------------
1 | use xshell::{cmd, Shell};
2 |
3 | #[test]
4 | fn test_formatting() {
5 | let sh = Shell::new().unwrap();
6 | cmd!(sh, "cargo fmt -- --check").run().unwrap()
7 | }
8 |
--------------------------------------------------------------------------------