├── .gitattributes ├── .github └── workflows │ └── semgrep.yml ├── .gitignore ├── .gitmodules ├── LICENSE ├── Makefile ├── README.md ├── bench-fixtures ├── es-spec.html └── html-spec.html ├── c ├── .gitignore ├── Makefile ├── actions.rl ├── field-names.h ├── parse_errors.rl ├── parser-feedback.c ├── parser-feedback.h ├── serializer.c ├── serializer.h ├── tag-types.h ├── tokenizer-states.rl ├── tokenizer.h └── tokenizer.rl ├── cfsetup.yaml ├── convert-test-log.py ├── error-with-feedback-tests └── trailing-solidus.test ├── images ├── language-specific-actions.png ├── perf-comparison.png ├── ragel-visualization.png ├── syntax-description.png └── syntax-files.png ├── package.json ├── rust ├── .editorconfig ├── .gitignore ├── Cargo.toml ├── benches │ └── bench.rs ├── examples │ └── trace.rs ├── lazyhtml-sys │ ├── Cargo.toml │ ├── build.rs │ ├── src │ │ └── lib.rs │ └── wrapper.h ├── src │ ├── feedback.rs │ ├── lib.rs │ ├── serializer.rs │ └── tokenizer.rs └── tests │ ├── decoder.rs │ ├── feedback_tokens │ ├── mod.rs │ ├── noop_tree_sink.rs │ └── token_sink_proxy.rs │ ├── html5lib.rs │ ├── parse_errors.rs │ ├── test.rs │ ├── token.rs │ └── unescape.rs ├── simplify-graph.js └── syntax ├── _helpers.rl ├── _navigation.rl ├── cdata.rl ├── comment.rl ├── data.rl ├── doctype.rl ├── endtag.rl ├── index.rl ├── plaintext.rl ├── rawtext.rl ├── rcdata.rl ├── scriptdata.rl └── starttag.rl /.gitattributes: -------------------------------------------------------------------------------- 1 | # Exclude the HTML files from GitHub's language statistics 2 | # https://github.com/github/linguist#using-gitattributes 3 | bench-fixtures/* linguist-vendored 4 | -------------------------------------------------------------------------------- /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | 2 | on: 3 | pull_request: {} 4 | workflow_dispatch: {} 5 | push: 6 | branches: 7 | - main 8 | - master 9 | schedule: 10 | - cron: '0 0 * * *' 11 | name: Semgrep config 12 | jobs: 13 | semgrep: 14 | name: semgrep/ci 15 | runs-on: ubuntu-20.04 16 | env: 17 | SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }} 18 | SEMGREP_URL: https://cloudflare.semgrep.dev 19 | SEMGREP_APP_URL: https://cloudflare.semgrep.dev 20 | SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version 21 | container: 22 | image: returntocorp/semgrep 23 | steps: 24 | - uses: actions/checkout@v3 25 | - run: semgrep ci 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.dot 2 | *.png 3 | *.ri 4 | node_modules 5 | /rust/tests.log 6 | /rust/failures.log 7 | /rust/tests.xml 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "html5lib-tests"] 2 | path = html5lib-tests 3 | url = https://github.com/html5lib/html5lib-tests.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2019, Cloudflare, Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. 
Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | RAGEL = ragel 2 | RAGELFLAGS = 3 | 4 | RL_FILES := $(wildcard syntax/*.rl) 5 | 6 | .PHONY: c-tokenizer 7 | c-tokenizer: 8 | make -C c 9 | 10 | .PHONY: test 11 | test: 12 | cd rust && cargo test 13 | 14 | .PHONY: bench 15 | bench: 16 | cd rust && cargo bench 17 | 18 | %.dot: c/tokenizer.rl $(RL_FILES) 19 | $(RAGEL) $(RAGELFLAGS) -Vp -M $(notdir $(basename $@)) $< > $@ 20 | node simplify-graph.js $@ 21 | 22 | %.png: %.dot 23 | dot -Tpng $< -o $@ 24 | open $@ 25 | 26 | .PHONY: clean 27 | clean: 28 | rm -rf *.dot *.png 29 | make -C c clean 30 | cd rust; cargo clean 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LazyHTML (lhtml) 2 | 3 | LazyHTML is an HTML5-compliant parser and serializer that enables building transformation pipelines in a pluggable manner. 4 | 5 | ## Testing 6 | 7 | ``` 8 | make test 9 | ``` 10 | 11 | ## Benchmark 12 | 13 | ``` 14 | make bench 15 | ``` 16 | 17 | ## How do we use it?
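In short, the whole flow looks like this; each step is explained below (a condensed sketch that just stitches together the snippets from this section, using a `handle_token` callback like the one defined further down):

```c
char buffer[1048576];

lhtml_options_t options = { /* ... as shown below ... */ };
lhtml_state_t state;
lhtml_init(&state, &options);

lhtml_token_handler_t handler;
lhtml_add_handler(&state, &handler, handle_token);

lhtml_string_t chunk = { .data = "...", .length = 3 };
lhtml_feed(&state, &chunk); // feed as many chunks as you have
lhtml_feed(&state, NULL);   // finalize
```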
18 | 19 | First of all, you need to create a buffer of a desired size: 20 | 21 | ```c 22 | char buffer[1048576]; 23 | ``` 24 | 25 | Then, you want to create a parsing state and initialize it with desired options: 26 | 27 | ```c 28 | lhtml_options_t options = { 29 | .initial_state = LHTML_STATE_DATA, 30 | .allow_cdata = false, 31 | .last_start_tag_name = { .length = 0 }, 32 | .buffer = buffer, 33 | .buffer_size = sizeof(buffer) 34 | }; 35 | 36 | lhtml_state_t state; 37 | 38 | lhtml_init(&state, &options); 39 | ``` 40 | 41 | At this point, you can inject your own handler(s) for transformation: 42 | 43 | ```c 44 | lhtml_token_handler_t handler; 45 | lhtml_add_handler(&state, &handler, handle_token); 46 | ``` 47 | 48 | Finally, feed it chunk by chunk: 49 | 50 | ```c 51 | lhtml_string_t chunk = { .data = "...", .length = 3 }; 52 | lhtml_feed(&state, &chunk); 53 | ``` 54 | 55 | And finalize by sending a NULL chunk (signalling that no further data will be available): 56 | 57 | ```c 58 | lhtml_feed(&state, NULL); 59 | ``` 60 | 61 | ## Nice, but what do we put into the custom handlers / plugins? 62 | 63 | Each plugin can have its own state. To simplify the API, we take advantage of the fact that in C a pointer to a structure also points to its first member, so if your transformation needs its own state, the convention is to put `lhtml_token_handler_t handler;` as the first member of your structure, and cast the `extra` pointer in the callback back to your state. If the transformation doesn't need its own state, `lhtml_token_handler_t` can be used directly, as shown below. This member is needed so that lhtml can chain various handlers into a single pipeline (if you're familiar with the Nginx module system, this should look familiar, albeit with some modifications). 64 | 65 | So, for example, a function that only transforms the `href` attribute on links can look like the following: 66 | 67 | ```c 68 | // define static string to be used for replacements 69 | static const lhtml_string_t REPLACEMENT = { 70 | .data = "[REPLACED]", 71 | .length = sizeof("[REPLACED]") - 1 72 | }; 73 | 74 | static void token_handler(lhtml_token_t *token, void *extra /* this can be your state */) { 75 | if (token->type == LHTML_TOKEN_START_TAG) { // we're interested only in start tags 76 | lhtml_token_starttag_t *tag = &token->start_tag; 77 | if (tag->type == LHTML_TAG_A) { // check whether the tag is an `<a>` 78 | const size_t n_attrs = tag->attributes.count; 79 | lhtml_attribute_t *attrs = tag->attributes.items; 80 | for (size_t i = 0; i < n_attrs; i++) { // iterate over attributes 81 | lhtml_attribute_t *attr = &attrs[i]; 82 | if (lhtml_name_equals(attr->name, "href")) { // match the attribute name 83 | attr->value = REPLACEMENT; // set the attribute value 84 | } 85 | } 86 | } 87 | } 88 | lhtml_emit(token, extra); // pass transformed token(s) to next handler(s) 89 | } 90 | ``` 91 | 92 | In your main code, use this handler: 93 | 94 | ```c 95 | lhtml_token_handler_t handler; 96 | lhtml_add_handler(&state, &handler, token_handler); 97 | ``` 98 | 99 | That's it!
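If your transformation does need its own state, the convention described above looks like this in practice (a minimal sketch with hypothetical names, counting start tags just for illustration):

```c
typedef struct {
    lhtml_token_handler_t handler; // must be the first member
    size_t start_tag_count;        // your own state goes after it
} my_counter_t;

static void count_start_tags(lhtml_token_t *token, void *extra) {
    my_counter_t *self = (my_counter_t *) extra; // safe: handler is the first member
    if (token->type == LHTML_TOKEN_START_TAG) {
        self->start_tag_count++;
    }
    lhtml_emit(token, extra); // always pass tokens down the pipeline
}

// registration:
my_counter_t counter = { .start_tag_count = 0 };
lhtml_add_handler(&state, &counter.handler, count_start_tags);
```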
100 | 101 | ## What does it do? 102 | 103 | lhtml is a lexer, also written in Ragel, but in a more modular fashion and with support for HTML5. 104 | 105 | * Various parts of the HTML syntax spec live in separate Ragel files (syntax/comment.rl, syntax/starttag.rl, ...) and are connected in syntax/index.rl 106 | 107 | ![files](https://github.com/cloudflare/lazyhtml/blob/60b7026da4c0df92284e03212988beac7c973b6e/images/syntax-files.png) 108 | 109 | * Syntax descriptions are separated from actions. 110 | 111 | ![descriptions](https://github.com/cloudflare/lazyhtml/blob/60b7026da4c0df92284e03212988beac7c973b6e/images/syntax-description.png) 112 | 113 | One benefit this brings is that all actions must be named, which makes it easy to visualize, debug and fix specific machines using Ragel's built-in visualization. 114 | Sample output from `make AttributeName.png` is shown below: 115 | 116 | ![visualization](https://github.com/cloudflare/lazyhtml/blob/60b7026da4c0df92284e03212988beac7c973b6e/images/ragel-visualization.png) 117 | 118 | This proved useful during development: the parser was prototyped in JavaScript for the sake of simplicity and then ported to C, with only API / string handling changes, within a couple of days. 119 | 120 | * lhtml operates on a byte level. The HTML spec defines a precise set of encodings that are allowed, and one interesting bit from the spec is: 121 | 122 | > Since support for encodings that are not defined in the WHATWG Encoding standard is prohibited, [UTF-16 encodings](https://html.spec.whatwg.org/multipage/infrastructure.html#utf-16-encoding) are the only encodings that this specification needs to treat as not being [ASCII-compatible encodings](https://html.spec.whatwg.org/multipage/infrastructure.html#ascii-compatible-encoding). 123 | 124 | That means that as long as we care only about the ASCII-compatible subset (and we do for all the known tags and attributes potentially used in transformations) and the content is not in UTF-16, we can lex HTML on a byte level without expensive streaming decoding in front of it and encoding back after transformation. This is pretty much what we did in the previous parsers, so we can't transform UTF-16 at the moment, but should we decide that we want it in the future, it can be implemented as a special-cased transform in front of the lexer (it's pretty rare on the Web though, so it's unlikely we will want it, as the potential issues outweigh the benefits). 125 | 126 | * lhtml operates in a streaming fashion. When it gets a new chunk, it combines it with the previous leftover in the preallocated buffer and parses the newly formed string. The leftover is the part of the previous token that was not yet finished. 127 | 128 | * Character tokens (pure text) are not saved between buffers, as they are the most common content and usually we don't care about them for transformation. That means only short tokens such as start tags, end tags, comments and doctypes will be buffered. 129 | 130 | * This leftover + chunk concatenation is the only place where a copy occurs. This significantly simplifies handling of the strings across the code (as otherwise we would end up with a rope instead of a flat in-memory chunk), and has low overhead (one memmove of the small leftover and one memcpy of the new chunk). Parsing itself is zero-copy, and returns tokens with {data, length} string structures which point to this buffer, making them lightweight on memory and easy to work with (and they're compatible with ngx_str_t out of the box). 131 | 132 | * All the memory is statically allocated for the entire context (document). On one hand, this means that if a transformation wants to preserve some tokens, it needs to copy their data manually into its own state; on the other hand, this brings significant performance wins, as we don't need to allocate/free memory over and over for various buffers and tokens, and instead reuse the same ones. Also, this avoids any restrictions on how that memory is allocated (whether it's malloc/free, an Nginx pool or even the stack - anything works as long as it stays alive during parsing). 133 | 134 | * Tag names are hashed by mapping each ASCII letter to the range 1..26 and shifting the accumulated code left by 5 bits per character (see the sketch below). This doesn't cover custom tags, but gives a fast inlinable linear function that covers all the standard tags we care about, and for the other rare cases we can use lhtml_name_equals which compares the actual names in a case-insensitive manner.
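For illustration, a standalone version of that hash (it mirrors `tag_type_append_char` in `c/tokenizer.rl`; `hash_tag_name` is just a name for this sketch):

```c
#include <stdint.h>
#include <stddef.h>

// Each ASCII letter maps to 1..26 via (c & 31); the accumulated code is
// shifted left by 5 bits per character. Overly long names and anything
// containing a non-ASCII-letter character (i.e. custom tags) hash to 0.
static uint64_t hash_tag_name(const char *name, size_t length) {
    uint64_t code = 0;
    for (size_t i = 0; i < length; i++) {
        char c = name[i];
        // bail out to 0 on overflow or on a non-letter character
        if ((code >> (64 - 5)) || !((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
            return 0;
        }
        code = (code << 5) | (uint64_t) (c & 31);
    }
    return code;
}

// hash_tag_name("br", 2) == (2 << 5) | 18 == 82 == LHTML_TAG_BR, and
// hash_tag_name("BR", 2) gives the same value, since (c & 31) folds case -
// this is the cheap case-insensitive comparison mentioned below.
```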
135 | 136 | * Each token & attribute, in addition to lexed strings, provides a string for the entire token / attribute which can be used if no modifications happened. This both preserves formatting and brings even better performance, by avoiding custom per-token serialization in favor of passing these raw strings as-is to the output for any tokens that we don't care about (don't modify). 137 | 138 | 139 | ## So is it correct and fast? 140 | 141 | It's HTML5 compliant, was tested against the official test suites, and several contributions were sent to the specification itself for clarification / simplification of the spec language. 142 | 143 | Unlike existing parsers, it didn't bail out on any of the 2,382,625 documents from HTTP Archive, although 0.2% of documents exceeded expected bufferization limits as they were in fact JavaScript or RSS or other types of content incorrectly served with Content-Type: text/html; since anything is valid HTML5, the parser still tried to parse them as HTML. 144 | 145 | The transformation used for the comparison below rewrites the `href` attribute on `<a>` tags (only that attribute and only in those tags) to a static value. It was compared against a few existing and popular HTML parsers (only tokenization mode was used for a fair comparison, so that they don't need to build an AST and so on), and timings in milliseconds for 100 iterations are the following (lazy mode means that we're using raw strings whenever possible; the other mode serializes each token just for the comparison): 146 | 147 | Parser | Example #1: 3.6 MB | Example #2: 7.9 MB | Speed #1 (MB/s) | Speed #2 (MB/s) 148 | --- | --- | --- | --- | --- 149 | Gumbo (Google) | 265.05 | 542.93 | 13.62 | 14.62 150 | html5ever (Mozilla) | 289.75 | 444.32 | 12.46 | 17.87 151 | libhubbub (Netsurf) | 113.57 | 232.33 | 31.80 | 34.17 152 | lhtml (CloudFlare) | 45.32 | 71.55 | 79.69 | 110.97 153 | lhtml (lazy mode) (CloudFlare) | 26.40 | 49.57 | 136.78 | 160.18 154 | 155 | ![comparison](https://github.com/cloudflare/lazyhtml/blob/60b7026da4c0df92284e03212988beac7c973b6e/images/perf-comparison.png) 156 | 157 | ## Are there any quirks? 158 | 159 | A few parts of the spec are deliberately not implemented in the core lexer; these parts were carefully extracted in a way that doesn't break compatibility, but allows moving unnecessary yet expensive operations out into a separate optional module in the pipeline. 160 | 161 | More specifically, the specification prescribes various text transformations in different contexts, such as: 162 | 163 | * normalizing CR / CRLF to LF 164 | * decoding named / numeric XML-like entities 165 | * replacing the U+0000 (NUL) character with U+FFFD (replacement character) in certain contexts where it's considered unsafe 166 | * normalizing uppercase tag names and attributes to lowercase in non-XML contexts 167 | 168 | Those are important for correct display in browsers, but as we don't render content, perform very limited text processing, and care only about standard (ASCII-subset) tag names and attributes, we can get away with ignoring them and implementing them in a separate plugin if needed.
This doesn't change correctness as long as you do e.g. case-insensitive comparisons (which we already do in a very cheap way - via case-insensitive hashing). 169 | 170 | Otherwise, we would need to apply charset detection and text decoding (as entity matches or U+FFFD have different representations in various encodings) in front of the parser, which would make it significantly slower for little to no benefit. 171 | 172 | ## License 173 | 174 | BSD licensed. See the [LICENSE](LICENSE) file for details. -------------------------------------------------------------------------------- /c/.gitignore: -------------------------------------------------------------------------------- 1 | /*.dSYM 2 | /node_modules 3 | /out 4 | -------------------------------------------------------------------------------- /c/Makefile: -------------------------------------------------------------------------------- 1 | RAGEL = ragel 2 | RAGELFLAGS = -G2 3 | CFLAGS = -g -O3 4 | override CFLAGS += -std=c99 -Wall -Wextra -Wcast-qual -Wwrite-strings -Wshadow -Winline -Wdisabled-optimization -Wuninitialized -Wcast-align -Wno-missing-field-initializers -Werror 5 | OUT = out 6 | TARGET := $(shell $(CC) -dumpmachine) 7 | OUT_TARGET := $(OUT)/$(TARGET) 8 | RAGEL_SOURCES := actions.rl $(wildcard ../syntax/*.rl) 9 | SOURCES := $(wildcard *.c) 10 | 11 | ## Phony tasks 12 | 13 | .PHONY: all 14 | all: lib 15 | 16 | .PHONY: lib 17 | lib: $(OUT_TARGET)/liblhtml.a 18 | 19 | .PHONY: clean-obj 20 | clean-obj: 21 | rm -rf $(OUT_TARGET) 22 | 23 | .PHONY: clean 24 | clean: 25 | rm -rf $(OUT) 26 | 27 | ## Intermediate dependencies 28 | 29 | $(OUT) $(OUT_TARGET): 30 | mkdir -p $@ 31 | 32 | $(OUT)/tokenizer.c: tokenizer.rl $(RAGEL_SOURCES) | $(OUT) 33 | $(RAGEL) $(RAGELFLAGS) $< -o $@ 34 | 35 | $(OUT)/tokenizer-states.h: tokenizer-states.rl $(RAGEL_SOURCES) | $(OUT) 36 | $(RAGEL) $(RAGELFLAGS) $< -o $@ 37 | 38 | $(OUT)/%.d: %.c | $(OUT) 39 | $(CC) $(CFLAGS) -MM $< -MT "\$$(OUT)/$(@F)" -MT "\$$(OUT_TARGET)/$(@F:.d=.o)" -MP -MF $@ 40 | 41 | $(OUT)/tokenizer.d: $(OUT)/tokenizer.c 42 | $(CC) $(CFLAGS) -xc -iquote. -MM $< -MT "\$$(OUT)/$(@F)" -MT "\$$(OUT_TARGET)/$(@F:.d=.o)" -MP -MF $@ 43 | 44 | ifneq (, $(filter all lib $(OUT)/%.d $(OUT_TARGET)/%, $(MAKECMDGOALS))) 45 | -include $(patsubst %.c, $(OUT)/%.d, $(SOURCES)) 46 | -include $(OUT)/tokenizer.d 47 | endif 48 | 49 | ## Object files 50 | 51 | $(OUT_TARGET)/tokenizer.o: $(OUT)/tokenizer-states.h | $(OUT_TARGET) 52 | $(OUT_TARGET)/tokenizer.o: 53 | $(CC) $(CFLAGS) -c $< -include $(OUT)/tokenizer-states.h -iquote. -Wno-parentheses-equality -o $@ 54 | 55 | $(OUT_TARGET)/parser-feedback.o: $(OUT)/tokenizer-states.h | $(OUT_TARGET) 56 | $(OUT_TARGET)/parser-feedback.o: 57 | $(CC) $(CFLAGS) -c $< -include $(OUT)/tokenizer-states.h -o $@ 58 | 59 | $(OUT_TARGET)/serializer.o: | $(OUT_TARGET) 60 | $(CC) $(CFLAGS) -c $< -o $@ 61 | 62 | ## Final library and binaries 63 | 64 | $(OUT_TARGET)/liblhtml.a: $(OUT_TARGET)/tokenizer.o $(OUT_TARGET)/parser-feedback.o $(OUT_TARGET)/serializer.o 65 | $(AR) rcs $@ $?
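# note: $? expands to only the object files newer than the archive, so unchanged members are not re-added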
66 | -------------------------------------------------------------------------------- /c/actions.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | action SaveQuote { 5 | state->quote = fc; 6 | } 7 | 8 | action IsMatchingQuote { fc == state->quote } 9 | 10 | action StartAppropriateEndTag { 11 | state->special_end_tag_type = 0; 12 | } 13 | 14 | action FeedAppropriateEndTag { tag_type_append_char(&state->special_end_tag_type, fc) } 15 | 16 | action IsAppropriateEndTagFed { state->special_end_tag_type == state->last_start_tag_type } 17 | 18 | action SetAppropriateEndTagName { 19 | lhtml_token_endtag_t *end_tag = GET_TOKEN(END_TAG); 20 | end_tag->name = range_string(state->slice_start + 2, p); 21 | end_tag->type = state->special_end_tag_type; 22 | } 23 | 24 | action StartSlice { 25 | state->slice_start = p; 26 | } 27 | 28 | action MarkPosition { 29 | state->mark = p; 30 | } 31 | 32 | action UnmarkPosition { 33 | state->mark = NULL; 34 | } 35 | 36 | action AdvanceMarkedPosition { 37 | state->mark++; 38 | } 39 | 40 | action EmitToken { 41 | emit_token(state, p + (p != eof)); 42 | } 43 | 44 | action CreateCharacter { 45 | token->type = LHTML_TOKEN_CHARACTER; 46 | state->unsafe_null = false; 47 | state->entities = false; 48 | } 49 | 50 | action UnsafeNull { 51 | state->unsafe_null = true; 52 | } 53 | 54 | action AllowEntities { 55 | state->entities = true; 56 | } 57 | 58 | action CreateCDataStart { 59 | token->type = LHTML_TOKEN_CDATA_START; 60 | } 61 | 62 | action CreateCDataEnd { 63 | token->type = LHTML_TOKEN_CDATA_END; 64 | } 65 | 66 | action CreateUnparsed { 67 | token->type = LHTML_TOKEN_UNPARSED; 68 | } 69 | 70 | action EmitSlice { 71 | emit_slice(state, p); 72 | } 73 | 74 | action CreateStartTagToken { 75 | CREATE_TOKEN(START_TAG, { 76 | .attributes = (lhtml_attributes_t) { 77 | .buffer = state->attr_buffer 78 | } 79 | }); 80 | } 81 | 82 | action SetStartTagName { 83 | lhtml_token_starttag_t *start_tag = GET_TOKEN(START_TAG); 84 | start_tag->name = range_string(state->slice_start, p); 85 | start_tag->type = lhtml_get_tag_type(start_tag->name); 86 | } 87 | 88 | action SetEndTagName { 89 | lhtml_token_endtag_t *end_tag = GET_TOKEN(END_TAG); 90 | end_tag->name = range_string(state->slice_start, p); 91 | end_tag->type = lhtml_get_tag_type(end_tag->name); 92 | } 93 | 94 | action SetLastStartTagName { 95 | state->last_start_tag_type = GET_TOKEN(START_TAG)->type; 96 | } 97 | 98 | action SetSelfClosingFlag { 99 | GET_TOKEN(START_TAG)->self_closing = true; 100 | } 101 | 102 | action EndComment { 103 | CREATE_TOKEN(COMMENT, { 104 | .value = range_string(state->slice_start, state->mark) 105 | }); 106 | } 107 | 108 | action CreateEndTagToken { 109 | CREATE_TOKEN(END_TAG, {}); 110 | } 111 | 112 | action CanCreateAttribute { can_create_attr(&GET_TOKEN(START_TAG)->attributes) } 113 | 114 | action SetAttributeValue { 115 | if (state->current_attr_is_unique) { 116 | lhtml_attributes_t *attributes = &GET_TOKEN(START_TAG)->attributes; 117 | lhtml_attribute_t *attr = &attributes->data[attributes->length - 1]; 118 | attr->value = range_string(state->slice_start, p); 119 | attr->raw.value.length = (size_t) (p + (*p == '"' || *p == '\'') - attr->name.data); 120 | } 121 | } 122 | 123 | action AppendAttribute { 124 | lhtml_attributes_t *attributes = &GET_TOKEN(START_TAG)->attributes; 125 | lhtml_string_t name = range_string(state->slice_start, p); 126 | 127 | state->current_attr_is_unique = lhtml_find_attr(attributes, name) == NULL; 128 | 129 | if 
(state->current_attr_is_unique ) { 130 | attributes->data[attributes->length++] = (lhtml_attribute_t) { 131 | .name = name, 132 | .raw = (lhtml_opt_string_t) { 133 | .has_value = true, 134 | .value = name 135 | } 136 | }; 137 | } else { 138 | parse_error(state, LHTML_ERR_DUPLICATE_ATTRIBUTE); 139 | } 140 | } 141 | 142 | action IsCDataAllowed { state->allow_cdata } 143 | 144 | action CreateDocType { 145 | CREATE_TOKEN(DOCTYPE, {}); 146 | } 147 | 148 | action SetDocTypeName { 149 | GET_TOKEN(DOCTYPE)->name = opt_range_string(state->slice_start, p); 150 | } 151 | 152 | action SetForceQuirksFlag { 153 | GET_TOKEN(DOCTYPE)->force_quirks = true; 154 | } 155 | 156 | action SetDocTypePublicIdentifier { 157 | GET_TOKEN(DOCTYPE)->public_id = opt_range_string(state->slice_start, p); 158 | } 159 | 160 | action SetDocTypeSystemIdentifier { 161 | GET_TOKEN(DOCTYPE)->system_id = opt_range_string(state->slice_start, p); 162 | } 163 | }%% 164 | -------------------------------------------------------------------------------- /c/field-names.h: -------------------------------------------------------------------------------- 1 | #ifndef LHTML_FIELD_NAMES_H 2 | #define LHTML_FIELD_NAMES_H 3 | 4 | #define LHTML_FIELD_NAME_COMMENT comment 5 | #define LHTML_FIELD_NAME_START_TAG start_tag 6 | #define LHTML_FIELD_NAME_END_TAG end_tag 7 | #define LHTML_FIELD_NAME_DOCTYPE doctype 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /c/parse_errors.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | action Err_AbruptClosingOfEmptyComment { parse_error(state, LHTML_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT); } 5 | action Err_AbruptDoctypePublicIdentifier { parse_error(state, LHTML_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER); } 6 | action Err_AbruptDoctypeSystemIdentifier { parse_error(state, LHTML_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER); } 7 | action Err_CDataInHtmlContent { parse_error(state, LHTML_ERR_CDATA_IN_HTML_CONTENT); } 8 | action Err_EndTagWithAttributes { parse_error(state, LHTML_ERR_END_TAG_WITH_ATTRIBUTES); } 9 | action Err_EndTagWithTrailingSolidus { parse_error(state, LHTML_ERR_END_TAG_WITH_TRAILING_SOLIDUS); } 10 | action Err_EofBeforeTagName { parse_error(state, LHTML_ERR_EOF_BEFORE_TAG_NAME); } 11 | action Err_EofInCData { parse_error(state, LHTML_ERR_EOF_IN_CDATA); } 12 | action Err_EofInComment { parse_error(state, LHTML_ERR_EOF_IN_COMMENT); } 13 | action Err_EofInDoctype { parse_error(state, LHTML_ERR_EOF_IN_DOCTYPE); } 14 | action Err_EofInScriptHtmlCommentLikeText { parse_error(state, LHTML_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT); } 15 | action Err_EofInTag { parse_error(state, LHTML_ERR_EOF_IN_TAG); } 16 | action Err_IncorrectlyClosedComment { parse_error(state, LHTML_ERR_INCORRECTLY_CLOSED_COMMENT); } 17 | action Err_IncorrectlyOpenedComment { parse_error(state, LHTML_ERR_INCORRECTLY_OPENED_COMMENT); } 18 | action Err_InvalidCharacterSequenceAfterDoctypeName { parse_error(state, LHTML_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME); } 19 | action Err_InvalidFirstCharacterOfTagName { parse_error(state, LHTML_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME); } 20 | action Err_MissingAttributeValue { parse_error(state, LHTML_ERR_MISSING_ATTRIBUTE_VALUE); } 21 | action Err_MissingDoctypeName { parse_error(state, LHTML_ERR_MISSING_DOCTYPE_NAME); } 22 | action Err_MissingDoctypePublicIdentifier { parse_error(state, LHTML_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER); } 23 | action 
Err_MissingDoctypeSystemIdentifier { parse_error(state, LHTML_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER); } 24 | action Err_MissingEndTagName { parse_error(state, LHTML_ERR_MISSING_END_TAG_NAME); } 25 | action Err_MissingQuoteBeforeDoctypePublicIdentifier { parse_error(state, LHTML_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER); } 26 | action Err_MissingQuoteBeforeDoctypeSystemIdentifier { parse_error(state, LHTML_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER); } 27 | action Err_MissingSpaceAfterDoctypePublicKeyword { parse_error(state, LHTML_ERR_MISSING_SPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD); } 28 | action Err_MissingSpaceAfterDoctypeSystemKeyword { parse_error(state, LHTML_ERR_MISSING_SPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD); } 29 | action Err_MissingWhitespaceBeforeDoctypeName { parse_error(state, LHTML_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME); } 30 | action Err_MissingWhitespaceBetweenAttributes { parse_error(state, LHTML_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES); } 31 | action Err_MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers { parse_error(state, LHTML_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS); } 32 | action Err_NestedComment { parse_error(state, LHTML_ERR_NESTED_COMMENT); } 33 | action Err_UnexpectedCharacterAfterDoctypeSystemIdentifier { parse_error(state, LHTML_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER); } 34 | action Err_UnexpectedCharacterInAttributeName { parse_error(state, LHTML_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME); } 35 | action Err_UnexpectedCharacterInUnquotedAttributeValue { parse_error(state, LHTML_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE); } 36 | action Err_UnexpectedEqualsSignBeforeAttributeName { parse_error(state, LHTML_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME); } 37 | action Err_UnexpectedQuestionMarkInsteadOfTagName { parse_error(state, LHTML_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME); } 38 | action Err_UnexpectedSolidusInTag { parse_error(state, LHTML_ERR_UNEXPECTED_SOLIDUS_IN_TAG); } 39 | }%% -------------------------------------------------------------------------------- /c/parser-feedback.c: -------------------------------------------------------------------------------- 1 | #include <assert.h> 2 | #include <stddef.h> 3 | #include "parser-feedback.h" 4 | // #include "$OUT/tokenizer-states.h" - included with command option to respect env var 5 | 6 | lhtml_ns_t lhtml_get_current_ns(const lhtml_feedback_t *state) { 7 | return state->ns_stack.data[state->ns_stack.length - 1]; 8 | } 9 | 10 | static bool is_foreign_ns(lhtml_ns_t ns) { 11 | return ns != LHTML_NS_HTML; 12 | } 13 | 14 | __attribute__((warn_unused_result)) 15 | static bool enter_ns(lhtml_feedback_t *state, lhtml_ns_t ns) { 16 | if (state->ns_stack.length >= state->ns_stack.capacity) { 17 | return false; 18 | } 19 | state->ns_stack.data[state->ns_stack.length++] = ns; 20 | state->tokenizer->allow_cdata = is_foreign_ns(ns); 21 | return true; 22 | } 23 | 24 | static void leave_ns(lhtml_feedback_t *state) { 25 | assert(state->ns_stack.length > 1); 26 | state->ns_stack.length--; 27 | state->tokenizer->allow_cdata = is_foreign_ns(lhtml_get_current_ns(state)); 28 | } 29 | 30 | static void ensure_tokenizer_mode(lhtml_tokenizer_t *tokenizer, lhtml_tag_type_t tag_type) { 31 | int new_state; 32 | 33 | switch (tag_type) { 34 | case LHTML_TAG_TEXTAREA: 35 | case LHTML_TAG_TITLE: 36 | new_state = html_en_RCData; 37 | break; 38 | 39 | case LHTML_TAG_PLAINTEXT: 40 | new_state = html_en_PlainText; 41 | break; 42 | 43 | case LHTML_TAG_SCRIPT: 44 |
new_state = html_en_ScriptData; 45 | break; 46 | 47 | case LHTML_TAG_STYLE: 48 | case LHTML_TAG_IFRAME: 49 | case LHTML_TAG_XMP: 50 | case LHTML_TAG_NOEMBED: 51 | case LHTML_TAG_NOFRAMES: 52 | case LHTML_TAG_NOSCRIPT: 53 | new_state = html_en_RawText; 54 | break; 55 | 56 | default: 57 | return; 58 | } 59 | 60 | tokenizer->cs = new_state; 61 | } 62 | 63 | static bool can_be_self_closing(lhtml_tag_type_t tag_type) { 64 | switch (tag_type) { 65 | case LHTML_TAG_BASE: 66 | case LHTML_TAG_BASEFONT: 67 | case LHTML_TAG_BGSOUND: 68 | case LHTML_TAG_LINK: 69 | case LHTML_TAG_META: 70 | case LHTML_TAG_AREA: 71 | case LHTML_TAG_BR: 72 | case LHTML_TAG_EMBED: 73 | case LHTML_TAG_IMG: 74 | case LHTML_TAG_KEYGEN: 75 | case LHTML_TAG_WBR: 76 | case LHTML_TAG_INPUT: 77 | case LHTML_TAG_PARAM: 78 | case LHTML_TAG_SOURCE: 79 | case LHTML_TAG_TRACK: 80 | case LHTML_TAG_HR: 81 | case LHTML_TAG_MATH: 82 | case LHTML_TAG_SVG: 83 | case LHTML_TAG_COL: 84 | case LHTML_TAG_FRAME: 85 | return true; 86 | default: 87 | return false; 88 | } 89 | } 90 | 91 | static bool foreign_causes_exit(const lhtml_token_starttag_t *start_tag) { 92 | switch (start_tag->type) { 93 | case LHTML_TAG_B: 94 | case LHTML_TAG_BIG: 95 | case LHTML_TAG_BLOCKQUOTE: 96 | case LHTML_TAG_BODY: 97 | case LHTML_TAG_BR: 98 | case LHTML_TAG_CENTER: 99 | case LHTML_TAG_CODE: 100 | case LHTML_TAG_DD: 101 | case LHTML_TAG_DIV: 102 | case LHTML_TAG_DL: 103 | case LHTML_TAG_DT: 104 | case LHTML_TAG_EM: 105 | case LHTML_TAG_EMBED: 106 | /*case LHTML_TAG_H1: 107 | case LHTML_TAG_H2: 108 | case LHTML_TAG_H3: 109 | case LHTML_TAG_H4: 110 | case LHTML_TAG_H5: 111 | case LHTML_TAG_H6:*/ 112 | case LHTML_TAG_HEAD: 113 | case LHTML_TAG_HR: 114 | case LHTML_TAG_I: 115 | case LHTML_TAG_IMG: 116 | case LHTML_TAG_LI: 117 | case LHTML_TAG_LISTING: 118 | case LHTML_TAG_MENU: 119 | case LHTML_TAG_META: 120 | case LHTML_TAG_NOBR: 121 | case LHTML_TAG_OL: 122 | case LHTML_TAG_P: 123 | case LHTML_TAG_PRE: 124 | case LHTML_TAG_RUBY: 125 | case LHTML_TAG_S: 126 | case LHTML_TAG_SMALL: 127 | case LHTML_TAG_SPAN: 128 | case LHTML_TAG_STRONG: 129 | case LHTML_TAG_STRIKE: 130 | case LHTML_TAG_SUB: 131 | case LHTML_TAG_SUP: 132 | case LHTML_TAG_TABLE: 133 | case LHTML_TAG_TT: 134 | case LHTML_TAG_U: 135 | case LHTML_TAG_UL: 136 | case LHTML_TAG_VAR: 137 | return true; 138 | case LHTML_TAG_FONT: { 139 | const lhtml_attributes_t *attrs = &start_tag->attributes; 140 | for (size_t i = 0; i < attrs->length; i++) { 141 | const lhtml_string_t name = attrs->data[i].name; 142 | if (LHTML_STR_NOCASE_EQUALS(name, "color") || LHTML_STR_NOCASE_EQUALS(name, "size") || LHTML_STR_NOCASE_EQUALS(name, "face")) { 143 | return true; 144 | } 145 | } 146 | return false; 147 | } 148 | default: { 149 | const lhtml_string_t name = start_tag->name; 150 | return name.length == 2 && ((name.data[0] | 0x20) == 'h') && (name.data[1] >= '1' && name.data[1] <= '6'); 151 | } 152 | } 153 | } 154 | 155 | static bool foreign_is_integration_point(lhtml_ns_t ns, lhtml_tag_type_t type, const lhtml_string_t name, const lhtml_attributes_t *attrs) { 156 | switch (ns) { 157 | case LHTML_NS_MATHML: 158 | switch (type) { 159 | case LHTML_TAG_MI: 160 | case LHTML_TAG_MO: 161 | case LHTML_TAG_MN: 162 | case LHTML_TAG_MS: 163 | case LHTML_TAG_MTEXT: 164 | return true; 165 | 166 | default: { 167 | if (attrs && LHTML_STR_NOCASE_EQUALS(name, "annotation-xml")) { 168 | for (size_t i = 0; i < attrs->length; i++) { 169 | const lhtml_attribute_t *attr = &attrs->data[i]; 170 | if (LHTML_STR_NOCASE_EQUALS(attr->name, "encoding") && 
(LHTML_STR_NOCASE_EQUALS(attr->value, "text/html") || LHTML_STR_NOCASE_EQUALS(attr->value, "application/xhtml+xml"))) { 171 | return true; 172 | } 173 | } 174 | } 175 | return false; 176 | } 177 | } 178 | 179 | case LHTML_NS_SVG: 180 | return type == LHTML_TAG_DESC || type == LHTML_TAG_TITLE || type == LHTML_TAG_FOREIGNOBJECT; 181 | 182 | case LHTML_NS_HTML: 183 | return false; 184 | 185 | default: 186 | assert(false); 187 | } 188 | } 189 | 190 | __attribute__((warn_unused_result)) 191 | static bool handle_start_tag_token(lhtml_feedback_t *state, lhtml_token_starttag_t *tag, bool *delayed_enter_html) { 192 | lhtml_tag_type_t type = tag->type; 193 | 194 | if (type == LHTML_TAG_SVG || type == LHTML_TAG_MATH) { 195 | return enter_ns(state, (lhtml_ns_t) type); 196 | } 197 | 198 | lhtml_ns_t ns = lhtml_get_current_ns(state); 199 | 200 | if (is_foreign_ns(ns)) { 201 | if (foreign_causes_exit(tag)) { 202 | leave_ns(state); 203 | } else { 204 | *delayed_enter_html = !tag->self_closing && foreign_is_integration_point(ns, type, tag->name, &tag->attributes); 205 | } 206 | } else { 207 | if (type == LHTML_TAG_IMAGE) { 208 | tag->type = LHTML_TAG_IMG; 209 | tag->name = LHTML_STRING("img"); 210 | } 211 | 212 | ensure_tokenizer_mode(state->tokenizer, type); 213 | } 214 | 215 | return true; 216 | } 217 | 218 | static void handle_end_tag_token(lhtml_feedback_t *state, const lhtml_token_endtag_t *tag) { 219 | lhtml_tag_type_t type = tag->type; 220 | 221 | lhtml_ns_t ns = lhtml_get_current_ns(state); 222 | 223 | if (is_foreign_ns(ns)) { 224 | if (type == (lhtml_tag_type_t) ns) { 225 | leave_ns(state); 226 | } 227 | } else if (state->ns_stack.length >= 2) { 228 | lhtml_ns_t prev_ns = state->ns_stack.data[state->ns_stack.length - 2]; 229 | 230 | if (foreign_is_integration_point(prev_ns, type, tag->name, NULL)) { 231 | leave_ns(state); 232 | } 233 | } 234 | } 235 | 236 | static void handle_token(lhtml_token_t *token, lhtml_feedback_t *state) { 237 | if (token->type == LHTML_TOKEN_START_TAG) { 238 | bool delayed_enter_html = false; 239 | if (!handle_start_tag_token(state, &token->start_tag, &delayed_enter_html)) { 240 | token->type = LHTML_TOKEN_ERROR; 241 | state->tokenizer->cs = html_error; 242 | } 243 | 244 | lhtml_ns_t ns = lhtml_get_current_ns(state); 245 | 246 | if (!is_foreign_ns(ns) && !can_be_self_closing(token->start_tag.type)) { 247 | token->parse_errors |= 1ULL << LHTML_ERR_NON_VOID_HTML_START_TAG_WITH_TRAILING_SOLIDUS; 248 | } 249 | 250 | lhtml_emit(token, state); 251 | 252 | if (delayed_enter_html) { 253 | if (!enter_ns(state, LHTML_NS_HTML)) { 254 | state->tokenizer->cs = html_error; 255 | } 256 | } 257 | } else { 258 | lhtml_emit(token, state); 259 | if (token->type == LHTML_TOKEN_END_TAG) { 260 | handle_end_tag_token(state, &token->end_tag); 261 | } 262 | } 263 | } 264 | 265 | void lhtml_feedback_inject(lhtml_tokenizer_t *tokenizer, lhtml_feedback_t *state) { 266 | state->tokenizer = tokenizer; 267 | assert(enter_ns(state, LHTML_NS_HTML)); 268 | LHTML_ADD_HANDLER(tokenizer, state, handle_token); 269 | } 270 | -------------------------------------------------------------------------------- /c/parser-feedback.h: -------------------------------------------------------------------------------- 1 | #ifndef LHTML_FEEDBACK_H 2 | #define LHTML_FEEDBACK_H 3 | 4 | #include "tokenizer.h" 5 | 6 | typedef enum { 7 | LHTML_NS_HTML = LHTML_TAG_HTML, 8 | LHTML_NS_MATHML = LHTML_TAG_MATH, 9 | LHTML_NS_SVG = LHTML_TAG_SVG 10 | } lhtml_ns_t; 11 | 12 | typedef LHTML_BUFFER_T(lhtml_ns_t) lhtml_ns_buffer_t; 13 | 
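// a fixed-capacity stack of the currently open namespaces (HTML / MathML / SVG), maintained by enter_ns / leave_ns in parser-feedback.c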
typedef LHTML_LIST_T(lhtml_ns_buffer_t) lhtml_ns_stack_t; 14 | 15 | typedef struct { 16 | lhtml_token_handler_t handler; // needs to be the first one 17 | 18 | lhtml_tokenizer_t *tokenizer; 19 | lhtml_ns_stack_t ns_stack; 20 | } lhtml_feedback_t; 21 | 22 | __attribute__((nonnull)) 23 | void lhtml_feedback_inject(lhtml_tokenizer_t *tokenizer, lhtml_feedback_t *state); 24 | 25 | __attribute__((nonnull, pure, warn_unused_result)) 26 | lhtml_ns_t lhtml_get_current_ns(const lhtml_feedback_t *state); 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /c/serializer.c: -------------------------------------------------------------------------------- 1 | #include <assert.h> 2 | #include <stddef.h> 3 | #include <string.h> 4 | #include "serializer.h" 5 | 6 | typedef struct { 7 | lhtml_string_t str; 8 | const char separator; 9 | bool done; 10 | } split_iterator_t; 11 | 12 | static lhtml_string_t split_iterator_next(split_iterator_t *iter) { 13 | lhtml_string_t str = iter->str; 14 | const char *ptr = memchr(str.data, iter->separator, str.length); 15 | if (ptr == NULL) { 16 | iter->done = true; 17 | return str; 18 | } 19 | const char *next = ptr + 1; 20 | const char *end = str.data + str.length; 21 | iter->str = (lhtml_string_t) { 22 | .data = next, 23 | .length = end - next 24 | }; 25 | return (lhtml_string_t) { 26 | .data = str.data, 27 | .length = ptr - str.data 28 | }; 29 | } 30 | 31 | static void serialize(lhtml_token_t *token, lhtml_serializer_t *extra) { 32 | lhtml_string_callback_t write = extra->writer; 33 | 34 | if (token->raw.has_value) { 35 | write(token->raw.value, extra); 36 | return; 37 | } 38 | 39 | switch (token->type) { 40 | case LHTML_TOKEN_CDATA_START: { 41 | write(LHTML_STRING("<![CDATA["), extra); 42 | break; 43 | } 44 | 45 | case LHTML_TOKEN_CDATA_END: { 46 | write(LHTML_STRING("]]>"), extra); 47 | break; 48 | } 49 | 50 | case LHTML_TOKEN_DOCTYPE: { 51 | write(LHTML_STRING("<!DOCTYPE"), extra); 52 | if (token->doctype.name.has_value) { 53 | // with name: `<!DOCTYPE name` 54 | write(LHTML_STRING(" "), extra); 55 | write(token->doctype.name.value, extra); // non-empty; shouldn't contain spaces or `>` 56 | if (token->doctype.public_id.has_value) { 57 | // with public id: `<!DOCTYPE name PUBLIC "public id"` 58 | write(LHTML_STRING(" PUBLIC \""), extra); 59 | write(token->doctype.public_id.value, extra); // shouldn't contain `"` or `>` 60 | if (!(token->doctype.force_quirks && !token->doctype.system_id.has_value)) { 61 | write(LHTML_STRING("\""), extra); 62 | } 63 | } else if (token->doctype.system_id.has_value) { 64 | write(LHTML_STRING(" SYSTEM"), extra); 65 | } else if (token->doctype.force_quirks) { 66 | write(LHTML_STRING(" _"), extra); 67 | } 68 | if (token->doctype.system_id.has_value) { 69 | write(LHTML_STRING(" \""), extra); 70 | write(token->doctype.system_id.value, extra); 71 | if (!token->doctype.force_quirks) { 72 | write(LHTML_STRING("\""), extra); 73 | } 74 | } 75 | } 76 | write(LHTML_STRING(">"), extra); 77 | break; 78 | } 79 | 80 | case LHTML_TOKEN_COMMENT: { 81 | write(LHTML_STRING("<!--"), extra); 82 | write(token->comment.value, extra); // shouldn't contain `-->` 83 | write(LHTML_STRING("-->"), extra); 84 | break; 85 | } 86 | 87 | case LHTML_TOKEN_START_TAG: { 88 | write(LHTML_STRING("<"), extra); 89 | write(token->start_tag.name, extra); // non-empty, starts with ASCII letter 90 | lhtml_attributes_t *attrs = &token->start_tag.attributes; 91 | for (size_t i = 0; i < attrs->length; i++) { 92 | lhtml_attribute_t *attr = &attrs->data[i]; 93 | write(LHTML_STRING(" "), extra); 94 | if (attr->raw.has_value) { 95 | write(attr->raw.value, extra); 96 | } else { 97 | write(attr->name, extra); 98 | write(LHTML_STRING("=\""), extra); 99 | split_iterator_t iter = { 100 | .str = attr->value, 101 | .separator = '"' 102 | }; 103 | for(;;) { 104 | // escape double-quotes in attribute values by splitting 105 | // the string and emitting &quot; between chunks 106
| lhtml_string_t chunk = split_iterator_next(&iter); 107 | write(chunk, extra); 108 | if (iter.done) { 109 | // last chunk, no quote afterwards 110 | break; 111 | } 112 | write(LHTML_STRING("&quot;"), extra); 113 | } 114 | write(LHTML_STRING("\""), extra); 115 | } 116 | } 117 | if (token->start_tag.self_closing) { 118 | write(LHTML_STRING(" /"), extra); 119 | } 120 | write(LHTML_STRING(">"), extra); 121 | break; 122 | } 123 | 124 | case LHTML_TOKEN_END_TAG: { 125 | write(LHTML_STRING("</"), extra); 126 | write(token->end_tag.name, extra); 127 | write(LHTML_STRING(">"), extra); 128 | break; 129 | } 130 | 131 | case LHTML_TOKEN_CHARACTER: 132 | case LHTML_TOKEN_UNPARSED: 133 | case LHTML_TOKEN_ERROR: 134 | case LHTML_TOKEN_EOF: { 135 | // These tokens must have a raw value 136 | assert(false); 137 | } 138 | } 139 | } 140 | 141 | void lhtml_serializer_inject(lhtml_tokenizer_t *tokenizer, lhtml_serializer_t *state) { 142 | LHTML_ADD_HANDLER(tokenizer, state, serialize); 143 | } 144 | -------------------------------------------------------------------------------- /c/serializer.h: -------------------------------------------------------------------------------- 1 | #ifndef LHTML_SERIALIZER_H 2 | #define LHTML_SERIALIZER_H 3 | 4 | #include "tokenizer.h" 5 | 6 | typedef struct lhtml_serializer_state_s lhtml_serializer_t; 7 | 8 | typedef void (*lhtml_string_callback_t)(lhtml_string_t string, lhtml_serializer_t *extra); 9 | 10 | struct lhtml_serializer_state_s { 11 | lhtml_token_handler_t handler; // needs to be the first one 12 | lhtml_string_callback_t writer; 13 | }; 14 | 15 | __attribute__((nonnull)) 16 | void lhtml_serializer_inject(lhtml_tokenizer_t *tokenizer, lhtml_serializer_t *state); 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /c/tag-types.h: -------------------------------------------------------------------------------- 1 | #ifndef LHTML_TAG_TYPES_H 2 | #define LHTML_TAG_TYPES_H 3 | 4 | typedef enum { 5 | // Regular elements 6 | LHTML_TAG_A = 1, 7 | LHTML_TAG_ABBR = 34898, 8 | LHTML_TAG_ADDRESS = 1212749427, 9 | LHTML_TAG_AREA = 51361, 10 | LHTML_TAG_ARTICLE = 1698991493, 11 | LHTML_TAG_ASIDE = 1680517, 12 | LHTML_TAG_AUDIO = 1741103, 13 | LHTML_TAG_B = 2, 14 | LHTML_TAG_BASE = 67173, 15 | LHTML_TAG_BDI = 2185, 16 | LHTML_TAG_BDO = 2191, 17 | LHTML_TAG_BLOCKQUOTE = 84081888640645, 18 | LHTML_TAG_BODY = 81049, 19 | LHTML_TAG_BR = 82, 20 | LHTML_TAG_BUTTON = 89805294, 21 | LHTML_TAG_CANVAS = 102193203, 22 | LHTML_TAG_CAPTION = 3272222190, 23 | LHTML_TAG_CITE = 108165, 24 | LHTML_TAG_CODE = 113797, 25 | LHTML_TAG_COL = 3564, 26 | LHTML_TAG_COLGROUP = 119595941552, 27 | LHTML_TAG_DATA = 132737, 28 | LHTML_TAG_DATALIST = 139185235572, 29 | LHTML_TAG_DD = 132, 30 | LHTML_TAG_DEL = 4268, 31 | LHTML_TAG_DETAILS = 4483753363, 32 | LHTML_TAG_DFN = 4302, 33 | LHTML_TAG_DIALOG = 143700455, 34 | LHTML_TAG_DIV = 4406, 35 | LHTML_TAG_DL = 140, 36 | LHTML_TAG_DT = 148, 37 | LHTML_TAG_EM = 173, 38 | LHTML_TAG_EMBED = 5671076, 39 | LHTML_TAG_FIELDSET = 216002612404, 40 | LHTML_TAG_FIGCAPTION = 221245627573742, 41 | LHTML_TAG_FIGURE = 211015237, 42 | LHTML_TAG_FOOTER = 217567410, 43 | LHTML_TAG_FORM = 212557, 44 | LHTML_TAG_HEAD = 267300, 45 | LHTML_TAG_HEADER = 273715378, 46 | LHTML_TAG_HGROUP = 276381360, 47 | LHTML_TAG_HR = 274, 48 | LHTML_TAG_HTML = 283052, 49 | LHTML_TAG_I = 9, 50 | LHTML_TAG_IFRAME = 308872613, 51 | LHTML_TAG_IMG = 9639, 52 | LHTML_TAG_INPUT = 9913012, 53 | LHTML_TAG_INS = 9683, 54 | LHTML_TAG_KBD = 11332, 55 | LHTML_TAG_KEYGEN = 375168174, 56 | LHTML_TAG_LABEL
= 12617900, 57 | LHTML_TAG_LEGEND = 408131012, 58 | LHTML_TAG_LI = 393, 59 | LHTML_TAG_LINK = 402891, 60 | LHTML_TAG_MAIN = 427310, 61 | LHTML_TAG_MAP = 13360, 62 | LHTML_TAG_MARK = 427595, 63 | LHTML_TAG_MATH = 427656, 64 | LHTML_TAG_MENU = 431573, 65 | LHTML_TAG_MENUITEM = 452537405613, 66 | LHTML_TAG_META = 431745, 67 | LHTML_TAG_METER = 13815986, 68 | LHTML_TAG_NAV = 14390, 69 | LHTML_TAG_NOSCRIPT = 497783744020, 70 | LHTML_TAG_OBJECT = 505746548, 71 | LHTML_TAG_OL = 492, 72 | LHTML_TAG_OPTGROUP = 533254979248, 73 | LHTML_TAG_OPTION = 520758766, 74 | LHTML_TAG_OUTPUT = 526009012, 75 | LHTML_TAG_P = 16, 76 | LHTML_TAG_PARAM = 16828461, 77 | LHTML_TAG_PICTURE = 17485682245, 78 | LHTML_TAG_PRE = 16965, 79 | LHTML_TAG_PROGRESS = 569594418803, 80 | LHTML_TAG_Q = 17, 81 | LHTML_TAG_RP = 592, 82 | LHTML_TAG_RT = 596, 83 | LHTML_TAG_RUBY = 611417, 84 | LHTML_TAG_S = 19, 85 | LHTML_TAG_SAMP = 624048, 86 | LHTML_TAG_SCRIPT = 641279508, 87 | LHTML_TAG_SECTION = 20572677614, 88 | LHTML_TAG_SELECT = 643175540, 89 | LHTML_TAG_SLOT = 635380, 90 | LHTML_TAG_SMALL = 20350348, 91 | LHTML_TAG_SOURCE = 653969509, 92 | LHTML_TAG_SPAN = 639022, 93 | LHTML_TAG_STRONG = 659111367, 94 | LHTML_TAG_STYLE = 20604293, 95 | LHTML_TAG_SUB = 20130, 96 | LHTML_TAG_SUMMARY = 21119796825, 97 | LHTML_TAG_SUP = 20144, 98 | LHTML_TAG_SVG = 20167, 99 | LHTML_TAG_TABLE = 21006725, 100 | LHTML_TAG_TBODY = 21052569, 101 | LHTML_TAG_TD = 644, 102 | LHTML_TAG_TEMPLATE = 693016856197, 103 | LHTML_TAG_TEXTAREA = 693389805729, 104 | LHTML_TAG_TFOOT = 21183988, 105 | LHTML_TAG_TH = 648, 106 | LHTML_TAG_THEAD = 21238820, 107 | LHTML_TAG_TIME = 664997, 108 | LHTML_TAG_TITLE = 21287301, 109 | LHTML_TAG_TR = 658, 110 | LHTML_TAG_TRACK = 21562475, 111 | LHTML_TAG_U = 21, 112 | LHTML_TAG_UL = 684, 113 | LHTML_TAG_VAR = 22578, 114 | LHTML_TAG_VIDEO = 23367855, 115 | LHTML_TAG_WBR = 23634, 116 | 117 | // Obsolete elements 118 | LHTML_TAG_APPLET = 50868404, 119 | LHTML_TAG_ACRONYM = 1193786157, 120 | LHTML_TAG_BGSOUND = 2402801092, 121 | LHTML_TAG_DIR = 4402, 122 | LHTML_TAG_FRAME = 6882725, 123 | LHTML_TAG_FRAMESET = 225533152436, 124 | LHTML_TAG_NOFRAMES = 497362711731, 125 | LHTML_TAG_ISINDEX = 10311110840, 126 | LHTML_TAG_LISTING = 13207479751, 127 | LHTML_TAG_NEXTID = 475812132, 128 | LHTML_TAG_NOEMBED = 15541373092, 129 | LHTML_TAG_PLAINTEXT = 18005893977876, 130 | LHTML_TAG_RB = 578, 131 | LHTML_TAG_RTC = 19075, 132 | LHTML_TAG_STRIKE = 659105125, 133 | LHTML_TAG_XMP = 25008, 134 | LHTML_TAG_BASEFONT = 70436208084, 135 | LHTML_TAG_BIG = 2343, 136 | LHTML_TAG_BLINK = 2500043, 137 | LHTML_TAG_CENTER = 106385586, 138 | LHTML_TAG_FONT = 212436, 139 | LHTML_TAG_MARQUEE = 14011651237, 140 | LHTML_TAG_MULTICOL = 469649100268, 141 | LHTML_TAG_NOBR = 474194, 142 | LHTML_TAG_SPACER = 654347442, 143 | LHTML_TAG_TT = 660, 144 | LHTML_TAG_IMAGE = 9864421, 145 | 146 | // MathML text integration points 147 | LHTML_TAG_MI = 425, 148 | LHTML_TAG_MO = 431, 149 | LHTML_TAG_MN = 430, 150 | LHTML_TAG_MS = 435, 151 | LHTML_TAG_MTEXT = 14292756, 152 | 153 | // SVG HTML integration points 154 | LHTML_TAG_DESC = 136803, 155 | // LHTML_TAG_TITLE // already exists, 156 | LHTML_TAG_FOREIGNOBJECT = 7478413254770103412, 157 | } lhtml_tag_type_t; 158 | 159 | #endif -------------------------------------------------------------------------------- /c/tokenizer-states.rl: -------------------------------------------------------------------------------- 1 | #ifndef LHTML_TOKENIZER_STATES_H 2 | #define LHTML_TOKENIZER_STATES_H 3 | 4 | %%{ 5 | machine html; 6 | 7 | include 
'actions.rl'; 8 | include 'parse_errors.rl'; 9 | include '../syntax/index.rl'; 10 | }%% 11 | 12 | #pragma GCC diagnostic push 13 | #pragma GCC diagnostic ignored "-Wunused-variable" 14 | %%write data nofinal; 15 | #pragma GCC diagnostic pop 16 | 17 | #endif -------------------------------------------------------------------------------- /c/tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef LHTML_TOKENIZER_H 2 | #define LHTML_TOKENIZER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "tag-types.h" 8 | 9 | // gcc :( 10 | #ifdef __clang__ 11 | #define LHTML_IMMUTABLE const 12 | #else 13 | #define LHTML_IMMUTABLE 14 | #endif 15 | 16 | #define LHTML_BUFFER_T(ITEM_T) struct {\ 17 | ITEM_T *LHTML_IMMUTABLE data;\ 18 | LHTML_IMMUTABLE size_t capacity;\ 19 | } 20 | 21 | #define LHTML_LIST_T(BUFFER_T) struct {\ 22 | union {\ 23 | BUFFER_T buffer;\ 24 | LHTML_IMMUTABLE LHTML_BUFFER_T(__typeof__(((BUFFER_T *)0)->data[0]));\ 25 | };\ 26 | size_t length;\ 27 | } 28 | 29 | typedef struct { 30 | const char *data; 31 | size_t length; 32 | } lhtml_string_t; 33 | 34 | typedef LHTML_BUFFER_T(char) lhtml_char_buffer_t; 35 | 36 | typedef struct { 37 | bool has_value; 38 | lhtml_string_t value; 39 | } lhtml_opt_string_t; 40 | 41 | typedef enum { 42 | LHTML_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT, 43 | LHTML_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER, 44 | LHTML_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER, 45 | LHTML_ERR_CDATA_IN_HTML_CONTENT, 46 | LHTML_ERR_END_TAG_WITH_ATTRIBUTES, 47 | LHTML_ERR_DUPLICATE_ATTRIBUTE, 48 | LHTML_ERR_END_TAG_WITH_TRAILING_SOLIDUS, 49 | LHTML_ERR_EOF_BEFORE_TAG_NAME, 50 | LHTML_ERR_EOF_IN_CDATA, 51 | LHTML_ERR_EOF_IN_COMMENT, 52 | LHTML_ERR_EOF_IN_DOCTYPE, 53 | LHTML_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT, 54 | LHTML_ERR_EOF_IN_TAG, 55 | LHTML_ERR_INCORRECTLY_CLOSED_COMMENT, 56 | LHTML_ERR_INCORRECTLY_OPENED_COMMENT, 57 | LHTML_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME, 58 | LHTML_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME, 59 | LHTML_ERR_MISSING_ATTRIBUTE_VALUE, 60 | LHTML_ERR_MISSING_DOCTYPE_NAME, 61 | LHTML_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER, 62 | LHTML_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER, 63 | LHTML_ERR_MISSING_END_TAG_NAME, 64 | LHTML_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, 65 | LHTML_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, 66 | LHTML_ERR_MISSING_SPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD, 67 | LHTML_ERR_MISSING_SPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD, 68 | LHTML_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME, 69 | LHTML_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES, 70 | LHTML_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, 71 | LHTML_ERR_NESTED_COMMENT, 72 | LHTML_ERR_NON_VOID_HTML_START_TAG_WITH_TRAILING_SOLIDUS, 73 | LHTML_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER, 74 | LHTML_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME, 75 | LHTML_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE, 76 | LHTML_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME, 77 | LHTML_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME, 78 | LHTML_ERR_UNEXPECTED_SOLIDUS_IN_TAG 79 | } lhtml_parse_error_t; 80 | 81 | typedef enum { 82 | LHTML_TOKEN_ERROR, 83 | LHTML_TOKEN_UNPARSED, 84 | LHTML_TOKEN_CHARACTER, 85 | LHTML_TOKEN_COMMENT, 86 | LHTML_TOKEN_START_TAG, 87 | LHTML_TOKEN_END_TAG, 88 | LHTML_TOKEN_DOCTYPE, 89 | LHTML_TOKEN_EOF, 90 | LHTML_TOKEN_CDATA_START, 91 | LHTML_TOKEN_CDATA_END 92 | } lhtml_token_type_t; 93 | 94 | typedef struct { 95 | lhtml_string_t value; 96 | } lhtml_token_comment_t; 97 | 98 | 
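// a single parsed attribute: name / value slices pointing into the tokenizer buffer, plus the raw source slice (reused verbatim by the serializer when the attribute wasn't modified)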
typedef struct { 99 | lhtml_string_t name; 100 | lhtml_string_t value; 101 | 102 | lhtml_opt_string_t raw; 103 | } lhtml_attribute_t; 104 | 105 | typedef LHTML_BUFFER_T(lhtml_attribute_t) lhtml_attr_buffer_t; 106 | typedef LHTML_LIST_T(lhtml_attr_buffer_t) lhtml_attributes_t; 107 | 108 | typedef struct { 109 | lhtml_string_t name; 110 | lhtml_tag_type_t type; 111 | lhtml_attributes_t attributes; 112 | bool self_closing; 113 | } lhtml_token_starttag_t; 114 | 115 | typedef struct { 116 | lhtml_string_t name; 117 | lhtml_tag_type_t type; 118 | } lhtml_token_endtag_t; 119 | 120 | typedef struct { 121 | lhtml_opt_string_t name; 122 | lhtml_opt_string_t public_id; 123 | lhtml_opt_string_t system_id; 124 | bool force_quirks; 125 | } lhtml_token_doctype_t; 126 | 127 | typedef struct { 128 | lhtml_token_type_t type; 129 | union { 130 | lhtml_token_comment_t comment; 131 | lhtml_token_starttag_t start_tag; 132 | lhtml_token_endtag_t end_tag; 133 | lhtml_token_doctype_t doctype; 134 | }; 135 | lhtml_opt_string_t raw; 136 | uint64_t parse_errors; 137 | } lhtml_token_t; 138 | 139 | #define LHTML_TOKEN_CALLBACK_T(NAME, T) void (*NAME)(lhtml_token_t *token, T *extra) 140 | 141 | typedef __attribute__((nonnull(1))) LHTML_TOKEN_CALLBACK_T(lhtml_token_callback_t, void); 142 | 143 | typedef struct lhtml_token_handler_s lhtml_token_handler_t; 144 | 145 | struct lhtml_token_handler_s { 146 | lhtml_token_callback_t callback; 147 | lhtml_token_handler_t *next; 148 | }; 149 | 150 | ///
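/// Tokenizer state: the handler chain, the preallocated character / attribute buffers and the current Ragel machine state (cs) all live in this struct.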
151 | typedef struct { 152 | lhtml_token_handler_t base_handler; // needs to be the first one 153 | 154 | bool allow_cdata; 155 | bool unsafe_null; 156 | bool entities; 157 | char quote; 158 | int cs; 159 | lhtml_tag_type_t last_start_tag_type; 160 | lhtml_char_buffer_t buffer; 161 | lhtml_attr_buffer_t attr_buffer; 162 | 163 | uint64_t special_end_tag_type; 164 | lhtml_token_t token; 165 | const char *slice_start; 166 | const char *mark; 167 | char *buffer_pos; 168 | bool current_attr_is_unique; 169 | } lhtml_tokenizer_t; 170 | 171 | __attribute__((nonnull)) 172 | void lhtml_init(lhtml_tokenizer_t *state); 173 | 174 | __attribute__((nonnull)) 175 | void lhtml_append_handlers(lhtml_token_handler_t *dest, lhtml_token_handler_t *src); 176 | 177 | __attribute__((nonnull)) 178 | void lhtml_emit(lhtml_token_t *token, void *extra); 179 | 180 | __attribute__((warn_unused_result, nonnull(1))) 181 | bool lhtml_feed(lhtml_tokenizer_t *state, const lhtml_string_t *chunk); 182 | 183 | __attribute__((pure, warn_unused_result)) 184 | bool lhtml_str_nocase_equals(const lhtml_string_t actual, const lhtml_string_t expected); 185 | 186 | __attribute__((pure, warn_unused_result)) 187 | lhtml_tag_type_t lhtml_get_tag_type(const lhtml_string_t name); 188 | 189 | __attribute__((nonnull, pure, warn_unused_result)) 190 | lhtml_attribute_t *lhtml_find_attr(lhtml_attributes_t *attrs, const lhtml_string_t name); 191 | 192 | __attribute__((nonnull, warn_unused_result)) 193 | lhtml_attribute_t *lhtml_create_attr(lhtml_attributes_t *attrs); 194 | 195 | #define LHTML_STRING(str) ((lhtml_string_t) { .data = str, .length = sizeof(str) - 1 }) 196 | 197 | #define LHTML_STR_EQUALS(actual, expected) ({\ 198 | lhtml_string_t _actual = (actual);\ 199 | lhtml_string_t _expected = LHTML_STRING(expected);\ 200 | _actual.length == _expected.length && memcmp(_actual.data, _expected.data, _expected.length) == 0;\ 201 | }) 202 | 203 | #define LHTML_STR_NOCASE_EQUALS(actual, expected) lhtml_str_nocase_equals(actual, LHTML_STRING(expected)) 204 | 205 | #define LHTML_FIND_ATTR(attrs, name) lhtml_find_attr(attrs, LHTML_STRING(name)) 206 | 207 | #define LHTML_INIT_HANDLER(state, cb) {\ 208 | _Static_assert(offsetof(__typeof__(*(state)), handler) == 0, ".handler is the first item in the state");\ 209 | LHTML_TOKEN_CALLBACK_T(_cb, __typeof__(*(state))) = (cb);\ 210 | (state)->handler = (lhtml_token_handler_t) { .callback = (lhtml_token_callback_t) _cb };\ 211 | } 212 | 213 | #define LHTML_ADD_HANDLER(tokenizer, state, cb) {\ 214 | __typeof__((state)) _state = (state);\ 215 | LHTML_INIT_HANDLER(_state, (cb));\ 216 | lhtml_append_handlers(&(tokenizer)->base_handler, &_state->handler);\ 217 | } 218 | 219 | #endif 220 | -------------------------------------------------------------------------------- /c/tokenizer.rl: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "tokenizer.h" 4 | #include "field-names.h" 5 | // #include "$OUT/tokenizer-states.h" - included with command option to respect env var 6 | 7 | %%{ 8 | machine html; 9 | 10 | include 'actions.rl'; 11 | include 'parse_errors.rl'; 12 | include '../syntax/index.rl'; 13 | 14 | access state->; 15 | }%% 16 | 17 | #define GET_TOKEN(TYPE) (assert(token->type == LHTML_TOKEN_##TYPE), &token->LHTML_FIELD_NAME_##TYPE) 18 | #define TO_LOWER(c) (c | ((unsigned char) (c - 'A') < 26) << 5) // tolower that vectorizes 19 | 20 | #define CREATE_TOKEN(TYPE, VALUE) {\ 21 | token->type = LHTML_TOKEN_##TYPE;\ 22 | 
token->LHTML_FIELD_NAME_##TYPE = (__typeof__(token->LHTML_FIELD_NAME_##TYPE)) VALUE;\ 23 | } 24 | 25 | #define HELPER(...) __attribute__((always_inline, __VA_ARGS__)) inline static 26 | 27 | HELPER(nonnull) 28 | lhtml_string_t range_string(const char *begin, const char *end) { 29 | assert(end >= begin); 30 | return (lhtml_string_t) { 31 | .data = begin, 32 | .length = (size_t) (end - begin) 33 | }; 34 | } 35 | 36 | HELPER(nonnull) 37 | lhtml_opt_string_t opt_range_string(const char *begin, const char *end) { 38 | return (lhtml_opt_string_t) { 39 | .has_value = true, 40 | .value = range_string(begin, end) 41 | }; 42 | } 43 | 44 | HELPER(const, warn_unused_result) 45 | uint64_t tag_type_append_char(uint64_t *code, char c) { 46 | // protect against overflow 47 | if (*code >> (64 - 5)) { 48 | return *code = 0; 49 | } 50 | 51 | if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { 52 | return *code = (*code << 5) | (c & 31); 53 | } else { 54 | return *code = 0; 55 | } 56 | } 57 | 58 | __attribute__((always_inline)) 59 | inline lhtml_tag_type_t lhtml_get_tag_type(const lhtml_string_t name) { 60 | uint64_t code = 0; 61 | 62 | for (size_t i = 0; i < name.length; i++) { 63 | if (!tag_type_append_char(&code, name.data[i])) { 64 | break; 65 | } 66 | } 67 | 68 | return code; 69 | } 70 | 71 | HELPER(nonnull) 72 | void emit_token(lhtml_tokenizer_t *state, const char *end) { 73 | lhtml_token_t *token = &state->token; 74 | token->raw.value.length = (size_t) (end - token->raw.value.data); 75 | if (token->raw.value.length) { 76 | token->raw.has_value = true; 77 | lhtml_emit(token, &state->base_handler); 78 | token->parse_errors = 0; 79 | } 80 | token->type = LHTML_TOKEN_ERROR; 81 | token->raw.value.data = end; 82 | token->raw.value.length = 0; 83 | } 84 | 85 | HELPER(nonnull) 86 | bool already_errored(lhtml_tokenizer_t *state, lhtml_string_t unprocessed) { 87 | if (unprocessed.length > 0) { 88 | lhtml_token_t *token = &state->token; 89 | token->type = LHTML_TOKEN_ERROR; 90 | token->raw.value = unprocessed; 91 | token->raw.has_value = true; 92 | lhtml_emit(token, &state->base_handler); 93 | } 94 | return false; 95 | } 96 | 97 | HELPER(nonnull) 98 | bool emit_error(lhtml_tokenizer_t *state, lhtml_string_t unprocessed) { 99 | state->token.type = LHTML_TOKEN_ERROR; 100 | emit_token(state, state->buffer_pos); 101 | return already_errored(state, unprocessed); 102 | } 103 | 104 | HELPER(nonnull) 105 | void emit_slice(lhtml_tokenizer_t *state, const char *p) { 106 | assert(state->token.type == LHTML_TOKEN_CHARACTER); 107 | assert(state->slice_start == state->token.raw.value.data); 108 | const char *slice_end = state->mark != NULL ? 
state->mark : p; 109 | emit_token(state, slice_end); 110 | } 111 | 112 | HELPER(nonnull) 113 | void emit_eof(lhtml_tokenizer_t *state) { 114 | lhtml_token_t *token = &state->token; 115 | token->type = LHTML_TOKEN_EOF; 116 | token->raw.has_value = true; 117 | lhtml_emit(token, &state->base_handler); 118 | } 119 | 120 | HELPER(nonnull) 121 | void parse_error(lhtml_tokenizer_t *state, lhtml_parse_error_t err) { 122 | state->token.parse_errors |= 1ULL << err; 123 | } 124 | 125 | inline bool lhtml_has_parse_error(lhtml_token_t *token, lhtml_parse_error_t err) { 126 | return token->parse_errors & (1ULL << err); 127 | } 128 | 129 | void lhtml_emit(lhtml_token_t *token, void *extra) { 130 | lhtml_token_handler_t *handler = ((lhtml_token_handler_t *) extra)->next; 131 | if (handler != NULL) { 132 | handler->callback(token, handler); 133 | } 134 | } 135 | 136 | inline bool lhtml_str_nocase_equals(const lhtml_string_t actual, const lhtml_string_t expected) { 137 | size_t length = expected.length; 138 | 139 | if (actual.length != length) { 140 | return false; 141 | } 142 | 143 | for (size_t i = 0; i < length; i++) { 144 | if (TO_LOWER(actual.data[i]) != TO_LOWER(expected.data[i])) { 145 | return false; 146 | } 147 | } 148 | 149 | return true; 150 | } 151 | 152 | lhtml_attribute_t *lhtml_find_attr(lhtml_attributes_t *attrs, const lhtml_string_t name) { 153 | size_t count = attrs->length; 154 | lhtml_attribute_t *items = attrs->data; 155 | for (size_t i = 0; i < count; i++) { 156 | lhtml_attribute_t *attr = &items[i]; 157 | if (lhtml_str_nocase_equals(attr->name, name)) { 158 | return attr; 159 | } 160 | } 161 | return NULL; 162 | } 163 | 164 | HELPER(nonnull) 165 | bool can_create_attr(lhtml_attributes_t *attrs) { 166 | return attrs->length < attrs->capacity; 167 | } 168 | 169 | inline lhtml_attribute_t *lhtml_create_attr(lhtml_attributes_t *attrs) { 170 | return can_create_attr(attrs) ? &attrs->data[attrs->length++] : NULL; 171 | } 172 | 173 | void lhtml_init(lhtml_tokenizer_t *state) { 174 | %%write init nocs; 175 | 176 | if (state->cs == 0) { 177 | state->cs = html_en_Data; 178 | } 179 | 180 | state->buffer_pos = state->buffer.data; 181 | } 182 | 183 | void lhtml_append_handlers(lhtml_token_handler_t *dest, lhtml_token_handler_t *src) { 184 | while (dest->next != NULL) { 185 | dest = dest->next; 186 | } 187 | dest->next = src; 188 | } 189 | 190 | bool lhtml_feed(lhtml_tokenizer_t *state, const lhtml_string_t *chunk) { 191 | lhtml_token_t *const token = &state->token; 192 | 193 | if (token->type == LHTML_TOKEN_EOF) { 194 | // if already saw an EOF, ignore any further input 195 | return false; 196 | } 197 | 198 | if (state->cs == html_error) { 199 | if (chunk != NULL) { 200 | return already_errored(state, *chunk); 201 | } else { 202 | token->raw.value.length = 0; 203 | emit_eof(state); 204 | return false; 205 | } 206 | } 207 | 208 | lhtml_string_t unprocessed = chunk != NULL ? 
*chunk : LHTML_STRING(""); 209 | 210 | do { 211 | token->raw.value.data = state->buffer.data; 212 | 213 | size_t available_space = (size_t) (state->buffer.data + state->buffer.capacity - state->buffer_pos); 214 | 215 | if (unprocessed.length <= available_space) { 216 | available_space = unprocessed.length; 217 | } else if (available_space == 0) { 218 | state->cs = html_error; 219 | return emit_error(state, unprocessed); 220 | } 221 | 222 | const char *p = state->buffer_pos; 223 | 224 | if (available_space > 0) { 225 | memcpy(state->buffer_pos, unprocessed.data, available_space); 226 | state->buffer_pos += available_space; 227 | unprocessed.data += available_space; 228 | unprocessed.length -= available_space; 229 | } 230 | 231 | const char *const pe = state->buffer_pos; 232 | const char *const eof = chunk == NULL ? pe : NULL; 233 | 234 | #pragma GCC diagnostic push 235 | #pragma GCC diagnostic ignored "-Wimplicit-fallthrough" 236 | %%write exec; 237 | #pragma GCC diagnostic pop 238 | 239 | if (state->cs == html_error) { 240 | return emit_error(state, unprocessed); 241 | } 242 | 243 | if (chunk == NULL) { 244 | token->raw.value.length = (size_t) (pe - token->raw.value.data); 245 | emit_eof(state); 246 | return true; 247 | } 248 | 249 | if (token->type == LHTML_TOKEN_CHARACTER) { 250 | emit_slice(state, pe); 251 | token->type = LHTML_TOKEN_CHARACTER; 252 | state->slice_start = token->raw.value.data; 253 | } 254 | 255 | size_t shift = (size_t) (token->raw.value.data - state->buffer.data); 256 | 257 | if (shift != 0) { 258 | switch (token->type) { 259 | case LHTML_TOKEN_COMMENT: { 260 | token->comment.value.data -= shift; 261 | break; 262 | } 263 | 264 | case LHTML_TOKEN_DOCTYPE: { 265 | token->doctype.name.value.data -= shift; 266 | token->doctype.public_id.value.data -= shift; 267 | token->doctype.system_id.value.data -= shift; 268 | break; 269 | } 270 | 271 | case LHTML_TOKEN_END_TAG: { 272 | token->end_tag.name.data -= shift; 273 | break; 274 | } 275 | 276 | case LHTML_TOKEN_START_TAG: { 277 | token->start_tag.name.data -= shift; 278 | lhtml_attributes_t *attrs = &token->start_tag.attributes; 279 | for (size_t i = 0; i < attrs->length; i++) { 280 | lhtml_attribute_t *attr = &attrs->data[i]; 281 | attr->name.data -= shift; 282 | attr->value.data -= shift; 283 | attr->raw.value.data -= shift; 284 | } 285 | break; 286 | } 287 | 288 | default: { 289 | break; 290 | } 291 | } 292 | 293 | memmove(state->buffer.data, token->raw.value.data, (size_t) (state->buffer_pos - token->raw.value.data)); 294 | state->buffer_pos -= shift; 295 | state->slice_start -= shift; 296 | 297 | if (state->mark != NULL) { 298 | state->mark -= shift; 299 | } 300 | } 301 | } while (unprocessed.length > 0); 302 | 303 | return true; 304 | } 305 | -------------------------------------------------------------------------------- /cfsetup.yaml: -------------------------------------------------------------------------------- 1 | everything: &everything 2 | build: 3 | builddeps: 4 | - build-essential 5 | - ragel 6 | post-cache: 7 | - make -C c lib 8 | test: 9 | builddeps: 10 | - ragel 11 | - rust 12 | - clang 13 | post-cache: 14 | - cd rust 15 | - cargo test 16 | bamboo-test: 17 | builddeps: 18 | - ragel 19 | - rust 20 | - clang 21 | - python 22 | post-cache: 23 | - cd rust 24 | - cargo test --no-run # print compilation failures if any 25 | - RUST_TEST_THREADS=1 cargo test -q --test test -- --logfile tests.log 2>failures.log; ../convert-test-log.py 26 | squeeze: *everything 27 | jessie: *everything 28 | stretch: *everything 29 | 
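The pieces above come together as follows. This is a minimal editorial sketch, not a file from the repository: the handler struct, buffer sizes, and the tag counting are illustrative assumptions, while the calls themselves (lhtml_init, lhtml_feed, LHTML_ADD_HANDLER, lhtml_emit) are the ones declared in c/tokenizer.h. Note how a NULL chunk finalizes the stream, mirroring the chunk == NULL branches of lhtml_feed above.

```c
#include <stdio.h>
#include "tokenizer.h"

// Handler state: the lhtml_token_handler_t member must be named `handler`
// and come first, as enforced by the _Static_assert in LHTML_INIT_HANDLER.
typedef struct {
    lhtml_token_handler_t handler;
    size_t start_tags;
} my_counter_t;

static void count_token(lhtml_token_t *token, my_counter_t *state) {
    if (token->type == LHTML_TOKEN_START_TAG) {
        state->start_tags++;
    }
    lhtml_emit(token, state); // forward the token to the next handler, if any
}

int main(void) {
    // Backing storage for the tokenizer; sizes are arbitrary for this sketch.
    static char char_buf[2048];
    static lhtml_attribute_t attr_buf[256];

    lhtml_tokenizer_t tokenizer = {
        .buffer = { .data = char_buf, .capacity = sizeof(char_buf) },
        .attr_buffer = { .data = attr_buf, .capacity = 256 },
    };
    lhtml_init(&tokenizer);

    my_counter_t counter = { .start_tags = 0 };
    LHTML_ADD_HANDLER(&tokenizer, &counter, count_token);

    // Input may arrive in arbitrarily sized chunks...
    lhtml_string_t chunk = LHTML_STRING("<div class=x>hello</div>");
    if (!lhtml_feed(&tokenizer, &chunk)) {
        return 1; // e.g. a token didn't fit into the character buffer
    }
    // ...and a NULL chunk signals end of input (emits the EOF token).
    if (!lhtml_feed(&tokenizer, NULL)) {
        return 1;
    }

    printf("start tags: %zu\n", counter.start_tags);
    return 0;
}
```

Because the token raw data lives in the tokenizer-owned buffer, an undersized buffer makes lhtml_feed return false once a single token no longer fits, which is why the return value is marked warn_unused_result.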
--------------------------------------------------------------------------------
/convert-test-log.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import xml.etree.cElementTree as ET
3 | import re
4 | 
5 | failure_re = re.compile(r"""thread '(.*?)' panicked at '(.*?)', tests/test\.rs:\d+:\d+
6 | (?:note: Run with `RUST_BACKTRACE=1` for a backtrace.
7 | )?""", re.DOTALL)
8 | 
9 | tests = open('tests.log', 'r').read().rstrip().split('\n')
10 | failures = open('failures.log', 'r').read()
11 | failures_pos = 0
12 | 
13 | root = ET.Element('testsuite', name='html5lib-tests', tests=str(len(tests)))
14 | 
15 | for test in tests:
16 |     (status, name) = test.split(' ', 1)
17 |     case = ET.SubElement(root, 'testcase', name=name)
18 |     if status == 'failed':
19 |         match = failure_re.match(failures, failures_pos)
20 |         assert match is not None, "Could not parse %r" % failures[failures_pos:].split('\n', 1)[0]
21 |         failure_name, details = match.groups()
22 |         assert name == failure_name, "Could not find failure message for %s" % name
23 |         ET.SubElement(case, 'failure').text = details
24 |         failures_pos = match.end()
25 |     elif status == 'ignored':
26 |         ET.SubElement(case, 'skipped')
27 |     else:
28 |         assert status == 'ok', 'Unknown test status: %s' % status
29 | 
30 | ET.ElementTree(root).write('tests.xml')
--------------------------------------------------------------------------------
/error-with-feedback-tests/trailing-solidus.test:
--------------------------------------------------------------------------------
1 | {
2 |   "tests": [
3 |     {
4 |       "description": "Non-void HTML element with trailing solidus",
5 |       "input": "<div/>",
6 |       "output": [["StartTag", "div", {}, true]],
7 |       "errors": [
8 |         {
9 |           "code": "non-void-html-element-start-tag-with-trailing-solidus",
10 |           "line": 1,
11 |           "col": 6
12 |         }
13 |       ]
14 |     },
15 |     {
16 |       "description":
17 |         "Non-void HTML element with trailing solidus in foreign content",
18 |       "input": "<svg><div/></svg>",
19 |       "output": [
20 |         ["StartTag", "svg", {}],
21 |         ["StartTag", "div", {}, true],
22 |         ["EndTag", "svg"]
23 |       ],
24 |       "errors": [
25 |         {
26 |           "code": "non-void-html-element-start-tag-with-trailing-solidus",
27 |           "line": 1,
28 |           "col": 11
29 |         }
30 |       ]
31 |     }
32 |   ]
33 | }
34 | 
--------------------------------------------------------------------------------
/images/language-specific-actions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudflare/lazyhtml/05b4b877400796ca3fbba3b9ec84005688200db3/images/language-specific-actions.png
--------------------------------------------------------------------------------
/images/perf-comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudflare/lazyhtml/05b4b877400796ca3fbba3b9ec84005688200db3/images/perf-comparison.png
--------------------------------------------------------------------------------
/images/ragel-visualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudflare/lazyhtml/05b4b877400796ca3fbba3b9ec84005688200db3/images/ragel-visualization.png
--------------------------------------------------------------------------------
/images/syntax-description.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudflare/lazyhtml/05b4b877400796ca3fbba3b9ec84005688200db3/images/syntax-description.png
--------------------------------------------------------------------------------
/images/syntax-files.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudflare/lazyhtml/05b4b877400796ca3fbba3b9ec84005688200db3/images/syntax-files.png
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "lazyhtml-scripts",
3 |   "private": true,
4 |   "version": "1.0.0",
5 |   "description": "Helper scripts for lazyhtml",
6 |   "author": "Ingvar Stepanyan <me@rreverser.com> (https://rreverser.com/)",
7 |   "dependencies": {
8 |     "graphlib-dot": "^0.6.2"
9 |   }
10 | }
--------------------------------------------------------------------------------
/rust/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 | 
3 | [*]
4 | indent_style = space
5 | indent_size = 4
6 | end_of_line = lf
7 | charset = utf-8
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
--------------------------------------------------------------------------------
/rust/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Cargo
2 | # will have compiled files and executables
3 | /target/
4 | 
5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
6 | # More information here http://doc.crates.io/guide.html#cargotoml-vs-cargolock
7 | Cargo.lock
8 | 
9 | # These are backup files generated by rustfmt
10 | **/*.rs.bk
--------------------------------------------------------------------------------
/rust/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | authors = ["Ingvar Stepanyan <me@rreverser.com>"]
3 | name = "lazyhtml"
4 | version = "0.1.0"
5 | publish = false
6 | autotests = false
7 | 
8 | [[bench]]
9 | 
harness = false
10 | name = "bench"
11 | 
12 | [[test]]
13 | harness = false
14 | name = "test"
15 | 
16 | [dependencies.lazyhtml-sys]
17 | path = "lazyhtml-sys"
18 | version = "0.1.0"
19 | 
20 | [dev-dependencies]
21 | getopts = "0.2.15"
22 | glob = "0.3.0"
23 | html5ever = "0.23.0"
24 | serde = { version = "1.0.19", features = ["derive"] }
25 | serde_json = "1.0.5"
26 | rustc-test = "0.3.0"
27 | 
28 | [workspace]
29 | 
--------------------------------------------------------------------------------
/rust/benches/bench.rs:
--------------------------------------------------------------------------------
1 | extern crate glob;
2 | extern crate html5ever;
3 | extern crate lazyhtml;
4 | extern crate rustc_test as test;
5 | 
6 | use lazyhtml::*;
7 | use test::black_box;
8 | use std::ptr::null_mut;
9 | use test::Bencher;
10 | use std::os::raw::c_void;
11 | use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer,
12 |                            TokenizerOpts, TokenizerResult};
13 | use html5ever::tendril::StrTendril;
14 | use test::{test_main, ShouldPanic, TDynBenchFn, TestDesc, TestDescAndFn, TestFn, TestName};
15 | use std::fs::File;
16 | use std::io::Read;
17 | 
18 | unsafe extern "C" fn handle_token(token: *mut lhtml_token_t, _state: *mut c_void) {
19 |     black_box(*token);
20 | }
21 | 
22 | const CHUNK_SIZE: usize = 1024;
23 | 
24 | fn string_chunks(mut s: &str) -> Vec<String> {
25 |     let mut result = Vec::with_capacity((s.len() / CHUNK_SIZE) + 1);
26 | 
27 |     while !s.is_empty() {
28 |         let mut offset = CHUNK_SIZE;
29 | 
30 |         if offset < s.len() {
31 |             while !s.is_char_boundary(offset) {
32 |                 offset += 1;
33 |             }
34 |         } else {
35 |             offset = s.len();
36 |         }
37 | 
38 |         let (before, after) = s.split_at(offset);
39 | 
40 |         result.push(before.to_owned());
41 | 
42 |         s = after;
43 |     }
44 | 
45 |     result
46 | }
47 | 
48 | fn bench_lhtml_tokenizer(chunks: &[String]) {
49 |     let mut bench_handler = lhtml_token_handler_t {
50 |         callback: Some(handle_token),
51 |         next: null_mut(),
52 |     };
53 | 
54 |     let mut tokenizer = lazyhtml::Tokenizer::new(100 << 10, 256);
55 | 
56 |     bench_handler.inject_into(&mut tokenizer);
57 | 
58 |     for chunk in chunks {
59 |         tokenizer.feed(chunk).expect("Could not feed input chunk");
60 |     }
61 | 
62 |     tokenizer.end().expect("Could not finalize input");
63 | }
64 | 
65 | struct Sink;
66 | 
67 | impl TokenSink for Sink {
68 |     type Handle = ();
69 | 
70 |     fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
71 |         black_box(token);
72 |         TokenSinkResult::Continue
73 |     }
74 | }
75 | 
76 | fn bench_html5ever_tokenizer(chunks: &[String]) {
77 |     let mut tokenizer = Tokenizer::new(Sink, TokenizerOpts::default());
78 | 
79 |     let mut queue = BufferQueue::new();
80 | 
81 |     for chunk in chunks {
82 |         queue.push_back(StrTendril::from_slice(chunk));
83 | 
84 |         while let TokenizerResult::Script(_) = tokenizer.feed(&mut queue) {
85 |             // ignore script markers
86 |         }
87 |     }
88 | 
89 |     tokenizer.end();
90 | }
91 | 
92 | struct Bench {
93 |     func: fn(&[String]),
94 |     chunks: Vec<String>,
95 | }
96 | 
97 | impl TDynBenchFn for Bench {
98 |     fn run(&self, b: &mut Bencher) {
99 |         b.iter(|| {
100 |             (self.func)(&self.chunks);
101 |         });
102 |     }
103 | }
104 | 
105 | fn main() {
106 |     let args: Vec<_> = ::std::env::args().collect();
107 | 
108 |     let fixtures: Vec<_> = glob::glob("../bench-fixtures/*.html")
109 |         .unwrap()
110 |         .map(|path| path.unwrap())
111 |         .collect();
112 | 
113 |     let funcs: [(&str, fn(&[String])); 2] = [
114 |         ("bench_lhtml_tokenizer", bench_lhtml_tokenizer),
115 |         ("bench_html5ever_tokenizer",
bench_html5ever_tokenizer), 116 | ]; 117 | 118 | let mut tests = Vec::with_capacity(fixtures.len() * funcs.len()); 119 | 120 | for path in fixtures { 121 | let mut input = String::new(); 122 | File::open(&path) 123 | .unwrap() 124 | .read_to_string(&mut input) 125 | .unwrap(); 126 | 127 | let input_name = path.file_name().unwrap().to_str().unwrap(); 128 | 129 | let chunks = string_chunks(&input); 130 | 131 | for &(func_name, func) in &funcs { 132 | tests.push(TestDescAndFn { 133 | desc: TestDesc { 134 | name: TestName::DynTestName(format!("{} x {}", func_name, input_name)), 135 | ignore: false, 136 | should_panic: ShouldPanic::No, 137 | allow_fail: false, 138 | }, 139 | testfn: TestFn::DynBenchFn(Box::new(Bench { 140 | func, 141 | chunks: chunks.clone(), 142 | })), 143 | }); 144 | } 145 | } 146 | 147 | test_main(&args, tests); 148 | } 149 | -------------------------------------------------------------------------------- /rust/examples/trace.rs: -------------------------------------------------------------------------------- 1 | extern crate getopts; 2 | extern crate lazyhtml; 3 | 4 | use std::ptr::null_mut; 5 | use lazyhtml::*; 6 | use std::os::raw::c_void; 7 | use getopts::Options; 8 | use std::env::args; 9 | 10 | struct HandlerState { 11 | handler: lhtml_token_handler_t, 12 | } 13 | 14 | impl HandlerState { 15 | pub fn new() -> Self { 16 | HandlerState { 17 | handler: lhtml_token_handler_t { 18 | callback: Some(Self::callback), 19 | next: null_mut(), 20 | }, 21 | } 22 | } 23 | 24 | unsafe extern "C" fn callback(token: *mut lhtml_token_t, extra: *mut c_void) { 25 | println!("{:#?}", *token); 26 | lhtml_emit(token, extra); 27 | } 28 | } 29 | 30 | fn main() { 31 | let mut opts = Options::new(); 32 | 33 | opts.optflag("f", "feedback", "Enable parser feedback"); 34 | opts.optopt( 35 | "s", 36 | "state", 37 | "Initial state", 38 | "-s (Data|PlainText|RCData|RawText|ScriptData|CDataSection)", 39 | ); 40 | opts.optflag("h", "help", "Show this help"); 41 | 42 | let matches = match opts.parse(args().skip(1)) { 43 | Ok(matches) => if matches.free.is_empty() { 44 | eprintln!("Missing HTML input"); 45 | None 46 | } else if matches.opt_present("h") { 47 | None 48 | } else { 49 | Some(matches) 50 | }, 51 | Err(e) => { 52 | eprintln!("{}", e); 53 | None 54 | } 55 | }; 56 | 57 | let matches = match matches { 58 | Some(m) => m, 59 | None => { 60 | eprintln!("{}", opts.usage("Usage: trace [options] INPUT")); 61 | return; 62 | } 63 | }; 64 | 65 | let initial_state = match matches.opt_str("s").as_ref().map(|s| s.as_str()) { 66 | None | Some("Data") => html_en_Data, 67 | Some("PlainText") => html_en_PlainText, 68 | Some("RCData") => html_en_RCData, 69 | Some("RawText") => html_en_RawText, 70 | Some("ScriptData") => html_en_ScriptData, 71 | Some("CDataSection") => html_en_CDataSection, 72 | _ => { 73 | eprintln!("Unknown state, defaulting to Data"); 74 | html_en_Data 75 | } 76 | }; 77 | 78 | let with_feedback = matches.opt_present("f"); 79 | 80 | let input = matches.free.first().unwrap(); 81 | 82 | let mut test_state = HandlerState::new(); 83 | 84 | let mut feedback; 85 | 86 | let mut tokenizer = Tokenizer::new(2048, 256); 87 | 88 | unsafe { 89 | tokenizer.set_cs(initial_state); 90 | } 91 | 92 | if with_feedback { 93 | feedback = Feedback::new(64); 94 | feedback.inject_into(&mut tokenizer); 95 | } 96 | 97 | test_state.handler.inject_into(&mut tokenizer); 98 | 99 | tokenizer.feed(input).expect("Could not feed input"); 100 | tokenizer.end().expect("Could not finalize input"); 101 | } 102 | 
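The trace example above and the C API both hinge on the same chaining mechanism: lhtml_append_handlers links each handler onto the end of a singly linked list behind base_handler (the Rust inject_into methods are thin wrappers over this), and every callback decides whether to pass the token downstream with lhtml_emit. Below is a hedged C sketch of a rewriting stage built on that idea; the names are illustrative and not from the repository.

```c
#include "tokenizer.h"

typedef struct {
    lhtml_token_handler_t handler; // must be first, see LHTML_INIT_HANDLER
} strip_comments_t;

// Forward every token except comments; downstream handlers (for example a
// serializer) simply never see what this stage declines to emit.
static void strip_comments_cb(lhtml_token_t *token, strip_comments_t *state) {
    if (token->type != LHTML_TOKEN_COMMENT) {
        lhtml_emit(token, state);
    }
}

// Setup, inside whatever function owns the tokenizer:
//     strip_comments_t strip;
//     LHTML_ADD_HANDLER(&tokenizer, &strip, strip_comments_cb);
```

Since lhtml_append_handlers walks to the end of the list before attaching, handlers run in the order they were added: anything registered after this stage only receives the tokens it chose to forward.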
--------------------------------------------------------------------------------
/rust/lazyhtml-sys/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | authors = ["Ingvar Stepanyan <me@rreverser.com>"]
3 | name = "lazyhtml-sys"
4 | version = "0.1.0"
5 | 
6 | [build-dependencies]
7 | bindgen = "0.31.3"
8 | glob = "0.2.11"
--------------------------------------------------------------------------------
/rust/lazyhtml-sys/build.rs:
--------------------------------------------------------------------------------
1 | extern crate bindgen;
2 | extern crate glob;
3 | 
4 | use std::env;
5 | use std::path::PathBuf;
6 | use std::process::Command;
7 | use glob::glob;
8 | 
9 | const IMPLICIT_DEPS: &[&str] = &[
10 |     "../../c/tokenizer-states.rl",
11 |     "../../c/actions.rl",
12 |     "../../c/field-names.h",
13 |     "../../c/tag-types.h",
14 |     "../../c/tokenizer.*",
15 |     "../../c/parser-feedback.*",
16 |     "../../c/serializer.*",
17 |     "../../syntax/*.rl",
18 | ];
19 | 
20 | fn main() {
21 |     let out_dir = env::var("OUT_DIR").unwrap();
22 |     let out_path = PathBuf::from(&out_dir);
23 | 
24 |     assert!(
25 |         Command::new("make")
26 |             .current_dir("../../c")
27 |             .arg("lib")
28 |             .arg(format!("OUT_TARGET={}", out_dir))
29 |             .arg("CFLAGS=-fPIC")
30 |             .status()
31 |             .unwrap()
32 |             .success(),
33 |         "building LazyHTML failed"
34 |     );
35 | 
36 |     bindgen::builder()
37 |         .clang_arg("-U__clang__")
38 |         .header("wrapper.h")
39 |         .rust_target(bindgen::RustTarget::Stable_1_19)
40 |         .prepend_enum_name(false)
41 |         .whitelist_function("lhtml_.*")
42 |         .whitelist_type("lhtml_.*")
43 |         .whitelist_var("LHTML_.*|html_en_.*")
44 |         .constified_enum_module("lhtml_tag_type_t")
45 |         .rustified_enum("lhtml_token_type_t|lhtml_ns_t")
46 |         .derive_debug(false)
47 |         .generate()
48 |         .expect("Unable to generate bindings")
49 |         .write_to_file(out_path.join("bindings.rs"))
50 |         .expect("Unable to write bindings");
51 | 
52 |     println!("cargo:rustc-link-search=native={}", &out_dir);
53 |     println!("cargo:rustc-link-lib=static=lhtml");
54 | 
55 |     for dep in IMPLICIT_DEPS {
56 |         for entry in glob(dep).unwrap() {
57 |             println!("cargo:rerun-if-changed={}", entry.unwrap().display());
58 |         }
59 |     }
60 | }
--------------------------------------------------------------------------------
/rust/lazyhtml-sys/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![allow(non_upper_case_globals)]
2 | #![allow(non_camel_case_types)]
3 | #![allow(non_snake_case)]
4 | #![allow(unused)]
5 | 
6 | use std::fmt::{self, Debug, Formatter};
7 | use std::slice;
8 | use std::ops::{Deref, DerefMut};
9 | 
10 | include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
11 | 
12 | impl Deref for lhtml_string_t {
13 |     type Target = [u8];
14 | 
15 |     fn deref(&self) -> &[u8] {
16 |         if self.data.is_null() {
17 |             &[]
18 |         } else {
19 |             unsafe { slice::from_raw_parts(self.data as _, self.length) }
20 |         }
21 |     }
22 | }
23 | 
24 | impl Deref for lhtml_attributes_t {
25 |     type Target = [lhtml_attribute_t];
26 | 
27 |     fn deref(&self) -> &[lhtml_attribute_t] {
28 |         let data = unsafe { self.__bindgen_anon_1.buffer.data };
29 | 
30 |         if data.is_null() {
31 |             &[]
32 |         } else {
33 |             unsafe { slice::from_raw_parts(data, self.length) }
34 |         }
35 |     }
36 | }
37 | 
38 | impl DerefMut for lhtml_attributes_t {
39 |     fn deref_mut(&mut self) -> &mut [lhtml_attribute_t] {
40 |         let data = unsafe { self.__bindgen_anon_1.buffer.data };
41 | 
42 |         if data.is_null() {
43 |             &mut []
44 |         } else {
45 |             unsafe { slice::from_raw_parts_mut(data,
self.length) } 46 | } 47 | } 48 | } 49 | 50 | impl Debug for lhtml_string_t { 51 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 52 | let s = unsafe { ::std::str::from_utf8_unchecked(self) }; 53 | s.fmt(f) 54 | } 55 | } 56 | 57 | impl Debug for lhtml_opt_string_t { 58 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 59 | if self.has_value { 60 | self.value.fmt(f) 61 | } else { 62 | f.write_str("(none)") 63 | } 64 | } 65 | } 66 | 67 | impl Debug for lhtml_token_comment_t { 68 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 69 | f.debug_struct("lhtml_token_comment_t") 70 | .field("value", &self.value) 71 | .finish() 72 | } 73 | } 74 | 75 | impl Debug for lhtml_attribute_t { 76 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 77 | write!(f, "{:?}: {:?} (raw: {:?})", self.name, self.value, self.raw) 78 | } 79 | } 80 | 81 | impl Debug for lhtml_attributes_t { 82 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 83 | f.debug_list().entries(self.iter()).finish() 84 | } 85 | } 86 | 87 | impl Debug for lhtml_token_starttag_t { 88 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 89 | f.debug_struct("lhtml_token_starttag_t") 90 | .field("name", &self.name) 91 | .field("attributes", &self.attributes) 92 | .field("self_closing", &self.self_closing) 93 | .finish() 94 | } 95 | } 96 | 97 | impl Debug for lhtml_token_endtag_t { 98 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 99 | f.debug_struct("lhtml_token_endtag_t") 100 | .field("name", &self.name) 101 | .finish() 102 | } 103 | } 104 | 105 | impl Debug for lhtml_token_doctype_t { 106 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 107 | f.debug_struct("lhtml_token_doctype_t") 108 | .field("name", &self.name) 109 | .field("public_id", &self.public_id) 110 | .field("system_id", &self.system_id) 111 | .field("force_quirks", &self.force_quirks) 112 | .finish() 113 | } 114 | } 115 | 116 | impl Debug for lhtml_token_t { 117 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 118 | use lhtml_token_type_t::*; 119 | 120 | let mut f = f.debug_struct("lhtml_token_t"); 121 | 122 | f.field("type", &self.type_); 123 | 124 | unsafe { 125 | match self.type_ { 126 | LHTML_TOKEN_COMMENT => { 127 | f.field("comment", &self.__bindgen_anon_1.comment); 128 | } 129 | LHTML_TOKEN_START_TAG => { 130 | f.field("start_tag", &self.__bindgen_anon_1.start_tag); 131 | } 132 | LHTML_TOKEN_END_TAG => { 133 | f.field("end_tag", &self.__bindgen_anon_1.end_tag); 134 | } 135 | LHTML_TOKEN_DOCTYPE => { 136 | f.field("doctype", &self.__bindgen_anon_1.doctype); 137 | } 138 | _ => {} 139 | } 140 | } 141 | 142 | f.field("raw", &self.raw); 143 | 144 | f.finish() 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /rust/lazyhtml-sys/wrapper.h: -------------------------------------------------------------------------------- 1 | #include "../../c/tokenizer.h" 2 | #include "../../c/parser-feedback.h" 3 | #include "../../c/serializer.h" 4 | #include "../../c/out/tokenizer-states.h" 5 | -------------------------------------------------------------------------------- /rust/src/feedback.rs: -------------------------------------------------------------------------------- 1 | pub use lazyhtml_sys::*; 2 | use std::mem::zeroed; 3 | use tokenizer::*; 4 | 5 | pub struct Feedback(lhtml_feedback_t); 6 | 7 | impl Feedback { 8 | pub fn new(ns_capacity: usize) -> Self { 9 | Feedback(lhtml_feedback_t { 10 | ns_stack: lhtml_ns_stack_t { 11 
| __bindgen_anon_1: lhtml_ns_stack_t__bindgen_ty_1 {
12 |                     buffer: lhtml_alloc_buffer!(lhtml_ns_buffer_t, ns_capacity),
13 |                 },
14 |                 length: 0,
15 |             },
16 |             ..unsafe { zeroed() }
17 |         })
18 |     }
19 | }
20 | 
21 | impl TokenHandler for Feedback {
22 |     fn inject_into(&mut self, tokenizer: &mut Tokenizer) {
23 |         unsafe {
24 |             lhtml_feedback_inject(tokenizer.get_state(), &mut self.0);
25 |         }
26 |     }
27 | }
28 | 
29 | impl Drop for Feedback {
30 |     fn drop(&mut self) {
31 |         unsafe {
32 |             lhtml_drop_buffer!(self.0.ns_stack.__bindgen_anon_1.buffer);
33 |         }
34 |     }
35 | }
--------------------------------------------------------------------------------
/rust/src/lib.rs:
--------------------------------------------------------------------------------
1 | extern crate lazyhtml_sys;
2 | 
3 | macro_rules! lhtml_alloc_buffer {
4 |     ($ty:ident, $capacity:expr) => {{
5 |         let mut vec = Vec::with_capacity($capacity);
6 |         let buf = $ty {
7 |             data: vec.as_mut_ptr(),
8 |             capacity: vec.capacity()
9 |         };
10 |         ::std::mem::forget(vec);
11 |         buf
12 |     }};
13 | }
14 | 
15 | macro_rules! lhtml_drop_buffer {
16 |     ($buf:expr) => {
17 |         let buf = $buf;
18 |         Box::from_raw(::std::slice::from_raw_parts_mut(
19 |             buf.data,
20 |             buf.capacity
21 |         ));
22 |     }
23 | }
24 | 
25 | mod tokenizer;
26 | mod feedback;
27 | mod serializer;
28 | 
29 | pub use tokenizer::*;
30 | pub use feedback::*;
31 | pub use serializer::*;
--------------------------------------------------------------------------------
/rust/src/serializer.rs:
--------------------------------------------------------------------------------
1 | pub use lazyhtml_sys::*;
2 | use std::mem::zeroed;
3 | use tokenizer::*;
4 | 
5 | #[repr(C)]
6 | pub struct Serializer<F: FnMut(&str)> {
7 |     state: lhtml_serializer_t,
8 |     callback: F,
9 | }
10 | 
11 | impl<F: FnMut(&str)> Serializer<F> {
12 |     pub fn new(callback: F) -> Self {
13 |         Serializer {
14 |             state: lhtml_serializer_t {
15 |                 handler: unsafe { zeroed() },
16 |                 writer: Some(Self::writer),
17 |             },
18 |             callback,
19 |         }
20 |     }
21 | 
22 |     unsafe extern "C" fn writer(s: lhtml_string_t, state: *mut lhtml_serializer_t) {
23 |         ((*(state as *mut Self)).callback)(::std::str::from_utf8_unchecked(&s))
24 |     }
25 | }
26 | 
27 | impl<F: FnMut(&str)> TokenHandler for Serializer<F> {
28 |     fn inject_into(&mut self, tokenizer: &mut Tokenizer) {
29 |         unsafe {
30 |             lhtml_serializer_inject(tokenizer.get_state(), &mut self.state);
31 |         }
32 |     }
33 | }
--------------------------------------------------------------------------------
/rust/src/tokenizer.rs:
--------------------------------------------------------------------------------
1 | pub use lazyhtml_sys::*;
2 | use std::mem::zeroed;
3 | use std::marker::PhantomData;
4 | 
5 | pub struct Tokenizer<'a> {
6 |     state: lhtml_tokenizer_t,
7 |     phantom: PhantomData<&'a ()>,
8 | }
9 | 
10 | impl<'a> Tokenizer<'a> {
11 |     pub fn new(char_capacity: usize, attr_capacity: usize) -> Self {
12 |         let mut state = lhtml_tokenizer_t {
13 |             buffer: lhtml_alloc_buffer!(lhtml_char_buffer_t, char_capacity),
14 |             attr_buffer: lhtml_alloc_buffer!(lhtml_attr_buffer_t, attr_capacity),
15 |             ..unsafe { zeroed() }
16 |         };
17 |         unsafe {
18 |             lhtml_init(&mut state);
19 |         }
20 |         Tokenizer {
21 |             state,
22 |             phantom: PhantomData,
23 |         }
24 |     }
25 | 
26 |     fn feed_opt(&mut self, input: *const lhtml_string_t) -> Result<(), ()> {
27 |         if unsafe { lhtml_feed(&mut self.state, input) } {
28 |             Ok(())
29 |         } else {
30 |             Err(())
31 |         }
32 |     }
33 | 
34 |     pub fn feed(&mut self, input: &str) -> Result<(), ()> {
35 |         self.feed_opt(&lhtml_string_t {
36 |             data: input.as_ptr() as _,
37 |             length:
input.len(),
38 |         })
39 |     }
40 | 
41 |     pub fn end(mut self) -> Result<(), ()> {
42 |         self.feed_opt(::std::ptr::null())
43 |     }
44 | 
45 |     pub unsafe fn set_cs(&mut self, cs: ::std::os::raw::c_int) {
46 |         self.state.cs = cs;
47 |     }
48 | 
49 |     pub unsafe fn set_last_start_tag(&mut self, last_start_tag: &str) {
50 |         self.state.last_start_tag_type = lhtml_get_tag_type(lhtml_string_t {
51 |             data: last_start_tag.as_ptr() as _,
52 |             length: last_start_tag.len(),
53 |         });
54 |     }
55 | 
56 |     pub unsafe fn get_state(&mut self) -> &mut lhtml_tokenizer_t {
57 |         &mut self.state
58 |     }
59 | }
60 | 
61 | impl<'a> Drop for Tokenizer<'a> {
62 |     fn drop(&mut self) {
63 |         unsafe {
64 |             let state = self.get_state();
65 |             lhtml_drop_buffer!(state.buffer);
66 |             lhtml_drop_buffer!(state.attr_buffer);
67 |         }
68 |     }
69 | }
70 | 
71 | pub trait TokenHandler {
72 |     fn inject_into<'a>(&'a mut self, tokenizer: &mut Tokenizer<'a>);
73 | }
74 | 
75 | impl TokenHandler for lhtml_token_handler_t {
76 |     fn inject_into(&mut self, tokenizer: &mut Tokenizer) {
77 |         unsafe {
78 |             lhtml_append_handlers(&mut tokenizer.get_state().base_handler, self);
79 |         }
80 |     }
81 | }
--------------------------------------------------------------------------------
/rust/tests/decoder.rs:
--------------------------------------------------------------------------------
1 | use html5ever::data::{C1_REPLACEMENTS, NAMED_ENTITIES};
2 | use std::char;
3 | use std::str::Chars;
4 | use std::iter::Peekable;
5 | 
6 | #[derive(PartialEq, Eq)]
7 | enum Entities {
8 |     None,
9 |     Text,
10 |     Attribute,
11 | }
12 | 
13 | pub struct Decoder<'a> {
14 |     chars: Peekable<Chars<'a>>,
15 |     result: String,
16 |     null: bool,
17 |     entities: Entities,
18 | }
19 | 
20 | impl<'a> Decoder<'a> {
21 |     fn next_if_char(&mut self, expected: char) -> bool {
22 |         self.next_if(|c| c == expected).is_some()
23 |     }
24 | 
25 |     fn next_if<F: Fn(char) -> bool>(&mut self, f: F) -> Option<char> {
26 |         self.next_opt(|c| if f(c) { Some(c) } else { None })
27 |     }
28 | 
29 |     fn next_opt<T, F: Fn(char) -> Option<T>>(&mut self, f: F) -> Option<T> {
30 |         let opt = self.chars.peek().cloned().and_then(f);
31 |         if opt.is_some() {
32 |             self.chars.next();
33 |         }
34 |         opt
35 |     }
36 | 
37 |     fn decode_numeric_entity(&mut self, radix: u32) -> bool {
38 |         if let Some(mut code) = self.next_opt(|c| c.to_digit(radix)) {
39 |             while let Some(digit) = self.next_opt(|c| c.to_digit(radix)) {
40 |                 if code < 0x10FFFF {
41 |                     code = code * radix + digit;
42 |                 }
43 |             }
44 |             self.result.push(
45 |                 match code {
46 |                     0x00 => None,
47 |                     0x80...0x9F => {
48 |                         C1_REPLACEMENTS[(code - 0x80) as usize].or_else(|| char::from_u32(code))
49 |                     }
50 |                     _ => char::from_u32(code),
51 |                 }.unwrap_or('\u{FFFD}'),
52 |             );
53 |             self.next_if_char(';');
54 |             true
55 |         } else {
56 |             self.result += "&#";
57 |             false
58 |         }
59 |     }
60 | 
61 |     fn decode_named_entity(&mut self) {
62 |         let mut name_buf = String::new();
63 |         let mut name_match = ('&' as u32, 0, 0);
64 |         while let Some(&c) = self.chars.peek() {
65 |             name_buf.push(c);
66 |             if let Some(&m) = NAMED_ENTITIES.get(&name_buf[..]) {
67 |                 self.chars.next();
68 |                 if m.0 != 0 {
69 |                     if c != ';' && self.entities == Entities::Attribute {
70 |                         if let Some(&c) = self.chars.peek() {
71 |                             match c {
72 |                                 'A'...'Z' | 'a'...'z' | '0'...'9' | '=' => {
73 |                                     continue;
74 |                                 }
75 |                                 _ => {}
76 |                             }
77 |                         }
78 |                     }
79 |                     name_match = (m.0, m.1, name_buf.len());
80 |                 }
81 |             } else {
82 |                 name_buf.pop();
83 |                 break;
84 |             }
85 |         }
86 |         self.result.push(char::from_u32(name_match.0).unwrap());
87 |         if name_match.1 != 0 {
88 |             self.result.push(char::from_u32(name_match.1).unwrap());
89 | 
}
90 |         self.result += &name_buf[name_match.2..];
91 |     }
92 | 
93 |     fn decode_entity(&mut self) {
94 |         if self.next_if_char('#') {
95 |             if let Some(x) = self.next_if(|c| c == 'x' || c == 'X') {
96 |                 if !self.decode_numeric_entity(16) {
97 |                     self.result.push(x);
98 |                 }
99 |             } else {
100 |                 self.decode_numeric_entity(10);
101 |             }
102 |         } else {
103 |             self.decode_named_entity();
104 |         }
105 |     }
106 | 
107 |     fn decode_cr(&mut self) {
108 |         self.result.push('\n');
109 |         self.next_if_char('\n');
110 |     }
111 | 
112 |     pub fn new(raw: &'a str) -> Self {
113 |         Decoder {
114 |             chars: raw.chars().peekable(),
115 |             result: String::with_capacity(raw.len()),
116 |             null: false,
117 |             entities: Entities::None,
118 |         }
119 |     }
120 | 
121 |     pub fn unsafe_null(mut self) -> Self {
122 |         self.null = true;
123 |         self
124 |     }
125 | 
126 |     pub fn text_entities(mut self) -> Self {
127 |         self.entities = Entities::Text;
128 |         self
129 |     }
130 | 
131 |     pub fn attr_entities(mut self) -> Self {
132 |         self.entities = Entities::Attribute;
133 |         self
134 |     }
135 | 
136 |     pub fn run(mut self) -> String {
137 |         while let Some(c) = self.chars.next() {
138 |             match c {
139 |                 '\r' => {
140 |                     self.decode_cr();
141 |                 }
142 |                 '\0' if self.null => {
143 |                     self.result.push('\u{FFFD}');
144 |                 }
145 |                 '&' if self.entities != Entities::None => {
146 |                     self.decode_entity();
147 |                 }
148 |                 _ => {
149 |                     self.result.push(c);
150 |                 }
151 |             }
152 |         }
153 | 
154 |         self.result
155 |     }
156 | }
--------------------------------------------------------------------------------
/rust/tests/feedback_tokens/mod.rs:
--------------------------------------------------------------------------------
1 | mod noop_tree_sink;
2 | mod token_sink_proxy;
3 | 
4 | use html5ever::tree_builder::{TreeBuilder, TreeBuilderOpts};
5 | use html5ever::tokenizer::{BufferQueue, Tokenizer, TokenizerOpts, TokenizerResult};
6 | use html5ever::tendril::StrTendril;
7 | use token::Token as MyToken;
8 | use self::noop_tree_sink::NoopTreeSink;
9 | use self::token_sink_proxy::TokenSinkProxy;
10 | 
11 | pub fn tokenize_with_tree_builder(input: &str) -> Vec<MyToken> {
12 |     let mut tokens = Vec::new();
13 |     let mut b = BufferQueue::new();
14 |     b.push_back(StrTendril::from(input));
15 |     {
16 |         let mut t = Tokenizer::new(
17 |             TokenSinkProxy {
18 |                 inner: TreeBuilder::new(NoopTreeSink::default(), TreeBuilderOpts::default()),
19 |                 tokens: &mut tokens,
20 |             },
21 |             TokenizerOpts::default(),
22 |         );
23 | 
24 |         while let TokenizerResult::Script(_) = t.feed(&mut b) {
25 |             // ignore script markers
26 |         }
27 | 
28 |         t.end();
29 |     }
30 |     tokens
31 | }
--------------------------------------------------------------------------------
/rust/tests/feedback_tokens/noop_tree_sink.rs:
--------------------------------------------------------------------------------
1 | // https://github.com/servo/html5ever/blob/master/html5ever/examples/noop-tree-builder.rs
2 | 
3 | use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
4 | use html5ever::{Attribute, ExpandedName, QualName};
5 | use html5ever::tendril::StrTendril;
6 | use std::borrow::Cow;
7 | 
8 | pub struct NoopTreeSink {
9 |     names: Vec<Option<QualName>>,
10 | }
11 | 
12 | impl Default for NoopTreeSink {
13 |     fn default() -> Self {
14 |         NoopTreeSink { names: Vec::new() }
15 |     }
16 | }
17 | 
18 | impl NoopTreeSink {
19 |     fn get_name(&self, id: &usize) -> Option<&QualName> {
20 |         self.names.get(*id).and_then(|opt_name| opt_name.as_ref())
21 |     }
22 | 
23 |     fn set_name(&mut self, name: Option<QualName>) -> usize {
24 |         let id = self.names.len();
25 |         self.names.push(name);
26 | 
id
27 |     }
28 | }
29 | 
30 | impl TreeSink for NoopTreeSink {
31 |     type Handle = usize;
32 |     type Output = Self;
33 | 
34 |     fn finish(self) -> Self {
35 |         self
36 |     }
37 | 
38 |     fn get_document(&mut self) -> usize {
39 |         0
40 |     }
41 | 
42 |     fn get_template_contents(&mut self, target: &usize) -> usize {
43 |         if let Some(expanded_name!(html "template")) = self.get_name(target).map(|n| n.expanded()) {
44 |             target + 1
45 |         } else {
46 |             panic!("not a template element")
47 |         }
48 |     }
49 | 
50 |     fn same_node(&self, x: &usize, y: &usize) -> bool {
51 |         x == y
52 |     }
53 | 
54 |     fn elem_name(&self, target: &usize) -> ExpandedName {
55 |         self.get_name(target).expect("not an element").expanded()
56 |     }
57 | 
58 |     fn create_element(&mut self, name: QualName, _: Vec<Attribute>, _: ElementFlags) -> usize {
59 |         self.set_name(Some(name))
60 |     }
61 | 
62 |     fn create_comment(&mut self, _text: StrTendril) -> usize {
63 |         self.set_name(None)
64 |     }
65 | 
66 |     #[allow(unused_variables)]
67 |     fn create_pi(&mut self, target: StrTendril, value: StrTendril) -> usize {
68 |         unimplemented!()
69 |     }
70 | 
71 |     fn append_before_sibling(&mut self, _sibling: &usize, _new_node: NodeOrText<usize>) {}
72 | 
73 |     fn parse_error(&mut self, _msg: Cow<'static, str>) {}
74 | 
75 |     fn set_quirks_mode(&mut self, _mode: QuirksMode) {}
76 | 
77 |     fn append(&mut self, _parent: &usize, _child: NodeOrText<usize>) {}
78 | 
79 |     fn append_doctype_to_document(&mut self, _: StrTendril, _: StrTendril, _: StrTendril) {}
80 | 
81 |     fn add_attrs_if_missing(&mut self, target: &usize, _attrs: Vec<Attribute>) {
82 |         self.get_name(target).expect("not an element");
83 |     }
84 | 
85 |     fn remove_from_parent(&mut self, _target: &usize) {}
86 | 
87 |     fn reparent_children(&mut self, _node: &usize, _new_parent: &usize) {}
88 | 
89 |     fn mark_script_already_started(&mut self, _node: &usize) {}
90 | 
91 |     fn append_based_on_parent_node(
92 |         &mut self,
93 |         _element: &usize,
94 |         _prev_element: &usize,
95 |         _new_node: NodeOrText<usize>,
96 |     ) {
97 |     }
98 | }
--------------------------------------------------------------------------------
/rust/tests/feedback_tokens/token_sink_proxy.rs:
--------------------------------------------------------------------------------
1 | use html5ever::tokenizer::{TagKind, Token, TokenSink, TokenSinkResult};
2 | use token::Token as MyToken;
3 | use std::collections::HashMap;
4 | use std::iter::FromIterator;
5 | 
6 | // sends tokens to a given sink, while at the same time converting and
7 | // recording them into the provided array
8 | pub struct TokenSinkProxy<'a, Sink> {
9 |     pub inner: Sink,
10 |     pub tokens: &'a mut Vec<MyToken>,
11 | }
12 | 
13 | impl<'a, Sink> TokenSinkProxy<'a, Sink> {
14 |     fn push_character_token(&mut self, s: &str) {
15 |         if let Some(&mut MyToken::Character(ref mut last)) = self.tokens.last_mut() {
16 |             *last += s;
17 |             return;
18 |         }
19 |         self.tokens.push(MyToken::Character(s.to_string()));
20 |     }
21 | }
22 | 
23 | impl<'a, Sink> TokenSink for TokenSinkProxy<'a, Sink>
24 | where
25 |     Sink: TokenSink,
26 | {
27 |     type Handle = Sink::Handle;
28 | 
29 |     fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
30 |         match token {
31 |             Token::DoctypeToken(ref doctype) => {
32 |                 self.tokens.push(MyToken::Doctype {
33 |                     name: doctype.name.as_ref().map(|s| s.to_string()),
34 |                     public_id: doctype.public_id.as_ref().map(|s| s.to_string()),
35 |                     system_id: doctype.system_id.as_ref().map(|s| s.to_string()),
36 |                     correctness: !doctype.force_quirks,
37 |                 });
38 |             }
39 |             Token::TagToken(ref tag) => {
40 |                 let name = tag.name.to_string();
41 | 
self.tokens.push(match tag.kind {
42 |                     TagKind::StartTag => MyToken::StartTag {
43 |                         name,
44 |                         attributes: HashMap::from_iter(
45 |                             tag.attrs
46 |                                 .iter()
47 |                                 .rev()
48 |                                 .map(|attr| (attr.name.local.to_string(), attr.value.to_string())),
49 |                         ),
50 |                         self_closing: tag.self_closing,
51 |                     },
52 |                     TagKind::EndTag => MyToken::EndTag {
53 |                         name: name.to_string(),
54 |                     },
55 |                 })
56 |             }
57 |             Token::CommentToken(ref s) => {
58 |                 self.tokens.push(MyToken::Comment(s.to_string()));
59 |             }
60 |             Token::CharacterTokens(ref s) => if !s.is_empty() {
61 |                 self.push_character_token(s);
62 |             },
63 |             Token::NullCharacterToken => {
64 |                 self.push_character_token("\0");
65 |             }
66 |             _ => {}
67 |         }
68 |         self.inner.process_token(token, line_number)
69 |     }
70 | 
71 |     fn end(&mut self) {
72 |         self.inner.end()
73 |     }
74 | 
75 |     fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool {
76 |         self.inner
77 |             .adjusted_current_node_present_but_not_in_html_namespace()
78 |     }
79 | }
--------------------------------------------------------------------------------
/rust/tests/html5lib.rs:
--------------------------------------------------------------------------------
1 | use serde_json;
2 | use glob;
3 | use std::io::{BufRead, BufReader};
4 | use std::fs::File;
5 | use unescape::Unescape;
6 | use lazyhtml;
7 | use token::{Token, TokenRange};
8 | use feedback_tokens::tokenize_with_tree_builder;
9 | use parse_errors::{ParseErrors, ERROR_CODES};
10 | 
11 | // Skip some errors in certain tests due to the limited functionality of the parser.
12 | const SKIP_ERRORS: &'static [(&'static str, &'static str)] = &[
13 |     ("Duplicate close tag attributes", "duplicate-attribute"), // We don't collect attributes on end tags
14 | ];
15 | 
16 | #[derive(Deserialize)]
17 | struct Suite {
18 |     #[serde(default)]
19 |     pub tests: Vec<Test>,
20 | }
21 | 
22 | macro_rules!
read_tests {
23 |     ($path: expr) => (
24 |         glob::glob(concat!(
25 |             env!("CARGO_MANIFEST_DIR"),
26 |             "/../",
27 |             $path
28 |         )).unwrap()
29 |             .map(|path| BufReader::new(File::open(path.unwrap()).unwrap()))
30 |     )
31 | }
32 | 
33 | #[derive(Clone, Copy, Deserialize, Debug)]
34 | #[repr(i32)]
35 | pub enum InitialState {
36 |     #[serde(rename = "Data state")]
37 |     Data = lazyhtml::html_en_Data,
38 |     #[serde(rename = "PLAINTEXT state")]
39 |     PlainText = lazyhtml::html_en_PlainText,
40 |     #[serde(rename = "RCDATA state")]
41 |     RCData = lazyhtml::html_en_RCData,
42 |     #[serde(rename = "RAWTEXT state")]
43 |     RawText = lazyhtml::html_en_RawText,
44 |     #[serde(rename = "Script data state")]
45 |     ScriptData = lazyhtml::html_en_ScriptData,
46 |     #[serde(rename = "CDATA section state")]
47 |     CDataSection = lazyhtml::html_en_CDataSection,
48 | }
49 | 
50 | fn default_initial_states() -> Vec<InitialState> {
51 |     vec![InitialState::Data]
52 | }
53 | 
54 | fn default_with_errors() -> bool {
55 |     true // ¯\_(ツ)_/¯
56 | }
57 | 
58 | #[derive(Deserialize)]
59 | pub struct ParseError {
60 |     pub code: String,
61 |     pub line: usize,
62 |     pub col: usize,
63 | }
64 | 
65 | #[derive(Deserialize)]
66 | #[serde(rename_all = "camelCase")]
67 | pub struct Test {
68 |     pub description: String,
69 |     pub input: String,
70 |     pub output: Vec<Token>,
71 | 
72 |     #[serde(skip)]
73 |     pub with_feedback: bool,
74 | 
75 |     #[serde(default = "default_with_errors")]
76 |     pub with_errors: bool,
77 | 
78 |     #[serde(default = "default_initial_states")]
79 |     pub initial_states: Vec<InitialState>,
80 | 
81 |     #[serde(default)]
82 |     pub double_escaped: bool,
83 | 
84 |     #[serde(default)]
85 |     pub last_start_tag: String,
86 | 
87 |     #[serde(default)]
88 |     errors: Vec<ParseError>,
89 | }
90 | 
91 | impl Test {
92 |     pub fn get_expected_parse_errors(
93 |         &self,
94 |         token_ranges: Vec<TokenRange>,
95 |     ) -> Result<ParseErrors, String> {
96 |         let mut expected_errors = ParseErrors::new();
97 | 
98 |         let errors = self.errors.iter().filter_map(|err| {
99 |             ERROR_CODES
100 |                 .iter()
101 |                 .filter(|&code| !SKIP_ERRORS.contains(&(self.description.as_str(), code)))
102 |                 .find(|&&code| code == err.code)
103 |                 .map(|&code| {
104 |                     let pos = self.input
105 |                         .split("\n")
106 |                         .take(err.line - 1)
107 |                         .fold(err.col - 1, |pos, s| pos + s.len());
108 | 
109 |                     // NOTE: use error code slice from the static array
110 |                     // to avoid specifying lifetimes on owning structures.
111 |                     (code, pos)
112 |                 })
113 |         });
114 | 
115 |         'outer: for (code, pos) in errors {
116 |             for &range in token_ranges.iter() {
117 |                 if range.contains(pos) {
118 |                     expected_errors.insert((range, code));
119 |                     continue 'outer;
120 |                 }
121 |             }
122 | 
123 |             return Err(format!(
124 |                 "The following error doesn't fit into any token range: {:?}",
125 |                 (code, pos)
126 |             ));
127 |         }
128 | 
129 |         Ok(expected_errors)
130 |     }
131 | }
132 | 
133 | impl Unescape for Test {
134 |     fn unescape(&mut self) -> Result<(), serde_json::error::Error> {
135 |         if self.double_escaped {
136 |             self.double_escaped = false;
137 |             self.input.unescape()?;
138 |             for token in &mut self.output {
139 |                 token.unescape()?;
140 |             }
141 |         }
142 |         Ok(())
143 |     }
144 | }
145 | 
146 | pub fn get_tests() -> Vec<Test> {
147 |     let mut tests = Vec::new();
148 |     for file in read_tests!("html5lib-tests/tokenizer/*.test") {
149 |         tests.extend(serde_json::from_reader::<_, Suite>(file).unwrap().tests);
150 |     }
151 |     for file in read_tests!("error-with-feedback-tests/*.test") {
152 |         tests.extend(
153 |             serde_json::from_reader::<_, Suite>(file)
154 |                 .unwrap()
155 |                 .tests
156 |                 .into_iter()
157 |                 .map(|mut test| {
158 |                     test.with_feedback = true;
159 |                     test
160 |                 }),
161 |         );
162 |     }
163 |     for file in read_tests!("html5lib-tests/tree-construction/*.dat") {
164 |         let mut inputs = Vec::new();
165 |         let mut in_data = 0;
166 |         for line in file.lines().map(|line| line.unwrap()) {
167 |             if line == "#data" {
168 |                 in_data = 1;
169 |             } else if line.starts_with('#') {
170 |                 in_data = 0;
171 |             } else if in_data > 0 {
172 |                 if in_data > 1 {
173 |                     let s: &mut String = inputs.last_mut().unwrap();
174 |                     s.push('\n');
175 |                     s.push_str(&line);
176 |                 } else {
177 |                     inputs.push(line);
178 |                 }
179 |                 in_data += 1;
180 |             }
181 |         }
182 |         tests.extend(inputs.into_iter().map(|input| {
183 |             Test {
184 |                 description: input
185 |                     .chars()
186 |                     .flat_map(|c| c.escape_default())
187 |                     .collect::<String>() + " (with feedback)",
188 |                 output: tokenize_with_tree_builder(&input),
189 |                 input,
190 |                 with_feedback: true,
191 |                 with_errors: false,
192 |                 initial_states: default_initial_states(),
193 |                 double_escaped: false,
194 |                 last_start_tag: String::new(),
195 |                 errors: Vec::default(),
196 |             }
197 |         }));
198 |     }
199 |     tests
200 | }
--------------------------------------------------------------------------------
/rust/tests/parse_errors.rs:
--------------------------------------------------------------------------------
1 | use token::TokenRange;
2 | use std::collections::HashSet;
3 | 
4 | pub const ERROR_CODES: &'static [&'static str] = &[
5 |     "abrupt-closing-of-empty-comment",
6 |     "abrupt-doctype-public-identifier",
7 |     "abrupt-doctype-system-identifier",
8 |     // "absence-of-digits-in-numeric-character-reference" (character references are not supported)
9 |     "cdata-in-html-content",
10 |     // "character-reference-outside-unicode-range" (character references are not supported)
11 |     // "control-character-in-input-stream" (has significant performance impact)
12 |     // "control-character-reference" (character references are not supported)
13 |     "end-tag-with-attributes",
14 |     "duplicate-attribute",
15 |     "end-tag-with-trailing-solidus",
16 |     "eof-before-tag-name",
17 |     "eof-in-cdata",
18 |     "eof-in-comment",
19 |     "eof-in-doctype",
20 |     "eof-in-script-html-comment-like-text",
21 |     "eof-in-tag",
22 |     "incorrectly-closed-comment",
23 |     "incorrectly-opened-comment",
24 |     "invalid-character-sequence-after-doctype-name",
25 |     "invalid-first-character-of-tag-name",
26 |     "missing-attribute-value",
27 | 
"missing-doctype-name", 28 | "missing-doctype-public-identifier", 29 | "missing-doctype-system-identifier", 30 | "missing-end-tag-name", 31 | "missing-quote-before-doctype-public-identifier", 32 | "missing-quote-before-doctype-system-identifier", 33 | "missing-whitespace-after-doctype-public-keyword", 34 | "missing-whitespace-after-doctype-system-keyword", 35 | "missing-whitespace-before-doctype-name", 36 | "missing-whitespace-between-attributes", 37 | "missing-whitespace-between-doctype-public-and-system-identifiers", 38 | "nested-comment", 39 | // "noncharacter-character-reference" (character references are not supported) 40 | // "noncharacter-in-input-stream" (requires UTF decoding, has significant performance impact) 41 | "non-void-html-element-start-tag-with-trailing-solidus", 42 | // "null-character-reference" (character references are not supported) 43 | // "surrogate-character-reference" (character references are not supported) 44 | // "surrogate-in-input-stream" (requires UTF decoding, has significant performance impact) 45 | "unexpected-character-after-doctype-system-identifier", 46 | "unexpected-character-in-attribute-name", 47 | "unexpected-character-in-unquoted-attribute-value", 48 | "unexpected-equals-sign-before-attribute-name", 49 | // "unexpected-null-character" (has significant performance impact) 50 | "unexpected-question-mark-instead-of-tag-name", 51 | "unexpected-solidus-in-tag", 52 | // "unknown-named-character-reference" (character references are not supported) 53 | ]; 54 | 55 | pub type ParseErrors = HashSet<(TokenRange, &'static str)>; 56 | -------------------------------------------------------------------------------- /rust/tests/test.rs: -------------------------------------------------------------------------------- 1 | extern crate lazyhtml; 2 | 3 | #[macro_use] 4 | extern crate serde; 5 | 6 | extern crate serde_json; 7 | 8 | #[macro_use] 9 | extern crate html5ever; 10 | 11 | // From 'rustc-test' crate. 12 | // Mirrors Rust's internal 'libtest'. 
13 | // https://doc.rust-lang.org/1.1.0/test/index.html
14 | extern crate rustc_test as test;
15 | 
16 | extern crate glob;
17 | 
18 | mod token;
19 | mod feedback_tokens;
20 | mod decoder;
21 | mod unescape;
22 | mod html5lib;
23 | mod parse_errors;
24 | 
25 | use std::collections::HashMap;
26 | use lazyhtml::*;
27 | use std::os::raw::c_void;
28 | use std::iter::FromIterator;
29 | use std::ptr::null_mut;
30 | use test::{test_main, ShouldPanic, TestDesc, TestDescAndFn, TestFn, TestName};
31 | use token::{Token, TokenRange};
32 | use decoder::Decoder;
33 | use unescape::Unescape;
34 | use html5lib::{get_tests, Test};
35 | use std::iter::IntoIterator;
36 | use parse_errors::{ParseErrors, ERROR_CODES};
37 | use lazyhtml::lhtml_token_type_t::{LHTML_TOKEN_CHARACTER, LHTML_TOKEN_EOF};
38 | 
39 | unsafe fn lhtml_to_raw_str(s: &lhtml_string_t) -> &str {
40 |     ::std::str::from_utf8_unchecked(s)
41 | }
42 | 
43 | unsafe fn lhtml_to_name(s: lhtml_string_t) -> String {
44 |     let mut s = Decoder::new(lhtml_to_raw_str(&s)).unsafe_null().run();
45 | 
46 |     s.make_ascii_lowercase();
47 | 
48 |     s
49 | }
50 | 
51 | struct HandlerState {
52 |     handler: lhtml_token_handler_t,
53 |     tokenizer: *const lhtml_tokenizer_t,
54 |     tokens: Vec<Token>,
55 |     raw_output: String,
56 |     saw_eof: bool,
57 |     parse_errors: ParseErrors,
58 |     token_ranges: Vec<TokenRange>,
59 | }
60 | 
61 | impl HandlerState {
62 |     pub fn new() -> Self {
63 |         HandlerState {
64 |             handler: lhtml_token_handler_t {
65 |                 callback: Some(HandlerState::callback),
66 |                 next: null_mut(),
67 |             },
68 |             tokenizer: ::std::ptr::null(),
69 |             tokens: Vec::new(),
70 |             raw_output: String::new(),
71 |             saw_eof: false,
72 |             parse_errors: ParseErrors::new(),
73 |             token_ranges: Vec::new(),
74 |         }
75 |     }
76 | 
77 |     fn get_extended_last_token_range(&mut self, new_end: usize) -> TokenRange {
78 |         let last_range = self.token_ranges.last_mut().unwrap();
79 | 
80 |         let extended_range = TokenRange {
81 |             start: last_range.start,
82 |             end: new_end,
83 |         };
84 | 
85 |         // NOTE: go through all errors and update their ranges
86 |         self.parse_errors =
87 |             ParseErrors::from_iter(self.parse_errors.iter().map(|&(token_range, code)| {
88 |                 if token_range == *last_range {
89 |                     (extended_range, code)
90 |                 } else {
91 |                     (token_range, code)
92 |                 }
93 |             }));
94 | 
95 |         *last_range = extended_range;
96 |         extended_range
97 |     }
98 | 
99 |     unsafe fn update_parse_errors(&mut self, token: *mut lhtml_token_t, token_len: usize) {
100 |         let errors_bit_flags = (*token).parse_errors;
101 |         let start = self.raw_output.len();
102 |         let mut end = start + token_len;
103 |         let mut is_consequent_chars = false;
104 |         let is_eof = (*token).type_ == LHTML_TOKEN_EOF;
105 | 
106 |         if is_eof {
107 |             end += 1;
108 |         }
109 | 
110 |         if let (LHTML_TOKEN_CHARACTER, Some(&Token::Character(_))) =
111 |             ((*token).type_, self.tokens.last())
112 |         {
113 |             is_consequent_chars = true;
114 |         }
115 | 
116 |         // NOTE: Consider we have an EOF in DOCTYPE at pos 15.
117 |         // We attach error to DOCTYPE, so actual error has range [0; 15).
118 |         // However, when we assign range to expected error it falls into EOF's
119 |         // range [15;16) and, thus, we have mismatch.
120 |         // Therefore, to workaround such cases, instead of adding separate range
121 |         // for EOF or consequent character token we just extend last available
122 |         // token range.
123 | let should_extend_last_range = 124 | (is_consequent_chars || is_eof) && self.token_ranges.len() > 0; 125 | 126 | let token_range = if should_extend_last_range { 127 | self.get_extended_last_token_range(end) 128 | } else { 129 | let token_range = TokenRange { start, end }; 130 | 131 | self.token_ranges.push(token_range); 132 | token_range 133 | }; 134 | 135 | ERROR_CODES.iter().enumerate().for_each(|(i, code)| { 136 | if errors_bit_flags & (1 << i) > 0 { 137 | self.parse_errors.insert((token_range, code)); 138 | } 139 | }); 140 | } 141 | 142 | unsafe extern "C" fn callback(token: *mut lhtml_token_t, extra: *mut c_void) { 143 | use lhtml_token_type_t::*; 144 | 145 | let state = &mut *(extra as *mut Self); 146 | let data = &mut (*token).__bindgen_anon_1; 147 | 148 | if let Some(&mut Token::Character(ref mut s)) = state.tokens.last_mut() { 149 | if (*token).type_ != LHTML_TOKEN_CHARACTER { 150 | *s = { 151 | let mut decoder = Decoder::new(s); 152 | 153 | if (*state.tokenizer).unsafe_null { 154 | decoder = decoder.unsafe_null(); 155 | } 156 | 157 | if (*state.tokenizer).entities { 158 | decoder = decoder.text_entities(); 159 | } 160 | 161 | decoder.run() 162 | }; 163 | } 164 | } 165 | 166 | assert!((*token).raw.has_value); 167 | 168 | let raw = lhtml_to_raw_str(&(*token).raw.value); 169 | 170 | state.update_parse_errors(token, raw.len()); 171 | 172 | let test_token = match (*token).type_ { 173 | LHTML_TOKEN_CDATA_START | LHTML_TOKEN_CDATA_END | LHTML_TOKEN_UNPARSED => None, 174 | LHTML_TOKEN_CHARACTER => { 175 | if let Some(&mut Token::Character(ref mut s)) = state.tokens.last_mut() { 176 | *s += raw; 177 | None 178 | } else { 179 | Some(Token::Character(raw.to_owned())) 180 | } 181 | } 182 | LHTML_TOKEN_COMMENT => { 183 | (*token).raw.has_value = false; 184 | 185 | Some(Token::Comment( 186 | Decoder::new(lhtml_to_raw_str(&data.comment.value)) 187 | .unsafe_null() 188 | .run(), 189 | )) 190 | } 191 | LHTML_TOKEN_START_TAG => { 192 | let start_tag = &mut data.start_tag; 193 | 194 | (*token).raw.has_value = false; 195 | 196 | assert_eq!(lhtml_get_tag_type(start_tag.name), start_tag.type_); 197 | 198 | Some(Token::StartTag { 199 | name: lhtml_to_name(start_tag.name), 200 | 201 | attributes: HashMap::from_iter(start_tag.attributes.iter_mut().rev().map( 202 | |attr| { 203 | attr.raw.has_value = false; 204 | 205 | ( 206 | lhtml_to_name(attr.name), 207 | Decoder::new(lhtml_to_raw_str(&attr.value)) 208 | .unsafe_null() 209 | .attr_entities() 210 | .run(), 211 | ) 212 | }, 213 | )), 214 | 215 | self_closing: start_tag.self_closing, 216 | }) 217 | } 218 | LHTML_TOKEN_END_TAG => { 219 | let end_tag = &data.end_tag; 220 | 221 | (*token).raw.has_value = false; 222 | 223 | assert_eq!(lhtml_get_tag_type(end_tag.name), end_tag.type_); 224 | 225 | Some(Token::EndTag { 226 | name: lhtml_to_name(end_tag.name), 227 | }) 228 | } 229 | LHTML_TOKEN_DOCTYPE => { 230 | let doctype = &data.doctype; 231 | 232 | (*token).raw.has_value = false; 233 | 234 | Some(Token::Doctype { 235 | name: if doctype.name.has_value { 236 | Some(lhtml_to_name(doctype.name.value)) 237 | } else { 238 | None 239 | }, 240 | public_id: if doctype.public_id.has_value { 241 | Some( 242 | Decoder::new(lhtml_to_raw_str(&doctype.public_id.value)) 243 | .unsafe_null() 244 | .run(), 245 | ) 246 | } else { 247 | None 248 | }, 249 | system_id: if doctype.system_id.has_value { 250 | Some( 251 | Decoder::new(lhtml_to_raw_str(&doctype.system_id.value)) 252 | .unsafe_null() 253 | .run(), 254 | ) 255 | } else { 256 | None 257 | }, 258 | correctness: 
!doctype.force_quirks, 259 | }) 260 | } 261 | LHTML_TOKEN_EOF if !state.saw_eof => { 262 | state.saw_eof = true; 263 | None 264 | } 265 | _ => { 266 | panic!("Unexpected token type"); 267 | } 268 | }; 269 | 270 | if let Some(test_token) = test_token { 271 | state.tokens.push(test_token); 272 | } 273 | 274 | state.raw_output += raw; 275 | 276 | lhtml_emit(token, extra); 277 | } 278 | } 279 | 280 | impl TokenHandler for HandlerState { 281 | fn inject_into<'a>(&'a mut self, tokenizer: &mut Tokenizer<'a>) { 282 | self.tokenizer = unsafe { tokenizer.get_state() }; 283 | self.handler.inject_into(tokenizer); 284 | } 285 | } 286 | 287 | impl Test { 288 | pub unsafe fn run(&self) { 289 | for &cs in &self.initial_states { 290 | let mut output = String::new(); 291 | 292 | for pass in 0..2 { 293 | let is_serializer_test = pass == 1; 294 | let mut serializer; 295 | let mut test_state = HandlerState::new(); 296 | let mut feedback; 297 | 298 | let input = { 299 | let mut tokenizer = Tokenizer::new(2048, 256); 300 | tokenizer.set_cs(cs as _); 301 | tokenizer.set_last_start_tag(&self.last_start_tag); 302 | 303 | if self.with_feedback { 304 | feedback = Feedback::new(64); 305 | feedback.inject_into(&mut tokenizer); 306 | } 307 | 308 | test_state.inject_into(&mut tokenizer); 309 | 310 | let input = if !is_serializer_test { 311 | serializer = Serializer::new(|chunk| { 312 | output += chunk; 313 | }); 314 | serializer.inject_into(&mut tokenizer); 315 | &self.input 316 | } else { 317 | &output 318 | }; 319 | 320 | tokenizer.feed(input).expect("Could not feed input"); 321 | tokenizer.end().expect("Could not finalize input"); 322 | 323 | input 324 | }; 325 | 326 | assert_eq!(&test_state.raw_output, input); 327 | 328 | if !is_serializer_test && self.with_errors { 329 | let expected_errors = self.get_expected_parse_errors(test_state.token_ranges) 330 | .unwrap(); 331 | 332 | assert_eq!( 333 | test_state.parse_errors, expected_errors, 334 | "Parse error mismatch:\n\ 335 | actual: {:?}\n\ 336 | expected: {:?}\n", 337 | test_state.parse_errors, expected_errors 338 | ); 339 | } 340 | 341 | assert!( 342 | test_state.tokens == self.output, 343 | "Token mismatch\n\ 344 | state: {:?}\n\ 345 | original input: {:?}\n\ 346 | input: {:?}\n\ 347 | actual: {:#?}\n\ 348 | expected: {:#?}", 349 | cs, 350 | if is_serializer_test { 351 | Some(&self.input) 352 | } else { 353 | None 354 | }, 355 | input, 356 | test_state.tokens, 357 | self.output 358 | ); 359 | } 360 | } 361 | } 362 | } 363 | 364 | fn main() { 365 | let args: Vec<_> = ::std::env::args().collect(); 366 | 367 | let tests = get_tests() 368 | .into_iter() 369 | .map(|mut test| { 370 | let ignore = test.unescape().is_err(); 371 | 372 | TestDescAndFn { 373 | desc: TestDesc { 374 | name: TestName::DynTestName(test.description.to_owned()), 375 | ignore, 376 | should_panic: ShouldPanic::No, 377 | allow_fail: false, 378 | }, 379 | testfn: TestFn::DynTestFn(Box::new(move || unsafe { 380 | test.run(); 381 | })), 382 | } 383 | }) 384 | .collect(); 385 | 386 | test_main(&args, tests); 387 | } 388 | -------------------------------------------------------------------------------- /rust/tests/token.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use serde::de::{Deserialize, Deserializer, Error as DeError}; 3 | use std::fmt::{self, Formatter}; 4 | use std::iter::FromIterator; 5 | 6 | #[derive(Clone, Copy, Deserialize)] 7 | enum TokenKind { 8 | Character, 9 | Comment, 10 | StartTag, 11 | EndTag, 12 | 
#[serde(rename = "DOCTYPE")] 13 | Doctype, 14 | } 15 | 16 | // NOTE: we use a custom range type for tokens because std::ops::Range 17 | // is an iterator and therefore doesn't implement the Copy trait. Also, its 18 | // contains() method is currently available only on nightly. 19 | #[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)] 20 | pub struct TokenRange { 21 | pub start: usize, 22 | pub end: usize, 23 | } 24 | 25 | impl TokenRange { 26 | pub fn contains(&self, val: usize) -> bool { 27 | (self.start <= val) && (val < self.end) 28 | } 29 | } 30 | 31 | #[derive(Debug, PartialEq, Eq)] 32 | pub enum Token { 33 | Character(String), 34 | 35 | Comment(String), 36 | 37 | StartTag { 38 | name: String, 39 | attributes: HashMap<String, String>, 40 | self_closing: bool, 41 | }, 42 | 43 | EndTag { 44 | name: String, 45 | }, 46 | 47 | Doctype { 48 | name: Option<String>, 49 | public_id: Option<String>, 50 | system_id: Option<String>, 51 | correctness: bool, 52 | }, 53 | } 54 | 55 | impl<'de> Deserialize<'de> for Token { 56 | fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> 57 | where 58 | D: Deserializer<'de>, 59 | { 60 | struct Visitor; 61 | 62 | impl<'de> ::serde::de::Visitor<'de> for Visitor { 63 | type Value = Token; 64 | 65 | fn expecting(&self, f: &mut Formatter) -> fmt::Result { 66 | f.write_str("['TokenKind', ...]") 67 | } 68 | 69 | fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error> 70 | where 71 | A: ::serde::de::SeqAccess<'de>, 72 | { 73 | let mut actual_length = 0; 74 | 75 | macro_rules! next { 76 | ($error_msg: expr) => (match seq.next_element()? { 77 | Some(value) => { 78 | #[allow(unused_assignments)] { 79 | actual_length += 1; 80 | } 81 | 82 | value 83 | }, 84 | None => return Err(DeError::invalid_length( 85 | actual_length, 86 | &$error_msg 87 | )) 88 | }) 89 | } 90 | 91 | let kind = next!("2 or more"); 92 | 93 | Ok(match kind { 94 | TokenKind::Character => Token::Character(next!("2")), 95 | TokenKind::Comment => Token::Comment(next!("2")), 96 | TokenKind::StartTag => Token::StartTag { 97 | name: { 98 | let mut value: String = next!("3 or 4"); 99 | value.make_ascii_lowercase(); 100 | value 101 | }, 102 | attributes: { 103 | let value: HashMap<String, String> = next!("3 or 4"); 104 | HashMap::from_iter(value.into_iter().map(|(mut k, v)| { 105 | k.make_ascii_lowercase(); 106 | (k, v) 107 | })) 108 | }, 109 | self_closing: seq.next_element()?.unwrap_or(false), 110 | }, 111 | TokenKind::EndTag => Token::EndTag { 112 | name: { 113 | let mut value: String = next!("2"); 114 | value.make_ascii_lowercase(); 115 | value 116 | }, 117 | }, 118 | TokenKind::Doctype => Token::Doctype { 119 | name: { 120 | let mut value: Option<String> = next!("5"); 121 | if let Some(ref mut value) = value { 122 | value.make_ascii_lowercase(); 123 | } 124 | value 125 | }, 126 | public_id: next!("5"), 127 | system_id: next!("5"), 128 | correctness: next!("5"), 129 | }, 130 | }) 131 | } 132 | } 133 | 134 | deserializer.deserialize_seq(Visitor) 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /rust/tests/unescape.rs: -------------------------------------------------------------------------------- 1 | use token::Token; 2 | use serde_json::error::Error; 3 | use serde_json::de::from_str as parse_json; 4 | 5 | pub trait Unescape { 6 | fn unescape(&mut self) -> Result<(), Error>; 7 | } 8 | 9 | impl Unescape for String { 10 | // dummy but does the job: wrap the value in quotes and let the JSON parser resolve the escapes 11 | fn unescape(&mut self) -> Result<(), Error> { 12 | *self = parse_json(&format!(r#""{}""#, self))?; 13 | Ok(()) 14 | } 15 | } 16 | 17 | impl<T: Unescape> Unescape for Option<T> { 18 | fn unescape(&mut self)
-> Result<(), Error> { 19 | if let Some(ref mut inner) = *self { 20 | inner.unescape()?; 21 | } 22 | Ok(()) 23 | } 24 | } 25 | 26 | impl Unescape for Token { 27 | fn unescape(&mut self) -> Result<(), Error> { 28 | match *self { 29 | Token::Character(ref mut s) | Token::Comment(ref mut s) => { 30 | s.unescape()?; 31 | } 32 | 33 | Token::EndTag { ref mut name } => { 34 | name.unescape()?; 35 | } 36 | 37 | Token::StartTag { 38 | ref mut name, 39 | ref mut attributes, 40 | .. 41 | } => { 42 | name.unescape()?; 43 | for value in attributes.values_mut() { 44 | value.unescape()?; 45 | } 46 | } 47 | 48 | Token::Doctype { 49 | ref mut name, 50 | ref mut public_id, 51 | ref mut system_id, 52 | .. 53 | } => { 54 | name.unescape()?; 55 | public_id.unescape()?; 56 | system_id.unescape()?; 57 | } 58 | } 59 | Ok(()) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /simplify-graph.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var fs = require('fs'); 4 | var graphlib = require('graphlib-dot'); 5 | 6 | var g = graphlib.read(fs.readFileSync(process.argv[2], 'utf-8')); 7 | // Merge parallel edges between each pair of states, grouping their labels, so the rendered graph stays readable. 8 | g.nodes().forEach(v => { 9 | var out = g.outEdges(v); 10 | var outW = new Set(out.map(edge => edge.w)); 11 | outW.forEach(w => { 12 | var labels = out.reduce((map, edge) => { 13 | if (edge.w === w) { 14 | var match = g.edge(v, w, edge.name).label.match(/^(.*?)((?:\(.*?\))?(?: \/ \w+(?:, \w+)*)?)$/); // split the label into its input-symbol part and its optional "(...)" / action suffix 15 | var strings = map.get(match[2] || ''); 16 | if (!strings) { 17 | map.set(match[2] || '', strings = []); 18 | } 19 | strings.push(match[1]); 20 | g.removeEdge(v, w, edge.name); 21 | } 22 | return map; 23 | }, new Map()); 24 | var label = Array.from(labels, ([ action, strings ]) => strings.join(' | ') + action).join('\n'); 25 | g.setEdge(v, w, { label }); 26 | }); 27 | }); 28 | 29 | fs.writeFileSync(process.argv[2], graphlib.write(g)); 30 | -------------------------------------------------------------------------------- /syntax/_helpers.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | TAB = '\t'; 5 | CR = '\r'; 6 | LF = '\n'; 7 | FF = '\f'; 8 | 9 | TagNameSpace = TAB | CR | LF | FF | ' '; 10 | 11 | TagNameEnd = TagNameSpace | '/' | '>'; 12 | 13 | _Quote = ('"' | "'"); 14 | 15 | _StartQuote = _Quote @SaveQuote; 16 | 17 | _EndQuote = _Quote when IsMatchingQuote; 18 | 19 | _UnsafeText = (any+ >CreateCharacter >UnsafeNull >StartSlice %EmitSlice)?; 20 | 21 | _EndTagEnd = ( 22 | TagNameSpace | 23 | '/' | 24 | '>' 25 | ) @Reconsume @To_EndTagNameContents; 26 | 27 | _SpecialEndTag = ( 28 | '/' >StartAppropriateEndTag 29 | (alpha when FeedAppropriateEndTag)* 30 | _EndTagEnd when IsAppropriateEndTagFed >CreateEndTagToken >SetAppropriateEndTagName 31 | ) @err(CreateCharacter) @err(EmitSlice) @err(Reconsume); 32 | }%% 33 | -------------------------------------------------------------------------------- /syntax/_navigation.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | action Reconsume { fhold; } 5 | 6 | action Next_Data { fnext Data; } 7 | 8 | action To_TagOpen { fgoto TagOpen; } 9 | action To_Data { fgoto Data; } 10 | action To_RCDataLessThanSign { fgoto RCDataLessThanSign; } 11 | action To_RCData { fgoto RCData; } 12 | action To_RawTextLessThanSign { fgoto RawTextLessThanSign; } 13 | action To_RawText { fgoto RawText; } 14 | action To_ScriptDataLessThanSign { fgoto ScriptDataLessThanSign; } 15 | 
action To_ScriptData { fgoto ScriptData; } 16 | action To_MarkupDeclarationOpen { fgoto MarkupDeclarationOpen; } 17 | action To_EndTagOpen { fgoto EndTagOpen; } 18 | action To_StartTagName { fgoto StartTagName; } 19 | action To_EndTagName { fgoto EndTagName; } 20 | action To_EndTagNameContents { fgoto EndTagNameContents; } 21 | action To_BogusComment { fgoto BogusComment; } 22 | action To_BeforeAttributeName { fgoto BeforeAttributeName; } 23 | action To_SelfClosingTag { fgoto SelfClosingTag; } 24 | action To_ScriptDataEscapedDashDash { fgoto ScriptDataEscapedDashDash; } 25 | action To_ScriptDataEscapedDash { fgoto ScriptDataEscapedDash; } 26 | action To_ScriptDataEscapedLessThanSign { fgoto ScriptDataEscapedLessThanSign; } 27 | action To_ScriptDataEscaped { fgoto ScriptDataEscaped; } 28 | action To_ScriptDataDoubleEscaped { fgoto ScriptDataDoubleEscaped; } 29 | action To_ScriptDataDoubleEscapedDash { fgoto ScriptDataDoubleEscapedDash; } 30 | action To_ScriptDataDoubleEscapedLessThanSign { fgoto ScriptDataDoubleEscapedLessThanSign; } 31 | action To_ScriptDataDoubleEscapedDashDash { fgoto ScriptDataDoubleEscapedDashDash; } 32 | action To_AttributeName { fgoto AttributeName; } 33 | action To_AfterAttributeName { fgoto AfterAttributeName; } 34 | action To_BeforeAttributeValue { fgoto BeforeAttributeValue; } 35 | action To_AttributeValueQuoted { fgoto AttributeValueQuoted; } 36 | action To_AfterAttributeValueQuoted { fgoto AfterAttributeValueQuoted; } 37 | action To_AttributeValueUnquoted { fgoto AttributeValueUnquoted; } 38 | action To_DocType { fgoto DocType; } 39 | action To_CDataSection { fgoto CDataSection; } 40 | action To_Comment { fgoto Comment; } 41 | action To_BeforeDocTypeName { fgoto BeforeDocTypeName; } 42 | action To_DocTypeName { fgoto DocTypeName; } 43 | action To_AfterDocTypeName { fgoto AfterDocTypeName; } 44 | action To_AfterDocTypePublicKeyword { fgoto AfterDocTypePublicKeyword; } 45 | action To_BeforeDocTypePublicIdentifier { fgoto BeforeDocTypePublicIdentifier; } 46 | action To_DocTypePublicIdentifierQuoted { fgoto DocTypePublicIdentifierQuoted; } 47 | action To_BogusDocType { fgoto BogusDocType; } 48 | action To_AfterDocTypePublicIdentifier { fgoto AfterDocTypePublicIdentifier; } 49 | action To_BetweenDocTypePublicAndSystemIdentifiers { fgoto BetweenDocTypePublicAndSystemIdentifiers; } 50 | action To_DocTypeSystemIdentifierQuoted { fgoto DocTypeSystemIdentifierQuoted; } 51 | action To_AfterDocTypeSystemKeyword { fgoto AfterDocTypeSystemKeyword; } 52 | action To_BeforeDocTypeSystemIdentifier { fgoto BeforeDocTypeSystemIdentifier; } 53 | action To_AfterDocTypeSystemIdentifier { fgoto AfterDocTypeSystemIdentifier; } 54 | }%% 55 | -------------------------------------------------------------------------------- /syntax/cdata.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | CDataSection := ( 5 | start: any* :> ( 6 | ']' @MarkPosition -> cdata_end 7 | ), 8 | 9 | cdata_end: ( 10 | ']' >1 -> cdata_end_right_bracket | 11 | any >0 @UnmarkPosition -> start 12 | ) @eof(UnmarkPosition), 13 | 14 | cdata_end_right_bracket: ']'* $AdvanceMarkedPosition <: ( 15 | '>' >1 -> final | 16 | any >0 @UnmarkPosition -> start 17 | ) @eof(UnmarkPosition) 18 | ) >CreateCharacter >StartSlice @EmitSlice @CreateCDataEnd @EmitToken @UnmarkPosition @eof(Err_EofInCData) <>eof(EmitSlice) @To_Data; 19 | }%% 20 | -------------------------------------------------------------------------------- /syntax/comment.rl: 
-------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | Comment := ( 5 | start: ( 6 | ( 7 | '-' -> comment_start_dash | 8 | '<' -> comment_less_than_sign | 9 | '>' @Err_AbruptClosingOfEmptyComment -> final 10 | ) >1 >MarkPosition | 11 | any >0 -> text_slice 12 | ), 13 | 14 | comment_start_dash: ( 15 | ( 16 | '-' -> comment_end | 17 | '>' @Err_AbruptClosingOfEmptyComment -> final 18 | ) >1 | 19 | any >0 -> text_slice 20 | ), 21 | 22 | text_slice: any* :> ( 23 | '<' @MarkPosition -> comment_less_than_sign | 24 | '-' @MarkPosition -> comment_end_dash 25 | ) @eof(MarkPosition), 26 | 27 | comment_less_than_sign: '<'* $AdvanceMarkedPosition <: ( 28 | '!' >1 -> comment_less_than_sign_bang | 29 | any >0 @Reconsume -> text_slice 30 | ) @eof(MarkPosition), 31 | 32 | comment_less_than_sign_bang: ( 33 | '-' >1 -> comment_less_than_sign_bang_dash | 34 | any >0 @Reconsume -> text_slice 35 | ), 36 | 37 | comment_less_than_sign_bang_dash: ( 38 | '-' >1 -> comment_less_than_sign_bang_dash_dash | 39 | any >0 @Reconsume -> text_slice 40 | ), 41 | 42 | comment_less_than_sign_bang_dash_dash: ( 43 | ( 44 | '>' -> final | 45 | '!' -> comment_end_bang 46 | ) >1 | 47 | any >0 @Err_NestedComment -> text_slice 48 | ), 49 | 50 | comment_end_dash: ( 51 | '-' >1 -> comment_end | 52 | any >0 -> text_slice 53 | ), 54 | 55 | comment_end: '-'* $AdvanceMarkedPosition <: ( 56 | ( 57 | '>' -> final | 58 | '!' -> comment_end_bang 59 | ) >1 | 60 | any >0 -> text_slice 61 | ), 62 | 63 | comment_end_bang: ( 64 | ( 65 | '-' @MarkPosition -> comment_end_dash | 66 | '>' @Err_IncorrectlyClosedComment -> final 67 | ) >1 | 68 | any >0 -> text_slice 69 | ) 70 | ) >StartSlice >eof(StartSlice) >eof(MarkPosition) @EndComment @EmitToken @UnmarkPosition @To_Data @eof(Err_EofInComment) @eof(EndComment) @eof(EmitToken); 71 | }%% 72 | -------------------------------------------------------------------------------- /syntax/data.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | Data := (any+ >CreateCharacter >AllowEntities >StartSlice %EmitSlice)? :> ( 5 | '<' @StartSlice @To_TagOpen 6 | )?; 7 | 8 | TagOpen := ( 9 | ( 10 | '!' @To_MarkupDeclarationOpen | 11 | '/' @To_EndTagOpen | 12 | alpha @CreateStartTagToken @StartSlice @To_StartTagName | 13 | '?'
@Err_UnexpectedQuestionMarkInsteadOfTagName @StartSlice @Reconsume @To_BogusComment 14 | ) >1 | 15 | any >0 @Err_InvalidFirstCharacterOfTagName @CreateCharacter @EmitSlice @Reconsume @To_Data 16 | ) @eof(Err_EofBeforeTagName) @eof(CreateCharacter) @eof(EmitSlice); 17 | 18 | BogusComment := any* %MarkPosition %EndComment %EmitToken %UnmarkPosition :> ('>' @To_Data)?; 19 | 20 | MarkupDeclarationOpen := ( 21 | '--' @To_Comment | 22 | /DOCTYPE/i @To_DocType | 23 | ('[CDATA' ( 24 | '[' when IsCDataAllowed @CreateCDataStart @EmitToken @To_CDataSection | 25 | '[' @Err_CDataInHtmlContent @To_BogusComment 26 | )) 27 | ) >StartSlice >err(StartSlice) $err(Err_IncorrectlyOpenedComment) $err(Reconsume) $err(To_BogusComment); 28 | }%% 29 | -------------------------------------------------------------------------------- /syntax/doctype.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | DocType := ( 5 | TagNameSpace >1 @To_BeforeDocTypeName | 6 | '>' >1 @Err_MissingDoctypeName @SetForceQuirksFlag @EmitToken @To_Data | 7 | any >0 @Err_MissingWhitespaceBeforeDoctypeName @StartSlice @To_DocTypeName 8 | ) >CreateDocType >eof(CreateDocType) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 9 | 10 | BeforeDocTypeName := TagNameSpace* <: ( 11 | '>' >1 @Err_MissingDoctypeName @SetForceQuirksFlag @EmitToken @To_Data | 12 | any >0 @StartSlice @To_DocTypeName 13 | ) >eof(CreateDocType) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 14 | 15 | DocTypeName := any* %SetDocTypeName %eof(SetDocTypeName) :> ( 16 | TagNameSpace | 17 | '>' 18 | ) @Reconsume @To_AfterDocTypeName @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 19 | 20 | AfterDocTypeName := TagNameSpace* $eof(Err_EofInDoctype) $eof(SetForceQuirksFlag) $eof(Reconsume) $eof(To_BogusDocType) ( 21 | ( 22 | '>' @EmitToken @To_Data | 23 | /PUBLIC/i @To_AfterDocTypePublicKeyword | 24 | /SYSTEM/i @To_AfterDocTypeSystemKeyword 25 | ) @err(Err_InvalidCharacterSequenceAfterDoctypeName) 26 | )? 
$err(SetForceQuirksFlag) $err(Reconsume) $err(To_BogusDocType); 27 | 28 | AfterDocTypePublicKeyword := ( 29 | TagNameSpace >1 @To_BeforeDocTypePublicIdentifier | 30 | _StartQuote >1 @Err_MissingSpaceAfterDoctypePublicKeyword @To_DocTypePublicIdentifierQuoted | 31 | any >0 @Reconsume @To_BeforeDocTypePublicIdentifier 32 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 33 | 34 | BeforeDocTypePublicIdentifier := TagNameSpace* <: ( 35 | _StartQuote >1 @To_DocTypePublicIdentifierQuoted | 36 | '>' >1 @Err_MissingDoctypePublicIdentifier @SetForceQuirksFlag @EmitToken @To_Data | 37 | any >0 @Err_MissingQuoteBeforeDoctypePublicIdentifier @SetForceQuirksFlag @Reconsume @To_BogusDocType 38 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 39 | 40 | DocTypePublicIdentifierQuoted := any* >StartSlice >eof(StartSlice) %SetDocTypePublicIdentifier %eof(SetDocTypePublicIdentifier) :> ( 41 | _EndQuote @To_AfterDocTypePublicIdentifier | 42 | '>' @Err_AbruptDoctypePublicIdentifier @SetForceQuirksFlag @EmitToken @To_Data 43 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 44 | 45 | AfterDocTypePublicIdentifier := ( 46 | ( 47 | TagNameSpace @To_BetweenDocTypePublicAndSystemIdentifiers | 48 | _StartQuote @Err_MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers @To_DocTypeSystemIdentifierQuoted | 49 | '>' @EmitToken @To_Data 50 | ) >1 | 51 | any >0 @Err_MissingQuoteBeforeDoctypeSystemIdentifier @SetForceQuirksFlag @Reconsume @To_BogusDocType 52 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 53 | 54 | BetweenDocTypePublicAndSystemIdentifiers := TagNameSpace* <: ( 55 | ( 56 | _StartQuote @To_DocTypeSystemIdentifierQuoted | 57 | '>' @EmitToken @To_Data 58 | ) >1 | 59 | any >0 @Err_MissingQuoteBeforeDoctypeSystemIdentifier @SetForceQuirksFlag @Reconsume @To_BogusDocType 60 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 61 | 62 | AfterDocTypeSystemKeyword := ( 63 | TagNameSpace >1 @To_BeforeDocTypeSystemIdentifier | 64 | _StartQuote >1 @Err_MissingSpaceAfterDoctypeSystemKeyword @To_DocTypeSystemIdentifierQuoted | 65 | any >0 @Reconsume @To_BeforeDocTypeSystemIdentifier 66 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 67 | 68 | BeforeDocTypeSystemIdentifier := TagNameSpace* <: ( 69 | _StartQuote >1 @To_DocTypeSystemIdentifierQuoted | 70 | '>' >1 @Err_MissingDoctypeSystemIdentifier @SetForceQuirksFlag @EmitToken @To_Data | 71 | any >0 @Err_MissingQuoteBeforeDoctypeSystemIdentifier @SetForceQuirksFlag @Reconsume @To_BogusDocType 72 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 73 | 74 | DocTypeSystemIdentifierQuoted := any* >StartSlice >eof(StartSlice) %SetDocTypeSystemIdentifier %eof(SetDocTypeSystemIdentifier) :> ( 75 | _EndQuote @To_AfterDocTypeSystemIdentifier | 76 | '>' @Err_AbruptDoctypeSystemIdentifier @SetForceQuirksFlag @EmitToken @To_Data 77 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 78 | 79 | AfterDocTypeSystemIdentifier := TagNameSpace* <: ( 80 | '>' >1 @EmitToken @To_Data | 81 | any >0 @Err_UnexpectedCharacterAfterDoctypeSystemIdentifier @Reconsume @To_BogusDocType 82 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 83 | 84 | BogusDocType := any* :> '>' @EmitToken @To_Data @eof(EmitToken); 85 | }%% 86 | -------------------------------------------------------------------------------- /syntax/endtag.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 
| EndTagName := (any* :> _EndTagEnd >SetEndTagName) @eof(Err_EofInTag); 5 | 6 | EndTagNameContents := ( 7 | start: (TagNameSpace)* <: ( 8 | '/' >1 -> solidus | 9 | '>' >1 @EmitToken @To_Data | 10 | any+ >0 @Err_EndTagWithAttributes :> ( 11 | '/' -> start | 12 | '>' @EmitToken @To_Data | 13 | '=' TagNameSpace* <: ( 14 | _StartQuote >1 any* :> _EndQuote -> start | 15 | '>' >1 @EmitToken @To_Data | 16 | any+ >0 :> ( 17 | TagNameSpace -> start | 18 | '>' @EmitToken @To_Data 19 | ) 20 | ) 21 | ) 22 | ), 23 | solidus: ( 24 | '>' @Err_EndTagWithTrailingSolidus @EmitToken @To_Data | 25 | any >0 @Reconsume -> start 26 | ) 27 | ) @eof(Err_EofInTag); 28 | 29 | EndTagOpen := ( 30 | ( 31 | alpha @CreateEndTagToken @StartSlice @To_EndTagName | 32 | '>' @Err_MissingEndTagName @CreateUnparsed @EmitToken @To_Data 33 | ) >1 | 34 | any >0 @Err_InvalidFirstCharacterOfTagName @StartSlice @Reconsume @To_BogusComment 35 | ) @eof(Err_EofBeforeTagName) @eof(CreateCharacter) @eof(EmitSlice); 36 | }%% 37 | -------------------------------------------------------------------------------- /syntax/index.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | include '_navigation.rl'; 5 | include '_helpers.rl'; 6 | 7 | include 'data.rl'; 8 | 9 | include 'starttag.rl'; 10 | include 'endtag.rl'; 11 | 12 | include 'comment.rl'; 13 | include 'doctype.rl'; 14 | include 'cdata.rl'; 15 | 16 | include 'scriptdata.rl'; 17 | include 'rcdata.rl'; 18 | include 'rawtext.rl'; 19 | include 'plaintext.rl'; 20 | }%% 21 | -------------------------------------------------------------------------------- /syntax/plaintext.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | PlainText := _UnsafeText; 5 | }%% 6 | -------------------------------------------------------------------------------- /syntax/rawtext.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | RawText := _UnsafeText :> ( 5 | '<' @StartSlice @To_RawTextLessThanSign 6 | )?; 7 | 8 | RawTextLessThanSign := _SpecialEndTag @err(To_RawText); 9 | }%% 10 | -------------------------------------------------------------------------------- /syntax/rcdata.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | RCData := (any+ >CreateCharacter >UnsafeNull >AllowEntities >StartSlice %EmitSlice)? 
:> ( 5 | '<' @StartSlice @To_RCDataLessThanSign 6 | )?; 7 | 8 | RCDataLessThanSign := _SpecialEndTag @err(To_RCData); 9 | }%% 10 | -------------------------------------------------------------------------------- /syntax/scriptdata.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | ScriptData := _UnsafeText :> ( 5 | '<' @StartSlice @To_ScriptDataLessThanSign 6 | )?; 7 | 8 | ScriptDataLessThanSign := ( 9 | _SpecialEndTag | 10 | '!--' >CreateCharacter >UnsafeNull @To_ScriptDataEscapedDashDash 11 | ) @err(EmitSlice) @err(Reconsume) @err(To_ScriptData); 12 | 13 | ScriptDataEscaped := _UnsafeText :> ( 14 | '-' @CreateCharacter @UnsafeNull @StartSlice @To_ScriptDataEscapedDash | 15 | '<' @StartSlice @To_ScriptDataEscapedLessThanSign 16 | ) @eof(Err_EofInScriptHtmlCommentLikeText); 17 | 18 | ScriptDataEscapedDash := ( 19 | ( 20 | '-' @To_ScriptDataEscapedDashDash | 21 | '<' @EmitSlice @StartSlice @To_ScriptDataEscapedLessThanSign 22 | ) >1 | 23 | any >0 @EmitSlice @Reconsume @To_ScriptDataEscaped 24 | ) @eof(Err_EofInScriptHtmlCommentLikeText) @eof(EmitSlice); 25 | 26 | ScriptDataEscapedDashDash := '-'* <: ( 27 | ( 28 | '<' @EmitSlice @StartSlice @To_ScriptDataEscapedLessThanSign | 29 | '>' @EmitSlice @Reconsume @To_ScriptData 30 | ) >1 | 31 | any >0 @EmitSlice @Reconsume @To_ScriptDataEscaped 32 | ) @eof(Err_EofInScriptHtmlCommentLikeText) @eof(EmitSlice); 33 | 34 | ScriptDataEscapedLessThanSign := ( 35 | _SpecialEndTag | 36 | (/script/i TagNameEnd) @CreateCharacter @UnsafeNull @To_ScriptDataDoubleEscaped 37 | ) @err(CreateCharacter) @err(EmitSlice) @err(Reconsume) @err(To_ScriptDataEscaped); 38 | 39 | ScriptDataDoubleEscaped := any* :> ( 40 | '-' @To_ScriptDataDoubleEscapedDash | 41 | '<' @To_ScriptDataDoubleEscapedLessThanSign 42 | ) @eof(Err_EofInScriptHtmlCommentLikeText) @eof(EmitSlice); 43 | 44 | ScriptDataDoubleEscapedDash := ( 45 | ( 46 | '-' @To_ScriptDataDoubleEscapedDashDash | 47 | '<' @To_ScriptDataDoubleEscapedLessThanSign 48 | ) >1 | 49 | any >0 @To_ScriptDataDoubleEscaped 50 | ) @eof(Err_EofInScriptHtmlCommentLikeText) @eof(EmitSlice); 51 | 52 | ScriptDataDoubleEscapedDashDash := '-'* <: ( 53 | ( 54 | '<' @To_ScriptDataDoubleEscapedLessThanSign | 55 | '>' @EmitSlice @Reconsume @To_ScriptData 56 | ) >1 | 57 | any >0 @To_ScriptDataDoubleEscaped 58 | ) @eof(Err_EofInScriptHtmlCommentLikeText) @eof(EmitSlice); 59 | 60 | ScriptDataDoubleEscapedLessThanSign := ( 61 | '/' /script/i TagNameEnd @EmitSlice @Reconsume @To_ScriptDataEscaped 62 | ) @err(Reconsume) @err(To_ScriptDataDoubleEscaped); 63 | }%% 64 | -------------------------------------------------------------------------------- /syntax/starttag.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | _StartTagEnd = ( 5 | TagNameSpace @To_BeforeAttributeName | 6 | '/' @To_SelfClosingTag | 7 | '>' @SetLastStartTagName @Next_Data @EmitToken 8 | ); 9 | 10 | _AttributeNameChars = ( 11 | ('"' | "'" | '<') >1 @Err_UnexpectedCharacterInAttributeName | 12 | any >0 13 | ); 14 | 15 | StartTagName := (any* %SetStartTagName :> _StartTagEnd) @eof(Err_EofInTag); 16 | 17 | BeforeAttributeName := TagNameSpace* <: ( 18 | ('/' | '>') >1 @Reconsume @To_AfterAttributeName | 19 | '=' >1 @Err_UnexpectedEqualsSignBeforeAttributeName when CanCreateAttribute @StartSlice @To_AttributeName | 20 | _AttributeNameChars >0 when CanCreateAttribute @StartSlice @To_AttributeName 21 | ) @eof(Err_EofInTag); 22 | 23 | AttributeName := 
_AttributeNameChars* %AppendAttribute :> ( 24 | TagNameEnd @Reconsume @To_AfterAttributeName | 25 | '=' @To_BeforeAttributeValue 26 | ) @eof(Err_EofInTag); 27 | 28 | AfterAttributeName := TagNameSpace* <: ( 29 | ( 30 | _StartTagEnd | 31 | '=' @To_BeforeAttributeValue 32 | ) >1 | 33 | _AttributeNameChars >0 when CanCreateAttribute @StartSlice @To_AttributeName 34 | ) @eof(Err_EofInTag); 35 | 36 | BeforeAttributeValue := TagNameSpace* <: ( 37 | _StartQuote >1 @To_AttributeValueQuoted | 38 | '>' >1 @Err_MissingAttributeValue @Reconsume @To_AttributeValueUnquoted | 39 | any >0 @Reconsume @To_AttributeValueUnquoted 40 | ) @eof(Err_EofInTag); 41 | 42 | _AttrValueCharsQuoted = (any* >StartSlice %SetAttributeValue)?; 43 | 44 | AttributeValueQuoted := (_AttrValueCharsQuoted :> _EndQuote @To_AfterAttributeValueQuoted) @eof(Err_EofInTag); 45 | 46 | AfterAttributeValueQuoted := ( 47 | _StartTagEnd >1 | 48 | '=' >1 @Err_MissingWhitespaceBetweenAttributes @Err_UnexpectedEqualsSignBeforeAttributeName when CanCreateAttribute @StartSlice @To_AttributeName | 49 | _AttributeNameChars >0 @Err_MissingWhitespaceBetweenAttributes when CanCreateAttribute @StartSlice @To_AttributeName 50 | ) @eof(Err_EofInTag); 51 | 52 | _AttrValueCharsUnquoted = (( 53 | ('"' | "'" | '<' | '=' | '`') >1 @Err_UnexpectedCharacterInUnquotedAttributeValue | 54 | any >0 55 | )* >StartSlice %SetAttributeValue)?; 56 | 57 | AttributeValueUnquoted := (_AttrValueCharsUnquoted :> ((TagNameSpace | '>') & _StartTagEnd)) @eof(Err_EofInTag); 58 | 59 | SelfClosingTag := ( 60 | '>' >1 @SetSelfClosingFlag @SetLastStartTagName @Next_Data @EmitToken | 61 | any >0 @Err_UnexpectedSolidusInTag @Reconsume @To_BeforeAttributeName 62 | ) @eof(Err_EofInTag); 63 | }%% 64 | --------------------------------------------------------------------------------
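For orientation, here is a minimal sketch of how the pieces above fit together: the harness in rust/tests/test.rs doubles as a usage example for the Rust bindings, constructing a tokenizer, injecting handlers into its token pipeline, feeding input, and finalizing. The sketch below uses only calls that appear in those tests (`Tokenizer::new`, `inject_into`, `feed`, `end`, `Serializer::new`, `Feedback::new`); the buffer sizes and the sample input are illustrative assumptions, not requirements of the API.

```rust
// A minimal sketch, not a file from this repository: it mirrors the driver
// logic of rust/tests/test.rs above. The sizes (2048, 256, 64) and the input
// string are illustrative values borrowed from the tests, not requirements.
extern crate lazyhtml;

use lazyhtml::*;

fn main() {
    let html = String::from("<div class=x>hi</div>"); // assumed sample input
    let mut output = String::new();

    {
        // Handlers are declared before the tokenizer so that they outlive it,
        // matching the declaration order used by the tests.
        let mut serializer = Serializer::new(|chunk| {
            output += chunk;
        });
        let mut feedback = Feedback::new(64); // 64 mirrors the tests' capacity
        let mut tokenizer = Tokenizer::new(2048, 256);

        // Chain tree-construction feedback and the serializer into the
        // tokenizer's token pipeline.
        feedback.inject_into(&mut tokenizer);
        serializer.inject_into(&mut tokenizer);

        tokenizer.feed(&html).expect("Could not feed input");
        tokenizer.end().expect("Could not finalize input");
    }

    // The serializer has re-emitted the token stream; the second pass of the
    // tests above relies on this output tokenizing to the same tokens as the
    // original input.
    println!("{}", output);
}
```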