├── .gitattributes ├── .github └── workflows │ └── semgrep.yml ├── .gitignore ├── .gitmodules ├── LICENSE ├── Makefile ├── README.md ├── bench-fixtures ├── es-spec.html └── html-spec.html ├── c ├── .gitignore ├── Makefile ├── actions.rl ├── field-names.h ├── parse_errors.rl ├── parser-feedback.c ├── parser-feedback.h ├── serializer.c ├── serializer.h ├── tag-types.h ├── tokenizer-states.rl ├── tokenizer.h └── tokenizer.rl ├── cfsetup.yaml ├── convert-test-log.py ├── error-with-feedback-tests └── trailing-solidus.test ├── images ├── language-specific-actions.png ├── perf-comparison.png ├── ragel-visualization.png ├── syntax-description.png └── syntax-files.png ├── package.json ├── rust ├── .editorconfig ├── .gitignore ├── Cargo.toml ├── benches │ └── bench.rs ├── examples │ └── trace.rs ├── lazyhtml-sys │ ├── Cargo.toml │ ├── build.rs │ ├── src │ │ └── lib.rs │ └── wrapper.h ├── src │ ├── feedback.rs │ ├── lib.rs │ ├── serializer.rs │ └── tokenizer.rs └── tests │ ├── decoder.rs │ ├── feedback_tokens │ ├── mod.rs │ ├── noop_tree_sink.rs │ └── token_sink_proxy.rs │ ├── html5lib.rs │ ├── parse_errors.rs │ ├── test.rs │ ├── token.rs │ └── unescape.rs ├── simplify-graph.js └── syntax ├── _helpers.rl ├── _navigation.rl ├── cdata.rl ├── comment.rl ├── data.rl ├── doctype.rl ├── endtag.rl ├── index.rl ├── plaintext.rl ├── rawtext.rl ├── rcdata.rl ├── scriptdata.rl └── starttag.rl /.gitattributes: -------------------------------------------------------------------------------- 1 | # Exclude the HTML files from GitHub's language statistics 2 | # https://github.com/github/linguist#using-gitattributes 3 | bench-fixtures/* linguist-vendored 4 | -------------------------------------------------------------------------------- /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | 2 | on: 3 | pull_request: {} 4 | workflow_dispatch: {} 5 | push: 6 | branches: 7 | - main 8 | - master 9 | schedule: 10 | - cron: '0 0 * * *' 11 | name: Semgrep config 12 | jobs: 13 | semgrep: 14 | name: semgrep/ci 15 | runs-on: ubuntu-20.04 16 | env: 17 | SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }} 18 | SEMGREP_URL: https://cloudflare.semgrep.dev 19 | SEMGREP_APP_URL: https://cloudflare.semgrep.dev 20 | SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version 21 | container: 22 | image: returntocorp/semgrep 23 | steps: 24 | - uses: actions/checkout@v3 25 | - run: semgrep ci 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.dot 2 | *.png 3 | *.ri 4 | node_modules 5 | /rust/tests.log 6 | /rust/failures.log 7 | /rust/tests.xml 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "html5lib-tests"] 2 | path = html5lib-tests 3 | url = https://github.com/html5lib/html5lib-tests.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2019, Cloudflare, Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. 
Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | RAGEL = ragel 2 | RAGELFLAGS = 3 | 4 | RL_FILES := $(wildcard syntax/*.rl) 5 | 6 | .PHONY: c-tokenizer 7 | c-tokenizer: 8 | make -C c 9 | 10 | .PHONY: test 11 | test: 12 | cd rust && cargo test 13 | 14 | .PHONY: bench 15 | bench: 16 | cd rust && cargo bench 17 | 18 | %.dot: c/tokenizer.rl $(RL_FILES) 19 | $(RAGEL) $(RAGELFLAGS) -Vp -M $(notdir $(basename $@)) $< > $@ 20 | node simplify-graph.js $@ 21 | 22 | %.png: %.dot 23 | dot -Tpng $< -o $@ 24 | open $@ 25 | 26 | .PHONY: clean 27 | clean: 28 | rm -rf *.dot *.png 29 | make -C c clean 30 | cd rust; cargo clean 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LazyHTML (lhtml) 2 | 3 | LazyHTML is an HTML5-compliant parser and serializer that enables building transformation pipelines in a pluggable manner. 4 | 5 | ## Testing 6 | 7 | ``` 8 | make test 9 | ``` 10 | 11 | ## Benchmark 12 | 13 | ``` 14 | make bench 15 | ``` 16 | 17 | ## How do we use it?
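In short, the whole flow looks like this; each step is explained below (a condensed sketch that just stitches together the snippets from this section, using a `handle_token` callback like the one defined further down):

```c
char buffer[1048576];

lhtml_options_t options = { /* ... as shown below ... */ };
lhtml_state_t state;
lhtml_init(&state, &options);

lhtml_token_handler_t handler;
lhtml_add_handler(&state, &handler, handle_token);

lhtml_string_t chunk = { .data = "...", .length = 3 };
lhtml_feed(&state, &chunk); // feed as many chunks as you have
lhtml_feed(&state, NULL);   // finalize
```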
18 | 19 | First of all, you need to create a buffer of a desired size: 20 | 21 | ```c 22 | char buffer[1048576]; 23 | ``` 24 | 25 | Then, you want to create a parsing state and initialize it with desired options: 26 | 27 | ```c 28 | lhtml_options_t options = { 29 | .initial_state = LHTML_STATE_DATA, 30 | .allow_cdata = false, 31 | .last_start_tag_name = { .length = 0 }, 32 | .buffer = buffer, 33 | .buffer_size = sizeof(buffer) 34 | }; 35 | 36 | lhtml_state_t state; 37 | 38 | lhtml_init(&state, &options); 39 | ``` 40 | 41 | At this point, you can inject your own handler(s) for transformation: 42 | 43 | ```c 44 | lhtml_token_handler_t handler; 45 | lhtml_add_handler(&state, &handler, handle_token); 46 | ``` 47 | 48 | Finally, feed it chunk by chunk: 49 | 50 | ```c 51 | lhtml_string_t chunk = { .data = "...", .length = 3 }; 52 | lhtml_feed(&state, &chunk); 53 | ``` 54 | 55 | And finalize by sending a NULL chunk (signalling that no further data will be available): 56 | 57 | ```c 58 | lhtml_feed(&state, NULL); 59 | ``` 60 | 61 | ## Nice, but what do we put into the custom handlers / plugins? 62 | 63 | Each plugin can have its own state. To simplify the API, we take advantage of the fact that in C a pointer to a structure also points to its first member, so if your transformation needs its own state, the convention is to put `lhtml_token_handler_t handler;` as the first member of your structure, and cast the `extra` pointer in the callback back to your state. If the transformation doesn't need its own state, `lhtml_token_handler_t` can be used directly, as shown below. This member is needed so that lhtml can chain various handlers into a single pipeline (if you're familiar with the Nginx module system, this should look familiar, albeit with some modifications). 64 | 65 | So, for example, a function that only transforms the `href` attribute on links can look like the following: 66 | 67 | ```c 68 | // define static string to be used for replacements 69 | static const lhtml_string_t REPLACEMENT = { 70 | .data = "[REPLACED]", 71 | .length = sizeof("[REPLACED]") - 1 72 | }; 73 | 74 | static void token_handler(lhtml_token_t *token, void *extra /* this can be your state */) { 75 | if (token->type == LHTML_TOKEN_START_TAG) { // we're interested only in start tags 76 | lhtml_token_starttag_t *tag = &token->start_tag; 77 | if (tag->type == LHTML_TAG_A) { // check whether the tag is an `<a>` 78 | const size_t n_attrs = tag->attributes.count; 79 | lhtml_attribute_t *attrs = tag->attributes.items; 80 | for (size_t i = 0; i < n_attrs; i++) { // iterate over attributes 81 | lhtml_attribute_t *attr = &attrs[i]; 82 | if (lhtml_name_equals(attr->name, "href")) { // match the attribute name 83 | attr->value = REPLACEMENT; // set the attribute value 84 | } 85 | } 86 | } 87 | } 88 | lhtml_emit(token, extra); // pass transformed token(s) to next handler(s) 89 | } 90 | ``` 91 | 92 | In your main code, use this handler: 93 | 94 | ```c 95 | lhtml_token_handler_t handler; 96 | lhtml_add_handler(&state, &handler, token_handler); 97 | ``` 98 | 99 | That's it!
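If your transformation does need its own state, the convention described above looks like this in practice (a minimal sketch with hypothetical names, counting start tags just for illustration):

```c
typedef struct {
    lhtml_token_handler_t handler; // must be the first member
    size_t start_tag_count;        // your own state goes after it
} my_counter_t;

static void count_start_tags(lhtml_token_t *token, void *extra) {
    my_counter_t *self = (my_counter_t *) extra; // safe: handler is the first member
    if (token->type == LHTML_TOKEN_START_TAG) {
        self->start_tag_count++;
    }
    lhtml_emit(token, extra); // always pass tokens down the pipeline
}

// registration:
my_counter_t counter = { .start_tag_count = 0 };
lhtml_add_handler(&state, &counter.handler, count_start_tags);
```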
100 | 101 | ## What does it do? 102 | 103 | lhtml is a lexer, also written in Ragel, but in a more modular fashion and with support for HTML5. 104 | 105 | * Various parts of the HTML syntax spec live in separate Ragel files (syntax/comment.rl, syntax/starttag.rl, ...) and are connected in syntax/index.rl 106 | 107 | ![files](https://github.com/cloudflare/lazyhtml/blob/60b7026da4c0df92284e03212988beac7c973b6e/images/syntax-files.png) 108 | 109 | * Syntax descriptions are separated from actions. 110 | 111 | ![descriptions](https://github.com/cloudflare/lazyhtml/blob/60b7026da4c0df92284e03212988beac7c973b6e/images/syntax-description.png) 112 | 113 | One benefit this brings is that all actions must be named, which makes it easy to visualize, debug and fix specific machines using Ragel's built-in visualization. 114 | Sample output from `make AttributeName.png` is shown below: 115 | 116 | ![visualization](https://github.com/cloudflare/lazyhtml/blob/60b7026da4c0df92284e03212988beac7c973b6e/images/ragel-visualization.png) 117 | 118 | This proved useful during development: the parser was prototyped in JavaScript for the sake of simplicity and then ported to C, with only API / string handling changes, within a couple of days. 119 | 120 | * lhtml operates on a byte level. The HTML spec defines a precise set of encodings that are allowed, and one interesting bit from the spec is: 121 | 122 | > Since support for encodings that are not defined in the WHATWG Encoding standard is prohibited, [UTF-16 encodings](https://html.spec.whatwg.org/multipage/infrastructure.html#utf-16-encoding) are the only encodings that this specification needs to treat as not being [ASCII-compatible encodings](https://html.spec.whatwg.org/multipage/infrastructure.html#ascii-compatible-encoding). 123 | 124 | That means that as long as we care only about the ASCII-compatible subset (and we do for all the known tags and attributes potentially used in transformations) and the content is not in UTF-16, we can lex HTML on a byte level without expensive streaming decoding in front of it and encoding back after transformation. This is pretty much what we did in the previous parsers, so we can't transform UTF-16 at the moment, but should we decide that we want it in the future, it can be implemented as a special-cased transform in front of the lexer (it's pretty rare on the Web though, so it's unlikely we will want it, as the potential issues outweigh the benefits). 125 | 126 | * lhtml operates in a streaming fashion. When it gets a new chunk, it combines it with the previous leftover in the preallocated buffer and parses the newly formed string. The leftover is the part of the previous token that was not yet finished. 127 | 128 | * Character tokens (pure text) are not saved between buffers, as they are the most common content and usually we don't care about them for transformation. That means only short tokens such as start tags, end tags, comments and doctypes will be buffered. 129 | 130 | * This leftover + chunk concatenation is the only place where a copy occurs. This significantly simplifies handling of the strings across the code (as otherwise we would end up with a rope instead of a flat in-memory chunk), and has low overhead (one memmove of the small leftover and one memcpy of the new chunk). Parsing itself is zero-copy, and returns tokens with {data, length} string structures which point to this buffer, making them lightweight on memory and easy to work with (and they're compatible with ngx_str_t out of the box). 131 | 132 | * All the memory is statically allocated for the entire context (document). On one hand, this means that if a transformation wants to preserve some tokens, it needs to copy their data manually into its own state; on the other hand, this brings significant performance wins, as we don't need to allocate/free memory over and over for various buffers and tokens, and instead reuse the same ones. Also, this avoids any restrictions on how that memory is allocated (whether it's malloc/free, an Nginx pool or even the stack - anything works as long as it stays alive during parsing). 133 | 134 | * Tag names are hashed by mapping each ASCII letter to the range 1..26 and shifting the accumulated code left by 5 bits per character (see the sketch below). This doesn't cover custom tags, but gives a fast inlinable linear function that covers all the standard tags we care about, and for the other rare cases we can use lhtml_name_equals which compares the actual names in a case-insensitive manner.
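For illustration, a standalone version of that hash (it mirrors `tag_type_append_char` in `c/tokenizer.rl`; `hash_tag_name` is just a name for this sketch):

```c
#include <stdint.h>
#include <stddef.h>

// Each ASCII letter maps to 1..26 via (c & 31); the accumulated code is
// shifted left by 5 bits per character. Overly long names and anything
// containing a non-ASCII-letter character (i.e. custom tags) hash to 0.
static uint64_t hash_tag_name(const char *name, size_t length) {
    uint64_t code = 0;
    for (size_t i = 0; i < length; i++) {
        char c = name[i];
        // bail out to 0 on overflow or on a non-letter character
        if ((code >> (64 - 5)) || !((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
            return 0;
        }
        code = (code << 5) | (uint64_t) (c & 31);
    }
    return code;
}

// hash_tag_name("br", 2) == (2 << 5) | 18 == 82 == LHTML_TAG_BR, and
// hash_tag_name("BR", 2) gives the same value, since (c & 31) folds case -
// this is the cheap case-insensitive comparison mentioned below.
```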
135 | 136 | * Each token & attribute, in addition to lexed strings, provides a string for the entire token / attribute which can be used if no modifications happened. This both preserves formatting and brings even better performance, by avoiding custom per-token serialization in favor of passing these raw strings as-is to the output for any tokens that we don't care about (don't modify). 137 | 138 | 139 | ## So is it correct and fast? 140 | 141 | It's HTML5 compliant, was tested against the official test suites, and several contributions were sent to the specification itself for clarification / simplification of the spec language. 142 | 143 | Unlike existing parsers, it didn't bail out on any of the 2,382,625 documents from HTTP Archive, although 0.2% of documents exceeded expected bufferization limits as they were in fact JavaScript or RSS or other types of content incorrectly served with Content-Type: text/html; since anything is valid HTML5, the parser still tried to parse them as HTML. 144 | 145 | The transformation used for the comparison below rewrites the `href` attribute on `<a>` tags (only that attribute and only in those tags) to a static value. It was compared against a few existing and popular HTML parsers (only tokenization mode was used for a fair comparison, so that they don't need to build an AST and so on), and timings in milliseconds for 100 iterations are the following (lazy mode means that we're using raw strings whenever possible; the other mode serializes each token just for the comparison): 146 | 147 | Parser | Example #1: 3.6 MB | Example #2: 7.9 MB | Speed #1 (MB/s) | Speed #2 (MB/s) 148 | --- | --- | --- | --- | --- 149 | Gumbo (Google) | 265.05 | 542.93 | 13.62 | 14.62 150 | html5ever (Mozilla) | 289.75 | 444.32 | 12.46 | 17.87 151 | libhubbub (Netsurf) | 113.57 | 232.33 | 31.80 | 34.17 152 | lhtml (CloudFlare) | 45.32 | 71.55 | 79.69 | 110.97 153 | lhtml (lazy mode) (CloudFlare) | 26.40 | 49.57 | 136.78 | 160.18 154 | 155 | ![comparison](https://github.com/cloudflare/lazyhtml/blob/60b7026da4c0df92284e03212988beac7c973b6e/images/perf-comparison.png) 156 | 157 | ## Are there any quirks? 158 | 159 | A few parts of the spec are deliberately not implemented in the core lexer; these parts were carefully extracted in a way that doesn't break compatibility, but allows moving unnecessary yet expensive operations out into a separate optional module in the pipeline. 160 | 161 | More specifically, the specification prescribes various text transformations in different contexts, such as: 162 | 163 | * normalizing CR / CRLF to LF 164 | * decoding named / numeric XML-like entities 165 | * replacing the U+0000 (NUL) character with U+FFFD (replacement character) in certain contexts where it's considered unsafe 166 | * normalizing uppercase tag names and attributes to lowercase in non-XML contexts 167 | 168 | Those are important for correct display in browsers, but as we don't render content, perform very limited text processing, and care only about standard (ASCII-subset) tag names and attributes, we can get away with ignoring them and implementing them in a separate plugin if needed.
This doesn't change correctness as long as you do e.g. case-insensitive comparisons (which we already do in a very cheap way - via case-insensitive hashing). 169 | 170 | Otherwise, we would need to apply charset detection and text decoding (as entity matches or U+FFFD have different representations in various encodings) in front of the parser, which would make it significantly slower for little to no benefit. 171 | 172 | ## License 173 | 174 | BSD licensed. See the [LICENSE](LICENSE) file for details. -------------------------------------------------------------------------------- /c/.gitignore: -------------------------------------------------------------------------------- 1 | /*.dSYM 2 | /node_modules 3 | /out 4 | -------------------------------------------------------------------------------- /c/Makefile: -------------------------------------------------------------------------------- 1 | RAGEL = ragel 2 | RAGELFLAGS = -G2 3 | CFLAGS = -g -O3 4 | override CFLAGS += -std=c99 -Wall -Wextra -Wcast-qual -Wwrite-strings -Wshadow -Winline -Wdisabled-optimization -Wuninitialized -Wcast-align -Wno-missing-field-initializers -Werror 5 | OUT = out 6 | TARGET := $(shell $(CC) -dumpmachine) 7 | OUT_TARGET := $(OUT)/$(TARGET) 8 | RAGEL_SOURCES := actions.rl $(wildcard ../syntax/*.rl) 9 | SOURCES := $(wildcard *.c) 10 | 11 | ## Phony tasks 12 | 13 | .PHONY: all 14 | all: lib 15 | 16 | .PHONY: lib 17 | lib: $(OUT_TARGET)/liblhtml.a 18 | 19 | .PHONY: clean-obj 20 | clean-obj: 21 | rm -rf $(OUT_TARGET) 22 | 23 | .PHONY: clean 24 | clean: 25 | rm -rf $(OUT) 26 | 27 | ## Intermediate dependencies 28 | 29 | $(OUT) $(OUT_TARGET): 30 | mkdir -p $@ 31 | 32 | $(OUT)/tokenizer.c: tokenizer.rl $(RAGEL_SOURCES) | $(OUT) 33 | $(RAGEL) $(RAGELFLAGS) $< -o $@ 34 | 35 | $(OUT)/tokenizer-states.h: tokenizer-states.rl $(RAGEL_SOURCES) | $(OUT) 36 | $(RAGEL) $(RAGELFLAGS) $< -o $@ 37 | 38 | $(OUT)/%.d: %.c | $(OUT) 39 | $(CC) $(CFLAGS) -MM $< -MT "\$$(OUT)/$(@F)" -MT "\$$(OUT_TARGET)/$(@F:.d=.o)" -MP -MF $@ 40 | 41 | $(OUT)/tokenizer.d: $(OUT)/tokenizer.c 42 | $(CC) $(CFLAGS) -xc -iquote. -MM $< -MT "\$$(OUT)/$(@F)" -MT "\$$(OUT_TARGET)/$(@F:.d=.o)" -MP -MF $@ 43 | 44 | ifneq (, $(filter all lib $(OUT)/%.d $(OUT_TARGET)/%, $(MAKECMDGOALS))) 45 | -include $(patsubst %.c, $(OUT)/%.d, $(SOURCES)) 46 | -include $(OUT)/tokenizer.d 47 | endif 48 | 49 | ## Object files 50 | 51 | $(OUT_TARGET)/tokenizer.o: $(OUT)/tokenizer-states.h | $(OUT_TARGET) 52 | $(OUT_TARGET)/tokenizer.o: 53 | $(CC) $(CFLAGS) -c $< -include $(OUT)/tokenizer-states.h -iquote. -Wno-parentheses-equality -o $@ 54 | 55 | $(OUT_TARGET)/parser-feedback.o: $(OUT)/tokenizer-states.h | $(OUT_TARGET) 56 | $(OUT_TARGET)/parser-feedback.o: 57 | $(CC) $(CFLAGS) -c $< -include $(OUT)/tokenizer-states.h -o $@ 58 | 59 | $(OUT_TARGET)/serializer.o: | $(OUT_TARGET) 60 | $(CC) $(CFLAGS) -c $< -o $@ 61 | 62 | ## Final library and binaries 63 | 64 | $(OUT_TARGET)/liblhtml.a: $(OUT_TARGET)/tokenizer.o $(OUT_TARGET)/parser-feedback.o $(OUT_TARGET)/serializer.o 65 | $(AR) rcs $@ $?
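# note: $? expands to only the object files newer than the archive, so unchanged members are not re-added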
66 | -------------------------------------------------------------------------------- /c/actions.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | action SaveQuote { 5 | state->quote = fc; 6 | } 7 | 8 | action IsMatchingQuote { fc == state->quote } 9 | 10 | action StartAppropriateEndTag { 11 | state->special_end_tag_type = 0; 12 | } 13 | 14 | action FeedAppropriateEndTag { tag_type_append_char(&state->special_end_tag_type, fc) } 15 | 16 | action IsAppropriateEndTagFed { state->special_end_tag_type == state->last_start_tag_type } 17 | 18 | action SetAppropriateEndTagName { 19 | lhtml_token_endtag_t *end_tag = GET_TOKEN(END_TAG); 20 | end_tag->name = range_string(state->slice_start + 2, p); 21 | end_tag->type = state->special_end_tag_type; 22 | } 23 | 24 | action StartSlice { 25 | state->slice_start = p; 26 | } 27 | 28 | action MarkPosition { 29 | state->mark = p; 30 | } 31 | 32 | action UnmarkPosition { 33 | state->mark = NULL; 34 | } 35 | 36 | action AdvanceMarkedPosition { 37 | state->mark++; 38 | } 39 | 40 | action EmitToken { 41 | emit_token(state, p + (p != eof)); 42 | } 43 | 44 | action CreateCharacter { 45 | token->type = LHTML_TOKEN_CHARACTER; 46 | state->unsafe_null = false; 47 | state->entities = false; 48 | } 49 | 50 | action UnsafeNull { 51 | state->unsafe_null = true; 52 | } 53 | 54 | action AllowEntities { 55 | state->entities = true; 56 | } 57 | 58 | action CreateCDataStart { 59 | token->type = LHTML_TOKEN_CDATA_START; 60 | } 61 | 62 | action CreateCDataEnd { 63 | token->type = LHTML_TOKEN_CDATA_END; 64 | } 65 | 66 | action CreateUnparsed { 67 | token->type = LHTML_TOKEN_UNPARSED; 68 | } 69 | 70 | action EmitSlice { 71 | emit_slice(state, p); 72 | } 73 | 74 | action CreateStartTagToken { 75 | CREATE_TOKEN(START_TAG, { 76 | .attributes = (lhtml_attributes_t) { 77 | .buffer = state->attr_buffer 78 | } 79 | }); 80 | } 81 | 82 | action SetStartTagName { 83 | lhtml_token_starttag_t *start_tag = GET_TOKEN(START_TAG); 84 | start_tag->name = range_string(state->slice_start, p); 85 | start_tag->type = lhtml_get_tag_type(start_tag->name); 86 | } 87 | 88 | action SetEndTagName { 89 | lhtml_token_endtag_t *end_tag = GET_TOKEN(END_TAG); 90 | end_tag->name = range_string(state->slice_start, p); 91 | end_tag->type = lhtml_get_tag_type(end_tag->name); 92 | } 93 | 94 | action SetLastStartTagName { 95 | state->last_start_tag_type = GET_TOKEN(START_TAG)->type; 96 | } 97 | 98 | action SetSelfClosingFlag { 99 | GET_TOKEN(START_TAG)->self_closing = true; 100 | } 101 | 102 | action EndComment { 103 | CREATE_TOKEN(COMMENT, { 104 | .value = range_string(state->slice_start, state->mark) 105 | }); 106 | } 107 | 108 | action CreateEndTagToken { 109 | CREATE_TOKEN(END_TAG, {}); 110 | } 111 | 112 | action CanCreateAttribute { can_create_attr(&GET_TOKEN(START_TAG)->attributes) } 113 | 114 | action SetAttributeValue { 115 | if (state->current_attr_is_unique) { 116 | lhtml_attributes_t *attributes = &GET_TOKEN(START_TAG)->attributes; 117 | lhtml_attribute_t *attr = &attributes->data[attributes->length - 1]; 118 | attr->value = range_string(state->slice_start, p); 119 | attr->raw.value.length = (size_t) (p + (*p == '"' || *p == '\'') - attr->name.data); 120 | } 121 | } 122 | 123 | action AppendAttribute { 124 | lhtml_attributes_t *attributes = &GET_TOKEN(START_TAG)->attributes; 125 | lhtml_string_t name = range_string(state->slice_start, p); 126 | 127 | state->current_attr_is_unique = lhtml_find_attr(attributes, name) == NULL; 128 | 129 | if 
(state->current_attr_is_unique ) { 130 | attributes->data[attributes->length++] = (lhtml_attribute_t) { 131 | .name = name, 132 | .raw = (lhtml_opt_string_t) { 133 | .has_value = true, 134 | .value = name 135 | } 136 | }; 137 | } else { 138 | parse_error(state, LHTML_ERR_DUPLICATE_ATTRIBUTE); 139 | } 140 | } 141 | 142 | action IsCDataAllowed { state->allow_cdata } 143 | 144 | action CreateDocType { 145 | CREATE_TOKEN(DOCTYPE, {}); 146 | } 147 | 148 | action SetDocTypeName { 149 | GET_TOKEN(DOCTYPE)->name = opt_range_string(state->slice_start, p); 150 | } 151 | 152 | action SetForceQuirksFlag { 153 | GET_TOKEN(DOCTYPE)->force_quirks = true; 154 | } 155 | 156 | action SetDocTypePublicIdentifier { 157 | GET_TOKEN(DOCTYPE)->public_id = opt_range_string(state->slice_start, p); 158 | } 159 | 160 | action SetDocTypeSystemIdentifier { 161 | GET_TOKEN(DOCTYPE)->system_id = opt_range_string(state->slice_start, p); 162 | } 163 | }%% 164 | -------------------------------------------------------------------------------- /c/field-names.h: -------------------------------------------------------------------------------- 1 | #ifndef LHTML_FIELD_NAMES_H 2 | #define LHTML_FIELD_NAMES_H 3 | 4 | #define LHTML_FIELD_NAME_COMMENT comment 5 | #define LHTML_FIELD_NAME_START_TAG start_tag 6 | #define LHTML_FIELD_NAME_END_TAG end_tag 7 | #define LHTML_FIELD_NAME_DOCTYPE doctype 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /c/parse_errors.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | action Err_AbruptClosingOfEmptyComment { parse_error(state, LHTML_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT); } 5 | action Err_AbruptDoctypePublicIdentifier { parse_error(state, LHTML_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER); } 6 | action Err_AbruptDoctypeSystemIdentifier { parse_error(state, LHTML_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER); } 7 | action Err_CDataInHtmlContent { parse_error(state, LHTML_ERR_CDATA_IN_HTML_CONTENT); } 8 | action Err_EndTagWithAttributes { parse_error(state, LHTML_ERR_END_TAG_WITH_ATTRIBUTES); } 9 | action Err_EndTagWithTrailingSolidus { parse_error(state, LHTML_ERR_END_TAG_WITH_TRAILING_SOLIDUS); } 10 | action Err_EofBeforeTagName { parse_error(state, LHTML_ERR_EOF_BEFORE_TAG_NAME); } 11 | action Err_EofInCData { parse_error(state, LHTML_ERR_EOF_IN_CDATA); } 12 | action Err_EofInComment { parse_error(state, LHTML_ERR_EOF_IN_COMMENT); } 13 | action Err_EofInDoctype { parse_error(state, LHTML_ERR_EOF_IN_DOCTYPE); } 14 | action Err_EofInScriptHtmlCommentLikeText { parse_error(state, LHTML_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT); } 15 | action Err_EofInTag { parse_error(state, LHTML_ERR_EOF_IN_TAG); } 16 | action Err_IncorrectlyClosedComment { parse_error(state, LHTML_ERR_INCORRECTLY_CLOSED_COMMENT); } 17 | action Err_IncorrectlyOpenedComment { parse_error(state, LHTML_ERR_INCORRECTLY_OPENED_COMMENT); } 18 | action Err_InvalidCharacterSequenceAfterDoctypeName { parse_error(state, LHTML_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME); } 19 | action Err_InvalidFirstCharacterOfTagName { parse_error(state, LHTML_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME); } 20 | action Err_MissingAttributeValue { parse_error(state, LHTML_ERR_MISSING_ATTRIBUTE_VALUE); } 21 | action Err_MissingDoctypeName { parse_error(state, LHTML_ERR_MISSING_DOCTYPE_NAME); } 22 | action Err_MissingDoctypePublicIdentifier { parse_error(state, LHTML_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER); } 23 | action 
Err_MissingDoctypeSystemIdentifier { parse_error(state, LHTML_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER); } 24 | action Err_MissingEndTagName { parse_error(state, LHTML_ERR_MISSING_END_TAG_NAME); } 25 | action Err_MissingQuoteBeforeDoctypePublicIdentifier { parse_error(state, LHTML_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER); } 26 | action Err_MissingQuoteBeforeDoctypeSystemIdentifier { parse_error(state, LHTML_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER); } 27 | action Err_MissingSpaceAfterDoctypePublicKeyword { parse_error(state, LHTML_ERR_MISSING_SPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD); } 28 | action Err_MissingSpaceAfterDoctypeSystemKeyword { parse_error(state, LHTML_ERR_MISSING_SPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD); } 29 | action Err_MissingWhitespaceBeforeDoctypeName { parse_error(state, LHTML_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME); } 30 | action Err_MissingWhitespaceBetweenAttributes { parse_error(state, LHTML_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES); } 31 | action Err_MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers { parse_error(state, LHTML_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS); } 32 | action Err_NestedComment { parse_error(state, LHTML_ERR_NESTED_COMMENT); } 33 | action Err_UnexpectedCharacterAfterDoctypeSystemIdentifier { parse_error(state, LHTML_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER); } 34 | action Err_UnexpectedCharacterInAttributeName { parse_error(state, LHTML_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME); } 35 | action Err_UnexpectedCharacterInUnquotedAttributeValue { parse_error(state, LHTML_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE); } 36 | action Err_UnexpectedEqualsSignBeforeAttributeName { parse_error(state, LHTML_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME); } 37 | action Err_UnexpectedQuestionMarkInsteadOfTagName { parse_error(state, LHTML_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME); } 38 | action Err_UnexpectedSolidusInTag { parse_error(state, LHTML_ERR_UNEXPECTED_SOLIDUS_IN_TAG); } 39 | }%% -------------------------------------------------------------------------------- /c/parser-feedback.c: -------------------------------------------------------------------------------- 1 | #include <assert.h> 2 | #include <stddef.h> 3 | #include "parser-feedback.h" 4 | // #include "$OUT/tokenizer-states.h" - included with command option to respect env var 5 | 6 | lhtml_ns_t lhtml_get_current_ns(const lhtml_feedback_t *state) { 7 | return state->ns_stack.data[state->ns_stack.length - 1]; 8 | } 9 | 10 | static bool is_foreign_ns(lhtml_ns_t ns) { 11 | return ns != LHTML_NS_HTML; 12 | } 13 | 14 | __attribute__((warn_unused_result)) 15 | static bool enter_ns(lhtml_feedback_t *state, lhtml_ns_t ns) { 16 | if (state->ns_stack.length >= state->ns_stack.capacity) { 17 | return false; 18 | } 19 | state->ns_stack.data[state->ns_stack.length++] = ns; 20 | state->tokenizer->allow_cdata = is_foreign_ns(ns); 21 | return true; 22 | } 23 | 24 | static void leave_ns(lhtml_feedback_t *state) { 25 | assert(state->ns_stack.length > 1); 26 | state->ns_stack.length--; 27 | state->tokenizer->allow_cdata = is_foreign_ns(lhtml_get_current_ns(state)); 28 | } 29 | 30 | static void ensure_tokenizer_mode(lhtml_tokenizer_t *tokenizer, lhtml_tag_type_t tag_type) { 31 | int new_state; 32 | 33 | switch (tag_type) { 34 | case LHTML_TAG_TEXTAREA: 35 | case LHTML_TAG_TITLE: 36 | new_state = html_en_RCData; 37 | break; 38 | 39 | case LHTML_TAG_PLAINTEXT: 40 | new_state = html_en_PlainText; 41 | break; 42 | 43 | case LHTML_TAG_SCRIPT: 44 |
new_state = html_en_ScriptData; 45 | break; 46 | 47 | case LHTML_TAG_STYLE: 48 | case LHTML_TAG_IFRAME: 49 | case LHTML_TAG_XMP: 50 | case LHTML_TAG_NOEMBED: 51 | case LHTML_TAG_NOFRAMES: 52 | case LHTML_TAG_NOSCRIPT: 53 | new_state = html_en_RawText; 54 | break; 55 | 56 | default: 57 | return; 58 | } 59 | 60 | tokenizer->cs = new_state; 61 | } 62 | 63 | static bool can_be_self_closing(lhtml_tag_type_t tag_type) { 64 | switch (tag_type) { 65 | case LHTML_TAG_BASE: 66 | case LHTML_TAG_BASEFONT: 67 | case LHTML_TAG_BGSOUND: 68 | case LHTML_TAG_LINK: 69 | case LHTML_TAG_META: 70 | case LHTML_TAG_AREA: 71 | case LHTML_TAG_BR: 72 | case LHTML_TAG_EMBED: 73 | case LHTML_TAG_IMG: 74 | case LHTML_TAG_KEYGEN: 75 | case LHTML_TAG_WBR: 76 | case LHTML_TAG_INPUT: 77 | case LHTML_TAG_PARAM: 78 | case LHTML_TAG_SOURCE: 79 | case LHTML_TAG_TRACK: 80 | case LHTML_TAG_HR: 81 | case LHTML_TAG_MATH: 82 | case LHTML_TAG_SVG: 83 | case LHTML_TAG_COL: 84 | case LHTML_TAG_FRAME: 85 | return true; 86 | default: 87 | return false; 88 | } 89 | } 90 | 91 | static bool foreign_causes_exit(const lhtml_token_starttag_t *start_tag) { 92 | switch (start_tag->type) { 93 | case LHTML_TAG_B: 94 | case LHTML_TAG_BIG: 95 | case LHTML_TAG_BLOCKQUOTE: 96 | case LHTML_TAG_BODY: 97 | case LHTML_TAG_BR: 98 | case LHTML_TAG_CENTER: 99 | case LHTML_TAG_CODE: 100 | case LHTML_TAG_DD: 101 | case LHTML_TAG_DIV: 102 | case LHTML_TAG_DL: 103 | case LHTML_TAG_DT: 104 | case LHTML_TAG_EM: 105 | case LHTML_TAG_EMBED: 106 | /*case LHTML_TAG_H1: 107 | case LHTML_TAG_H2: 108 | case LHTML_TAG_H3: 109 | case LHTML_TAG_H4: 110 | case LHTML_TAG_H5: 111 | case LHTML_TAG_H6:*/ 112 | case LHTML_TAG_HEAD: 113 | case LHTML_TAG_HR: 114 | case LHTML_TAG_I: 115 | case LHTML_TAG_IMG: 116 | case LHTML_TAG_LI: 117 | case LHTML_TAG_LISTING: 118 | case LHTML_TAG_MENU: 119 | case LHTML_TAG_META: 120 | case LHTML_TAG_NOBR: 121 | case LHTML_TAG_OL: 122 | case LHTML_TAG_P: 123 | case LHTML_TAG_PRE: 124 | case LHTML_TAG_RUBY: 125 | case LHTML_TAG_S: 126 | case LHTML_TAG_SMALL: 127 | case LHTML_TAG_SPAN: 128 | case LHTML_TAG_STRONG: 129 | case LHTML_TAG_STRIKE: 130 | case LHTML_TAG_SUB: 131 | case LHTML_TAG_SUP: 132 | case LHTML_TAG_TABLE: 133 | case LHTML_TAG_TT: 134 | case LHTML_TAG_U: 135 | case LHTML_TAG_UL: 136 | case LHTML_TAG_VAR: 137 | return true; 138 | case LHTML_TAG_FONT: { 139 | const lhtml_attributes_t *attrs = &start_tag->attributes; 140 | for (size_t i = 0; i < attrs->length; i++) { 141 | const lhtml_string_t name = attrs->data[i].name; 142 | if (LHTML_STR_NOCASE_EQUALS(name, "color") || LHTML_STR_NOCASE_EQUALS(name, "size") || LHTML_STR_NOCASE_EQUALS(name, "face")) { 143 | return true; 144 | } 145 | } 146 | return false; 147 | } 148 | default: { 149 | const lhtml_string_t name = start_tag->name; 150 | return name.length == 2 && ((name.data[0] | 0x20) == 'h') && (name.data[1] >= '1' && name.data[1] <= '6'); 151 | } 152 | } 153 | } 154 | 155 | static bool foreign_is_integration_point(lhtml_ns_t ns, lhtml_tag_type_t type, const lhtml_string_t name, const lhtml_attributes_t *attrs) { 156 | switch (ns) { 157 | case LHTML_NS_MATHML: 158 | switch (type) { 159 | case LHTML_TAG_MI: 160 | case LHTML_TAG_MO: 161 | case LHTML_TAG_MN: 162 | case LHTML_TAG_MS: 163 | case LHTML_TAG_MTEXT: 164 | return true; 165 | 166 | default: { 167 | if (attrs && LHTML_STR_NOCASE_EQUALS(name, "annotation-xml")) { 168 | for (size_t i = 0; i < attrs->length; i++) { 169 | const lhtml_attribute_t *attr = &attrs->data[i]; 170 | if (LHTML_STR_NOCASE_EQUALS(attr->name, "encoding") && 
(LHTML_STR_NOCASE_EQUALS(attr->value, "text/html") || LHTML_STR_NOCASE_EQUALS(attr->value, "application/xhtml+xml"))) { 171 | return true; 172 | } 173 | } 174 | } 175 | return false; 176 | } 177 | } 178 | 179 | case LHTML_NS_SVG: 180 | return type == LHTML_TAG_DESC || type == LHTML_TAG_TITLE || type == LHTML_TAG_FOREIGNOBJECT; 181 | 182 | case LHTML_NS_HTML: 183 | return false; 184 | 185 | default: 186 | assert(false); 187 | } 188 | } 189 | 190 | __attribute__((warn_unused_result)) 191 | static bool handle_start_tag_token(lhtml_feedback_t *state, lhtml_token_starttag_t *tag, bool *delayed_enter_html) { 192 | lhtml_tag_type_t type = tag->type; 193 | 194 | if (type == LHTML_TAG_SVG || type == LHTML_TAG_MATH) { 195 | return enter_ns(state, (lhtml_ns_t) type); 196 | } 197 | 198 | lhtml_ns_t ns = lhtml_get_current_ns(state); 199 | 200 | if (is_foreign_ns(ns)) { 201 | if (foreign_causes_exit(tag)) { 202 | leave_ns(state); 203 | } else { 204 | *delayed_enter_html = !tag->self_closing && foreign_is_integration_point(ns, type, tag->name, &tag->attributes); 205 | } 206 | } else { 207 | if (type == LHTML_TAG_IMAGE) { 208 | tag->type = LHTML_TAG_IMG; 209 | tag->name = LHTML_STRING("img"); 210 | } 211 | 212 | ensure_tokenizer_mode(state->tokenizer, type); 213 | } 214 | 215 | return true; 216 | } 217 | 218 | static void handle_end_tag_token(lhtml_feedback_t *state, const lhtml_token_endtag_t *tag) { 219 | lhtml_tag_type_t type = tag->type; 220 | 221 | lhtml_ns_t ns = lhtml_get_current_ns(state); 222 | 223 | if (is_foreign_ns(ns)) { 224 | if (type == (lhtml_tag_type_t) ns) { 225 | leave_ns(state); 226 | } 227 | } else if (state->ns_stack.length >= 2) { 228 | lhtml_ns_t prev_ns = state->ns_stack.data[state->ns_stack.length - 2]; 229 | 230 | if (foreign_is_integration_point(prev_ns, type, tag->name, NULL)) { 231 | leave_ns(state); 232 | } 233 | } 234 | } 235 | 236 | static void handle_token(lhtml_token_t *token, lhtml_feedback_t *state) { 237 | if (token->type == LHTML_TOKEN_START_TAG) { 238 | bool delayed_enter_html = false; 239 | if (!handle_start_tag_token(state, &token->start_tag, &delayed_enter_html)) { 240 | token->type = LHTML_TOKEN_ERROR; 241 | state->tokenizer->cs = html_error; 242 | } 243 | 244 | lhtml_ns_t ns = lhtml_get_current_ns(state); 245 | 246 | if (!is_foreign_ns(ns) && !can_be_self_closing(token->start_tag.type)) { 247 | token->parse_errors |= 1ULL << LHTML_ERR_NON_VOID_HTML_START_TAG_WITH_TRAILING_SOLIDUS; 248 | } 249 | 250 | lhtml_emit(token, state); 251 | 252 | if (delayed_enter_html) { 253 | if (!enter_ns(state, LHTML_NS_HTML)) { 254 | state->tokenizer->cs = html_error; 255 | } 256 | } 257 | } else { 258 | lhtml_emit(token, state); 259 | if (token->type == LHTML_TOKEN_END_TAG) { 260 | handle_end_tag_token(state, &token->end_tag); 261 | } 262 | } 263 | } 264 | 265 | void lhtml_feedback_inject(lhtml_tokenizer_t *tokenizer, lhtml_feedback_t *state) { 266 | state->tokenizer = tokenizer; 267 | assert(enter_ns(state, LHTML_NS_HTML)); 268 | LHTML_ADD_HANDLER(tokenizer, state, handle_token); 269 | } 270 | -------------------------------------------------------------------------------- /c/parser-feedback.h: -------------------------------------------------------------------------------- 1 | #ifndef LHTML_FEEDBACK_H 2 | #define LHTML_FEEDBACK_H 3 | 4 | #include "tokenizer.h" 5 | 6 | typedef enum { 7 | LHTML_NS_HTML = LHTML_TAG_HTML, 8 | LHTML_NS_MATHML = LHTML_TAG_MATH, 9 | LHTML_NS_SVG = LHTML_TAG_SVG 10 | } lhtml_ns_t; 11 | 12 | typedef LHTML_BUFFER_T(lhtml_ns_t) lhtml_ns_buffer_t; 13 | 
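// a fixed-capacity stack of the currently open namespaces (HTML / MathML / SVG), maintained by enter_ns / leave_ns in parser-feedback.c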
typedef LHTML_LIST_T(lhtml_ns_buffer_t) lhtml_ns_stack_t; 14 | 15 | typedef struct { 16 | lhtml_token_handler_t handler; // needs to be the first one 17 | 18 | lhtml_tokenizer_t *tokenizer; 19 | lhtml_ns_stack_t ns_stack; 20 | } lhtml_feedback_t; 21 | 22 | __attribute__((nonnull)) 23 | void lhtml_feedback_inject(lhtml_tokenizer_t *tokenizer, lhtml_feedback_t *state); 24 | 25 | __attribute__((nonnull, pure, warn_unused_result)) 26 | lhtml_ns_t lhtml_get_current_ns(const lhtml_feedback_t *state); 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /c/serializer.c: -------------------------------------------------------------------------------- 1 | #include <assert.h> 2 | #include <stddef.h> 3 | #include <string.h> 4 | #include "serializer.h" 5 | 6 | typedef struct { 7 | lhtml_string_t str; 8 | const char separator; 9 | bool done; 10 | } split_iterator_t; 11 | 12 | static lhtml_string_t split_iterator_next(split_iterator_t *iter) { 13 | lhtml_string_t str = iter->str; 14 | const char *ptr = memchr(str.data, iter->separator, str.length); 15 | if (ptr == NULL) { 16 | iter->done = true; 17 | return str; 18 | } 19 | const char *next = ptr + 1; 20 | const char *end = str.data + str.length; 21 | iter->str = (lhtml_string_t) { 22 | .data = next, 23 | .length = end - next 24 | }; 25 | return (lhtml_string_t) { 26 | .data = str.data, 27 | .length = ptr - str.data 28 | }; 29 | } 30 | 31 | static void serialize(lhtml_token_t *token, lhtml_serializer_t *extra) { 32 | lhtml_string_callback_t write = extra->writer; 33 | 34 | if (token->raw.has_value) { 35 | write(token->raw.value, extra); 36 | return; 37 | } 38 | 39 | switch (token->type) { 40 | case LHTML_TOKEN_CDATA_START: { 41 | write(LHTML_STRING("<![CDATA["), extra); 42 | break; 43 | } 44 | 45 | case LHTML_TOKEN_CDATA_END: { 46 | write(LHTML_STRING("]]>"), extra); 47 | break; 48 | } 49 | 50 | case LHTML_TOKEN_DOCTYPE: { 51 | write(LHTML_STRING("<!DOCTYPE"), extra); 52 | if (token->doctype.name.has_value) { 53 | // with name: `<!DOCTYPE name` 54 | write(LHTML_STRING(" "), extra); 55 | write(token->doctype.name.value, extra); // non-empty; shouldn't contain spaces or `>` 56 | if (token->doctype.public_id.has_value) { 57 | // with public id: `<!DOCTYPE name PUBLIC "public id"` 58 | write(LHTML_STRING(" PUBLIC \""), extra); 59 | write(token->doctype.public_id.value, extra); // shouldn't contain `"` or `>` 60 | if (!(token->doctype.force_quirks && !token->doctype.system_id.has_value)) { 61 | write(LHTML_STRING("\""), extra); 62 | } 63 | } else if (token->doctype.system_id.has_value) { 64 | write(LHTML_STRING(" SYSTEM"), extra); 65 | } else if (token->doctype.force_quirks) { 66 | write(LHTML_STRING(" _"), extra); 67 | } 68 | if (token->doctype.system_id.has_value) { 69 | write(LHTML_STRING(" \""), extra); 70 | write(token->doctype.system_id.value, extra); 71 | if (!token->doctype.force_quirks) { 72 | write(LHTML_STRING("\""), extra); 73 | } 74 | } 75 | } 76 | write(LHTML_STRING(">"), extra); 77 | break; 78 | } 79 | 80 | case LHTML_TOKEN_COMMENT: { 81 | write(LHTML_STRING("<!--"), extra); 82 | write(token->comment.value, extra); // shouldn't contain `-->` 83 | write(LHTML_STRING("-->"), extra); 84 | break; 85 | } 86 | 87 | case LHTML_TOKEN_START_TAG: { 88 | write(LHTML_STRING("<"), extra); 89 | write(token->start_tag.name, extra); // non-empty, starts with ASCII letter 90 | lhtml_attributes_t *attrs = &token->start_tag.attributes; 91 | for (size_t i = 0; i < attrs->length; i++) { 92 | lhtml_attribute_t *attr = &attrs->data[i]; 93 | write(LHTML_STRING(" "), extra); 94 | if (attr->raw.has_value) { 95 | write(attr->raw.value, extra); 96 | } else { 97 | write(attr->name, extra); 98 | write(LHTML_STRING("=\""), extra); 99 | split_iterator_t iter = { 100 | .str = attr->value, 101 | .separator = '"' 102 | }; 103 | for(;;) { 104 | // escape double-quotes in attribute values by splitting 105 | // the string and emitting &quot; between chunks 106
| lhtml_string_t chunk = split_iterator_next(&iter); 107 | write(chunk, extra); 108 | if (iter.done) { 109 | // last chunk, no quote afterwards 110 | break; 111 | } 112 | write(LHTML_STRING("&quot;"), extra); 113 | } 114 | write(LHTML_STRING("\""), extra); 115 | } 116 | } 117 | if (token->start_tag.self_closing) { 118 | write(LHTML_STRING(" /"), extra); 119 | } 120 | write(LHTML_STRING(">"), extra); 121 | break; 122 | } 123 | 124 | case LHTML_TOKEN_END_TAG: { 125 | write(LHTML_STRING("</"), extra); 126 | write(token->end_tag.name, extra); 127 | write(LHTML_STRING(">"), extra); 128 | break; 129 | } 130 | 131 | case LHTML_TOKEN_CHARACTER: 132 | case LHTML_TOKEN_UNPARSED: 133 | case LHTML_TOKEN_ERROR: 134 | case LHTML_TOKEN_EOF: { 135 | // These tokens must have a raw value 136 | assert(false); 137 | } 138 | } 139 | } 140 | 141 | void lhtml_serializer_inject(lhtml_tokenizer_t *tokenizer, lhtml_serializer_t *state) { 142 | LHTML_ADD_HANDLER(tokenizer, state, serialize); 143 | } 144 | -------------------------------------------------------------------------------- /c/serializer.h: -------------------------------------------------------------------------------- 1 | #ifndef LHTML_SERIALIZER_H 2 | #define LHTML_SERIALIZER_H 3 | 4 | #include "tokenizer.h" 5 | 6 | typedef struct lhtml_serializer_state_s lhtml_serializer_t; 7 | 8 | typedef void (*lhtml_string_callback_t)(lhtml_string_t string, lhtml_serializer_t *extra); 9 | 10 | struct lhtml_serializer_state_s { 11 | lhtml_token_handler_t handler; // needs to be the first one 12 | lhtml_string_callback_t writer; 13 | }; 14 | 15 | __attribute__((nonnull)) 16 | void lhtml_serializer_inject(lhtml_tokenizer_t *tokenizer, lhtml_serializer_t *state); 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /c/tag-types.h: -------------------------------------------------------------------------------- 1 | #ifndef LHTML_TAG_TYPES_H 2 | #define LHTML_TAG_TYPES_H 3 | 4 | typedef enum { 5 | // Regular elements 6 | LHTML_TAG_A = 1, 7 | LHTML_TAG_ABBR = 34898, 8 | LHTML_TAG_ADDRESS = 1212749427, 9 | LHTML_TAG_AREA = 51361, 10 | LHTML_TAG_ARTICLE = 1698991493, 11 | LHTML_TAG_ASIDE = 1680517, 12 | LHTML_TAG_AUDIO = 1741103, 13 | LHTML_TAG_B = 2, 14 | LHTML_TAG_BASE = 67173, 15 | LHTML_TAG_BDI = 2185, 16 | LHTML_TAG_BDO = 2191, 17 | LHTML_TAG_BLOCKQUOTE = 84081888640645, 18 | LHTML_TAG_BODY = 81049, 19 | LHTML_TAG_BR = 82, 20 | LHTML_TAG_BUTTON = 89805294, 21 | LHTML_TAG_CANVAS = 102193203, 22 | LHTML_TAG_CAPTION = 3272222190, 23 | LHTML_TAG_CITE = 108165, 24 | LHTML_TAG_CODE = 113797, 25 | LHTML_TAG_COL = 3564, 26 | LHTML_TAG_COLGROUP = 119595941552, 27 | LHTML_TAG_DATA = 132737, 28 | LHTML_TAG_DATALIST = 139185235572, 29 | LHTML_TAG_DD = 132, 30 | LHTML_TAG_DEL = 4268, 31 | LHTML_TAG_DETAILS = 4483753363, 32 | LHTML_TAG_DFN = 4302, 33 | LHTML_TAG_DIALOG = 143700455, 34 | LHTML_TAG_DIV = 4406, 35 | LHTML_TAG_DL = 140, 36 | LHTML_TAG_DT = 148, 37 | LHTML_TAG_EM = 173, 38 | LHTML_TAG_EMBED = 5671076, 39 | LHTML_TAG_FIELDSET = 216002612404, 40 | LHTML_TAG_FIGCAPTION = 221245627573742, 41 | LHTML_TAG_FIGURE = 211015237, 42 | LHTML_TAG_FOOTER = 217567410, 43 | LHTML_TAG_FORM = 212557, 44 | LHTML_TAG_HEAD = 267300, 45 | LHTML_TAG_HEADER = 273715378, 46 | LHTML_TAG_HGROUP = 276381360, 47 | LHTML_TAG_HR = 274, 48 | LHTML_TAG_HTML = 283052, 49 | LHTML_TAG_I = 9, 50 | LHTML_TAG_IFRAME = 308872613, 51 | LHTML_TAG_IMG = 9639, 52 | LHTML_TAG_INPUT = 9913012, 53 | LHTML_TAG_INS = 9683, 54 | LHTML_TAG_KBD = 11332, 55 | LHTML_TAG_KEYGEN = 375168174, 56 | LHTML_TAG_LABEL
= 12617900, 57 | LHTML_TAG_LEGEND = 408131012, 58 | LHTML_TAG_LI = 393, 59 | LHTML_TAG_LINK = 402891, 60 | LHTML_TAG_MAIN = 427310, 61 | LHTML_TAG_MAP = 13360, 62 | LHTML_TAG_MARK = 427595, 63 | LHTML_TAG_MATH = 427656, 64 | LHTML_TAG_MENU = 431573, 65 | LHTML_TAG_MENUITEM = 452537405613, 66 | LHTML_TAG_META = 431745, 67 | LHTML_TAG_METER = 13815986, 68 | LHTML_TAG_NAV = 14390, 69 | LHTML_TAG_NOSCRIPT = 497783744020, 70 | LHTML_TAG_OBJECT = 505746548, 71 | LHTML_TAG_OL = 492, 72 | LHTML_TAG_OPTGROUP = 533254979248, 73 | LHTML_TAG_OPTION = 520758766, 74 | LHTML_TAG_OUTPUT = 526009012, 75 | LHTML_TAG_P = 16, 76 | LHTML_TAG_PARAM = 16828461, 77 | LHTML_TAG_PICTURE = 17485682245, 78 | LHTML_TAG_PRE = 16965, 79 | LHTML_TAG_PROGRESS = 569594418803, 80 | LHTML_TAG_Q = 17, 81 | LHTML_TAG_RP = 592, 82 | LHTML_TAG_RT = 596, 83 | LHTML_TAG_RUBY = 611417, 84 | LHTML_TAG_S = 19, 85 | LHTML_TAG_SAMP = 624048, 86 | LHTML_TAG_SCRIPT = 641279508, 87 | LHTML_TAG_SECTION = 20572677614, 88 | LHTML_TAG_SELECT = 643175540, 89 | LHTML_TAG_SLOT = 635380, 90 | LHTML_TAG_SMALL = 20350348, 91 | LHTML_TAG_SOURCE = 653969509, 92 | LHTML_TAG_SPAN = 639022, 93 | LHTML_TAG_STRONG = 659111367, 94 | LHTML_TAG_STYLE = 20604293, 95 | LHTML_TAG_SUB = 20130, 96 | LHTML_TAG_SUMMARY = 21119796825, 97 | LHTML_TAG_SUP = 20144, 98 | LHTML_TAG_SVG = 20167, 99 | LHTML_TAG_TABLE = 21006725, 100 | LHTML_TAG_TBODY = 21052569, 101 | LHTML_TAG_TD = 644, 102 | LHTML_TAG_TEMPLATE = 693016856197, 103 | LHTML_TAG_TEXTAREA = 693389805729, 104 | LHTML_TAG_TFOOT = 21183988, 105 | LHTML_TAG_TH = 648, 106 | LHTML_TAG_THEAD = 21238820, 107 | LHTML_TAG_TIME = 664997, 108 | LHTML_TAG_TITLE = 21287301, 109 | LHTML_TAG_TR = 658, 110 | LHTML_TAG_TRACK = 21562475, 111 | LHTML_TAG_U = 21, 112 | LHTML_TAG_UL = 684, 113 | LHTML_TAG_VAR = 22578, 114 | LHTML_TAG_VIDEO = 23367855, 115 | LHTML_TAG_WBR = 23634, 116 | 117 | // Obsolete elements 118 | LHTML_TAG_APPLET = 50868404, 119 | LHTML_TAG_ACRONYM = 1193786157, 120 | LHTML_TAG_BGSOUND = 2402801092, 121 | LHTML_TAG_DIR = 4402, 122 | LHTML_TAG_FRAME = 6882725, 123 | LHTML_TAG_FRAMESET = 225533152436, 124 | LHTML_TAG_NOFRAMES = 497362711731, 125 | LHTML_TAG_ISINDEX = 10311110840, 126 | LHTML_TAG_LISTING = 13207479751, 127 | LHTML_TAG_NEXTID = 475812132, 128 | LHTML_TAG_NOEMBED = 15541373092, 129 | LHTML_TAG_PLAINTEXT = 18005893977876, 130 | LHTML_TAG_RB = 578, 131 | LHTML_TAG_RTC = 19075, 132 | LHTML_TAG_STRIKE = 659105125, 133 | LHTML_TAG_XMP = 25008, 134 | LHTML_TAG_BASEFONT = 70436208084, 135 | LHTML_TAG_BIG = 2343, 136 | LHTML_TAG_BLINK = 2500043, 137 | LHTML_TAG_CENTER = 106385586, 138 | LHTML_TAG_FONT = 212436, 139 | LHTML_TAG_MARQUEE = 14011651237, 140 | LHTML_TAG_MULTICOL = 469649100268, 141 | LHTML_TAG_NOBR = 474194, 142 | LHTML_TAG_SPACER = 654347442, 143 | LHTML_TAG_TT = 660, 144 | LHTML_TAG_IMAGE = 9864421, 145 | 146 | // MathML text integration points 147 | LHTML_TAG_MI = 425, 148 | LHTML_TAG_MO = 431, 149 | LHTML_TAG_MN = 430, 150 | LHTML_TAG_MS = 435, 151 | LHTML_TAG_MTEXT = 14292756, 152 | 153 | // SVG HTML integration points 154 | LHTML_TAG_DESC = 136803, 155 | // LHTML_TAG_TITLE // already exists, 156 | LHTML_TAG_FOREIGNOBJECT = 7478413254770103412, 157 | } lhtml_tag_type_t; 158 | 159 | #endif -------------------------------------------------------------------------------- /c/tokenizer-states.rl: -------------------------------------------------------------------------------- 1 | #ifndef LHTML_TOKENIZER_STATES_H 2 | #define LHTML_TOKENIZER_STATES_H 3 | 4 | %%{ 5 | machine html; 6 | 7 | include 
'actions.rl'; 8 | include 'parse_errors.rl'; 9 | include '../syntax/index.rl'; 10 | }%% 11 | 12 | #pragma GCC diagnostic push 13 | #pragma GCC diagnostic ignored "-Wunused-variable" 14 | %%write data nofinal; 15 | #pragma GCC diagnostic pop 16 | 17 | #endif -------------------------------------------------------------------------------- /c/tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef LHTML_TOKENIZER_H 2 | #define LHTML_TOKENIZER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "tag-types.h" 8 | 9 | // gcc :( 10 | #ifdef __clang__ 11 | #define LHTML_IMMUTABLE const 12 | #else 13 | #define LHTML_IMMUTABLE 14 | #endif 15 | 16 | #define LHTML_BUFFER_T(ITEM_T) struct {\ 17 | ITEM_T *LHTML_IMMUTABLE data;\ 18 | LHTML_IMMUTABLE size_t capacity;\ 19 | } 20 | 21 | #define LHTML_LIST_T(BUFFER_T) struct {\ 22 | union {\ 23 | BUFFER_T buffer;\ 24 | LHTML_IMMUTABLE LHTML_BUFFER_T(__typeof__(((BUFFER_T *)0)->data[0]));\ 25 | };\ 26 | size_t length;\ 27 | } 28 | 29 | typedef struct { 30 | const char *data; 31 | size_t length; 32 | } lhtml_string_t; 33 | 34 | typedef LHTML_BUFFER_T(char) lhtml_char_buffer_t; 35 | 36 | typedef struct { 37 | bool has_value; 38 | lhtml_string_t value; 39 | } lhtml_opt_string_t; 40 | 41 | typedef enum { 42 | LHTML_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT, 43 | LHTML_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER, 44 | LHTML_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER, 45 | LHTML_ERR_CDATA_IN_HTML_CONTENT, 46 | LHTML_ERR_END_TAG_WITH_ATTRIBUTES, 47 | LHTML_ERR_DUPLICATE_ATTRIBUTE, 48 | LHTML_ERR_END_TAG_WITH_TRAILING_SOLIDUS, 49 | LHTML_ERR_EOF_BEFORE_TAG_NAME, 50 | LHTML_ERR_EOF_IN_CDATA, 51 | LHTML_ERR_EOF_IN_COMMENT, 52 | LHTML_ERR_EOF_IN_DOCTYPE, 53 | LHTML_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT, 54 | LHTML_ERR_EOF_IN_TAG, 55 | LHTML_ERR_INCORRECTLY_CLOSED_COMMENT, 56 | LHTML_ERR_INCORRECTLY_OPENED_COMMENT, 57 | LHTML_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME, 58 | LHTML_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME, 59 | LHTML_ERR_MISSING_ATTRIBUTE_VALUE, 60 | LHTML_ERR_MISSING_DOCTYPE_NAME, 61 | LHTML_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER, 62 | LHTML_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER, 63 | LHTML_ERR_MISSING_END_TAG_NAME, 64 | LHTML_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, 65 | LHTML_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, 66 | LHTML_ERR_MISSING_SPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD, 67 | LHTML_ERR_MISSING_SPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD, 68 | LHTML_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME, 69 | LHTML_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES, 70 | LHTML_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, 71 | LHTML_ERR_NESTED_COMMENT, 72 | LHTML_ERR_NON_VOID_HTML_START_TAG_WITH_TRAILING_SOLIDUS, 73 | LHTML_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER, 74 | LHTML_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME, 75 | LHTML_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE, 76 | LHTML_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME, 77 | LHTML_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME, 78 | LHTML_ERR_UNEXPECTED_SOLIDUS_IN_TAG 79 | } lhtml_parse_error_t; 80 | 81 | typedef enum { 82 | LHTML_TOKEN_ERROR, 83 | LHTML_TOKEN_UNPARSED, 84 | LHTML_TOKEN_CHARACTER, 85 | LHTML_TOKEN_COMMENT, 86 | LHTML_TOKEN_START_TAG, 87 | LHTML_TOKEN_END_TAG, 88 | LHTML_TOKEN_DOCTYPE, 89 | LHTML_TOKEN_EOF, 90 | LHTML_TOKEN_CDATA_START, 91 | LHTML_TOKEN_CDATA_END 92 | } lhtml_token_type_t; 93 | 94 | typedef struct { 95 | lhtml_string_t value; 96 | } lhtml_token_comment_t; 97 | 98 | 
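// a single parsed attribute: name / value slices pointing into the tokenizer buffer, plus the raw source slice (reused verbatim by the serializer when the attribute wasn't modified)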
typedef struct { 99 | lhtml_string_t name; 100 | lhtml_string_t value; 101 | 102 | lhtml_opt_string_t raw; 103 | } lhtml_attribute_t; 104 | 105 | typedef LHTML_BUFFER_T(lhtml_attribute_t) lhtml_attr_buffer_t; 106 | typedef LHTML_LIST_T(lhtml_attr_buffer_t) lhtml_attributes_t; 107 | 108 | typedef struct { 109 | lhtml_string_t name; 110 | lhtml_tag_type_t type; 111 | lhtml_attributes_t attributes; 112 | bool self_closing; 113 | } lhtml_token_starttag_t; 114 | 115 | typedef struct { 116 | lhtml_string_t name; 117 | lhtml_tag_type_t type; 118 | } lhtml_token_endtag_t; 119 | 120 | typedef struct { 121 | lhtml_opt_string_t name; 122 | lhtml_opt_string_t public_id; 123 | lhtml_opt_string_t system_id; 124 | bool force_quirks; 125 | } lhtml_token_doctype_t; 126 | 127 | typedef struct { 128 | lhtml_token_type_t type; 129 | union { 130 | lhtml_token_comment_t comment; 131 | lhtml_token_starttag_t start_tag; 132 | lhtml_token_endtag_t end_tag; 133 | lhtml_token_doctype_t doctype; 134 | }; 135 | lhtml_opt_string_t raw; 136 | uint64_t parse_errors; 137 | } lhtml_token_t; 138 | 139 | #define LHTML_TOKEN_CALLBACK_T(NAME, T) void (*NAME)(lhtml_token_t *token, T *extra) 140 | 141 | typedef __attribute__((nonnull(1))) LHTML_TOKEN_CALLBACK_T(lhtml_token_callback_t, void); 142 | 143 | typedef struct lhtml_token_handler_s lhtml_token_handler_t; 144 | 145 | struct lhtml_token_handler_s { 146 | lhtml_token_callback_t callback; 147 | lhtml_token_handler_t *next; 148 | }; 149 | 150 | ///
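/// Tokenizer state: the handler chain, the preallocated character / attribute buffers and the current Ragel machine state (cs) all live in this struct.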
151 | typedef struct { 152 | lhtml_token_handler_t base_handler; // needs to be the first one 153 | 154 | bool allow_cdata; 155 | bool unsafe_null; 156 | bool entities; 157 | char quote; 158 | int cs; 159 | lhtml_tag_type_t last_start_tag_type; 160 | lhtml_char_buffer_t buffer; 161 | lhtml_attr_buffer_t attr_buffer; 162 | 163 | uint64_t special_end_tag_type; 164 | lhtml_token_t token; 165 | const char *slice_start; 166 | const char *mark; 167 | char *buffer_pos; 168 | bool current_attr_is_unique; 169 | } lhtml_tokenizer_t; 170 | 171 | __attribute__((nonnull)) 172 | void lhtml_init(lhtml_tokenizer_t *state); 173 | 174 | __attribute__((nonnull)) 175 | void lhtml_append_handlers(lhtml_token_handler_t *dest, lhtml_token_handler_t *src); 176 | 177 | __attribute__((nonnull)) 178 | void lhtml_emit(lhtml_token_t *token, void *extra); 179 | 180 | __attribute__((warn_unused_result, nonnull(1))) 181 | bool lhtml_feed(lhtml_tokenizer_t *state, const lhtml_string_t *chunk); 182 | 183 | __attribute__((pure, warn_unused_result)) 184 | bool lhtml_str_nocase_equals(const lhtml_string_t actual, const lhtml_string_t expected); 185 | 186 | __attribute__((pure, warn_unused_result)) 187 | lhtml_tag_type_t lhtml_get_tag_type(const lhtml_string_t name); 188 | 189 | __attribute__((nonnull, pure, warn_unused_result)) 190 | lhtml_attribute_t *lhtml_find_attr(lhtml_attributes_t *attrs, const lhtml_string_t name); 191 | 192 | __attribute__((nonnull, warn_unused_result)) 193 | lhtml_attribute_t *lhtml_create_attr(lhtml_attributes_t *attrs); 194 | 195 | #define LHTML_STRING(str) ((lhtml_string_t) { .data = str, .length = sizeof(str) - 1 }) 196 | 197 | #define LHTML_STR_EQUALS(actual, expected) ({\ 198 | lhtml_string_t _actual = (actual);\ 199 | lhtml_string_t _expected = LHTML_STRING(expected);\ 200 | _actual.length == _expected.length && memcmp(_actual.data, _expected.data, _expected.length) == 0;\ 201 | }) 202 | 203 | #define LHTML_STR_NOCASE_EQUALS(actual, expected) lhtml_str_nocase_equals(actual, LHTML_STRING(expected)) 204 | 205 | #define LHTML_FIND_ATTR(attrs, name) lhtml_find_attr(attrs, LHTML_STRING(name)) 206 | 207 | #define LHTML_INIT_HANDLER(state, cb) {\ 208 | _Static_assert(offsetof(__typeof__(*(state)), handler) == 0, ".handler is the first item in the state");\ 209 | LHTML_TOKEN_CALLBACK_T(_cb, __typeof__(*(state))) = (cb);\ 210 | (state)->handler = (lhtml_token_handler_t) { .callback = (lhtml_token_callback_t) _cb };\ 211 | } 212 | 213 | #define LHTML_ADD_HANDLER(tokenizer, state, cb) {\ 214 | __typeof__((state)) _state = (state);\ 215 | LHTML_INIT_HANDLER(_state, (cb));\ 216 | lhtml_append_handlers(&(tokenizer)->base_handler, &_state->handler);\ 217 | } 218 | 219 | #endif 220 | -------------------------------------------------------------------------------- /c/tokenizer.rl: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "tokenizer.h" 4 | #include "field-names.h" 5 | // #include "$OUT/tokenizer-states.h" - included with command option to respect env var 6 | 7 | %%{ 8 | machine html; 9 | 10 | include 'actions.rl'; 11 | include 'parse_errors.rl'; 12 | include '../syntax/index.rl'; 13 | 14 | access state->; 15 | }%% 16 | 17 | #define GET_TOKEN(TYPE) (assert(token->type == LHTML_TOKEN_##TYPE), &token->LHTML_FIELD_NAME_##TYPE) 18 | #define TO_LOWER(c) (c | ((unsigned char) (c - 'A') < 26) << 5) // tolower that vectorizes 19 | 20 | #define CREATE_TOKEN(TYPE, VALUE) {\ 21 | token->type = LHTML_TOKEN_##TYPE;\ 22 | 
token->LHTML_FIELD_NAME_##TYPE = (__typeof__(token->LHTML_FIELD_NAME_##TYPE)) VALUE;\ 23 | } 24 | 25 | #define HELPER(...) __attribute__((always_inline, __VA_ARGS__)) inline static 26 | 27 | HELPER(nonnull) 28 | lhtml_string_t range_string(const char *begin, const char *end) { 29 | assert(end >= begin); 30 | return (lhtml_string_t) { 31 | .data = begin, 32 | .length = (size_t) (end - begin) 33 | }; 34 | } 35 | 36 | HELPER(nonnull) 37 | lhtml_opt_string_t opt_range_string(const char *begin, const char *end) { 38 | return (lhtml_opt_string_t) { 39 | .has_value = true, 40 | .value = range_string(begin, end) 41 | }; 42 | } 43 | 44 | HELPER(const, warn_unused_result) 45 | uint64_t tag_type_append_char(uint64_t *code, char c) { 46 | // protect against overflow 47 | if (*code >> (64 - 5)) { 48 | return *code = 0; 49 | } 50 | 51 | if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { 52 | return *code = (*code << 5) | (c & 31); 53 | } else { 54 | return *code = 0; 55 | } 56 | } 57 | 58 | __attribute__((always_inline)) 59 | inline lhtml_tag_type_t lhtml_get_tag_type(const lhtml_string_t name) { 60 | uint64_t code = 0; 61 | 62 | for (size_t i = 0; i < name.length; i++) { 63 | if (!tag_type_append_char(&code, name.data[i])) { 64 | break; 65 | } 66 | } 67 | 68 | return code; 69 | } 70 | 71 | HELPER(nonnull) 72 | void emit_token(lhtml_tokenizer_t *state, const char *end) { 73 | lhtml_token_t *token = &state->token; 74 | token->raw.value.length = (size_t) (end - token->raw.value.data); 75 | if (token->raw.value.length) { 76 | token->raw.has_value = true; 77 | lhtml_emit(token, &state->base_handler); 78 | token->parse_errors = 0; 79 | } 80 | token->type = LHTML_TOKEN_ERROR; 81 | token->raw.value.data = end; 82 | token->raw.value.length = 0; 83 | } 84 | 85 | HELPER(nonnull) 86 | bool already_errored(lhtml_tokenizer_t *state, lhtml_string_t unprocessed) { 87 | if (unprocessed.length > 0) { 88 | lhtml_token_t *token = &state->token; 89 | token->type = LHTML_TOKEN_ERROR; 90 | token->raw.value = unprocessed; 91 | token->raw.has_value = true; 92 | lhtml_emit(token, &state->base_handler); 93 | } 94 | return false; 95 | } 96 | 97 | HELPER(nonnull) 98 | bool emit_error(lhtml_tokenizer_t *state, lhtml_string_t unprocessed) { 99 | state->token.type = LHTML_TOKEN_ERROR; 100 | emit_token(state, state->buffer_pos); 101 | return already_errored(state, unprocessed); 102 | } 103 | 104 | HELPER(nonnull) 105 | void emit_slice(lhtml_tokenizer_t *state, const char *p) { 106 | assert(state->token.type == LHTML_TOKEN_CHARACTER); 107 | assert(state->slice_start == state->token.raw.value.data); 108 | const char *slice_end = state->mark != NULL ? 
state->mark : p; 109 | emit_token(state, slice_end); 110 | } 111 | 112 | HELPER(nonnull) 113 | void emit_eof(lhtml_tokenizer_t *state) { 114 | lhtml_token_t *token = &state->token; 115 | token->type = LHTML_TOKEN_EOF; 116 | token->raw.has_value = true; 117 | lhtml_emit(token, &state->base_handler); 118 | } 119 | 120 | HELPER(nonnull) 121 | void parse_error(lhtml_tokenizer_t *state, lhtml_parse_error_t err) { 122 | state->token.parse_errors |= 1ULL << err; 123 | } 124 | 125 | inline bool lhtml_has_parse_error(lhtml_token_t *token, lhtml_parse_error_t err) { 126 | return token->parse_errors & (1ULL << err); 127 | } 128 | 129 | void lhtml_emit(lhtml_token_t *token, void *extra) { 130 | lhtml_token_handler_t *handler = ((lhtml_token_handler_t *) extra)->next; 131 | if (handler != NULL) { 132 | handler->callback(token, handler); 133 | } 134 | } 135 | 136 | inline bool lhtml_str_nocase_equals(const lhtml_string_t actual, const lhtml_string_t expected) { 137 | size_t length = expected.length; 138 | 139 | if (actual.length != length) { 140 | return false; 141 | } 142 | 143 | for (size_t i = 0; i < length; i++) { 144 | if (TO_LOWER(actual.data[i]) != TO_LOWER(expected.data[i])) { 145 | return false; 146 | } 147 | } 148 | 149 | return true; 150 | } 151 | 152 | lhtml_attribute_t *lhtml_find_attr(lhtml_attributes_t *attrs, const lhtml_string_t name) { 153 | size_t count = attrs->length; 154 | lhtml_attribute_t *items = attrs->data; 155 | for (size_t i = 0; i < count; i++) { 156 | lhtml_attribute_t *attr = &items[i]; 157 | if (lhtml_str_nocase_equals(attr->name, name)) { 158 | return attr; 159 | } 160 | } 161 | return NULL; 162 | } 163 | 164 | HELPER(nonnull) 165 | bool can_create_attr(lhtml_attributes_t *attrs) { 166 | return attrs->length < attrs->capacity; 167 | } 168 | 169 | inline lhtml_attribute_t *lhtml_create_attr(lhtml_attributes_t *attrs) { 170 | return can_create_attr(attrs) ? &attrs->data[attrs->length++] : NULL; 171 | } 172 | 173 | void lhtml_init(lhtml_tokenizer_t *state) { 174 | %%write init nocs; 175 | 176 | if (state->cs == 0) { 177 | state->cs = html_en_Data; 178 | } 179 | 180 | state->buffer_pos = state->buffer.data; 181 | } 182 | 183 | void lhtml_append_handlers(lhtml_token_handler_t *dest, lhtml_token_handler_t *src) { 184 | while (dest->next != NULL) { 185 | dest = dest->next; 186 | } 187 | dest->next = src; 188 | } 189 | 190 | bool lhtml_feed(lhtml_tokenizer_t *state, const lhtml_string_t *chunk) { 191 | lhtml_token_t *const token = &state->token; 192 | 193 | if (token->type == LHTML_TOKEN_EOF) { 194 | // if already saw an EOF, ignore any further input 195 | return false; 196 | } 197 | 198 | if (state->cs == html_error) { 199 | if (chunk != NULL) { 200 | return already_errored(state, *chunk); 201 | } else { 202 | token->raw.value.length = 0; 203 | emit_eof(state); 204 | return false; 205 | } 206 | } 207 | 208 | lhtml_string_t unprocessed = chunk != NULL ? 
*chunk : LHTML_STRING(""); 209 | 210 | do { 211 | token->raw.value.data = state->buffer.data; 212 | 213 | size_t available_space = (size_t) (state->buffer.data + state->buffer.capacity - state->buffer_pos); 214 | 215 | if (unprocessed.length <= available_space) { 216 | available_space = unprocessed.length; 217 | } else if (available_space == 0) { 218 | state->cs = html_error; 219 | return emit_error(state, unprocessed); 220 | } 221 | 222 | const char *p = state->buffer_pos; 223 | 224 | if (available_space > 0) { 225 | memcpy(state->buffer_pos, unprocessed.data, available_space); 226 | state->buffer_pos += available_space; 227 | unprocessed.data += available_space; 228 | unprocessed.length -= available_space; 229 | } 230 | 231 | const char *const pe = state->buffer_pos; 232 | const char *const eof = chunk == NULL ? pe : NULL; 233 | 234 | #pragma GCC diagnostic push 235 | #pragma GCC diagnostic ignored "-Wimplicit-fallthrough" 236 | %%write exec; 237 | #pragma GCC diagnostic pop 238 | 239 | if (state->cs == html_error) { 240 | return emit_error(state, unprocessed); 241 | } 242 | 243 | if (chunk == NULL) { 244 | token->raw.value.length = (size_t) (pe - token->raw.value.data); 245 | emit_eof(state); 246 | return true; 247 | } 248 | 249 | if (token->type == LHTML_TOKEN_CHARACTER) { 250 | emit_slice(state, pe); 251 | token->type = LHTML_TOKEN_CHARACTER; 252 | state->slice_start = token->raw.value.data; 253 | } 254 | 255 | size_t shift = (size_t) (token->raw.value.data - state->buffer.data); 256 | 257 | if (shift != 0) { 258 | switch (token->type) { 259 | case LHTML_TOKEN_COMMENT: { 260 | token->comment.value.data -= shift; 261 | break; 262 | } 263 | 264 | case LHTML_TOKEN_DOCTYPE: { 265 | token->doctype.name.value.data -= shift; 266 | token->doctype.public_id.value.data -= shift; 267 | token->doctype.system_id.value.data -= shift; 268 | break; 269 | } 270 | 271 | case LHTML_TOKEN_END_TAG: { 272 | token->end_tag.name.data -= shift; 273 | break; 274 | } 275 | 276 | case LHTML_TOKEN_START_TAG: { 277 | token->start_tag.name.data -= shift; 278 | lhtml_attributes_t *attrs = &token->start_tag.attributes; 279 | for (size_t i = 0; i < attrs->length; i++) { 280 | lhtml_attribute_t *attr = &attrs->data[i]; 281 | attr->name.data -= shift; 282 | attr->value.data -= shift; 283 | attr->raw.value.data -= shift; 284 | } 285 | break; 286 | } 287 | 288 | default: { 289 | break; 290 | } 291 | } 292 | 293 | memmove(state->buffer.data, token->raw.value.data, (size_t) (state->buffer_pos - token->raw.value.data)); 294 | state->buffer_pos -= shift; 295 | state->slice_start -= shift; 296 | 297 | if (state->mark != NULL) { 298 | state->mark -= shift; 299 | } 300 | } 301 | } while (unprocessed.length > 0); 302 | 303 | return true; 304 | } 305 | -------------------------------------------------------------------------------- /cfsetup.yaml: -------------------------------------------------------------------------------- 1 | everything: &everything 2 | build: 3 | builddeps: 4 | - build-essential 5 | - ragel 6 | post-cache: 7 | - make -C c lib 8 | test: 9 | builddeps: 10 | - ragel 11 | - rust 12 | - clang 13 | post-cache: 14 | - cd rust 15 | - cargo test 16 | bamboo-test: 17 | builddeps: 18 | - ragel 19 | - rust 20 | - clang 21 | - python 22 | post-cache: 23 | - cd rust 24 | - cargo test --no-run # print compilation failures if any 25 | - RUST_TEST_THREADS=1 cargo test -q --test test -- --logfile tests.log 2>failures.log; ../convert-test-log.py 26 | squeeze: *everything 27 | jessie: *everything 28 | stretch: *everything 29 | 
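The pieces above come together as follows. This is a minimal editorial sketch, not a file from the repository: the handler struct, buffer sizes, and the tag counting are illustrative assumptions, while the calls themselves (lhtml_init, lhtml_feed, LHTML_ADD_HANDLER, lhtml_emit) are the ones declared in c/tokenizer.h. Note how a NULL chunk finalizes the stream, mirroring the chunk == NULL branches of lhtml_feed above.

```c
#include <stdio.h>
#include "tokenizer.h"

// Handler state: the lhtml_token_handler_t member must be named `handler`
// and come first, as enforced by the _Static_assert in LHTML_INIT_HANDLER.
typedef struct {
    lhtml_token_handler_t handler;
    size_t start_tags;
} my_counter_t;

static void count_token(lhtml_token_t *token, my_counter_t *state) {
    if (token->type == LHTML_TOKEN_START_TAG) {
        state->start_tags++;
    }
    lhtml_emit(token, state); // forward the token to the next handler, if any
}

int main(void) {
    // Backing storage for the tokenizer; sizes are arbitrary for this sketch.
    static char char_buf[2048];
    static lhtml_attribute_t attr_buf[256];

    lhtml_tokenizer_t tokenizer = {
        .buffer = { .data = char_buf, .capacity = sizeof(char_buf) },
        .attr_buffer = { .data = attr_buf, .capacity = 256 },
    };
    lhtml_init(&tokenizer);

    my_counter_t counter = { .start_tags = 0 };
    LHTML_ADD_HANDLER(&tokenizer, &counter, count_token);

    // Input may arrive in arbitrarily sized chunks...
    lhtml_string_t chunk = LHTML_STRING("<div class=x>hello</div>");
    if (!lhtml_feed(&tokenizer, &chunk)) {
        return 1; // e.g. a token didn't fit into the character buffer
    }
    // ...and a NULL chunk signals end of input (emits the EOF token).
    if (!lhtml_feed(&tokenizer, NULL)) {
        return 1;
    }

    printf("start tags: %zu\n", counter.start_tags);
    return 0;
}
```

Because the token raw data lives in the tokenizer-owned buffer, an undersized buffer makes lhtml_feed return false once a single token no longer fits, which is why the return value is marked warn_unused_result.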
--------------------------------------------------------------------------------
/convert-test-log.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import xml.etree.cElementTree as ET
3 | import re
4 | 
5 | failure_re = re.compile(r"""thread '(.*?)' panicked at '(.*?)', tests/test\.rs:\d+:\d+
6 | (?:note: Run with `RUST_BACKTRACE=1` for a backtrace.
7 | )?""", re.DOTALL)
8 | 
9 | tests = open('tests.log', 'r').read().rstrip().split('\n')
10 | failures = open('failures.log', 'r').read()
11 | failures_pos = 0
12 | 
13 | root = ET.Element('testsuite', name='html5lib-tests', tests=str(len(tests)))
14 | 
15 | for test in tests:
16 |     (status, name) = test.split(' ', 1)
17 |     case = ET.SubElement(root, 'testcase', name=name)
18 |     if status == 'failed':
19 |         match = failure_re.match(failures, failures_pos)
20 |         assert match is not None, "Could not parse %r" % failures[failures_pos:].split('\n', 1)[0]
21 |         failure_name, details = match.groups()
22 |         assert name == failure_name, "Could not find failure message for %s" % name
23 |         ET.SubElement(case, 'failure').text = details
24 |         failures_pos = match.end()
25 |     elif status == 'ignored':
26 |         ET.SubElement(case, 'skipped')
27 |     else:
28 |         assert status == 'ok', 'Unknown test status: %s' % status
29 | 
30 | ET.ElementTree(root).write('tests.xml')
--------------------------------------------------------------------------------
/error-with-feedback-tests/trailing-solidus.test:
--------------------------------------------------------------------------------
1 | {
2 |   "tests": [
3 |     {
4 |       "description": "Non-void HTML element with trailing solidus",
5 |       "input": "<div/>",
6 |       "output": [["StartTag", "div", {}, true]],
7 |       "errors": [
8 |         {
9 |           "code": "non-void-html-element-start-tag-with-trailing-solidus",
10 |           "line": 1,
11 |           "col": 6
12 |         }
13 |       ]
14 |     },
15 |     {
16 |       "description":
17 |         "Non-void HTML element with trailing solidus in foreign content",
18 |       "input": "<svg><div/></svg>",
19 |       "output": [
20 |         ["StartTag", "svg", {}],
21 |         ["StartTag", "div", {}, true],
22 |         ["EndTag", "svg"]
23 |       ],
24 |       "errors": [
25 |         {
26 |           "code": "non-void-html-element-start-tag-with-trailing-solidus",
27 |           "line": 1,
28 |           "col": 11
29 |         }
30 |       ]
31 |     }
32 |   ]
33 | }
34 | 
--------------------------------------------------------------------------------
/images/language-specific-actions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudflare/lazyhtml/05b4b877400796ca3fbba3b9ec84005688200db3/images/language-specific-actions.png
--------------------------------------------------------------------------------
/images/perf-comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudflare/lazyhtml/05b4b877400796ca3fbba3b9ec84005688200db3/images/perf-comparison.png
--------------------------------------------------------------------------------
/images/ragel-visualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudflare/lazyhtml/05b4b877400796ca3fbba3b9ec84005688200db3/images/ragel-visualization.png
--------------------------------------------------------------------------------
/images/syntax-description.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudflare/lazyhtml/05b4b877400796ca3fbba3b9ec84005688200db3/images/syntax-description.png
--------------------------------------------------------------------------------
/images/syntax-files.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cloudflare/lazyhtml/05b4b877400796ca3fbba3b9ec84005688200db3/images/syntax-files.png
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "lazyhtml-scripts",
3 |   "private": true,
4 |   "version": "1.0.0",
5 |   "description": "Helper scripts for lazyhtml",
6 |   "author": "Ingvar Stepanyan <me@rreverser.com> (https://rreverser.com/)",
7 |   "dependencies": {
8 |     "graphlib-dot": "^0.6.2"
9 |   }
10 | }
--------------------------------------------------------------------------------
/rust/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 | 
3 | [*]
4 | indent_style = space
5 | indent_size = 4
6 | end_of_line = lf
7 | charset = utf-8
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
--------------------------------------------------------------------------------
/rust/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Cargo
2 | # will have compiled files and executables
3 | /target/
4 | 
5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
6 | # More information here http://doc.crates.io/guide.html#cargotoml-vs-cargolock
7 | Cargo.lock
8 | 
9 | # These are backup files generated by rustfmt
10 | **/*.rs.bk
--------------------------------------------------------------------------------
/rust/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | authors = ["Ingvar Stepanyan <me@rreverser.com>"]
3 | name = "lazyhtml"
4 | version = "0.1.0"
5 | publish = false
6 | autotests = false
7 | 
8 | [[bench]]
9 | 
harness = false
10 | name = "bench"
11 | 
12 | [[test]]
13 | harness = false
14 | name = "test"
15 | 
16 | [dependencies.lazyhtml-sys]
17 | path = "lazyhtml-sys"
18 | version = "0.1.0"
19 | 
20 | [dev-dependencies]
21 | getopts = "0.2.15"
22 | glob = "0.3.0"
23 | html5ever = "0.23.0"
24 | serde = { version = "1.0.19", features = ["derive"] }
25 | serde_json = "1.0.5"
26 | rustc-test = "0.3.0"
27 | 
28 | [workspace]
29 | 
--------------------------------------------------------------------------------
/rust/benches/bench.rs:
--------------------------------------------------------------------------------
1 | extern crate glob;
2 | extern crate html5ever;
3 | extern crate lazyhtml;
4 | extern crate rustc_test as test;
5 | 
6 | use lazyhtml::*;
7 | use test::black_box;
8 | use std::ptr::null_mut;
9 | use test::Bencher;
10 | use std::os::raw::c_void;
11 | use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer,
12 |                            TokenizerOpts, TokenizerResult};
13 | use html5ever::tendril::StrTendril;
14 | use test::{test_main, ShouldPanic, TDynBenchFn, TestDesc, TestDescAndFn, TestFn, TestName};
15 | use std::fs::File;
16 | use std::io::Read;
17 | 
18 | unsafe extern "C" fn handle_token(token: *mut lhtml_token_t, _state: *mut c_void) {
19 |     black_box(*token);
20 | }
21 | 
22 | const CHUNK_SIZE: usize = 1024;
23 | 
24 | fn string_chunks(mut s: &str) -> Vec<String> {
25 |     let mut result = Vec::with_capacity((s.len() / CHUNK_SIZE) + 1);
26 | 
27 |     while !s.is_empty() {
28 |         let mut offset = CHUNK_SIZE;
29 | 
30 |         if offset < s.len() {
31 |             while !s.is_char_boundary(offset) {
32 |                 offset += 1;
33 |             }
34 |         } else {
35 |             offset = s.len();
36 |         }
37 | 
38 |         let (before, after) = s.split_at(offset);
39 | 
40 |         result.push(before.to_owned());
41 | 
42 |         s = after;
43 |     }
44 | 
45 |     result
46 | }
47 | 
48 | fn bench_lhtml_tokenizer(chunks: &[String]) {
49 |     let mut bench_handler = lhtml_token_handler_t {
50 |         callback: Some(handle_token),
51 |         next: null_mut(),
52 |     };
53 | 
54 |     let mut tokenizer = lazyhtml::Tokenizer::new(100 << 10, 256);
55 | 
56 |     bench_handler.inject_into(&mut tokenizer);
57 | 
58 |     for chunk in chunks {
59 |         tokenizer.feed(chunk).expect("Could not feed input chunk");
60 |     }
61 | 
62 |     tokenizer.end().expect("Could not finalize input");
63 | }
64 | 
65 | struct Sink;
66 | 
67 | impl TokenSink for Sink {
68 |     type Handle = ();
69 | 
70 |     fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
71 |         black_box(token);
72 |         TokenSinkResult::Continue
73 |     }
74 | }
75 | 
76 | fn bench_html5ever_tokenizer(chunks: &[String]) {
77 |     let mut tokenizer = Tokenizer::new(Sink, TokenizerOpts::default());
78 | 
79 |     let mut queue = BufferQueue::new();
80 | 
81 |     for chunk in chunks {
82 |         queue.push_back(StrTendril::from_slice(chunk));
83 | 
84 |         while let TokenizerResult::Script(_) = tokenizer.feed(&mut queue) {
85 |             // ignore script markers
86 |         }
87 |     }
88 | 
89 |     tokenizer.end();
90 | }
91 | 
92 | struct Bench {
93 |     func: fn(&[String]),
94 |     chunks: Vec<String>,
95 | }
96 | 
97 | impl TDynBenchFn for Bench {
98 |     fn run(&self, b: &mut Bencher) {
99 |         b.iter(|| {
100 |             (self.func)(&self.chunks);
101 |         });
102 |     }
103 | }
104 | 
105 | fn main() {
106 |     let args: Vec<_> = ::std::env::args().collect();
107 | 
108 |     let fixtures: Vec<_> = glob::glob("../bench-fixtures/*.html")
109 |         .unwrap()
110 |         .map(|path| path.unwrap())
111 |         .collect();
112 | 
113 |     let funcs: [(&str, fn(&[String])); 2] = [
114 |         ("bench_lhtml_tokenizer", bench_lhtml_tokenizer),
115 |         ("bench_html5ever_tokenizer",
bench_html5ever_tokenizer), 116 | ]; 117 | 118 | let mut tests = Vec::with_capacity(fixtures.len() * funcs.len()); 119 | 120 | for path in fixtures { 121 | let mut input = String::new(); 122 | File::open(&path) 123 | .unwrap() 124 | .read_to_string(&mut input) 125 | .unwrap(); 126 | 127 | let input_name = path.file_name().unwrap().to_str().unwrap(); 128 | 129 | let chunks = string_chunks(&input); 130 | 131 | for &(func_name, func) in &funcs { 132 | tests.push(TestDescAndFn { 133 | desc: TestDesc { 134 | name: TestName::DynTestName(format!("{} x {}", func_name, input_name)), 135 | ignore: false, 136 | should_panic: ShouldPanic::No, 137 | allow_fail: false, 138 | }, 139 | testfn: TestFn::DynBenchFn(Box::new(Bench { 140 | func, 141 | chunks: chunks.clone(), 142 | })), 143 | }); 144 | } 145 | } 146 | 147 | test_main(&args, tests); 148 | } 149 | -------------------------------------------------------------------------------- /rust/examples/trace.rs: -------------------------------------------------------------------------------- 1 | extern crate getopts; 2 | extern crate lazyhtml; 3 | 4 | use std::ptr::null_mut; 5 | use lazyhtml::*; 6 | use std::os::raw::c_void; 7 | use getopts::Options; 8 | use std::env::args; 9 | 10 | struct HandlerState { 11 | handler: lhtml_token_handler_t, 12 | } 13 | 14 | impl HandlerState { 15 | pub fn new() -> Self { 16 | HandlerState { 17 | handler: lhtml_token_handler_t { 18 | callback: Some(Self::callback), 19 | next: null_mut(), 20 | }, 21 | } 22 | } 23 | 24 | unsafe extern "C" fn callback(token: *mut lhtml_token_t, extra: *mut c_void) { 25 | println!("{:#?}", *token); 26 | lhtml_emit(token, extra); 27 | } 28 | } 29 | 30 | fn main() { 31 | let mut opts = Options::new(); 32 | 33 | opts.optflag("f", "feedback", "Enable parser feedback"); 34 | opts.optopt( 35 | "s", 36 | "state", 37 | "Initial state", 38 | "-s (Data|PlainText|RCData|RawText|ScriptData|CDataSection)", 39 | ); 40 | opts.optflag("h", "help", "Show this help"); 41 | 42 | let matches = match opts.parse(args().skip(1)) { 43 | Ok(matches) => if matches.free.is_empty() { 44 | eprintln!("Missing HTML input"); 45 | None 46 | } else if matches.opt_present("h") { 47 | None 48 | } else { 49 | Some(matches) 50 | }, 51 | Err(e) => { 52 | eprintln!("{}", e); 53 | None 54 | } 55 | }; 56 | 57 | let matches = match matches { 58 | Some(m) => m, 59 | None => { 60 | eprintln!("{}", opts.usage("Usage: trace [options] INPUT")); 61 | return; 62 | } 63 | }; 64 | 65 | let initial_state = match matches.opt_str("s").as_ref().map(|s| s.as_str()) { 66 | None | Some("Data") => html_en_Data, 67 | Some("PlainText") => html_en_PlainText, 68 | Some("RCData") => html_en_RCData, 69 | Some("RawText") => html_en_RawText, 70 | Some("ScriptData") => html_en_ScriptData, 71 | Some("CDataSection") => html_en_CDataSection, 72 | _ => { 73 | eprintln!("Unknown state, defaulting to Data"); 74 | html_en_Data 75 | } 76 | }; 77 | 78 | let with_feedback = matches.opt_present("f"); 79 | 80 | let input = matches.free.first().unwrap(); 81 | 82 | let mut test_state = HandlerState::new(); 83 | 84 | let mut feedback; 85 | 86 | let mut tokenizer = Tokenizer::new(2048, 256); 87 | 88 | unsafe { 89 | tokenizer.set_cs(initial_state); 90 | } 91 | 92 | if with_feedback { 93 | feedback = Feedback::new(64); 94 | feedback.inject_into(&mut tokenizer); 95 | } 96 | 97 | test_state.handler.inject_into(&mut tokenizer); 98 | 99 | tokenizer.feed(input).expect("Could not feed input"); 100 | tokenizer.end().expect("Could not finalize input"); 101 | } 102 | 
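The trace example above and the C API both hinge on the same chaining mechanism: lhtml_append_handlers links each handler onto the end of a singly linked list behind base_handler (the Rust inject_into methods are thin wrappers over this), and every callback decides whether to pass the token downstream with lhtml_emit. Below is a hedged C sketch of a rewriting stage built on that idea; the names are illustrative and not from the repository.

```c
#include "tokenizer.h"

typedef struct {
    lhtml_token_handler_t handler; // must be first, see LHTML_INIT_HANDLER
} strip_comments_t;

// Forward every token except comments; downstream handlers (for example a
// serializer) simply never see what this stage declines to emit.
static void strip_comments_cb(lhtml_token_t *token, strip_comments_t *state) {
    if (token->type != LHTML_TOKEN_COMMENT) {
        lhtml_emit(token, state);
    }
}

// Setup, inside whatever function owns the tokenizer:
//     strip_comments_t strip;
//     LHTML_ADD_HANDLER(&tokenizer, &strip, strip_comments_cb);
```

Since lhtml_append_handlers walks to the end of the list before attaching, handlers run in the order they were added: anything registered after this stage only receives the tokens it chose to forward.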
--------------------------------------------------------------------------------
/rust/lazyhtml-sys/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | authors = ["Ingvar Stepanyan <me@rreverser.com>"]
3 | name = "lazyhtml-sys"
4 | version = "0.1.0"
5 | 
6 | [build-dependencies]
7 | bindgen = "0.31.3"
8 | glob = "0.2.11"
--------------------------------------------------------------------------------
/rust/lazyhtml-sys/build.rs:
--------------------------------------------------------------------------------
1 | extern crate bindgen;
2 | extern crate glob;
3 | 
4 | use std::env;
5 | use std::path::PathBuf;
6 | use std::process::Command;
7 | use glob::glob;
8 | 
9 | const IMPLICIT_DEPS: &[&str] = &[
10 |     "../../c/tokenizer-states.rl",
11 |     "../../c/actions.rl",
12 |     "../../c/field-names.h",
13 |     "../../c/tag-types.h",
14 |     "../../c/tokenizer.*",
15 |     "../../c/parser-feedback.*",
16 |     "../../c/serializer.*",
17 |     "../../syntax/*.rl",
18 | ];
19 | 
20 | fn main() {
21 |     let out_dir = env::var("OUT_DIR").unwrap();
22 |     let out_path = PathBuf::from(&out_dir);
23 | 
24 |     assert!(
25 |         Command::new("make")
26 |             .current_dir("../../c")
27 |             .arg("lib")
28 |             .arg(format!("OUT_TARGET={}", out_dir))
29 |             .arg("CFLAGS=-fPIC")
30 |             .status()
31 |             .unwrap()
32 |             .success(),
33 |         "building LazyHTML failed"
34 |     );
35 | 
36 |     bindgen::builder()
37 |         .clang_arg("-U__clang__")
38 |         .header("wrapper.h")
39 |         .rust_target(bindgen::RustTarget::Stable_1_19)
40 |         .prepend_enum_name(false)
41 |         .whitelist_function("lhtml_.*")
42 |         .whitelist_type("lhtml_.*")
43 |         .whitelist_var("LHTML_.*|html_en_.*")
44 |         .constified_enum_module("lhtml_tag_type_t")
45 |         .rustified_enum("lhtml_token_type_t|lhtml_ns_t")
46 |         .derive_debug(false)
47 |         .generate()
48 |         .expect("Unable to generate bindings")
49 |         .write_to_file(out_path.join("bindings.rs"))
50 |         .expect("Unable to write bindings");
51 | 
52 |     println!("cargo:rustc-link-search=native={}", &out_dir);
53 |     println!("cargo:rustc-link-lib=static=lhtml");
54 | 
55 |     for dep in IMPLICIT_DEPS {
56 |         for entry in glob(dep).unwrap() {
57 |             println!("cargo:rerun-if-changed={}", entry.unwrap().display());
58 |         }
59 |     }
60 | }
--------------------------------------------------------------------------------
/rust/lazyhtml-sys/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![allow(non_upper_case_globals)]
2 | #![allow(non_camel_case_types)]
3 | #![allow(non_snake_case)]
4 | #![allow(unused)]
5 | 
6 | use std::fmt::{self, Debug, Formatter};
7 | use std::slice;
8 | use std::ops::{Deref, DerefMut};
9 | 
10 | include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
11 | 
12 | impl Deref for lhtml_string_t {
13 |     type Target = [u8];
14 | 
15 |     fn deref(&self) -> &[u8] {
16 |         if self.data.is_null() {
17 |             &[]
18 |         } else {
19 |             unsafe { slice::from_raw_parts(self.data as _, self.length) }
20 |         }
21 |     }
22 | }
23 | 
24 | impl Deref for lhtml_attributes_t {
25 |     type Target = [lhtml_attribute_t];
26 | 
27 |     fn deref(&self) -> &[lhtml_attribute_t] {
28 |         let data = unsafe { self.__bindgen_anon_1.buffer.data };
29 | 
30 |         if data.is_null() {
31 |             &[]
32 |         } else {
33 |             unsafe { slice::from_raw_parts(data, self.length) }
34 |         }
35 |     }
36 | }
37 | 
38 | impl DerefMut for lhtml_attributes_t {
39 |     fn deref_mut(&mut self) -> &mut [lhtml_attribute_t] {
40 |         let data = unsafe { self.__bindgen_anon_1.buffer.data };
41 | 
42 |         if data.is_null() {
43 |             &mut []
44 |         } else {
45 |             unsafe { slice::from_raw_parts_mut(data,
self.length) } 46 | } 47 | } 48 | } 49 | 50 | impl Debug for lhtml_string_t { 51 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 52 | let s = unsafe { ::std::str::from_utf8_unchecked(self) }; 53 | s.fmt(f) 54 | } 55 | } 56 | 57 | impl Debug for lhtml_opt_string_t { 58 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 59 | if self.has_value { 60 | self.value.fmt(f) 61 | } else { 62 | f.write_str("(none)") 63 | } 64 | } 65 | } 66 | 67 | impl Debug for lhtml_token_comment_t { 68 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 69 | f.debug_struct("lhtml_token_comment_t") 70 | .field("value", &self.value) 71 | .finish() 72 | } 73 | } 74 | 75 | impl Debug for lhtml_attribute_t { 76 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 77 | write!(f, "{:?}: {:?} (raw: {:?})", self.name, self.value, self.raw) 78 | } 79 | } 80 | 81 | impl Debug for lhtml_attributes_t { 82 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 83 | f.debug_list().entries(self.iter()).finish() 84 | } 85 | } 86 | 87 | impl Debug for lhtml_token_starttag_t { 88 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 89 | f.debug_struct("lhtml_token_starttag_t") 90 | .field("name", &self.name) 91 | .field("attributes", &self.attributes) 92 | .field("self_closing", &self.self_closing) 93 | .finish() 94 | } 95 | } 96 | 97 | impl Debug for lhtml_token_endtag_t { 98 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 99 | f.debug_struct("lhtml_token_endtag_t") 100 | .field("name", &self.name) 101 | .finish() 102 | } 103 | } 104 | 105 | impl Debug for lhtml_token_doctype_t { 106 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 107 | f.debug_struct("lhtml_token_doctype_t") 108 | .field("name", &self.name) 109 | .field("public_id", &self.public_id) 110 | .field("system_id", &self.system_id) 111 | .field("force_quirks", &self.force_quirks) 112 | .finish() 113 | } 114 | } 115 | 116 | impl Debug for lhtml_token_t { 117 | fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { 118 | use lhtml_token_type_t::*; 119 | 120 | let mut f = f.debug_struct("lhtml_token_t"); 121 | 122 | f.field("type", &self.type_); 123 | 124 | unsafe { 125 | match self.type_ { 126 | LHTML_TOKEN_COMMENT => { 127 | f.field("comment", &self.__bindgen_anon_1.comment); 128 | } 129 | LHTML_TOKEN_START_TAG => { 130 | f.field("start_tag", &self.__bindgen_anon_1.start_tag); 131 | } 132 | LHTML_TOKEN_END_TAG => { 133 | f.field("end_tag", &self.__bindgen_anon_1.end_tag); 134 | } 135 | LHTML_TOKEN_DOCTYPE => { 136 | f.field("doctype", &self.__bindgen_anon_1.doctype); 137 | } 138 | _ => {} 139 | } 140 | } 141 | 142 | f.field("raw", &self.raw); 143 | 144 | f.finish() 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /rust/lazyhtml-sys/wrapper.h: -------------------------------------------------------------------------------- 1 | #include "../../c/tokenizer.h" 2 | #include "../../c/parser-feedback.h" 3 | #include "../../c/serializer.h" 4 | #include "../../c/out/tokenizer-states.h" 5 | -------------------------------------------------------------------------------- /rust/src/feedback.rs: -------------------------------------------------------------------------------- 1 | pub use lazyhtml_sys::*; 2 | use std::mem::zeroed; 3 | use tokenizer::*; 4 | 5 | pub struct Feedback(lhtml_feedback_t); 6 | 7 | impl Feedback { 8 | pub fn new(ns_capacity: usize) -> Self { 9 | Feedback(lhtml_feedback_t { 10 | ns_stack: lhtml_ns_stack_t { 11 
| __bindgen_anon_1: lhtml_ns_stack_t__bindgen_ty_1 {
12 |                     buffer: lhtml_alloc_buffer!(lhtml_ns_buffer_t, ns_capacity),
13 |                 },
14 |                 length: 0,
15 |             },
16 |             ..unsafe { zeroed() }
17 |         })
18 |     }
19 | }
20 | 
21 | impl TokenHandler for Feedback {
22 |     fn inject_into(&mut self, tokenizer: &mut Tokenizer) {
23 |         unsafe {
24 |             lhtml_feedback_inject(tokenizer.get_state(), &mut self.0);
25 |         }
26 |     }
27 | }
28 | 
29 | impl Drop for Feedback {
30 |     fn drop(&mut self) {
31 |         unsafe {
32 |             lhtml_drop_buffer!(self.0.ns_stack.__bindgen_anon_1.buffer);
33 |         }
34 |     }
35 | }
--------------------------------------------------------------------------------
/rust/src/lib.rs:
--------------------------------------------------------------------------------
1 | extern crate lazyhtml_sys;
2 | 
3 | macro_rules! lhtml_alloc_buffer {
4 |     ($ty:ident, $capacity:expr) => {{
5 |         let mut vec = Vec::with_capacity($capacity);
6 |         let buf = $ty {
7 |             data: vec.as_mut_ptr(),
8 |             capacity: vec.capacity()
9 |         };
10 |         ::std::mem::forget(vec);
11 |         buf
12 |     }};
13 | }
14 | 
15 | macro_rules! lhtml_drop_buffer {
16 |     ($buf:expr) => {
17 |         let buf = $buf;
18 |         Box::from_raw(::std::slice::from_raw_parts_mut(
19 |             buf.data,
20 |             buf.capacity
21 |         ));
22 |     }
23 | }
24 | 
25 | mod tokenizer;
26 | mod feedback;
27 | mod serializer;
28 | 
29 | pub use tokenizer::*;
30 | pub use feedback::*;
31 | pub use serializer::*;
--------------------------------------------------------------------------------
/rust/src/serializer.rs:
--------------------------------------------------------------------------------
1 | pub use lazyhtml_sys::*;
2 | use std::mem::zeroed;
3 | use tokenizer::*;
4 | 
5 | #[repr(C)]
6 | pub struct Serializer<F: FnMut(&str)> {
7 |     state: lhtml_serializer_t,
8 |     callback: F,
9 | }
10 | 
11 | impl<F: FnMut(&str)> Serializer<F> {
12 |     pub fn new(callback: F) -> Self {
13 |         Serializer {
14 |             state: lhtml_serializer_t {
15 |                 handler: unsafe { zeroed() },
16 |                 writer: Some(Self::writer),
17 |             },
18 |             callback,
19 |         }
20 |     }
21 | 
22 |     unsafe extern "C" fn writer(s: lhtml_string_t, state: *mut lhtml_serializer_t) {
23 |         ((*(state as *mut Self)).callback)(::std::str::from_utf8_unchecked(&s))
24 |     }
25 | }
26 | 
27 | impl<F: FnMut(&str)> TokenHandler for Serializer<F> {
28 |     fn inject_into(&mut self, tokenizer: &mut Tokenizer) {
29 |         unsafe {
30 |             lhtml_serializer_inject(tokenizer.get_state(), &mut self.state);
31 |         }
32 |     }
33 | }
--------------------------------------------------------------------------------
/rust/src/tokenizer.rs:
--------------------------------------------------------------------------------
1 | pub use lazyhtml_sys::*;
2 | use std::mem::zeroed;
3 | use std::marker::PhantomData;
4 | 
5 | pub struct Tokenizer<'a> {
6 |     state: lhtml_tokenizer_t,
7 |     phantom: PhantomData<&'a ()>,
8 | }
9 | 
10 | impl<'a> Tokenizer<'a> {
11 |     pub fn new(char_capacity: usize, attr_capacity: usize) -> Self {
12 |         let mut state = lhtml_tokenizer_t {
13 |             buffer: lhtml_alloc_buffer!(lhtml_char_buffer_t, char_capacity),
14 |             attr_buffer: lhtml_alloc_buffer!(lhtml_attr_buffer_t, attr_capacity),
15 |             ..unsafe { zeroed() }
16 |         };
17 |         unsafe {
18 |             lhtml_init(&mut state);
19 |         }
20 |         Tokenizer {
21 |             state,
22 |             phantom: PhantomData,
23 |         }
24 |     }
25 | 
26 |     fn feed_opt(&mut self, input: *const lhtml_string_t) -> Result<(), ()> {
27 |         if unsafe { lhtml_feed(&mut self.state, input) } {
28 |             Ok(())
29 |         } else {
30 |             Err(())
31 |         }
32 |     }
33 | 
34 |     pub fn feed(&mut self, input: &str) -> Result<(), ()> {
35 |         self.feed_opt(&lhtml_string_t {
36 |             data: input.as_ptr() as _,
37 |             length:
input.len(),
38 |         })
39 |     }
40 | 
41 |     pub fn end(mut self) -> Result<(), ()> {
42 |         self.feed_opt(::std::ptr::null())
43 |     }
44 | 
45 |     pub unsafe fn set_cs(&mut self, cs: ::std::os::raw::c_int) {
46 |         self.state.cs = cs;
47 |     }
48 | 
49 |     pub unsafe fn set_last_start_tag(&mut self, last_start_tag: &str) {
50 |         self.state.last_start_tag_type = lhtml_get_tag_type(lhtml_string_t {
51 |             data: last_start_tag.as_ptr() as _,
52 |             length: last_start_tag.len(),
53 |         });
54 |     }
55 | 
56 |     pub unsafe fn get_state(&mut self) -> &mut lhtml_tokenizer_t {
57 |         &mut self.state
58 |     }
59 | }
60 | 
61 | impl<'a> Drop for Tokenizer<'a> {
62 |     fn drop(&mut self) {
63 |         unsafe {
64 |             let state = self.get_state();
65 |             lhtml_drop_buffer!(state.buffer);
66 |             lhtml_drop_buffer!(state.attr_buffer);
67 |         }
68 |     }
69 | }
70 | 
71 | pub trait TokenHandler {
72 |     fn inject_into<'a>(&'a mut self, tokenizer: &mut Tokenizer<'a>);
73 | }
74 | 
75 | impl TokenHandler for lhtml_token_handler_t {
76 |     fn inject_into(&mut self, tokenizer: &mut Tokenizer) {
77 |         unsafe {
78 |             lhtml_append_handlers(&mut tokenizer.get_state().base_handler, self);
79 |         }
80 |     }
81 | }
--------------------------------------------------------------------------------
/rust/tests/decoder.rs:
--------------------------------------------------------------------------------
1 | use html5ever::data::{C1_REPLACEMENTS, NAMED_ENTITIES};
2 | use std::char;
3 | use std::str::Chars;
4 | use std::iter::Peekable;
5 | 
6 | #[derive(PartialEq, Eq)]
7 | enum Entities {
8 |     None,
9 |     Text,
10 |     Attribute,
11 | }
12 | 
13 | pub struct Decoder<'a> {
14 |     chars: Peekable<Chars<'a>>,
15 |     result: String,
16 |     null: bool,
17 |     entities: Entities,
18 | }
19 | 
20 | impl<'a> Decoder<'a> {
21 |     fn next_if_char(&mut self, expected: char) -> bool {
22 |         self.next_if(|c| c == expected).is_some()
23 |     }
24 | 
25 |     fn next_if<F: Fn(char) -> bool>(&mut self, f: F) -> Option<char> {
26 |         self.next_opt(|c| if f(c) { Some(c) } else { None })
27 |     }
28 | 
29 |     fn next_opt<T, F: Fn(char) -> Option<T>>(&mut self, f: F) -> Option<T> {
30 |         let opt = self.chars.peek().cloned().and_then(f);
31 |         if opt.is_some() {
32 |             self.chars.next();
33 |         }
34 |         opt
35 |     }
36 | 
37 |     fn decode_numeric_entity(&mut self, radix: u32) -> bool {
38 |         if let Some(mut code) = self.next_opt(|c| c.to_digit(radix)) {
39 |             while let Some(digit) = self.next_opt(|c| c.to_digit(radix)) {
40 |                 if code < 0x10FFFF {
41 |                     code = code * radix + digit;
42 |                 }
43 |             }
44 |             self.result.push(
45 |                 match code {
46 |                     0x00 => None,
47 |                     0x80...0x9F => {
48 |                         C1_REPLACEMENTS[(code - 0x80) as usize].or_else(|| char::from_u32(code))
49 |                     }
50 |                     _ => char::from_u32(code),
51 |                 }.unwrap_or('\u{FFFD}'),
52 |             );
53 |             self.next_if_char(';');
54 |             true
55 |         } else {
56 |             self.result += "&#";
57 |             false
58 |         }
59 |     }
60 | 
61 |     fn decode_named_entity(&mut self) {
62 |         let mut name_buf = String::new();
63 |         let mut name_match = ('&' as u32, 0, 0);
64 |         while let Some(&c) = self.chars.peek() {
65 |             name_buf.push(c);
66 |             if let Some(&m) = NAMED_ENTITIES.get(&name_buf[..]) {
67 |                 self.chars.next();
68 |                 if m.0 != 0 {
69 |                     if c != ';' && self.entities == Entities::Attribute {
70 |                         if let Some(&c) = self.chars.peek() {
71 |                             match c {
72 |                                 'A'...'Z' | 'a'...'z' | '0'...'9' | '=' => {
73 |                                     continue;
74 |                                 }
75 |                                 _ => {}
76 |                             }
77 |                         }
78 |                     }
79 |                     name_match = (m.0, m.1, name_buf.len());
80 |                 }
81 |             } else {
82 |                 name_buf.pop();
83 |                 break;
84 |             }
85 |         }
86 |         self.result.push(char::from_u32(name_match.0).unwrap());
87 |         if name_match.1 != 0 {
88 |             self.result.push(char::from_u32(name_match.1).unwrap());
89 | 
}
90 |         self.result += &name_buf[name_match.2..];
91 |     }
92 | 
93 |     fn decode_entity(&mut self) {
94 |         if self.next_if_char('#') {
95 |             if let Some(x) = self.next_if(|c| c == 'x' || c == 'X') {
96 |                 if !self.decode_numeric_entity(16) {
97 |                     self.result.push(x);
98 |                 }
99 |             } else {
100 |                 self.decode_numeric_entity(10);
101 |             }
102 |         } else {
103 |             self.decode_named_entity();
104 |         }
105 |     }
106 | 
107 |     fn decode_cr(&mut self) {
108 |         self.result.push('\n');
109 |         self.next_if_char('\n');
110 |     }
111 | 
112 |     pub fn new(raw: &'a str) -> Self {
113 |         Decoder {
114 |             chars: raw.chars().peekable(),
115 |             result: String::with_capacity(raw.len()),
116 |             null: false,
117 |             entities: Entities::None,
118 |         }
119 |     }
120 | 
121 |     pub fn unsafe_null(mut self) -> Self {
122 |         self.null = true;
123 |         self
124 |     }
125 | 
126 |     pub fn text_entities(mut self) -> Self {
127 |         self.entities = Entities::Text;
128 |         self
129 |     }
130 | 
131 |     pub fn attr_entities(mut self) -> Self {
132 |         self.entities = Entities::Attribute;
133 |         self
134 |     }
135 | 
136 |     pub fn run(mut self) -> String {
137 |         while let Some(c) = self.chars.next() {
138 |             match c {
139 |                 '\r' => {
140 |                     self.decode_cr();
141 |                 }
142 |                 '\0' if self.null => {
143 |                     self.result.push('\u{FFFD}');
144 |                 }
145 |                 '&' if self.entities != Entities::None => {
146 |                     self.decode_entity();
147 |                 }
148 |                 _ => {
149 |                     self.result.push(c);
150 |                 }
151 |             }
152 |         }
153 | 
154 |         self.result
155 |     }
156 | }
--------------------------------------------------------------------------------
/rust/tests/feedback_tokens/mod.rs:
--------------------------------------------------------------------------------
1 | mod noop_tree_sink;
2 | mod token_sink_proxy;
3 | 
4 | use html5ever::tree_builder::{TreeBuilder, TreeBuilderOpts};
5 | use html5ever::tokenizer::{BufferQueue, Tokenizer, TokenizerOpts, TokenizerResult};
6 | use html5ever::tendril::StrTendril;
7 | use token::Token as MyToken;
8 | use self::noop_tree_sink::NoopTreeSink;
9 | use self::token_sink_proxy::TokenSinkProxy;
10 | 
11 | pub fn tokenize_with_tree_builder(input: &str) -> Vec<MyToken> {
12 |     let mut tokens = Vec::new();
13 |     let mut b = BufferQueue::new();
14 |     b.push_back(StrTendril::from(input));
15 |     {
16 |         let mut t = Tokenizer::new(
17 |             TokenSinkProxy {
18 |                 inner: TreeBuilder::new(NoopTreeSink::default(), TreeBuilderOpts::default()),
19 |                 tokens: &mut tokens,
20 |             },
21 |             TokenizerOpts::default(),
22 |         );
23 | 
24 |         while let TokenizerResult::Script(_) = t.feed(&mut b) {
25 |             // ignore script markers
26 |         }
27 | 
28 |         t.end();
29 |     }
30 |     tokens
31 | }
--------------------------------------------------------------------------------
/rust/tests/feedback_tokens/noop_tree_sink.rs:
--------------------------------------------------------------------------------
1 | // https://github.com/servo/html5ever/blob/master/html5ever/examples/noop-tree-builder.rs
2 | 
3 | use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
4 | use html5ever::{Attribute, ExpandedName, QualName};
5 | use html5ever::tendril::StrTendril;
6 | use std::borrow::Cow;
7 | 
8 | pub struct NoopTreeSink {
9 |     names: Vec<Option<QualName>>,
10 | }
11 | 
12 | impl Default for NoopTreeSink {
13 |     fn default() -> Self {
14 |         NoopTreeSink { names: Vec::new() }
15 |     }
16 | }
17 | 
18 | impl NoopTreeSink {
19 |     fn get_name(&self, id: &usize) -> Option<&QualName> {
20 |         self.names.get(*id).and_then(|opt_name| opt_name.as_ref())
21 |     }
22 | 
23 |     fn set_name(&mut self, name: Option<QualName>) -> usize {
24 |         let id = self.names.len();
25 |         self.names.push(name);
26 | 
id
27 |     }
28 | }
29 | 
30 | impl TreeSink for NoopTreeSink {
31 |     type Handle = usize;
32 |     type Output = Self;
33 | 
34 |     fn finish(self) -> Self {
35 |         self
36 |     }
37 | 
38 |     fn get_document(&mut self) -> usize {
39 |         0
40 |     }
41 | 
42 |     fn get_template_contents(&mut self, target: &usize) -> usize {
43 |         if let Some(expanded_name!(html "template")) = self.get_name(target).map(|n| n.expanded()) {
44 |             target + 1
45 |         } else {
46 |             panic!("not a template element")
47 |         }
48 |     }
49 | 
50 |     fn same_node(&self, x: &usize, y: &usize) -> bool {
51 |         x == y
52 |     }
53 | 
54 |     fn elem_name(&self, target: &usize) -> ExpandedName {
55 |         self.get_name(target).expect("not an element").expanded()
56 |     }
57 | 
58 |     fn create_element(&mut self, name: QualName, _: Vec<Attribute>, _: ElementFlags) -> usize {
59 |         self.set_name(Some(name))
60 |     }
61 | 
62 |     fn create_comment(&mut self, _text: StrTendril) -> usize {
63 |         self.set_name(None)
64 |     }
65 | 
66 |     #[allow(unused_variables)]
67 |     fn create_pi(&mut self, target: StrTendril, value: StrTendril) -> usize {
68 |         unimplemented!()
69 |     }
70 | 
71 |     fn append_before_sibling(&mut self, _sibling: &usize, _new_node: NodeOrText<usize>) {}
72 | 
73 |     fn parse_error(&mut self, _msg: Cow<'static, str>) {}
74 | 
75 |     fn set_quirks_mode(&mut self, _mode: QuirksMode) {}
76 | 
77 |     fn append(&mut self, _parent: &usize, _child: NodeOrText<usize>) {}
78 | 
79 |     fn append_doctype_to_document(&mut self, _: StrTendril, _: StrTendril, _: StrTendril) {}
80 | 
81 |     fn add_attrs_if_missing(&mut self, target: &usize, _attrs: Vec<Attribute>) {
82 |         self.get_name(target).expect("not an element");
83 |     }
84 | 
85 |     fn remove_from_parent(&mut self, _target: &usize) {}
86 | 
87 |     fn reparent_children(&mut self, _node: &usize, _new_parent: &usize) {}
88 | 
89 |     fn mark_script_already_started(&mut self, _node: &usize) {}
90 | 
91 |     fn append_based_on_parent_node(
92 |         &mut self,
93 |         _element: &usize,
94 |         _prev_element: &usize,
95 |         _new_node: NodeOrText<usize>,
96 |     ) {
97 |     }
98 | }
--------------------------------------------------------------------------------
/rust/tests/feedback_tokens/token_sink_proxy.rs:
--------------------------------------------------------------------------------
1 | use html5ever::tokenizer::{TagKind, Token, TokenSink, TokenSinkResult};
2 | use token::Token as MyToken;
3 | use std::collections::HashMap;
4 | use std::iter::FromIterator;
5 | 
6 | // sends tokens to a given sink, while at the same time converting and
7 | // recording them into the provided array
8 | pub struct TokenSinkProxy<'a, Sink> {
9 |     pub inner: Sink,
10 |     pub tokens: &'a mut Vec<MyToken>,
11 | }
12 | 
13 | impl<'a, Sink> TokenSinkProxy<'a, Sink> {
14 |     fn push_character_token(&mut self, s: &str) {
15 |         if let Some(&mut MyToken::Character(ref mut last)) = self.tokens.last_mut() {
16 |             *last += s;
17 |             return;
18 |         }
19 |         self.tokens.push(MyToken::Character(s.to_string()));
20 |     }
21 | }
22 | 
23 | impl<'a, Sink> TokenSink for TokenSinkProxy<'a, Sink>
24 | where
25 |     Sink: TokenSink,
26 | {
27 |     type Handle = Sink::Handle;
28 | 
29 |     fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
30 |         match token {
31 |             Token::DoctypeToken(ref doctype) => {
32 |                 self.tokens.push(MyToken::Doctype {
33 |                     name: doctype.name.as_ref().map(|s| s.to_string()),
34 |                     public_id: doctype.public_id.as_ref().map(|s| s.to_string()),
35 |                     system_id: doctype.system_id.as_ref().map(|s| s.to_string()),
36 |                     correctness: !doctype.force_quirks,
37 |                 });
38 |             }
39 |             Token::TagToken(ref tag) => {
40 |                 let name = tag.name.to_string();
41 | 
self.tokens.push(match tag.kind {
42 |                     TagKind::StartTag => MyToken::StartTag {
43 |                         name,
44 |                         attributes: HashMap::from_iter(
45 |                             tag.attrs
46 |                                 .iter()
47 |                                 .rev()
48 |                                 .map(|attr| (attr.name.local.to_string(), attr.value.to_string())),
49 |                         ),
50 |                         self_closing: tag.self_closing,
51 |                     },
52 |                     TagKind::EndTag => MyToken::EndTag {
53 |                         name: name.to_string(),
54 |                     },
55 |                 })
56 |             }
57 |             Token::CommentToken(ref s) => {
58 |                 self.tokens.push(MyToken::Comment(s.to_string()));
59 |             }
60 |             Token::CharacterTokens(ref s) => if !s.is_empty() {
61 |                 self.push_character_token(s);
62 |             },
63 |             Token::NullCharacterToken => {
64 |                 self.push_character_token("\0");
65 |             }
66 |             _ => {}
67 |         }
68 |         self.inner.process_token(token, line_number)
69 |     }
70 | 
71 |     fn end(&mut self) {
72 |         self.inner.end()
73 |     }
74 | 
75 |     fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool {
76 |         self.inner
77 |             .adjusted_current_node_present_but_not_in_html_namespace()
78 |     }
79 | }
--------------------------------------------------------------------------------
/rust/tests/html5lib.rs:
--------------------------------------------------------------------------------
1 | use serde_json;
2 | use glob;
3 | use std::io::{BufRead, BufReader};
4 | use std::fs::File;
5 | use unescape::Unescape;
6 | use lazyhtml;
7 | use token::{Token, TokenRange};
8 | use feedback_tokens::tokenize_with_tree_builder;
9 | use parse_errors::{ParseErrors, ERROR_CODES};
10 | 
11 | // Skip some errors in certain tests due to the limited functionality of the parser.
12 | const SKIP_ERRORS: &'static [(&'static str, &'static str)] = &[
13 |     ("Duplicate close tag attributes", "duplicate-attribute"), // We don't collect attributes on end tags
14 | ];
15 | 
16 | #[derive(Deserialize)]
17 | struct Suite {
18 |     #[serde(default)]
19 |     pub tests: Vec<Test>,
20 | }
21 | 
22 | macro_rules!
read_tests {
23 |     ($path: expr) => (
24 |         glob::glob(concat!(
25 |             env!("CARGO_MANIFEST_DIR"),
26 |             "/../",
27 |             $path
28 |         )).unwrap()
29 |             .map(|path| BufReader::new(File::open(path.unwrap()).unwrap()))
30 |     )
31 | }
32 | 
33 | #[derive(Clone, Copy, Deserialize, Debug)]
34 | #[repr(i32)]
35 | pub enum InitialState {
36 |     #[serde(rename = "Data state")]
37 |     Data = lazyhtml::html_en_Data,
38 |     #[serde(rename = "PLAINTEXT state")]
39 |     PlainText = lazyhtml::html_en_PlainText,
40 |     #[serde(rename = "RCDATA state")]
41 |     RCData = lazyhtml::html_en_RCData,
42 |     #[serde(rename = "RAWTEXT state")]
43 |     RawText = lazyhtml::html_en_RawText,
44 |     #[serde(rename = "Script data state")]
45 |     ScriptData = lazyhtml::html_en_ScriptData,
46 |     #[serde(rename = "CDATA section state")]
47 |     CDataSection = lazyhtml::html_en_CDataSection,
48 | }
49 | 
50 | fn default_initial_states() -> Vec<InitialState> {
51 |     vec![InitialState::Data]
52 | }
53 | 
54 | fn default_with_errors() -> bool {
55 |     true // ¯\_(ツ)_/¯
56 | }
57 | 
58 | #[derive(Deserialize)]
59 | pub struct ParseError {
60 |     pub code: String,
61 |     pub line: usize,
62 |     pub col: usize,
63 | }
64 | 
65 | #[derive(Deserialize)]
66 | #[serde(rename_all = "camelCase")]
67 | pub struct Test {
68 |     pub description: String,
69 |     pub input: String,
70 |     pub output: Vec<Token>,
71 | 
72 |     #[serde(skip)]
73 |     pub with_feedback: bool,
74 | 
75 |     #[serde(default = "default_with_errors")]
76 |     pub with_errors: bool,
77 | 
78 |     #[serde(default = "default_initial_states")]
79 |     pub initial_states: Vec<InitialState>,
80 | 
81 |     #[serde(default)]
82 |     pub double_escaped: bool,
83 | 
84 |     #[serde(default)]
85 |     pub last_start_tag: String,
86 | 
87 |     #[serde(default)]
88 |     errors: Vec<ParseError>,
89 | }
90 | 
91 | impl Test {
92 |     pub fn get_expected_parse_errors(
93 |         &self,
94 |         token_ranges: Vec<TokenRange>,
95 |     ) -> Result<ParseErrors, String> {
96 |         let mut expected_errors = ParseErrors::new();
97 | 
98 |         let errors = self.errors.iter().filter_map(|err| {
99 |             ERROR_CODES
100 |                 .iter()
101 |                 .filter(|&code| !SKIP_ERRORS.contains(&(self.description.as_str(), code)))
102 |                 .find(|&&code| code == err.code)
103 |                 .map(|&code| {
104 |                     let pos = self.input
105 |                         .split("\n")
106 |                         .take(err.line - 1)
107 |                         .fold(err.col - 1, |pos, s| pos + s.len());
108 | 
109 |                     // NOTE: use error code slice from the static array
110 |                     // to avoid specifying lifetimes on owning structures.
111 |                     (code, pos)
112 |                 })
113 |         });
114 | 
115 |         'outer: for (code, pos) in errors {
116 |             for &range in token_ranges.iter() {
117 |                 if range.contains(pos) {
118 |                     expected_errors.insert((range, code));
119 |                     continue 'outer;
120 |                 }
121 |             }
122 | 
123 |             return Err(format!(
124 |                 "The following error doesn't fit into any token range: {:?}",
125 |                 (code, pos)
126 |             ));
127 |         }
128 | 
129 |         Ok(expected_errors)
130 |     }
131 | }
132 | 
133 | impl Unescape for Test {
134 |     fn unescape(&mut self) -> Result<(), serde_json::error::Error> {
135 |         if self.double_escaped {
136 |             self.double_escaped = false;
137 |             self.input.unescape()?;
138 |             for token in &mut self.output {
139 |                 token.unescape()?;
140 |             }
141 |         }
142 |         Ok(())
143 |     }
144 | }
145 | 
146 | pub fn get_tests() -> Vec<Test> {
147 |     let mut tests = Vec::new();
148 |     for file in read_tests!("html5lib-tests/tokenizer/*.test") {
149 |         tests.extend(serde_json::from_reader::<_, Suite>(file).unwrap().tests);
150 |     }
151 |     for file in read_tests!("error-with-feedback-tests/*.test") {
152 |         tests.extend(
153 |             serde_json::from_reader::<_, Suite>(file)
154 |                 .unwrap()
155 |                 .tests
156 |                 .into_iter()
157 |                 .map(|mut test| {
158 |                     test.with_feedback = true;
159 |                     test
160 |                 }),
161 |         );
162 |     }
163 |     for file in read_tests!("html5lib-tests/tree-construction/*.dat") {
164 |         let mut inputs = Vec::new();
165 |         let mut in_data = 0;
166 |         for line in file.lines().map(|line| line.unwrap()) {
167 |             if line == "#data" {
168 |                 in_data = 1;
169 |             } else if line.starts_with('#') {
170 |                 in_data = 0;
171 |             } else if in_data > 0 {
172 |                 if in_data > 1 {
173 |                     let s: &mut String = inputs.last_mut().unwrap();
174 |                     s.push('\n');
175 |                     s.push_str(&line);
176 |                 } else {
177 |                     inputs.push(line);
178 |                 }
179 |                 in_data += 1;
180 |             }
181 |         }
182 |         tests.extend(inputs.into_iter().map(|input| {
183 |             Test {
184 |                 description: input
185 |                     .chars()
186 |                     .flat_map(|c| c.escape_default())
187 |                     .collect::<String>() + " (with feedback)",
188 |                 output: tokenize_with_tree_builder(&input),
189 |                 input,
190 |                 with_feedback: true,
191 |                 with_errors: false,
192 |                 initial_states: default_initial_states(),
193 |                 double_escaped: false,
194 |                 last_start_tag: String::new(),
195 |                 errors: Vec::default(),
196 |             }
197 |         }));
198 |     }
199 |     tests
200 | }
--------------------------------------------------------------------------------
/rust/tests/parse_errors.rs:
--------------------------------------------------------------------------------
1 | use token::TokenRange;
2 | use std::collections::HashSet;
3 | 
4 | pub const ERROR_CODES: &'static [&'static str] = &[
5 |     "abrupt-closing-of-empty-comment",
6 |     "abrupt-doctype-public-identifier",
7 |     "abrupt-doctype-system-identifier",
8 |     // "absence-of-digits-in-numeric-character-reference" (character references are not supported)
9 |     "cdata-in-html-content",
10 |     // "character-reference-outside-unicode-range" (character references are not supported)
11 |     // "control-character-in-input-stream" (has significant performance impact)
12 |     // "control-character-reference" (character references are not supported)
13 |     "end-tag-with-attributes",
14 |     "duplicate-attribute",
15 |     "end-tag-with-trailing-solidus",
16 |     "eof-before-tag-name",
17 |     "eof-in-cdata",
18 |     "eof-in-comment",
19 |     "eof-in-doctype",
20 |     "eof-in-script-html-comment-like-text",
21 |     "eof-in-tag",
22 |     "incorrectly-closed-comment",
23 |     "incorrectly-opened-comment",
24 |     "invalid-character-sequence-after-doctype-name",
25 |     "invalid-first-character-of-tag-name",
26 |     "missing-attribute-value",
27 | 
"missing-doctype-name", 28 | "missing-doctype-public-identifier", 29 | "missing-doctype-system-identifier", 30 | "missing-end-tag-name", 31 | "missing-quote-before-doctype-public-identifier", 32 | "missing-quote-before-doctype-system-identifier", 33 | "missing-whitespace-after-doctype-public-keyword", 34 | "missing-whitespace-after-doctype-system-keyword", 35 | "missing-whitespace-before-doctype-name", 36 | "missing-whitespace-between-attributes", 37 | "missing-whitespace-between-doctype-public-and-system-identifiers", 38 | "nested-comment", 39 | // "noncharacter-character-reference" (character references are not supported) 40 | // "noncharacter-in-input-stream" (requires UTF decoding, has significant performance impact) 41 | "non-void-html-element-start-tag-with-trailing-solidus", 42 | // "null-character-reference" (character references are not supported) 43 | // "surrogate-character-reference" (character references are not supported) 44 | // "surrogate-in-input-stream" (requires UTF decoding, has significant performance impact) 45 | "unexpected-character-after-doctype-system-identifier", 46 | "unexpected-character-in-attribute-name", 47 | "unexpected-character-in-unquoted-attribute-value", 48 | "unexpected-equals-sign-before-attribute-name", 49 | // "unexpected-null-character" (has significant performance impact) 50 | "unexpected-question-mark-instead-of-tag-name", 51 | "unexpected-solidus-in-tag", 52 | // "unknown-named-character-reference" (character references are not supported) 53 | ]; 54 | 55 | pub type ParseErrors = HashSet<(TokenRange, &'static str)>; 56 | -------------------------------------------------------------------------------- /rust/tests/test.rs: -------------------------------------------------------------------------------- 1 | extern crate lazyhtml; 2 | 3 | #[macro_use] 4 | extern crate serde; 5 | 6 | extern crate serde_json; 7 | 8 | #[macro_use] 9 | extern crate html5ever; 10 | 11 | // From 'rustc-test' crate. 12 | // Mirrors Rust's internal 'libtest'. 
13 | // https://doc.rust-lang.org/1.1.0/test/index.html
14 | extern crate rustc_test as test;
15 | 
16 | extern crate glob;
17 | 
18 | mod token;
19 | mod feedback_tokens;
20 | mod decoder;
21 | mod unescape;
22 | mod html5lib;
23 | mod parse_errors;
24 | 
25 | use std::collections::HashMap;
26 | use lazyhtml::*;
27 | use std::os::raw::c_void;
28 | use std::iter::FromIterator;
29 | use std::ptr::null_mut;
30 | use test::{test_main, ShouldPanic, TestDesc, TestDescAndFn, TestFn, TestName};
31 | use token::{Token, TokenRange};
32 | use decoder::Decoder;
33 | use unescape::Unescape;
34 | use html5lib::{get_tests, Test};
35 | use std::iter::IntoIterator;
36 | use parse_errors::{ParseErrors, ERROR_CODES};
37 | use lazyhtml::lhtml_token_type_t::{LHTML_TOKEN_CHARACTER, LHTML_TOKEN_EOF};
38 | 
39 | unsafe fn lhtml_to_raw_str(s: &lhtml_string_t) -> &str {
40 |     ::std::str::from_utf8_unchecked(s)
41 | }
42 | 
43 | unsafe fn lhtml_to_name(s: lhtml_string_t) -> String {
44 |     let mut s = Decoder::new(lhtml_to_raw_str(&s)).unsafe_null().run();
45 | 
46 |     s.make_ascii_lowercase();
47 | 
48 |     s
49 | }
50 | 
51 | struct HandlerState {
52 |     handler: lhtml_token_handler_t,
53 |     tokenizer: *const lhtml_tokenizer_t,
54 |     tokens: Vec<Token>,
55 |     raw_output: String,
56 |     saw_eof: bool,
57 |     parse_errors: ParseErrors,
58 |     token_ranges: Vec<TokenRange>,
59 | }
60 | 
61 | impl HandlerState {
62 |     pub fn new() -> Self {
63 |         HandlerState {
64 |             handler: lhtml_token_handler_t {
65 |                 callback: Some(HandlerState::callback),
66 |                 next: null_mut(),
67 |             },
68 |             tokenizer: ::std::ptr::null(),
69 |             tokens: Vec::new(),
70 |             raw_output: String::new(),
71 |             saw_eof: false,
72 |             parse_errors: ParseErrors::new(),
73 |             token_ranges: Vec::new(),
74 |         }
75 |     }
76 | 
77 |     fn get_extended_last_token_range(&mut self, new_end: usize) -> TokenRange {
78 |         let last_range = self.token_ranges.last_mut().unwrap();
79 | 
80 |         let extended_range = TokenRange {
81 |             start: last_range.start,
82 |             end: new_end,
83 |         };
84 | 
85 |         // NOTE: go through all errors and update their ranges
86 |         self.parse_errors =
87 |             ParseErrors::from_iter(self.parse_errors.iter().map(|&(token_range, code)| {
88 |                 if token_range == *last_range {
89 |                     (extended_range, code)
90 |                 } else {
91 |                     (token_range, code)
92 |                 }
93 |             }));
94 | 
95 |         *last_range = extended_range;
96 |         extended_range
97 |     }
98 | 
99 |     unsafe fn update_parse_errors(&mut self, token: *mut lhtml_token_t, token_len: usize) {
100 |         let errors_bit_flags = (*token).parse_errors;
101 |         let start = self.raw_output.len();
102 |         let mut end = start + token_len;
103 |         let mut is_consequent_chars = false;
104 |         let is_eof = (*token).type_ == LHTML_TOKEN_EOF;
105 | 
106 |         if is_eof {
107 |             end += 1;
108 |         }
109 | 
110 |         if let (LHTML_TOKEN_CHARACTER, Some(&Token::Character(_))) =
111 |             ((*token).type_, self.tokens.last())
112 |         {
113 |             is_consequent_chars = true;
114 |         }
115 | 
116 |         // NOTE: Consider we have an EOF in DOCTYPE at pos 15.
117 |         // We attach error to DOCTYPE, so actual error has range [0; 15).
118 |         // However, when we assign range to expected error it falls into EOF's
119 |         // range [15;16) and, thus, we have mismatch.
120 |         // Therefore, to workaround such cases, instead of adding separate range
121 |         // for EOF or consequent character token we just extend last available
122 |         // token range.
123 | let should_extend_last_range = 124 | (is_consequent_chars || is_eof) && self.token_ranges.len() > 0; 125 | 126 | let token_range = if should_extend_last_range { 127 | self.get_extended_last_token_range(end) 128 | } else { 129 | let token_range = TokenRange { start, end }; 130 | 131 | self.token_ranges.push(token_range); 132 | token_range 133 | }; 134 | 135 | ERROR_CODES.iter().enumerate().for_each(|(i, code)| { 136 | if errors_bit_flags & (1 << i) > 0 { 137 | self.parse_errors.insert((token_range, code)); 138 | } 139 | }); 140 | } 141 | 142 | unsafe extern "C" fn callback(token: *mut lhtml_token_t, extra: *mut c_void) { 143 | use lhtml_token_type_t::*; 144 | 145 | let state = &mut *(extra as *mut Self); 146 | let data = &mut (*token).__bindgen_anon_1; 147 | 148 | if let Some(&mut Token::Character(ref mut s)) = state.tokens.last_mut() { 149 | if (*token).type_ != LHTML_TOKEN_CHARACTER { 150 | *s = { 151 | let mut decoder = Decoder::new(s); 152 | 153 | if (*state.tokenizer).unsafe_null { 154 | decoder = decoder.unsafe_null(); 155 | } 156 | 157 | if (*state.tokenizer).entities { 158 | decoder = decoder.text_entities(); 159 | } 160 | 161 | decoder.run() 162 | }; 163 | } 164 | } 165 | 166 | assert!((*token).raw.has_value); 167 | 168 | let raw = lhtml_to_raw_str(&(*token).raw.value); 169 | 170 | state.update_parse_errors(token, raw.len()); 171 | 172 | let test_token = match (*token).type_ { 173 | LHTML_TOKEN_CDATA_START | LHTML_TOKEN_CDATA_END | LHTML_TOKEN_UNPARSED => None, 174 | LHTML_TOKEN_CHARACTER => { 175 | if let Some(&mut Token::Character(ref mut s)) = state.tokens.last_mut() { 176 | *s += raw; 177 | None 178 | } else { 179 | Some(Token::Character(raw.to_owned())) 180 | } 181 | } 182 | LHTML_TOKEN_COMMENT => { 183 | (*token).raw.has_value = false; 184 | 185 | Some(Token::Comment( 186 | Decoder::new(lhtml_to_raw_str(&data.comment.value)) 187 | .unsafe_null() 188 | .run(), 189 | )) 190 | } 191 | LHTML_TOKEN_START_TAG => { 192 | let start_tag = &mut data.start_tag; 193 | 194 | (*token).raw.has_value = false; 195 | 196 | assert_eq!(lhtml_get_tag_type(start_tag.name), start_tag.type_); 197 | 198 | Some(Token::StartTag { 199 | name: lhtml_to_name(start_tag.name), 200 | 201 | attributes: HashMap::from_iter(start_tag.attributes.iter_mut().rev().map( 202 | |attr| { 203 | attr.raw.has_value = false; 204 | 205 | ( 206 | lhtml_to_name(attr.name), 207 | Decoder::new(lhtml_to_raw_str(&attr.value)) 208 | .unsafe_null() 209 | .attr_entities() 210 | .run(), 211 | ) 212 | }, 213 | )), 214 | 215 | self_closing: start_tag.self_closing, 216 | }) 217 | } 218 | LHTML_TOKEN_END_TAG => { 219 | let end_tag = &data.end_tag; 220 | 221 | (*token).raw.has_value = false; 222 | 223 | assert_eq!(lhtml_get_tag_type(end_tag.name), end_tag.type_); 224 | 225 | Some(Token::EndTag { 226 | name: lhtml_to_name(end_tag.name), 227 | }) 228 | } 229 | LHTML_TOKEN_DOCTYPE => { 230 | let doctype = &data.doctype; 231 | 232 | (*token).raw.has_value = false; 233 | 234 | Some(Token::Doctype { 235 | name: if doctype.name.has_value { 236 | Some(lhtml_to_name(doctype.name.value)) 237 | } else { 238 | None 239 | }, 240 | public_id: if doctype.public_id.has_value { 241 | Some( 242 | Decoder::new(lhtml_to_raw_str(&doctype.public_id.value)) 243 | .unsafe_null() 244 | .run(), 245 | ) 246 | } else { 247 | None 248 | }, 249 | system_id: if doctype.system_id.has_value { 250 | Some( 251 | Decoder::new(lhtml_to_raw_str(&doctype.system_id.value)) 252 | .unsafe_null() 253 | .run(), 254 | ) 255 | } else { 256 | None 257 | }, 258 | correctness: 
!doctype.force_quirks, 259 | }) 260 | } 261 | LHTML_TOKEN_EOF if !state.saw_eof => { 262 | state.saw_eof = true; 263 | None 264 | } 265 | _ => { 266 | panic!("Unexpected token type"); 267 | } 268 | }; 269 | 270 | if let Some(test_token) = test_token { 271 | state.tokens.push(test_token); 272 | } 273 | 274 | state.raw_output += raw; 275 | 276 | lhtml_emit(token, extra); 277 | } 278 | } 279 | 280 | impl TokenHandler for HandlerState { 281 | fn inject_into<'a>(&'a mut self, tokenizer: &mut Tokenizer<'a>) { 282 | self.tokenizer = unsafe { tokenizer.get_state() }; 283 | self.handler.inject_into(tokenizer); 284 | } 285 | } 286 | 287 | impl Test { 288 | pub unsafe fn run(&self) { 289 | for &cs in &self.initial_states { 290 | let mut output = String::new(); 291 | 292 | for pass in 0..2 { 293 | let is_serializer_test = pass == 1; 294 | let mut serializer; 295 | let mut test_state = HandlerState::new(); 296 | let mut feedback; 297 | 298 | let input = { 299 | let mut tokenizer = Tokenizer::new(2048, 256); 300 | tokenizer.set_cs(cs as _); 301 | tokenizer.set_last_start_tag(&self.last_start_tag); 302 | 303 | if self.with_feedback { 304 | feedback = Feedback::new(64); 305 | feedback.inject_into(&mut tokenizer); 306 | } 307 | 308 | test_state.inject_into(&mut tokenizer); 309 | 310 | let input = if !is_serializer_test { 311 | serializer = Serializer::new(|chunk| { 312 | output += chunk; 313 | }); 314 | serializer.inject_into(&mut tokenizer); 315 | &self.input 316 | } else { 317 | &output 318 | }; 319 | 320 | tokenizer.feed(input).expect("Could not feed input"); 321 | tokenizer.end().expect("Could not finalize input"); 322 | 323 | input 324 | }; 325 | 326 | assert_eq!(&test_state.raw_output, input); 327 | 328 | if !is_serializer_test && self.with_errors { 329 | let expected_errors = self.get_expected_parse_errors(test_state.token_ranges) 330 | .unwrap(); 331 | 332 | assert_eq!( 333 | test_state.parse_errors, expected_errors, 334 | "Parse error mismatch:\n\ 335 | actual: {:?}\n\ 336 | expected: {:?}\n", 337 | test_state.parse_errors, expected_errors 338 | ); 339 | } 340 | 341 | assert!( 342 | test_state.tokens == self.output, 343 | "Token mismatch\n\ 344 | state: {:?}\n\ 345 | original input: {:?}\n\ 346 | input: {:?}\n\ 347 | actual: {:#?}\n\ 348 | expected: {:#?}", 349 | cs, 350 | if is_serializer_test { 351 | Some(&self.input) 352 | } else { 353 | None 354 | }, 355 | input, 356 | test_state.tokens, 357 | self.output 358 | ); 359 | } 360 | } 361 | } 362 | } 363 | 364 | fn main() { 365 | let args: Vec<_> = ::std::env::args().collect(); 366 | 367 | let tests = get_tests() 368 | .into_iter() 369 | .map(|mut test| { 370 | let ignore = test.unescape().is_err(); 371 | 372 | TestDescAndFn { 373 | desc: TestDesc { 374 | name: TestName::DynTestName(test.description.to_owned()), 375 | ignore, 376 | should_panic: ShouldPanic::No, 377 | allow_fail: false, 378 | }, 379 | testfn: TestFn::DynTestFn(Box::new(move || unsafe { 380 | test.run(); 381 | })), 382 | } 383 | }) 384 | .collect(); 385 | 386 | test_main(&args, tests); 387 | } 388 | -------------------------------------------------------------------------------- /rust/tests/token.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use serde::de::{Deserialize, Deserializer, Error as DeError}; 3 | use std::fmt::{self, Formatter}; 4 | use std::iter::FromIterator; 5 | 6 | #[derive(Clone, Copy, Deserialize)] 7 | enum TokenKind { 8 | Character, 9 | Comment, 10 | StartTag, 11 | EndTag, 12 | 
#[serde(rename = "DOCTYPE")] 13 | Doctype, 14 | } 15 | 16 | // NOTE: we use a custom range type for tokens because std::ops::Range 17 | // is an iterator and therefore doesn't implement the Copy trait. Also, its 18 | // contains() method is currently available only on nightly. 19 | #[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)] 20 | pub struct TokenRange { 21 | pub start: usize, 22 | pub end: usize, 23 | } 24 | 25 | impl TokenRange { 26 | pub fn contains(&self, val: usize) -> bool { 27 | (self.start <= val) && (val < self.end) 28 | } 29 | } 30 | 31 | #[derive(Debug, PartialEq, Eq)] 32 | pub enum Token { 33 | Character(String), 34 | 35 | Comment(String), 36 | 37 | StartTag { 38 | name: String, 39 | attributes: HashMap<String, String>, 40 | self_closing: bool, 41 | }, 42 | 43 | EndTag { 44 | name: String, 45 | }, 46 | 47 | Doctype { 48 | name: Option<String>, 49 | public_id: Option<String>, 50 | system_id: Option<String>, 51 | correctness: bool, 52 | }, 53 | } 54 | 55 | impl<'de> Deserialize<'de> for Token { 56 | fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> 57 | where 58 | D: Deserializer<'de>, 59 | { 60 | struct Visitor; 61 | 62 | impl<'de> ::serde::de::Visitor<'de> for Visitor { 63 | type Value = Token; 64 | 65 | fn expecting(&self, f: &mut Formatter) -> fmt::Result { 66 | f.write_str("['TokenKind', ...]") 67 | } 68 | 69 | fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error> 70 | where 71 | A: ::serde::de::SeqAccess<'de>, 72 | { 73 | let mut actual_length = 0; 74 | 75 | macro_rules! next { 76 | ($error_msg: expr) => (match seq.next_element()? { 77 | Some(value) => { 78 | #[allow(unused_assignments)] { 79 | actual_length += 1; 80 | } 81 | 82 | value 83 | }, 84 | None => return Err(DeError::invalid_length( 85 | actual_length, 86 | &$error_msg 87 | )) 88 | }) 89 | } 90 | 91 | let kind = next!("2 or more"); 92 | 93 | Ok(match kind { 94 | TokenKind::Character => Token::Character(next!("2")), 95 | TokenKind::Comment => Token::Comment(next!("2")), 96 | TokenKind::StartTag => Token::StartTag { 97 | name: { 98 | let mut value: String = next!("3 or 4"); 99 | value.make_ascii_lowercase(); 100 | value 101 | }, 102 | attributes: { 103 | let value: HashMap<String, String> = next!("3 or 4"); 104 | HashMap::from_iter(value.into_iter().map(|(mut k, v)| { 105 | k.make_ascii_lowercase(); 106 | (k, v) 107 | })) 108 | }, 109 | self_closing: seq.next_element()?.unwrap_or(false), 110 | }, 111 | TokenKind::EndTag => Token::EndTag { 112 | name: { 113 | let mut value: String = next!("2"); 114 | value.make_ascii_lowercase(); 115 | value 116 | }, 117 | }, 118 | TokenKind::Doctype => Token::Doctype { 119 | name: { 120 | let mut value: Option<String> = next!("5"); 121 | if let Some(ref mut value) = value { 122 | value.make_ascii_lowercase(); 123 | } 124 | value 125 | }, 126 | public_id: next!("5"), 127 | system_id: next!("5"), 128 | correctness: next!("5"), 129 | }, 130 | }) 131 | } 132 | } 133 | 134 | deserializer.deserialize_seq(Visitor) 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /rust/tests/unescape.rs: -------------------------------------------------------------------------------- 1 | use token::Token; 2 | use serde_json::error::Error; 3 | use serde_json::de::from_str as parse_json; 4 | 5 | pub trait Unescape { 6 | fn unescape(&mut self) -> Result<(), Error>; 7 | } 8 | 9 | impl Unescape for String { 10 | // dummy but does the job: wrap the value in quotes and let the JSON parser resolve the escapes 11 | fn unescape(&mut self) -> Result<(), Error> { 12 | *self = parse_json(&format!(r#""{}""#, self))?; 13 | Ok(()) 14 | } 15 | } 16 | 17 | impl<T: Unescape> Unescape for Option<T> { 18 | fn unescape(&mut self)
-> Result<(), Error> { 19 | if let Some(ref mut inner) = *self { 20 | inner.unescape()?; 21 | } 22 | Ok(()) 23 | } 24 | } 25 | 26 | impl Unescape for Token { 27 | fn unescape(&mut self) -> Result<(), Error> { 28 | match *self { 29 | Token::Character(ref mut s) | Token::Comment(ref mut s) => { 30 | s.unescape()?; 31 | } 32 | 33 | Token::EndTag { ref mut name } => { 34 | name.unescape()?; 35 | } 36 | 37 | Token::StartTag { 38 | ref mut name, 39 | ref mut attributes, 40 | .. 41 | } => { 42 | name.unescape()?; 43 | for value in attributes.values_mut() { 44 | value.unescape()?; 45 | } 46 | } 47 | 48 | Token::Doctype { 49 | ref mut name, 50 | ref mut public_id, 51 | ref mut system_id, 52 | .. 53 | } => { 54 | name.unescape()?; 55 | public_id.unescape()?; 56 | system_id.unescape()?; 57 | } 58 | } 59 | Ok(()) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /simplify-graph.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var fs = require('fs'); 4 | var graphlib = require('graphlib-dot'); 5 | 6 | var g = graphlib.read(fs.readFileSync(process.argv[2], 'utf-8')); 7 | // Merge parallel edges between each pair of states, grouping their labels, so the rendered graph stays readable. 8 | g.nodes().forEach(v => { 9 | var out = g.outEdges(v); 10 | var outW = new Set(out.map(edge => edge.w)); 11 | outW.forEach(w => { 12 | var labels = out.reduce((map, edge) => { 13 | if (edge.w === w) { 14 | var match = g.edge(v, w, edge.name).label.match(/^(.*?)((?:\(.*?\))?(?: \/ \w+(?:, \w+)*)?)$/); // split the label into its input-symbol part and its optional "(...)" / action suffix 15 | var strings = map.get(match[2] || ''); 16 | if (!strings) { 17 | map.set(match[2] || '', strings = []); 18 | } 19 | strings.push(match[1]); 20 | g.removeEdge(v, w, edge.name); 21 | } 22 | return map; 23 | }, new Map()); 24 | var label = Array.from(labels, ([ action, strings ]) => strings.join(' | ') + action).join('\n'); 25 | g.setEdge(v, w, { label }); 26 | }); 27 | }); 28 | 29 | fs.writeFileSync(process.argv[2], graphlib.write(g)); 30 | -------------------------------------------------------------------------------- /syntax/_helpers.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | TAB = '\t'; 5 | CR = '\r'; 6 | LF = '\n'; 7 | FF = '\f'; 8 | 9 | TagNameSpace = TAB | CR | LF | FF | ' '; 10 | 11 | TagNameEnd = TagNameSpace | '/' | '>'; 12 | 13 | _Quote = ('"' | "'"); 14 | 15 | _StartQuote = _Quote @SaveQuote; 16 | 17 | _EndQuote = _Quote when IsMatchingQuote; 18 | 19 | _UnsafeText = (any+ >CreateCharacter >UnsafeNull >StartSlice %EmitSlice)?; 20 | 21 | _EndTagEnd = ( 22 | TagNameSpace | 23 | '/' | 24 | '>' 25 | ) @Reconsume @To_EndTagNameContents; 26 | 27 | _SpecialEndTag = ( 28 | '/' >StartAppropriateEndTag 29 | (alpha when FeedAppropriateEndTag)* 30 | _EndTagEnd when IsAppropriateEndTagFed >CreateEndTagToken >SetAppropriateEndTagName 31 | ) @err(CreateCharacter) @err(EmitSlice) @err(Reconsume); 32 | }%% 33 | -------------------------------------------------------------------------------- /syntax/_navigation.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | action Reconsume { fhold; } 5 | 6 | action Next_Data { fnext Data; } 7 | 8 | action To_TagOpen { fgoto TagOpen; } 9 | action To_Data { fgoto Data; } 10 | action To_RCDataLessThanSign { fgoto RCDataLessThanSign; } 11 | action To_RCData { fgoto RCData; } 12 | action To_RawTextLessThanSign { fgoto RawTextLessThanSign; } 13 | action To_RawText { fgoto RawText; } 14 | action To_ScriptDataLessThanSign { fgoto ScriptDataLessThanSign; } 15 | 
action To_ScriptData { fgoto ScriptData; } 16 | action To_MarkupDeclarationOpen { fgoto MarkupDeclarationOpen; } 17 | action To_EndTagOpen { fgoto EndTagOpen; } 18 | action To_StartTagName { fgoto StartTagName; } 19 | action To_EndTagName { fgoto EndTagName; } 20 | action To_EndTagNameContents { fgoto EndTagNameContents; } 21 | action To_BogusComment { fgoto BogusComment; } 22 | action To_BeforeAttributeName { fgoto BeforeAttributeName; } 23 | action To_SelfClosingTag { fgoto SelfClosingTag; } 24 | action To_ScriptDataEscapedDashDash { fgoto ScriptDataEscapedDashDash; } 25 | action To_ScriptDataEscapedDash { fgoto ScriptDataEscapedDash; } 26 | action To_ScriptDataEscapedLessThanSign { fgoto ScriptDataEscapedLessThanSign; } 27 | action To_ScriptDataEscaped { fgoto ScriptDataEscaped; } 28 | action To_ScriptDataDoubleEscaped { fgoto ScriptDataDoubleEscaped; } 29 | action To_ScriptDataDoubleEscapedDash { fgoto ScriptDataDoubleEscapedDash; } 30 | action To_ScriptDataDoubleEscapedLessThanSign { fgoto ScriptDataDoubleEscapedLessThanSign; } 31 | action To_ScriptDataDoubleEscapedDashDash { fgoto ScriptDataDoubleEscapedDashDash; } 32 | action To_AttributeName { fgoto AttributeName; } 33 | action To_AfterAttributeName { fgoto AfterAttributeName; } 34 | action To_BeforeAttributeValue { fgoto BeforeAttributeValue; } 35 | action To_AttributeValueQuoted { fgoto AttributeValueQuoted; } 36 | action To_AfterAttributeValueQuoted { fgoto AfterAttributeValueQuoted; } 37 | action To_AttributeValueUnquoted { fgoto AttributeValueUnquoted; } 38 | action To_DocType { fgoto DocType; } 39 | action To_CDataSection { fgoto CDataSection; } 40 | action To_Comment { fgoto Comment; } 41 | action To_BeforeDocTypeName { fgoto BeforeDocTypeName; } 42 | action To_DocTypeName { fgoto DocTypeName; } 43 | action To_AfterDocTypeName { fgoto AfterDocTypeName; } 44 | action To_AfterDocTypePublicKeyword { fgoto AfterDocTypePublicKeyword; } 45 | action To_BeforeDocTypePublicIdentifier { fgoto BeforeDocTypePublicIdentifier; } 46 | action To_DocTypePublicIdentifierQuoted { fgoto DocTypePublicIdentifierQuoted; } 47 | action To_BogusDocType { fgoto BogusDocType; } 48 | action To_AfterDocTypePublicIdentifier { fgoto AfterDocTypePublicIdentifier; } 49 | action To_BetweenDocTypePublicAndSystemIdentifiers { fgoto BetweenDocTypePublicAndSystemIdentifiers; } 50 | action To_DocTypeSystemIdentifierQuoted { fgoto DocTypeSystemIdentifierQuoted; } 51 | action To_AfterDocTypeSystemKeyword { fgoto AfterDocTypeSystemKeyword; } 52 | action To_BeforeDocTypeSystemIdentifier { fgoto BeforeDocTypeSystemIdentifier; } 53 | action To_AfterDocTypeSystemIdentifier { fgoto AfterDocTypeSystemIdentifier; } 54 | }%% 55 | -------------------------------------------------------------------------------- /syntax/cdata.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | CDataSection := ( 5 | start: any* :> ( 6 | ']' @MarkPosition -> cdata_end 7 | ), 8 | 9 | cdata_end: ( 10 | ']' >1 -> cdata_end_right_bracket | 11 | any >0 @UnmarkPosition -> start 12 | ) @eof(UnmarkPosition), 13 | 14 | cdata_end_right_bracket: ']'* $AdvanceMarkedPosition <: ( 15 | '>' >1 -> final | 16 | any >0 @UnmarkPosition -> start 17 | ) @eof(UnmarkPosition) 18 | ) >CreateCharacter >StartSlice @EmitSlice @CreateCDataEnd @EmitToken @UnmarkPosition @eof(Err_EofInCData) <>eof(EmitSlice) @To_Data; 19 | }%% 20 | -------------------------------------------------------------------------------- /syntax/comment.rl: 
-------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | Comment := ( 5 | start: ( 6 | ( 7 | '-' -> comment_start_dash | 8 | '<' -> comment_less_than_sign | 9 | '>' @Err_AbruptClosingOfEmptyComment -> final 10 | ) >1 >MarkPosition | 11 | any >0 -> text_slice 12 | ), 13 | 14 | comment_start_dash: ( 15 | ( 16 | '-' -> comment_end | 17 | '>' @Err_AbruptClosingOfEmptyComment -> final 18 | ) >1 | 19 | any >0 -> text_slice 20 | ), 21 | 22 | text_slice: any* :> ( 23 | '<' @MarkPosition -> comment_less_than_sign | 24 | '-' @MarkPosition -> comment_end_dash 25 | ) @eof(MarkPosition), 26 | 27 | comment_less_than_sign: '<'* $AdvanceMarkedPosition <: ( 28 | '!' >1 -> comment_less_than_sign_bang | 29 | any >0 @Reconsume -> text_slice 30 | ) @eof(MarkPosition), 31 | 32 | comment_less_than_sign_bang: ( 33 | '-' >1 -> comment_less_than_sign_bang_dash | 34 | any >0 @Reconsume -> text_slice 35 | ), 36 | 37 | comment_less_than_sign_bang_dash: ( 38 | '-' >1 -> comment_less_than_sign_bang_dash_dash | 39 | any >0 @Reconsume -> text_slice 40 | ), 41 | 42 | comment_less_than_sign_bang_dash_dash: ( 43 | ( 44 | '>' -> final | 45 | '!' -> comment_end_bang 46 | ) >1 | 47 | any >0 @Err_NestedComment -> text_slice 48 | ), 49 | 50 | comment_end_dash: ( 51 | '-' >1 -> comment_end | 52 | any >0 -> text_slice 53 | ), 54 | 55 | comment_end: '-'* $AdvanceMarkedPosition <: ( 56 | ( 57 | '>' -> final | 58 | '!' -> comment_end_bang 59 | ) >1 | 60 | any >0 -> text_slice 61 | ), 62 | 63 | comment_end_bang: ( 64 | ( 65 | '-' @MarkPosition -> comment_end_dash | 66 | '>' @Err_IncorrectlyClosedComment -> final 67 | ) >1 | 68 | any >0 -> text_slice 69 | ) 70 | ) >StartSlice >eof(StartSlice) >eof(MarkPosition) @EndComment @EmitToken @UnmarkPosition @To_Data @eof(Err_EofInComment) @eof(EndComment) @eof(EmitToken); 71 | }%% 72 | -------------------------------------------------------------------------------- /syntax/data.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | Data := (any+ >CreateCharacter >AllowEntities >StartSlice %EmitSlice)? :> ( 5 | '<' @StartSlice @To_TagOpen 6 | )?; 7 | 8 | TagOpen := ( 9 | ( 10 | '!' @To_MarkupDeclarationOpen | 11 | '/' @To_EndTagOpen | 12 | alpha @CreateStartTagToken @StartSlice @To_StartTagName | 13 | '?'
@Err_UnexpectedQuestionMarkInsteadOfTagName @StartSlice @Reconsume @To_BogusComment 14 | ) >1 | 15 | any >0 @Err_InvalidFirstCharacterOfTagName @CreateCharacter @EmitSlice @Reconsume @To_Data 16 | ) @eof(Err_EofBeforeTagName) @eof(CreateCharacter) @eof(EmitSlice); 17 | 18 | BogusComment := any* %MarkPosition %EndComment %EmitToken %UnmarkPosition :> ('>' @To_Data)?; 19 | 20 | MarkupDeclarationOpen := ( 21 | '--' @To_Comment | 22 | /DOCTYPE/i @To_DocType | 23 | ('[CDATA' ( 24 | '[' when IsCDataAllowed @CreateCDataStart @EmitToken @To_CDataSection | 25 | '[' @Err_CDataInHtmlContent @To_BogusComment 26 | )) 27 | ) >StartSlice >err(StartSlice) $err(Err_IncorrectlyOpenedComment) $err(Reconsume) $err(To_BogusComment); 28 | }%% 29 | -------------------------------------------------------------------------------- /syntax/doctype.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | DocType := ( 5 | TagNameSpace >1 @To_BeforeDocTypeName | 6 | '>' >1 @Err_MissingDoctypeName @SetForceQuirksFlag @EmitToken @To_Data | 7 | any >0 @Err_MissingWhitespaceBeforeDoctypeName @StartSlice @To_DocTypeName 8 | ) >CreateDocType >eof(CreateDocType) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 9 | 10 | BeforeDocTypeName := TagNameSpace* <: ( 11 | '>' >1 @Err_MissingDoctypeName @SetForceQuirksFlag @EmitToken @To_Data | 12 | any >0 @StartSlice @To_DocTypeName 13 | ) >eof(CreateDocType) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 14 | 15 | DocTypeName := any* %SetDocTypeName %eof(SetDocTypeName) :> ( 16 | TagNameSpace | 17 | '>' 18 | ) @Reconsume @To_AfterDocTypeName @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 19 | 20 | AfterDocTypeName := TagNameSpace* $eof(Err_EofInDoctype) $eof(SetForceQuirksFlag) $eof(Reconsume) $eof(To_BogusDocType) ( 21 | ( 22 | '>' @EmitToken @To_Data | 23 | /PUBLIC/i @To_AfterDocTypePublicKeyword | 24 | /SYSTEM/i @To_AfterDocTypeSystemKeyword 25 | ) @err(Err_InvalidCharacterSequenceAfterDoctypeName) 26 | )? 
$err(SetForceQuirksFlag) $err(Reconsume) $err(To_BogusDocType); 27 | 28 | AfterDocTypePublicKeyword := ( 29 | TagNameSpace >1 @To_BeforeDocTypePublicIdentifier | 30 | _StartQuote >1 @Err_MissingSpaceAfterDoctypePublicKeyword @To_DocTypePublicIdentifierQuoted | 31 | any >0 @Reconsume @To_BeforeDocTypePublicIdentifier 32 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 33 | 34 | BeforeDocTypePublicIdentifier := TagNameSpace* <: ( 35 | _StartQuote >1 @To_DocTypePublicIdentifierQuoted | 36 | '>' >1 @Err_MissingDoctypePublicIdentifier @SetForceQuirksFlag @EmitToken @To_Data | 37 | any >0 @Err_MissingQuoteBeforeDoctypePublicIdentifier @SetForceQuirksFlag @Reconsume @To_BogusDocType 38 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 39 | 40 | DocTypePublicIdentifierQuoted := any* >StartSlice >eof(StartSlice) %SetDocTypePublicIdentifier %eof(SetDocTypePublicIdentifier) :> ( 41 | _EndQuote @To_AfterDocTypePublicIdentifier | 42 | '>' @Err_AbruptDoctypePublicIdentifier @SetForceQuirksFlag @EmitToken @To_Data 43 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 44 | 45 | AfterDocTypePublicIdentifier := ( 46 | ( 47 | TagNameSpace @To_BetweenDocTypePublicAndSystemIdentifiers | 48 | _StartQuote @Err_MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers @To_DocTypeSystemIdentifierQuoted | 49 | '>' @EmitToken @To_Data 50 | ) >1 | 51 | any >0 @Err_MissingQuoteBeforeDoctypeSystemIdentifier @SetForceQuirksFlag @Reconsume @To_BogusDocType 52 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 53 | 54 | BetweenDocTypePublicAndSystemIdentifiers := TagNameSpace* <: ( 55 | ( 56 | _StartQuote @To_DocTypeSystemIdentifierQuoted | 57 | '>' @EmitToken @To_Data 58 | ) >1 | 59 | any >0 @Err_MissingQuoteBeforeDoctypeSystemIdentifier @SetForceQuirksFlag @Reconsume @To_BogusDocType 60 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 61 | 62 | AfterDocTypeSystemKeyword := ( 63 | TagNameSpace >1 @To_BeforeDocTypeSystemIdentifier | 64 | _StartQuote >1 @Err_MissingSpaceAfterDoctypeSystemKeyword @To_DocTypeSystemIdentifierQuoted | 65 | any >0 @Reconsume @To_BeforeDocTypeSystemIdentifier 66 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 67 | 68 | BeforeDocTypeSystemIdentifier := TagNameSpace* <: ( 69 | _StartQuote >1 @To_DocTypeSystemIdentifierQuoted | 70 | '>' >1 @Err_MissingDoctypeSystemIdentifier @SetForceQuirksFlag @EmitToken @To_Data | 71 | any >0 @Err_MissingQuoteBeforeDoctypeSystemIdentifier @SetForceQuirksFlag @Reconsume @To_BogusDocType 72 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 73 | 74 | DocTypeSystemIdentifierQuoted := any* >StartSlice >eof(StartSlice) %SetDocTypeSystemIdentifier %eof(SetDocTypeSystemIdentifier) :> ( 75 | _EndQuote @To_AfterDocTypeSystemIdentifier | 76 | '>' @Err_AbruptDoctypeSystemIdentifier @SetForceQuirksFlag @EmitToken @To_Data 77 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 78 | 79 | AfterDocTypeSystemIdentifier := TagNameSpace* <: ( 80 | '>' >1 @EmitToken @To_Data | 81 | any >0 @Err_UnexpectedCharacterAfterDoctypeSystemIdentifier @Reconsume @To_BogusDocType 82 | ) @eof(Err_EofInDoctype) @eof(SetForceQuirksFlag) @eof(EmitToken); 83 | 84 | BogusDocType := any* :> '>' @EmitToken @To_Data @eof(EmitToken); 85 | }%% 86 | -------------------------------------------------------------------------------- /syntax/endtag.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 
| EndTagName := (any* :> _EndTagEnd >SetEndTagName) @eof(Err_EofInTag); 5 | 6 | EndTagNameContents := ( 7 | start: (TagNameSpace)* <: ( 8 | '/' >1 -> solidus | 9 | '>' >1 @EmitToken @To_Data | 10 | any+ >0 @Err_EndTagWithAttributes :> ( 11 | '/' -> start | 12 | '>' @EmitToken @To_Data | 13 | '=' TagNameSpace* <: ( 14 | _StartQuote >1 any* :> _EndQuote -> start | 15 | '>' >1 @EmitToken @To_Data | 16 | any+ >0 :> ( 17 | TagNameSpace -> start | 18 | '>' @EmitToken @To_Data 19 | ) 20 | ) 21 | ) 22 | ), 23 | solidus: ( 24 | '>' @Err_EndTagWithTrailingSolidus @EmitToken @To_Data | 25 | any >0 @Reconsume -> start 26 | ) 27 | ) @eof(Err_EofInTag); 28 | 29 | EndTagOpen := ( 30 | ( 31 | alpha @CreateEndTagToken @StartSlice @To_EndTagName | 32 | '>' @Err_MissingEndTagName @CreateUnparsed @EmitToken @To_Data 33 | ) >1 | 34 | any >0 @Err_InvalidFirstCharacterOfTagName @StartSlice @Reconsume @To_BogusComment 35 | ) @eof(Err_EofBeforeTagName) @eof(CreateCharacter) @eof(EmitSlice); 36 | }%% 37 | -------------------------------------------------------------------------------- /syntax/index.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | include '_navigation.rl'; 5 | include '_helpers.rl'; 6 | 7 | include 'data.rl'; 8 | 9 | include 'starttag.rl'; 10 | include 'endtag.rl'; 11 | 12 | include 'comment.rl'; 13 | include 'doctype.rl'; 14 | include 'cdata.rl'; 15 | 16 | include 'scriptdata.rl'; 17 | include 'rcdata.rl'; 18 | include 'rawtext.rl'; 19 | include 'plaintext.rl'; 20 | }%% 21 | -------------------------------------------------------------------------------- /syntax/plaintext.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | PlainText := _UnsafeText; 5 | }%% 6 | -------------------------------------------------------------------------------- /syntax/rawtext.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | RawText := _UnsafeText :> ( 5 | '<' @StartSlice @To_RawTextLessThanSign 6 | )?; 7 | 8 | RawTextLessThanSign := _SpecialEndTag @err(To_RawText); 9 | }%% 10 | -------------------------------------------------------------------------------- /syntax/rcdata.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | RCData := (any+ >CreateCharacter >UnsafeNull >AllowEntities >StartSlice %EmitSlice)? 
:> ( 5 | '<' @StartSlice @To_RCDataLessThanSign 6 | )?; 7 | 8 | RCDataLessThanSign := _SpecialEndTag @err(To_RCData); 9 | }%% 10 | -------------------------------------------------------------------------------- /syntax/scriptdata.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | ScriptData := _UnsafeText :> ( 5 | '<' @StartSlice @To_ScriptDataLessThanSign 6 | )?; 7 | 8 | ScriptDataLessThanSign := ( 9 | _SpecialEndTag | 10 | '!--' >CreateCharacter >UnsafeNull @To_ScriptDataEscapedDashDash 11 | ) @err(EmitSlice) @err(Reconsume) @err(To_ScriptData); 12 | 13 | ScriptDataEscaped := _UnsafeText :> ( 14 | '-' @CreateCharacter @UnsafeNull @StartSlice @To_ScriptDataEscapedDash | 15 | '<' @StartSlice @To_ScriptDataEscapedLessThanSign 16 | ) @eof(Err_EofInScriptHtmlCommentLikeText); 17 | 18 | ScriptDataEscapedDash := ( 19 | ( 20 | '-' @To_ScriptDataEscapedDashDash | 21 | '<' @EmitSlice @StartSlice @To_ScriptDataEscapedLessThanSign 22 | ) >1 | 23 | any >0 @EmitSlice @Reconsume @To_ScriptDataEscaped 24 | ) @eof(Err_EofInScriptHtmlCommentLikeText) @eof(EmitSlice); 25 | 26 | ScriptDataEscapedDashDash := '-'* <: ( 27 | ( 28 | '<' @EmitSlice @StartSlice @To_ScriptDataEscapedLessThanSign | 29 | '>' @EmitSlice @Reconsume @To_ScriptData 30 | ) >1 | 31 | any >0 @EmitSlice @Reconsume @To_ScriptDataEscaped 32 | ) @eof(Err_EofInScriptHtmlCommentLikeText) @eof(EmitSlice); 33 | 34 | ScriptDataEscapedLessThanSign := ( 35 | _SpecialEndTag | 36 | (/script/i TagNameEnd) @CreateCharacter @UnsafeNull @To_ScriptDataDoubleEscaped 37 | ) @err(CreateCharacter) @err(EmitSlice) @err(Reconsume) @err(To_ScriptDataEscaped); 38 | 39 | ScriptDataDoubleEscaped := any* :> ( 40 | '-' @To_ScriptDataDoubleEscapedDash | 41 | '<' @To_ScriptDataDoubleEscapedLessThanSign 42 | ) @eof(Err_EofInScriptHtmlCommentLikeText) @eof(EmitSlice); 43 | 44 | ScriptDataDoubleEscapedDash := ( 45 | ( 46 | '-' @To_ScriptDataDoubleEscapedDashDash | 47 | '<' @To_ScriptDataDoubleEscapedLessThanSign 48 | ) >1 | 49 | any >0 @To_ScriptDataDoubleEscaped 50 | ) @eof(Err_EofInScriptHtmlCommentLikeText) @eof(EmitSlice); 51 | 52 | ScriptDataDoubleEscapedDashDash := '-'* <: ( 53 | ( 54 | '<' @To_ScriptDataDoubleEscapedLessThanSign | 55 | '>' @EmitSlice @Reconsume @To_ScriptData 56 | ) >1 | 57 | any >0 @To_ScriptDataDoubleEscaped 58 | ) @eof(Err_EofInScriptHtmlCommentLikeText) @eof(EmitSlice); 59 | 60 | ScriptDataDoubleEscapedLessThanSign := ( 61 | '/' /script/i TagNameEnd @EmitSlice @Reconsume @To_ScriptDataEscaped 62 | ) @err(Reconsume) @err(To_ScriptDataDoubleEscaped); 63 | }%% 64 | -------------------------------------------------------------------------------- /syntax/starttag.rl: -------------------------------------------------------------------------------- 1 | %%{ 2 | machine html; 3 | 4 | _StartTagEnd = ( 5 | TagNameSpace @To_BeforeAttributeName | 6 | '/' @To_SelfClosingTag | 7 | '>' @SetLastStartTagName @Next_Data @EmitToken 8 | ); 9 | 10 | _AttributeNameChars = ( 11 | ('"' | "'" | '<') >1 @Err_UnexpectedCharacterInAttributeName | 12 | any >0 13 | ); 14 | 15 | StartTagName := (any* %SetStartTagName :> _StartTagEnd) @eof(Err_EofInTag); 16 | 17 | BeforeAttributeName := TagNameSpace* <: ( 18 | ('/' | '>') >1 @Reconsume @To_AfterAttributeName | 19 | '=' >1 @Err_UnexpectedEqualsSignBeforeAttributeName when CanCreateAttribute @StartSlice @To_AttributeName | 20 | _AttributeNameChars >0 when CanCreateAttribute @StartSlice @To_AttributeName 21 | ) @eof(Err_EofInTag); 22 | 23 | AttributeName := 
_AttributeNameChars* %AppendAttribute :> ( 24 | TagNameEnd @Reconsume @To_AfterAttributeName | 25 | '=' @To_BeforeAttributeValue 26 | ) @eof(Err_EofInTag); 27 | 28 | AfterAttributeName := TagNameSpace* <: ( 29 | ( 30 | _StartTagEnd | 31 | '=' @To_BeforeAttributeValue 32 | ) >1 | 33 | _AttributeNameChars >0 when CanCreateAttribute @StartSlice @To_AttributeName 34 | ) @eof(Err_EofInTag); 35 | 36 | BeforeAttributeValue := TagNameSpace* <: ( 37 | _StartQuote >1 @To_AttributeValueQuoted | 38 | '>' >1 @Err_MissingAttributeValue @Reconsume @To_AttributeValueUnquoted | 39 | any >0 @Reconsume @To_AttributeValueUnquoted 40 | ) @eof(Err_EofInTag); 41 | 42 | _AttrValueCharsQuoted = (any* >StartSlice %SetAttributeValue)?; 43 | 44 | AttributeValueQuoted := (_AttrValueCharsQuoted :> _EndQuote @To_AfterAttributeValueQuoted) @eof(Err_EofInTag); 45 | 46 | AfterAttributeValueQuoted := ( 47 | _StartTagEnd >1 | 48 | '=' >1 @Err_MissingWhitespaceBetweenAttributes @Err_UnexpectedEqualsSignBeforeAttributeName when CanCreateAttribute @StartSlice @To_AttributeName | 49 | _AttributeNameChars >0 @Err_MissingWhitespaceBetweenAttributes when CanCreateAttribute @StartSlice @To_AttributeName 50 | ) @eof(Err_EofInTag); 51 | 52 | _AttrValueCharsUnquoted = (( 53 | ('"' | "'" | '<' | '=' | '`') >1 @Err_UnexpectedCharacterInUnquotedAttributeValue | 54 | any >0 55 | )* >StartSlice %SetAttributeValue)?; 56 | 57 | AttributeValueUnquoted := (_AttrValueCharsUnquoted :> ((TagNameSpace | '>') & _StartTagEnd)) @eof(Err_EofInTag); 58 | 59 | SelfClosingTag := ( 60 | '>' >1 @SetSelfClosingFlag @SetLastStartTagName @Next_Data @EmitToken | 61 | any >0 @Err_UnexpectedSolidusInTag @Reconsume @To_BeforeAttributeName 62 | ) @eof(Err_EofInTag); 63 | }%% 64 | --------------------------------------------------------------------------------
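For orientation, here is a minimal sketch of how the pieces above fit together: the harness in rust/tests/test.rs doubles as a usage example for the Rust bindings, constructing a tokenizer, injecting handlers into its token pipeline, feeding input, and finalizing. The sketch below uses only calls that appear in those tests (`Tokenizer::new`, `inject_into`, `feed`, `end`, `Serializer::new`, `Feedback::new`); the buffer sizes and the sample input are illustrative assumptions, not requirements of the API.

```rust
// A minimal sketch, not a file from this repository: it mirrors the driver
// logic of rust/tests/test.rs above. The sizes (2048, 256, 64) and the input
// string are illustrative values borrowed from the tests, not requirements.
extern crate lazyhtml;

use lazyhtml::*;

fn main() {
    let html = String::from("<div class=x>hi</div>"); // assumed sample input
    let mut output = String::new();

    {
        // Handlers are declared before the tokenizer so that they outlive it,
        // matching the declaration order used by the tests.
        let mut serializer = Serializer::new(|chunk| {
            output += chunk;
        });
        let mut feedback = Feedback::new(64); // 64 mirrors the tests' capacity
        let mut tokenizer = Tokenizer::new(2048, 256);

        // Chain tree-construction feedback and the serializer into the
        // tokenizer's token pipeline.
        feedback.inject_into(&mut tokenizer);
        serializer.inject_into(&mut tokenizer);

        tokenizer.feed(&html).expect("Could not feed input");
        tokenizer.end().expect("Could not finalize input");
    }

    // The serializer has re-emitted the token stream; the second pass of the
    // tests above relies on this output tokenizing to the same tokens as the
    // original input.
    println!("{}", output);
}
```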