├── .gitmodules ├── LICENSE ├── Makefile ├── README.md ├── examples ├── defer_scripts.lua ├── hn.lua └── mixed_content_rewriter.lua ├── lolhtml.c ├── rockspecs └── lolhtml-dev-2.rockspec └── spec └── lolhtml.lua /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lol-html"] 2 | path = lol-html 3 | url = https://github.com/cloudflare/lol-html.git 4 | [submodule "lua-compat-5.3"] 5 | path = lua-compat-5.3 6 | url = https://github.com/keplerproject/lua-compat-5.3.git 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2019, Julien Desgats 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | LOLHTML_SRC_DIR=lol-html/c-api 2 | LOLHTML_STATIC_LIB=$(LOLHTML_SRC_DIR)/target/release/liblolhtml.a 3 | COMPAT_SRC_DIR=lua-compat-5.3/c-api 4 | 5 | all: lolhtml.so 6 | 7 | .PHONY: $(LOLHTML_STATIC_LIB) 8 | $(LOLHTML_STATIC_LIB): 9 | [ -d lol-html ] || ( echo "need to clone submodules" >&2 ; exit 1 ) 10 | cd lol-html/c-api && cargo build --release --locked 11 | 12 | lolhtml.o: lolhtml.c 13 | $(CC) -c -o $@ $(CFLAGS) -Wall -I"$(LOLHTML_SRC_DIR)/include" -I"$(COMPAT_SRC_DIR)" -fPIC $< 14 | 15 | lolhtml.so: $(LOLHTML_STATIC_LIB) lolhtml.o 16 | $(CC) -shared -o $@ -Wall -lpthread \ 17 | lolhtml.o \ 18 | -Wl,--whole-archive $(LOLHTML_STATIC_LIB) \ 19 | -Wl,--no-whole-archive 20 | 21 | clean: 22 | rm -fr lolhtml.o lolhtml.so 23 | 24 | distclean: clean 25 | cd lol-html/c-api && cargo clean 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Lua binding for lol-html 2 | ======================== 3 | 4 | This library is a Lua binding for [lol-html][lolhtml], a *Low output latency 5 | streaming HTML parser/rewriter with CSS selector-based API*. 
6 | 7 | It can be used to either extract data from HTML documents or rewrite them 8 | on-the-fly. 9 | 10 | Installation 11 | ------------ 12 | 13 | You need a functional setup of Rust and Cargo to be able to build this module. 14 | Please refer to the [Rust website][rust-install] or install it with your 15 | distribution's package manager. 16 | 17 | ### Luarocks (version >= 3.0 required) 18 | 19 | You can install this module with Luarocks: 20 | 21 | ``` 22 | luarocks install https://raw.githubusercontent.com/jdesgats/lua-lolhtml/master/rockspecs/lolhtml-dev-2.rockspec 23 | ``` 24 | 25 | ### Manual build 26 | 27 | First, be sure to clone this repository with its submodules. Then the provided 28 | Makefile should be able to build the module. 29 | 30 | ``` 31 | git clone --recursive https://github.com/jdesgats/lua-lolhtml.git 32 | make 33 | ``` 34 | 35 | Running the tests requires [my fork][telescope] of Telescope: 36 | 37 | ``` 38 | luarocks install https://raw.githubusercontent.com/jdesgats/telescope/master/rockspecs/telescope-scm-1.rockspec 39 | tsc spec/lolhtml.lua 40 | ``` 41 | 42 | Quick start 43 | ----------- 44 | 45 | The workflow is usually: 46 | 47 | 1. Create a [*rewriter builder*](#rewriterbuilder-objects) object: 48 | ```lua 49 | local lolhtml = require "lolhtml" 50 | local my_builder = lolhtml.new_rewriter_builder() 51 | ``` 52 | 2. Attach callbacks to it with the logic to transform your documents: 53 | ```lua 54 | my_builder:add_element_content_handlers { 55 | selector = lolhtml.new_selector("h1"), 56 | element_handler = function(el) el:set_attribute("class", "title") end 57 | } 58 | ``` 59 | 3. Use the previous builder to create [*rewriter*](#rewriter-objects) objects, 60 | one for each HTML page you want to work on: 61 | ```lua 62 | local my_rewriter = lolhtml.new_rewriter { 63 | builder = my_builder, 64 | sink = function(s) print(s) end, 65 | } 66 | ``` 67 | 4. 
Feed the rewriter with the actual HTML stream: 68 | ```lua 69 | for l in io.stdin:lines() do 70 | my_rewriter:write(l .. "\n") 71 | end 72 | my_rewriter:close() 73 | ``` 74 | 75 | The `examples` directory contains a port of the original Rust examples from 76 | lol-html. You can run them by feeding an HTML page as input: 77 | 78 | ```sh 79 | curl -NL https://git.io/JeOSZ | lua examples/defer_scripts.lua 80 | ``` 81 | 82 | Status 83 | ------ 84 | 85 | **ALPHA VERSION** 86 | 87 | This binding is not finished yet. Even though the test coverage is quite good, 88 | the tests pass and Valgrind is not complaining, bugs might still be present. 89 | 90 | Also, the API is not frozen and might change. Here is a non-exhaustive list 91 | of things that I still consider: 92 | 93 | * API naming: stay close to the original names, or choose shorter ones 94 | * Selectors: should they be exposed at all? or compiled and cached transparently 95 | * Some data could be exposed as attributes rather than methods, is it better? 96 | * Tables vs. lots of arguments for some functions 97 | * Error handling: when to raise errors, when to return `nil, err` 98 | 99 | Reference 100 | --------- 101 | 102 | This library tries to stay close to the original API, while being more Lua-ish 103 | when appropriate. In particular it should not panic (as in triggering 104 | `SIGABRT`); any such case would be considered a bug. 105 | 106 | ### Top-level objects 107 | 108 | Object constructors: 109 | 110 | * `lolhtml.new_selector`: see [`Selector`](#selector-objects) 111 | * `lolhtml.new_rewriter_builder`: see [`RewriterBuilder`](#rewriterbuilder-objects) 112 | * `lolhtml.new_rewriter`: see [`Rewriter`](#rewriter-objects) 113 | 114 | Constants: 115 | 116 | * `lolhtml.CONTINUE` 117 | * `lolhtml.STOP` 118 | 119 | ### Selector objects 120 | 121 | Selector objects represent a parsed CSS selector that can be used to build 122 | rewriter builders. 123 | 124 | Selector objects don't have any methods or attributes. 
They are exposed only 125 | for garbage collection purposes (and also as an optimization if you need to 126 | reuse the same selector in multiple builders). 127 | 128 | #### `lolhtml.new_selector(sel: string) => Selector | nil, err` 129 | 130 | Builds a new [`Selector`](#selector-objects) object out of the given string. 131 | Returns `nil, err` in case of syntax error. 132 | 133 | ### RewriterBuilder objects 134 | 135 | The `RewriterBuilder` encapsulates the logic to make rewrites, usually they are 136 | created at program startup and are used to instantiate many `Rewriter` objects. 137 | 138 | All callback functions are called with a single argument whose type depends on 139 | the type of callback. This argument should not outlive the callback and any 140 | attempt to keep a reference to it to use it later will result in an error. 141 | 142 | These functions can return: 143 | 144 | * `lolhtml.CONTINUE`: instructs the parser to continue processing the HTML 145 | stream 146 | * `lolhtml.STOP`: causes the parser to stop immediately, `write()` or `close()` 147 | methods of the rewriter will return an error code 148 | * *nothing*: same as `lolhtml.CONTINUE` 149 | 150 | If a callback raises an error, it will also cause the rewriter to stop 151 | immediately. The error object or message will be returned as error by the 152 | `write()` or `close()` methods of the rewriter. 153 | 154 | #### `lolhtml.new_rewriter_builder() => RewriterBuilder` 155 | 156 | Creates a new `RewriterBuilder` object. 157 | 158 | #### `RewriterBuilder:add_document_content_handlers(callbacks) => self` 159 | 160 | Adds new document-level content handlers. This function might be called 161 | multiple times to add multiple handlers. 162 | 163 | The `callbacks` parameter must be a table with callbacks for different types 164 | of events, the possible fields are: 165 | 166 | * `doctype_handler`: called after parsing the Document Type declaration with 167 | a [`Doctype`](#doctype-objects) object. 
168 | * `comment_handler`: called whenever a comment is parsed with a 169 | [`Comment`](#comment-objects) object. 170 | * `text_handler`: called when text nodes are parsed with a 171 | [`TextChunk`](#textchunk-objects) object. 172 | * `doc_end_handler`: called at the end of the document with a 173 | [`DocumentEnd`](#documentend-objects) object. 174 | 175 | All of the fields are optional. Calling a callback has a cost so leave out any 176 | callback you don't need. 177 | 178 | #### `RewriterBuilder:add_element_content_handlers(callbacks) => self` 179 | 180 | Adds new element content handlers associated with a selector. This function 181 | might be called multiple times to add multiple handlers for different 182 | selectors. 183 | 184 | The `callbacks` parameter must be a table with the selector and the callbacks 185 | for different types of events, the possible fields are: 186 | 187 | * `selector`: the [CSS selector](#selector-objects) to call the callbacks on 188 | (required) 189 | * `comment_handler`: called whenever a comment is parsed with a 190 | [`Comment`](#comment-objects) object. 191 | * `text_handler`: called when text nodes are parsed with a 192 | [`TextChunk`](#textchunk-objects) object. 193 | * `element_handler`: called when an element is parsed with a 194 | [`Element`](#element-objects) object. 195 | 196 | All of the fields are optional (except `selector`). Calling a callback has a 197 | cost so leave out any callback you don't need. 198 | 199 | 200 | ### Rewriter objects 201 | 202 | Rewriter objects process a single HTML document and are instantiated with 203 | a [`RewriterBuilder`](#rewriterbuilder-objects) object. 204 | 205 | Each rewriter has an associated `sink`, which is a function called to output 206 | the rewritten HTML. 207 | 208 | #### `lolhtml.new_rewriter(options) => Rewriter | nil, err` 209 | 210 | Creates a new rewriter object. 
The `options` argument must be a table, the 211 | following fields are allowed: 212 | 213 | * `builder`: a `RewriterBuilder` object (required) 214 | * `encoding`: the text encoding for the HTML stream. Can be a label for any of 215 | the web-compatible encodings with an exception for `UTF-16LE`, `UTF-16BE`, 216 | `ISO-2022-JP` and `replacement` (these non-ASCII-compatible encodings are 217 | not supported). (optional, default is `"utf-8"`) 218 | * `preallocated_parsing_buffer_size`: Specifies the number of bytes that should 219 | be preallocated on HtmlRewriter instantiation for the internal parsing 220 | buffer. See [lol-html documentation][lolhtml-memory] for details. (optional, 221 | default is 1024) 222 | * `max_allowed_memory_usage`: Sets a hard limit in bytes on memory consumption 223 | of a Rewriter instance. See [lol-html documentation][lolhtml-memory] for 224 | details. (optional, default is `SIZE_MAX`) 225 | * `strict`: boolean, if set to true the rewriter bails out if it encounters 226 | markup that drives the HTML parser into an ambiguous state. See 227 | [lol-html documentation][lolhtml-strict] for details. (optional, default is 228 | `false`) 229 | 230 | Returns the new Rewriter on success, or `nil` and an error message on failure. 231 | 232 | #### `Rewriter:write(s) => self | nil, err` 233 | 234 | Writes an HTML chunk to the rewriter. Returns the rewriter itself on success, or `nil` 235 | and an error message on failure. Failure happens if (incomplete list): 236 | 237 | * A callback or a sink raises an error 238 | * A previous invocation returned an error 239 | * Called after `close` 240 | 241 | #### `Rewriter:close() => self | nil, err` 242 | 243 | Finalizes the rewriting process. Should be called once the last chunk of the 244 | input is written. Returns the rewriter itself on success, or `nil` and an 245 | error message on failure. 
Failure happens if (incomplete list): 246 | 247 | * A callback or a sink raises an error 248 | * A previous invocation returned an error 249 | * Called more than once 250 | 251 | 252 | ### Doctype objects 253 | 254 | #### `Doctype:get_name() => string|nil` 255 | #### `Doctype:get_id() => string|nil` 256 | #### `Doctype:get_system_id() => string|nil` 257 | 258 | ### Comment objects 259 | 260 | #### `Comment:get_text() => string` 261 | #### `Comment:set_text(string) => self|nil, err` 262 | #### `Comment:before(string, is_html) => self|nil, err` 263 | #### `Comment:after(string, is_html) => self|nil, err` 264 | #### `Comment:replace(string, is_html) => self|nil, err` 265 | #### `Comment:remove() => self|nil, err` 266 | #### `Comment:is_removed() => boolean` 267 | 268 | ### TextChunk objects 269 | 270 | #### `TextChunk:get_text() => string` 271 | #### `TextChunk:is_last_in_text_node() => boolean` 272 | #### `TextChunk:before(string, is_html) => self|nil, err` 273 | #### `TextChunk:after(string, is_html) => self|nil, err` 274 | #### `TextChunk:replace(string, is_html) => self|nil, err` 275 | #### `TextChunk:remove() => self|nil, err` 276 | #### `TextChunk:is_removed() => boolean` 277 | 278 | ### Element objects 279 | 280 | #### `Element:get_tag_name() => string` 281 | #### `Element:get_namespace_uri() => string` 282 | #### `Element:get_attribute(name) => string|nil` 283 | #### `Element:has_attribute(name) => boolean|nil, err` 284 | #### `Element:set_attribute(name, value) => self|nil, err` 285 | #### `Element:remove_attribute(name) => self|nil, err` 286 | #### `Element:attributes() => iterator` 287 | 288 | Returns a Lua iterator triplet so the following construction is valid: 289 | 290 | ```lua 291 | for attr_name, value in element:attributes() do 292 | ... 
293 | end 294 | ``` 295 | 296 | #### `Element:before(string, is_html) => self|nil, err` 297 | #### `Element:after(string, is_html) => self|nil, err` 298 | #### `Element:prepend(string, is_html) => self|nil, err` 299 | #### `Element:append(string, is_html) => self|nil, err` 300 | #### `Element:set_inner_content(string, is_html) => self|nil, err` 301 | #### `Element:replace(string, is_html) => self|nil, err` 302 | #### `Element:remove() => self|nil, err` 303 | #### `Element:remove_and_keep_content() => self|nil, err` 304 | #### `Element:is_removed() => boolean` 305 | 306 | ### DocumentEnd objects 307 | 308 | #### `DocumentEnd:append(string, is_html) => self|nil, err` 309 | 310 | 311 | [lolhtml]: https://github.com/cloudflare/lol-html 312 | [lolhtml-memory]: https://docs.rs/lol_html/0.1.0/lol_html/struct.MemorySettings.html 313 | [lolhtml-strict]: https://docs.rs/lol_html/0.1.0/lol_html/struct.Settings.html#structfield.strict 314 | [rust-install]: https://www.rust-lang.org/tools/install 315 | [telescope]: https://github.com/jdesgats/telescope 316 | -------------------------------------------------------------------------------- /examples/defer_scripts.lua: -------------------------------------------------------------------------------- 1 | -- Reads HTML from the stdin stream and defers render-blocking scripts, then 2 | -- streams the result to the stdout. 3 | 4 | local lolhtml = require "lolhtml" 5 | 6 | -- create the rewriter 7 | local rewriter = lolhtml.new_rewriter { 8 | builder = lolhtml.new_rewriter_builder() 9 | :add_element_content_handlers { 10 | selector = lolhtml.new_selector("script[src]:not([async]):not([defer])"), 11 | element_handler = function(el) 12 | el:set_attribute("defer", "") 13 | end 14 | }, 15 | -- just write the output to stdout 16 | sink = function(s) 17 | io.stdout:write(s) 18 | end, 19 | } 20 | 21 | -- feed from stdin to the rewriter; write/close return nil, err on failure, so fail loudly 22 | for l in io.stdin:lines() do 23 | assert(rewriter:write(l .. 
"\n")) 24 | end 25 | assert(rewriter:close()) 26 | -------------------------------------------------------------------------------- /examples/hn.lua: -------------------------------------------------------------------------------- 1 | -- This script will grab title and links from the Hacker News front page. 2 | -- This example demonstrates how to extract content from a page without 3 | -- necessarily rewriting it. 4 | -- 5 | -- Run it like this: 6 | -- curl -s https://news.ycombinator.com/ | lua hn.lua 7 | 8 | local lolhtml = require "lolhtml" 9 | 10 | -- This table will hold all our links 11 | local links = {} 12 | 13 | -- this variable will store the currently parsed link as the content extraction 14 | -- will span across different callbacks 15 | -- Note that `links[#links]` should be equivalent. 16 | local current_link 17 | 18 | local builder = lolhtml.new_rewriter_builder() 19 | :add_element_content_handlers { 20 | selector = lolhtml.new_selector "a.storylink", 21 | element_handler = function(el) 22 | -- This is called right after parsing the opening anchor: create a new 23 | -- link table and grab the target. the `tmp` field will be used as an 24 | -- accumulator 25 | current_link = { href = el:get_attribute("href"), tmp = {} } 26 | table.insert(links, current_link) 27 | end, 28 | text_handler = function(t) 29 | -- Grabbing text is a bit more involved than attributes as the callback 30 | -- might be called an arbitrary number of times (depending on how the text 31 | -- is fed into the parser). 32 | -- Here we use the accumulator to keep the whole text until the anchor 33 | -- tag is closed. 34 | table.insert(current_link.tmp, t:get_text()) 35 | 36 | if t:is_last_in_text_node() then 37 | -- At this point the anchor tag is being closed and we are sure we 38 | -- grabbed the entire text. 
39 | current_link.text = table.concat(current_link.tmp) 40 | current_link.tmp = {} -- reset the accumulator 41 | end 42 | end 43 | } 44 | -- grab the score: we need another selector for that 45 | :add_element_content_handlers { 46 | selector = lolhtml.new_selector "span.score", 47 | text_handler = function(t) 48 | -- This callback is called after the above one (as long as the page 49 | -- structure doesn't change). So we should have a current_link object. 50 | -- Apply the same accumulator technique as above. 51 | table.insert(current_link.tmp, t:get_text()) 52 | 53 | if t:is_last_in_text_node() then 54 | local score = table.concat(current_link.tmp) 55 | current_link.tmp = {} 56 | 57 | -- now we just want the actual score as a number, not the text 58 | current_link.points = tonumber(score:match("^(%d+) points?$")) or -1 59 | end 60 | end 61 | } 62 | 63 | local rewriter = lolhtml.new_rewriter { 64 | builder = builder, 65 | -- here we don't care about the output, just throw it away 66 | sink = function() end, 67 | } 68 | 69 | while true do 70 | local chunk = io.stdin:read(1024) 71 | if not chunk then break end 72 | assert(rewriter:write(chunk)) 73 | end 74 | assert(rewriter:close()) 75 | 76 | for _, l in ipairs(links) do -- a link may lack text, href or a score span (then `points` was never set): fall back instead of letting string.format raise on nil 77 | io.stdout:write(string.format("%s\n\tpoints: %d\n\tlink: %s\n", l.text or "", l.points or -1, l.href or "")) 78 | end 79 | -------------------------------------------------------------------------------- /examples/mixed_content_rewriter.lua: -------------------------------------------------------------------------------- 1 | -- Reads HTML from the stdin stream, rewrites mixed content in it and streams 2 | -- the result to the stdout. 
3 | 4 | local lolhtml = require "lolhtml" 5 | 6 | local function rewrite_url_in_attr(el, attr) 7 | local val = el:get_attribute(attr):gsub("http://", "https://") 8 | el:set_attribute(attr, val) 9 | end 10 | 11 | -- create the rewriter 12 | local rewriter = lolhtml.new_rewriter { 13 | builder = lolhtml.new_rewriter_builder() 14 | :add_element_content_handlers { 15 | selector = lolhtml.new_selector("a[href], link[rel=stylesheet][href]"), 16 | element_handler = function(el) rewrite_url_in_attr(el, "href") end, 17 | } 18 | :add_element_content_handlers { 19 | selector = lolhtml.new_selector("script[src], iframe[src], img[src], audio[src], video[src]"), 20 | element_handler = function(el) rewrite_url_in_attr(el, "src") end, 21 | }, 22 | -- just write the output to stdout 23 | sink = function(s) 24 | io.stdout:write(s) 25 | end, 26 | } 27 | 28 | -- feed from stdin to the rewriter 29 | for l in io.stdin:lines() do 30 | assert(rewriter:write(l .. "\n")) 31 | end 32 | assert(rewriter:close()) 33 | -------------------------------------------------------------------------------- /lolhtml.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define PREFIX "lolhtml." 10 | 11 | /* VM registry name for a module wide sub-registry used to keep references to 12 | * actual objects using `luaL_ref` in case where we need to retrieve Lua objects 13 | * from callbacks. 14 | * This table will have weak values so it doesn't prevent GC. 
15 | */ 16 | #define LOL_REGISTRY (PREFIX "weakreg") 17 | 18 | /* rewriter uservalue indices */ 19 | /* note: for now the uservalue is a Lua table with numeric indices, but Lua 5.4 20 | * allows multiple user values, that should be more efficient */ 21 | #define REWRITER_CALLBACK_INDEX 1 22 | #define REWRITER_BUILDER_INDEX 2 23 | #define REWRITER_ERROR_INDEX 3 24 | /* context for one handler: the Lua state to call into, the key of a userdata in the LOL_REGISTRY table (builder_index) and the key of the Lua callback inside that userdata's uservalue (callback_index) */ 25 | typedef struct { 26 | lua_State *L; 27 | int builder_index; 28 | int callback_index; 29 | } handler_data_t; 30 | /* pushes the content of `s` as a Lua string, then releases both the string data and the struct itself; pushes nil when `s` is NULL. Always pushes exactly one value. */ 31 | static void push_lol_str_maybe(lua_State *L, lol_html_str_t *s) { 32 | if (s == NULL) { 33 | lua_pushnil(L); 34 | } else { 35 | lua_pushlstring(L, s->data, s->len); 36 | lol_html_str_free(*s); 37 | free(s); 38 | } 39 | } 40 | /* takes the last lol-html error and pushes `nil, message` ("unknown error" when no error is available); the error string is released after being copied. Always returns 2. */ 41 | static int push_last_error(lua_State *L) { 42 | lol_html_str_t *err = lol_html_take_last_error(); 43 | lua_pushnil(L); 44 | if (err == NULL) { 45 | lua_pushliteral(L, "unknown error"); 46 | } else { 47 | lua_pushlstring(L, err->data, err->len); 48 | lol_html_str_free(*err); 49 | free(err); 50 | } 51 | return 2; 52 | } 53 | 54 | /* checks a function result code and prepares a method return to Lua: 55 | * if zero, shrink the stack to 1 (the self argument) and returns 1 56 | * otherwise, pushes nil and the error message and returns 2 57 | */ 58 | static int return_self_or_err(lua_State *L, int rc) { 59 | if (rc != 0) { 60 | return push_last_error(L); 61 | } 62 | lua_settop(L, 1); 63 | return 1; 64 | } 65 | 66 | /* helper function used for rewriter builder callbacks where the userdata is a 67 | * pointer to a pointer. These objects should not be used outside of the 68 | * callback but nothing prevents the Lua code to keep references around. These 69 | * references are NULL'd after the callback so this helper can detect this and 70 | * throw regular errors. 71 | * 72 | * Note: the `void *` should be `void**` but this would require explicit casts 73 | * all over the place. 
74 | */ 75 | static void* check_valid_udata(lua_State *L, int arg, const char *tname) { 76 | void **ptr = luaL_checkudata(L, arg, tname); 77 | if (*ptr == NULL) { 78 | luaL_argerror(L, arg, "attempt to use a value past its lifetime"); 79 | } 80 | return ptr; 81 | } 82 | 83 | /* shared trampoline for all content handlers (document-level ones and the element handler): looks up the Lua callback designated by `handler`, calls it in protected mode with `param` wrapped in a fresh userdata whose metatable is `param_type`, and converts its result into a directive. On any failure an error value is left on the Lua stack and LOL_HTML_STOP is returned. */ 84 | static lol_html_rewriter_directive_t 85 | do_document_content_callback(const char *param_type, void *param, handler_data_t *handler) { 86 | lol_html_rewriter_directive_t directive; 87 | lua_State *L = handler->L; 88 | 89 | /* locate the handler to call */ 90 | lua_getfield(L, LUA_REGISTRYINDEX, LOL_REGISTRY); /* reg */ 91 | lua_rawgeti(L, -1, handler->builder_index); /* reg, ud */ 92 | lua_getuservalue(L, -1); /* reg, ud, uv */ 93 | lua_rawgeti(L, -1, handler->callback_index); /* reg, ud, uv, cb */ 94 | lua_replace(L, -4); /* cb, ud, uv */ 95 | lua_pop(L, 2); /* cb */ 96 | 97 | /* allocate the parameter object */ 98 | void **lua_param = lua_newuserdata(handler->L, sizeof(void *)); 99 | luaL_getmetatable(L, param_type); 100 | lua_setmetatable(L, -2); 101 | *lua_param = param; 102 | 103 | int rc = lua_pcall(L, 1, 1, 0); 104 | *lua_param = NULL; /* signals that this value cannot be used anymore */ 105 | if (rc != LUA_OK) { 106 | /* in case of error, just leave the error on the stack, the calling 107 | * site will check if the stack level changed and assume an error 108 | * happened if it did */ 109 | return LOL_HTML_STOP; 110 | } 111 | /* accepted results: nil (treated as continue) or an integer equal to one of the two directives; anything else is rejected below */ 112 | switch (lua_type(L, -1)) { 113 | case LUA_TNIL: /* no return value => assume continue */ 114 | directive = LOL_HTML_CONTINUE; 115 | break; 116 | case LUA_TNUMBER: { 117 | int isnum; 118 | lua_Integer result = lua_tointegerx(L, -1, &isnum); 119 | if (!isnum) goto type_error; 120 | if (result == LOL_HTML_CONTINUE || result == LOL_HTML_STOP) { 121 | directive = result; 122 | } else goto type_error; 123 | break; 124 | } 125 | default: goto type_error; 126 | } 127 | 128 | lua_pop(L, 1); /* pop the function result */ 
129 | return directive; 130 | /* invalid return value: swap it for an error message left on the stack, as the pcall failure path does */ 131 | type_error: 132 | lua_pop(L, 1); /* pop the function result */ 133 | lua_pushliteral(L, "invalid content handler return"); 134 | return LOL_HTML_STOP; 135 | } 136 | /* typed handler entry points: each one forwards to do_document_content_callback with the metatable name matching its argument; `user_data` is used as the handler_data_t pointer */ 137 | static lol_html_rewriter_directive_t 138 | doctype_handler(lol_html_doctype_t *doctype, void *user_data) 139 | { 140 | return do_document_content_callback(PREFIX "doctype", doctype, user_data); 141 | } 142 | 143 | static lol_html_rewriter_directive_t 144 | comment_handler(lol_html_comment_t *comment, void *user_data) 145 | { 146 | return do_document_content_callback(PREFIX "comment", comment, user_data); 147 | } 148 | 149 | static lol_html_rewriter_directive_t 150 | text_chunk_handler(lol_html_text_chunk_t *chunk, void *user_data) 151 | { 152 | return do_document_content_callback(PREFIX "text_chunk", chunk, user_data); 153 | } 154 | 155 | static lol_html_rewriter_directive_t 156 | doc_end_handler(lol_html_doc_end_t *doc_end, void *user_data) 157 | { 158 | return do_document_content_callback(PREFIX "doc_end", doc_end, user_data); 159 | } 160 | 161 | static lol_html_rewriter_directive_t 162 | element_handler(lol_html_element_t *element, void *user_data) 163 | { 164 | return do_document_content_callback(PREFIX "element", element, user_data); 165 | } 166 | 167 | /* doctype: each getter pushes one value, a string or nil when lol-html returns NULL */ 168 | static int doctype_get_name(lua_State *L) { 169 | const lol_html_doctype_t **doctype = check_valid_udata(L, 1, PREFIX "doctype"); 170 | push_lol_str_maybe(L, lol_html_doctype_name_get(*doctype)); 171 | return 1; 172 | } 173 | /* note: this getter returns the doctype's PUBLIC identifier */ 174 | static int doctype_get_id(lua_State *L) { 175 | const lol_html_doctype_t **doctype = check_valid_udata(L, 1, PREFIX "doctype"); 176 | push_lol_str_maybe(L, lol_html_doctype_public_id_get(*doctype)); 177 | return 1; 178 | } 179 | 180 | static int doctype_get_system_id(lua_State *L) { 181 | const lol_html_doctype_t **doctype = check_valid_udata(L, 1, PREFIX "doctype"); 182 | push_lol_str_maybe(L, lol_html_doctype_system_id_get(*doctype)); 183 | return 1; 184 | } 
185 | 186 | static luaL_Reg doctype_methods[] = { 187 | { "get_name", doctype_get_name }, 188 | { "get_id", doctype_get_id }, 189 | { "get_system_id", doctype_get_system_id }, 190 | { NULL, NULL } 191 | }; 192 | 193 | /* comment */ 194 | static int comment_get_text(lua_State *L) { 195 | const lol_html_comment_t **comment = check_valid_udata(L, 1, PREFIX "comment"); 196 | lol_html_str_t text = lol_html_comment_text_get(*comment); 197 | lua_pushlstring(L, text.data, text.len); 198 | lol_html_str_free(text); 199 | return 1; 200 | } 201 | 202 | static int comment_set_text(lua_State *L) { 203 | size_t text_len; 204 | lol_html_comment_t **comment = check_valid_udata(L, 1, PREFIX "comment"); 205 | const char *text = luaL_checklstring(L, 2, &text_len); 206 | return return_self_or_err(L, lol_html_comment_text_set(*comment, text, text_len)); 207 | } 208 | 209 | static int comment_before(lua_State *L) { 210 | size_t content_len; 211 | lol_html_comment_t **comment = check_valid_udata(L, 1, PREFIX "comment"); 212 | const char *content = luaL_checklstring(L, 2, &content_len); 213 | bool is_html = lua_toboolean(L, 3); 214 | return return_self_or_err(L, lol_html_comment_before(*comment, content, content_len, is_html)); 215 | } 216 | 217 | static int comment_after(lua_State *L) { 218 | size_t content_len; 219 | lol_html_comment_t **comment = check_valid_udata(L, 1, PREFIX "comment"); 220 | const char *content = luaL_checklstring(L, 2, &content_len); 221 | bool is_html = lua_toboolean(L, 3); 222 | return return_self_or_err(L, lol_html_comment_after(*comment, content, content_len, is_html)); 223 | } 224 | 225 | static int comment_replace(lua_State *L) { 226 | size_t content_len; 227 | lol_html_comment_t **comment = check_valid_udata(L, 1, PREFIX "comment"); 228 | const char *content = luaL_checklstring(L, 2, &content_len); 229 | bool is_html = lua_toboolean(L, 3); 230 | return return_self_or_err(L, lol_html_comment_replace(*comment, content, content_len, is_html)); 231 | } 232 | 233 
| static int comment_remove(lua_State *L) { 234 | lol_html_comment_t **comment = check_valid_udata(L, 1, PREFIX "comment"); 235 | lol_html_comment_remove(*comment); 236 | return return_self_or_err(L, 0); /* cannot fail */ 237 | } 238 | 239 | static int comment_is_removed(lua_State *L) { 240 | lol_html_comment_t **comment = check_valid_udata(L, 1, PREFIX "comment"); 241 | lua_pushboolean(L, lol_html_comment_is_removed(*comment)); 242 | return 1; 243 | } 244 | 245 | static luaL_Reg comment_methods[] = { 246 | { "get_text", comment_get_text }, 247 | { "set_text", comment_set_text }, 248 | { "before", comment_before }, 249 | { "after", comment_after }, 250 | { "replace", comment_replace }, 251 | { "remove", comment_remove }, 252 | { "is_removed", comment_is_removed }, 253 | { NULL, NULL } 254 | }; 255 | 256 | 257 | /* text_chunk */ 258 | static int text_chunk_get_text(lua_State *L) { 259 | const lol_html_text_chunk_t **chunk = check_valid_udata(L, 1, PREFIX "text_chunk"); 260 | lol_html_text_chunk_content_t content = lol_html_text_chunk_content_get(*chunk); 261 | lua_pushlstring(L, content.data, content.len); 262 | return 1; 263 | } 264 | 265 | static int text_chunk_is_last_in_text_node(lua_State *L) { 266 | const lol_html_text_chunk_t **chunk = check_valid_udata(L, 1, PREFIX "text_chunk"); 267 | lua_pushboolean(L, lol_html_text_chunk_is_last_in_text_node(*chunk)); 268 | return 1; 269 | } 270 | 271 | static int text_chunk_before(lua_State *L) { 272 | size_t content_len; 273 | lol_html_text_chunk_t **chunk = check_valid_udata(L, 1, PREFIX "text_chunk"); 274 | const char *content = luaL_checklstring(L, 2, &content_len); 275 | bool is_html = lua_toboolean(L, 3); 276 | return return_self_or_err(L, lol_html_text_chunk_before(*chunk, content, content_len, is_html)); 277 | } 278 | 279 | static int text_chunk_after(lua_State *L) { 280 | size_t content_len; 281 | lol_html_text_chunk_t **chunk = check_valid_udata(L, 1, PREFIX "text_chunk"); 282 | const char *content = 
luaL_checklstring(L, 2, &content_len); 283 | bool is_html = lua_toboolean(L, 3); 284 | return return_self_or_err(L, lol_html_text_chunk_after(*chunk, content, content_len, is_html)); 285 | } 286 | 287 | static int text_chunk_replace(lua_State *L) { 288 | size_t content_len; 289 | lol_html_text_chunk_t **chunk = check_valid_udata(L, 1, PREFIX "text_chunk"); 290 | const char *content = luaL_checklstring(L, 2, &content_len); 291 | bool is_html = lua_toboolean(L, 3); 292 | return return_self_or_err(L, lol_html_text_chunk_replace(*chunk, content, content_len, is_html)); 293 | } 294 | 295 | static int text_chunk_remove(lua_State *L) { 296 | lol_html_text_chunk_t **chunk = check_valid_udata(L, 1, PREFIX "text_chunk"); 297 | lol_html_text_chunk_remove(*chunk); 298 | return return_self_or_err(L, 0); /* cannot fail */ 299 | } 300 | 301 | static int text_chunk_is_removed(lua_State *L) { 302 | const lol_html_text_chunk_t **chunk = check_valid_udata(L, 1, PREFIX "text_chunk"); 303 | lua_pushboolean(L, lol_html_text_chunk_is_removed(*chunk)); 304 | return 1; 305 | } 306 | 307 | static luaL_Reg text_chunk_methods[] = { 308 | { "get_text", text_chunk_get_text }, 309 | { "is_last_in_text_node", text_chunk_is_last_in_text_node }, 310 | { "before", text_chunk_before }, 311 | { "after", text_chunk_after }, 312 | { "replace", text_chunk_replace }, 313 | { "remove", text_chunk_remove }, 314 | { "is_removed", text_chunk_is_removed }, 315 | { NULL, NULL } 316 | }; 317 | 318 | 319 | /* doc_end */ 320 | static int doc_end_append(lua_State *L) { 321 | size_t content_len; 322 | lol_html_doc_end_t **doc_end = check_valid_udata(L, 1, PREFIX "doc_end"); 323 | const char *content = luaL_checklstring(L, 2, &content_len); 324 | bool is_html = lua_toboolean(L, 3); 325 | return return_self_or_err(L, lol_html_doc_end_append(*doc_end, content, content_len, is_html)); 326 | } 327 | 328 | static luaL_Reg doc_end_methods[] = { 329 | { "append", doc_end_append }, 330 | { NULL, NULL } 331 | }; 332 | 333 | 
/* element */ 334 | static int element_get_tag_name(lua_State *L) { 335 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 336 | lol_html_str_t tag_name = lol_html_element_tag_name_get(*el); 337 | lua_pushlstring(L, tag_name.data, tag_name.len); 338 | lol_html_str_free(tag_name); 339 | return 1; 340 | } 341 | 342 | static int element_get_namespace_uri(lua_State *L) { 343 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 344 | lua_pushstring(L, lol_html_element_namespace_uri_get(*el)); // TODO: can it return nil? 345 | return 1; 346 | } 347 | 348 | static int element_get_attribute(lua_State *L) { 349 | size_t len; 350 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 351 | const char *attr = luaL_checklstring(L, 2, &len); 352 | push_lol_str_maybe(L, lol_html_element_get_attribute(*el, attr, len)); 353 | return 1; 354 | } 355 | 356 | static int element_has_attribute(lua_State *L) { 357 | size_t len; 358 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 359 | const char *attr = luaL_checklstring(L, 2, &len); 360 | int rc = lol_html_element_has_attribute(*el, attr, len); 361 | if (rc < 0) { 362 | return push_last_error(L); 363 | } 364 | 365 | lua_pushboolean(L, rc); 366 | return 1; 367 | } 368 | 369 | static int element_set_attribute(lua_State *L) { 370 | size_t attr_len, value_len; 371 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 372 | const char *attr = luaL_checklstring(L, 2, &attr_len); 373 | const char *value = luaL_checklstring(L, 3, &value_len); 374 | return return_self_or_err(L, lol_html_element_set_attribute( 375 | *el, attr, attr_len, value, value_len)); 376 | } 377 | 378 | static int element_remove_attribute(lua_State *L) { 379 | size_t len; 380 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 381 | const char *attr = luaL_checklstring(L, 2, &len); 382 | return return_self_or_err(L, lol_html_element_remove_attribute(*el, attr, 
len)); 383 | } 384 | 385 | static int attribute_iterator_next(lua_State *L) { 386 | lol_html_str_t s; 387 | lol_html_attributes_iterator_t **it = check_valid_udata(L, 1, PREFIX "attribute_iterator"); 388 | const lol_html_attribute_t *attr = lol_html_attributes_iterator_next(*it); 389 | 390 | if (attr == NULL) { 391 | /* end of the attributes: eagerly free the iterator */ 392 | lol_html_attributes_iterator_free(*it); 393 | *it = NULL; 394 | lua_pushnil(L); 395 | return 1; 396 | } 397 | 398 | s = lol_html_attribute_name_get(attr); 399 | lua_pushlstring(L, s.data, s.len); 400 | lol_html_str_free(s); 401 | 402 | s = lol_html_attribute_value_get(attr); 403 | lua_pushlstring(L, s.data, s.len); 404 | lol_html_str_free(s); 405 | 406 | return 2; 407 | } 408 | 409 | static int attribute_iterator_destroy(lua_State *L) { 410 | lol_html_attributes_iterator_t **it = luaL_checkudata(L, 1, PREFIX "attribute_iterator"); 411 | if (*it != NULL) { 412 | lol_html_attributes_iterator_free(*it); 413 | *it = NULL; 414 | } 415 | return 0; 416 | } 417 | 418 | static int element_attributes(lua_State *L) { 419 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 420 | 421 | lua_pushcfunction(L, attribute_iterator_next); 422 | 423 | /* We have to use a full userdata as we need to reliably GC the iterator if 424 | * the program breaks early from the loop. 
*/ 425 | lol_html_attributes_iterator_t **it = lua_newuserdata(L, sizeof(void*)); 426 | luaL_getmetatable(L, PREFIX "attribute_iterator"); 427 | lua_setmetatable(L, -2); 428 | *it = lol_html_attributes_iterator_get(*el); 429 | 430 | lua_pushnil(L); 431 | return 3; 432 | } 433 | 434 | static int element_before(lua_State *L) { 435 | size_t len; 436 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 437 | const char *text = luaL_checklstring(L, 2, &len); 438 | bool is_html = lua_toboolean(L, 3); 439 | return return_self_or_err(L, lol_html_element_before(*el, text, len, is_html)); 440 | } 441 | 442 | static int element_after(lua_State *L) { 443 | size_t len; 444 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 445 | const char *text = luaL_checklstring(L, 2, &len); 446 | bool is_html = lua_toboolean(L, 3); 447 | return return_self_or_err(L, lol_html_element_after(*el, text, len, is_html)); 448 | } 449 | 450 | static int element_prepend(lua_State *L) { 451 | size_t len; 452 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 453 | const char *text = luaL_checklstring(L, 2, &len); 454 | bool is_html = lua_toboolean(L, 3); 455 | return return_self_or_err(L, lol_html_element_prepend(*el, text, len, is_html)); 456 | } 457 | 458 | static int element_append(lua_State *L) { 459 | size_t len; 460 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 461 | const char *text = luaL_checklstring(L, 2, &len); 462 | bool is_html = lua_toboolean(L, 3); 463 | return return_self_or_err(L, lol_html_element_append(*el, text, len, is_html)); 464 | } 465 | 466 | static int element_set_inner_content(lua_State *L) { 467 | size_t len; 468 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 469 | const char *text = luaL_checklstring(L, 2, &len); 470 | bool is_html = lua_toboolean(L, 3); 471 | return return_self_or_err(L, lol_html_element_set_inner_content(*el, text, len, is_html)); 472 | } 473 | 474 
| static int element_replace(lua_State *L) { 475 | size_t len; 476 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 477 | const char *text = luaL_checklstring(L, 2, &len); 478 | bool is_html = lua_toboolean(L, 3); 479 | return return_self_or_err(L, lol_html_element_replace(*el, text, len, is_html)); 480 | } 481 | 482 | static int element_is_removed(lua_State *L) { 483 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 484 | lua_pushboolean(L, lol_html_element_is_removed(*el)); 485 | return 1; 486 | } 487 | 488 | static int element_remove(lua_State *L) { 489 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 490 | lol_html_element_remove(*el); 491 | return return_self_or_err(L, 0); /* cannot fail */ 492 | } 493 | 494 | static int element_remove_and_keep_content(lua_State *L) { 495 | lol_html_element_t **el = check_valid_udata(L, 1, PREFIX "element"); 496 | lol_html_element_remove_and_keep_content(*el); 497 | return return_self_or_err(L, 0); /* cannot fail */ 498 | } 499 | 500 | static luaL_Reg element_methods[] = { 501 | { "get_tag_name", element_get_tag_name }, 502 | { "get_namespace_uri", element_get_namespace_uri }, 503 | { "get_attribute", element_get_attribute }, 504 | { "has_attribute", element_has_attribute }, 505 | { "set_attribute", element_set_attribute }, 506 | { "remove_attribute", element_remove_attribute }, 507 | { "attributes", element_attributes }, 508 | { "before", element_before }, 509 | { "after", element_after }, 510 | { "prepend", element_prepend }, 511 | { "append", element_append }, 512 | { "set_inner_content", element_set_inner_content }, 513 | { "replace", element_replace }, 514 | { "is_removed", element_is_removed }, 515 | { "remove", element_remove }, 516 | { "remove_and_keep_content", element_remove_and_keep_content }, 517 | { NULL, NULL } 518 | }; 519 | 520 | 521 | /* rewriter builder */ 522 | /* note: as there is a dynamic number of callbacks, the userdata for the builder 523 
| * is just a boxed pointer with a table as uservalue. 524 | * Each callback will also have a userdata associated with it, and the references 525 | * will be anchored with the table uservalue mentioned above. 526 | */ 527 | 528 | /*** 529 | * Create a new builder. 530 | * @return the created builder 531 | */ 532 | static int rewriter_builder_new(lua_State *L) { 533 | int builder_ref; 534 | lol_html_rewriter_builder_t **ud = lua_newuserdata(L, sizeof(lol_html_rewriter_builder_t *)); 535 | *ud = lol_html_rewriter_builder_new(); 536 | 537 | luaL_getmetatable(L, PREFIX "builder"); 538 | lua_setmetatable(L, -2); 539 | 540 | /* register the new builder in the builder registry */ 541 | lua_getfield(L, LUA_REGISTRYINDEX, LOL_REGISTRY); /* ud, reg */ 542 | lua_pushvalue(L, -2); /* ud, reg, ud */ 543 | builder_ref = luaL_ref(L, -2); /* ud, reg */ 544 | lua_pop(L, 1); /* ud */ 545 | 546 | /* allocate a table as a uservalue for the builder: this uservalue will be 547 | * used to keep references to the callback functions and the handler_data_t 548 | * structures used as userdata for the lol_html API */ 549 | lua_newtable(L); /* ud, uv */ 550 | lua_pushinteger(L, builder_ref); /* ud, uv, ref */ 551 | lua_setfield(L, -2, "ref"); /* ud, uv */ 552 | lua_setuservalue(L, -2); /* ud */ 553 | 554 | return 1; 555 | } 556 | 557 | static int rewriter_builder_destroy(lua_State *L) { 558 | lol_html_rewriter_builder_t **ud = luaL_checkudata(L, 1, PREFIX "builder"); 559 | lol_html_rewriter_builder_free(*ud); 560 | return 0; 561 | } 562 | 563 | static handler_data_t* create_handler(lua_State *L, int builder_idx, int cb_table_idx, const char *field) { 564 | if (lua_getfield(L, cb_table_idx, field) == LUA_TFUNCTION) { 565 | handler_data_t *handler = lua_newuserdata(L, sizeof(handler_data_t)); /* func, hander_data */ 566 | handler->L = L; 567 | 568 | lua_getuservalue(L, builder_idx); /* func, hander_data, uv */ 569 | lua_getfield(L, -1, "ref"); /* func, hander_data, uv, ref */ 570 | 
handler->builder_index = lua_tointeger(L, -1); 571 | lua_pop(L, 1); /* func, hander_data, uv */ 572 | 573 | /* keep a reference to the callback function */ 574 | lua_pushvalue(L, -3); /* func, hander_data, uv, func */ 575 | handler->callback_index = luaL_ref(L, -2); /* func, hander_data, uv */ 576 | 577 | /* keep a reference to the handler data (kept until the builder is GC'd */ 578 | lua_pushvalue(L, -2); /* func, hander_data, uv, hander_data */ 579 | luaL_ref(L, -2); /* func, hander_data, uv */ 580 | 581 | lua_pop(L, 3); /* func */ 582 | return handler; 583 | } else { 584 | // TODO: throw error if the handler is not a function 585 | // TODO: what about __call? allow everything and hope for the best? 586 | lua_pop(L, 1); 587 | return NULL; 588 | } 589 | } 590 | 591 | static int rewriter_builder_add_document_content_handlers(lua_State *L) { 592 | void *doctype_ud, *comment_ud, *text_ud, *doc_end_ud; 593 | 594 | lol_html_rewriter_builder_t **builder = luaL_checkudata(L, 1, PREFIX "builder"); 595 | luaL_checktype(L, 2, LUA_TTABLE); 596 | doctype_ud = create_handler(L, 1, 2, "doctype_handler"); 597 | comment_ud = create_handler(L, 1, 2, "comment_handler"); 598 | text_ud = create_handler(L, 1, 2, "text_handler"); 599 | doc_end_ud = create_handler(L, 1, 2, "doc_end_handler"); 600 | 601 | lol_html_rewriter_builder_add_document_content_handlers( 602 | *builder, 603 | (doctype_ud == NULL) ? NULL : doctype_handler, doctype_ud, 604 | (comment_ud == NULL) ? NULL : comment_handler, comment_ud, 605 | (text_ud == NULL) ? NULL : text_chunk_handler, text_ud, 606 | (doc_end_ud == NULL) ? 
NULL : doc_end_handler, doc_end_ud); 607 | 608 | /* return self */ 609 | lua_settop(L, 1); 610 | return 1; 611 | } 612 | 613 | static int rewriter_builder_add_element_content_handlers(lua_State *L) { 614 | void *comment_ud, *text_ud, *element_ud; 615 | const lol_html_selector_t **selector; 616 | int rc; 617 | 618 | lol_html_rewriter_builder_t **builder = luaL_checkudata(L, 1, PREFIX "builder"); 619 | luaL_checktype(L, 2, LUA_TTABLE); 620 | 621 | /* get selector, and anchor it to the builder */ 622 | lua_getuservalue(L, 1); 623 | lua_getfield(L, 2, "selector"); 624 | selector = luaL_checkudata(L, -1, PREFIX "selector"); 625 | luaL_ref(L, -2); 626 | lua_pop(L, 1); 627 | 628 | comment_ud = create_handler(L, 1, 2, "comment_handler"); 629 | text_ud = create_handler(L, 1, 2, "text_handler"); 630 | element_ud = create_handler(L, 1, 2, "element_handler"); 631 | 632 | rc = lol_html_rewriter_builder_add_element_content_handlers( 633 | *builder, *selector, 634 | (element_ud == NULL) ? NULL : element_handler, element_ud, 635 | (comment_ud == NULL) ? NULL : comment_handler, comment_ud, 636 | (text_ud == NULL) ? 
NULL : text_chunk_handler, text_ud); 637 | 638 | return return_self_or_err(L, rc); 639 | } 640 | 641 | static luaL_Reg rewriter_builder_methods[] = { 642 | { "add_document_content_handlers", rewriter_builder_add_document_content_handlers }, 643 | { "add_element_content_handlers", rewriter_builder_add_element_content_handlers }, 644 | { NULL, NULL } 645 | }; 646 | 647 | 648 | /* Rewriter */ 649 | typedef struct { 650 | lol_html_rewriter_t *rewriter; 651 | lua_State *L; 652 | int reg_idx; 653 | bool broken; /* used to signal sink errors */ 654 | } lua_rewriter_t; 655 | 656 | static void sink_callback(const char *chunk, size_t chunk_len, void *user_data) { 657 | int rc; 658 | lua_rewriter_t *rewriter = user_data; 659 | if (rewriter->broken) { 660 | return; 661 | } 662 | 663 | lua_checkstack(rewriter->L, 4); 664 | lua_getfield(rewriter->L, LUA_REGISTRYINDEX, LOL_REGISTRY); /* reg */ 665 | lua_rawgeti(rewriter->L, -1, rewriter->reg_idx); /* reg, rewriter */ 666 | lua_getuservalue(rewriter->L, -1); /* reg, rewriter, uv */ 667 | lua_rawgeti(rewriter->L, -1, REWRITER_CALLBACK_INDEX); /* reg, rewriter, uv, cb */ 668 | lua_pushlstring(rewriter->L, chunk, chunk_len); /* reg, rewriter, uv, cb, chunk */ 669 | rc = lua_pcall(rewriter->L, 1, 0, 0); /* reg, rewriter, uv, err? */ 670 | 671 | if (rc != LUA_OK) { /* reg, rewriter, uv, err */ 672 | /* at this point, the lol-html API does not allow to abort the 673 | * processing straight away, so we have to let it continue until the 674 | * end. However the Lua handler will not be called again. 
*/ 675 | lua_rawseti(rewriter->L, -2, REWRITER_ERROR_INDEX); /* reg, rewriter, uv */ 676 | rewriter->broken = 1; 677 | } 678 | 679 | lua_pop(rewriter->L, 3); 680 | } 681 | 682 | static int rewriter_new(lua_State *L) { 683 | size_t encoding_len; 684 | const char *encoding; 685 | lol_html_memory_settings_t memory_settings; 686 | lua_rewriter_t *rewriter; 687 | bool strict; 688 | 689 | luaL_checktype(L, 1, LUA_TTABLE); 690 | 691 | /* the error messages for the luaL_opt* functions are not great in this case */ 692 | lua_getfield(L, 1, "builder"); 693 | lol_html_rewriter_builder_t **builder = luaL_checkudata(L, -1, PREFIX "builder"); 694 | /* keep the builder on the stack */ 695 | 696 | lua_getfield(L, 1, "encoding"); 697 | encoding = luaL_optlstring(L, -1, "utf-8", &encoding_len); 698 | lua_pop(L, 1); 699 | 700 | lua_getfield(L, 1, "preallocated_parsing_buffer_size"); 701 | memory_settings.preallocated_parsing_buffer_size = luaL_optinteger(L, -1, 1024); 702 | lua_pop(L, 1); 703 | 704 | lua_getfield(L, 1, "max_allowed_memory_usage"); 705 | memory_settings.max_allowed_memory_usage = luaL_optinteger(L, -1, SIZE_MAX); 706 | lua_pop(L, 1); 707 | 708 | lua_getfield(L, 1, "strict"); 709 | strict = lua_toboolean(L, -1); 710 | lua_pop(L, 1); 711 | 712 | // TODO: support a "blackhole" sink by default that avoids all the callback 713 | // machinery 714 | if (lua_getfield(L, 1, "sink") != LUA_TFUNCTION) { 715 | /* not a function, check if it's a callable */ 716 | if (luaL_getmetafield(L, -1, "__call") == LUA_TNIL) { 717 | luaL_argerror(L, 1, "field \"sink\" cannot be called"); 718 | } 719 | lua_pop(L, 1); 720 | } 721 | 722 | rewriter = lua_newuserdata(L, sizeof(lua_rewriter_t)); /* builder, cb, ud */ 723 | rewriter->L = L; 724 | rewriter->broken = 0; 725 | rewriter->rewriter = lol_html_rewriter_build( 726 | *builder, 727 | encoding, encoding_len, 728 | memory_settings, 729 | sink_callback, rewriter, 730 | strict 731 | ); 732 | 733 | if (rewriter->rewriter == NULL) { 734 | return 
push_last_error(L); 735 | } 736 | 737 | // keep a reference of the rewriter in the weak registry to retrieve the 738 | // reference later on 739 | lua_getfield(L, LUA_REGISTRYINDEX, LOL_REGISTRY); /* builder, cb, ud, reg */ 740 | lua_pushvalue(L, -2); /* builder, cb, ud, reg, ud */ 741 | rewriter->reg_idx = luaL_ref(L, -2); /* builder, cb, ud, reg */ 742 | lua_pop(L, 1); /* builder, cb, ud */ 743 | 744 | /* attach the buidler and handler functions to the userdata */ 745 | lua_createtable(L, 2, 0); /* builder, cb, ud, uv */ 746 | lua_pushvalue(L, -3); /* builder, cb, ud, uv, cb */ 747 | lua_rawseti(L, -2, REWRITER_CALLBACK_INDEX); /* builder, cb, ud, uv */ 748 | lua_pushvalue(L, -4); /* builder, cb, ud, uv, builder */ 749 | lua_rawseti(L, -2, REWRITER_BUILDER_INDEX); /* builder, cb, ud, uv */ 750 | lua_setuservalue(L, -2); /* builder, cb, ud */ 751 | 752 | luaL_getmetatable(L, PREFIX "rewriter"); /* builder, cb, ud, mt */ 753 | lua_setmetatable(L, -2); /* builder, cb, ud */ 754 | 755 | return 1; 756 | } 757 | 758 | static int return_self_or_stack_error(lua_State *L, int rc, int prev_top, lua_rewriter_t *rewriter) { 759 | if (rc == 0) { 760 | assert(lua_gettop(L) == prev_top); 761 | 762 | if (!rewriter->broken) { 763 | /* all good */ 764 | lua_settop(L, 1); 765 | return 1; 766 | } 767 | 768 | /* rc == 0 but rewriter->broken: the sink threw an error. 
769 | * Fetch the error and leave it on top of the stack */ 770 | lua_getuservalue(L, 1); 771 | lua_rawgeti(L, -1, REWRITER_ERROR_INDEX); 772 | assert(!lua_isnil(L, -1)); 773 | } 774 | 775 | /* the rewriter is broken: free it now and leave a NULL pointer to signal 776 | * that */ 777 | lol_html_rewriter_free(rewriter->rewriter); 778 | rewriter->rewriter = NULL; 779 | 780 | /* error case: if the Lua stack moved, that was a Lua runtime error, and 781 | * the error value is at the top of the stack already, otherwise it is a 782 | * lolhtml error */ 783 | if (lua_gettop(L) == prev_top) { 784 | /* lolhtml error */ 785 | return push_last_error(L); 786 | } 787 | 788 | /* Lua runtime error */ 789 | lua_pushnil(L); 790 | lua_pushvalue(L, -2); 791 | return 2; 792 | } 793 | 794 | static int rewriter_write(lua_State *L) { 795 | const char *chunk; 796 | size_t chunk_len; 797 | int top, rc; 798 | 799 | lua_rewriter_t *rewriter = luaL_checkudata(L, 1, PREFIX "rewriter"); 800 | if (rewriter->rewriter == NULL) { 801 | lua_pushnil(L); 802 | lua_pushliteral(L, "broken rewriter"); 803 | return 2; 804 | } 805 | 806 | chunk = luaL_checklstring(L, 2, &chunk_len); 807 | top = lua_gettop(L); 808 | rc = lol_html_rewriter_write(rewriter->rewriter, chunk, chunk_len); 809 | return return_self_or_stack_error(L, rc, top, rewriter); 810 | } 811 | 812 | static int rewriter_end(lua_State *L) { 813 | int top, rc; 814 | 815 | lua_rewriter_t *rewriter = luaL_checkudata(L, 1, PREFIX "rewriter"); 816 | if (rewriter->rewriter == NULL) { 817 | lua_pushnil(L); 818 | lua_pushliteral(L, "broken rewriter"); 819 | return 2; 820 | } 821 | top = lua_gettop(L); 822 | rc = lol_html_rewriter_end(rewriter->rewriter); 823 | 824 | /* destroy it anyway, otherwise calling the rewriter again will abort */ 825 | if (rc == 0) { 826 | lol_html_rewriter_free(rewriter->rewriter); 827 | rewriter->rewriter = NULL; 828 | } 829 | 830 | return return_self_or_stack_error(L, rc, top, rewriter); 831 | } 832 | 833 | static int 
rewriter_destroy(lua_State *L) { 834 | lua_rewriter_t *rewriter = luaL_checkudata(L, 1, PREFIX "rewriter"); 835 | if (rewriter->rewriter != NULL) { 836 | lol_html_rewriter_free(rewriter->rewriter); 837 | rewriter->rewriter = NULL; 838 | } 839 | return 0; 840 | } 841 | 842 | static luaL_Reg rewriter_methods[] = { 843 | { "write", rewriter_write }, 844 | { "close", rewriter_end }, // end is a keyword in Lua 845 | { NULL, NULL } 846 | }; 847 | 848 | /* selectors */ 849 | /** Selectors don't have any methods, they are only exposed for the sake of 850 | * efficiency, as it might avoid parsing many times the same selector for 851 | * different builders. 852 | */ 853 | static int selector_new(lua_State *L) { 854 | size_t len; 855 | const char *src = luaL_checklstring(L, 1, &len); 856 | lol_html_selector_t *selector = lol_html_selector_parse(src, len); 857 | 858 | if (selector == NULL) { 859 | return push_last_error(L); 860 | } 861 | 862 | lol_html_selector_t **lua_selector = lua_newuserdata(L, sizeof(lol_html_selector_t *)); 863 | *lua_selector = selector; 864 | luaL_getmetatable(L, PREFIX "selector"); 865 | lua_setmetatable(L, -2); 866 | 867 | return 1; 868 | } 869 | 870 | static int selector_destroy(lua_State *L) { 871 | lol_html_selector_t **lua_selector = luaL_checkudata(L, 1, PREFIX "selector"); 872 | lol_html_selector_free(*lua_selector); 873 | return 0; 874 | } 875 | 876 | /* top level module */ 877 | static luaL_Reg module_functions[] = { 878 | { "new_rewriter_builder", rewriter_builder_new }, 879 | { "new_rewriter", rewriter_new }, 880 | { "new_selector", selector_new }, 881 | { NULL, NULL } 882 | }; 883 | 884 | int luaopen_lolhtml(lua_State *L) { 885 | /* create the document builders table */ 886 | if (lua_getfield(L, LUA_REGISTRYINDEX, LOL_REGISTRY) != LUA_TNIL) { 887 | luaL_error(L, "the library is already loaded"); 888 | } 889 | lua_pop(L, 1); 890 | lua_newtable(L); /* reg */ 891 | lua_newtable(L); /* reg, mt */ 892 | lua_pushliteral(L, "v"); /* reg, mt, "v" 
*/ 893 | lua_setfield(L, -2, "__mode"); /* reg, mt */ 894 | lua_setmetatable(L, -2); /* reg */ 895 | lua_setfield(L, LUA_REGISTRYINDEX, LOL_REGISTRY); 896 | 897 | /* register types */ 898 | luaL_newmetatable(L, PREFIX "builder"); 899 | lua_newtable(L); 900 | luaL_setfuncs(L, rewriter_builder_methods, 0); 901 | lua_setfield(L, -2, "__index"); 902 | lua_pushcfunction(L, rewriter_builder_destroy); 903 | lua_setfield(L, -2, "__gc"); 904 | lua_pop(L, 1); 905 | 906 | luaL_newmetatable(L, PREFIX "rewriter"); 907 | lua_newtable(L); 908 | luaL_setfuncs(L, rewriter_methods, 0); 909 | lua_setfield(L, -2, "__index"); 910 | lua_pushcfunction(L, rewriter_destroy); 911 | lua_setfield(L, -2, "__gc"); 912 | lua_pop(L, 1); 913 | 914 | luaL_newmetatable(L, PREFIX "selector"); 915 | lua_pushcfunction(L, selector_destroy); 916 | lua_setfield(L, -2, "__gc"); 917 | lua_pop(L, 1); 918 | 919 | luaL_newmetatable(L, PREFIX "doctype"); 920 | lua_newtable(L); 921 | luaL_setfuncs(L, doctype_methods, 0); 922 | lua_setfield(L, -2, "__index"); 923 | lua_pop(L, 1); 924 | 925 | luaL_newmetatable(L, PREFIX "comment"); 926 | lua_newtable(L); 927 | luaL_setfuncs(L, comment_methods, 0); 928 | lua_setfield(L, -2, "__index"); 929 | lua_pop(L, 1); 930 | 931 | luaL_newmetatable(L, PREFIX "text_chunk"); 932 | lua_newtable(L); 933 | luaL_setfuncs(L, text_chunk_methods, 0); 934 | lua_setfield(L, -2, "__index"); 935 | lua_pop(L, 1); 936 | 937 | luaL_newmetatable(L, PREFIX "doc_end"); 938 | lua_newtable(L); 939 | luaL_setfuncs(L, doc_end_methods, 0); 940 | lua_setfield(L, -2, "__index"); 941 | lua_pop(L, 1); 942 | 943 | luaL_newmetatable(L, PREFIX "element"); 944 | lua_newtable(L); 945 | luaL_setfuncs(L, element_methods, 0); 946 | lua_setfield(L, -2, "__index"); 947 | lua_pop(L, 1); 948 | 949 | luaL_newmetatable(L, PREFIX "attribute_iterator"); 950 | lua_pushcfunction(L, attribute_iterator_destroy); 951 | lua_setfield(L, -2, "__gc"); 952 | lua_pop(L, 1); 953 | 954 | /* module functions */ 955 | lua_newtable(L); 
956 | luaL_setfuncs(L, module_functions, 0); 957 | lua_pushinteger(L, LOL_HTML_CONTINUE); 958 | lua_setfield(L, -2, "CONTINUE"); 959 | lua_pushinteger(L, LOL_HTML_STOP); 960 | lua_setfield(L, -2, "STOP"); 961 | return 1; 962 | } 963 | -------------------------------------------------------------------------------- /rockspecs/lolhtml-dev-2.rockspec: -------------------------------------------------------------------------------- 1 | rockspec_format = "3.0" 2 | package = "lolhtml" 3 | version = "dev-2" 4 | source = { 5 | url = "git+https://github.com/jdesgats/lua-lolhtml.git" 6 | } 7 | description = { 8 | summary = "HTML parser/rewriter with CSS selector-based API", 9 | detailed = [[ 10 | This library is a Lua binding for lol-html, a Low output latency 11 | streaming HTML parser/rewriter with CSS selector-based API.]], 12 | homepage = "https://github.com/jdesgats/lua-lolhtml", 13 | license = "BSD3" 14 | } 15 | dependencies = { 16 | "lua <= 5.3" 17 | } 18 | build = { 19 | type = "make", 20 | build_variables = { 21 | CFLAGS = '$(CFLAGS) -I"$(LUA_INCDIR)"'; 22 | }, 23 | install_pass = false, 24 | install = { 25 | lib = { lolhtml="lolhtml.so" }, 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /spec/lolhtml.lua: -------------------------------------------------------------------------------- 1 | local lolhtml = require "lolhtml" 2 | 3 | -- Note: these tests are not meant to test lolhtml itself (it already has tests 4 | -- on its own), but the Lua binding and its behaviour 5 | 6 | local basic_page = [[ 7 | 8 | 9 | 10 | Hello, Lua-lolhtml 11 | 12 | 13 | 14 |

Hello, Lua-lolhtml

15 | 16 | 17 | ]] 18 | 19 | -- basic string buffer for the sink 20 | local sink_buffer do 21 | local mt = { 22 | __index = { value = table.concat }, 23 | __call = table.insert 24 | } 25 | sink_buffer = function() 26 | return setmetatable({}, mt) 27 | end 28 | end 29 | 30 | describe("lolhtml rewriter", function() 31 | after(function() 32 | collectgarbage("collect") 33 | end) 34 | 35 | test("basic pipeline", function() 36 | local result = {} 37 | local function sink(t) table.insert(result, t) end 38 | 39 | local rewriter = lolhtml.new_rewriter { 40 | builder = lolhtml.new_rewriter_builder(), 41 | sink = sink, 42 | } 43 | assert(rewriter:write(basic_page)) 44 | assert(rewriter:close()) 45 | assert_equal(table.concat(result), basic_page) 46 | end) 47 | 48 | test("callable sink", function() 49 | local buf = sink_buffer() 50 | local rewriter = lolhtml.new_rewriter { 51 | builder = lolhtml.new_rewriter_builder(), 52 | sink = buf, 53 | } 54 | assert(rewriter:write(basic_page)) 55 | assert(rewriter:close()) 56 | assert_equal(buf:value(), basic_page) 57 | end) 58 | 59 | describe("document content handlers", function() 60 | test("doctype handler", function() 61 | local data, buf = nil, sink_buffer() 62 | local kept_ref 63 | local builder = lolhtml.new_rewriter_builder() 64 | :add_document_content_handlers{ 65 | doctype_handler = function(doctype) 66 | data = { doctype:get_name(), doctype:get_id(), doctype:get_system_id() } 67 | kept_ref = doctype 68 | end 69 | } 70 | collectgarbage("collect") -- loose the ref to the handler function 71 | 72 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 73 | assert(rewriter:write(basic_page)) 74 | assert(rewriter:close()) 75 | assert_not_nil(data, "callback not called") 76 | assert_equal(data[1], "html") 77 | assert_equal(data[2], nil) 78 | assert_equal(data[3], nil) 79 | assert_equal(buf:value(), basic_page) 80 | 81 | -- now try to use the doctype object outside of the callback 82 | assert_not_nil(kept_ref) 83 | 
assert_error(function() kept_ref:get_name() end) 84 | 85 | local full_doctype = [[ 86 | 88 | ]] 89 | data, buf = nil, sink_buffer() 90 | rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 91 | assert(rewriter:write(full_doctype)) 92 | assert_not_nil(data, "callback not called") 93 | assert_equal(data[1], "html") 94 | assert_equal(data[2], "-//W3C//DTD HTML 4.01//EN") 95 | assert_equal(data[3], "http://www.w3.org/TR/html4/strict.dtd") 96 | assert_equal(buf:value(), full_doctype) 97 | end) 98 | 99 | describe("comment_handler", function() 100 | local function run_parser(input, cb) 101 | local buf = sink_buffer() 102 | local builder = lolhtml.new_rewriter_builder() 103 | :add_document_content_handlers{ 104 | comment_handler = cb, 105 | } 106 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 107 | assert(rewriter:write(input)) 108 | assert(rewriter:close()) 109 | collectgarbage("collect") -- loose the ref to the handler and builder 110 | return buf:value() 111 | end 112 | 113 | test("get_text", function() 114 | local data 115 | local out = run_parser(basic_page, function(comment) 116 | data = comment:get_text() 117 | end) 118 | assert_equal(out, basic_page) 119 | assert_equal(data, "hello, comments") 120 | end) 121 | 122 | test("set_text", function() 123 | local out = run_parser(" hello ", function(comment) 124 | comment:set_text("replaced") 125 | end) 126 | assert_equal(out, " hello ") 127 | end) 128 | 129 | test("before/after", function() 130 | local out = run_parser("hello, ", function(comment) 131 | comment:before("") 132 | comment:after("!", true) 133 | end) 134 | assert_equal(out, "hello, <World>!") 135 | end) 136 | 137 | test("replace", function() 138 | local out = run_parser("hello, ", function(comment) 139 | comment:replace("World!") 140 | end) 141 | assert_equal(out, "hello, World!") 142 | end) 143 | 144 | test("remove/is_removed", function() 145 | local before_removing, after_removing 146 | local out = run_parser("hello, ", 
function(comment) 147 | before_removing = comment:is_removed() 148 | comment:remove() 149 | after_removing = comment:is_removed() 150 | end) 151 | assert_equal(out, "hello, ") 152 | assert_false(before_removing) 153 | assert_true(after_removing) 154 | end) 155 | 156 | test("usage after lifetime", function() 157 | local c 158 | run_parser("hello, ", function(comment) c=comment end) 159 | 160 | assert_error(function() c:get_text() end) 161 | assert_error(function() c:set_text("foo") end) 162 | assert_error(function() c:before("foo") end) 163 | assert_error(function() c:after("foo") end) 164 | assert_error(function() c:replace("foo") end) 165 | assert_error(function() c:remove() end) 166 | assert_error(function() c:is_removed() end) 167 | end) 168 | end) 169 | 170 | describe("text chunk handler", function() 171 | local function run_parser(input, cb) 172 | local buf = sink_buffer() 173 | local builder = lolhtml.new_rewriter_builder() 174 | :add_document_content_handlers{ 175 | text_handler = cb, 176 | } 177 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 178 | assert(rewriter:write(input)) 179 | assert(rewriter:close()) 180 | collectgarbage("collect") -- loose the ref to the handler and builder 181 | return buf:value() 182 | end 183 | 184 | test("get_text/is_last_in_text_node", function() 185 | local calls = {} 186 | local out = run_parser("foobarbaz", function(text) 187 | local txt = text:get_text() 188 | if txt == "" then 189 | assert_true(text:is_last_in_text_node()) 190 | else 191 | assert_false(text:is_last_in_text_node()) 192 | table.insert(calls, txt) 193 | end 194 | end) 195 | assert_equal(out, "foobarbaz") 196 | assert_equal(#calls, 3) 197 | assert_equal(calls[1], "foo") 198 | assert_equal(calls[2], "bar") 199 | assert_equal(calls[3], "baz") 200 | end) 201 | 202 | test("before/after", function() 203 | local out = run_parser("World", function(chunk) 204 | if chunk:get_text() == "" then 205 | chunk:after("!", true) 206 | else 207 | 
chunk:before(", ") 208 | end 209 | end) 210 | assert_equal(out, "<Hello>, World!") 211 | end) 212 | 213 | test("replace", function() 214 | local out = run_parser("Hello, World!", function(chunk) 215 | if chunk:get_text() == "World" then 216 | chunk:replace("lolhtml") 217 | end 218 | end) 219 | assert_equal(out, "Hello, lolhtml!") 220 | end) 221 | 222 | test("remove/is_removed", function() 223 | local out = run_parser("Hello, World!", function(chunk) 224 | assert_false(chunk:is_removed()) 225 | if chunk:get_text() == "World" then 226 | chunk:remove() 227 | assert_true(chunk:is_removed()) 228 | end 229 | end) 230 | assert_equal(out, "Hello, !") 231 | end) 232 | 233 | test("usage after lifetime", function() 234 | local c 235 | run_parser("hello, World!", function(chunk) c=chunk end) 236 | 237 | assert_error(function() c:get_text() end) 238 | assert_error(function() c:is_last_in_text_node() end) 239 | assert_error(function() c:before("foo") end) 240 | assert_error(function() c:after("foo") end) 241 | assert_error(function() c:replace("foo") end) 242 | assert_error(function() c:remove() end) 243 | assert_error(function() c:is_removed() end) 244 | end) 245 | end) 246 | 247 | test("docuemnt end", function() 248 | local buf = sink_buffer() 249 | local ref 250 | local builder = lolhtml.new_rewriter_builder() 251 | :add_document_content_handlers{ 252 | doc_end_handler = function(doc_end) 253 | doc_end:append("bye...") 254 | ref = doc_end 255 | end, 256 | } 257 | collectgarbage("collect") -- lose the ref to the handler function 258 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 259 | assert(rewriter:write(basic_page):close()) 260 | assert_equal(buf:value(), basic_page .. 
"bye...") 261 | assert_error(function() ref:append("foo") end) 262 | end) 263 | 264 | test("multiple handlers", function() 265 | local buf = sink_buffer() 266 | local calls = {} 267 | local builder = lolhtml.new_rewriter_builder() 268 | :add_document_content_handlers { 269 | doc_end_handler = function() table.insert(calls, "doc_end 1") end 270 | }:add_document_content_handlers { 271 | doctype_handler = function() table.insert(calls, "doctype") end, 272 | doc_end_handler = function() table.insert(calls, "doc_end 2") end 273 | } 274 | collectgarbage("collect") -- lose the ref to the handler function 275 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 276 | assert(rewriter:write(basic_page):close()) 277 | assert_equal(buf:value(), basic_page) 278 | assert_same(calls, { "doctype", "doc_end 2", "doc_end 1" }) 279 | end) 280 | 281 | describe("handler throws errors", function() 282 | for _, callback in ipairs { "doctype_handler", "comment_handler", "text_handler" } do 283 | test(callback, function() 284 | local buf = sink_buffer() 285 | local error_object = {} -- do not throw a string here, otherwise the Lua runtime will decorate it 286 | local builder = lolhtml.new_rewriter_builder() 287 | :add_document_content_handlers{ 288 | [callback] = function() error(error_object) end 289 | } 290 | collectgarbage("collect") -- lose the ref to the handler function 291 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 292 | local ok, err = rewriter:write(basic_page) 293 | assert_nil(ok) 294 | assert_equal(err, error_object) 295 | -- the result should be a prefix of the page we fed 296 | assert_equal(basic_page:find(buf:value(), 1, true), 1) 297 | 298 | -- now try to interact with the rewriter again, it should fail by returning nil 299 | -- (and not crash, preferably) 300 | assert_nil(rewriter:write("foo")) 301 | assert_nil(rewriter:close()) 302 | end) 303 | end 304 | 305 | test("doc_end", function() 306 | local buf = sink_buffer() 307 | local 
error_object = {} -- do not throw a string here, otherwise the Lua runtime will decorate it 308 | local builder = lolhtml.new_rewriter_builder() 309 | :add_document_content_handlers{ 310 | doc_end_handler = function() error(error_object) end 311 | } 312 | collectgarbage("collect") -- lose the ref to the handler function 313 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 314 | assert(rewriter:write(basic_page)) 315 | local ok, err = rewriter:close() 316 | assert_nil(ok) 317 | assert_equal(err, error_object) 318 | -- the result should be a prefix of the page we fed 319 | assert_equal(basic_page:find(buf:value(), 1, true), 1) 320 | end) 321 | end) 322 | 323 | 324 | describe("callback return values", function() 325 | local function run(val) 326 | local builder = lolhtml.new_rewriter_builder() 327 | :add_document_content_handlers { 328 | text_handler = function(chunk) return val end 329 | } 330 | local buf = sink_buffer() 331 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 332 | local ok, err = rewriter:write("foobarbaz") 333 | return rewriter, ok, err, buf 334 | end 335 | 336 | test("continue", function() 337 | local rewriter, ok, err, buf = run(lolhtml.CONTINUE) 338 | assert_equal(ok, rewriter) -- no error: return self 339 | assert_equal(rewriter:close(), rewriter) 340 | assert_equal(buf:value(), "foobarbaz") 341 | end) 342 | 343 | for testcase, val in pairs { 344 | ["stop"] = lolhtml.STOP, 345 | ["other numbers"] = 42, 346 | ["wrong type"] = {} 347 | } do 348 | test(testcase, function() 349 | local rewriter, ok, err, buf = run(val) 350 | assert_nil(ok) 351 | assert_type(err, "string") 352 | -- continuing to use the rewriter will result in errors 353 | assert_nil(rewriter:write("foo")) 354 | assert_nil(rewriter:close()) 355 | end) 356 | end 357 | end) 358 | end) 359 | 360 | test("write after close", function() 361 | local buf = sink_buffer() 362 | local rewriter = lolhtml.new_rewriter { 363 | 
builder=lolhtml.new_rewriter_builder(), 364 | sink = buf, 365 | } 366 | 367 | assert(rewriter:write("hello, ")) 368 | assert(rewriter:close()) 369 | assert_nil(rewriter:write("world")) 370 | end) 371 | 372 | test("sink throw errors", function() 373 | local called = false 374 | local error_object = {} 375 | local rewriter = lolhtml.new_rewriter { 376 | builder=lolhtml.new_rewriter_builder(), 377 | sink = function() 378 | called = true 379 | error(error_object) 380 | end 381 | } 382 | 383 | -- XXX: hard to tell when the error will be thrown (at the :write call 384 | -- or at the :close). If this test breaks, it might be because of an 385 | -- internal change in lol-html 386 | local ok, err = rewriter:write("hello, world") 387 | assert_true(called) 388 | assert_nil(ok) 389 | assert_equal(err, error_object) 390 | local ok, err = rewriter:close() 391 | assert_nil(ok) 392 | assert_equal(err, "broken rewriter") 393 | end) 394 | 395 | test("selector syntax errors", function() 396 | local ok, err = lolhtml.new_selector("foo[attr=") 397 | assert_nil(ok) 398 | assert_type(err, "string") 399 | end) 400 | 401 | describe("element content handlers", function() 402 | -- comment/text are the same as the document handlers, so minimal testing is done 403 | test("comment_handler", function() 404 | local buf = sink_buffer() 405 | local builder = lolhtml.new_rewriter_builder() 406 | :add_element_content_handlers{ 407 | selector = lolhtml.new_selector("strong"), 408 | comment_handler = function(comment) 409 | assert_equal(comment:get_text(), " name ") 410 | comment:set_text(" World ") 411 | end, 412 | } 413 | collectgarbage("collect") 414 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 415 | assert(rewriter:write("hello, ")) 416 | assert(rewriter:close()) 417 | assert_equal(buf:value(), "hello, ") 418 | end) 419 | 420 | test("text_handler", function() 421 | local buf = sink_buffer() 422 | local builder = lolhtml.new_rewriter_builder() 423 | 
:add_element_content_handlers{ 424 | selector = lolhtml.new_selector("strong"), 425 | text_handler = function(text) 426 | -- this handler might be called multiple times 427 | if text:get_text() == "name" then 428 | text:replace("World") 429 | end 430 | end, 431 | } 432 | collectgarbage("collect") 433 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 434 | assert(rewriter:write("hello, name")) 435 | assert(rewriter:close()) 436 | assert_equal(buf:value(), "hello, World") 437 | end) 438 | 439 | describe("element_handler", function() 440 | local function run_parser(sel, input, cb) 441 | local buf = sink_buffer() 442 | local builder = lolhtml.new_rewriter_builder() 443 | :add_element_content_handlers{ 444 | selector = lolhtml.new_selector(sel), 445 | element_handler = cb, 446 | } 447 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 448 | assert(rewriter:write(input)) 449 | assert(rewriter:close()) 450 | collectgarbage("collect") -- lose the ref to the handler and builder 451 | return buf:value() 452 | end 453 | 454 | test("get_tag_name/get_namespace_uri", function() 455 | local called = 0 456 | local out = run_parser("h1", basic_page, function(el) 457 | called = called + 1 458 | assert_equal(el:get_tag_name(), "h1") 459 | assert_type(el:get_namespace_uri(), "string") 460 | end) 461 | 462 | assert_equal(1, called) 463 | assert_equal(out, basic_page) 464 | end) 465 | 466 | test("get_attribute/has_attribute", function() 467 | local called = 0 468 | local out = run_parser("a", 'hello, World!', function(el) 469 | called = called + 1 470 | assert_true(el:has_attribute("href")) 471 | assert_equal(el:get_attribute("href"), "http://example.com") 472 | assert_false(el:has_attribute("foo")) 473 | assert_nil(el:get_attribute("foo")) 474 | end) 475 | assert_equal(1, called) 476 | assert_equal(out, 'hello, World!') 477 | end) 478 | 479 | test("set_attribute", function() 480 | local out = run_parser("a", 'hello, World!', function(el) 481 | 
assert_not_nil(el:set_attribute("href", "https://example.com")) 482 | assert_not_nil(el:set_attribute("target", "_blank")) 483 | end) 484 | 485 | -- XXX: the position of new attributes is kind of an implementation detail, so this might break easily 486 | assert_equal(out, 'hello, World!') 487 | end) 488 | 489 | test("remove attribute", function() 490 | local out = run_parser("a", 'hello, World!', 491 | function(el) 492 | assert_not_nil(el:remove_attribute("target")) 493 | assert_not_nil(el:remove_attribute("foo")) -- removing a non-existent attribute should "work" 494 | end) 495 | assert_equal(out, 'hello, World!') 496 | end) 497 | 498 | local test_table = { 499 | { method="before", is_html="hello, World!", no_html="hello, <TEST>World!" }, 500 | { method="after", is_html="hello, World!", no_html="hello, World<TEST>!" }, 501 | { method="prepend", is_html="hello, World!", no_html="hello, <TEST>World!" }, 502 | { method="append", is_html="hello, World!", no_html="hello, World<TEST>!" }, 503 | { method="set_inner_content", is_html="hello, !", no_html="hello, <TEST>!" }, 504 | { method="replace", is_html="hello, !", no_html="hello, <TEST>!" }, 505 | } 506 | 507 | for _, testcase in ipairs(test_table) do 508 | test(testcase.method .. " is_html=true", function() 509 | local out = run_parser("b", 'hello, World!', function(el) 510 | el[testcase.method](el, "", true) 511 | end) 512 | assert_equal(out, testcase.is_html) 513 | end) 514 | test(testcase.method .. 
" is_html=false", function() 515 | local out = run_parser("b", 'hello, World!', function(el) 516 | el[testcase.method](el, "", false) 517 | end) 518 | assert_equal(out, testcase.no_html) 519 | end) 520 | end 521 | 522 | test("remove", function() 523 | local out = run_parser("b", 'hello, World!', function(el) 524 | assert_false(el:is_removed()) 525 | assert_not_nil(el:remove()) 526 | assert_true(el:is_removed()) 527 | end) 528 | assert_equal(out, 'hello, !') 529 | end) 530 | 531 | test("remove_and_keep_content", function() 532 | local out = run_parser("b", 'hello, World!', function(el) 533 | assert_false(el:is_removed()) 534 | assert_not_nil(el:remove_and_keep_content()) 535 | assert_true(el:is_removed()) 536 | end) 537 | assert_equal(out, 'hello, World!') 538 | end) 539 | 540 | test("attributes", function() 541 | local called = 0 542 | run_parser("a", 'hello, World!', 543 | function(el) 544 | local it = 0 545 | called = called + 1 546 | for name, value in el:attributes() do 547 | it = it + 1 548 | if it == 1 then 549 | assert_equal(name, "href") 550 | assert_equal(value, "http://example.com") 551 | elseif it == 2 then 552 | assert_equal(name, "target") 553 | assert_equal(value, "_blank") 554 | else 555 | error("more than 2 iterations") 556 | end 557 | end 558 | end) 559 | assert_equal(called, 1) 560 | end) 561 | 562 | test("usage after lifetime", function() 563 | local el 564 | run_parser("b", 'hello, World!', function(e) el=e end) 565 | assert_error(function() el:get_tag_name() end) 566 | assert_error(function() el:get_namespace_uri() end) 567 | assert_error(function() el:get_attribute("foo") end) 568 | assert_error(function() el:has_attribute("foo") end) 569 | assert_error(function() el:set_attribute("foo", "bar") end) 570 | assert_error(function() el:remove_attribute("foo") end) 571 | assert_error(function() el:attributes() end) 572 | assert_error(function() el:before("foo") end) 573 | assert_error(function() el:after("foo") end) 574 | assert_error(function() 
el:prepend("foo") end) 575 | assert_error(function() el:append("foo") end) 576 | assert_error(function() el:set_inner_content("foo") end) 577 | assert_error(function() el:replace("foo") end) 578 | assert_error(function() el:is_removed() end) 579 | assert_error(function() el:remove() end) 580 | assert_error(function() el:remove_and_keep_content() end) 581 | end) 582 | 583 | test("multiple selectors", function() 584 | local buf = sink_buffer() 585 | local builder = lolhtml.new_rewriter_builder() 586 | :add_element_content_handlers { 587 | selector = lolhtml.new_selector("span"), 588 | element_handler = function(el) 589 | el:set_inner_content("span content") 590 | end 591 | } 592 | :add_element_content_handlers { 593 | selector = lolhtml.new_selector("div"), 594 | element_handler = function(el) 595 | el:set_inner_content("div content") 596 | end 597 | } 598 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 599 | collectgarbage("collect") 600 | assert(rewriter:write("aaa bbb ccc
ddd
eee fff ggg")) 601 | assert(rewriter:close()) 602 | collectgarbage("collect") 603 | assert_equal(buf:value(), "aaa span content ccc
div content
eee span content ggg") 604 | end) 605 | test("multiple handlers for the same selector", function() 606 | local buf = sink_buffer() 607 | local counter = 0 608 | local builder = lolhtml.new_rewriter_builder() 609 | :add_element_content_handlers { 610 | selector = lolhtml.new_selector("span"), 611 | element_handler = function(el) 612 | el:set_inner_content("span content") 613 | end 614 | } 615 | :add_element_content_handlers { 616 | selector = lolhtml.new_selector("span"), 617 | element_handler = function(el) 618 | counter = counter + 1 619 | el:set_attribute("count", tostring(counter)) 620 | end 621 | } 622 | local rewriter = lolhtml.new_rewriter { builder=builder, sink=buf } 623 | collectgarbage("collect") 624 | assert(rewriter:write("aaa bbb ccc
ddd
eee fff ggg")) 625 | assert(rewriter:close()) 626 | collectgarbage("collect") 627 | assert_equal(buf:value(), 'aaa span content ccc
ddd
eee span content ggg') 628 | end) 629 | end) 630 | end) 631 | end) 632 | --------------------------------------------------------------------------------