├── .gitignore ├── tests ├── testdata │ ├── encoding │ │ ├── tests1.dat │ │ ├── test-yahoo-jp.dat │ │ └── tests2.dat │ ├── tokenizer │ │ ├── pendingSpecChanges.test │ │ ├── aa-lisp-tests.test │ │ ├── xmlViolation.test │ │ ├── unicodeCharsProblematic.test │ │ ├── escapeFlag.test │ │ ├── contentModelFlags.test │ │ ├── domjs.test │ │ ├── test1.test │ │ └── test2.test │ ├── tree-construction │ │ ├── pending-spec-changes-plain-text-unsafe.dat │ │ ├── button.dat │ │ ├── adoption02.dat │ │ ├── inbody01.dat │ │ ├── isindex.dat │ │ ├── tests4.dat │ │ ├── tests24.dat │ │ ├── tests14.dat │ │ ├── pending-spec-changes.dat │ │ ├── webkit02.dat │ │ ├── tests12.dat │ │ ├── comments01.dat │ │ ├── tests17.dat │ │ ├── adoption01.dat │ │ ├── tests21.dat │ │ ├── tests25.dat │ │ ├── tests23.dat │ │ ├── tables01.dat │ │ ├── html5test-com.dat │ │ ├── tests22.dat │ │ ├── tests8.dat │ │ ├── tests5.dat │ │ ├── entities02.dat │ │ ├── tests15.dat │ │ ├── tricky01.dat │ │ ├── tests18.dat │ │ ├── scriptdata01.dat │ │ ├── plain-text-unsafe.dat │ │ ├── tests3.dat │ │ ├── tests26.dat │ │ ├── doctype01.dat │ │ ├── tests20.dat │ │ └── tests7.dat │ └── serializer │ │ ├── options.test │ │ ├── whitespace.test │ │ ├── injectmeta.test │ │ └── core.test ├── packages.lisp ├── run-tests.lisp ├── support.lisp ├── test-inputstream.lisp ├── test-tree-builder.lisp └── test-parser.lisp ├── .travis.yml ├── LICENSES ├── html5-parser-class.lisp ├── packages.lisp ├── cxml └── cxml-dom.lisp ├── cl-html5-parser.asd ├── xmls.lisp ├── README.md ├── simple-tree.lisp └── COPYING.LIB /.gitignore: -------------------------------------------------------------------------------- 1 | README.html 2 | 3 | *.fasl 4 | -------------------------------------------------------------------------------- /tests/testdata/encoding/tests1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruricolist/cl-html5-parser/master/tests/testdata/encoding/tests1.dat 
-------------------------------------------------------------------------------- /tests/testdata/tokenizer/pendingSpecChanges.test: -------------------------------------------------------------------------------- 1 | {"tests": [ 2 | 3 | {"description":" 6 | Yahoo! JAPAN 7 | 8 |
20 | #errors 21 | #document 22 | | 23 | | 24 | | 25 | | 26 | |
27 | | 28 | | "] 49 | } 50 | 51 | ]} -------------------------------------------------------------------------------- /packages.lisp: -------------------------------------------------------------------------------- 1 | ;;;; HTML5 parser for Common Lisp 2 | ;;;; 3 | ;;;; Copyright (C) 2012 Thomas Bakketun 4 | ;;;; Copyright (C) 2012 Asgeir Bjørlykke 5 | ;;;; Copyright (C) 2012 Mathias Hellevang 6 | ;;;; Copyright (C) 2012 Stian Sletner 7 | ;;;; 8 | ;;;; This library is free software: you can redistribute it and/or modify 9 | ;;;; it under the terms of the GNU Lesser General Public License as published 10 | ;;;; by the Free Software Foundation, either version 3 of the License, or 11 | ;;;; (at your option) any later version. 12 | ;;;; 13 | ;;;; This library is distributed in the hope that it will be useful, 14 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | ;;;; GNU General Public License for more details. 17 | ;;;; 18 | ;;;; You should have received a copy of the GNU General Public License 19 | ;;;; along with this library. If not, see . 
;;; Package definitions for the parser.
;;;
;;; :html5-constants exports the shared constants and small helpers; it is
;;; :use'd by :html5-parser below.  Each symbol is exported exactly once
;;; (the previous list repeated four of them).
(defpackage :html5-constants
  (:use
   :common-lisp)
  (:export
   ;; Namespaces
   #:+namespaces+
   #:find-namespace
   #:find-prefix

   ;; Element sets
   #:+scoping-elements+
   #:+formatting-elements+
   #:+special-elements+
   #:+html-integration-point-elements+
   #:+mathml-text-integration-point-elements+
   #:+table-insert-mode-elements+
   #:+cdata-elements+
   #:+rcdata-elements+
   #:+heading-elements+

   ;; Tokens
   #:+eof+
   #:+token-types+
   #:+tag-token-types+

   ;; Characters
   #:+space-characters+
   #:+ascii-lowercase+
   #:+ascii-uppercase+
   #:+ascii-letters+
   #:ascii-letter-p
   #:+digits+
   #:+hex-digits+
   #:ascii-upper-2-lower
   #:+replacement-characters+

   ;; Doctypes
   #:+quirks-mode-doctypes-regexp+))

;;; :html5-parser is the public package: the parsing entry points, the
;;; TRANSFORM-HTML5-DOM generic function and the accessors of the simple DOM.
(defpackage :html5-parser
  (:use
   :common-lisp
   :html5-constants
   :cl-ppcre)
  (:export
   #:parse-html5
   #:parse-html5-fragment
   #:transform-html5-dom

   ;; A simple DOM
   #:make-document
   #:make-fragment
   #:make-doctype
   #:make-comment
   #:make-element
   #:make-text-node

   #:node-type
   #:node-name
   #:node-namespace
   #:node-value
   #:node-public-id
   #:node-system-id
   #:element-attribute

   #:node-append-child
   #:node-insert-before
   #:node-remove-child

   #:node-parent
   #:node-first-child
   #:node-last-child
   #:node-previous-sibling
   #:node-next-sibling
   #:element-map-attributes
   #:element-map-children))
-------------------------------------------------------------------------------- /tests/testdata/tokenizer/contentModelFlags.test: -------------------------------------------------------------------------------- 1 | {"tests": [ 2 | 3 | {"description":"PLAINTEXT content model flag", 4 | "initialStates":["PLAINTEXT state"], 5 | 
"lastStartTag":"plaintext", 6 | "input":"&body;", 7 | "output":[["Character", "&body;"]]}, 8 | 9 | {"description":"End tag closing RCDATA or RAWTEXT", 10 | "initialStates":["RCDATA state", "RAWTEXT state"], 11 | "lastStartTag":"xmp", 12 | "input":"foo", 13 | "output":[["Character", "foo"], ["EndTag", "xmp"]]}, 14 | 15 | {"description":"End tag closing RCDATA or RAWTEXT (case-insensitivity)", 16 | "initialStates":["RCDATA state", "RAWTEXT state"], 17 | "lastStartTag":"xmp", 18 | "input":"foo", 19 | "output":[["Character", "foo"], ["EndTag", "xmp"]]}, 20 | 21 | {"description":"End tag closing RCDATA or RAWTEXT (ending with space)", 22 | "initialStates":["RCDATA state", "RAWTEXT state"], 23 | "lastStartTag":"xmp", 24 | "input":"foobar", 49 | "output":[["Character", "bar"], ["EndTag", "xmp"]]}, 50 | 51 | {"description":"End tag with incorrect name in RCDATA or RAWTEXT (starting like correct name)", 52 | "initialStates":["RCDATA state", "RAWTEXT state"], 53 | "lastStartTag":"xmp", 54 | "input":"bar", 55 | "output":[["Character", "bar"]]}, 56 | 57 | {"description":"End tag closing RCDATA or RAWTEXT, switching back to PCDATA", 58 | "initialStates":["RCDATA state", "RAWTEXT state"], 59 | "lastStartTag":"xmp", 60 | "input":"foo", 61 | "output":[["Character", "foo"], ["EndTag", "xmp"], ["EndTag", "baz"]]}, 62 | 63 | {"description":"RAWTEXT w/ something looking like an entity", 64 | "initialStates":["RAWTEXT state"], 65 | "lastStartTag":"xmp", 66 | "input":"&foo;", 67 | "output":[["Character", "&foo;"]]}, 68 | 69 | {"description":"RCDATA w/ an entity", 70 | "initialStates":["RCDATA state"], 71 | "lastStartTag":"textarea", 72 | "input":"<", 73 | "output":[["Character", "<"]]} 74 | 75 | ]} 76 | -------------------------------------------------------------------------------- /tests/testdata/tree-construction/tests17.dat: -------------------------------------------------------------------------------- 1 | #data 2 | 3 | #errors 4 | #document 5 | | 6 | | 7 | | 8 | | 9 | |
11 | | 12 | | 13 | 14 | #data 15 |
16 | #errors 17 | #document 18 | | 19 | | 20 | | 21 | | 22 | | 24 | | 25 | | 26 | |
27 | 28 | #data 29 |
30 | #errors 31 | #document 32 | | 33 | | 34 | | 35 | | 36 | | 37 | | 38 | | 39 | |
40 | | 42 | 43 | #data 44 |
45 | #errors 46 | #document 47 | | 48 | | 49 | | 50 | | 51 | | 52 | | 53 | | 54 | |
55 | | 57 | 58 | #data 59 | 60 | #errors 61 | #document 62 | | 63 | | 64 | | 65 | | 66 | |
67 | | 70 | | 71 | 72 | #data 73 | 74 | #errors 75 | #document 76 | | 77 | | 78 | | 79 | | 80 | | 104 | #errors 105 | #document 106 | | 107 | | 108 | | 109 | | 110 | | 114 | #errors 115 | #document 116 | | 117 | | 118 | | 119 | | 120 | | 124 | #errors 125 | #document 126 | | 127 | | 128 | | 129 | | 130 | |
68 | |
84 | #errors 85 | #document 86 | | 87 | | 88 | | 89 | | 90 | | 94 | #errors 95 | #document 96 | | 97 | | 98 | | 99 | | 100 | |
134 | #errors 135 | #document 136 | | 137 | | 138 | | 139 | | 140 | |
a 144 | #errors 145 | #document 146 | | 147 | | 148 | | 149 | | 150 | | 151 | | 152 | | 153 | | "a" 154 | -------------------------------------------------------------------------------- /cxml/cxml-dom.lisp: -------------------------------------------------------------------------------- 1 | ;;;; HTML5 parser for Common Lisp 2 | ;;;; 3 | ;;;; Copyright (C) 2014 Joe Taylor 4 | ;;;; Copyright (C) 2012 Thomas Bakketun 5 | ;;;; Copyright (C) 2012 Asgeir Bjørlykke 6 | ;;;; Copyright (C) 2012 Mathias Hellevang 7 | ;;;; Copyright (C) 2012 Stian Sletner 8 | ;;;; 9 | ;;;; This library is free software: you can redistribute it and/or modify 10 | ;;;; it under the terms of the GNU Lesser General Public License as published 11 | ;;;; by the Free Software Foundation, either version 3 of the License, or 12 | ;;;; (at your option) any later version. 13 | ;;;; 14 | ;;;; This library is distributed in the hope that it will be useful, 15 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | ;;;; GNU General Public License for more details. 18 | ;;;; 19 | ;;;; You should have received a copy of the GNU General Public License 20 | ;;;; along with this library. If not, see . 
(in-package #:html5-parser)

(defmethod transform-html5-dom ((to-type (eql :cxml)) node &key)
  "Rebuild the tree rooted at NODE inside a new, empty CXML DOM document.

Dispatches on (node-type NODE):
  :document           -- children are converted and the CXML document is
                         returned.
  :document-type      -- ignored; yields NIL.
  :document-fragment  -- children are converted into a new DOM document
                         fragment, which is returned.
  :element, :text, :comment
                      -- the matching DOM node is created and appended to its
                         parent (the document itself when there is none); the
                         value of DOM:APPEND-CHILD is returned."
  (let ((document (cxml:parse-empty-document nil nil (cxml-dom:make-dom-builder))))
    (labels ((attach (parent child)
               ;; Nodes converted without a parent hang off the document.
               (dom:append-child (or parent document) child))
             (convert-children (source target)
               (element-map-children (lambda (child)
                                       (convert child target))
                                     source))
             (convert (node &optional parent)
               (ecase (node-type node)
                 (:document
                  (element-map-children #'convert node)
                  document)
                 (:document-type
                  nil)
                 (:document-fragment
                  (let ((fragment (dom:create-document-fragment document)))
                    (convert-children node fragment)
                    fragment))
                 (:element
                  (let ((element (dom:create-element-ns document
                                                        (node-namespace node)
                                                        (node-name node))))
                    (element-map-attributes
                     (lambda (name namespace value)
                       (dom:set-attribute-ns element namespace name value))
                     node)
                    ;; The element is populated before it is attached.
                    (convert-children node element)
                    (attach parent element)))
                 (:text
                  (attach parent
                          (dom:create-text-node document (node-value node))))
                 (:comment
                  (attach parent
                          (dom:create-comment document (node-value node)))))))
      (convert node))))
-------------------------------------------------------------------------------- /cl-html5-parser.asd: -------------------------------------------------------------------------------- 1 | ;;;; HTML5 parser for Common Lisp 2 | ;;;; 3 | ;;;; Copyright (C) 2012 Thomas Bakketun 4 | ;;;; Copyright (C) 2012 Asgeir Bjørlykke 5 | ;;;; Copyright (C) 2012 Mathias Hellevang 6 | ;;;; Copyright (C) 2012 Stian Sletner 7 | ;;;; 8 | ;;;; This library is free software: you can redistribute it and/or modify 9 | ;;;; it under the terms of the GNU Lesser General Public License as published 10 | ;;;; by the Free Software Foundation, either version 3 of the License, or 11 | ;;;; (at your option) any later version. 
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;;; GNU General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU General Public License
;;;; along with this library. If not, see .

;;; The parser itself.  :serial t makes every file depend on the ones
;;; listed before it, so the order below is the load order.
(defsystem #:cl-html5-parser
  :name "cl-html5-parser"
  :description "A HTML5 parser for Common Lisp"
  :licence "GNU Lesser General Public License"
  :author "Thomas Bakketun "
  :depends-on (:cl-ppcre :flexi-streams :string-case)
  :serial t
  :components ((:file "packages")
               (:file "constants")
               (:file "entities")
               (:file "inputstream")
               (:file "tokenizer")
               (:file "simple-tree")
               (:file "html5-parser-class")
               (:file "tree-help")
               (:file "html5-parser")
               (:file "xmls")))

;;; Test suite; all sources live in the tests/ module and load serially.
(defsystem #:cl-html5-parser-tests
  :depends-on (:cl-html5-parser :stefil :cl-json :split-sequence)
  :components ((:module tests
                :serial t
                :components
                ((:file "packages")
                 (:file "support")
                 (:file "test-inputstream")
                 (:file "test-tokenizer")
                 (:file "test-tree-builder")
                 (:file "test-parser")
                 (:file "run-tests")))))

;;; TEST-OP on the main system loads the test system and then calls
;;; RUN-HTML5-PARSER-TESTS.  The function is looked up by name at run time
;;; because the HTML5-PARSER-TESTS package does not exist until the test
;;; system has been loaded.
(defmethod perform ((o test-op) (c (eql (find-system '#:cl-html5-parser))))
  (operate 'load-op '#:cl-html5-parser-tests)
  (funcall (find-symbol (string :run-html5-parser-tests)
                        :html5-parser-tests)))

;;; Optional CXML integration (the :cxml method of TRANSFORM-HTML5-DOM).
;;; The :name now matches the system being defined; it previously repeated
;;; "cl-html5-parser".
(defsystem #:cl-html5-parser-cxml
  :name "cl-html5-parser-cxml"
  :description "CXML integration for cl-html5-parser"
  :licence "GNU Lesser General Public License"
  :author "Thomas Bakketun "
  :depends-on (:cl-html5-parser :cxml)
  :serial t
  :components ((:module "cxml"
                :serial t
                :components
                ((:file "cxml-dom")))))
-------------------------------------------------------------------------------- /tests/testdata/tree-construction/adoption01.dat: -------------------------------------------------------------------------------- 1 | #data 2 |

3 | #errors 4 | #document 5 | | 6 | | 7 | | 8 | | 9 | |

10 | | 11 | 12 | #data 13 | 1

23

14 | #errors 15 | #document 16 | | 17 | | 18 | | 19 | | 20 | | "1" 21 | |

22 | | 23 | | "2" 24 | | "3" 25 | 26 | #data 27 | 1 28 | #errors 29 | #document 30 | | 31 | | 32 | | 33 | | 34 | | "1" 35 | |

1

23

74 | #errors 75 | #document 76 | | 77 | | 78 | | 79 | | 80 | | "1" 81 | |

82 | | 83 | | "2" 84 | | "3" 85 | |

86 | 87 | #data 88 |

89 | #errors 90 | #document 91 | | 92 | | 93 | | 94 | | 95 | | 96 | | 97 | |

98 | | 99 | 100 | #data 101 |

102 | #errors 103 | #document 104 | | 105 | | 106 | | 107 | | 108 | | 109 | | 110 | | 111 | |

112 | | 113 | 114 | #data 115 |

116 | #errors 117 | #document 118 | | 119 | | 120 | | 121 | | 122 | | 123 | | 124 | | 125 | | 126 | |

127 | | 128 | 129 | #data 130 |

123

45 131 | #errors 132 | #document 133 | | 134 | | 135 | | 136 | |

137 | | "1" 138 | | 139 | | id="A" 140 | | "2" 141 | | 142 | | id="B" 143 | | "3" 144 | | 145 | | id="A" 146 | | 147 | | id="B" 148 | | "4" 149 | | 150 | | id="B" 151 | | "5" 152 | 153 | #data 154 |

13
2
155 | #errors 156 | #document 157 | | 158 | | 159 | | 160 | | 161 | | "1" 162 | | 163 | | "3" 164 | | 165 | | 166 | | 167 | |
168 | | "2" 169 | 170 | #data 171 | AC
B
172 | #errors 173 | #document 174 | | 175 | | 176 | | 177 | | "AC" 178 | | 179 | | 180 | | 181 | | A 69 | #errors 70 | 26: Stray start tag “col”. 71 | #document 72 | | 73 | | 74 | | 75 | | 76 | | "A" 77 | 78 | #data 79 | A 80 | #errors 81 | #document 82 | | 83 | | 84 | | 85 | | 86 | | 87 | | "A" 88 | 89 | #data 90 | A 91 | #errors 92 | #document 93 | | 94 | | 95 | | 96 | | 97 | | 98 | | "A" 99 | 100 | #data 101 | A 102 | #errors 103 | 26: Stray start tag “frame”. 104 | #document 105 | | 106 | | 107 | | 108 | | 109 | | "A" 110 | 111 | #data 112 |
A 113 | #errors 114 | #document 115 | | 116 | | 117 | | 118 | | 119 | |
120 | | "A" 121 | 122 | #data 123 | A 124 | #errors 125 | #document 126 | | 127 | | 128 | | 129 | | 130 | | 131 | | "A" 132 | 133 | #data 134 | A 135 | #errors 136 | #document 137 | | 138 | | 139 | | 140 | | 141 | | 142 | | "A" 143 | 144 | #data 145 | A 146 | #errors 147 | #document 148 | | 149 | | 150 | | 151 | | 152 | | 153 | | "A" 154 | 155 | #data 156 | A 157 | #errors 158 | #document 159 | | 160 | | 161 | | 162 | | 163 | | 164 | | "A" 165 | 166 | #data 167 | A 168 | #errors 169 | #document 170 | | 171 | | 172 | | 173 | | 174 | | 175 | | "A" 176 | 177 | #data 178 | A 179 | #errors 180 | #document 181 | | 182 | | 183 | | 184 | | 185 | | 186 | | "A" 187 | 188 | #data 189 | A 190 | #errors 191 | #document 192 | | 193 | | 194 | | 195 | | 196 | | 197 | | "A" 198 | 199 | #data 200 | A 201 | #errors 202 | #document 203 | | 204 | | 205 | | 206 | | 207 | | 208 | | "A" 209 | 210 | #data 211 | A 212 | #errors 213 | #document 214 | | 215 | | 216 | | 217 | | 218 | | 219 | | "A" 220 | -------------------------------------------------------------------------------- /tests/testdata/tree-construction/tests23.dat: -------------------------------------------------------------------------------- 1 | #data 2 |

X 3 | #errors 4 | 3: Start tag seen without seeing a doctype first. Expected “”. 5 | 116: Unclosed elements. 6 | 117: End of file seen and there were open elements. 7 | #document 8 | | 9 | | 10 | | 11 | |

12 | | 13 | | size="4" 14 | | 15 | | color="red" 16 | | 17 | | size="4" 18 | | 19 | | size="4" 20 | | 21 | | size="4" 22 | | 23 | | size="4" 24 | | 25 | | size="4" 26 | | 27 | | color="red" 28 | |

29 | | 30 | | color="red" 31 | | 32 | | size="4" 33 | | 34 | | size="4" 35 | | 36 | | size="4" 37 | | 38 | | color="red" 39 | | "X" 40 | 41 | #data 42 |

X 43 | #errors 44 | #document 45 | | 46 | | 47 | | 48 | |

49 | | 50 | | size="4" 51 | | 52 | | size="4" 53 | | 54 | | size="4" 55 | | 56 | | size="4" 57 | |

58 | | 59 | | size="4" 60 | | 61 | | size="4" 62 | | 63 | | size="4" 64 | | "X" 65 | 66 | #data 67 |

X 68 | #errors 69 | #document 70 | | 71 | | 72 | | 73 | |

74 | | 75 | | size="4" 76 | | 77 | | size="4" 78 | | 79 | | size="4" 80 | | 81 | | size="5" 82 | | 83 | | size="4" 84 | |

85 | | 86 | | size="4" 87 | | 88 | | size="4" 89 | | 90 | | size="5" 91 | | 92 | | size="4" 93 | | "X" 94 | 95 | #data 96 |

X 97 | #errors 98 | #document 99 | | 100 | | 101 | | 102 | |

103 | | 104 | | id="a" 105 | | size="4" 106 | | 107 | | id="b" 108 | | size="4" 109 | | 110 | | size="4" 111 | | 112 | | size="4" 113 | |

114 | | 115 | | id="a" 116 | | size="4" 117 | | 118 | | id="b" 119 | | size="4" 120 | | 121 | | size="4" 122 | | 123 | | size="4" 124 | | "X" 125 | 126 | #data 127 |

X

Y 128 | #errors 129 | #document 130 | | 131 | | 132 | | 133 | |

134 | | 135 | | id="a" 136 | | 137 | | id="a" 138 | | 139 | | id="a" 140 | | 141 | | 142 | | 143 | | id="a" 144 | | 145 | | id="a" 146 | | "X" 147 | |

148 | | 149 | | id="a" 150 | | 151 | | id="a" 152 | | 153 | | id="a" 154 | | 155 | | "Y" 156 | -------------------------------------------------------------------------------- /tests/testdata/tree-construction/tables01.dat: -------------------------------------------------------------------------------- 1 | #data 2 |

182 | | "B" 183 | 184 | #data 185 |
186 | #errors 187 | #document 188 | | 189 | | 190 | | 191 | | 192 | | 193 | | 194 | | 195 | -------------------------------------------------------------------------------- /xmls.lisp: -------------------------------------------------------------------------------- 1 | ;;;; HTML5 parser for Common Lisp 2 | ;;;; 3 | ;;;; Copyright (C) 2012 Thomas Bakketun 4 | ;;;; Copyright (C) 2012 Asgeir Bjørlykke 5 | ;;;; Copyright (C) 2012 Mathias Hellevang 6 | ;;;; Copyright (C) 2012 Stian Sletner 7 | ;;;; 8 | ;;;; This library is free software: you can redistribute it and/or modify 9 | ;;;; it under the terms of the GNU Lesser General Public License as published 10 | ;;;; by the Free Software Foundation, either version 3 of the License, or 11 | ;;;; (at your option) any later version. 12 | ;;;; 13 | ;;;; This library is distributed in the hope that it will be useful, 14 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | ;;;; GNU General Public License for more details. 17 | ;;;; 18 | ;;;; You should have received a copy of the GNU General Public License 19 | ;;;; along with this library. If not, see . 20 | 21 | (in-package #:html5-parser) 22 | 23 | (defmethod transform-html5-dom ((to-type (eql :xmls)) node 24 | &key namespace comments) 25 | "Convert a node into an XMLS-compatible tree of conses, starting 26 | at. If the node is a document-fragement a list of XMLS trees is returned." 
27 | (labels ((node-to-xmls (node) 28 | (ecase (node-type node) 29 | (:document 30 | (let (root) 31 | (element-map-children (lambda (n) 32 | (when (string= (node-name n) "html") 33 | (setf root n))) 34 | node) 35 | (assert root) 36 | (node-to-xmls root))) 37 | (:document-fragment 38 | (let (xmls-nodes) 39 | (element-map-children (lambda (node) 40 | (push (node-to-xmls node) 41 | xmls-nodes)) 42 | node) 43 | (nreverse xmls-nodes))) 44 | (:element 45 | (let (attrs children) 46 | (element-map-attributes (lambda (name namespace value) 47 | (declare (ignore namespace)) 48 | (push (list name value) attrs)) 49 | node) 50 | (element-map-children (lambda (c) 51 | (push c children)) 52 | node) 53 | 54 | (apply #'list 55 | (if namespace 56 | (cons (node-name node) (node-namespace node)) 57 | (node-name node)) 58 | attrs 59 | (mapcar (lambda (c) 60 | (node-to-xmls c)) 61 | (nreverse children))))) 62 | (:text 63 | (node-value node)) 64 | (:comment 65 | (when comments 66 | (list :comment nil (node-value node))))))) 67 | (node-to-xmls node))) 68 | 69 | (defmethod transform-html5-dom ((to-type (eql :xmls-ns)) node &key) 70 | (transform-html5-dom :xmls node :namespace t)) 71 | -------------------------------------------------------------------------------- /tests/testdata/tokenizer/domjs.test: -------------------------------------------------------------------------------- 1 | { 2 | "tests": [ 3 | { 4 | "description":"CR in bogus comment state", 5 | "input":"", 42 | "output":[["EndTag","xmp"]] 43 | }, 44 | { 45 | "description":"bad endtag in RCDATA and RAWTEXT", 46 | "initialStates":["RCDATA state", "RAWTEXT state"], 47 | "lastStartTag":"xmp", 48 | "input":"", 49 | "output":[["Character",""]] 50 | }, 51 | { 52 | "description":"bad endtag in RCDATA and RAWTEXT", 53 | "initialStates":["RCDATA state", "RAWTEXT state"], 54 | "lastStartTag":"xmp", 55 | "input":"", 56 | "output":[["Character",""]] 57 | }, 58 | { 59 | "description":"bad endtag in RCDATA and RAWTEXT", 60 | 
"initialStates":["RCDATA state", "RAWTEXT state"], 61 | "lastStartTag":"xmp", 62 | "input":"", 75 | "output":[["StartTag", "p", {"id":"\u2242\u0338"}]] 76 | }, 77 | { 78 | "description":"--!NUL in comment ", 79 | "doubleEscaped":true, 80 | "input":"", 81 | "output":["ParseError", ["Comment", "--!\\uFFFD"]] 82 | }, 83 | { 84 | "description":"space EOF after doctype ", 85 | "input":" 28 | #encoding 29 | euc_jp 30 | 31 | #data 32 | 33 | #encoding 34 | euc_jp 35 | 36 | #data 37 | 38 | 39 | #encoding 40 | utf-8 41 | 42 | #data 43 | 44 | 50 | #encoding 51 | windows-1252 52 | 53 | #data 54 | 55 | #encoding 56 | utf-8 57 | 58 | #data 59 | 60 | #encoding 61 | windows-1252 62 | 63 | #data 64 | 76 | #encoding 77 | utf-8 78 | 79 | #data 80 | 86 | #encoding 87 | utf-8 88 | 89 | #data 90 | 91 | #encoding 92 | utf-8 93 | 94 | #data 95 | 96 | #encoding 97 | utf-8 98 | 99 | #data 100 | 101 | 102 | #encoding 103 | utf-8 104 | 105 | #data 106 | 107 | 108 | #encoding 109 | utf-8 110 | 111 | #data 112 | ñ 113 | 114 | #encoding 115 | utf-8 116 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | cl-html5-parser: HTML5 parser for Common Lisp 2 | ============================================= 3 | 4 | ## Abstract 5 | 6 | cl-html5-parser is a HTML5 parser for Common Lisp with the following features: 7 | 8 | * It is a port of the Python library [html5lib](http://code.google.com/p/html5lib/). 9 | * It passes all relevant tests from html5lib. 10 | * It is not tied to a specific DOM implementation. 11 | 12 | 13 | ## Requirements 14 | 15 | * SBCL or ECL. 16 | * CL-PPCRE and FLEXI-STREAMS. 17 | 18 | Might work with CLISP, ABCL and Clozure CL, but many of the tests don't pass there. 19 | 20 | 21 | ## Usage 22 | 23 | 24 | ### Parsing 25 | 26 | Parsing functions are in the package HTML5-PARSER. 
 27 | 28 | ``` 29 | parse-html5 source &key encoding strictp dom 30 | => document, errors 31 | ``` 32 | 33 | Parse an HTML document from source. Source can be a string, a pathname 34 | or a stream. When parsing from a stream, encoding detection is not 35 | supported; the encoding must be supplied via the encoding keyword 36 | parameter. 37 | 38 | When strictp is true, parsing stops on the first error. 39 | 40 | Returns two values. The primary value is the document node. The 41 | secondary value is a list of errors found during parsing. The format 42 | of this list is subject to change. 43 | 44 | The type of document depends on the dom parameter. By default it's an 45 | instance of cl-html5-parser's own DOM implementation. See the DOM 46 | paragraph below for more information. 47 | 48 | ``` 49 | parse-html5-fragment source &key container encoding strictp dom 50 | => document-fragment, errors 51 | ``` 52 | 53 | Parses a fragment of HTML. Container sets the context, defaults to 54 | "div". Returns a document-fragment node. For the other parameters see 55 | `PARSE-HTML5`. 56 | 57 | 58 | ### Example 59 | ``` 60 | (html5-parser:parse-html5-fragment "Parse <i>some</i> HTML" :dom :xmls) 61 | ==> ("Parse " ("i" NIL "some") " HTML") 62 | ``` 63 | 64 | ### The DOM 65 | 66 | Parsing HTML5 is not possible without a 67 | [DOM](http://en.wikipedia.org/wiki/Document_Object_Model). cl-html5-parser 68 | defines a minimal DOM implementation for this task. Functions for 69 | traversing documents are exported by the HTML5-PARSER package. 70 | 71 | Alternatively, the parser can be instructed to convert the document 72 | into other DOM implementations using the dom parameter. The conversion 73 | is done by simply calling the generic function 74 | transform-html5-dom. Support for other DOM implementations can be 75 | added by defining new methods for this generic function. The dom 76 | parameter is either a symbol or a list where the car is a symbol and 77 | the rest are keyword arguments. 
Below are the currently supported target 78 | types. 79 | 80 | 81 | ### :XMLS or (:XMLS &key namespace comments) 82 | 83 | Converts a node into a simple 84 | [XMLS](http://common-lisp.net/project/xmls/)-like list structure. 85 | If node is a document fragment, a list of XMLS nodes is returned. In 86 | all other cases a single XMLS node is returned. 87 | 88 | If the namespace argument is true, tag names are conses of name and 89 | namespace URI. 90 | 91 | By default comments are stripped. If the comments argument is true, 92 | comments are returned as (:COMMENT NIL "comment text"). This is an extension 93 | of the XMLS format. 94 | 95 | 96 | ### :CXML 97 | 98 | Convert to [Closure XML Parser](http://common-lisp.net/project/cxml/) 99 | DOM implementation. In order to use this you must load/depend on the 100 | system cl-html5-parser-cxml. 101 | 102 | 103 | ## License 104 | 105 | This library is available under the 106 | [GNU Lesser General Public License v3.0](http://www.gnu.org/licenses/lgpl.html). 107 | -------------------------------------------------------------------------------- /tests/testdata/tree-construction/tests21.dat: -------------------------------------------------------------------------------- 1 | #data 2 | foo 3 | #errors 4 | #document 5 | | 6 | | 7 | | 8 | | 9 | | "foo" 10 | 11 | #data 12 | foo 13 | #errors 14 | #document 15 | | 16 | | 17 | | 18 | | 19 | | "foo" 20 | 21 | #data 22 |
23 | #errors 24 | #document 25 | | 26 | | 27 | | 28 | |
29 | | 30 | 31 | #data 32 | foo 33 | #errors 34 | #document 35 | | <html> 36 | | <head> 37 | | <body> 38 | | <svg svg> 39 | | "foo" 40 | 41 | #data 42 | <svg><![CDATA[foo 43 | #errors 44 | #document 45 | | <html> 46 | | <head> 47 | | <body> 48 | | <svg svg> 49 | | "foo" 50 | 51 | #data 52 | <svg><![CDATA[ 53 | #errors 54 | #document 55 | | <html> 56 | | <head> 57 | | <body> 58 | | <svg svg> 59 | 60 | #data 61 | <svg><![CDATA[ 62 | #errors 63 | #document 64 | | 65 | | 66 | | 67 | | 68 | 69 | #data 70 | ]] > 71 | #errors 72 | #document 73 | | 74 | | 75 | | 76 | | 77 | | "]] >" 78 | 79 | #data 80 | ]] > 81 | #errors 82 | #document 83 | | 84 | | 85 | | 86 | | 87 | | "]] >" 88 | 89 | #data 90 | ]] 91 | #errors 92 | #document 93 | | <html> 94 | | <head> 95 | | <body> 96 | | <svg svg> 97 | | "]]" 98 | 99 | #data 100 | <svg><![CDATA[] 101 | #errors 102 | #document 103 | | <html> 104 | | <head> 105 | | <body> 106 | | <svg svg> 107 | | "]" 108 | 109 | #data 110 | <svg><![CDATA[]>a 111 | #errors 112 | #document 113 | | <html> 114 | | <head> 115 | | <body> 116 | | <svg svg> 117 | | "]>a" 118 | 119 | #data 120 | <svg><foreignObject><div><![CDATA[foo 121 | #errors 122 | #document 123 | | 124 | | 125 | | 126 | | 127 | | 128 | |
129 | | 130 | 131 | #data 132 | <svg> 133 | #errors 134 | #document 135 | | 136 | | 137 | | 138 | | 139 | | "" 140 | 141 | #data 142 | </svg>a 143 | #errors 144 | #document 145 | | 146 | | 147 | | 148 | | 149 | | "a" 150 | 151 | #data 152 | <svg>a 153 | #errors 154 | #document 155 | | <html> 156 | | <head> 157 | | <body> 158 | | <svg svg> 159 | | "<svg>a" 160 | 161 | #data 162 | <svg><![CDATA[</svg>a 163 | #errors 164 | #document 165 | | <html> 166 | | <head> 167 | | <body> 168 | | <svg svg> 169 | | "</svg>a" 170 | 171 | #data 172 | <svg><![CDATA[<svg> 173 | #errors 174 | #document 175 | | 176 | | 177 | | 178 | | 179 | | "" 180 | | 181 | 182 | #data 183 | <svg> 184 | #errors 185 | #document 186 | | 187 | | 188 | | 189 | | 190 | | "" 191 | 192 | #data 193 | <svg> 194 | #errors 195 | #document 196 | | 197 | | 198 | | 199 | | 200 | | "" 201 | | 202 | 203 | #data 204 | <svg>path 205 | #errors 206 | #document 207 | | 208 | | 209 | | 210 | | 211 | | "path" 212 | 213 | #data 214 | <!--svg--> 215 | #errors 216 | #document 217 | | 218 | | 219 | | 220 | | 221 | | "" 222 | -------------------------------------------------------------------------------- /tests/testdata/tree-construction/tests25.dat: -------------------------------------------------------------------------------- 1 | #data 2 | A 3 | #errors 4 | #document 5 | | 6 | | 7 | | 8 | | 9 | | 10 | | "A" 11 | 12 | #data 13 | A 14 | #errors 15 | #document 16 | | 17 | | 18 | | 19 | | 20 | | 21 | | "A" 22 | 23 | #data 24 | A 25 | #errors 26 | #document 27 | | 28 | | 29 | | 30 | | 31 | | 32 | | "A" 33 | 34 | #data 35 | A 36 | #errors 37 | #document 38 | | 39 | | 40 | | 41 | | 42 | | 43 | | "A" 44 | 45 | #data 46 | A 47 | #errors 48 | #document 49 | | 50 | | 51 | | 52 | | 53 | | 54 | | "A" 55 | 56 | #data 57 |
A 58 | #errors 59 | #document 60 | | 61 | | 62 | | 63 | | 64 | |
65 | | "A" 66 | 67 | #data 68 |
3 | #errors 4 | #document 5 | | 6 | | 7 | | 8 | | 9 | | 10 | | 11 | |
12 | 13 | #data 14 |
15 | #errors 16 | #document 17 | | 18 | | 19 | | 20 | | 21 | | 22 | | 23 | |
24 | 25 | #data 26 | 27 | #errors 28 | #document 29 | | 30 | | 31 | | 32 | |
33 | | 34 | | 35 | | foo="bar" 36 | 37 | #data 38 |
foo 39 | #errors 40 | #document 41 | | 42 | | 43 | | 44 | | "foo" 45 | |
46 | | 47 | 48 | #data 49 |

foo 50 | #errors 51 | #document 52 | | 53 | | 54 | | 55 | | 56 | |

57 | | "foo" 58 | 59 | #data 60 |

61 | #errors 62 | #document 63 | | 64 | | 65 | | 66 | | 67 | | 68 | | 69 | |
70 | 71 | #data 72 |
73 | #errors 74 | #document 75 | | 76 | | 77 | | 78 | | 82 | 83 | #data 84 |
85 | #errors 86 | #document 87 | | 88 | | 89 | | 90 | | 92 | |
93 | 94 | #data 95 |
96 | #errors 97 | #document 98 | | 99 | | 100 | | 101 | | 103 | 104 | #data 105 |
B
106 | #errors 107 | #document 108 | | 109 | | 110 | | 111 | | 115 | | 116 | | 117 | |
118 | | "B" 119 | 120 | #data 121 |
foo 122 | #errors 123 | #document 124 | | 125 | | 126 | | 127 | | 128 | | 129 | | 130 | |
131 | | "foo" 132 | 133 | #data 134 |
A
B 135 | #errors 136 | #document 137 | | 138 | | 139 | | 140 | | 141 | | 142 | | 143 | |
144 | | "A" 145 | | "B" 146 | 147 | #data 148 |
149 | #errors 150 | #document 151 | | 152 | | 153 | | 154 | | 155 | | 156 | | 157 | |
158 | 159 | #data 160 |
foo 161 | #errors 162 | #document 163 | | 164 | | 165 | | 166 | | 167 | | 168 | | 169 | |
170 | | "foo" 171 | 172 | #data 173 | 174 | #errors 175 | #document 176 | | 177 | | 178 | | 179 | |
180 | | 181 | | 182 | | 184 | 185 | #data 186 |
183 | |
187 | #errors 188 | #document 189 | | 190 | | 191 | | 192 | | 193 | | 194 | | 195 | |
196 | | 198 | 199 | #data 200 |
201 | #errors 202 | #document 203 | | 204 | | 205 | | 206 | | 207 | | 208 | | 209 | |
210 | | 211 | | 212 | | 213 | -------------------------------------------------------------------------------- /tests/testdata/tree-construction/html5test-com.dat: -------------------------------------------------------------------------------- 1 | #data 2 | 3 | #errors 4 | #document 5 | | 6 | | 7 | | 8 | | 9 | 10 | #data 11 |
12 | #errors 13 | #document 14 | | 15 | | 16 | | 17 | |
18 | | foo 22 | #errors 23 | #document 24 | | 25 | | 26 | | 27 | |
28 | | foo="`bar`" 29 | 30 | #data 31 |
32 | #errors 33 | #document 34 | | 35 | | 36 | | 37 | |
38 | | \"foo="" 39 | 40 | #data 41 | 42 | #errors 43 | #document 44 | | 45 | | 46 | | 47 | | 48 | | href="\nbar" 49 | 50 | #data 51 | 52 | #errors 53 | #document 54 | | 55 | | 56 | | 57 | | 58 | 59 | #data 60 | ⟨⟩ 61 | #errors 62 | #document 63 | | 64 | | 65 | | 66 | | "⟨⟩" 67 | 68 | #data 69 | ' 70 | #errors 71 | #document 72 | | 73 | | 74 | | 75 | | "'" 76 | 77 | #data 78 | ⅈ 79 | #errors 80 | #document 81 | | 82 | | 83 | | 84 | | "ⅈ" 85 | 86 | #data 87 | 𝕂 88 | #errors 89 | #document 90 | | 91 | | 92 | | 93 | | "𝕂" 94 | 95 | #data 96 | ∉ 97 | #errors 98 | #document 99 | | 100 | | 101 | | 102 | | "∉" 103 | 104 | #data 105 | 106 | #errors 107 | #document 108 | | 109 | | 110 | | 111 | | 112 | 113 | #data 114 | 115 | #errors 116 | #document 117 | | 118 | | 119 | | 120 | | 121 | 122 | #data 123 | 124 | #errors 125 | #document 126 | | 127 | | 128 | | 129 | | 130 | 131 | #data 132 | --> 133 | #errors 134 | #document 135 | | 136 | | 137 | | 138 | | --> 144 | #errors 145 | #document 146 | | 147 | | 148 | | 149 | | ->x --> x 103 | #errors 104 | Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE. 105 | #document 106 | | 107 | | 108 | | 109 | | 206 | #errors 207 | #document 208 | | 209 | | 210 | | 211 | | 212 | | 217 | #errors 218 | #document 219 | | 220 | | 221 | | 222 | | 223 | | 230 | #errors 231 | #document 232 | | 233 | | 234 | | 235 | | 236 | |