├── CMakeLists.txt ├── COPYING ├── README.md ├── cmd ├── CMakeLists.txt └── main.cpp ├── config.h.in └── html-qt ├── CMakeLists.txt ├── entities.json ├── html-qt.doxygen ├── html-qt5.pc.in ├── htmlabstractphase.cpp ├── htmlabstractphase.h ├── htmlbeforehtmlphase.cpp ├── htmlbeforehtmlphase.h ├── htmlinitialphase.cpp ├── htmlinitialphase.h ├── htmlparser.cpp ├── htmlparser.h ├── htmlparser_p.h ├── htmltokenizer.cpp ├── htmltokenizer.h ├── htmltokenizer_p.h ├── htmltree.cpp └── htmltree.h /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # CMakeLists for libhtml-qt 2 | project(html-qt) 3 | 4 | cmake_minimum_required(VERSION 2.8.6 FATAL_ERROR) 5 | find_package(PkgConfig REQUIRED) 6 | find_package(Qt5 5.3.0 COMPONENTS 7 | Core 8 | Network 9 | ) 10 | 11 | set(HTMLQT_VERSION_MAJOR "0") 12 | set(HTMLQT_VERSION_MINOR "1") 13 | set(HTMLQT_VERSION_PATCH "0") 14 | set(HTMLQT_VERSION_SUFFIX "${VERSION_SUFFIX}") 15 | set(HTMLQT_VERSION "${HTMLQT_VERSION_MAJOR}.${HTMLQT_VERSION_MINOR}.${HTMLQT_VERSION_PATCH}") 16 | 17 | set(HTMLQT_API_LEVEL "0") 18 | 19 | # CMakeLists for HTML-Qt library 20 | set(CMAKE_AUTOMOC ON) 21 | 22 | # Include our cmake modules 23 | set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/modules) 24 | 25 | # As moc files are generated in the binary dir, tell CMake 26 | # to always look for includes there: 27 | set(CMAKE_INCLUDE_CURRENT_DIR ON) 28 | 29 | # Forbid in-tree building 30 | if(${CMAKE_SOURCE_DIR} MATCHES ${CMAKE_BINARY_DIR}) 31 | message(STATUS "Please do an out-of-tree build:") 32 | message(STATUS "rm -f CMakeCache.txt && mkdir build && cd build; cmake .. && make") 33 | message(FATAL_ERROR "In-tree-build detected!") 34 | endif(${CMAKE_SOURCE_DIR} MATCHES ${CMAKE_BINARY_DIR}) 35 | 36 | # 37 | # Options 38 | # 39 | 40 | # NONE 41 | 42 | if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) 43 | set(CMAKE_INSTALL_PREFIX 44 | "/usr" CACHE PATH "html-qt default install prefix" FORCE) 45 | endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) 46 | 47 | # 48 | # Configure files 49 | # 50 | set (PREFIXDIR "${CMAKE_INSTALL_PREFIX}") 51 | set (CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${LIBNAME}/") 52 | set (DATADIR "${CMAKE_INSTALL_PREFIX}/share") 53 | set (PKGDATADIR "${DATA_INSTALL_DIR}") 54 | set (LIBDIR "${CMAKE_INSTALL_PREFIX}/${LIB_INSTALL_DIR}") 55 | set (PKGLIBDIR "${LIBDIR}/html-qt") 56 | set (GETTEXT_PACKAGE "html-qt") 57 | set (LOCALE_DIR "${DATADIR}/locale") 58 | set (VERSION "${HTMLQT_VERSION}") 59 | set (BUILDDIR "${CMAKE_BINARY_DIR}") 60 | 61 | add_definitions("-DLOCALSTATEDIR=\"${LOCALSTATEDIR}\"") 62 | 63 | set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_LIBRARY_ARCHITECTURE}" CACHE PATH "Output directory for libraries") 64 | 65 | configure_file(config.h.in ${CMAKE_BINARY_DIR}/config.h) 66 | 67 | # 68 | # Custom C flags 69 | # 70 | set (MAINTAINER_CFLAGS "-Werror -Wall -Wcast-align -Wno-uninitialized -Wempty-body -Wformat-security -Wformat -Winit-self") 71 | option (DISABLE_MAINTAINER_CFLAGS "Disable maintainer CFlags" ON) 72 | if (DISABLE_MAINTAINER_CFLAGS) 73 | set (MAINTAINER_CFLAGS "") 74 | endif (DISABLE_MAINTAINER_CFLAGS) 75 | add_definitions(${MAINTAINER_CFLAGS}) 76 | 77 | add_definitions(-DQT_NO_KEYWORDS) 78 | 79 | include_directories( 80 | ${CMAKE_SOURCE_DIR} 81 | ${CMAKE_CURRENT_SOURCE_DIR}/lib 82 | ${CMAKE_CURRENT_BINARY_DIR} 83 | ) 84 | 85 | add_subdirectory(html-qt) 86 | add_subdirectory(cmd) 87 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 2.1, February 1999 3 | 4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | [This is the first released version of the Lesser GPL. It also counts 10 | as the successor of the GNU Library Public License, version 2, hence 11 | the version number 2.1.] 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price. Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 446 | 447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS 459 | 460 | How to Apply These Terms to Your New Libraries 461 | 462 | If you develop a new library, and you want it to be of the greatest 463 | possible use to the public, we recommend making it free software that 464 | everyone can redistribute and change. You can do so by permitting 465 | redistribution under these terms (or, alternatively, under the terms of the 466 | ordinary General Public License). 467 | 468 | To apply these terms, attach the following notices to the library. It is 469 | safest to attach them to the start of each source file to most effectively 470 | convey the exclusion of warranty; and each file should have at least the 471 | "copyright" line and a pointer to where the full notice is found. 472 | 473 | 474 | Copyright (C) 475 | 476 | This library is free software; you can redistribute it and/or 477 | modify it under the terms of the GNU Lesser General Public 478 | License as published by the Free Software Foundation; either 479 | version 2.1 of the License, or (at your option) any later version. 480 | 481 | This library is distributed in the hope that it will be useful, 482 | but WITHOUT ANY WARRANTY; without even the implied warranty of 483 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 484 | Lesser General Public License for more details. 485 | 486 | You should have received a copy of the GNU Lesser General Public 487 | License along with this library; if not, write to the Free Software 488 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 489 | 490 | Also add information on how to contact you by electronic and paper mail. 491 | 492 | You should also get your employer (if you work as a programmer) or your 493 | school, if any, to sign a "copyright disclaimer" for the library, if 494 | necessary. Here is a sample; alter the names: 495 | 496 | Yoyodyne, Inc., hereby disclaims all copyright interest in the 497 | library `Frob' (a library for tweaking knobs) written by James Random Hacker. 498 | 499 | , 1 April 1990 500 | Ty Coon, President of Vice 501 | 502 | That's all there is to it! 503 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # html-qt 2 | HTML parser based on the WHATWG HTML5 specification 3 | 4 | The command line tool html-qt reads for stdin or the first argument and run the parser. 5 | -------------------------------------------------------------------------------- /cmd/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories( 2 | ${CMAKE_CURRENT_BINARY_DIR} 3 | ${CMAKE_CURRENT_SOURCE_DIR} 4 | ) 5 | 6 | set(html_qt_cmd_SRCS 7 | main.cpp 8 | ) 9 | 10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++11") 11 | 12 | add_executable(html-qt-skell ${html_qt_cmd_SRCS}) 13 | qt5_use_modules(html-qt-skell Core) 14 | target_link_libraries(html-qt-skell 15 | html-qt5 16 | ) 17 | 18 | set_target_properties(html-qt-skell PROPERTIES OUTPUT_NAME html-qt) 19 | install(TARGETS html-qt-skell DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/) 20 | -------------------------------------------------------------------------------- /cmd/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #include "../html-qt/htmlparser.h" 14 | 15 | int main(int argc, char *argv[]) 16 | { 17 | QCoreApplication app(argc, argv); 18 | QCoreApplication::setOrganizationName("Cutelyst"); 19 | QCoreApplication::setOrganizationDomain("cutelyst.org"); 20 | QCoreApplication::setApplicationName("html-qt"); 21 | QCoreApplication::setApplicationVersion("0.0.1"); 22 | 23 | QTranslator qtTranslator; 24 | qtTranslator.load("qt_" + QLocale::system().name(), 25 | QLibraryInfo::location(QLibraryInfo::TranslationsPath)); 26 | QCoreApplication::installTranslator(&qtTranslator); 27 | 28 | QCommandLineParser parser; 29 | parser.setApplicationDescription("Parses HTML documents according to WHATWG definitions"); 30 | parser.addHelpOption(); 31 | parser.addVersionOption(); 32 | 33 | parser.addPositionalArgument("source", QCoreApplication::translate("main", "Source HTML file to parse.")); 34 | 35 | // Process the actual command line arguments given by the user 36 | parser.process(app); 37 | 38 | const QStringList args = parser.positionalArguments(); 39 | QTextStream *in = 0; 40 | if (args.isEmpty()) { 41 | in = new QTextStream(stdin); 42 | } else if (args.size() == 1) { 43 | QFile *file = new QFile(args.first()); 44 | if (!file->open(QFile::ReadOnly)) { 45 | qFatal("Failed to open html file"); 46 | } 47 | in = new QTextStream(file); 48 | } else { 49 | parser.showHelp(1); 50 | } 51 | 52 | HTMLParser htmlParser; 53 | QElapsedTimer t; 54 | t.start(); 55 | htmlParser.parse(in->readAll()); 56 | qDebug() << "Time elapsed:" << t.elapsed() << "ms"; 57 | 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /config.h.in: -------------------------------------------------------------------------------- 1 | #ifndef CONFIG_H 2 | #define CONFIG_H 3 | 4 | /* always defined to indicate that i18n is enabled */ 5 | #define ENABLE_NLS 1 6 | 7 | /* Gettext Package */ 8 | #define GETTEXT_PACKAGE "@GETTEXT_PACKAGE@" 9 | 10 | /* Paths */ 11 | #define LOCALEDIR "@LOCALE_DIR@" 12 | #define PKGDATADIR "@PKGDATADIR@" 13 | #define PKGLIBDIR "@PKGLIBDIR@" 14 | #define PREFIXDIR "@PREFIXDIR@" 15 | #define DATADIR "@DATADIR@" 16 | #define LIBDIR "@LIBDIR@" 17 | #define BUILDDIR "@BUILDDIR@" 18 | 19 | /* Name of package */ 20 | #define PACKAGE_NAME "htmlqt" 21 | 22 | /* Version number of package */ 23 | #define VERSION "@VERSION@" 24 | 25 | #endif /*CONFIG_H*/ 26 | -------------------------------------------------------------------------------- /html-qt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories( 2 | ${CMAKE_BINARY_DIR} 3 | ${CMAKE_CURRENT_BINARY_DIR} 4 | ${CMAKE_CURRENT_SOURCE_DIR} 5 | ) 6 | 7 | set(htmlqt_SRC 8 | htmlabstractphase.cpp 9 | htmlinitialphase.cpp 10 | htmlbeforehtmlphase.cpp 11 | htmltokenizer.cpp 12 | htmltokenizer_p.h 13 | htmlparser.cpp 14 | htmlparser_p.h 15 | htmltree.cpp 16 | ) 17 | 18 | set(htmlqt_HEADERS 19 | htmltokenizer.h 20 | htmlparser.h 21 | htmltree.h 22 | ) 23 | 24 | # set(htmlqt_HEADERS_PRIVATE 25 | # common.h 26 | # ) 27 | 28 | add_definitions( 29 | -std=c++11 30 | ) 31 | 32 | add_library(html-qt5 SHARED ${htmlqt_SRC} ${htmlqt_HEADERS} ${htmlqt_HEADERS_PRIVATE}) 33 | set_target_properties(html-qt5 PROPERTIES VERSION ${HTMLQT_VERSION} SOVERSION ${HTMLQT_API_LEVEL}) 34 | 35 | qt5_use_modules(html-qt5 Core Network) 36 | 37 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/html-qt5.pc.in 38 | ${CMAKE_CURRENT_BINARY_DIR}/html-qt5.pc 39 | @ONLY 40 | ) 41 | 42 | install(TARGETS html-qt5 EXPORT HTMLQt5Targets DESTINATION ${CMAKE_INSTALL_LIBDIR}) 43 | install(FILES ${CMAKE_CURRENT_BINARY_DIR}/html-qt5.pc 44 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig 45 | ) 46 | install(FILES ${htmlqt_HEADERS} 47 | DESTINATION include/html-qt5/HTMLQt 48 | ) 49 | -------------------------------------------------------------------------------- /html-qt/html-qt5.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@CMAKE_INSTALL_PREFIX@ 2 | exec_prefix=${prefix} 3 | libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@ 4 | includedir=${prefix}/include 5 | 6 | Name: html-qt5 7 | Description: HTML Qt parser 8 | Version: @VERSION@ 9 | Requires: QtCore 10 | Libs: -L${libdir} -lhtml-qt5 11 | Cflags: -I${includedir}/html-qt5/ 12 | -------------------------------------------------------------------------------- /html-qt/htmlabstractphase.cpp: -------------------------------------------------------------------------------- 1 | #include "htmlabstractphase.h" 2 | #include "htmlparser.h" 3 | #include "htmlparser_p.h" 4 | 5 | #include "htmltree.h" 6 | 7 | #include 8 | 9 | Q_LOGGING_CATEGORY(HTML_IM, "htmlqt.im") 10 | 11 | HTMLAbstractPhase::HTMLAbstractPhase(HTMLParser *parser, HTMLTree *tree) 12 | { 13 | this->tree = tree; 14 | this->parser = parser; 15 | } 16 | 17 | HTMLAbstractPhase::~HTMLAbstractPhase() 18 | { 19 | 20 | } 21 | 22 | HTMLParserPrivate *HTMLAbstractPhase::parserPriv() 23 | { 24 | return parser->d_ptr; 25 | } 26 | 27 | void HTMLAbstractPhase::insertHtmlElement() 28 | { 29 | 30 | } 31 | 32 | void HTMLAbstractPhase::startTagHtml(HTMLToken *token) 33 | { 34 | if (!parserPriv()->firstStartTag && token->name == QLatin1String("html")) { 35 | parser->parserErrorToken(QStringLiteral("non-html-root"), 0); 36 | return; 37 | } 38 | 39 | HTMLTreeNode *last = tree->openElements().last(); 40 | 41 | auto it = token->data.constBegin(); 42 | while (it != token->data.constEnd()) { 43 | const QString attr = it->first; 44 | const QString value = it->second; 45 | if (!last->attributes.contains(attr)) { 46 | last->attributes.insert(attr, value); 47 | } 48 | ++it; 49 | } 50 | parserPriv()->firstStartTag = false; 51 | } 52 | 53 | bool HTMLAbstractPhase::processCharacter(QChar c) 54 | { 55 | tree->insertText(c); 56 | return true; 57 | } 58 | 59 | bool HTMLAbstractPhase::processSpaceCharacters(HTMLToken *token) 60 | { 61 | Q_UNUSED(token) 62 | return true; 63 | } 64 | 65 | bool HTMLAbstractPhase::processStartTag(HTMLToken *token) 66 | { 67 | Q_UNUSED(token) 68 | return true; 69 | } 70 | 71 | bool HTMLAbstractPhase::processEndTag(HTMLToken *token) 72 | { 73 | Q_UNUSED(token) 74 | return true; 75 | } 76 | 77 | bool HTMLAbstractPhase::processCommentTag(HTMLToken *token) 78 | { 79 | tree->insertComment(token, tree->openElements().last()); 80 | return true; 81 | } 82 | 83 | bool HTMLAbstractPhase::processDoctype(HTMLToken *token) 84 | { 85 | Q_UNUSED(token) 86 | return true; 87 | } 88 | 89 | bool HTMLAbstractPhase::processEOF() 90 | { 91 | return true; 92 | } 93 | 94 | -------------------------------------------------------------------------------- /html-qt/htmlabstractphase.h: -------------------------------------------------------------------------------- 1 | #ifndef HTMLABSTRACTPHASE_H 2 | #define HTMLABSTRACTPHASE_H 3 | 4 | #include 5 | 6 | class HTMLParserPrivate; 7 | class HTMLParser; 8 | class HTMLTree; 9 | class HTMLToken; 10 | class HTMLAbstractPhase 11 | { 12 | public: 13 | HTMLAbstractPhase(HTMLParser *parser, HTMLTree *tree); 14 | virtual ~HTMLAbstractPhase(); 15 | 16 | HTMLTree *tree; 17 | HTMLParser *parser; 18 | HTMLParserPrivate *parserPriv(); 19 | 20 | virtual void insertHtmlElement(); 21 | 22 | virtual void startTagHtml(HTMLToken *token); 23 | 24 | virtual bool processCharacter(QChar c); 25 | 26 | virtual bool processSpaceCharacters(HTMLToken *token); 27 | 28 | virtual bool processStartTag(HTMLToken *token); 29 | 30 | virtual bool processEndTag(HTMLToken *token); 31 | 32 | virtual bool processCommentTag(HTMLToken *token); 33 | 34 | virtual bool processDoctype(HTMLToken *token); 35 | 36 | virtual bool processEOF(); 37 | 38 | }; 39 | 40 | #endif // HTMLABSTRACTPHASE_H 41 | -------------------------------------------------------------------------------- /html-qt/htmlbeforehtmlphase.cpp: -------------------------------------------------------------------------------- 1 | #include "htmlbeforehtmlphase.h" 2 | 3 | #include "htmltree.h" 4 | #include "htmltokenizer_p.h" 5 | #include "htmlparser_p.h" 6 | 7 | HTMLBeforeHtmlPhase::HTMLBeforeHtmlPhase(HTMLParser *parser, HTMLTree *tree) : HTMLAbstractPhase(parser, tree) 8 | { 9 | 10 | } 11 | 12 | void HTMLBeforeHtmlPhase::insertHtmlElement() 13 | { 14 | tree->inserRoot(new HTMLToken(QStringLiteral("html"), HTMLToken::StartTagToken)); 15 | parserPriv()->insertionModeEnum = HTMLParser::BeforeHead; 16 | parserPriv()->phase = parserPriv()->imBeforeHead; 17 | } 18 | 19 | bool HTMLBeforeHtmlPhase::processEOF() 20 | { 21 | insertHtmlElement(); 22 | return true; 23 | } 24 | 25 | bool HTMLBeforeHtmlPhase::processCharacter(QChar c) 26 | { 27 | insertHtmlElement(); 28 | return true; 29 | } 30 | 31 | bool HTMLBeforeHtmlPhase::processCommentTag(HTMLToken *token) 32 | { 33 | tree->insertComment(token, tree->document()); 34 | } 35 | 36 | bool HTMLBeforeHtmlPhase::processStartTag(HTMLToken *token) 37 | { 38 | if (token->name == QLatin1String("html")) { 39 | parserPriv()->firstStartTag = true; 40 | } 41 | insertHtmlElement(); 42 | return true; 43 | } 44 | -------------------------------------------------------------------------------- /html-qt/htmlbeforehtmlphase.h: -------------------------------------------------------------------------------- 1 | #ifndef HTMLBEFOREHTMLPHASE_H 2 | #define HTMLBEFOREHTMLPHASE_H 3 | 4 | #include "htmlabstractphase.h" 5 | 6 | class HTMLBeforeHtmlPhase : public HTMLAbstractPhase 7 | { 8 | public: 9 | HTMLBeforeHtmlPhase(HTMLParser *parser, HTMLTree *tree); 10 | 11 | virtual void insertHtmlElement() override; 12 | 13 | virtual bool processEOF(); 14 | 15 | virtual bool processCharacter(QChar c) override; 16 | 17 | virtual bool processCommentTag(HTMLToken *token) override; 18 | 19 | virtual bool processStartTag(HTMLToken *token) override; 20 | }; 21 | 22 | #endif // HTMLBEFOREHTMLPHASE_H 23 | -------------------------------------------------------------------------------- /html-qt/htmlinitialphase.cpp: -------------------------------------------------------------------------------- 1 | #include "htmlinitialphase.h" 2 | 3 | #include "htmltree.h" 4 | #include "htmltokenizer_p.h" 5 | #include "htmlparser_p.h" 6 | 7 | #include 8 | 9 | Q_LOGGING_CATEGORY(HTML_IM_INITIAL, "htmlqt.im.initial") 10 | 11 | HTMLInitialPhase::HTMLInitialPhase(HTMLParser *parser, HTMLTree *tree) : HTMLAbstractPhase(parser, tree) 12 | { 13 | 14 | } 15 | 16 | bool HTMLInitialPhase::processSpaceCharacters(HTMLToken *token) 17 | { 18 | qCCritical(HTML_IM_INITIAL) << Q_FUNC_INFO << token->name; 19 | return true; 20 | } 21 | 22 | bool HTMLInitialPhase::processStartTag(HTMLToken *token) 23 | { 24 | qCCritical(HTML_IM_INITIAL) << Q_FUNC_INFO << token; 25 | return true; 26 | } 27 | 28 | bool HTMLInitialPhase::processEndTag(HTMLToken *token) 29 | { 30 | qCCritical(HTML_IM_INITIAL) << Q_FUNC_INFO << token; 31 | return true; 32 | } 33 | 34 | bool HTMLInitialPhase::processCommentTag(HTMLToken *token) 35 | { 36 | qCCritical(HTML_IM_INITIAL) << Q_FUNC_INFO << token; 37 | tree->insertComment(token, tree->document()); 38 | return true; 39 | } 40 | 41 | bool HTMLInitialPhase::processDoctype(HTMLToken *token) 42 | { 43 | qCCritical(HTML_IM_INITIAL) << Q_FUNC_INFO << token; 44 | const QString &name = token->name; 45 | QString publicId = token->doctypePublicId; 46 | const QString &systemId = token->doctypeSystemId; 47 | if (name != QLatin1String("html") || 48 | !publicId.isNull() || 49 | (!systemId.isNull() && systemId != QLatin1String("about:legacy-compat"))) { 50 | // parser->parserErrorToken("unknown-doctype"); 51 | } 52 | 53 | if (publicId.isNull()) { 54 | publicId = QLatin1String(""); 55 | } 56 | 57 | tree->insertDoctype(token); 58 | 59 | qCCritical(HTML_IM_INITIAL) << Q_FUNC_INFO << token; 60 | 61 | // TODO 62 | 63 | parserPriv()->insertionModeEnum = HTMLParser::BeforeHTML; 64 | parserPriv()->phase = parserPriv()->imBeforeHTML; 65 | return true; 66 | } 67 | 68 | -------------------------------------------------------------------------------- /html-qt/htmlinitialphase.h: -------------------------------------------------------------------------------- 1 | #ifndef HTMLINITIALPHASE_H 2 | #define HTMLINITIALPHASE_H 3 | 4 | #include "htmlabstractphase.h" 5 | 6 | class HTMLInitialPhase : public HTMLAbstractPhase 7 | { 8 | public: 9 | HTMLInitialPhase(HTMLParser *parser, HTMLTree *tree); 10 | 11 | virtual bool processSpaceCharacters(HTMLToken *token) override; 12 | 13 | virtual bool processStartTag(HTMLToken *token) override; 14 | 15 | virtual bool processEndTag(HTMLToken *token) override; 16 | 17 | virtual bool processCommentTag(HTMLToken *token) override; 18 | 19 | virtual bool processDoctype(HTMLToken *token) override; 20 | 21 | }; 22 | 23 | #endif // HTMLINITIALPHASE_H 24 | -------------------------------------------------------------------------------- /html-qt/htmlparser.cpp: -------------------------------------------------------------------------------- 1 | #include "htmlparser_p.h" 2 | 3 | #include "htmlinitialphase.h" 4 | #include "htmlbeforehtmlphase.h" 5 | 6 | #include 7 | #include 8 | 9 | Q_LOGGING_CATEGORY(HTML_PARSER, "htmlqt.parser") 10 | 11 | HTMLParser::HTMLParser(QObject *parent) : QObject(parent) 12 | , d_ptr(new HTMLParserPrivate) 13 | { 14 | Q_D(HTMLParser); 15 | 16 | d->tokenizer = new HTMLTokenizer(this); 17 | 18 | HTMLTree *tree = new HTMLTree; 19 | d->imInitial = new HTMLInitialPhase(this, tree); 20 | d->imBeforeHTML = new HTMLBeforeHtmlPhase(this, tree); 21 | d->imBeforeHead = new HTMLAbstractPhase(this, tree); 22 | d->imInHead = new HTMLAbstractPhase(this, tree); 23 | d->imInHeadNoScript = new HTMLAbstractPhase(this, tree); 24 | d->imAfterHead = new HTMLAbstractPhase(this, tree); 25 | d->imInBody = new HTMLAbstractPhase(this, tree); 26 | d->imText = new HTMLAbstractPhase(this, tree); 27 | d->imInTable = new HTMLAbstractPhase(this, tree); 28 | d->imInTableText = new HTMLAbstractPhase(this, tree); 29 | d->imInCaption = new HTMLAbstractPhase(this, tree); 30 | d->imInColumGroup = new HTMLAbstractPhase(this, tree); 31 | d->imInTableBody = new HTMLAbstractPhase(this, tree); 32 | d->imInRow = new HTMLAbstractPhase(this, tree); 33 | d->imInCell = new HTMLAbstractPhase(this, tree); 34 | d->imInSelect = new HTMLAbstractPhase(this, tree); 35 | d->imInSelectInTable = new HTMLAbstractPhase(this, tree); 36 | d->imInTemplate = new HTMLAbstractPhase(this, tree); 37 | d->imAfterBody = new HTMLAbstractPhase(this, tree); 38 | d->imInFrameset = new HTMLAbstractPhase(this, tree); 39 | d->imAfterFrameset = new HTMLAbstractPhase(this, tree); 40 | d->imAfterAfterBody = new HTMLAbstractPhase(this, tree); 41 | d->imAfterAfterFrameset = new HTMLAbstractPhase(this, tree); 42 | d->phase = d->imInitial; 43 | d->tree = tree; 44 | } 45 | 46 | HTMLParser::~HTMLParser() 47 | { 48 | delete d_ptr; 49 | } 50 | 51 | void HTMLParser::parse(const QString &html) 52 | { 53 | Q_D(HTMLParser); 54 | 55 | d->tokenizer->setHtmlText(html); 56 | d->tokenizer->start(); 57 | d->tree->dump(); 58 | } 59 | 60 | void HTMLParser::reset() 61 | { 62 | Q_D(HTMLParser); 63 | d->tree->reset(); 64 | d->firstStartTag = false; 65 | } 66 | 67 | void HTMLParser::characterToken(const QChar &c) 68 | { 69 | Q_D(HTMLParser); 70 | d->phase->processCharacter(c); 71 | } 72 | 73 | void HTMLParser::parserErrorToken(const QString &string, int pos) 74 | { 75 | qCCritical(HTML_PARSER) << "parser-error" << string << pos; 76 | } 77 | 78 | void HTMLParser::parseToken(HTMLToken *token) 79 | { 80 | qCCritical(HTML_PARSER) << "parseToken" << token << token->type; 81 | Q_D(HTMLParser); 82 | switch (token->type) { 83 | case HTMLToken::CharactersToken: 84 | d->phase->processCharacter(token->dataStr.at(0)); 85 | break; 86 | case HTMLToken::SpaceCharactersToken: 87 | d->phase->processStartTag(token); 88 | break; 89 | case HTMLToken::StartTagToken: 90 | d->phase->processStartTag(token); 91 | break; 92 | case HTMLToken::EndTagToken: 93 | d->phase->processEndTag(token); 94 | break; 95 | case HTMLToken::CommentToken: 96 | d->phase->processCommentTag(token); 97 | break; 98 | case HTMLToken::DocTypeToken: 99 | d->phase->processDoctype(token); 100 | break; 101 | case HTMLToken::ParserErrorToken: 102 | qDebug() << "error " << token; 103 | break; 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /html-qt/htmlparser.h: -------------------------------------------------------------------------------- 1 | #ifndef HTMLPARSER_H 2 | #define HTMLPARSER_H 3 | 4 | #include 5 | 6 | class HTMLToken; 7 | class HTMLParserPrivate; 8 | class HTMLParser : public QObject 9 | { 10 | Q_OBJECT 11 | Q_DECLARE_PRIVATE(HTMLParser) 12 | public: 13 | enum InsertionMode { 14 | Initial, 15 | BeforeHTML, 16 | BeforeHead, 17 | InHead, 18 | InHeadNoScript, 19 | AfterHead, 20 | InBody, 21 | Text, 22 | InTable, 23 | InTableText, 24 | InCaption, 25 | InColumGroup, 26 | InTableBody, 27 | InRow, 28 | InCell, 29 | InSelect, 30 | InSelectInTable, 31 | InTemplate, 32 | AfterBody, 33 | InFrameset, 34 | AfterFrameset, 35 | AfterAfterBody, 36 | AfterAfterFrameset, 37 | }; 38 | Q_ENUM(InsertionMode) 39 | 40 | explicit HTMLParser(QObject *parent = 0); 41 | ~HTMLParser(); 42 | 43 | void parse(const QString &html); 44 | 45 | void reset(); 46 | 47 | protected: 48 | void characterToken(const QChar &c); 49 | void parserErrorToken(const QString &string, int pos); 50 | void parseToken(HTMLToken *token); 51 | 52 | friend class HTMLTokenizer; 53 | friend class HTMLAbstractPhase; 54 | 55 | HTMLParserPrivate *d_ptr; 56 | }; 57 | 58 | #endif // HTMLPARSER_H 59 | -------------------------------------------------------------------------------- /html-qt/htmlparser_p.h: -------------------------------------------------------------------------------- 1 | #ifndef HTMLPARSER_P_H 2 | #define HTMLPARSER_P_H 3 | 4 | #include "htmlparser.h" 5 | #include "htmltokenizer_p.h" 6 | #include "htmltree.h" 7 | 8 | #include "htmlabstractphase.h" 9 | 10 | class HTMLParserPrivate : public QObject 11 | { 12 | Q_OBJECT 13 | public: 14 | QString html; 15 | HTMLTokenizer *tokenizer; 16 | HTMLTree *tree; 17 | HTMLAbstractPhase *phase; 18 | HTMLParser::InsertionMode insertionModeEnum = HTMLParser::Initial; 19 | 20 | HTMLAbstractPhase *imInitial; 21 | HTMLAbstractPhase *imBeforeHTML; 22 | HTMLAbstractPhase *imBeforeHead; 23 | HTMLAbstractPhase *imInHead; 24 | HTMLAbstractPhase *imInHeadNoScript; 25 | HTMLAbstractPhase *imAfterHead; 26 | HTMLAbstractPhase *imInBody; 27 | HTMLAbstractPhase *imText; 28 | HTMLAbstractPhase *imInTable; 29 | HTMLAbstractPhase *imInTableText; 30 | HTMLAbstractPhase *imInCaption; 31 | HTMLAbstractPhase *imInColumGroup; 32 | HTMLAbstractPhase *imInTableBody; 33 | HTMLAbstractPhase *imInRow; 34 | HTMLAbstractPhase *imInCell; 35 | HTMLAbstractPhase *imInSelect; 36 | HTMLAbstractPhase *imInSelectInTable; 37 | HTMLAbstractPhase *imInTemplate; 38 | HTMLAbstractPhase *imAfterBody; 39 | HTMLAbstractPhase *imInFrameset; 40 | HTMLAbstractPhase *imAfterFrameset; 41 | HTMLAbstractPhase *imAfterAfterBody; 42 | HTMLAbstractPhase *imAfterAfterFrameset; 43 | 44 | bool firstStartTag = false; 45 | }; 46 | 47 | #endif // HTMLPARSER_P_H 48 | 49 | -------------------------------------------------------------------------------- /html-qt/htmltokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include "htmltokenizer_p.h" 2 | 3 | #include "htmlparser.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | Q_LOGGING_CATEGORY(HTML_TOKENIZER, "htmlqt.tokenizer") 13 | 14 | #define CALL_MEMBER_FN(object,ptrToMember) ((object).*(ptrToMember)) 15 | 16 | #define IS_ASCII_UPPERCASE(c) ('A' <= c && c <= 'Z') 17 | #define IS_ASCII_LOWERCASE(c) ('a' <= c && c <= 'z') 18 | #define IS_ASCII_DIGITS(c) ('0' <= c && c <= '9') 19 | #define IS_ASCII_HEX_DIGITS(c) (IS_ASCII_DIGITS(c) || \ 20 | ('A' <= c && c <= 'F') || \ 21 | ('a' <= c && c <= 'f')) 22 | #define IS_SPACE_CHARACTER(c) (data == QChar::Tabulation || /* CHARACTER TABULATION (tab) */ \ 23 | data == QChar::LineFeed || /* LINE FEED (LF) */ \ 24 | data == 0x000C || /* FORM FEED (FF) */ \ 25 | data == QChar::Space) // SPACE 26 | 27 | HTMLTokenizer::HTMLTokenizer(HTMLParser *parser) : QObject(parser) 28 | , d_ptr(new HTMLTokenizerPrivate) 29 | { 30 | d_ptr->q_ptr = this; 31 | d_ptr->parser = parser; 32 | 33 | // TODO https://html.spec.whatwg.org/multipage/entities.json 34 | // get from the url and/or keep a local copy 35 | QFile entitiesFile("/home/daniel/code/html-qt/entities.json"); 36 | if (!entitiesFile.open(QFile::ReadOnly)) { 37 | return; 38 | } 39 | QJsonDocument entities = QJsonDocument::fromBinaryData(entitiesFile.readAll()); 40 | qCDebug(HTML_TOKENIZER) << entities.object(); 41 | } 42 | 43 | HTMLTokenizer::~HTMLTokenizer() 44 | { 45 | delete d_ptr; 46 | } 47 | 48 | void HTMLTokenizer::setHtmlText(const QString &html) 49 | { 50 | Q_D(HTMLTokenizer); 51 | d->html = html; 52 | d->htmlPos = -1; 53 | d->htmlSize = html.size(); 54 | } 55 | 56 | HTMLTokenizer::State HTMLTokenizer::state() const 57 | { 58 | Q_D(const HTMLTokenizer); 59 | return d->state; 60 | } 61 | 62 | void HTMLTokenizer::start() 63 | { 64 | Q_D(HTMLTokenizer); 65 | 66 | int lastPos = d->streamPos(); 67 | int repeatedPos = 0; 68 | while (CALL_MEMBER_FN(*d, d->stateFn)() && !d->streamAtEnd()) { 69 | // dunno what to do here :) 70 | // qCDebug(HTML_TOKENIZER) << d->state << d->streamPos() << d->streamAtEnd(); 71 | if (lastPos == d->streamPos()) { 72 | if (++repeatedPos > 10) { 73 | qFatal("Infinite loop detected on state: %s, at position: %d", 74 | metaObject()->enumerator(0).key(d->state), 75 | lastPos); 76 | } 77 | } else { 78 | lastPos = d->streamPos(); 79 | repeatedPos = 0; 80 | } 81 | } 82 | qCDebug(HTML_TOKENIZER) << "finished"; 83 | } 84 | 85 | void HTMLTokenizer::character(QChar c) 86 | { 87 | Q_D(HTMLTokenizer); 88 | // auto token = new HTMLToken(HTMLToken::CharactersToken); 89 | // token->dataStr = c; 90 | // d->tokenQueue.append(token); 91 | d->parser->characterToken(c); 92 | } 93 | 94 | void HTMLTokenizer::parserError(const QString &error) 95 | { 96 | Q_D(HTMLTokenizer); 97 | auto token = new HTMLToken(HTMLToken::ParserErrorToken); 98 | token->dataStr = error; 99 | d->tokenQueue.append(token); 100 | d->parser->parserErrorToken(error, d->streamPos()); 101 | } 102 | 103 | void HTMLTokenizer::token(HTMLToken *token) 104 | { 105 | Q_D(HTMLTokenizer); 106 | d->parser->parseToken(token); 107 | } 108 | 109 | // https://html.spec.whatwg.org/multipage/syntax.html#data-state 110 | bool HTMLTokenizerPrivate::dataState() 111 | { 112 | Q_Q(HTMLTokenizer); 113 | 114 | QChar data; 115 | 116 | if (!consumeStream(data)) { 117 | // Tokenization ends. 118 | return false; 119 | } else if (data == '&') { 120 | state = HTMLTokenizer::CharacterReferenceInDataState; 121 | stateFn = &HTMLTokenizerPrivate::characterReferenceInDataState; 122 | } else if (data == '<') { 123 | state = HTMLTokenizer::TagOpenState; 124 | stateFn = &HTMLTokenizerPrivate::tagOpenState; 125 | } else if (data.isNull()) { 126 | state = HTMLTokenizer::TagOpenState; 127 | Q_EMIT q->parserError(QLatin1String("invalid-codepoint: ") + data); 128 | Q_EMIT q->character(data); 129 | } else { 130 | Q_EMIT q->character(data); 131 | } 132 | 133 | return true; 134 | } 135 | 136 | // https://html.spec.whatwg.org/multipage/syntax.html#character-reference-in-data-state 137 | bool HTMLTokenizerPrivate::characterReferenceInDataState() 138 | { 139 | Q_Q(HTMLTokenizer); 140 | 141 | const QString &ret = consumeEntity(); 142 | if (ret.isNull()) { 143 | q->character('&'); 144 | } else { 145 | QString::ConstIterator it = ret.constBegin(); 146 | while (it != ret.constEnd()) { 147 | q->character(*it); 148 | ++it; 149 | } 150 | } 151 | state = HTMLTokenizer::DataState; 152 | stateFn = &HTMLTokenizerPrivate::dataState; 153 | return true; 154 | } 155 | 156 | // https://html.spec.whatwg.org/multipage/syntax.html#tag-open-state 157 | bool HTMLTokenizerPrivate::tagOpenState() 158 | { 159 | Q_Q(HTMLTokenizer); 160 | 161 | QChar data; 162 | 163 | if (!consumeStream(data)) { 164 | Q_EMIT q->parserError(QStringLiteral("expected-tag-name")); 165 | state = HTMLTokenizer::DataState; 166 | stateFn = &HTMLTokenizerPrivate::dataState; 167 | Q_EMIT q->character('<'); 168 | streamUnconsume(); 169 | } else if (data == '!') { 170 | state = HTMLTokenizer::MarkupDeclarationOpenState; 171 | stateFn = &HTMLTokenizerPrivate::markupDeclarationOpenState; 172 | } else if (data == '/') { 173 | state = HTMLTokenizer::EndTagOpenState; 174 | stateFn = &HTMLTokenizerPrivate::endTagOpenState; 175 | } else if (IS_ASCII_UPPERCASE(data)) { 176 | state = HTMLTokenizer::TagNameState; 177 | stateFn = &HTMLTokenizerPrivate::tagNameState; 178 | currentToken = new HTMLToken(HTMLToken::StartTagToken); 179 | currentToken->name = data.toLower(); 180 | } else if (IS_ASCII_LOWERCASE(data)) { 181 | state = HTMLTokenizer::TagNameState; 182 | stateFn = &HTMLTokenizerPrivate::tagNameState; 183 | currentToken = new HTMLToken(HTMLToken::StartTagToken); 184 | currentToken->name = data; 185 | } else if (data == '?') { 186 | q->parserError(QStringLiteral("expected-tag-name-but-got-question-mark")); 187 | state = HTMLTokenizer::BogusCommentState; 188 | stateFn = &HTMLTokenizerPrivate::bogusCommentState; 189 | } else { 190 | q->parserError(QStringLiteral("expected-tag-name")); 191 | state = HTMLTokenizer::DataState; 192 | stateFn = &HTMLTokenizerPrivate::dataState; 193 | q->character('<'); 194 | streamUnconsume(); 195 | } 196 | 197 | return true; 198 | } 199 | 200 | // https://html.spec.whatwg.org/multipage/syntax.html#end-tag-open-state 201 | bool HTMLTokenizerPrivate::endTagOpenState() 202 | { 203 | Q_Q(HTMLTokenizer); 204 | 205 | QChar data; 206 | 207 | if (!consumeStream(data)) { 208 | Q_EMIT q->parserError(QStringLiteral("expected-closing-tag-but-got-eof")); 209 | state = HTMLTokenizer::DataState; 210 | stateFn = &HTMLTokenizerPrivate::dataState; 211 | Q_EMIT q->character('<'); // 0x003C 212 | Q_EMIT q->character('/'); // 0x002F 213 | streamUnconsume(); 214 | } else if (IS_ASCII_UPPERCASE(data)) { 215 | currentToken = new HTMLToken(HTMLToken::EndTagToken); 216 | currentToken->name = data.toLower(); 217 | currentToken->selfClosing = false; 218 | state = HTMLTokenizer::TagNameState; 219 | stateFn = &HTMLTokenizerPrivate::tagNameState; 220 | } else if (IS_ASCII_LOWERCASE(data)) { 221 | currentToken = new HTMLToken(HTMLToken::EndTagToken); 222 | currentToken->name = data; 223 | currentToken->selfClosing = false; 224 | state = HTMLTokenizer::TagNameState; 225 | stateFn = &HTMLTokenizerPrivate::tagNameState; 226 | } else if (data == '>') { 227 | Q_EMIT q->parserError(QStringLiteral("expected-closing-tag-but-got-right-bracket")); 228 | state = HTMLTokenizer::DataState; 229 | stateFn = &HTMLTokenizerPrivate::dataState; 230 | } else { 231 | Q_EMIT q->parserError(QStringLiteral("expected-closing-tag-but-got-char")); 232 | state = HTMLTokenizer::BogusCommentState; 233 | stateFn = &HTMLTokenizerPrivate::bogusCommentState; 234 | } 235 | 236 | return true; 237 | } 238 | 239 | bool HTMLTokenizerPrivate::tagNameState() 240 | { 241 | Q_Q(HTMLTokenizer); 242 | 243 | QChar data; 244 | 245 | if (!consumeStream(data)) { 246 | Q_EMIT q->parserError(QStringLiteral("eof-in-tag-name")); 247 | state = HTMLTokenizer::DataState; 248 | stateFn = &HTMLTokenizerPrivate::dataState; 249 | streamUnconsume(); 250 | } else if (IS_SPACE_CHARACTER(data)) { 251 | state = HTMLTokenizer::BeforeAttributeNameState; 252 | stateFn = &HTMLTokenizerPrivate::beforeAttributeNameState; 253 | } else if (data == '/') { 254 | state = HTMLTokenizer::SelfClosingStartTagState; 255 | stateFn = &HTMLTokenizerPrivate::selfClosingStartTagState; 256 | } else if (data == '>') { 257 | state = HTMLTokenizer::DataState; 258 | stateFn = &HTMLTokenizerPrivate::dataState; 259 | emitCurrentToken(); 260 | } else if (IS_ASCII_UPPERCASE(data)) { 261 | // Appending the lower case version 262 | currentToken->name.append(data.toLower()); 263 | } else if (data.isNull()) { 264 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 265 | currentToken->name.append(QChar::ReplacementCharacter); 266 | } else { 267 | currentToken->name.append(data); 268 | } 269 | 270 | return true; 271 | } 272 | 273 | bool HTMLTokenizerPrivate::beforeAttributeNameState() 274 | { 275 | Q_Q(HTMLTokenizer); 276 | 277 | QChar data; 278 | do { 279 | if (!consumeStream(data)) { 280 | Q_EMIT q->parserError(QStringLiteral("expected-attribute-name-but-got-eof")); 281 | state = HTMLTokenizer::DataState; 282 | stateFn = &HTMLTokenizerPrivate::dataState; 283 | streamUnconsume(); 284 | return true; 285 | } 286 | } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters 287 | 288 | if (data == '/') { 289 | state = HTMLTokenizer::SelfClosingStartTagState; 290 | stateFn = &HTMLTokenizerPrivate::selfClosingStartTagState; 291 | } else if (data == '>') { 292 | state = HTMLTokenizer::DataState; 293 | stateFn = &HTMLTokenizerPrivate::dataState; 294 | emitCurrentToken(); 295 | } else if (IS_ASCII_UPPERCASE(data)) { 296 | // Appending the lower case version 297 | currentToken->data.append({ data.toLower(), QString()}); 298 | state = HTMLTokenizer::AttributeNameState; 299 | stateFn = &HTMLTokenizerPrivate::attributeNameState; 300 | } else if (data.isNull()) { 301 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 302 | currentToken->data.append({ QString(QChar::ReplacementCharacter), QString()}); 303 | state = HTMLTokenizer::AttributeNameState; 304 | stateFn = &HTMLTokenizerPrivate::attributeNameState; 305 | } else if (data == '"' || 306 | data == '\'' || 307 | data == '<' || 308 | data == '=') { 309 | Q_EMIT q->parserError(QStringLiteral("invalid-character-in-attribute-name")); 310 | currentToken->data.append({ data, QString() }); 311 | state = HTMLTokenizer::AttributeNameState; 312 | stateFn = &HTMLTokenizerPrivate::attributeNameState; 313 | } else { 314 | currentToken->data.append({ data, QString() }); 315 | state = HTMLTokenizer::AttributeNameState; 316 | stateFn = &HTMLTokenizerPrivate::attributeNameState; 317 | } 318 | 319 | return true; 320 | } 321 | 322 | bool HTMLTokenizerPrivate::attributeNameState() 323 | { 324 | Q_Q(HTMLTokenizer); 325 | 326 | QChar data; 327 | 328 | if (!consumeStream(data)) { 329 | Q_EMIT q->parserError(QStringLiteral("eof-in-attribute-name")); 330 | state = HTMLTokenizer::DataState; 331 | stateFn = &HTMLTokenizerPrivate::dataState; 332 | streamUnconsume(); 333 | } else if (IS_SPACE_CHARACTER(data)) { 334 | state = HTMLTokenizer::AfterAttributeNameState; 335 | stateFn = &HTMLTokenizerPrivate::afterAttributeNameState; 336 | } else if (data == '/') { 337 | state = HTMLTokenizer::SelfClosingStartTagState; 338 | stateFn = &HTMLTokenizerPrivate::selfClosingStartTagState; 339 | } else if (data == '=') { 340 | state = HTMLTokenizer::BeforeAttributeValueState; 341 | stateFn = &HTMLTokenizerPrivate::beforeAttributeValueState; 342 | } else if (data == '>') { 343 | state = HTMLTokenizer::DataState; 344 | stateFn = &HTMLTokenizerPrivate::dataState; 345 | emitCurrentToken(); 346 | } else if (IS_ASCII_UPPERCASE(data)) { 347 | currentToken->appendDataCurrentAttributeName(data.toLower()); 348 | } else if (data.isNull()) { 349 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 350 | currentToken->appendDataCurrentAttributeName(QChar::ReplacementCharacter); 351 | } else if (data == '"' || data == '\'' || data == '<') { 352 | Q_EMIT q->parserError(QStringLiteral("invalid-character-in-attribute-name")); 353 | currentToken->appendDataCurrentAttributeName(data); 354 | } else { 355 | currentToken->appendDataCurrentAttributeName(data); 356 | } 357 | 358 | return true; 359 | } 360 | 361 | bool HTMLTokenizerPrivate::afterAttributeNameState() 362 | { 363 | Q_Q(HTMLTokenizer); 364 | 365 | QChar data; 366 | do { 367 | if (!consumeStream(data)) { 368 | Q_EMIT q->parserError(QStringLiteral("expected-end-of-tag-but-got-eof")); 369 | state = HTMLTokenizer::DataState; 370 | stateFn = &HTMLTokenizerPrivate::dataState; 371 | streamUnconsume(); 372 | return true; 373 | } 374 | } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters 375 | 376 | if (data == '/') { 377 | state = HTMLTokenizer::SelfClosingStartTagState; 378 | stateFn = &HTMLTokenizerPrivate::selfClosingStartTagState; 379 | } else if (data == '=') { 380 | state = HTMLTokenizer::BeforeAttributeValueState; 381 | stateFn = &HTMLTokenizerPrivate::beforeAttributeValueState; 382 | } else if (data == '>') { 383 | state = HTMLTokenizer::DataState; 384 | stateFn = &HTMLTokenizerPrivate::dataState; 385 | emitCurrentToken(); 386 | } else if (IS_ASCII_UPPERCASE(data)) { 387 | currentToken->data.append({ data.toLower(), QString() }); 388 | state = HTMLTokenizer::AttributeNameState; 389 | stateFn = &HTMLTokenizerPrivate::attributeNameState; 390 | } else if (data.isNull()) { 391 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 392 | currentToken->data.append({ QString(QChar::ReplacementCharacter), QString() }); 393 | state = HTMLTokenizer::AttributeNameState; 394 | stateFn = &HTMLTokenizerPrivate::attributeNameState; 395 | } else if (data == '"' || data == '\'' || data == '<') { 396 | Q_EMIT q->parserError(QStringLiteral("invalid-character-after-attribute-name")); 397 | currentToken->data.append({ data, QString() }); 398 | state = HTMLTokenizer::AttributeNameState; 399 | stateFn = &HTMLTokenizerPrivate::attributeNameState; 400 | } else { 401 | currentToken->data.append({ data, QString() }); 402 | state = HTMLTokenizer::AttributeNameState; 403 | stateFn = &HTMLTokenizerPrivate::attributeNameState; 404 | } 405 | 406 | return true; 407 | } 408 | 409 | bool HTMLTokenizerPrivate::beforeAttributeValueState() 410 | { 411 | Q_Q(HTMLTokenizer); 412 | 413 | QChar data; 414 | do { 415 | if (!consumeStream(data)) { 416 | Q_EMIT q->parserError(QStringLiteral("expected-attribute-value-but-got-eof")); 417 | state = HTMLTokenizer::DataState; 418 | stateFn = &HTMLTokenizerPrivate::dataState; 419 | streamUnconsume(); 420 | return true; 421 | } 422 | } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters 423 | 424 | if (data == '"') { 425 | state = HTMLTokenizer::AttributeValueDoubleQuotedState; 426 | stateFn = &HTMLTokenizerPrivate::attributeValueDoubleQuotedState; 427 | } else if (data == '&') { 428 | state = HTMLTokenizer::AttributeValueUnquotedState; 429 | stateFn = &HTMLTokenizerPrivate::attributeValueUnquotedState; 430 | } else if (data == '\'') { 431 | state = HTMLTokenizer::AttributeValueSingleQuotedState; 432 | stateFn = &HTMLTokenizerPrivate::attributeValueSingleQuotedState; 433 | } else if (data.isNull()) { 434 | Q_EMIT q->parserError(QStringLiteral("expected-attribute-value-but-got-right-bracket")); 435 | emitCurrentToken(); 436 | } else if (data == '>') { 437 | Q_EMIT q->parserError(QStringLiteral("expected-attribute-value-but-got-right-bracket")); 438 | state = HTMLTokenizer::DataState; 439 | stateFn = &HTMLTokenizerPrivate::dataState; 440 | emitCurrentToken(); 441 | } else if (data == '<' || data == '=' || data == '`') { 442 | Q_EMIT q->parserError(QStringLiteral("equals-in-unquoted-attribute-value")); 443 | currentToken->appendDataCurrentAttributeValue(data); 444 | state = HTMLTokenizer::AttributeValueUnquotedState; 445 | stateFn = &HTMLTokenizerPrivate::attributeValueUnquotedState; 446 | } else { 447 | currentToken->appendDataCurrentAttributeValue(data); 448 | state = HTMLTokenizer::AttributeValueUnquotedState; 449 | stateFn = &HTMLTokenizerPrivate::attributeValueUnquotedState; 450 | } 451 | 452 | return true; 453 | } 454 | 455 | bool HTMLTokenizerPrivate::attributeValueDoubleQuotedState() 456 | { 457 | Q_Q(HTMLTokenizer); 458 | 459 | QChar data; 460 | 461 | if (!consumeStream(data)) { 462 | Q_EMIT q->parserError(QStringLiteral("eof-in-attribute-value-double-quote")); 463 | state = HTMLTokenizer::DataState; 464 | stateFn = &HTMLTokenizerPrivate::dataState; 465 | streamUnconsume(); 466 | } else if (data == '"') { 467 | state = HTMLTokenizer::AfterAttributeValueQuotedState; 468 | stateFn = &HTMLTokenizerPrivate::afterAttributeValueQuotedState; 469 | } else if (data == '&') { 470 | QChar allowedChar('"'); 471 | characterReferenceInAttributeValueState(&allowedChar); 472 | } else if (data.isNull()) { 473 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 474 | currentToken->appendDataCurrentAttributeValue(QChar::ReplacementCharacter); 475 | } else { 476 | currentToken->appendDataCurrentAttributeValue(data); 477 | } 478 | 479 | return true; 480 | } 481 | 482 | bool HTMLTokenizerPrivate::attributeValueSingleQuotedState() 483 | { 484 | Q_Q(HTMLTokenizer); 485 | 486 | QChar data; 487 | 488 | if (!consumeStream(data)) { 489 | Q_EMIT q->parserError(QStringLiteral("eof-in-attribute-value-single-quote")); 490 | state = HTMLTokenizer::DataState; 491 | stateFn = &HTMLTokenizerPrivate::dataState; 492 | streamUnconsume(); 493 | } else if (data == '\'') { 494 | state = HTMLTokenizer::AfterAttributeValueQuotedState; 495 | stateFn = &HTMLTokenizerPrivate::afterAttributeValueQuotedState; 496 | } else if (data == '&') { 497 | QChar allowedChar('\''); 498 | characterReferenceInAttributeValueState(&allowedChar); 499 | } else if (data.isNull()) { 500 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 501 | currentToken->appendDataCurrentAttributeValue(QChar::ReplacementCharacter); 502 | } else { 503 | currentToken->appendDataCurrentAttributeValue(data); 504 | } 505 | 506 | return true; 507 | } 508 | 509 | bool HTMLTokenizerPrivate::attributeValueUnquotedState() 510 | { 511 | Q_Q(HTMLTokenizer); 512 | 513 | QChar data; 514 | 515 | if (!consumeStream(data)) { 516 | Q_EMIT q->parserError(QStringLiteral("eof-in-attribute-value-no-quotes")); 517 | state = HTMLTokenizer::DataState; 518 | stateFn = &HTMLTokenizerPrivate::dataState; 519 | streamUnconsume(); 520 | } else if (IS_SPACE_CHARACTER(data)) { 521 | state = HTMLTokenizer::BeforeAttributeNameState; 522 | stateFn = &HTMLTokenizerPrivate::beforeAttributeNameState; 523 | } else if (data == '&') { 524 | QChar allowedChar('>'); 525 | characterReferenceInAttributeValueState(&allowedChar); 526 | } else if (data == '>') { 527 | state = HTMLTokenizer::DataState; 528 | stateFn = &HTMLTokenizerPrivate::dataState; 529 | emitCurrentToken(); 530 | } else if (data.isNull()) { 531 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 532 | currentToken->appendDataCurrentAttributeValue(QChar::ReplacementCharacter); 533 | } else if (data == '"' || data == '\'' || data == '<' || data == '`') { 534 | Q_EMIT q->parserError(QStringLiteral("unexpected-character-in-unquoted-attribute-value")); 535 | currentToken->appendDataCurrentAttributeValue(data); 536 | } else { 537 | currentToken->appendDataCurrentAttributeValue(data); 538 | } 539 | 540 | return true; 541 | } 542 | 543 | void HTMLTokenizerPrivate::characterReferenceInAttributeValueState(QChar *additionalAllowedCharacter) 544 | { 545 | QString ret = consumeEntity(additionalAllowedCharacter); 546 | if (ret.isNull()) { 547 | currentToken->appendDataCurrentAttributeValue('&'); 548 | } else { 549 | currentToken->appendDataCurrentAttributeValue(ret); 550 | } 551 | } 552 | 553 | bool HTMLTokenizerPrivate::afterAttributeValueQuotedState() 554 | { 555 | Q_Q(HTMLTokenizer); 556 | 557 | QChar data; 558 | 559 | if (!consumeStream(data)) { 560 | Q_EMIT q->parserError(QStringLiteral("unexpected-eof-after-attribute-value")); 561 | state = HTMLTokenizer::DataState; 562 | stateFn = &HTMLTokenizerPrivate::dataState; 563 | streamUnconsume(); 564 | } else if (IS_SPACE_CHARACTER(data)) { 565 | state = HTMLTokenizer::BeforeAttributeNameState; 566 | stateFn = &HTMLTokenizerPrivate::beforeAttributeNameState; 567 | } else if (data == '/') { 568 | state = HTMLTokenizer::SelfClosingStartTagState; 569 | stateFn = &HTMLTokenizerPrivate::selfClosingStartTagState; 570 | } else if (data == '>') { 571 | state = HTMLTokenizer::DataState; 572 | stateFn = &HTMLTokenizerPrivate::dataState; 573 | emitCurrentToken(); 574 | } else { 575 | Q_EMIT q->parserError(QStringLiteral("unexpected-character-after-attribute-value")); 576 | state = HTMLTokenizer::BeforeAttributeNameState; 577 | stateFn = &HTMLTokenizerPrivate::beforeAttributeNameState; 578 | streamUnconsume(); 579 | } 580 | 581 | return true; 582 | } 583 | 584 | bool HTMLTokenizerPrivate::selfClosingStartTagState() 585 | { 586 | Q_Q(HTMLTokenizer); 587 | 588 | QChar data; 589 | 590 | if (!consumeStream(data)) { 591 | Q_EMIT q->parserError(QStringLiteral("unexpected-eof-after-solidus-in-tag")); 592 | state = HTMLTokenizer::DataState; 593 | stateFn = &HTMLTokenizerPrivate::dataState; 594 | streamUnconsume(); 595 | } else if (data == '>') { 596 | currentToken->selfClosing = true; 597 | state = HTMLTokenizer::DataState; 598 | stateFn = &HTMLTokenizerPrivate::dataState; 599 | emitCurrentToken(); 600 | } else { 601 | Q_EMIT q->parserError(QStringLiteral("unexpected-character-after-solidus-in-tag")); 602 | state = HTMLTokenizer::BeforeAttributeNameState; 603 | stateFn = &HTMLTokenizerPrivate::beforeAttributeNameState; 604 | streamUnconsume(); 605 | } 606 | 607 | return true; 608 | } 609 | 610 | bool HTMLTokenizerPrivate::bogusCommentState() 611 | { 612 | // TODO 613 | return true; 614 | } 615 | 616 | // https://html.spec.whatwg.org/multipage/syntax.html#markup-declaration-open-state 617 | bool HTMLTokenizerPrivate::markupDeclarationOpenState() 618 | { 619 | Q_Q(HTMLTokenizer); 620 | 621 | int initalPos = streamPos(); 622 | QChar data; 623 | // TODO check this 624 | consumeStream(data); 625 | QString charStack = data; 626 | 627 | if (data == '-') { 628 | // TODO check this 629 | consumeStream(data); 630 | charStack.append(data); 631 | if (data == '-') { 632 | currentToken = new HTMLToken(HTMLToken::CommentToken); 633 | currentToken->name = ""; 634 | state = HTMLTokenizer::CommentStartState; 635 | stateFn = &HTMLTokenizerPrivate::commentStartState; 636 | return true; 637 | } 638 | } else if (data == 'd' || data == 'D') { 639 | // consume more 6 chars 640 | for (int i = 0; i < 6; ++i) { 641 | // TODO check this 642 | consumeStream(data); 643 | charStack.append(data); 644 | } 645 | 646 | if (charStack.compare(QLatin1String("DOCTYPE"), Qt::CaseInsensitive) == 0) { 647 | // currentToken = new HTMLToken(HTMLToken::CommentToken); 648 | qCDebug(HTML_TOKENIZER) << "markupDeclarationOpenState" << charStack; 649 | state = HTMLTokenizer::DocTypeState; 650 | stateFn = &HTMLTokenizerPrivate::doctypeState; 651 | return true; 652 | } 653 | } else if (data == '[') { 654 | qCWarning(HTML_TOKENIZER) << "markupDeclarationOpenState CDATA TODO"; 655 | } 656 | 657 | Q_EMIT q->parserError(QStringLiteral("expected-dashes-or-doctype")); 658 | state = HTMLTokenizer::BogusCommentState; 659 | stateFn = &HTMLTokenizerPrivate::bogusCommentState; 660 | streamSeek(initalPos); 661 | 662 | return true; 663 | } 664 | 665 | bool HTMLTokenizerPrivate::commentStartState() 666 | { 667 | Q_Q(HTMLTokenizer); 668 | 669 | QChar data; 670 | 671 | if (!consumeStream(data)) { 672 | Q_EMIT q->parserError(QStringLiteral("eof-in-comment")); 673 | state = HTMLTokenizer::DataState; 674 | stateFn = &HTMLTokenizerPrivate::dataState; 675 | emitCurrentToken(); 676 | streamUnconsume(); 677 | } else if (data == '-') { 678 | state = HTMLTokenizer::CommentStartDashState; 679 | stateFn = &HTMLTokenizerPrivate::commentStartDashState; 680 | } else if (data.isNull()) { 681 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 682 | currentToken->name.append(QChar::ReplacementCharacter); 683 | state = HTMLTokenizer::CommentState; 684 | stateFn = &HTMLTokenizerPrivate::commentState; 685 | } else if (data == '>') { 686 | Q_EMIT q->parserError(QStringLiteral("incorrect-comment")); 687 | state = HTMLTokenizer::DataState; 688 | stateFn = &HTMLTokenizerPrivate::dataState; 689 | emitCurrentToken(); 690 | } else { 691 | currentToken->name.append(data); 692 | state = HTMLTokenizer::CommentState; 693 | stateFn = &HTMLTokenizerPrivate::commentState; 694 | } 695 | 696 | return true; 697 | } 698 | 699 | bool HTMLTokenizerPrivate::commentStartDashState() 700 | { 701 | Q_Q(HTMLTokenizer); 702 | 703 | QChar data; 704 | 705 | if (!consumeStream(data)) { 706 | Q_EMIT q->parserError(QStringLiteral("eof-in-comment")); 707 | state = HTMLTokenizer::DataState; 708 | stateFn = &HTMLTokenizerPrivate::dataState; 709 | emitCurrentToken(); 710 | streamUnconsume(); 711 | } else if (data == '-') { 712 | state = HTMLTokenizer::CommentEndState; 713 | stateFn = &HTMLTokenizerPrivate::commentEndState; 714 | } else if (data.isNull()) { 715 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 716 | // TODO see if we can reduce to a singe call 717 | currentToken->name.append('-'); 718 | currentToken->name.append(QChar::ReplacementCharacter); 719 | state = HTMLTokenizer::CommentState; 720 | stateFn = &HTMLTokenizerPrivate::commentState; 721 | } else if (data == '>') { 722 | Q_EMIT q->parserError(QStringLiteral("incorrect-comment")); 723 | state = HTMLTokenizer::DataState; 724 | stateFn = &HTMLTokenizerPrivate::dataState; 725 | emitCurrentToken(); 726 | } else { 727 | // TODO see if we can reduce to a singe call 728 | currentToken->name.append('-'); 729 | currentToken->name.append(data); 730 | state = HTMLTokenizer::CommentState; 731 | stateFn = &HTMLTokenizerPrivate::commentState; 732 | } 733 | 734 | return true; 735 | } 736 | 737 | bool HTMLTokenizerPrivate::commentState() 738 | { 739 | Q_Q(HTMLTokenizer); 740 | 741 | QChar data; 742 | 743 | if (!consumeStream(data)) { 744 | Q_EMIT q->parserError(QStringLiteral("eof-in-comment")); 745 | state = HTMLTokenizer::DataState; 746 | stateFn = &HTMLTokenizerPrivate::dataState; 747 | emitCurrentToken(); 748 | streamUnconsume(); 749 | } else if (data == '-') { 750 | state = HTMLTokenizer::CommentEndDashState; 751 | stateFn = &HTMLTokenizerPrivate::commentEndDashState; 752 | } else if (data.isNull()) { 753 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 754 | currentToken->name.append(QChar::ReplacementCharacter); 755 | } else { 756 | currentToken->name.append(data); 757 | } 758 | 759 | return true; 760 | } 761 | 762 | bool HTMLTokenizerPrivate::commentEndDashState() 763 | { 764 | Q_Q(HTMLTokenizer); 765 | 766 | QChar data; 767 | 768 | if (!consumeStream(data)) { 769 | Q_EMIT q->parserError(QStringLiteral("eof-in-comment-end-dash")); 770 | state = HTMLTokenizer::DataState; 771 | stateFn = &HTMLTokenizerPrivate::dataState; 772 | emitCurrentToken(); 773 | streamUnconsume(); 774 | } else if (data == '-') { 775 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 776 | // TODO see if we can reduce to a singe call 777 | currentToken->name.append('-'); 778 | currentToken->name.append(QChar::ReplacementCharacter); 779 | state = HTMLTokenizer::CommentEndState; 780 | stateFn = &HTMLTokenizerPrivate::commentEndState; 781 | } else if (data.isNull()) { 782 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 783 | currentToken->name.append(QChar::ReplacementCharacter); 784 | state = HTMLTokenizer::CommentState; 785 | stateFn = &HTMLTokenizerPrivate::commentState; 786 | } else { 787 | currentToken->name.append('-'); 788 | currentToken->name.append(data); 789 | state = HTMLTokenizer::CommentState; 790 | stateFn = &HTMLTokenizerPrivate::commentState; 791 | } 792 | 793 | return true; 794 | } 795 | 796 | bool HTMLTokenizerPrivate::commentEndState() 797 | { 798 | Q_Q(HTMLTokenizer); 799 | 800 | QChar data; 801 | 802 | if (!consumeStream(data)) { 803 | Q_EMIT q->parserError(QStringLiteral("eof-in-comment-double-dash")); 804 | state = HTMLTokenizer::DataState; 805 | stateFn = &HTMLTokenizerPrivate::dataState; 806 | emitCurrentToken(); 807 | streamUnconsume(); 808 | } else if (data == '>') { 809 | state = HTMLTokenizer::DataState; 810 | stateFn = &HTMLTokenizerPrivate::dataState; 811 | emitCurrentToken(); 812 | } else if (data.isNull()) { 813 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 814 | // TODO see if we can reduce to a singe call 815 | currentToken->name.append('-'); 816 | currentToken->name.append(QChar::ReplacementCharacter); 817 | state = HTMLTokenizer::CommentState; 818 | stateFn = &HTMLTokenizerPrivate::commentState; 819 | } else if (data == '!') { 820 | Q_EMIT q->parserError(QStringLiteral("unexpected-bang-after-double-dash-in-comment")); 821 | state = HTMLTokenizer::CommentEndBangState; 822 | stateFn = &HTMLTokenizerPrivate::commentEndBangState; 823 | } else if (data == '-') { 824 | Q_EMIT q->parserError(QStringLiteral("unexpected-dash-after-double-dash-in-comment")); 825 | currentToken->name.append('-'); 826 | } else { 827 | Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-comment")); 828 | currentToken->name.append(QLatin1String("--") % data); 829 | state = HTMLTokenizer::CommentState; 830 | stateFn = &HTMLTokenizerPrivate::commentState; 831 | } 832 | 833 | return true; 834 | } 835 | 836 | bool HTMLTokenizerPrivate::commentEndBangState() 837 | { 838 | // TODO 839 | return true; 840 | } 841 | 842 | // https://html.spec.whatwg.org/multipage/syntax.html#doctype-state 843 | bool HTMLTokenizerPrivate::doctypeState() 844 | { 845 | Q_Q(HTMLTokenizer); 846 | 847 | QChar data; 848 | 849 | if (!consumeStream(data)) { 850 | Q_EMIT q->parserError(QStringLiteral("expected-doctype-name-but-got-eof")); 851 | state = HTMLTokenizer::DataState; 852 | stateFn = &HTMLTokenizerPrivate::dataState; 853 | currentToken = new HTMLToken(HTMLToken::DocTypeToken); 854 | currentToken->forceQuirks = true; 855 | emitCurrentToken(); 856 | streamUnconsume(); 857 | } else if (IS_SPACE_CHARACTER(data)) { 858 | state = HTMLTokenizer::BeforeDocTypeNameState; 859 | stateFn = &HTMLTokenizerPrivate::beforeDocTypeNameState; 860 | } else { 861 | Q_EMIT q->parserError(QStringLiteral("need-space-after-doctype")); 862 | state = HTMLTokenizer::BeforeDocTypeNameState; 863 | stateFn = &HTMLTokenizerPrivate::beforeDocTypeNameState; 864 | streamUnconsume(); 865 | } 866 | 867 | return true; 868 | } 869 | 870 | bool HTMLTokenizerPrivate::beforeDocTypeNameState() 871 | { 872 | Q_Q(HTMLTokenizer); 873 | 874 | QChar data; 875 | do { 876 | if (!consumeStream(data)) { 877 | Q_EMIT q->parserError(QStringLiteral("expected-doctype-name-but-got-eof")); 878 | state = HTMLTokenizer::DataState; 879 | stateFn = &HTMLTokenizerPrivate::dataState; 880 | currentToken = new HTMLToken(HTMLToken::DocTypeToken); 881 | currentToken->forceQuirks = true; 882 | emitCurrentToken(); 883 | streamUnconsume(); 884 | return true; 885 | } 886 | } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters 887 | 888 | if (IS_ASCII_UPPERCASE(data)) { 889 | currentToken = new HTMLToken(HTMLToken::DocTypeToken); 890 | currentToken->name = data.toLower(); 891 | state = HTMLTokenizer::DocTypeNameState; 892 | stateFn = &HTMLTokenizerPrivate::docTypeNameState; 893 | } else if (data.isNull()) { 894 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 895 | currentToken = new HTMLToken(HTMLToken::DocTypeToken); 896 | currentToken->name = QChar(QChar::ReplacementCharacter); 897 | state = HTMLTokenizer::DocTypeNameState; 898 | stateFn = &HTMLTokenizerPrivate::docTypeNameState; 899 | } else if (data == '>') { 900 | Q_EMIT q->parserError(QStringLiteral("expected-doctype-name-but-got-right-bracket" )); 901 | currentToken = new HTMLToken(HTMLToken::DocTypeToken); 902 | currentToken->forceQuirks = true; 903 | emitCurrentToken(); 904 | state = HTMLTokenizer::DataState; 905 | stateFn = &HTMLTokenizerPrivate::dataState; 906 | } else { 907 | currentToken = new HTMLToken(HTMLToken::DocTypeToken); 908 | currentToken->name = data; 909 | state = HTMLTokenizer::DocTypeNameState; 910 | stateFn = &HTMLTokenizerPrivate::docTypeNameState; 911 | } 912 | 913 | return true; 914 | } 915 | 916 | bool HTMLTokenizerPrivate::docTypeNameState() 917 | { 918 | Q_Q(HTMLTokenizer); 919 | 920 | QChar data; 921 | 922 | if (!consumeStream(data)) { 923 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype-name")); 924 | state = HTMLTokenizer::DataState; 925 | stateFn = &HTMLTokenizerPrivate::dataState; 926 | currentToken = new HTMLToken(HTMLToken::DocTypeToken); 927 | currentToken->forceQuirks = true; 928 | emitCurrentToken(); 929 | streamUnconsume(); 930 | } else if (IS_SPACE_CHARACTER(data)) { 931 | state = HTMLTokenizer::AfterDocTypeNameState; 932 | stateFn = &HTMLTokenizerPrivate::afterDocTypeNameState; 933 | } else if (data == '>') { 934 | state = HTMLTokenizer::DataState; 935 | stateFn = &HTMLTokenizerPrivate::dataState; 936 | emitCurrentToken(); 937 | } else if (IS_ASCII_UPPERCASE(data)) { 938 | currentToken->name.append(data.toLower()); 939 | } else if (data.isNull()) { 940 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 941 | currentToken->name.append(QChar::ReplacementCharacter); 942 | } else { 943 | currentToken->name.append(data); 944 | } 945 | 946 | return true; 947 | } 948 | 949 | // https://html.spec.whatwg.org/multipage/syntax.html#after-doctype-name-state 950 | bool HTMLTokenizerPrivate::afterDocTypeNameState() 951 | { 952 | Q_Q(HTMLTokenizer); 953 | 954 | QChar data; 955 | do { 956 | if (!consumeStream(data)) { 957 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype")); 958 | state = HTMLTokenizer::DataState; 959 | stateFn = &HTMLTokenizerPrivate::dataState; 960 | currentToken->forceQuirks = true; 961 | emitCurrentToken(); 962 | streamUnconsume(); 963 | return true; 964 | } 965 | } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters 966 | 967 | if (data == '>') { 968 | state = HTMLTokenizer::DataState; 969 | stateFn = &HTMLTokenizerPrivate::dataState; 970 | emitCurrentToken(); 971 | } else { 972 | int initalPos = streamPos(); 973 | if (data == 'p' || data == 'P' || 974 | data == 's' || data == 'S') { 975 | QString charStack = data; 976 | // consume more 5 chars 977 | for (int i = 0; i < 5; ++i) { 978 | // TODO check this 979 | consumeStream(data); 980 | charStack.append(data); 981 | } 982 | 983 | if (charStack.compare(QLatin1String("PUBLIC"), Qt::CaseInsensitive) == 0) { 984 | state = HTMLTokenizer::AfterDocTypePublicKeywordState; 985 | stateFn = &HTMLTokenizerPrivate::afterDocTypePublicKeywordState; 986 | return true; 987 | } else if (charStack.compare(QLatin1String("SYSTEM"), Qt::CaseInsensitive) == 0) { 988 | state = HTMLTokenizer::AfterDocTypeSystemKeywordState; 989 | stateFn = &HTMLTokenizerPrivate::afterDocTypeSystemKeywordState; 990 | return true; 991 | } 992 | } 993 | 994 | Q_EMIT q->parserError(QStringLiteral("expected-space-or-right-bracket-in-doctype")); 995 | state = HTMLTokenizer::BogusDocTypeState; 996 | stateFn = &HTMLTokenizerPrivate::bogusDocTypeState; 997 | currentToken->forceQuirks = true; 998 | streamSeek(initalPos); 999 | } 1000 | 1001 | return true; 1002 | } 1003 | 1004 | bool HTMLTokenizerPrivate::afterDocTypePublicKeywordState() 1005 | { 1006 | Q_Q(HTMLTokenizer); 1007 | 1008 | QChar data; 1009 | 1010 | if (!consumeStream(data)) { 1011 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype")); 1012 | state = HTMLTokenizer::DataState; 1013 | stateFn = &HTMLTokenizerPrivate::dataState; 1014 | currentToken->forceQuirks = true; 1015 | emitCurrentToken(); 1016 | streamUnconsume(); 1017 | } else if (IS_SPACE_CHARACTER(data)) { 1018 | state = HTMLTokenizer::BeforeDocTypePublicIdentifierState; 1019 | stateFn = &HTMLTokenizerPrivate::beforeDocTypePublicIdentifierState; 1020 | } else if (data == '"') { 1021 | Q_EMIT q->parserError(QStringLiteral("unexpected-double-quote-in-doctype")); 1022 | currentToken->doctypePublicId = ""; 1023 | state = HTMLTokenizer::DocTypePublicIdentifierDoubleQuotedState; 1024 | stateFn = &HTMLTokenizerPrivate::docTypePublicIdentifierDoubleQuotedState; 1025 | } else if (data == '\'') { 1026 | Q_EMIT q->parserError(QStringLiteral("unexpected-single-quote-in-doctype")); 1027 | currentToken->doctypePublicId = ""; 1028 | state = HTMLTokenizer::DocTypePublicIdentifierSingleQuotedState; 1029 | stateFn = &HTMLTokenizerPrivate::docTypePublicIdentifierSingleQuotedState; 1030 | } else if (data == '>') { 1031 | Q_EMIT q->parserError(QStringLiteral("unexpected-single-quote-in-doctype")); 1032 | currentToken->forceQuirks = true; 1033 | state = HTMLTokenizer::DataState; 1034 | stateFn = &HTMLTokenizerPrivate::dataState; 1035 | emitCurrentToken(); 1036 | } else { 1037 | Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype")); 1038 | currentToken->forceQuirks = true; 1039 | emitCurrentToken(); 1040 | state = HTMLTokenizer::BogusDocTypeState; 1041 | stateFn = &HTMLTokenizerPrivate::bogusDocTypeState; 1042 | } 1043 | 1044 | return true; 1045 | } 1046 | 1047 | bool HTMLTokenizerPrivate::beforeDocTypePublicIdentifierState() 1048 | { 1049 | Q_Q(HTMLTokenizer); 1050 | 1051 | QChar data; 1052 | do { 1053 | if (!consumeStream(data)) { 1054 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype")); 1055 | state = HTMLTokenizer::DataState; 1056 | stateFn = &HTMLTokenizerPrivate::dataState; 1057 | currentToken->forceQuirks = true; 1058 | emitCurrentToken(); 1059 | streamUnconsume(); 1060 | return true; 1061 | } 1062 | } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters 1063 | 1064 | if (data == '"') { 1065 | currentToken->doctypePublicId = ""; 1066 | state = HTMLTokenizer::DocTypePublicIdentifierDoubleQuotedState; 1067 | stateFn = &HTMLTokenizerPrivate::docTypePublicIdentifierDoubleQuotedState; 1068 | } else if (data == '\'') { 1069 | currentToken->doctypePublicId = ""; 1070 | state = HTMLTokenizer::DocTypePublicIdentifierSingleQuotedState; 1071 | stateFn = &HTMLTokenizerPrivate::docTypePublicIdentifierSingleQuotedState; 1072 | } else if (data == '>') { 1073 | Q_EMIT q->parserError(QStringLiteral("unexpected-end-of-doctype")); 1074 | currentToken->forceQuirks = true; 1075 | state = HTMLTokenizer::DataState; 1076 | stateFn = &HTMLTokenizerPrivate::dataState; 1077 | emitCurrentToken(); 1078 | } else { 1079 | Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype")); 1080 | currentToken->forceQuirks = true; 1081 | emitCurrentToken(); 1082 | state = HTMLTokenizer::BogusDocTypeState; 1083 | stateFn = &HTMLTokenizerPrivate::bogusDocTypeState; 1084 | } 1085 | 1086 | return true; 1087 | } 1088 | 1089 | bool HTMLTokenizerPrivate::docTypePublicIdentifierDoubleQuotedState() 1090 | { 1091 | Q_Q(HTMLTokenizer); 1092 | 1093 | QChar data; 1094 | 1095 | if (!consumeStream(data)) { 1096 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype")); 1097 | state = HTMLTokenizer::DataState; 1098 | stateFn = &HTMLTokenizerPrivate::dataState; 1099 | currentToken->forceQuirks = true; 1100 | emitCurrentToken(); 1101 | streamUnconsume(); 1102 | } else if (data == '"') { 1103 | state = HTMLTokenizer::AfterDocTypePublicIdentifierState; 1104 | stateFn = &HTMLTokenizerPrivate::afterDocTypePublicIdentifierState; 1105 | } else if (data.isNull()) { 1106 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 1107 | currentToken->name.append(QChar::ReplacementCharacter); 1108 | } else if (data == '>') { 1109 | Q_EMIT q->parserError(QStringLiteral("unexpected-end-of-doctype")); 1110 | currentToken->forceQuirks = true; 1111 | state = HTMLTokenizer::DataState; 1112 | stateFn = &HTMLTokenizerPrivate::dataState; 1113 | emitCurrentToken(); 1114 | } else { 1115 | currentToken->doctypePublicId.append(data); 1116 | } 1117 | 1118 | return true; 1119 | } 1120 | 1121 | bool HTMLTokenizerPrivate::docTypePublicIdentifierSingleQuotedState() 1122 | { 1123 | Q_Q(HTMLTokenizer); 1124 | 1125 | QChar data; 1126 | 1127 | if (!consumeStream(data)) { 1128 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype")); 1129 | state = HTMLTokenizer::DataState; 1130 | stateFn = &HTMLTokenizerPrivate::dataState; 1131 | currentToken->forceQuirks = true; 1132 | emitCurrentToken(); 1133 | streamUnconsume(); 1134 | } else if (data == '\'') { 1135 | state = HTMLTokenizer::AfterDocTypePublicIdentifierState; 1136 | stateFn = &HTMLTokenizerPrivate::afterDocTypePublicIdentifierState; 1137 | } else if (data.isNull()) { 1138 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 1139 | currentToken->name.append(QChar::ReplacementCharacter); 1140 | } else if (data == '>') { 1141 | Q_EMIT q->parserError(QStringLiteral("unexpected-end-of-doctype")); 1142 | currentToken->forceQuirks = true; 1143 | state = HTMLTokenizer::DataState; 1144 | stateFn = &HTMLTokenizerPrivate::dataState; 1145 | emitCurrentToken(); 1146 | } else { 1147 | currentToken->doctypePublicId.append(data); 1148 | } 1149 | 1150 | return true; 1151 | } 1152 | 1153 | bool HTMLTokenizerPrivate::afterDocTypePublicIdentifierState() 1154 | { 1155 | Q_Q(HTMLTokenizer); 1156 | 1157 | QChar data; 1158 | 1159 | if (!consumeStream(data)) { 1160 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype")); 1161 | state = HTMLTokenizer::DataState; 1162 | stateFn = &HTMLTokenizerPrivate::dataState; 1163 | currentToken->forceQuirks = true; 1164 | emitCurrentToken(); 1165 | streamUnconsume(); 1166 | } else if (IS_SPACE_CHARACTER(data)) { 1167 | state = HTMLTokenizer::BetweenDocTypePublicAndSystemIdentifierState; 1168 | stateFn = &HTMLTokenizerPrivate::betweenDocTypePublicAndSystemIdentifierState; 1169 | } else if (data == '>') { 1170 | state = HTMLTokenizer::DataState; 1171 | stateFn = &HTMLTokenizerPrivate::dataState; 1172 | emitCurrentToken(); 1173 | } else if (data == '"') { 1174 | Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype")); 1175 | currentToken->doctypeSystemId = ""; 1176 | state = HTMLTokenizer::DocTypeSystemIdentifierDoubleQuotedState; 1177 | stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierDoubleQuotedState; 1178 | } else if (data == '\'') { 1179 | Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype")); 1180 | currentToken->doctypeSystemId = ""; 1181 | state = HTMLTokenizer::DocTypeSystemIdentifierSingleQuotedState; 1182 | stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierSingleQuotedState; 1183 | } else { 1184 | q->parserError(QStringLiteral("unexpected-char-in-doctype")); 1185 | currentToken->forceQuirks = true; 1186 | state = HTMLTokenizer::BogusDocTypeState; 1187 | stateFn = &HTMLTokenizerPrivate::bogusDocTypeState; 1188 | } 1189 | 1190 | return true; 1191 | } 1192 | 1193 | bool HTMLTokenizerPrivate::betweenDocTypePublicAndSystemIdentifierState() 1194 | { 1195 | Q_Q(HTMLTokenizer); 1196 | 1197 | QChar data; 1198 | do { 1199 | if (!consumeStream(data)) { 1200 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype")); 1201 | state = HTMLTokenizer::DataState; 1202 | stateFn = &HTMLTokenizerPrivate::dataState; 1203 | currentToken->forceQuirks = true; 1204 | emitCurrentToken(); 1205 | streamUnconsume(); 1206 | return true; 1207 | } 1208 | } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters 1209 | 1210 | if (data == '>') { 1211 | state = HTMLTokenizer::DataState; 1212 | stateFn = &HTMLTokenizerPrivate::dataState; 1213 | emitCurrentToken(); 1214 | } else if (data == '"') { 1215 | currentToken->doctypeSystemId = ""; 1216 | state = HTMLTokenizer::DocTypeSystemIdentifierDoubleQuotedState; 1217 | stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierDoubleQuotedState; 1218 | } else if (data == '\'') { 1219 | currentToken->doctypeSystemId = ""; 1220 | state = HTMLTokenizer::DocTypeSystemIdentifierSingleQuotedState; 1221 | stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierSingleQuotedState; 1222 | } else { 1223 | q->parserError(QStringLiteral("unexpected-char-in-doctype")); 1224 | currentToken->forceQuirks = true; 1225 | state = HTMLTokenizer::BogusDocTypeState; 1226 | stateFn = &HTMLTokenizerPrivate::bogusDocTypeState; 1227 | } 1228 | 1229 | return true; 1230 | } 1231 | 1232 | bool HTMLTokenizerPrivate::afterDocTypeSystemKeywordState() 1233 | { 1234 | Q_Q(HTMLTokenizer); 1235 | 1236 | QChar data; 1237 | 1238 | if (!consumeStream(data)) { 1239 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype")); 1240 | state = HTMLTokenizer::DataState; 1241 | stateFn = &HTMLTokenizerPrivate::dataState; 1242 | currentToken->forceQuirks = true; 1243 | emitCurrentToken(); 1244 | streamUnconsume(); 1245 | } else if (IS_SPACE_CHARACTER(data)) { 1246 | state = HTMLTokenizer::BeforeDocTypeSystemIdentifierState; 1247 | stateFn = &HTMLTokenizerPrivate::beforeDocTypeSystemIdentifierState; 1248 | } else if (data == '>') { 1249 | state = HTMLTokenizer::DataState; 1250 | stateFn = &HTMLTokenizerPrivate::dataState; 1251 | emitCurrentToken(); 1252 | } else if (data == '"') { 1253 | Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype")); 1254 | currentToken->doctypeSystemId = ""; 1255 | state = HTMLTokenizer::DocTypeSystemIdentifierDoubleQuotedState; 1256 | stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierDoubleQuotedState; 1257 | } else if (data == '\'') { 1258 | Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype")); 1259 | currentToken->doctypeSystemId = ""; 1260 | state = HTMLTokenizer::DocTypeSystemIdentifierSingleQuotedState; 1261 | stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierSingleQuotedState; 1262 | } else { 1263 | q->parserError(QStringLiteral("unexpected-char-in-doctype")); 1264 | currentToken->forceQuirks = true; 1265 | state = HTMLTokenizer::BogusDocTypeState; 1266 | stateFn = &HTMLTokenizerPrivate::bogusDocTypeState; 1267 | } 1268 | 1269 | return true; 1270 | } 1271 | 1272 | bool HTMLTokenizerPrivate::beforeDocTypeSystemIdentifierState() 1273 | { 1274 | Q_Q(HTMLTokenizer); 1275 | 1276 | QChar data; 1277 | do { 1278 | if (!consumeStream(data)) { 1279 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype")); 1280 | state = HTMLTokenizer::DataState; 1281 | stateFn = &HTMLTokenizerPrivate::dataState; 1282 | currentToken->forceQuirks = true; 1283 | emitCurrentToken(); 1284 | streamUnconsume(); 1285 | return true; 1286 | } 1287 | } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters 1288 | 1289 | if (data == '"') { 1290 | currentToken->doctypeSystemId = ""; 1291 | state = HTMLTokenizer::DocTypeSystemIdentifierDoubleQuotedState; 1292 | stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierDoubleQuotedState; 1293 | } else if (data == '\'') { 1294 | currentToken->doctypeSystemId = ""; 1295 | state = HTMLTokenizer::DocTypeSystemIdentifierSingleQuotedState; 1296 | stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierSingleQuotedState; 1297 | } else if (data == '>') { 1298 | Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype")); 1299 | currentToken->forceQuirks = true; 1300 | state = HTMLTokenizer::DataState; 1301 | stateFn = &HTMLTokenizerPrivate::dataState; 1302 | emitCurrentToken(); 1303 | } else { 1304 | Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype")); 1305 | currentToken->forceQuirks = true; 1306 | state = HTMLTokenizer::BogusDocTypeState; 1307 | stateFn = &HTMLTokenizerPrivate::bogusDocTypeState; 1308 | } 1309 | 1310 | return true; 1311 | } 1312 | 1313 | bool HTMLTokenizerPrivate::docTypeSystemIdentifierDoubleQuotedState() 1314 | { 1315 | Q_Q(HTMLTokenizer); 1316 | 1317 | QChar data; 1318 | 1319 | if (!consumeStream(data)) { 1320 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype")); 1321 | state = HTMLTokenizer::DataState; 1322 | stateFn = &HTMLTokenizerPrivate::dataState; 1323 | currentToken->forceQuirks = true; 1324 | emitCurrentToken(); 1325 | streamUnconsume(); 1326 | } else if (data == '"') { 1327 | state = HTMLTokenizer::AfterDocTypeSystemIdentifierState; 1328 | stateFn = &HTMLTokenizerPrivate::afterDocTypeSystemIdentifierState; 1329 | } else if (data.isNull()) { 1330 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 1331 | currentToken->doctypeSystemId.append(QChar::ReplacementCharacter); 1332 | state = HTMLTokenizer::BeforeDocTypeSystemIdentifierState; 1333 | stateFn = &HTMLTokenizerPrivate::beforeDocTypeSystemIdentifierState; 1334 | } else if (data == '>') { 1335 | Q_EMIT q->parserError(QStringLiteral("unexpected-end-of-doctype")); 1336 | currentToken->forceQuirks = true; 1337 | state = HTMLTokenizer::DataState; 1338 | stateFn = &HTMLTokenizerPrivate::dataState; 1339 | emitCurrentToken(); 1340 | } else { 1341 | currentToken->doctypeSystemId.append(data); 1342 | } 1343 | 1344 | return true; 1345 | } 1346 | 1347 | bool HTMLTokenizerPrivate::docTypeSystemIdentifierSingleQuotedState() 1348 | { 1349 | Q_Q(HTMLTokenizer); 1350 | 1351 | QChar data; 1352 | 1353 | if (!consumeStream(data)) { 1354 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype")); 1355 | state = HTMLTokenizer::DataState; 1356 | stateFn = &HTMLTokenizerPrivate::dataState; 1357 | currentToken->forceQuirks = true; 1358 | emitCurrentToken(); 1359 | streamUnconsume(); 1360 | } else if (data == '\'') { 1361 | state = HTMLTokenizer::AfterDocTypeSystemIdentifierState; 1362 | stateFn = &HTMLTokenizerPrivate::afterDocTypeSystemIdentifierState; 1363 | } else if (data.isNull()) { 1364 | Q_EMIT q->parserError(QStringLiteral("invalid-codepoint")); 1365 | currentToken->doctypeSystemId.append(QChar::ReplacementCharacter); 1366 | state = HTMLTokenizer::BeforeDocTypeSystemIdentifierState; 1367 | stateFn = &HTMLTokenizerPrivate::beforeDocTypeSystemIdentifierState; 1368 | } else if (data == '>') { 1369 | Q_EMIT q->parserError(QStringLiteral("unexpected-end-of-doctype")); 1370 | currentToken->forceQuirks = true; 1371 | state = HTMLTokenizer::DataState; 1372 | stateFn = &HTMLTokenizerPrivate::dataState; 1373 | emitCurrentToken(); 1374 | } else { 1375 | currentToken->doctypeSystemId.append(data); 1376 | } 1377 | 1378 | return true; 1379 | } 1380 | 1381 | bool HTMLTokenizerPrivate::afterDocTypeSystemIdentifierState() 1382 | { 1383 | Q_Q(HTMLTokenizer); 1384 | 1385 | QChar data; 1386 | do { 1387 | if (!consumeStream(data)) { 1388 | Q_EMIT q->parserError(QStringLiteral("eof-in-doctype")); 1389 | state = HTMLTokenizer::DataState; 1390 | stateFn = &HTMLTokenizerPrivate::dataState; 1391 | currentToken->forceQuirks = true; 1392 | emitCurrentToken(); 1393 | streamUnconsume(); 1394 | return true; 1395 | } 1396 | } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters 1397 | 1398 | if (data == '>') { 1399 | state = HTMLTokenizer::DataState; 1400 | stateFn = &HTMLTokenizerPrivate::dataState; 1401 | emitCurrentToken(); 1402 | } else { 1403 | q->parserError(QStringLiteral("unexpected-char-in-doctype")); 1404 | currentToken->forceQuirks = true; 1405 | state = HTMLTokenizer::BogusDocTypeState; 1406 | stateFn = &HTMLTokenizerPrivate::bogusDocTypeState; 1407 | } 1408 | 1409 | return true; 1410 | } 1411 | 1412 | bool HTMLTokenizerPrivate::bogusDocTypeState() 1413 | { 1414 | Q_Q(HTMLTokenizer); 1415 | 1416 | QChar data; 1417 | if (!consumeStream(data)) { 1418 | state = HTMLTokenizer::DataState; 1419 | stateFn = &HTMLTokenizerPrivate::dataState; 1420 | emitCurrentToken(); 1421 | streamUnconsume(); 1422 | } else if (data == '>') { 1423 | state = HTMLTokenizer::DataState; 1424 | stateFn = &HTMLTokenizerPrivate::dataState; 1425 | emitCurrentToken(); 1426 | } 1427 | 1428 | return true; 1429 | } 1430 | 1431 | bool HTMLTokenizerPrivate::cDataSectionState() 1432 | { 1433 | // TODO 1434 | return true; 1435 | } 1436 | 1437 | // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference 1438 | QString HTMLTokenizerPrivate::consumeEntity(QChar *allowedChar) 1439 | { 1440 | Q_Q(HTMLTokenizer); 1441 | 1442 | int initalPos = streamPos(); 1443 | QString output = QStringLiteral("&"); 1444 | 1445 | QChar data; 1446 | if (!consumeStream(data) || 1447 | IS_SPACE_CHARACTER(data) || data == '<' || data == '&' || 1448 | (allowedChar && data == *allowedChar)) { 1449 | // Not a character reference. No characters are consumed, 1450 | // and nothing is returned. (This is not an error, either.) 1451 | streamUnconsume(); 1452 | return QString(); 1453 | } else if (data == '#') { 1454 | output.append(data); 1455 | 1456 | // TODO check this 1457 | consumeStream(data); 1458 | QChar number; 1459 | if (data == 'x' || data == 'X') { 1460 | number = consumeNumberEntity(true); 1461 | } else { 1462 | number = consumeNumberEntity(false); 1463 | } 1464 | 1465 | if (number.isNull()) { 1466 | q->parserError(QStringLiteral("expected-numeric-entity")); 1467 | // unconsume all characters 1468 | streamSeek(initalPos); 1469 | return QString(); 1470 | } 1471 | 1472 | return number; 1473 | } else { 1474 | 1475 | } 1476 | return QString(); 1477 | } 1478 | 1479 | QChar HTMLTokenizerPrivate::consumeNumberEntity(bool isHex) 1480 | { 1481 | Q_Q(HTMLTokenizer); 1482 | 1483 | QChar ret; 1484 | QString charStack; 1485 | QChar c; 1486 | // TODO check this 1487 | consumeStream(c); 1488 | int lastPos = streamPos(); 1489 | if (isHex) { 1490 | while (IS_ASCII_HEX_DIGITS(c) && 1491 | !streamAtEnd()) { 1492 | charStack.append(c); // store the position to rewind for ; 1493 | lastPos = streamPos(); 1494 | // TODO check this 1495 | consumeStream(c); 1496 | } 1497 | } else { 1498 | while (IS_ASCII_DIGITS(c) && // Zero (0) to Nine (9) 1499 | !streamAtEnd()) { 1500 | charStack.append(c); 1501 | lastPos = streamPos(); // store the position to rewind for ; 1502 | // TODO check this 1503 | consumeStream(c); 1504 | } 1505 | } 1506 | 1507 | // No char was found return null to unconsume 1508 | if (charStack.isNull()) { 1509 | return QChar::Null; 1510 | } 1511 | 1512 | // Discard the ; if present. Otherwise, put it back on the queue and 1513 | // invoke parseError on parser. 1514 | if (c != ';') { 1515 | q->parserError(QStringLiteral("numeric-entity-without-semicolon")); 1516 | streamSeek(lastPos); 1517 | } 1518 | 1519 | // Convert the number using the proper base 1520 | bool ok; 1521 | int charAsInt = charStack.toInt(&ok, isHex ? 16 : 10); 1522 | if (!ok) { 1523 | // TODO error 1524 | } 1525 | 1526 | // Certain characters get replaced with others 1527 | QMap::ConstIterator it = replacementCharacters.constFind(charAsInt); 1528 | if (it != replacementCharacters.constEnd()) { 1529 | ret = it.value(); 1530 | q->parserError(QString("illegal-codepoint-for-numeric-entity: %1").arg(charStack)); 1531 | } else if ((charAsInt >= 0xD800 && charAsInt <= 0xDFFF) || charAsInt > 0x10FFFF) { 1532 | ret = QChar::ReplacementCharacter; 1533 | q->parserError(QString("illegal-codepoint-for-numeric-entity: %1").arg(charStack)); 1534 | } else { 1535 | if ((0x0001 <= charAsInt && charAsInt <= 0x0008) || 1536 | (0x000E <= charAsInt && charAsInt <= 0x001F) || 1537 | (0x007F <= charAsInt && charAsInt <= 0x009F) || 1538 | (0xFDD0 <= charAsInt && charAsInt <= 0xFDEF) || 1539 | (charAsInt == 0x000B || charAsInt == 0xFFFE || charAsInt == 0xFFFF || charAsInt == 0x1FFFE || 1540 | charAsInt == 0x1FFFF || charAsInt == 0x2FFFE || charAsInt == 0x2FFFF || charAsInt == 0x3FFFE || 1541 | charAsInt == 0x3FFFF || charAsInt == 0x4FFFE || charAsInt == 0x4FFFF || charAsInt == 0x5FFFE || 1542 | charAsInt == 0x5FFFF || charAsInt == 0x6FFFE || charAsInt == 0x6FFFF || charAsInt == 0x7FFFE || 1543 | charAsInt == 0x7FFFF || charAsInt == 0x8FFFE || charAsInt == 0x8FFFF || charAsInt == 0x9FFFE || 1544 | charAsInt == 0x9FFFF || charAsInt == 0xAFFFE || charAsInt == 0xAFFFF || charAsInt == 0xBFFFE || 1545 | charAsInt == 0xBFFFF || charAsInt == 0xCFFFE || charAsInt == 0xCFFFF || charAsInt == 0xDFFFE || 1546 | charAsInt == 0xDFFFF || charAsInt == 0xEFFFE || charAsInt == 0xEFFFF || charAsInt == 0xFFFFE || 1547 | charAsInt == 0xFFFFF || charAsInt == 0x10FFFE || charAsInt == 0x10FFFF)) { 1548 | q->parserError(QString("illegal-codepoint-for-numeric-entity: %1").arg(charStack)); 1549 | ret = charAsInt; 1550 | } 1551 | } 1552 | 1553 | return ret; 1554 | } 1555 | 1556 | void HTMLTokenizerPrivate::emitCurrentToken() 1557 | { 1558 | Q_Q(HTMLTokenizer); 1559 | 1560 | // qDebug() << "emitCurrentToken" << currentToken; 1561 | HTMLToken *token = currentToken; 1562 | if (token->type == HTMLToken::EndTagToken) { 1563 | if (!token->data.isEmpty()) { 1564 | Q_EMIT q->parserError(QStringLiteral("attributes-in-end-tag")); 1565 | } 1566 | 1567 | if (token->selfClosing) { 1568 | Q_EMIT q->parserError(QStringLiteral("self-closing-flag-on-end-tag")); 1569 | } 1570 | } 1571 | Q_EMIT q->token(token); 1572 | 1573 | currentToken = 0; 1574 | } 1575 | 1576 | QMap HTMLToken::dataItems() 1577 | { 1578 | QMap ret; 1579 | for (const std::pair &pair : data) { 1580 | ret.insertMulti(pair.first, pair.second); 1581 | } 1582 | return ret; 1583 | } 1584 | -------------------------------------------------------------------------------- /html-qt/htmltokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef HTMLTOKENIZER_H 2 | #define HTMLTOKENIZER_H 3 | 4 | #include 5 | 6 | class HTMLParser; 7 | class HTMLToken; 8 | class HTMLTokenizerPrivate; 9 | class HTMLTokenizer : public QObject 10 | { 11 | Q_OBJECT 12 | Q_DECLARE_PRIVATE(HTMLTokenizer) 13 | public: 14 | enum State { 15 | DataState, 16 | CharacterReferenceInDataState, 17 | RCDataState, 18 | CharacterReferenceInRCDataState, 19 | RawTextState, 20 | ScriptDataState, 21 | PlainTextState, 22 | TagOpenState, 23 | EndTagOpenState, 24 | TagNameState, 25 | RCDataLessThanSignState, 26 | RCDataEndTagOpenState, 27 | RCDataEndTagNameState, 28 | RawTextLessThanSignState, 29 | RawTextEndTagOpenState, 30 | RawTextEndTagNameState, 31 | ScriptDataLessThanSignState, 32 | ScriptDataEndTagOpenState, 33 | ScriptDataEndTagNameState, 34 | ScriptDataEscapeStartState, 35 | ScriptDataEscapeStartDashState, 36 | ScriptDataEscapedState, 37 | ScriptDataEscapedDashState, 38 | ScriptDataEscapedDashDashState, 39 | ScriptDataEscapedLessThanSignState, 40 | ScriptDataEscapedEndTagOpenState, 41 | ScriptDataEscapedEndTagNameState, 42 | ScriptDataDoubleEscapeStartState, 43 | ScriptDataDoubleEscapedState, 44 | ScriptDataDoubleEscapedDashState, 45 | ScriptDataDoubleEscapedDashDashState, 46 | ScriptDataDoubleEscapedLessThanSignState, 47 | ScriptDataDoubleEscapeEndState, 48 | BeforeAttributeNameState, 49 | AttributeNameState, 50 | AfterAttributeNameState, 51 | BeforeAttributeValueState, 52 | AttributeValueDoubleQuotedState, 53 | AttributeValueSingleQuotedState, 54 | AttributeValueUnquotedState, 55 | CharacterReferenceInAttributeValueState, 56 | AfterAttributeValueQuotedState, 57 | SelfClosingStartTagState, 58 | BogusCommentState, 59 | MarkupDeclarationOpenState, 60 | CommentStartState, 61 | CommentStartDashState, 62 | CommentState, 63 | CommentEndDashState, 64 | CommentEndState, 65 | CommentEndBangState, 66 | DocTypeState, 67 | BeforeDocTypeNameState, 68 | DocTypeNameState, 69 | AfterDocTypeNameState, 70 | AfterDocTypePublicKeywordState, 71 | BeforeDocTypePublicIdentifierState, 72 | DocTypePublicIdentifierDoubleQuotedState, 73 | DocTypePublicIdentifierSingleQuotedState, 74 | AfterDocTypePublicIdentifierState, 75 | BetweenDocTypePublicAndSystemIdentifierState, 76 | AfterDocTypeSystemKeywordState, 77 | BeforeDocTypeSystemIdentifierState, 78 | DocTypeSystemIdentifierDoubleQuotedState, 79 | DocTypeSystemIdentifierSingleQuotedState, 80 | AfterDocTypeSystemIdentifierState, 81 | BogusDocTypeState, 82 | CDataSectionState, 83 | }; 84 | Q_ENUM(State) 85 | HTMLTokenizer(HTMLParser *parser); 86 | ~HTMLTokenizer(); 87 | 88 | void setHtmlText(const QString &html); 89 | 90 | State state() const; 91 | 92 | void start(); 93 | 94 | protected: 95 | void character(QChar c); 96 | void parserError(const QString &error); 97 | void token(HTMLToken *token); 98 | 99 | HTMLTokenizerPrivate *d_ptr; 100 | }; 101 | 102 | #endif // HTMLTOKENIZER_H 103 | -------------------------------------------------------------------------------- /html-qt/htmltokenizer_p.h: -------------------------------------------------------------------------------- 1 | #ifndef HTMLTOKENIZER_P_H 2 | #define HTMLTOKENIZER_P_H 3 | 4 | #include "htmltokenizer.h" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | typedef bool (HTMLTokenizerPrivate::*HTMLTokenizerPrivateMemFn)(); 11 | 12 | class HTMLToken 13 | { 14 | Q_GADGET 15 | public: 16 | enum Type { 17 | CharactersToken, 18 | SpaceCharactersToken, 19 | StartTagToken, 20 | EndTagToken, 21 | CommentToken, 22 | DocTypeToken, 23 | ParserErrorToken 24 | }; 25 | Q_ENUMS(Type) 26 | 27 | HTMLToken(Type tokenType) : type(tokenType) {} 28 | 29 | HTMLToken(const QString &_name, Type tokenType = EndTagToken, 30 | const QVector > &attributes = QVector >(), 31 | bool _selfClosing = false) 32 | : name(_name) 33 | , type(tokenType) 34 | , data(attributes) 35 | , selfClosing(_selfClosing) 36 | {} 37 | 38 | void appendDataCurrentAttributeName(const QChar &c) 39 | { 40 | if (data.isEmpty()) { 41 | data.append({ c, QString()}); 42 | } else { 43 | data.last().first.append(c); 44 | } 45 | } 46 | 47 | void appendDataCurrentAttributeValue(const QChar &c) 48 | { 49 | if (data.isEmpty()) { 50 | data.append({QString(), c}); 51 | } else { 52 | data.last().second.append(c); 53 | } 54 | } 55 | 56 | void appendDataCurrentAttributeValue(const QString &s) 57 | { 58 | if (data.isEmpty()) { 59 | data.push_back({QString(), s}); 60 | } else { 61 | data.last().second.append(s); 62 | } 63 | } 64 | 65 | QMap dataItems(); 66 | 67 | QString name; // or data for comment or character types 68 | Type type; 69 | QString dataStr; 70 | QVector > data; 71 | bool selfClosing = false; 72 | bool selfClosingAcknowledged = false; 73 | bool forceQuirks = false; 74 | QString doctypePublicId; 75 | QString doctypeSystemId; 76 | }; 77 | 78 | class HTMLTokenizerPrivate 79 | { 80 | Q_DECLARE_PUBLIC(HTMLTokenizer) 81 | public: 82 | // State methods 83 | bool dataState(); 84 | bool characterReferenceInDataState(); 85 | bool tagOpenState(); 86 | bool endTagOpenState(); 87 | bool tagNameState(); 88 | // ... RC Raw Script 89 | bool beforeAttributeNameState(); 90 | bool attributeNameState(); 91 | bool afterAttributeNameState(); 92 | bool beforeAttributeValueState(); 93 | bool attributeValueDoubleQuotedState(); 94 | bool attributeValueSingleQuotedState(); 95 | bool attributeValueUnquotedState(); 96 | // This method is special as for simplicity it is directly called by the callers 97 | void characterReferenceInAttributeValueState(QChar *additionalAllowedCharacter); 98 | bool afterAttributeValueQuotedState(); 99 | bool selfClosingStartTagState(); 100 | bool bogusCommentState(); 101 | bool markupDeclarationOpenState(); 102 | bool commentStartState(); 103 | bool commentStartDashState(); 104 | bool commentState(); 105 | bool commentEndDashState(); 106 | bool commentEndState(); 107 | bool commentEndBangState(); 108 | bool doctypeState(); 109 | bool beforeDocTypeNameState(); 110 | bool docTypeNameState(); 111 | bool afterDocTypeNameState(); 112 | bool afterDocTypePublicKeywordState(); 113 | bool beforeDocTypePublicIdentifierState(); 114 | bool docTypePublicIdentifierDoubleQuotedState(); 115 | bool docTypePublicIdentifierSingleQuotedState(); 116 | bool afterDocTypePublicIdentifierState(); 117 | bool betweenDocTypePublicAndSystemIdentifierState(); 118 | bool afterDocTypeSystemKeywordState(); 119 | bool beforeDocTypeSystemIdentifierState(); 120 | bool docTypeSystemIdentifierDoubleQuotedState(); 121 | bool docTypeSystemIdentifierSingleQuotedState(); 122 | bool afterDocTypeSystemIdentifierState(); 123 | bool bogusDocTypeState(); 124 | bool cDataSectionState(); 125 | 126 | // auxiliary methods 127 | inline bool consumeStream(QChar &c) 128 | { 129 | if (++htmlPos >= htmlSize || htmlPos < 0) { 130 | return false; 131 | } else { 132 | c = html.at(htmlPos); 133 | return true; 134 | } 135 | } 136 | 137 | inline int streamPos() { 138 | return htmlPos; 139 | } 140 | 141 | inline void streamSeek(int pos) { 142 | htmlPos = pos; 143 | } 144 | 145 | inline void streamUnconsume(int nChars = 1) { 146 | htmlPos -= nChars; 147 | } 148 | 149 | inline bool streamCanRead(int nChars = 1) { 150 | return htmlPos + nChars < htmlSize; 151 | } 152 | 153 | inline bool streamAtEnd() { 154 | return htmlPos > htmlSize; 155 | } 156 | 157 | QString consumeEntity(QChar *allowedChar = 0); 158 | QChar consumeNumberEntity(bool isHex); 159 | void emitCurrentToken(); 160 | 161 | // current token 162 | HTMLToken *currentToken; 163 | QVector tokenQueue; 164 | 165 | HTMLTokenizer *q_ptr; 166 | HTMLParser *parser; 167 | QString html; 168 | int htmlPos = -1; 169 | int htmlSize = 0; 170 | HTMLTokenizer::State state = HTMLTokenizer::DataState; 171 | HTMLTokenizerPrivateMemFn stateFn = &HTMLTokenizerPrivate::dataState; 172 | QMap replacementCharacters = { 173 | {0x00, 0xFFFD}, // REPLACEMENT CHARACTER 174 | {0x80, 0x20AC}, // EURO SIGN (€) 175 | {0x82, 0x201A}, // SINGLE LOW-9 QUOTATION MARK (‚) 176 | {0x83, 0x0192}, // LATIN SMALL LETTER F WITH HOOK (ƒ) 177 | {0x84, 0x201E}, // DOUBLE LOW-9 QUOTATION MARK („) 178 | {0x85, 0x2026}, // HORIZONTAL ELLIPSIS (…) 179 | {0x86, 0x2020}, // DAGGER (†) 180 | {0x87, 0x2021}, // DOUBLE DAGGER (‡) 181 | {0x88, 0x02C6}, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ) 182 | {0x89, 0x2030}, // PER MILLE SIGN (‰) 183 | {0x8A, 0x0160}, // LATIN CAPITAL LETTER S WITH CARON (Š) 184 | {0x8B, 0x2039}, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹) 185 | {0x8C, 0x0152}, // LATIN CAPITAL LIGATURE OE (Œ) 186 | {0x8E, 0x017D}, // LATIN CAPITAL LETTER Z WITH CARON (Ž) 187 | {0x91, 0x2018}, // LEFT SINGLE QUOTATION MARK (‘) 188 | {0x92, 0x2019}, // RIGHT SINGLE QUOTATION MARK (’) 189 | {0x93, 0x201C}, // LEFT DOUBLE QUOTATION MARK (“) 190 | {0x94, 0x201D}, // RIGHT DOUBLE QUOTATION MARK (”) 191 | {0x95, 0x2022}, // BULLET (•) 192 | {0x96, 0x2013}, // EN DASH (–) 193 | {0x97, 0x2014}, // EM DASH (—) 194 | {0x98, 0x02DC}, // SMALL TILDE (˜) 195 | {0x99, 0x2122}, // TRADE MARK SIGN (™) 196 | {0x9A, 0x0161}, // LATIN SMALL LETTER S WITH CARON (š) 197 | {0x9B, 0x203A}, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›) 198 | {0x9C, 0x0153}, // LATIN SMALL LIGATURE OE (œ) 199 | {0x9E, 0x017E}, // LATIN SMALL LETTER Z WITH CARON (ž) 200 | {0x9F, 0x0178}, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ) 201 | }; 202 | 203 | 204 | }; 205 | 206 | #endif // HTMLTOKENIZER_P_H 207 | 208 | -------------------------------------------------------------------------------- /html-qt/htmltree.cpp: -------------------------------------------------------------------------------- 1 | #include "htmltree.h" 2 | 3 | #include "htmltokenizer_p.h" 4 | 5 | #include 6 | 7 | Q_LOGGING_CATEGORY(HTML_TREE, "htmlqt.tree") 8 | 9 | HTMLTree::HTMLTree(const QString &namespaceHTMLElements) 10 | { 11 | if (namespaceHTMLElements.isEmpty()) { 12 | m_defaultNamespace = QStringLiteral("http://www.w3.org/1999/xhtml"); 13 | } else { 14 | m_defaultNamespace = namespaceHTMLElements; 15 | } 16 | 17 | reset(); 18 | } 19 | 20 | HTMLTree::~HTMLTree() 21 | { 22 | 23 | } 24 | 25 | void HTMLTree::reset() 26 | { 27 | m_openElements.clear(); 28 | 29 | delete m_document; 30 | m_document = new HTMLTreeNode; 31 | } 32 | 33 | HTMLTreeNode *HTMLTree::document() 34 | { 35 | return m_document; 36 | } 37 | 38 | void HTMLTree::insertText(QChar c, HTMLTreeNode *parent) 39 | { 40 | qCDebug(HTML_TREE) << Q_FUNC_INFO << c << m_openElements.size(); 41 | if (!parent) { 42 | parent = m_openElements.last(); 43 | } 44 | 45 | // if (! m_insertFromTable) 46 | parent->insertText(c); 47 | } 48 | 49 | void HTMLTree::inserRoot(HTMLToken *token) 50 | { 51 | HTMLTreeNode *node = createElement(token); 52 | m_openElements.push_back(node); 53 | m_document->appendChild(node); 54 | } 55 | 56 | void HTMLTree::insertDoctype(HTMLToken *token) 57 | { 58 | qCDebug(HTML_TREE) << Q_FUNC_INFO << token; 59 | m_document->token = token; 60 | } 61 | 62 | void HTMLTree::insertComment(HTMLToken *token, HTMLTreeNode *parent) 63 | { 64 | qCDebug(HTML_TREE) << Q_FUNC_INFO; 65 | } 66 | 67 | HTMLTreeNode *HTMLTree::createElement(HTMLToken *token) 68 | { 69 | auto ret = new HTMLTreeNode(token->name); 70 | for (const std::pair &pair : token->data) { 71 | ret->attributes.insertMulti(pair.first, pair.second); 72 | } 73 | return ret; 74 | } 75 | 76 | void HTMLTree::dump() 77 | { 78 | dumpTree(m_document); 79 | } 80 | 81 | QVector HTMLTree::openElements() const 82 | { 83 | return m_openElements; 84 | } 85 | 86 | HTMLTreeNode *HTMLTree::createNode(int &pos, int lastPos, bool plainText, HTMLTreeNode *parent) 87 | { 88 | qCDebug(HTML_TREE) << Q_FUNC_INFO; 89 | return 0; 90 | } 91 | 92 | void HTMLTree::dumpTree(HTMLTreeNode *root, int level) 93 | { 94 | qDebug() << QByteArray("-").repeated(level).data() << ">" << root->token->name; 95 | for (HTMLTreeNode *node : root->children) { 96 | dumpTree(node, level + 1); 97 | } 98 | } 99 | 100 | HTMLTreeNode::HTMLTreeNode(const QString &name) 101 | { 102 | this->name = name; 103 | } 104 | 105 | HTMLTreeNode::~HTMLTreeNode() 106 | { 107 | 108 | } 109 | 110 | void HTMLTreeNode::appendChild(HTMLTreeNode *node) 111 | { 112 | children.push_back(node); 113 | } 114 | 115 | void HTMLTreeNode::insertText(const QString &data) 116 | { 117 | qDebug() << data; 118 | text.append(data); 119 | } 120 | 121 | void HTMLTreeNode::removeChild(HTMLTreeNode *node) 122 | { 123 | children.removeOne(node); 124 | } 125 | 126 | void HTMLTreeNode::reparentChildren(HTMLTreeNode *node) 127 | { 128 | for (HTMLTreeNode *child : children) { 129 | node->appendChild(child); 130 | } 131 | children.clear(); 132 | } 133 | 134 | bool HTMLTreeNode::hasContent() const 135 | { 136 | return !text.isEmpty() || !children.isEmpty(); 137 | } 138 | 139 | QString HTMLTreeNode::asText() const 140 | { 141 | QString attributesStr; 142 | auto it = attributes.constBegin(); 143 | while (it != attributes.constEnd()) { 144 | if (it.value().isEmpty()) { 145 | attributesStr += QLatin1Char(' ') + it.value(); 146 | } else { 147 | attributesStr += QLatin1Char(' ') + it.key() + QLatin1String("=\"") + it.value() + QLatin1Char('"'); 148 | } 149 | } 150 | 151 | return QLatin1Char('<') + name + attributesStr + QLatin1Char('>'); 152 | } 153 | -------------------------------------------------------------------------------- /html-qt/htmltree.h: -------------------------------------------------------------------------------- 1 | #ifndef HTMLTREE_H 2 | #define HTMLTREE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | class HTMLToken; 10 | class HTMLTreeNode 11 | { 12 | public: 13 | HTMLTreeNode(const QString &name = QString()); 14 | virtual ~HTMLTreeNode(); 15 | 16 | QString name; 17 | HTMLTreeNode *parent = nullptr; 18 | QVector children; 19 | QMap attributes; 20 | HTMLToken *token; 21 | QStringRef type; 22 | QString text; 23 | bool end = false; 24 | bool plainText = true; 25 | 26 | /*! 27 | * Insert node as a child of the current node 28 | */ 29 | virtual void appendChild(HTMLTreeNode *node); 30 | 31 | /*! 32 | * Insert data as text in the current node, 33 | * TODO positioned before the 34 | * start of node insertBefore or to the end of the node's text. 35 | */ 36 | virtual void insertText(const QString &data); 37 | 38 | /*! 39 | * Remove node from the children of the current node 40 | */ 41 | virtual void removeChild(HTMLTreeNode *node); 42 | 43 | /*! 44 | * Move all the children of the current node to newParent. 45 | * This is needed so that trees that don't store text as nodes move the 46 | * text in the correct way 47 | */ 48 | virtual void reparentChildren(HTMLTreeNode *node); 49 | 50 | /*! 51 | * Return true if the node has children or text, false otherwise 52 | */ 53 | virtual bool hasContent() const; 54 | 55 | QString asText() const; 56 | }; 57 | 58 | class HTMLTree 59 | { 60 | public: 61 | HTMLTree(const QString &namespaceHTMLElements = QString()); 62 | virtual ~HTMLTree(); 63 | 64 | void reset(); 65 | 66 | HTMLTreeNode *document(); 67 | 68 | void insertText(QChar c, HTMLTreeNode *parent = nullptr); 69 | 70 | void inserRoot(HTMLToken *token); 71 | 72 | void insertDoctype(HTMLToken *token); 73 | 74 | void insertComment(HTMLToken *token, HTMLTreeNode *parent = nullptr); 75 | 76 | HTMLTreeNode *createElement(HTMLToken *token); 77 | 78 | void dump(); 79 | 80 | QVector openElements() const; 81 | 82 | private: 83 | HTMLTreeNode *createNode(int &pos, int lastPos, bool plainText, HTMLTreeNode *parent); 84 | void dumpTree(HTMLTreeNode *root, int level = 0); 85 | 86 | QString m_defaultNamespace; 87 | bool m_useAllowed; 88 | bool m_insertFromTable = false; 89 | QStringList m_allowed; 90 | QString m_content; 91 | int m_pos = 0; 92 | QList m_nodes; 93 | HTMLTreeNode *m_document = nullptr; 94 | QVector m_openElements; 95 | }; 96 | 97 | #endif // HTMLTREE_H 98 | --------------------------------------------------------------------------------