├── CMakeLists.txt
├── COPYING
├── README.md
├── cmd
    ├── CMakeLists.txt
    └── main.cpp
├── config.h.in
└── html-qt
    ├── CMakeLists.txt
    ├── entities.json
    ├── html-qt.doxygen
    ├── html-qt5.pc.in
    ├── htmlabstractphase.cpp
    ├── htmlabstractphase.h
    ├── htmlbeforehtmlphase.cpp
    ├── htmlbeforehtmlphase.h
    ├── htmlinitialphase.cpp
    ├── htmlinitialphase.h
    ├── htmlparser.cpp
    ├── htmlparser.h
    ├── htmlparser_p.h
    ├── htmltokenizer.cpp
    ├── htmltokenizer.h
    ├── htmltokenizer_p.h
    ├── htmltree.cpp
    └── htmltree.h


/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # CMakeLists for libhtml-qt (minimum version is declared before project() so policies apply to it)
 2 | cmake_minimum_required(VERSION 2.8.6 FATAL_ERROR)
 3 | project(html-qt)
 4 | 
 5 | find_package(PkgConfig REQUIRED)
 6 | find_package(Qt5 5.3.0 REQUIRED COMPONENTS  # stop at configure time when Qt5 is missing
 7 |     Core
 8 |     Network
 9 | )
10 | 
11 | set(HTMLQT_VERSION_MAJOR 0)
12 | set(HTMLQT_VERSION_MINOR 1)
13 | set(HTMLQT_VERSION_PATCH 0)
14 | set(HTMLQT_VERSION_SUFFIX "${VERSION_SUFFIX}")  # VERSION_SUFFIX is not set in this file; not part of HTMLQT_VERSION
15 | set(HTMLQT_VERSION "${HTMLQT_VERSION_MAJOR}.${HTMLQT_VERSION_MINOR}.${HTMLQT_VERSION_PATCH}")  # MAJOR.MINOR.PATCH
16 | 
17 | set(HTMLQT_API_LEVEL 0)
18 | 
19 | # Have CMake run Qt's moc automatically
20 | set(CMAKE_AUTOMOC ON)
21 | 
22 | # Directory searched for this project's own CMake modules
23 | set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/modules")
24 | 
25 | # moc output is written to the binary dir, so every directory gets
26 | # its own source and binary dirs added to the include path:
27 | set(CMAKE_INCLUDE_CURRENT_DIR ON)
28 | 
29 | # Forbid in-tree building (exact string comparison; a regex MATCHES also fires on path prefixes)
30 | if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}")
31 |       message(STATUS "Please do an out-of-tree build:")
32 |       message(STATUS "rm -f CMakeCache.txt && mkdir build && cd build; cmake .. && make")
33 |       message(FATAL_ERROR "In-tree-build detected!")
34 | endif()
35 | 
36 | #
37 | # Options
38 | #
39 | 
40 | # NONE
41 | 
42 | if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
43 |   # Replace CMake's built-in default only; a prefix chosen by the user is left alone.
44 |   set(CMAKE_INSTALL_PREFIX "/usr" CACHE PATH "html-qt default install prefix" FORCE)
45 | endif()
46 | 
47 | #
48 | # Configure files
49 | # (CMAKE_INSTALL_LIBDIR is defined first because CMAKECONFIG_INSTALL_DIR expands it)
50 | set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_LIBRARY_ARCHITECTURE}" CACHE PATH "Output directory for libraries")
51 | 
52 | set (PREFIXDIR "${CMAKE_INSTALL_PREFIX}")
53 | set (CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${LIBNAME}/")  # NOTE(review): LIBNAME is not set in this file - confirm
54 | set (DATADIR "${CMAKE_INSTALL_PREFIX}/share")
55 | set (PKGDATADIR "${DATA_INSTALL_DIR}")  # NOTE(review): DATA_INSTALL_DIR is not set in this file - confirm
56 | set (LIBDIR "${CMAKE_INSTALL_PREFIX}/${LIB_INSTALL_DIR}")  # NOTE(review): LIB_INSTALL_DIR is not set in this file - confirm
57 | set (PKGLIBDIR "${LIBDIR}/html-qt")
58 | set (GETTEXT_PACKAGE "html-qt")
59 | set (LOCALE_DIR "${DATADIR}/locale")
60 | set (VERSION "${HTMLQT_VERSION}")
61 | set (BUILDDIR "${CMAKE_BINARY_DIR}")
62 | 
63 | add_definitions("-DLOCALSTATEDIR=\"${LOCALSTATEDIR}\"")  # NOTE(review): LOCALSTATEDIR is not set in this file - confirm
64 | 
65 | configure_file(config.h.in ${CMAKE_BINARY_DIR}/config.h)
66 | 
67 | #
68 | # Custom C flags (strict warning set, switched off unless DISABLE_MAINTAINER_CFLAGS is turned OFF)
69 | #
70 | option (DISABLE_MAINTAINER_CFLAGS "Disable maintainer CFlags" ON)
71 | if (DISABLE_MAINTAINER_CFLAGS)
72 |     set (MAINTAINER_CFLAGS "")
73 | else ()
74 |     set (MAINTAINER_CFLAGS "-Werror -Wall -Wcast-align -Wno-uninitialized -Wempty-body -Wformat-security -Wformat -Winit-self")
75 | endif ()
76 | add_definitions(${MAINTAINER_CFLAGS})
77 | add_definitions(-DQT_NO_KEYWORDS)
78 | 
79 | # Header search path inherited by the subdirectories added below
80 | include_directories(${CMAKE_SOURCE_DIR}
81 |                     ${CMAKE_CURRENT_SOURCE_DIR}/lib
82 |                     ${CMAKE_CURRENT_BINARY_DIR})
83 | 
84 | # Descend into the html-qt and cmd directories, in that order
85 | add_subdirectory(html-qt)
86 | add_subdirectory(cmd)
87 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
  1 |                   GNU LESSER GENERAL PUBLIC LICENSE
  2 |                        Version 2.1, February 1999
  3 | 
  4 |  Copyright (C) 1991, 1999 Free Software Foundation, Inc.
  5 |  51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 | [This is the first released version of the Lesser GPL.  It also counts
 10 |  as the successor of the GNU Library Public License, version 2, hence
 11 |  the version number 2.1.]
 12 | 
 13 |                             Preamble
 14 | 
 15 |   The licenses for most software are designed to take away your
 16 | freedom to share and change it.  By contrast, the GNU General Public
 17 | Licenses are intended to guarantee your freedom to share and change
 18 | free software--to make sure the software is free for all its users.
 19 | 
 20 |   This license, the Lesser General Public License, applies to some
 21 | specially designated software packages--typically libraries--of the
 22 | Free Software Foundation and other authors who decide to use it.  You
 23 | can use it too, but we suggest you first think carefully about whether
 24 | this license or the ordinary General Public License is the better
 25 | strategy to use in any particular case, based on the explanations below.
 26 | 
 27 |   When we speak of free software, we are referring to freedom of use,
 28 | not price.  Our General Public Licenses are designed to make sure that
 29 | you have the freedom to distribute copies of free software (and charge
 30 | for this service if you wish); that you receive source code or can get
 31 | it if you want it; that you can change the software and use pieces of
 32 | it in new free programs; and that you are informed that you can do
 33 | these things.
 34 | 
 35 |   To protect your rights, we need to make restrictions that forbid
 36 | distributors to deny you these rights or to ask you to surrender these
 37 | rights.  These restrictions translate to certain responsibilities for
 38 | you if you distribute copies of the library or if you modify it.
 39 | 
 40 |   For example, if you distribute copies of the library, whether gratis
 41 | or for a fee, you must give the recipients all the rights that we gave
 42 | you.  You must make sure that they, too, receive or can get the source
 43 | code.  If you link other code with the library, you must provide
 44 | complete object files to the recipients, so that they can relink them
 45 | with the library after making changes to the library and recompiling
 46 | it.  And you must show them these terms so they know their rights.
 47 | 
 48 |   We protect your rights with a two-step method: (1) we copyright the
 49 | library, and (2) we offer you this license, which gives you legal
 50 | permission to copy, distribute and/or modify the library.
 51 | 
 52 |   To protect each distributor, we want to make it very clear that
 53 | there is no warranty for the free library.  Also, if the library is
 54 | modified by someone else and passed on, the recipients should know
 55 | that what they have is not the original version, so that the original
 56 | author's reputation will not be affected by problems that might be
 57 | introduced by others.
 58 | 
 59 |   Finally, software patents pose a constant threat to the existence of
 60 | any free program.  We wish to make sure that a company cannot
 61 | effectively restrict the users of a free program by obtaining a
 62 | restrictive license from a patent holder.  Therefore, we insist that
 63 | any patent license obtained for a version of the library must be
 64 | consistent with the full freedom of use specified in this license.
 65 | 
 66 |   Most GNU software, including some libraries, is covered by the
 67 | ordinary GNU General Public License.  This license, the GNU Lesser
 68 | General Public License, applies to certain designated libraries, and
 69 | is quite different from the ordinary General Public License.  We use
 70 | this license for certain libraries in order to permit linking those
 71 | libraries into non-free programs.
 72 | 
 73 |   When a program is linked with a library, whether statically or using
 74 | a shared library, the combination of the two is legally speaking a
 75 | combined work, a derivative of the original library.  The ordinary
 76 | General Public License therefore permits such linking only if the
 77 | entire combination fits its criteria of freedom.  The Lesser General
 78 | Public License permits more lax criteria for linking other code with
 79 | the library.
 80 | 
 81 |   We call this license the "Lesser" General Public License because it
 82 | does Less to protect the user's freedom than the ordinary General
 83 | Public License.  It also provides other free software developers Less
 84 | of an advantage over competing non-free programs.  These disadvantages
 85 | are the reason we use the ordinary General Public License for many
 86 | libraries.  However, the Lesser license provides advantages in certain
 87 | special circumstances.
 88 | 
 89 |   For example, on rare occasions, there may be a special need to
 90 | encourage the widest possible use of a certain library, so that it becomes
 91 | a de-facto standard.  To achieve this, non-free programs must be
 92 | allowed to use the library.  A more frequent case is that a free
 93 | library does the same job as widely used non-free libraries.  In this
 94 | case, there is little to gain by limiting the free library to free
 95 | software only, so we use the Lesser General Public License.
 96 | 
 97 |   In other cases, permission to use a particular library in non-free
 98 | programs enables a greater number of people to use a large body of
 99 | free software.  For example, permission to use the GNU C Library in
100 | non-free programs enables many more people to use the whole GNU
101 | operating system, as well as its variant, the GNU/Linux operating
102 | system.
103 | 
104 |   Although the Lesser General Public License is Less protective of the
105 | users' freedom, it does ensure that the user of a program that is
106 | linked with the Library has the freedom and the wherewithal to run
107 | that program using a modified version of the Library.
108 | 
109 |   The precise terms and conditions for copying, distribution and
110 | modification follow.  Pay close attention to the difference between a
111 | "work based on the library" and a "work that uses the library".  The
112 | former contains code derived from the library, whereas the latter must
113 | be combined with the library in order to run.
114 | 
115 |                   GNU LESSER GENERAL PUBLIC LICENSE
116 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
117 | 
118 |   0. This License Agreement applies to any software library or other
119 | program which contains a notice placed by the copyright holder or
120 | other authorized party saying it may be distributed under the terms of
121 | this Lesser General Public License (also called "this License").
122 | Each licensee is addressed as "you".
123 | 
124 |   A "library" means a collection of software functions and/or data
125 | prepared so as to be conveniently linked with application programs
126 | (which use some of those functions and data) to form executables.
127 | 
128 |   The "Library", below, refers to any such software library or work
129 | which has been distributed under these terms.  A "work based on the
130 | Library" means either the Library or any derivative work under
131 | copyright law: that is to say, a work containing the Library or a
132 | portion of it, either verbatim or with modifications and/or translated
133 | straightforwardly into another language.  (Hereinafter, translation is
134 | included without limitation in the term "modification".)
135 | 
136 |   "Source code" for a work means the preferred form of the work for
137 | making modifications to it.  For a library, complete source code means
138 | all the source code for all modules it contains, plus any associated
139 | interface definition files, plus the scripts used to control compilation
140 | and installation of the library.
141 | 
142 |   Activities other than copying, distribution and modification are not
143 | covered by this License; they are outside its scope.  The act of
144 | running a program using the Library is not restricted, and output from
145 | such a program is covered only if its contents constitute a work based
146 | on the Library (independent of the use of the Library in a tool for
147 | writing it).  Whether that is true depends on what the Library does
148 | and what the program that uses the Library does.
149 | 
150 |   1. You may copy and distribute verbatim copies of the Library's
151 | complete source code as you receive it, in any medium, provided that
152 | you conspicuously and appropriately publish on each copy an
153 | appropriate copyright notice and disclaimer of warranty; keep intact
154 | all the notices that refer to this License and to the absence of any
155 | warranty; and distribute a copy of this License along with the
156 | Library.
157 | 
158 |   You may charge a fee for the physical act of transferring a copy,
159 | and you may at your option offer warranty protection in exchange for a
160 | fee.
161 | 
162 |   2. You may modify your copy or copies of the Library or any portion
163 | of it, thus forming a work based on the Library, and copy and
164 | distribute such modifications or work under the terms of Section 1
165 | above, provided that you also meet all of these conditions:
166 | 
167 |     a) The modified work must itself be a software library.
168 | 
169 |     b) You must cause the files modified to carry prominent notices
170 |     stating that you changed the files and the date of any change.
171 | 
172 |     c) You must cause the whole of the work to be licensed at no
173 |     charge to all third parties under the terms of this License.
174 | 
175 |     d) If a facility in the modified Library refers to a function or a
176 |     table of data to be supplied by an application program that uses
177 |     the facility, other than as an argument passed when the facility
178 |     is invoked, then you must make a good faith effort to ensure that,
179 |     in the event an application does not supply such function or
180 |     table, the facility still operates, and performs whatever part of
181 |     its purpose remains meaningful.
182 | 
183 |     (For example, a function in a library to compute square roots has
184 |     a purpose that is entirely well-defined independent of the
185 |     application.  Therefore, Subsection 2d requires that any
186 |     application-supplied function or table used by this function must
187 |     be optional: if the application does not supply it, the square
188 |     root function must still compute square roots.)
189 | 
190 | These requirements apply to the modified work as a whole.  If
191 | identifiable sections of that work are not derived from the Library,
192 | and can be reasonably considered independent and separate works in
193 | themselves, then this License, and its terms, do not apply to those
194 | sections when you distribute them as separate works.  But when you
195 | distribute the same sections as part of a whole which is a work based
196 | on the Library, the distribution of the whole must be on the terms of
197 | this License, whose permissions for other licensees extend to the
198 | entire whole, and thus to each and every part regardless of who wrote
199 | it.
200 | 
201 | Thus, it is not the intent of this section to claim rights or contest
202 | your rights to work written entirely by you; rather, the intent is to
203 | exercise the right to control the distribution of derivative or
204 | collective works based on the Library.
205 | 
206 | In addition, mere aggregation of another work not based on the Library
207 | with the Library (or with a work based on the Library) on a volume of
208 | a storage or distribution medium does not bring the other work under
209 | the scope of this License.
210 | 
211 |   3. You may opt to apply the terms of the ordinary GNU General Public
212 | License instead of this License to a given copy of the Library.  To do
213 | this, you must alter all the notices that refer to this License, so
214 | that they refer to the ordinary GNU General Public License, version 2,
215 | instead of to this License.  (If a newer version than version 2 of the
216 | ordinary GNU General Public License has appeared, then you can specify
217 | that version instead if you wish.)  Do not make any other change in
218 | these notices.
219 | 
220 |   Once this change is made in a given copy, it is irreversible for
221 | that copy, so the ordinary GNU General Public License applies to all
222 | subsequent copies and derivative works made from that copy.
223 | 
224 |   This option is useful when you wish to copy part of the code of
225 | the Library into a program that is not a library.
226 | 
227 |   4. You may copy and distribute the Library (or a portion or
228 | derivative of it, under Section 2) in object code or executable form
229 | under the terms of Sections 1 and 2 above provided that you accompany
230 | it with the complete corresponding machine-readable source code, which
231 | must be distributed under the terms of Sections 1 and 2 above on a
232 | medium customarily used for software interchange.
233 | 
234 |   If distribution of object code is made by offering access to copy
235 | from a designated place, then offering equivalent access to copy the
236 | source code from the same place satisfies the requirement to
237 | distribute the source code, even though third parties are not
238 | compelled to copy the source along with the object code.
239 | 
240 |   5. A program that contains no derivative of any portion of the
241 | Library, but is designed to work with the Library by being compiled or
242 | linked with it, is called a "work that uses the Library".  Such a
243 | work, in isolation, is not a derivative work of the Library, and
244 | therefore falls outside the scope of this License.
245 | 
246 |   However, linking a "work that uses the Library" with the Library
247 | creates an executable that is a derivative of the Library (because it
248 | contains portions of the Library), rather than a "work that uses the
249 | library".  The executable is therefore covered by this License.
250 | Section 6 states terms for distribution of such executables.
251 | 
252 |   When a "work that uses the Library" uses material from a header file
253 | that is part of the Library, the object code for the work may be a
254 | derivative work of the Library even though the source code is not.
255 | Whether this is true is especially significant if the work can be
256 | linked without the Library, or if the work is itself a library.  The
257 | threshold for this to be true is not precisely defined by law.
258 | 
259 |   If such an object file uses only numerical parameters, data
260 | structure layouts and accessors, and small macros and small inline
261 | functions (ten lines or less in length), then the use of the object
262 | file is unrestricted, regardless of whether it is legally a derivative
263 | work.  (Executables containing this object code plus portions of the
264 | Library will still fall under Section 6.)
265 | 
266 |   Otherwise, if the work is a derivative of the Library, you may
267 | distribute the object code for the work under the terms of Section 6.
268 | Any executables containing that work also fall under Section 6,
269 | whether or not they are linked directly with the Library itself.
270 | 
271 |   6. As an exception to the Sections above, you may also combine or
272 | link a "work that uses the Library" with the Library to produce a
273 | work containing portions of the Library, and distribute that work
274 | under terms of your choice, provided that the terms permit
275 | modification of the work for the customer's own use and reverse
276 | engineering for debugging such modifications.
277 | 
278 |   You must give prominent notice with each copy of the work that the
279 | Library is used in it and that the Library and its use are covered by
280 | this License.  You must supply a copy of this License.  If the work
281 | during execution displays copyright notices, you must include the
282 | copyright notice for the Library among them, as well as a reference
283 | directing the user to the copy of this License.  Also, you must do one
284 | of these things:
285 | 
286 |     a) Accompany the work with the complete corresponding
287 |     machine-readable source code for the Library including whatever
288 |     changes were used in the work (which must be distributed under
289 |     Sections 1 and 2 above); and, if the work is an executable linked
290 |     with the Library, with the complete machine-readable "work that
291 |     uses the Library", as object code and/or source code, so that the
292 |     user can modify the Library and then relink to produce a modified
293 |     executable containing the modified Library.  (It is understood
294 |     that the user who changes the contents of definitions files in the
295 |     Library will not necessarily be able to recompile the application
296 |     to use the modified definitions.)
297 | 
298 |     b) Use a suitable shared library mechanism for linking with the
299 |     Library.  A suitable mechanism is one that (1) uses at run time a
300 |     copy of the library already present on the user's computer system,
301 |     rather than copying library functions into the executable, and (2)
302 |     will operate properly with a modified version of the library, if
303 |     the user installs one, as long as the modified version is
304 |     interface-compatible with the version that the work was made with.
305 | 
306 |     c) Accompany the work with a written offer, valid for at
307 |     least three years, to give the same user the materials
308 |     specified in Subsection 6a, above, for a charge no more
309 |     than the cost of performing this distribution.
310 | 
311 |     d) If distribution of the work is made by offering access to copy
312 |     from a designated place, offer equivalent access to copy the above
313 |     specified materials from the same place.
314 | 
315 |     e) Verify that the user has already received a copy of these
316 |     materials or that you have already sent this user a copy.
317 | 
318 |   For an executable, the required form of the "work that uses the
319 | Library" must include any data and utility programs needed for
320 | reproducing the executable from it.  However, as a special exception,
321 | the materials to be distributed need not include anything that is
322 | normally distributed (in either source or binary form) with the major
323 | components (compiler, kernel, and so on) of the operating system on
324 | which the executable runs, unless that component itself accompanies
325 | the executable.
326 | 
327 |   It may happen that this requirement contradicts the license
328 | restrictions of other proprietary libraries that do not normally
329 | accompany the operating system.  Such a contradiction means you cannot
330 | use both them and the Library together in an executable that you
331 | distribute.
332 | 
333 |   7. You may place library facilities that are a work based on the
334 | Library side-by-side in a single library together with other library
335 | facilities not covered by this License, and distribute such a combined
336 | library, provided that the separate distribution of the work based on
337 | the Library and of the other library facilities is otherwise
338 | permitted, and provided that you do these two things:
339 | 
340 |     a) Accompany the combined library with a copy of the same work
341 |     based on the Library, uncombined with any other library
342 |     facilities.  This must be distributed under the terms of the
343 |     Sections above.
344 | 
345 |     b) Give prominent notice with the combined library of the fact
346 |     that part of it is a work based on the Library, and explaining
347 |     where to find the accompanying uncombined form of the same work.
348 | 
349 |   8. You may not copy, modify, sublicense, link with, or distribute
350 | the Library except as expressly provided under this License.  Any
351 | attempt otherwise to copy, modify, sublicense, link with, or
352 | distribute the Library is void, and will automatically terminate your
353 | rights under this License.  However, parties who have received copies,
354 | or rights, from you under this License will not have their licenses
355 | terminated so long as such parties remain in full compliance.
356 | 
357 |   9. You are not required to accept this License, since you have not
358 | signed it.  However, nothing else grants you permission to modify or
359 | distribute the Library or its derivative works.  These actions are
360 | prohibited by law if you do not accept this License.  Therefore, by
361 | modifying or distributing the Library (or any work based on the
362 | Library), you indicate your acceptance of this License to do so, and
363 | all its terms and conditions for copying, distributing or modifying
364 | the Library or works based on it.
365 | 
366 |   10. Each time you redistribute the Library (or any work based on the
367 | Library), the recipient automatically receives a license from the
368 | original licensor to copy, distribute, link with or modify the Library
369 | subject to these terms and conditions.  You may not impose any further
370 | restrictions on the recipients' exercise of the rights granted herein.
371 | You are not responsible for enforcing compliance by third parties with
372 | this License.
373 | 
374 |   11. If, as a consequence of a court judgment or allegation of patent
375 | infringement or for any other reason (not limited to patent issues),
376 | conditions are imposed on you (whether by court order, agreement or
377 | otherwise) that contradict the conditions of this License, they do not
378 | excuse you from the conditions of this License.  If you cannot
379 | distribute so as to satisfy simultaneously your obligations under this
380 | License and any other pertinent obligations, then as a consequence you
381 | may not distribute the Library at all.  For example, if a patent
382 | license would not permit royalty-free redistribution of the Library by
383 | all those who receive copies directly or indirectly through you, then
384 | the only way you could satisfy both it and this License would be to
385 | refrain entirely from distribution of the Library.
386 | 
387 | If any portion of this section is held invalid or unenforceable under any
388 | particular circumstance, the balance of the section is intended to apply,
389 | and the section as a whole is intended to apply in other circumstances.
390 | 
391 | It is not the purpose of this section to induce you to infringe any
392 | patents or other property right claims or to contest validity of any
393 | such claims; this section has the sole purpose of protecting the
394 | integrity of the free software distribution system which is
395 | implemented by public license practices.  Many people have made
396 | generous contributions to the wide range of software distributed
397 | through that system in reliance on consistent application of that
398 | system; it is up to the author/donor to decide if he or she is willing
399 | to distribute software through any other system and a licensee cannot
400 | impose that choice.
401 | 
402 | This section is intended to make thoroughly clear what is believed to
403 | be a consequence of the rest of this License.
404 | 
405 |   12. If the distribution and/or use of the Library is restricted in
406 | certain countries either by patents or by copyrighted interfaces, the
407 | original copyright holder who places the Library under this License may add
408 | an explicit geographical distribution limitation excluding those countries,
409 | so that distribution is permitted only in or among countries not thus
410 | excluded.  In such case, this License incorporates the limitation as if
411 | written in the body of this License.
412 | 
413 |   13. The Free Software Foundation may publish revised and/or new
414 | versions of the Lesser General Public License from time to time.
415 | Such new versions will be similar in spirit to the present version,
416 | but may differ in detail to address new problems or concerns.
417 | 
418 | Each version is given a distinguishing version number.  If the Library
419 | specifies a version number of this License which applies to it and
420 | "any later version", you have the option of following the terms and
421 | conditions either of that version or of any later version published by
422 | the Free Software Foundation.  If the Library does not specify a
423 | license version number, you may choose any version ever published by
424 | the Free Software Foundation.
425 | 
426 |   14. If you wish to incorporate parts of the Library into other free
427 | programs whose distribution conditions are incompatible with these,
428 | write to the author to ask for permission.  For software which is
429 | copyrighted by the Free Software Foundation, write to the Free
430 | Software Foundation; we sometimes make exceptions for this.  Our
431 | decision will be guided by the two goals of preserving the free status
432 | of all derivatives of our free software and of promoting the sharing
433 | and reuse of software generally.
434 | 
435 |                             NO WARRANTY
436 | 
437 |   15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
443 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
444 | LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
446 | 
447 |   16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
456 | DAMAGES.
457 | 
458 |                      END OF TERMS AND CONDITIONS
459 | 
460 |            How to Apply These Terms to Your New Libraries
461 | 
462 |   If you develop a new library, and you want it to be of the greatest
463 | possible use to the public, we recommend making it free software that
464 | everyone can redistribute and change.  You can do so by permitting
465 | redistribution under these terms (or, alternatively, under the terms of the
466 | ordinary General Public License).
467 | 
468 |   To apply these terms, attach the following notices to the library.  It is
469 | safest to attach them to the start of each source file to most effectively
470 | convey the exclusion of warranty; and each file should have at least the
471 | "copyright" line and a pointer to where the full notice is found.
472 | 
473 |     <one line to give the library's name and a brief idea of what it does.>
474 |     Copyright (C) <year>  <name of author>
475 | 
476 |     This library is free software; you can redistribute it and/or
477 |     modify it under the terms of the GNU Lesser General Public
478 |     License as published by the Free Software Foundation; either
479 |     version 2.1 of the License, or (at your option) any later version.
480 | 
481 |     This library is distributed in the hope that it will be useful,
482 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
483 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
484 |     Lesser General Public License for more details.
485 | 
486 |     You should have received a copy of the GNU Lesser General Public
487 |     License along with this library; if not, write to the Free Software
488 |     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
489 | 
490 | Also add information on how to contact you by electronic and paper mail.
491 | 
492 | You should also get your employer (if you work as a programmer) or your
493 | school, if any, to sign a "copyright disclaimer" for the library, if
494 | necessary.  Here is a sample; alter the names:
495 | 
496 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the
497 |   library `Frob' (a library for tweaking knobs) written by James Random Hacker.
498 | 
499 |   <signature of Ty Coon>, 1 April 1990
500 |   Ty Coon, President of Vice
501 | 
502 | That's all there is to it!
503 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # html-qt
2 | HTML parser based on the WHATWG HTML5 specification
3 | 
4 | The command-line tool html-qt reads HTML from stdin, or from the file given as its first argument, and runs the parser on it.
5 | 


--------------------------------------------------------------------------------
/cmd/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | include_directories(
 2 |     ${CMAKE_CURRENT_BINARY_DIR}
 3 |     ${CMAKE_CURRENT_SOURCE_DIR}
 4 | )
 5 | 
 6 | set(html_qt_cmd_SRCS
 7 |     main.cpp
 8 | )
 9 | 
10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++11")
11 | 
12 | add_executable(html-qt-skell ${html_qt_cmd_SRCS})
13 | qt5_use_modules(html-qt-skell Core)
14 | target_link_libraries(html-qt-skell
15 |     html-qt5
16 | )
17 | # The target is built as html-qt-skell, but the produced binary is named html-qt.
18 | set_target_properties(html-qt-skell PROPERTIES OUTPUT_NAME html-qt)
19 | install(TARGETS html-qt-skell DESTINATION bin)  # relative path: resolved against the install prefix, keeps the install relocatable
20 | 


--------------------------------------------------------------------------------
/cmd/main.cpp:
--------------------------------------------------------------------------------
 1 | #include <QCoreApplication>
 2 | #include <QCommandLineParser>
 3 | 
 4 | #include <QLocale>
 5 | #include <QTranslator>
 6 | #include <QLibraryInfo>
 7 | #include <QDebug>
 8 | 
 9 | #include <QTextStream>
10 | #include <QFile>
11 | #include <QElapsedTimer>
12 | 
13 | #include "../html-qt/htmlparser.h"
14 | 
15 | int main(int argc, char *argv[])
16 | {
17 |     QCoreApplication app(argc, argv);
18 |     QCoreApplication::setOrganizationName("Cutelyst");
19 |     QCoreApplication::setOrganizationDomain("cutelyst.org");
20 |     QCoreApplication::setApplicationName("html-qt");
21 |     QCoreApplication::setApplicationVersion("0.0.1");
22 |     // Install Qt's own translations for the system locale.
23 |     QTranslator qtTranslator;
24 |     qtTranslator.load("qt_" + QLocale::system().name(),
25 |                       QLibraryInfo::location(QLibraryInfo::TranslationsPath));
26 |     QCoreApplication::installTranslator(&qtTranslator);
27 |     // Command-line interface: --help, --version and one optional positional "source".
28 |     QCommandLineParser parser;
29 |     parser.setApplicationDescription("Parses HTML documents according to WHATWG definitions");
30 |     parser.addHelpOption();
31 |     parser.addVersionOption();
32 | 
33 |     parser.addPositionalArgument("source", QCoreApplication::translate("main", "Source HTML file to parse."));
34 | 
35 |     // Process the actual command line arguments given by the user
36 |     parser.process(app);
37 | 
38 |     const QStringList args = parser.positionalArguments();
39 |     QFile source; // stack-allocated: released when main() returns, and never a null pointer
40 |     bool opened = false;
41 |     if (args.isEmpty()) {
42 |         opened = source.open(stdin, QFile::ReadOnly); // no argument: read from stdin
43 |     } else if (args.size() == 1) {
44 |         source.setFileName(args.first());
45 |         opened = source.open(QFile::ReadOnly);
46 |     } else {
47 |         parser.showHelp(1); // more than one argument: print usage with exit code 1
48 |     }
49 |     if (!opened) {
50 |         qFatal("Failed to open html file");
51 |     }
52 |     QTextStream in(&source);
53 |     HTMLParser htmlParser;
54 |     QElapsedTimer t;
55 |     t.start();
56 |     htmlParser.parse(in.readAll());
57 |     qDebug() << "Time elapsed:" << t.elapsed() << "ms";
58 |     return 0;
59 | }
60 | 


--------------------------------------------------------------------------------
/config.h.in:
--------------------------------------------------------------------------------
 1 | #ifndef CONFIG_H
 2 | #define CONFIG_H
 3 | 
 4 | /* always defined to indicate that i18n is enabled */
 5 | #define ENABLE_NLS 1
 6 | 
 7 | /* Gettext Package */
 8 | #define GETTEXT_PACKAGE "@GETTEXT_PACKAGE@"
 9 | 
10 | /* Paths */
11 | #define LOCALEDIR "@LOCALE_DIR@"
12 | #define PKGDATADIR "@PKGDATADIR@"
13 | #define PKGLIBDIR "@PKGLIBDIR@"
14 | #define PREFIXDIR "@PREFIXDIR@"
15 | #define DATADIR "@DATADIR@"
16 | #define LIBDIR "@LIBDIR@"
17 | #define BUILDDIR "@BUILDDIR@"
18 | 
19 | /* Name of package */
20 | #define PACKAGE_NAME "htmlqt"
21 | 
22 | /* Version number of package */
23 | #define VERSION "@VERSION@"
24 | 
25 | #endif /*CONFIG_H*/
26 | 


--------------------------------------------------------------------------------
/html-qt/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | include_directories(
 2 |     ${CMAKE_BINARY_DIR}
 3 |     ${CMAKE_CURRENT_BINARY_DIR}
 4 |     ${CMAKE_CURRENT_SOURCE_DIR}
 5 | )
 6 | 
 7 | set(htmlqt_SRC
 8 |     htmlabstractphase.cpp
 9 |     htmlinitialphase.cpp
10 |     htmlbeforehtmlphase.cpp
11 |     htmltokenizer.cpp
12 |     htmltokenizer_p.h
13 |     htmlparser.cpp
14 |     htmlparser_p.h
15 |     htmltree.cpp
16 | )
17 | # Public headers; they are installed below under include/html-qt5/HTMLQt.
18 | set(htmlqt_HEADERS
19 |     htmltokenizer.h
20 |     htmlparser.h
21 |     htmltree.h
22 | )
23 | 
24 | # set(htmlqt_HEADERS_PRIVATE
25 | #     common.h
26 | # )
27 | 
28 | # -std=c++11 is a compiler option, not a preprocessor definition, so it is added to
29 | # CMAKE_CXX_FLAGS instead of going through add_definitions().
30 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
31 | 
32 | add_library(html-qt5 SHARED ${htmlqt_SRC} ${htmlqt_HEADERS} ${htmlqt_HEADERS_PRIVATE})
33 | set_target_properties(html-qt5 PROPERTIES VERSION ${HTMLQT_VERSION} SOVERSION ${HTMLQT_API_LEVEL})
34 | 
35 | qt5_use_modules(html-qt5 Core Network)
36 | # Generate the pkg-config file; @ONLY leaves the ${prefix}-style references in the template alone.
37 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/html-qt5.pc.in
38 |   ${CMAKE_CURRENT_BINARY_DIR}/html-qt5.pc
39 |   @ONLY
40 | )
41 | 
42 | install(TARGETS html-qt5 EXPORT HTMLQt5Targets DESTINATION ${CMAKE_INSTALL_LIBDIR})
43 | install(FILES ${CMAKE_CURRENT_BINARY_DIR}/html-qt5.pc
44 |         DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig
45 | )
46 | install(FILES ${htmlqt_HEADERS}
47 |         DESTINATION include/html-qt5/HTMLQt
48 | )
49 | 


--------------------------------------------------------------------------------
/html-qt/html-qt5.pc.in:
--------------------------------------------------------------------------------
 1 | prefix=@CMAKE_INSTALL_PREFIX@
 2 | exec_prefix=${prefix}
 3 | libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
 4 | includedir=${prefix}/include
 5 | # The library is built against Qt 5, whose pkg-config module is Qt5Core (QtCore is the Qt 4 name).
 6 | Name: html-qt5
 7 | Description: HTML Qt parser
 8 | Version: @VERSION@
 9 | Requires: Qt5Core
10 | Libs: -L${libdir} -lhtml-qt5
11 | Cflags: -I${includedir}/html-qt5/
12 | 


--------------------------------------------------------------------------------
/html-qt/htmlabstractphase.cpp:
--------------------------------------------------------------------------------
 1 | #include "htmlabstractphase.h"
 2 | #include "htmlparser.h"
 3 | #include "htmlparser_p.h"
 4 | 
 5 | #include "htmltree.h"
 6 | 
 7 | #include <QLoggingCategory>
 8 | 
 9 | Q_LOGGING_CATEGORY(HTML_IM, "htmlqt.im")
10 | 
11 | HTMLAbstractPhase::HTMLAbstractPhase(HTMLParser *parser, HTMLTree *tree)
12 |     : tree(tree)     // tree the handlers insert text and comments into
13 |     , parser(parser) // parser whose private state the handlers read and update
14 | {
15 | }
16 | 
17 | HTMLAbstractPhase::~HTMLAbstractPhase()
18 | {
19 | 
20 | }
21 | 
22 | HTMLParserPrivate *HTMLAbstractPhase::parserPriv()
23 | {
24 |     return parser->d_ptr;
25 | }
26 | 
27 | void HTMLAbstractPhase::insertHtmlElement()
28 | {
29 | 
30 | }
31 | 
32 | void HTMLAbstractPhase::startTagHtml(HTMLToken *token)
33 | {
34 |     // An <html> start tag that is not the document's first start tag is a parse error,
35 |     // but its attributes are still merged afterwards, so there is no early return.
36 |     if (!parserPriv()->firstStartTag && token->name == QLatin1String("html")) {
37 |         parser->parserErrorToken(QStringLiteral("non-html-root"), 0);
38 |     }
39 | 
40 |     // The attributes belong on the root element, i.e. the oldest entry of the stack of
41 |     // open elements - assumes openElements() is ordered oldest-first (TODO confirm).
42 |     HTMLTreeNode *root = tree->openElements().first();
43 |     for (auto it = token->data.constBegin(); it != token->data.constEnd(); ++it) {
44 |         // Attributes already present on the element win; only missing ones are added.
45 |         if (!root->attributes.contains(it->first)) {
46 |             root->attributes.insert(it->first, it->second);
47 |         }
48 |     }
49 | 
50 |     parserPriv()->firstStartTag = false;
51 | }
52 | 
53 | bool HTMLAbstractPhase::processCharacter(QChar c)
54 | {
55 |     tree->insertText(c);
56 |     return true;
57 | }
58 | 
59 | bool HTMLAbstractPhase::processSpaceCharacters(HTMLToken *token)
60 | {
61 |     Q_UNUSED(token)
62 |     return true;
63 | }
64 | 
65 | bool HTMLAbstractPhase::processStartTag(HTMLToken *token)
66 | {
67 |     Q_UNUSED(token)
68 |     return true;
69 | }
70 | 
71 | bool HTMLAbstractPhase::processEndTag(HTMLToken *token)
72 | {
73 |     Q_UNUSED(token)
74 |     return true;
75 | }
76 | 
77 | bool HTMLAbstractPhase::processCommentTag(HTMLToken *token)
78 | {
79 |     tree->insertComment(token, tree->openElements().last());
80 |     return true;
81 | }
82 | 
83 | bool HTMLAbstractPhase::processDoctype(HTMLToken *token)
84 | {
85 |     Q_UNUSED(token)
86 |     return true;
87 | }
88 | 
89 | bool HTMLAbstractPhase::processEOF()
90 | {
91 |     return true;
92 | }
93 | 
94 | 


--------------------------------------------------------------------------------
/html-qt/htmlabstractphase.h:
--------------------------------------------------------------------------------
 1 | #ifndef HTMLABSTRACTPHASE_H
 2 | #define HTMLABSTRACTPHASE_H
 3 | 
 4 | #include <QObject>
 5 | 
 6 | class HTMLParserPrivate;
 7 | class HTMLParser;
 8 | class HTMLTree;
 9 | class HTMLToken;
10 | class HTMLAbstractPhase // Base class of the token handlers; subclasses override the process*() hooks
11 | {
12 | public:
13 |     HTMLAbstractPhase(HTMLParser *parser, HTMLTree *tree);
14 |     virtual ~HTMLAbstractPhase();
15 | 
16 |     HTMLTree *tree;
17 |     HTMLParser *parser;
18 |     HTMLParserPrivate *parserPriv(); // returns parser->d_ptr
19 |     // Base implementation does nothing.
20 |     virtual void insertHtmlElement();
21 |     // Merges the token's attributes into an already-open element, keeping existing values.
22 |     virtual void startTagHtml(HTMLToken *token);
23 |     // Base implementation inserts the character into the tree as text and returns true.
24 |     virtual bool processCharacter(QChar c);
25 |     // processSpaceCharacters/StartTag/EndTag/Doctype/EOF: base versions ignore their input and return true.
26 |     virtual bool processSpaceCharacters(HTMLToken *token);
27 | 
28 |     virtual bool processStartTag(HTMLToken *token);
29 | 
30 |     virtual bool processEndTag(HTMLToken *token);
31 |     // Base implementation inserts the comment under the last open element and returns true.
32 |     virtual bool processCommentTag(HTMLToken *token);
33 | 
34 |     virtual bool processDoctype(HTMLToken *token);
35 | 
36 |     virtual bool processEOF();
37 | 
38 | };
39 | 
40 | #endif // HTMLABSTRACTPHASE_H
41 | 


--------------------------------------------------------------------------------
/html-qt/htmlbeforehtmlphase.cpp:
--------------------------------------------------------------------------------
 1 | #include "htmlbeforehtmlphase.h"
 2 | 
 3 | #include "htmltree.h"
 4 | #include "htmltokenizer_p.h"
 5 | #include "htmlparser_p.h"
 6 | 
 7 | HTMLBeforeHtmlPhase::HTMLBeforeHtmlPhase(HTMLParser *parser, HTMLTree *tree) : HTMLAbstractPhase(parser, tree)
 8 | {
 9 | 
10 | }
11 | 
12 | void HTMLBeforeHtmlPhase::insertHtmlElement()
13 | {
14 |     tree->inserRoot(new HTMLToken(QStringLiteral("html"), HTMLToken::StartTagToken));
15 |     parserPriv()->insertionModeEnum = HTMLParser::BeforeHead;
16 |     parserPriv()->phase = parserPriv()->imBeforeHead;
17 | }
18 | 
19 | bool HTMLBeforeHtmlPhase::processEOF()
20 | {
21 |     insertHtmlElement();
22 |     return true;
23 | }
24 | 
25 | bool HTMLBeforeHtmlPhase::processCharacter(QChar c)
26 | {
27 |     insertHtmlElement();
28 |     return true;
29 | }
30 | 
31 | bool HTMLBeforeHtmlPhase::processCommentTag(HTMLToken *token)
32 | {
33 |     tree->insertComment(token, tree->document()); // comments in this phase go under the document node
34 |     return true; // previously missing: flowing off the end of a non-void function is undefined behaviour
35 | }
36 | bool HTMLBeforeHtmlPhase::processStartTag(HTMLToken *token)
37 | {
38 |     if (token->name == QLatin1String("html")) {
39 |         parserPriv()->firstStartTag = true;
40 |     }
41 |     insertHtmlElement();
42 |     return true;
43 | }
44 | 


--------------------------------------------------------------------------------
/html-qt/htmlbeforehtmlphase.h:
--------------------------------------------------------------------------------
 1 | #ifndef HTMLBEFOREHTMLPHASE_H
 2 | #define HTMLBEFOREHTMLPHASE_H
 3 | 
 4 | #include "htmlabstractphase.h"
 5 | 
 6 | class HTMLBeforeHtmlPhase : public HTMLAbstractPhase
 7 | {
 8 | public:
 9 |     HTMLBeforeHtmlPhase(HTMLParser *parser, HTMLTree *tree);
10 |     // Inserts the root "html" element and switches the parser to the BeforeHead phase.
11 |     virtual void insertHtmlElement() override;
12 |     // End of input, characters and start tags all create the root element first.
13 |     virtual bool processEOF() override;
14 | 
15 |     virtual bool processCharacter(QChar c) override;
16 |     // Comments are inserted under the document node.
17 |     virtual bool processCommentTag(HTMLToken *token) override;
18 | 
19 |     virtual bool processStartTag(HTMLToken *token) override;
20 | };
21 | 
22 | #endif // HTMLBEFOREHTMLPHASE_H
23 | 


--------------------------------------------------------------------------------
/html-qt/htmlinitialphase.cpp:
--------------------------------------------------------------------------------
 1 | #include "htmlinitialphase.h"
 2 | 
 3 | #include "htmltree.h"
 4 | #include "htmltokenizer_p.h"
 5 | #include "htmlparser_p.h"
 6 | 
 7 | #include <QLoggingCategory>
 8 | 
 9 | Q_LOGGING_CATEGORY(HTML_IM_INITIAL, "htmlqt.im.initial")
10 | 
11 | HTMLInitialPhase::HTMLInitialPhase(HTMLParser *parser, HTMLTree *tree) : HTMLAbstractPhase(parser, tree)
12 | {
13 | 
14 | }
15 | 
16 | bool HTMLInitialPhase::processSpaceCharacters(HTMLToken *token)
17 | {
18 |     qCCritical(HTML_IM_INITIAL) << Q_FUNC_INFO << token->name;
19 |     return true;
20 | }
21 | 
22 | bool HTMLInitialPhase::processStartTag(HTMLToken *token)
23 | {
24 |     qCCritical(HTML_IM_INITIAL) << Q_FUNC_INFO << token;
25 |     return true;
26 | }
27 | 
28 | bool HTMLInitialPhase::processEndTag(HTMLToken *token)
29 | {
30 |     qCCritical(HTML_IM_INITIAL) << Q_FUNC_INFO << token;
31 |     return true;
32 | }
33 | 
34 | bool HTMLInitialPhase::processCommentTag(HTMLToken *token)
35 | {
36 |     qCCritical(HTML_IM_INITIAL) << Q_FUNC_INFO << token;
37 |     tree->insertComment(token, tree->document());
38 |     return true;
39 | }
40 | // Handles a DOCTYPE token: inserts it into the tree and switches the parser to the BeforeHTML phase.
41 | bool HTMLInitialPhase::processDoctype(HTMLToken *token)
42 | {
43 |     qCCritical(HTML_IM_INITIAL) << Q_FUNC_INFO << token;
44 |     const QString &name = token->name;
45 |     QString publicId = token->doctypePublicId;
46 |     const QString &systemId = token->doctypeSystemId;
47 |     if (name != QLatin1String("html") ||
48 |             !publicId.isNull() ||
49 |             (!systemId.isNull() && systemId != QLatin1String("about:legacy-compat"))) {
50 | //        parser->parserErrorToken("unknown-doctype");
51 |     }
52 |     // NOTE(review): publicId is normalised here but is not read again in this function.
53 |     if (publicId.isNull()) {
54 |         publicId = QLatin1String("");
55 |     }
56 | 
57 |     tree->insertDoctype(token);
58 | 
59 |     qCCritical(HTML_IM_INITIAL) << Q_FUNC_INFO << token;
60 | 
61 |     // TODO
62 |     // Subsequent tokens are handled by the BeforeHTML phase object.
63 |     parserPriv()->insertionModeEnum = HTMLParser::BeforeHTML;
64 |     parserPriv()->phase = parserPriv()->imBeforeHTML;
65 |     return true;
66 | }
67 | 
68 | 


--------------------------------------------------------------------------------
/html-qt/htmlinitialphase.h:
--------------------------------------------------------------------------------
 1 | #ifndef HTMLINITIALPHASE_H
 2 | #define HTMLINITIALPHASE_H
 3 | 
 4 | #include "htmlabstractphase.h"
 5 | 
 6 | class HTMLInitialPhase : public HTMLAbstractPhase
 7 | {
 8 | public:
 9 |     HTMLInitialPhase(HTMLParser *parser, HTMLTree *tree);
10 | 
11 |     virtual bool processSpaceCharacters(HTMLToken *token) override;
12 | 
13 |     virtual bool processStartTag(HTMLToken *token) override;
14 | 
15 |     virtual bool processEndTag(HTMLToken *token) override;
16 | 
17 |     virtual bool processCommentTag(HTMLToken *token) override;
18 | 
19 |     virtual bool processDoctype(HTMLToken *token) override;
20 | 
21 | };
22 | 
23 | #endif // HTMLINITIALPHASE_H
24 | 


--------------------------------------------------------------------------------
/html-qt/htmlparser.cpp:
--------------------------------------------------------------------------------
  1 | #include "htmlparser_p.h"
  2 | 
  3 | #include "htmlinitialphase.h"
  4 | #include "htmlbeforehtmlphase.h"
  5 | 
  6 | #include <QMetaEnum>
  7 | #include <QLoggingCategory>
  8 | 
  9 | Q_LOGGING_CATEGORY(HTML_PARSER, "htmlqt.parser")
 10 | 
 11 | HTMLParser::HTMLParser(QObject *parent) : QObject(parent)
 12 |   , d_ptr(new HTMLParserPrivate)
 13 | {
 14 |     Q_D(HTMLParser);
 15 | 
 16 |     d->tokenizer = new HTMLTokenizer(this);
 17 | 
 18 |     HTMLTree *tree = new HTMLTree;
 19 |     d->imInitial = new HTMLInitialPhase(this, tree);
 20 |     d->imBeforeHTML = new HTMLBeforeHtmlPhase(this, tree);
 21 |     d->imBeforeHead = new HTMLAbstractPhase(this, tree);
 22 |     d->imInHead = new HTMLAbstractPhase(this, tree);
 23 |     d->imInHeadNoScript = new HTMLAbstractPhase(this, tree);
 24 |     d->imAfterHead = new HTMLAbstractPhase(this, tree);
 25 |     d->imInBody = new HTMLAbstractPhase(this, tree);
 26 |     d->imText = new HTMLAbstractPhase(this, tree);
 27 |     d->imInTable = new HTMLAbstractPhase(this, tree);
 28 |     d->imInTableText = new HTMLAbstractPhase(this, tree);
 29 |     d->imInCaption = new HTMLAbstractPhase(this, tree);
 30 |     d->imInColumGroup = new HTMLAbstractPhase(this, tree);
 31 |     d->imInTableBody = new HTMLAbstractPhase(this, tree);
 32 |     d->imInRow = new HTMLAbstractPhase(this, tree);
 33 |     d->imInCell = new HTMLAbstractPhase(this, tree);
 34 |     d->imInSelect = new HTMLAbstractPhase(this, tree);
 35 |     d->imInSelectInTable = new HTMLAbstractPhase(this, tree);
 36 |     d->imInTemplate = new HTMLAbstractPhase(this, tree);
 37 |     d->imAfterBody = new HTMLAbstractPhase(this, tree);
 38 |     d->imInFrameset = new HTMLAbstractPhase(this, tree);
 39 |     d->imAfterFrameset = new HTMLAbstractPhase(this, tree);
 40 |     d->imAfterAfterBody = new HTMLAbstractPhase(this, tree);
 41 |     d->imAfterAfterFrameset = new HTMLAbstractPhase(this, tree);
 42 |     d->phase = d->imInitial;
 43 |     d->tree = tree;
 44 | }
 45 | 
 46 | HTMLParser::~HTMLParser()
 47 | {
 48 |     delete d_ptr;
 49 | }
 50 | 
 51 | void HTMLParser::parse(const QString &html)
 52 | {
 53 |     Q_D(HTMLParser);
 54 | 
 55 |     d->tokenizer->setHtmlText(html);
 56 |     d->tokenizer->start();
 57 |     d->tree->dump();
 58 | }
 59 | 
 60 | void HTMLParser::reset()
 61 | {
 62 |     Q_D(HTMLParser);
 63 |     d->tree->reset();
 64 |     d->firstStartTag = false;
 65 | }
 66 | 
 67 | void HTMLParser::characterToken(const QChar &c)
 68 | {
 69 |     Q_D(HTMLParser);
 70 |     d->phase->processCharacter(c);
 71 | }
 72 | 
 73 | void HTMLParser::parserErrorToken(const QString &string, int pos)
 74 | {
 75 |     qCCritical(HTML_PARSER) << "parser-error" << string << pos;
 76 | }
 77 | 
 78 | void HTMLParser::parseToken(HTMLToken *token)
 79 | {   // Routes a token coming from the tokenizer to the matching handler of the current phase.
 80 |     qCCritical(HTML_PARSER) << "parseToken" << token << token->type;
 81 |     Q_D(HTMLParser);
 82 |     switch (token->type) {
 83 |     case HTMLToken::CharactersToken:
 84 |         for (int i = 0; i < token->dataStr.size(); ++i) // every character, not only the first; empty text is a no-op
 85 |             d->phase->processCharacter(token->dataStr.at(i));
 86 |         break;
 87 |     case HTMLToken::SpaceCharactersToken:
 88 |         d->phase->processSpaceCharacters(token); // was wrongly dispatched to processStartTag()
 89 |         break;
 90 |     case HTMLToken::StartTagToken:
 91 |         d->phase->processStartTag(token);
 92 |         break;
 93 |     case HTMLToken::EndTagToken:
 94 |         d->phase->processEndTag(token);
 95 |         break;
 96 |     case HTMLToken::CommentToken:
 97 |         d->phase->processCommentTag(token);
 98 |         break;
 99 |     case HTMLToken::DocTypeToken:
100 |         d->phase->processDoctype(token);
101 |         break;
102 |     case HTMLToken::ParserErrorToken:
103 |         qDebug() << "error " << token;
104 |         break;
105 |     }
106 | }


--------------------------------------------------------------------------------
/html-qt/htmlparser.h:
--------------------------------------------------------------------------------
 1 | #ifndef HTMLPARSER_H
 2 | #define HTMLPARSER_H
 3 | 
 4 | #include <QObject>
 5 | 
 6 | class HTMLToken;
 7 | class HTMLParserPrivate;
 8 | class HTMLParser : public QObject // Front end that feeds tokenizer output to the insertion-mode phases
 9 | {
10 |     Q_OBJECT
11 |     Q_DECLARE_PRIVATE(HTMLParser)
12 | public:
13 |     enum InsertionMode { // HTMLParserPrivate holds one phase object per value
14 |         Initial,
15 |         BeforeHTML,
16 |         BeforeHead,
17 |         InHead,
18 |         InHeadNoScript,
19 |         AfterHead,
20 |         InBody,
21 |         Text,
22 |         InTable,
23 |         InTableText,
24 |         InCaption,
25 |         InColumGroup,
26 |         InTableBody,
27 |         InRow,
28 |         InCell,
29 |         InSelect,
30 |         InSelectInTable,
31 |         InTemplate,
32 |         AfterBody,
33 |         InFrameset,
34 |         AfterFrameset,
35 |         AfterAfterBody,
36 |         AfterAfterFrameset,
37 |     };
38 |     Q_ENUM(InsertionMode)
39 | 
40 |     explicit HTMLParser(QObject *parent = 0);
41 |     ~HTMLParser();
42 |     // Tokenizes the given HTML text and then dumps the resulting tree.
43 |     void parse(const QString &html);
44 |     // Resets the tree and clears the first-start-tag flag.
45 |     void reset();
46 | 
47 | protected:
48 |     void characterToken(const QChar &c);
49 |     void parserErrorToken(const QString &string, int pos);
50 |     void parseToken(HTMLToken *token);
51 |     // The tokenizer and the phase base class call the protected members above.
52 |     friend class HTMLTokenizer;
53 |     friend class HTMLAbstractPhase;
54 |     // Private implementation data; deleted in the destructor.
55 |     HTMLParserPrivate *d_ptr;
56 | };
57 | 
58 | #endif // HTMLPARSER_H
59 | 


--------------------------------------------------------------------------------
/html-qt/htmlparser_p.h:
--------------------------------------------------------------------------------
 1 | #ifndef HTMLPARSER_P_H
 2 | #define HTMLPARSER_P_H
 3 | 
 4 | #include "htmlparser.h"
 5 | #include "htmltokenizer_p.h"
 6 | #include "htmltree.h"
 7 | 
 8 | #include "htmlabstractphase.h"
 9 | 
10 | class HTMLParserPrivate : public QObject
11 | {
12 |     Q_OBJECT
13 | public:
14 |     QString html;
15 |     HTMLTokenizer *tokenizer;
16 |     HTMLTree *tree;
17 |     HTMLAbstractPhase *phase;
18 |     HTMLParser::InsertionMode insertionModeEnum = HTMLParser::Initial;
19 | 
20 |     HTMLAbstractPhase *imInitial;
21 |     HTMLAbstractPhase *imBeforeHTML;
22 |     HTMLAbstractPhase *imBeforeHead;
23 |     HTMLAbstractPhase *imInHead;
24 |     HTMLAbstractPhase *imInHeadNoScript;
25 |     HTMLAbstractPhase *imAfterHead;
26 |     HTMLAbstractPhase *imInBody;
27 |     HTMLAbstractPhase *imText;
28 |     HTMLAbstractPhase *imInTable;
29 |     HTMLAbstractPhase *imInTableText;
30 |     HTMLAbstractPhase *imInCaption;
31 |     HTMLAbstractPhase *imInColumGroup;
32 |     HTMLAbstractPhase *imInTableBody;
33 |     HTMLAbstractPhase *imInRow;
34 |     HTMLAbstractPhase *imInCell;
35 |     HTMLAbstractPhase *imInSelect;
36 |     HTMLAbstractPhase *imInSelectInTable;
37 |     HTMLAbstractPhase *imInTemplate;
38 |     HTMLAbstractPhase *imAfterBody;
39 |     HTMLAbstractPhase *imInFrameset;
40 |     HTMLAbstractPhase *imAfterFrameset;
41 |     HTMLAbstractPhase *imAfterAfterBody;
42 |     HTMLAbstractPhase *imAfterAfterFrameset;
43 | 
44 |     bool firstStartTag = false;
45 | };
46 | 
47 | #endif // HTMLPARSER_P_H
48 | 
49 | 


--------------------------------------------------------------------------------
/html-qt/htmltokenizer.cpp:
--------------------------------------------------------------------------------
   1 | #include "htmltokenizer_p.h"
   2 | 
   3 | #include "htmlparser.h"
   4 | 
   5 | #include <QJsonDocument>
   6 | #include <QJsonObject>
   7 | #include <QMetaEnum>
   8 | #include <QFile>
   9 | #include <QStringBuilder>
  10 | #include <QLoggingCategory>
  11 | 
  12 | Q_LOGGING_CATEGORY(HTML_TOKENIZER, "htmlqt.tokenizer")
  13 | 
  14 | #define CALL_MEMBER_FN(object,ptrToMember)  ((object).*(ptrToMember))
  15 | // Character-class tests; each macro may evaluate its (parenthesised) argument more than once.
  16 | #define IS_ASCII_UPPERCASE(c) ('A' <= (c) && (c) <= 'Z')
  17 | #define IS_ASCII_LOWERCASE(c) ('a' <= (c) && (c) <= 'z')
  18 | #define IS_ASCII_DIGITS(c) ('0' <= (c) && (c) <= '9')
  19 | #define IS_ASCII_HEX_DIGITS(c) (IS_ASCII_DIGITS(c) || \
  20 |     ('A' <= (c) && (c) <= 'F') || \
  21 |     ('a' <= (c) && (c) <= 'f'))
  22 | #define IS_SPACE_CHARACTER(c) ((c) == QChar::Tabulation || /* CHARACTER TABULATION (tab) */ \
  23 |     (c) == QChar::LineFeed || /* LINE FEED (LF) */ \
  24 |     (c) == 0x000C || /* FORM FEED (FF) */ \
  25 |     (c) == QChar::Space) // SPACE
  26 | 
  27 | HTMLTokenizer::HTMLTokenizer(HTMLParser *parser) : QObject(parser)
  28 |   , d_ptr(new HTMLTokenizerPrivate)
  29 | {
  30 |     d_ptr->q_ptr = this;
  31 |     d_ptr->parser = parser;
  32 |     // The entity table is only logged for now; if the file cannot be opened it is skipped.
  33 |     // TODO https://html.spec.whatwg.org/multipage/entities.json
  34 |     // get from the url and/or keep a local copy
  35 |     QFile entitiesFile("/home/daniel/code/html-qt/entities.json"); // FIXME: developer-local absolute path
  36 |     if (!entitiesFile.open(QFile::ReadOnly)) {
  37 |         return;
  38 |     }
  39 |     QJsonDocument entities = QJsonDocument::fromJson(entitiesFile.readAll()); // JSON text, not Qt's binary format
  40 |     qCDebug(HTML_TOKENIZER) << entities.object();
  41 | }
  42 | 
  43 | HTMLTokenizer::~HTMLTokenizer()
  44 | {
  45 |     delete d_ptr;
  46 | }
  47 | 
  48 | void HTMLTokenizer::setHtmlText(const QString &html)
  49 | {
  50 |     Q_D(HTMLTokenizer);
  51 |     d->html = html;
  52 |     d->htmlPos = -1;
  53 |     d->htmlSize = html.size();
  54 | }
  55 | 
  56 | HTMLTokenizer::State HTMLTokenizer::state() const
  57 | {
  58 |     Q_D(const HTMLTokenizer);
  59 |     return d->state;
  60 | }
  61 | 
  62 | void HTMLTokenizer::start()
  63 | {
  64 |     Q_D(HTMLTokenizer);
  65 |     // Runs the current state function until one returns false or the stream is at its end.
  66 |     int lastPos = d->streamPos();
  67 |     int repeatedPos = 0;
  68 |     while (CALL_MEMBER_FN(*d, d->stateFn)() && !d->streamAtEnd()) {
  69 |         // Watchdog: abort when more than 10 consecutive steps leave the stream position unchanged.
  70 | //        qCDebug(HTML_TOKENIZER) << d->state << d->streamPos() << d->streamAtEnd();
  71 |         if (lastPos == d->streamPos()) {
  72 |             if (++repeatedPos > 10) {
  73 |                 qFatal("Infinite loop detected on state: %s, at position: %d",
  74 |                        metaObject()->enumerator(0).key(d->state), // assumes State is enumerator 0 of this meta-object (TODO confirm)
  75 |                        lastPos);
  76 |             }
  77 |         } else {
  78 |             lastPos = d->streamPos();
  79 |             repeatedPos = 0;
  80 |         }
  81 |     }
  82 |     qCDebug(HTML_TOKENIZER) << "finished";
  83 | }
  84 | 
  85 | void HTMLTokenizer::character(QChar c)
  86 | {
  87 |     Q_D(HTMLTokenizer);
  88 | //    auto token = new HTMLToken(HTMLToken::CharactersToken);
  89 | //    token->dataStr = c;
  90 | //    d->tokenQueue.append(token);
  91 |     d->parser->characterToken(c);
  92 | }
  93 | 
  94 | void HTMLTokenizer::parserError(const QString &error)
  95 | {
  96 |     Q_D(HTMLTokenizer);
  97 |     auto token = new HTMLToken(HTMLToken::ParserErrorToken);
  98 |     token->dataStr = error;
  99 |     d->tokenQueue.append(token);
 100 |     d->parser->parserErrorToken(error, d->streamPos());
 101 | }
 102 | 
 103 | void HTMLTokenizer::token(HTMLToken *token)
 104 | {
 105 |     Q_D(HTMLTokenizer);
 106 |     d->parser->parseToken(token);
 107 | }
 108 | 
 109 | // https://html.spec.whatwg.org/multipage/syntax.html#data-state
 110 | bool HTMLTokenizerPrivate::dataState()
 111 | {
 112 |     Q_Q(HTMLTokenizer);
 113 |     // Data state: forwards plain characters and switches state on '&' and '<'.
 114 |     QChar data;
 115 |     // Returns false only when no more input can be consumed, true otherwise.
 116 |     if (!consumeStream(data)) {
 117 |         // Tokenization ends.
 118 |         return false;
 119 |     } else if (data == '&') {
 120 |         state = HTMLTokenizer::CharacterReferenceInDataState;
 121 |         stateFn = &HTMLTokenizerPrivate::characterReferenceInDataState;
 122 |     } else if (data == '<') {
 123 |         state = HTMLTokenizer::TagOpenState;
 124 |         stateFn = &HTMLTokenizerPrivate::tagOpenState;
 125 |     } else if (data.isNull()) {
 126 |         // U+0000 is a parse error but is still emitted; the tokenizer stays in the data state.
 127 |         Q_EMIT q->parserError(QLatin1String("invalid-codepoint: ") + data);
 128 |         Q_EMIT q->character(data);
 129 |     } else {
 130 |         Q_EMIT q->character(data);
 131 |     }
 132 | 
 133 |     return true;
 134 | }
 135 | 
 136 | // https://html.spec.whatwg.org/multipage/syntax.html#character-reference-in-data-state
 137 | bool HTMLTokenizerPrivate::characterReferenceInDataState()
 138 | {
 139 |     // Emits the result of consumeEntity() (or a literal '&') and returns to the data state.
 140 |     Q_Q(HTMLTokenizer);
 141 | 
 142 |     const QString &entity = consumeEntity();
 143 |     if (entity.isNull()) {
 144 |         q->character('&'); // null result: the ampersand itself is emitted
 145 |     } else {
 146 |         for (int i = 0; i < entity.size(); ++i) {
 147 |             q->character(entity.at(i));
 148 |         }
 149 |     }
 150 | 
 151 |     state = HTMLTokenizer::DataState;
 152 |     stateFn = &HTMLTokenizerPrivate::dataState;
 153 |     return true;
 154 | }
 155 | 
 156 | // https://html.spec.whatwg.org/multipage/syntax.html#tag-open-state
 157 | bool HTMLTokenizerPrivate::tagOpenState()
 158 | {
 159 |     // Decides what follows a '<' seen in the data state; always returns true.
 160 |     Q_Q(HTMLTokenizer);
 161 | 
 162 |     QChar next;
 163 |     const bool gotChar = consumeStream(next);
 164 | 
 165 |     if (gotChar && next == '!') {
 166 |         state = HTMLTokenizer::MarkupDeclarationOpenState;
 167 |         stateFn = &HTMLTokenizerPrivate::markupDeclarationOpenState;
 168 |         return true;
 169 |     }
 170 |     if (gotChar && next == '/') {
 171 |         state = HTMLTokenizer::EndTagOpenState;
 172 |         stateFn = &HTMLTokenizerPrivate::endTagOpenState;
 173 |         return true;
 174 |     }
 175 |     if (gotChar && (IS_ASCII_UPPERCASE(next) || IS_ASCII_LOWERCASE(next))) {
 176 |         // An ASCII letter starts a start-tag name, which is stored lower-cased.
 177 |         state = HTMLTokenizer::TagNameState;
 178 |         stateFn = &HTMLTokenizerPrivate::tagNameState;
 179 |         currentToken = new HTMLToken(HTMLToken::StartTagToken);
 180 |         currentToken->name = next.toLower();
 181 |         return true;
 182 |     }
 183 |     if (gotChar && next == '?') {
 184 |         q->parserError(QStringLiteral("expected-tag-name-but-got-question-mark"));
 185 |         state = HTMLTokenizer::BogusCommentState;
 186 |         stateFn = &HTMLTokenizerPrivate::bogusCommentState;
 187 |         return true;
 188 |     }
 189 | 
 190 |     // Nothing consumed, or any other character: report the error, emit the '<' as
 191 |     // text and un-consume so the data state presumably sees the character again.
 192 |     q->parserError(QStringLiteral("expected-tag-name"));
 193 |     state = HTMLTokenizer::DataState;
 194 |     stateFn = &HTMLTokenizerPrivate::dataState;
 195 |     q->character('<');
 196 |     streamUnconsume();
 197 |     return true;
 198 | }
 199 | 
 200 | // https://html.spec.whatwg.org/multipage/syntax.html#end-tag-open-state
 201 | bool HTMLTokenizerPrivate::endTagOpenState()
 202 | {
 203 |     // Handles the character that follows "</"; always returns true.
 204 |     Q_Q(HTMLTokenizer);
 205 | 
 206 |     QChar next;
 207 |     if (!consumeStream(next)) {
 208 |         // Nothing left after "</": report it and emit both characters as text.
 209 |         q->parserError(QStringLiteral("expected-closing-tag-but-got-eof"));
 210 |         state = HTMLTokenizer::DataState;
 211 |         stateFn = &HTMLTokenizerPrivate::dataState;
 212 |         q->character('<'); // 0x003C
 213 |         q->character('/'); // 0x002F
 214 |         streamUnconsume();
 215 |         return true;
 216 |     }
 217 | 
 218 |     if (IS_ASCII_UPPERCASE(next) || IS_ASCII_LOWERCASE(next)) {
 219 |         // An ASCII letter starts the end tag's name, which is stored lower-cased.
 220 |         currentToken = new HTMLToken(HTMLToken::EndTagToken);
 221 |         currentToken->name = next.toLower();
 222 |         currentToken->selfClosing = false;
 223 |         state = HTMLTokenizer::TagNameState;
 224 |         stateFn = &HTMLTokenizerPrivate::tagNameState;
 225 |     } else if (next == '>') {
 226 |         // "</>" has no tag name: report it and go back to the data state.
 227 |         q->parserError(QStringLiteral("expected-closing-tag-but-got-right-bracket"));
 228 |         state = HTMLTokenizer::DataState;
 229 |         stateFn = &HTMLTokenizerPrivate::dataState;
 230 |     } else {
 231 |         // Any other character switches to the bogus comment state.
 232 |         q->parserError(QStringLiteral("expected-closing-tag-but-got-char"));
 233 |         state = HTMLTokenizer::BogusCommentState;
 234 |         stateFn = &HTMLTokenizerPrivate::bogusCommentState;
 235 |     }
 236 |     return true;
 237 | }
 238 | 
 239 | bool HTMLTokenizerPrivate::tagNameState()
 240 | {
 241 |     Q_Q(HTMLTokenizer);
 242 | 
 243 |     QChar data;
 244 | 
 245 |     if (!consumeStream(data)) {
 246 |         Q_EMIT q->parserError(QStringLiteral("eof-in-tag-name"));
 247 |         state = HTMLTokenizer::DataState;
 248 |         stateFn = &HTMLTokenizerPrivate::dataState;
 249 |         streamUnconsume();
 250 |     } else if (IS_SPACE_CHARACTER(data)) {
 251 |         state = HTMLTokenizer::BeforeAttributeNameState;
 252 |         stateFn = &HTMLTokenizerPrivate::beforeAttributeNameState;
 253 |     } else if (data == '/') {
 254 |         state = HTMLTokenizer::SelfClosingStartTagState;
 255 |         stateFn = &HTMLTokenizerPrivate::selfClosingStartTagState;
 256 |     } else if (data == '>') {
 257 |         state = HTMLTokenizer::DataState;
 258 |         stateFn = &HTMLTokenizerPrivate::dataState;
 259 |         emitCurrentToken();
 260 |     } else if (IS_ASCII_UPPERCASE(data)) {
 261 |         // Appending the lower case version
 262 |         currentToken->name.append(data.toLower());
 263 |     } else if (data.isNull()) {
 264 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 265 |         currentToken->name.append(QChar::ReplacementCharacter);
 266 |     } else {
 267 |         currentToken->name.append(data);
 268 |     }
 269 | 
 270 |     return true;
 271 | }
 272 | 
 273 | bool HTMLTokenizerPrivate::beforeAttributeNameState()
 274 | {
 275 |     Q_Q(HTMLTokenizer);
 276 | 
 277 |     QChar data;
 278 |     do {
 279 |         if (!consumeStream(data)) {
 280 |             Q_EMIT q->parserError(QStringLiteral("expected-attribute-name-but-got-eof"));
 281 |             state = HTMLTokenizer::DataState;
 282 |             stateFn = &HTMLTokenizerPrivate::dataState;
 283 |             streamUnconsume();
 284 |             return true;
 285 |         }
 286 |     } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters
 287 | 
 288 |     if (data == '/') {
 289 |         state = HTMLTokenizer::SelfClosingStartTagState;
 290 |         stateFn = &HTMLTokenizerPrivate::selfClosingStartTagState;
 291 |     } else if (data == '>') {
 292 |         state = HTMLTokenizer::DataState;
 293 |         stateFn = &HTMLTokenizerPrivate::dataState;
 294 |         emitCurrentToken();
 295 |     } else if (IS_ASCII_UPPERCASE(data)) {
 296 |         // Appending the lower case version
 297 |         currentToken->data.append({ data.toLower(), QString()});
 298 |         state = HTMLTokenizer::AttributeNameState;
 299 |         stateFn = &HTMLTokenizerPrivate::attributeNameState;
 300 |     } else if (data.isNull()) {
 301 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 302 |         currentToken->data.append({ QString(QChar::ReplacementCharacter), QString()});
 303 |         state = HTMLTokenizer::AttributeNameState;
 304 |         stateFn = &HTMLTokenizerPrivate::attributeNameState;
 305 |     } else if (data == '"' ||
 306 |                data == '\'' ||
 307 |                data == '<' ||
 308 |                data == '=') {
 309 |         Q_EMIT q->parserError(QStringLiteral("invalid-character-in-attribute-name"));
 310 |         currentToken->data.append({ data, QString() });
 311 |         state = HTMLTokenizer::AttributeNameState;
 312 |         stateFn = &HTMLTokenizerPrivate::attributeNameState;
 313 |     } else {
 314 |         currentToken->data.append({ data, QString() });
 315 |         state = HTMLTokenizer::AttributeNameState;
 316 |         stateFn = &HTMLTokenizerPrivate::attributeNameState;
 317 |     }
 318 | 
 319 |     return true;
 320 | }
 321 | 
 322 | bool HTMLTokenizerPrivate::attributeNameState()
 323 | {
 324 |     Q_Q(HTMLTokenizer);
 325 | 
 326 |     QChar data;
 327 | 
 328 |     if (!consumeStream(data)) {
 329 |         Q_EMIT q->parserError(QStringLiteral("eof-in-attribute-name"));
 330 |         state = HTMLTokenizer::DataState;
 331 |         stateFn = &HTMLTokenizerPrivate::dataState;
 332 |         streamUnconsume();
 333 |     } else if (IS_SPACE_CHARACTER(data)) {
 334 |         state = HTMLTokenizer::AfterAttributeNameState;
 335 |         stateFn = &HTMLTokenizerPrivate::afterAttributeNameState;
 336 |     } else if (data == '/') {
 337 |         state = HTMLTokenizer::SelfClosingStartTagState;
 338 |         stateFn = &HTMLTokenizerPrivate::selfClosingStartTagState;
 339 |     } else if (data == '=') {
 340 |         state = HTMLTokenizer::BeforeAttributeValueState;
 341 |         stateFn = &HTMLTokenizerPrivate::beforeAttributeValueState;
 342 |     } else if (data == '>') {
 343 |         state = HTMLTokenizer::DataState;
 344 |         stateFn = &HTMLTokenizerPrivate::dataState;
 345 |         emitCurrentToken();
 346 |     } else if (IS_ASCII_UPPERCASE(data)) {
 347 |         currentToken->appendDataCurrentAttributeName(data.toLower());
 348 |     } else if (data.isNull()) {
 349 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 350 |         currentToken->appendDataCurrentAttributeName(QChar::ReplacementCharacter);
 351 |     } else if (data == '"' || data == '\'' || data == '<') {
 352 |         Q_EMIT q->parserError(QStringLiteral("invalid-character-in-attribute-name"));
 353 |         currentToken->appendDataCurrentAttributeName(data);
 354 |     } else {
 355 |         currentToken->appendDataCurrentAttributeName(data);
 356 |     }
 357 | 
 358 |     return true;
 359 | }
 360 | 
 361 | bool HTMLTokenizerPrivate::afterAttributeNameState()
 362 | {
 363 |     Q_Q(HTMLTokenizer);
 364 | 
 365 |     QChar data;
 366 |     do {
 367 |         if (!consumeStream(data)) {
 368 |             Q_EMIT q->parserError(QStringLiteral("expected-end-of-tag-but-got-eof"));
 369 |             state = HTMLTokenizer::DataState;
 370 |             stateFn = &HTMLTokenizerPrivate::dataState;
 371 |             streamUnconsume();
 372 |             return true;
 373 |         }
 374 |     } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters
 375 | 
 376 |     if (data == '/') {
 377 |         state = HTMLTokenizer::SelfClosingStartTagState;
 378 |         stateFn = &HTMLTokenizerPrivate::selfClosingStartTagState;
 379 |     } else if (data == '=') {
 380 |         state = HTMLTokenizer::BeforeAttributeValueState;
 381 |         stateFn = &HTMLTokenizerPrivate::beforeAttributeValueState;
 382 |     } else if (data == '>') {
 383 |         state = HTMLTokenizer::DataState;
 384 |         stateFn = &HTMLTokenizerPrivate::dataState;
 385 |         emitCurrentToken();
 386 |     } else if (IS_ASCII_UPPERCASE(data)) {
 387 |         currentToken->data.append({ data.toLower(), QString() });
 388 |         state = HTMLTokenizer::AttributeNameState;
 389 |         stateFn = &HTMLTokenizerPrivate::attributeNameState;
 390 |     } else if (data.isNull()) {
 391 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 392 |         currentToken->data.append({ QString(QChar::ReplacementCharacter), QString() });
 393 |         state = HTMLTokenizer::AttributeNameState;
 394 |         stateFn = &HTMLTokenizerPrivate::attributeNameState;
 395 |     } else if (data == '"' || data == '\'' || data == '<') {
 396 |         Q_EMIT q->parserError(QStringLiteral("invalid-character-after-attribute-name"));
 397 |         currentToken->data.append({ data, QString() });
 398 |         state = HTMLTokenizer::AttributeNameState;
 399 |         stateFn = &HTMLTokenizerPrivate::attributeNameState;
 400 |     } else {
 401 |         currentToken->data.append({ data, QString() });
 402 |         state = HTMLTokenizer::AttributeNameState;
 403 |         stateFn = &HTMLTokenizerPrivate::attributeNameState;
 404 |     }
 405 | 
 406 |     return true;
 407 | }
 408 | 
 409 | bool HTMLTokenizerPrivate::beforeAttributeValueState()
 410 | {
 411 |     Q_Q(HTMLTokenizer);
 412 | 
 413 |     QChar data;
 414 |     do {
 415 |         if (!consumeStream(data)) {
 416 |             Q_EMIT q->parserError(QStringLiteral("expected-attribute-value-but-got-eof"));
 417 |             state = HTMLTokenizer::DataState;
 418 |             stateFn = &HTMLTokenizerPrivate::dataState;
 419 |             streamUnconsume();
 420 |             return true;
 421 |         }
 422 |     } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters
 423 | 
 424 |     if (data == '"') {
 425 |         state = HTMLTokenizer::AttributeValueDoubleQuotedState;
 426 |         stateFn = &HTMLTokenizerPrivate::attributeValueDoubleQuotedState;
 427 |     } else if (data == '&') {
 428 |         state = HTMLTokenizer::AttributeValueUnquotedState;
 429 |         stateFn = &HTMLTokenizerPrivate::attributeValueUnquotedState;
 430 |     } else if (data == '\'') {
 431 |         state = HTMLTokenizer::AttributeValueSingleQuotedState;
 432 |         stateFn = &HTMLTokenizerPrivate::attributeValueSingleQuotedState;
 433 |     } else if (data.isNull()) {
 434 |         Q_EMIT q->parserError(QStringLiteral("expected-attribute-value-but-got-right-bracket"));
 435 |         emitCurrentToken();
 436 |     } else if (data == '>') {
 437 |         Q_EMIT q->parserError(QStringLiteral("expected-attribute-value-but-got-right-bracket"));
 438 |         state = HTMLTokenizer::DataState;
 439 |         stateFn = &HTMLTokenizerPrivate::dataState;
 440 |         emitCurrentToken();
 441 |     } else if (data == '<' || data == '=' || data == '`') {
 442 |         Q_EMIT q->parserError(QStringLiteral("equals-in-unquoted-attribute-value"));
 443 |         currentToken->appendDataCurrentAttributeValue(data);
 444 |         state = HTMLTokenizer::AttributeValueUnquotedState;
 445 |         stateFn = &HTMLTokenizerPrivate::attributeValueUnquotedState;
 446 |     } else {
 447 |         currentToken->appendDataCurrentAttributeValue(data);
 448 |         state = HTMLTokenizer::AttributeValueUnquotedState;
 449 |         stateFn = &HTMLTokenizerPrivate::attributeValueUnquotedState;
 450 |     }
 451 | 
 452 |     return true;
 453 | }
 454 | 
 455 | bool HTMLTokenizerPrivate::attributeValueDoubleQuotedState()
 456 | {
 457 |     Q_Q(HTMLTokenizer);
 458 | 
 459 |     QChar data;
 460 | 
 461 |     if (!consumeStream(data)) {
 462 |         Q_EMIT q->parserError(QStringLiteral("eof-in-attribute-value-double-quote"));
 463 |         state = HTMLTokenizer::DataState;
 464 |         stateFn = &HTMLTokenizerPrivate::dataState;
 465 |         streamUnconsume();
 466 |     } else if (data == '"') {
 467 |         state = HTMLTokenizer::AfterAttributeValueQuotedState;
 468 |         stateFn = &HTMLTokenizerPrivate::afterAttributeValueQuotedState;
 469 |     } else if (data == '&') {
 470 |         QChar allowedChar('"');
 471 |         characterReferenceInAttributeValueState(&allowedChar);
 472 |     } else if (data.isNull()) {
 473 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 474 |         currentToken->appendDataCurrentAttributeValue(QChar::ReplacementCharacter);
 475 |     } else {
 476 |         currentToken->appendDataCurrentAttributeValue(data);
 477 |     }
 478 | 
 479 |     return true;
 480 | }
 481 | 
 482 | bool HTMLTokenizerPrivate::attributeValueSingleQuotedState()
 483 | {
 484 |     Q_Q(HTMLTokenizer);
 485 | 
 486 |     QChar data;
 487 | 
 488 |     if (!consumeStream(data)) {
 489 |         Q_EMIT q->parserError(QStringLiteral("eof-in-attribute-value-single-quote"));
 490 |         state = HTMLTokenizer::DataState;
 491 |         stateFn = &HTMLTokenizerPrivate::dataState;
 492 |         streamUnconsume();
 493 |     } else if (data == '\'') {
 494 |         state = HTMLTokenizer::AfterAttributeValueQuotedState;
 495 |         stateFn = &HTMLTokenizerPrivate::afterAttributeValueQuotedState;
 496 |     } else if (data == '&') {
 497 |         QChar allowedChar('\'');
 498 |         characterReferenceInAttributeValueState(&allowedChar);
 499 |     } else if (data.isNull()) {
 500 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 501 |         currentToken->appendDataCurrentAttributeValue(QChar::ReplacementCharacter);
 502 |     } else {
 503 |         currentToken->appendDataCurrentAttributeValue(data);
 504 |     }
 505 | 
 506 |     return true;
 507 | }
 508 | 
 509 | bool HTMLTokenizerPrivate::attributeValueUnquotedState()
 510 | {
 511 |     Q_Q(HTMLTokenizer);
 512 | 
 513 |     QChar data;
 514 | 
 515 |     if (!consumeStream(data)) {
 516 |         Q_EMIT q->parserError(QStringLiteral("eof-in-attribute-value-no-quotes"));
 517 |         state = HTMLTokenizer::DataState;
 518 |         stateFn = &HTMLTokenizerPrivate::dataState;
 519 |         streamUnconsume();
 520 |     } else if (IS_SPACE_CHARACTER(data)) {
 521 |         state = HTMLTokenizer::BeforeAttributeNameState;
 522 |         stateFn = &HTMLTokenizerPrivate::beforeAttributeNameState;
 523 |     } else if (data == '&') {
 524 |         QChar allowedChar('>');
 525 |         characterReferenceInAttributeValueState(&allowedChar);
 526 |     } else if (data == '>') {
 527 |         state = HTMLTokenizer::DataState;
 528 |         stateFn = &HTMLTokenizerPrivate::dataState;
 529 |         emitCurrentToken();
 530 |     } else if (data.isNull()) {
 531 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 532 |         currentToken->appendDataCurrentAttributeValue(QChar::ReplacementCharacter);
 533 |     } else if (data == '"' || data == '\'' || data == '<' || data == '`') {
 534 |         Q_EMIT q->parserError(QStringLiteral("unexpected-character-in-unquoted-attribute-value"));
 535 |         currentToken->appendDataCurrentAttributeValue(data);
 536 |     } else {
 537 |         currentToken->appendDataCurrentAttributeValue(data);
 538 |     }
 539 | 
 540 |     return true;
 541 | }
 542 | 
 543 | void HTMLTokenizerPrivate::characterReferenceInAttributeValueState(QChar *additionalAllowedCharacter)
 544 | {
 545 |     QString ret = consumeEntity(additionalAllowedCharacter);
 546 |     if (ret.isNull()) {
 547 |         currentToken->appendDataCurrentAttributeValue('&');
 548 |     } else {
 549 |         currentToken->appendDataCurrentAttributeValue(ret);
 550 |     }
 551 | }
 552 | 
 553 | bool HTMLTokenizerPrivate::afterAttributeValueQuotedState()
 554 | {
 555 |     Q_Q(HTMLTokenizer);
 556 | 
 557 |     QChar data;
 558 | 
 559 |     if (!consumeStream(data)) {
 560 |         Q_EMIT q->parserError(QStringLiteral("unexpected-eof-after-attribute-value"));
 561 |         state = HTMLTokenizer::DataState;
 562 |         stateFn = &HTMLTokenizerPrivate::dataState;
 563 |         streamUnconsume();
 564 |     } else if (IS_SPACE_CHARACTER(data)) {
 565 |         state = HTMLTokenizer::BeforeAttributeNameState;
 566 |         stateFn = &HTMLTokenizerPrivate::beforeAttributeNameState;
 567 |     } else if (data == '/') {
 568 |         state = HTMLTokenizer::SelfClosingStartTagState;
 569 |         stateFn = &HTMLTokenizerPrivate::selfClosingStartTagState;
 570 |     } else if (data == '>') {
 571 |         state = HTMLTokenizer::DataState;
 572 |         stateFn = &HTMLTokenizerPrivate::dataState;
 573 |         emitCurrentToken();
 574 |     } else {
 575 |         Q_EMIT q->parserError(QStringLiteral("unexpected-character-after-attribute-value"));
 576 |         state = HTMLTokenizer::BeforeAttributeNameState;
 577 |         stateFn = &HTMLTokenizerPrivate::beforeAttributeNameState;
 578 |         streamUnconsume();
 579 |     }
 580 | 
 581 |     return true;
 582 | }
 583 | 
 584 | bool HTMLTokenizerPrivate::selfClosingStartTagState()
 585 | {
 586 |     Q_Q(HTMLTokenizer);
 587 | 
 588 |     QChar data;
 589 | 
 590 |     if (!consumeStream(data)) {
 591 |         Q_EMIT q->parserError(QStringLiteral("unexpected-eof-after-solidus-in-tag"));
 592 |         state = HTMLTokenizer::DataState;
 593 |         stateFn = &HTMLTokenizerPrivate::dataState;
 594 |         streamUnconsume();
 595 |     } else if (data == '>') {
 596 |         currentToken->selfClosing = true;
 597 |         state = HTMLTokenizer::DataState;
 598 |         stateFn = &HTMLTokenizerPrivate::dataState;
 599 |         emitCurrentToken();
 600 |     } else {
 601 |         Q_EMIT q->parserError(QStringLiteral("unexpected-character-after-solidus-in-tag"));
 602 |         state = HTMLTokenizer::BeforeAttributeNameState;
 603 |         stateFn = &HTMLTokenizerPrivate::beforeAttributeNameState;
 604 |         streamUnconsume();
 605 |     }
 606 | 
 607 |     return true;
 608 | }
 609 | 
 610 | bool HTMLTokenizerPrivate::bogusCommentState()
 611 | {
 612 |     // TODO
 613 |     return true;
 614 | }
 615 | 
 616 | // https://html.spec.whatwg.org/multipage/syntax.html#markup-declaration-open-state
 617 | bool HTMLTokenizerPrivate::markupDeclarationOpenState()
 618 | {
 619 |     Q_Q(HTMLTokenizer);
 620 | 
 621 |     int initalPos = streamPos();
 622 |     QChar data;
 623 |     // TODO check this
 624 |     consumeStream(data);
 625 |     QString charStack = data;
 626 | 
 627 |     if (data == '-') {
 628 |         // TODO check this
 629 |         consumeStream(data);
 630 |         charStack.append(data);
 631 |         if (data == '-') {
 632 |             currentToken = new HTMLToken(HTMLToken::CommentToken);
 633 |             currentToken->name = "";
 634 |             state = HTMLTokenizer::CommentStartState;
 635 |             stateFn = &HTMLTokenizerPrivate::commentStartState;
 636 |             return true;
 637 |         }
 638 |     } else if (data == 'd' || data == 'D') {
 639 |         // consume more 6 chars
 640 |         for (int i = 0; i < 6; ++i) {
 641 |             // TODO check this
 642 |             consumeStream(data);
 643 |             charStack.append(data);
 644 |         }
 645 | 
 646 |         if (charStack.compare(QLatin1String("DOCTYPE"), Qt::CaseInsensitive) == 0) {
 647 | //            currentToken = new HTMLToken(HTMLToken::CommentToken);
 648 |             qCDebug(HTML_TOKENIZER) << "markupDeclarationOpenState" << charStack;
 649 |             state = HTMLTokenizer::DocTypeState;
 650 |             stateFn = &HTMLTokenizerPrivate::doctypeState;
 651 |             return true;
 652 |         }
 653 |     } else if (data == '[') {
 654 |         qCWarning(HTML_TOKENIZER) << "markupDeclarationOpenState CDATA TODO";
 655 |     }
 656 | 
 657 |     Q_EMIT q->parserError(QStringLiteral("expected-dashes-or-doctype"));
 658 |     state = HTMLTokenizer::BogusCommentState;
 659 |     stateFn = &HTMLTokenizerPrivate::bogusCommentState;
 660 |     streamSeek(initalPos);
 661 | 
 662 |     return true;
 663 | }
 664 | 
 665 | bool HTMLTokenizerPrivate::commentStartState()
 666 | {
 667 |     Q_Q(HTMLTokenizer);
 668 | 
 669 |     QChar data;
 670 | 
 671 |     if (!consumeStream(data)) {
 672 |         Q_EMIT q->parserError(QStringLiteral("eof-in-comment"));
 673 |         state = HTMLTokenizer::DataState;
 674 |         stateFn = &HTMLTokenizerPrivate::dataState;
 675 |         emitCurrentToken();
 676 |         streamUnconsume();
 677 |     } else if (data == '-') {
 678 |         state = HTMLTokenizer::CommentStartDashState;
 679 |         stateFn = &HTMLTokenizerPrivate::commentStartDashState;
 680 |     } else if (data.isNull()) {
 681 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 682 |         currentToken->name.append(QChar::ReplacementCharacter);
 683 |         state = HTMLTokenizer::CommentState;
 684 |         stateFn = &HTMLTokenizerPrivate::commentState;
 685 |     } else if (data == '>') {
 686 |         Q_EMIT q->parserError(QStringLiteral("incorrect-comment"));
 687 |         state = HTMLTokenizer::DataState;
 688 |         stateFn = &HTMLTokenizerPrivate::dataState;
 689 |         emitCurrentToken();
 690 |     } else {
 691 |         currentToken->name.append(data);
 692 |         state = HTMLTokenizer::CommentState;
 693 |         stateFn = &HTMLTokenizerPrivate::commentState;
 694 |     }
 695 | 
 696 |     return true;
 697 | }
 698 | 
 699 | bool HTMLTokenizerPrivate::commentStartDashState()
 700 | {
 701 |     Q_Q(HTMLTokenizer);
 702 | 
 703 |     QChar data;
 704 | 
 705 |     if (!consumeStream(data)) {
 706 |         Q_EMIT q->parserError(QStringLiteral("eof-in-comment"));
 707 |         state = HTMLTokenizer::DataState;
 708 |         stateFn = &HTMLTokenizerPrivate::dataState;
 709 |         emitCurrentToken();
 710 |         streamUnconsume();
 711 |     } else if (data == '-') {
 712 |         state = HTMLTokenizer::CommentEndState;
 713 |         stateFn = &HTMLTokenizerPrivate::commentEndState;
 714 |     } else if (data.isNull()) {
 715 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 716 |         // TODO see if we can reduce to a singe call
 717 |         currentToken->name.append('-');
 718 |         currentToken->name.append(QChar::ReplacementCharacter);
 719 |         state = HTMLTokenizer::CommentState;
 720 |         stateFn = &HTMLTokenizerPrivate::commentState;
 721 |     } else if (data == '>') {
 722 |         Q_EMIT q->parserError(QStringLiteral("incorrect-comment"));
 723 |         state = HTMLTokenizer::DataState;
 724 |         stateFn = &HTMLTokenizerPrivate::dataState;
 725 |         emitCurrentToken();
 726 |     } else {
 727 |         // TODO see if we can reduce to a singe call
 728 |         currentToken->name.append('-');
 729 |         currentToken->name.append(data);
 730 |         state = HTMLTokenizer::CommentState;
 731 |         stateFn = &HTMLTokenizerPrivate::commentState;
 732 |     }
 733 | 
 734 |     return true;
 735 | }
 736 | 
 737 | bool HTMLTokenizerPrivate::commentState()
 738 | {
 739 |     Q_Q(HTMLTokenizer);
 740 | 
 741 |     QChar data;
 742 | 
 743 |     if (!consumeStream(data)) {
 744 |         Q_EMIT q->parserError(QStringLiteral("eof-in-comment"));
 745 |         state = HTMLTokenizer::DataState;
 746 |         stateFn = &HTMLTokenizerPrivate::dataState;
 747 |         emitCurrentToken();
 748 |         streamUnconsume();
 749 |     } else if (data == '-') {
 750 |         state = HTMLTokenizer::CommentEndDashState;
 751 |         stateFn = &HTMLTokenizerPrivate::commentEndDashState;
 752 |     } else if (data.isNull()) {
 753 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 754 |         currentToken->name.append(QChar::ReplacementCharacter);
 755 |     } else {
 756 |         currentToken->name.append(data);
 757 |     }
 758 | 
 759 |     return true;
 760 | }
 761 | 
 762 | bool HTMLTokenizerPrivate::commentEndDashState()
 763 | {
 764 |     Q_Q(HTMLTokenizer);
 765 | 
 766 |     QChar data;
 767 | 
 768 |     if (!consumeStream(data)) {
 769 |         Q_EMIT q->parserError(QStringLiteral("eof-in-comment-end-dash"));
 770 |         state = HTMLTokenizer::DataState;
 771 |         stateFn = &HTMLTokenizerPrivate::dataState;
 772 |         emitCurrentToken();
 773 |         streamUnconsume();
 774 |     } else if (data == '-') {
 775 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 776 |         // TODO see if we can reduce to a singe call
 777 |         currentToken->name.append('-');
 778 |         currentToken->name.append(QChar::ReplacementCharacter);
 779 |         state = HTMLTokenizer::CommentEndState;
 780 |         stateFn = &HTMLTokenizerPrivate::commentEndState;
 781 |     } else if (data.isNull()) {
 782 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 783 |         currentToken->name.append(QChar::ReplacementCharacter);
 784 |         state = HTMLTokenizer::CommentState;
 785 |         stateFn = &HTMLTokenizerPrivate::commentState;
 786 |     } else {
 787 |         currentToken->name.append('-');
 788 |         currentToken->name.append(data);
 789 |         state = HTMLTokenizer::CommentState;
 790 |         stateFn = &HTMLTokenizerPrivate::commentState;
 791 |     }
 792 | 
 793 |     return true;
 794 | }
 795 | 
 796 | bool HTMLTokenizerPrivate::commentEndState()
 797 | {
 798 |     Q_Q(HTMLTokenizer);
 799 | 
 800 |     QChar data;
 801 | 
 802 |     if (!consumeStream(data)) {
 803 |         Q_EMIT q->parserError(QStringLiteral("eof-in-comment-double-dash"));
 804 |         state = HTMLTokenizer::DataState;
 805 |         stateFn = &HTMLTokenizerPrivate::dataState;
 806 |         emitCurrentToken();
 807 |         streamUnconsume();
 808 |     } else if (data == '>') {
 809 |         state = HTMLTokenizer::DataState;
 810 |         stateFn = &HTMLTokenizerPrivate::dataState;
 811 |         emitCurrentToken();
 812 |     } else if (data.isNull()) {
 813 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 814 |         // TODO see if we can reduce to a singe call
 815 |         currentToken->name.append('-');
 816 |         currentToken->name.append(QChar::ReplacementCharacter);
 817 |         state = HTMLTokenizer::CommentState;
 818 |         stateFn = &HTMLTokenizerPrivate::commentState;
 819 |     } else if (data == '!') {
 820 |         Q_EMIT q->parserError(QStringLiteral("unexpected-bang-after-double-dash-in-comment"));
 821 |         state = HTMLTokenizer::CommentEndBangState;
 822 |         stateFn = &HTMLTokenizerPrivate::commentEndBangState;
 823 |     } else if (data == '-') {
 824 |         Q_EMIT q->parserError(QStringLiteral("unexpected-dash-after-double-dash-in-comment"));
 825 |         currentToken->name.append('-');
 826 |     } else {
 827 |         Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-comment"));
 828 |         currentToken->name.append(QLatin1String("--") % data);
 829 |         state = HTMLTokenizer::CommentState;
 830 |         stateFn = &HTMLTokenizerPrivate::commentState;
 831 |     }
 832 | 
 833 |     return true;
 834 | }
 835 | 
 836 | bool HTMLTokenizerPrivate::commentEndBangState()
 837 | {
 838 |     // TODO
 839 |     return true;
 840 | }
 841 | 
 842 | // https://html.spec.whatwg.org/multipage/syntax.html#doctype-state
 843 | bool HTMLTokenizerPrivate::doctypeState()
 844 | {
 845 |     Q_Q(HTMLTokenizer);
 846 | 
 847 |     QChar data;
 848 | 
 849 |     if (!consumeStream(data)) {
 850 |         Q_EMIT q->parserError(QStringLiteral("expected-doctype-name-but-got-eof"));
 851 |         state = HTMLTokenizer::DataState;
 852 |         stateFn = &HTMLTokenizerPrivate::dataState;
 853 |         currentToken = new HTMLToken(HTMLToken::DocTypeToken);
 854 |         currentToken->forceQuirks = true;
 855 |         emitCurrentToken();
 856 |         streamUnconsume();
 857 |     } else if (IS_SPACE_CHARACTER(data)) {
 858 |         state = HTMLTokenizer::BeforeDocTypeNameState;
 859 |         stateFn = &HTMLTokenizerPrivate::beforeDocTypeNameState;
 860 |     } else {
 861 |         Q_EMIT q->parserError(QStringLiteral("need-space-after-doctype"));
 862 |         state = HTMLTokenizer::BeforeDocTypeNameState;
 863 |         stateFn = &HTMLTokenizerPrivate::beforeDocTypeNameState;
 864 |         streamUnconsume();
 865 |     }
 866 | 
 867 |     return true;
 868 | }
 869 | 
 870 | bool HTMLTokenizerPrivate::beforeDocTypeNameState()
 871 | {
 872 |     Q_Q(HTMLTokenizer);
 873 | 
 874 |     QChar data;
 875 |     do {
 876 |         if (!consumeStream(data)) {
 877 |             Q_EMIT q->parserError(QStringLiteral("expected-doctype-name-but-got-eof"));
 878 |             state = HTMLTokenizer::DataState;
 879 |             stateFn = &HTMLTokenizerPrivate::dataState;
 880 |             currentToken = new HTMLToken(HTMLToken::DocTypeToken);
 881 |             currentToken->forceQuirks = true;
 882 |             emitCurrentToken();
 883 |             streamUnconsume();
 884 |             return true;
 885 |         }
 886 |     } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters
 887 | 
 888 |     if (IS_ASCII_UPPERCASE(data)) {
 889 |         currentToken = new HTMLToken(HTMLToken::DocTypeToken);
 890 |         currentToken->name = data.toLower();
 891 |         state = HTMLTokenizer::DocTypeNameState;
 892 |         stateFn = &HTMLTokenizerPrivate::docTypeNameState;
 893 |     } else if (data.isNull()) {
 894 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 895 |         currentToken = new HTMLToken(HTMLToken::DocTypeToken);
 896 |         currentToken->name = QChar(QChar::ReplacementCharacter);
 897 |         state = HTMLTokenizer::DocTypeNameState;
 898 |         stateFn = &HTMLTokenizerPrivate::docTypeNameState;
 899 |     } else if (data == '>') {
 900 |         Q_EMIT q->parserError(QStringLiteral("expected-doctype-name-but-got-right-bracket"  ));
 901 |         currentToken = new HTMLToken(HTMLToken::DocTypeToken);
 902 |         currentToken->forceQuirks = true;
 903 |         emitCurrentToken();
 904 |         state = HTMLTokenizer::DataState;
 905 |         stateFn = &HTMLTokenizerPrivate::dataState;
 906 |     } else {
 907 |         currentToken = new HTMLToken(HTMLToken::DocTypeToken);
 908 |         currentToken->name = data;
 909 |         state = HTMLTokenizer::DocTypeNameState;
 910 |         stateFn = &HTMLTokenizerPrivate::docTypeNameState;
 911 |     }
 912 | 
 913 |     return true;
 914 | }
 915 | 
 916 | bool HTMLTokenizerPrivate::docTypeNameState()
 917 | {
 918 |     Q_Q(HTMLTokenizer);
 919 | 
 920 |     QChar data;
 921 | 
 922 |     if (!consumeStream(data)) {
 923 |         Q_EMIT q->parserError(QStringLiteral("eof-in-doctype-name"));
 924 |         state = HTMLTokenizer::DataState;
 925 |         stateFn = &HTMLTokenizerPrivate::dataState;
 926 |         currentToken = new HTMLToken(HTMLToken::DocTypeToken);
 927 |         currentToken->forceQuirks = true;
 928 |         emitCurrentToken();
 929 |         streamUnconsume();
 930 |     } else if (IS_SPACE_CHARACTER(data)) {
 931 |         state = HTMLTokenizer::AfterDocTypeNameState;
 932 |         stateFn = &HTMLTokenizerPrivate::afterDocTypeNameState;
 933 |     } else if (data == '>') {
 934 |         state = HTMLTokenizer::DataState;
 935 |         stateFn = &HTMLTokenizerPrivate::dataState;
 936 |         emitCurrentToken();
 937 |     } else if (IS_ASCII_UPPERCASE(data)) {
 938 |         currentToken->name.append(data.toLower());
 939 |     } else if (data.isNull()) {
 940 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
 941 |         currentToken->name.append(QChar::ReplacementCharacter);
 942 |     } else {
 943 |         currentToken->name.append(data);
 944 |     }
 945 | 
 946 |     return true;
 947 | }
 948 | 
 949 | // https://html.spec.whatwg.org/multipage/syntax.html#after-doctype-name-state
 950 | bool HTMLTokenizerPrivate::afterDocTypeNameState()
 951 | {
 952 |     Q_Q(HTMLTokenizer);
 953 | 
 954 |     QChar data;
 955 |     do {
 956 |         if (!consumeStream(data)) {
 957 |             Q_EMIT q->parserError(QStringLiteral("eof-in-doctype"));
 958 |             state = HTMLTokenizer::DataState;
 959 |             stateFn = &HTMLTokenizerPrivate::dataState;
 960 |             currentToken->forceQuirks = true;
 961 |             emitCurrentToken();
 962 |             streamUnconsume();
 963 |             return true;
 964 |         }
 965 |     } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters
 966 | 
 967 |     if (data == '>') {
 968 |         state = HTMLTokenizer::DataState;
 969 |         stateFn = &HTMLTokenizerPrivate::dataState;
 970 |         emitCurrentToken();
 971 |     } else {
 972 |         int initalPos = streamPos();
 973 |         if (data == 'p' || data == 'P' ||
 974 |                 data == 's' || data == 'S') {
 975 |             QString charStack = data;
 976 |             // consume more 5 chars
 977 |             for (int i = 0; i < 5; ++i) {
 978 |                 // TODO check this
 979 |                 consumeStream(data);
 980 |                 charStack.append(data);
 981 |             }
 982 | 
 983 |             if (charStack.compare(QLatin1String("PUBLIC"), Qt::CaseInsensitive) == 0) {
 984 |                 state = HTMLTokenizer::AfterDocTypePublicKeywordState;
 985 |                 stateFn = &HTMLTokenizerPrivate::afterDocTypePublicKeywordState;
 986 |                 return true;
 987 |             } else if (charStack.compare(QLatin1String("SYSTEM"), Qt::CaseInsensitive) == 0) {
 988 |                 state = HTMLTokenizer::AfterDocTypeSystemKeywordState;
 989 |                 stateFn = &HTMLTokenizerPrivate::afterDocTypeSystemKeywordState;
 990 |                 return true;
 991 |             }
 992 |         }
 993 | 
 994 |         Q_EMIT q->parserError(QStringLiteral("expected-space-or-right-bracket-in-doctype"));
 995 |         state = HTMLTokenizer::BogusDocTypeState;
 996 |         stateFn = &HTMLTokenizerPrivate::bogusDocTypeState;
 997 |         currentToken->forceQuirks = true;
 998 |         streamSeek(initalPos);
 999 |     }
1000 | 
1001 |     return true;
1002 | }
1003 | 
1004 | bool HTMLTokenizerPrivate::afterDocTypePublicKeywordState()
1005 | {
1006 |     Q_Q(HTMLTokenizer);
1007 | 
1008 |     QChar data;
1009 | 
1010 |     if (!consumeStream(data)) {
1011 |         Q_EMIT q->parserError(QStringLiteral("eof-in-doctype"));
1012 |         state = HTMLTokenizer::DataState;
1013 |         stateFn = &HTMLTokenizerPrivate::dataState;
1014 |         currentToken->forceQuirks = true;
1015 |         emitCurrentToken();
1016 |         streamUnconsume();
1017 |     } else if (IS_SPACE_CHARACTER(data)) {
1018 |         state = HTMLTokenizer::BeforeDocTypePublicIdentifierState;
1019 |         stateFn = &HTMLTokenizerPrivate::beforeDocTypePublicIdentifierState;
1020 |     } else if (data == '"') {
1021 |         Q_EMIT q->parserError(QStringLiteral("unexpected-double-quote-in-doctype"));
1022 |         currentToken->doctypePublicId = "";
1023 |         state = HTMLTokenizer::DocTypePublicIdentifierDoubleQuotedState;
1024 |         stateFn = &HTMLTokenizerPrivate::docTypePublicIdentifierDoubleQuotedState;
1025 |     } else if (data == '\'') {
1026 |         Q_EMIT q->parserError(QStringLiteral("unexpected-single-quote-in-doctype"));
1027 |         currentToken->doctypePublicId = "";
1028 |         state = HTMLTokenizer::DocTypePublicIdentifierSingleQuotedState;
1029 |         stateFn = &HTMLTokenizerPrivate::docTypePublicIdentifierSingleQuotedState;
1030 |     } else if (data == '>') {
1031 |         Q_EMIT q->parserError(QStringLiteral("unexpected-single-quote-in-doctype"));
1032 |         currentToken->forceQuirks = true;
1033 |         state = HTMLTokenizer::DataState;
1034 |         stateFn = &HTMLTokenizerPrivate::dataState;
1035 |         emitCurrentToken();
1036 |     } else {
1037 |         Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype"));
1038 |         currentToken->forceQuirks = true;
1039 |         emitCurrentToken();
1040 |         state = HTMLTokenizer::BogusDocTypeState;
1041 |         stateFn = &HTMLTokenizerPrivate::bogusDocTypeState;
1042 |     }
1043 | 
1044 |     return true;
1045 | }
1046 | 
1047 | bool HTMLTokenizerPrivate::beforeDocTypePublicIdentifierState()
1048 | {
1049 |     Q_Q(HTMLTokenizer);
1050 | 
1051 |     QChar data;
1052 |     do {
1053 |         if (!consumeStream(data)) {
1054 |             Q_EMIT q->parserError(QStringLiteral("eof-in-doctype"));
1055 |             state = HTMLTokenizer::DataState;
1056 |             stateFn = &HTMLTokenizerPrivate::dataState;
1057 |             currentToken->forceQuirks = true;
1058 |             emitCurrentToken();
1059 |             streamUnconsume();
1060 |             return true;
1061 |         }
1062 |     } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters
1063 | 
1064 |     if (data == '"') {
1065 |         currentToken->doctypePublicId = "";
1066 |         state = HTMLTokenizer::DocTypePublicIdentifierDoubleQuotedState;
1067 |         stateFn = &HTMLTokenizerPrivate::docTypePublicIdentifierDoubleQuotedState;
1068 |     } else if (data == '\'') {
1069 |         currentToken->doctypePublicId = "";
1070 |         state = HTMLTokenizer::DocTypePublicIdentifierSingleQuotedState;
1071 |         stateFn = &HTMLTokenizerPrivate::docTypePublicIdentifierSingleQuotedState;
1072 |     } else if (data == '>') {
1073 |         Q_EMIT q->parserError(QStringLiteral("unexpected-end-of-doctype"));
1074 |         currentToken->forceQuirks = true;
1075 |         state = HTMLTokenizer::DataState;
1076 |         stateFn = &HTMLTokenizerPrivate::dataState;
1077 |         emitCurrentToken();
1078 |     } else {
1079 |         Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype"));
1080 |         currentToken->forceQuirks = true;
1081 |         emitCurrentToken();
1082 |         state = HTMLTokenizer::BogusDocTypeState;
1083 |         stateFn = &HTMLTokenizerPrivate::bogusDocTypeState;
1084 |     }
1085 | 
1086 |     return true;
1087 | }
1088 | 
1089 | bool HTMLTokenizerPrivate::docTypePublicIdentifierDoubleQuotedState()
1090 | {
1091 |     Q_Q(HTMLTokenizer);
1092 | 
1093 |     QChar data;
1094 | 
1095 |     if (!consumeStream(data)) {
1096 |         Q_EMIT q->parserError(QStringLiteral("eof-in-doctype"));
1097 |         state = HTMLTokenizer::DataState;
1098 |         stateFn = &HTMLTokenizerPrivate::dataState;
1099 |         currentToken->forceQuirks = true;
1100 |         emitCurrentToken();
1101 |         streamUnconsume();
1102 |     } else if (data == '"') {
1103 |         state = HTMLTokenizer::AfterDocTypePublicIdentifierState;
1104 |         stateFn = &HTMLTokenizerPrivate::afterDocTypePublicIdentifierState;
1105 |     } else if (data.isNull()) {
1106 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
1107 |         currentToken->name.append(QChar::ReplacementCharacter);
1108 |     } else if (data == '>') {
1109 |         Q_EMIT q->parserError(QStringLiteral("unexpected-end-of-doctype"));
1110 |         currentToken->forceQuirks = true;
1111 |         state = HTMLTokenizer::DataState;
1112 |         stateFn = &HTMLTokenizerPrivate::dataState;
1113 |         emitCurrentToken();
1114 |     } else {
1115 |         currentToken->doctypePublicId.append(data);
1116 |     }
1117 | 
1118 |     return true;
1119 | }
1120 | 
1121 | bool HTMLTokenizerPrivate::docTypePublicIdentifierSingleQuotedState()
1122 | {
1123 |     Q_Q(HTMLTokenizer);
1124 | 
1125 |     QChar data;
1126 | 
1127 |     if (!consumeStream(data)) {
1128 |         Q_EMIT q->parserError(QStringLiteral("eof-in-doctype"));
1129 |         state = HTMLTokenizer::DataState;
1130 |         stateFn = &HTMLTokenizerPrivate::dataState;
1131 |         currentToken->forceQuirks = true;
1132 |         emitCurrentToken();
1133 |         streamUnconsume();
1134 |     } else if (data == '\'') {
1135 |         state = HTMLTokenizer::AfterDocTypePublicIdentifierState;
1136 |         stateFn = &HTMLTokenizerPrivate::afterDocTypePublicIdentifierState;
1137 |     } else if (data.isNull()) {
1138 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
1139 |         currentToken->name.append(QChar::ReplacementCharacter);
1140 |     } else if (data == '>') {
1141 |         Q_EMIT q->parserError(QStringLiteral("unexpected-end-of-doctype"));
1142 |         currentToken->forceQuirks = true;
1143 |         state = HTMLTokenizer::DataState;
1144 |         stateFn = &HTMLTokenizerPrivate::dataState;
1145 |         emitCurrentToken();
1146 |     } else {
1147 |         currentToken->doctypePublicId.append(data);
1148 |     }
1149 | 
1150 |     return true;
1151 | }
1152 | 
1153 | bool HTMLTokenizerPrivate::afterDocTypePublicIdentifierState()
1154 | {
1155 |     Q_Q(HTMLTokenizer);
1156 | 
1157 |     QChar data;
1158 | 
1159 |     if (!consumeStream(data)) {
1160 |         Q_EMIT q->parserError(QStringLiteral("eof-in-doctype"));
1161 |         state = HTMLTokenizer::DataState;
1162 |         stateFn = &HTMLTokenizerPrivate::dataState;
1163 |         currentToken->forceQuirks = true;
1164 |         emitCurrentToken();
1165 |         streamUnconsume();
1166 |     } else if (IS_SPACE_CHARACTER(data)) {
1167 |         state = HTMLTokenizer::BetweenDocTypePublicAndSystemIdentifierState;
1168 |         stateFn = &HTMLTokenizerPrivate::betweenDocTypePublicAndSystemIdentifierState;
1169 |     } else if (data == '>') {
1170 |         state = HTMLTokenizer::DataState;
1171 |         stateFn = &HTMLTokenizerPrivate::dataState;
1172 |         emitCurrentToken();
1173 |     } else if (data == '"') {
1174 |         Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype"));
1175 |         currentToken->doctypeSystemId = "";
1176 |         state = HTMLTokenizer::DocTypeSystemIdentifierDoubleQuotedState;
1177 |         stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierDoubleQuotedState;
1178 |     } else if (data == '\'') {
1179 |         Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype"));
1180 |         currentToken->doctypeSystemId = "";
1181 |         state = HTMLTokenizer::DocTypeSystemIdentifierSingleQuotedState;
1182 |         stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierSingleQuotedState;
1183 |     } else {
1184 |         q->parserError(QStringLiteral("unexpected-char-in-doctype"));
1185 |         currentToken->forceQuirks = true;
1186 |         state = HTMLTokenizer::BogusDocTypeState;
1187 |         stateFn = &HTMLTokenizerPrivate::bogusDocTypeState;
1188 |     }
1189 | 
1190 |     return true;
1191 | }
1192 | 
1193 | bool HTMLTokenizerPrivate::betweenDocTypePublicAndSystemIdentifierState()
1194 | {
1195 |     Q_Q(HTMLTokenizer);
1196 | 
1197 |     QChar data;
1198 |     do {
1199 |         if (!consumeStream(data)) {
1200 |             Q_EMIT q->parserError(QStringLiteral("eof-in-doctype"));
1201 |             state = HTMLTokenizer::DataState;
1202 |             stateFn = &HTMLTokenizerPrivate::dataState;
1203 |             currentToken->forceQuirks = true;
1204 |             emitCurrentToken();
1205 |             streamUnconsume();
1206 |             return true;
1207 |         }
1208 |     } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters
1209 | 
1210 |     if (data == '>') {
1211 |         state = HTMLTokenizer::DataState;
1212 |         stateFn = &HTMLTokenizerPrivate::dataState;
1213 |         emitCurrentToken();
1214 |     } else if (data == '"') {
1215 |         currentToken->doctypeSystemId = "";
1216 |         state = HTMLTokenizer::DocTypeSystemIdentifierDoubleQuotedState;
1217 |         stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierDoubleQuotedState;
1218 |     } else if (data == '\'') {
1219 |         currentToken->doctypeSystemId = "";
1220 |         state = HTMLTokenizer::DocTypeSystemIdentifierSingleQuotedState;
1221 |         stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierSingleQuotedState;
1222 |     } else {
1223 |         q->parserError(QStringLiteral("unexpected-char-in-doctype"));
1224 |         currentToken->forceQuirks = true;
1225 |         state = HTMLTokenizer::BogusDocTypeState;
1226 |         stateFn = &HTMLTokenizerPrivate::bogusDocTypeState;
1227 |     }
1228 | 
1229 |     return true;
1230 | }
1231 | 
1232 | bool HTMLTokenizerPrivate::afterDocTypeSystemKeywordState()
1233 | {
1234 |     Q_Q(HTMLTokenizer);
1235 | 
1236 |     QChar data;
1237 | 
1238 |     if (!consumeStream(data)) {
1239 |         Q_EMIT q->parserError(QStringLiteral("eof-in-doctype"));
1240 |         state = HTMLTokenizer::DataState;
1241 |         stateFn = &HTMLTokenizerPrivate::dataState;
1242 |         currentToken->forceQuirks = true;
1243 |         emitCurrentToken();
1244 |         streamUnconsume();
1245 |     } else if (IS_SPACE_CHARACTER(data)) {
1246 |         state = HTMLTokenizer::BeforeDocTypeSystemIdentifierState;
1247 |         stateFn = &HTMLTokenizerPrivate::beforeDocTypeSystemIdentifierState;
1248 |     } else if (data == '>') {
1249 |         state = HTMLTokenizer::DataState;
1250 |         stateFn = &HTMLTokenizerPrivate::dataState;
1251 |         emitCurrentToken();
1252 |     } else if (data == '"') {
1253 |         Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype"));
1254 |         currentToken->doctypeSystemId = "";
1255 |         state = HTMLTokenizer::DocTypeSystemIdentifierDoubleQuotedState;
1256 |         stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierDoubleQuotedState;
1257 |     } else if (data == '\'') {
1258 |         Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype"));
1259 |         currentToken->doctypeSystemId = "";
1260 |         state = HTMLTokenizer::DocTypeSystemIdentifierSingleQuotedState;
1261 |         stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierSingleQuotedState;
1262 |     } else {
1263 |         q->parserError(QStringLiteral("unexpected-char-in-doctype"));
1264 |         currentToken->forceQuirks = true;
1265 |         state = HTMLTokenizer::BogusDocTypeState;
1266 |         stateFn = &HTMLTokenizerPrivate::bogusDocTypeState;
1267 |     }
1268 | 
1269 |     return true;
1270 | }
1271 | 
1272 | bool HTMLTokenizerPrivate::beforeDocTypeSystemIdentifierState()
1273 | {
1274 |     Q_Q(HTMLTokenizer);
1275 | 
1276 |     QChar data;
1277 |     do {
1278 |         if (!consumeStream(data)) {
1279 |             Q_EMIT q->parserError(QStringLiteral("eof-in-doctype"));
1280 |             state = HTMLTokenizer::DataState;
1281 |             stateFn = &HTMLTokenizerPrivate::dataState;
1282 |             currentToken->forceQuirks = true;
1283 |             emitCurrentToken();
1284 |             streamUnconsume();
1285 |             return true;
1286 |         }
1287 |     } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters
1288 | 
1289 |     if (data == '"') {
1290 |         currentToken->doctypeSystemId = "";
1291 |         state = HTMLTokenizer::DocTypeSystemIdentifierDoubleQuotedState;
1292 |         stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierDoubleQuotedState;
1293 |     } else if (data == '\'') {
1294 |         currentToken->doctypeSystemId = "";
1295 |         state = HTMLTokenizer::DocTypeSystemIdentifierSingleQuotedState;
1296 |         stateFn = &HTMLTokenizerPrivate::docTypeSystemIdentifierSingleQuotedState;
1297 |     } else if (data == '>') {
1298 |         Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype"));
1299 |         currentToken->forceQuirks = true;
1300 |         state = HTMLTokenizer::DataState;
1301 |         stateFn = &HTMLTokenizerPrivate::dataState;
1302 |         emitCurrentToken();
1303 |     } else {
1304 |         Q_EMIT q->parserError(QStringLiteral("unexpected-char-in-doctype"));
1305 |         currentToken->forceQuirks = true;
1306 |         state = HTMLTokenizer::BogusDocTypeState;
1307 |         stateFn = &HTMLTokenizerPrivate::bogusDocTypeState;
1308 |     }
1309 | 
1310 |     return true;
1311 | }
1312 | 
1313 | bool HTMLTokenizerPrivate::docTypeSystemIdentifierDoubleQuotedState()
1314 | {
1315 |     Q_Q(HTMLTokenizer);
1316 | 
1317 |     QChar data;
1318 | 
1319 |     if (!consumeStream(data)) {
1320 |         Q_EMIT q->parserError(QStringLiteral("eof-in-doctype"));
1321 |         state = HTMLTokenizer::DataState;
1322 |         stateFn = &HTMLTokenizerPrivate::dataState;
1323 |         currentToken->forceQuirks = true;
1324 |         emitCurrentToken();
1325 |         streamUnconsume();
1326 |     } else if (data == '"') {
1327 |         state = HTMLTokenizer::AfterDocTypeSystemIdentifierState;
1328 |         stateFn = &HTMLTokenizerPrivate::afterDocTypeSystemIdentifierState;
1329 |     } else if (data.isNull()) {
1330 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
1331 |         currentToken->doctypeSystemId.append(QChar::ReplacementCharacter);
1332 |         state = HTMLTokenizer::BeforeDocTypeSystemIdentifierState;
1333 |         stateFn = &HTMLTokenizerPrivate::beforeDocTypeSystemIdentifierState;
1334 |     } else if (data == '>') {
1335 |         Q_EMIT q->parserError(QStringLiteral("unexpected-end-of-doctype"));
1336 |         currentToken->forceQuirks = true;
1337 |         state = HTMLTokenizer::DataState;
1338 |         stateFn = &HTMLTokenizerPrivate::dataState;
1339 |         emitCurrentToken();
1340 |     } else {
1341 |         currentToken->doctypeSystemId.append(data);
1342 |     }
1343 | 
1344 |     return true;
1345 | }
1346 | 
1347 | bool HTMLTokenizerPrivate::docTypeSystemIdentifierSingleQuotedState()
1348 | {
1349 |     Q_Q(HTMLTokenizer);
1350 | 
1351 |     QChar data;
1352 | 
1353 |     if (!consumeStream(data)) {
1354 |         Q_EMIT q->parserError(QStringLiteral("eof-in-doctype"));
1355 |         state = HTMLTokenizer::DataState;
1356 |         stateFn = &HTMLTokenizerPrivate::dataState;
1357 |         currentToken->forceQuirks = true;
1358 |         emitCurrentToken();
1359 |         streamUnconsume();
1360 |     } else if (data == '\'') {
1361 |         state = HTMLTokenizer::AfterDocTypeSystemIdentifierState;
1362 |         stateFn = &HTMLTokenizerPrivate::afterDocTypeSystemIdentifierState;
1363 |     } else if (data.isNull()) {
1364 |         Q_EMIT q->parserError(QStringLiteral("invalid-codepoint"));
1365 |         currentToken->doctypeSystemId.append(QChar::ReplacementCharacter);
1366 |         state = HTMLTokenizer::BeforeDocTypeSystemIdentifierState;
1367 |         stateFn = &HTMLTokenizerPrivate::beforeDocTypeSystemIdentifierState;
1368 |     } else if (data == '>') {
1369 |         Q_EMIT q->parserError(QStringLiteral("unexpected-end-of-doctype"));
1370 |         currentToken->forceQuirks = true;
1371 |         state = HTMLTokenizer::DataState;
1372 |         stateFn = &HTMLTokenizerPrivate::dataState;
1373 |         emitCurrentToken();
1374 |     }  else {
1375 |         currentToken->doctypeSystemId.append(data);
1376 |     }
1377 | 
1378 |     return true;
1379 | }
1380 | 
1381 | bool HTMLTokenizerPrivate::afterDocTypeSystemIdentifierState()
1382 | {
1383 |     Q_Q(HTMLTokenizer);
1384 | 
1385 |     QChar data;
1386 |     do {
1387 |         if (!consumeStream(data)) {
1388 |             Q_EMIT q->parserError(QStringLiteral("eof-in-doctype"));
1389 |             state = HTMLTokenizer::DataState;
1390 |             stateFn = &HTMLTokenizerPrivate::dataState;
1391 |             currentToken->forceQuirks = true;
1392 |             emitCurrentToken();
1393 |             streamUnconsume();
1394 |             return true;
1395 |         }
1396 |     } while (IS_SPACE_CHARACTER(data)); // Ignore all space characters
1397 | 
1398 |     if (data == '>') {
1399 |         state = HTMLTokenizer::DataState;
1400 |         stateFn = &HTMLTokenizerPrivate::dataState;
1401 |         emitCurrentToken();
1402 |     } else {
1403 |         q->parserError(QStringLiteral("unexpected-char-in-doctype"));
1404 |         currentToken->forceQuirks = true;
1405 |         state = HTMLTokenizer::BogusDocTypeState;
1406 |         stateFn = &HTMLTokenizerPrivate::bogusDocTypeState;
1407 |     }
1408 | 
1409 |     return true;
1410 | }
1411 | 
1412 | bool HTMLTokenizerPrivate::bogusDocTypeState()
1413 | {
1414 |     Q_Q(HTMLTokenizer);
1415 | 
1416 |     QChar data;
1417 |     if (!consumeStream(data)) {
1418 |         state = HTMLTokenizer::DataState;
1419 |         stateFn = &HTMLTokenizerPrivate::dataState;
1420 |         emitCurrentToken();
1421 |         streamUnconsume();
1422 |     } else if (data == '>') {
1423 |         state = HTMLTokenizer::DataState;
1424 |         stateFn = &HTMLTokenizerPrivate::dataState;
1425 |         emitCurrentToken();
1426 |     }
1427 | 
1428 |     return true;
1429 | }
1430 | 
1431 | bool HTMLTokenizerPrivate::cDataSectionState()
1432 | {
1433 |     // TODO
1434 |     return true;
1435 | }
1436 | 
1437 | // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
1438 | QString HTMLTokenizerPrivate::consumeEntity(QChar *allowedChar)
1439 | {
1440 |     Q_Q(HTMLTokenizer);
1441 | 
1442 |     int initalPos = streamPos();
1443 |     QString output = QStringLiteral("&");
1444 | 
1445 |     QChar data;
1446 |     if (!consumeStream(data) ||
1447 |             IS_SPACE_CHARACTER(data) || data == '<' || data == '&' ||
1448 |             (allowedChar && data == *allowedChar)) {
1449 |         // Not a character reference. No characters are consumed,
1450 |         // and nothing is returned. (This is not an error, either.)
1451 |         streamUnconsume();
1452 |         return QString();
1453 |     } else if (data == '#') {
1454 |         output.append(data);
1455 | 
1456 |         // TODO check this
1457 |         consumeStream(data);
1458 |         QChar number;
1459 |         if (data == 'x' || data == 'X') {
1460 |             number = consumeNumberEntity(true);
1461 |         } else {
1462 |             number = consumeNumberEntity(false);
1463 |         }
1464 | 
1465 |         if (number.isNull()) {
1466 |             q->parserError(QStringLiteral("expected-numeric-entity"));
1467 |             // unconsume all characters
1468 |             streamSeek(initalPos);
1469 |             return QString();
1470 |         }
1471 | 
1472 |         return number;
1473 |     } else {
1474 | 
1475 |     }
1476 |     return QString();
1477 | }
1478 | 
1479 | QChar HTMLTokenizerPrivate::consumeNumberEntity(bool isHex)
1480 | {
1481 |     Q_Q(HTMLTokenizer);
1482 | 
1483 |     QChar ret;
1484 |     QString charStack;
1485 |     QChar c;
1486 |     // TODO check this
1487 |     consumeStream(c);
1488 |     int lastPos = streamPos();
1489 |     if (isHex) {
1490 |         while (IS_ASCII_HEX_DIGITS(c) &&
1491 |                !streamAtEnd()) {
1492 |             charStack.append(c); // store the position to rewind for ;
1493 |             lastPos = streamPos();
1494 |             // TODO check this
1495 |             consumeStream(c);
1496 |         }
1497 |     } else {
1498 |         while (IS_ASCII_DIGITS(c) && // Zero (0) to Nine (9)
1499 |                !streamAtEnd()) {
1500 |             charStack.append(c);
1501 |             lastPos = streamPos(); // store the position to rewind for ;
1502 |             // TODO check this
1503 |             consumeStream(c);
1504 |         }
1505 |     }
1506 | 
1507 |     // No char was found return null to unconsume
1508 |     if (charStack.isNull()) {
1509 |         return QChar::Null;
1510 |     }
1511 | 
1512 |     // Discard the ; if present. Otherwise, put it back on the queue and
1513 |     // invoke parseError on parser.
1514 |     if (c != ';') {
1515 |         q->parserError(QStringLiteral("numeric-entity-without-semicolon"));
1516 |         streamSeek(lastPos);
1517 |     }
1518 | 
1519 |     // Convert the number using the proper base
1520 |     bool ok;
1521 |     int charAsInt = charStack.toInt(&ok, isHex ? 16 : 10);
1522 |     if (!ok) {
1523 |         // TODO error
1524 |     }
1525 | 
1526 |     // Certain characters get replaced with others
1527 |     QMap<int,int>::ConstIterator it = replacementCharacters.constFind(charAsInt);
1528 |     if (it != replacementCharacters.constEnd()) {
1529 |         ret = it.value();
1530 |         q->parserError(QString("illegal-codepoint-for-numeric-entity: %1").arg(charStack));
1531 |     } else if ((charAsInt >= 0xD800 && charAsInt <= 0xDFFF) || charAsInt > 0x10FFFF) {
1532 |         ret = QChar::ReplacementCharacter;
1533 |         q->parserError(QString("illegal-codepoint-for-numeric-entity: %1").arg(charStack));
1534 |     } else {
1535 |         if ((0x0001 <= charAsInt && charAsInt <= 0x0008) ||
1536 |                 (0x000E <= charAsInt && charAsInt <= 0x001F) ||
1537 |                 (0x007F <= charAsInt && charAsInt <= 0x009F) ||
1538 |                 (0xFDD0 <= charAsInt && charAsInt <= 0xFDEF) ||
1539 |                 (charAsInt == 0x000B || charAsInt == 0xFFFE || charAsInt == 0xFFFF || charAsInt == 0x1FFFE ||
1540 |                  charAsInt == 0x1FFFF || charAsInt == 0x2FFFE || charAsInt == 0x2FFFF || charAsInt == 0x3FFFE ||
1541 |                  charAsInt == 0x3FFFF || charAsInt == 0x4FFFE || charAsInt == 0x4FFFF || charAsInt == 0x5FFFE ||
1542 |                  charAsInt == 0x5FFFF || charAsInt == 0x6FFFE || charAsInt == 0x6FFFF || charAsInt == 0x7FFFE ||
1543 |                  charAsInt == 0x7FFFF || charAsInt == 0x8FFFE || charAsInt == 0x8FFFF || charAsInt == 0x9FFFE ||
1544 |                  charAsInt == 0x9FFFF || charAsInt == 0xAFFFE || charAsInt == 0xAFFFF || charAsInt == 0xBFFFE ||
1545 |                  charAsInt == 0xBFFFF || charAsInt == 0xCFFFE || charAsInt == 0xCFFFF || charAsInt == 0xDFFFE ||
1546 |                  charAsInt == 0xDFFFF || charAsInt == 0xEFFFE || charAsInt == 0xEFFFF || charAsInt == 0xFFFFE ||
1547 |                  charAsInt == 0xFFFFF || charAsInt == 0x10FFFE || charAsInt == 0x10FFFF)) {
1548 |             q->parserError(QString("illegal-codepoint-for-numeric-entity: %1").arg(charStack));
1549 |             ret = charAsInt;
1550 |         }
1551 |     }
1552 | 
1553 |     return ret;
1554 | }
1555 | 
1556 | void HTMLTokenizerPrivate::emitCurrentToken()
1557 | {
1558 |     Q_Q(HTMLTokenizer);
1559 | 
1560 | //    qDebug() << "emitCurrentToken" << currentToken;
1561 |     HTMLToken *token = currentToken;
1562 |     if (token->type == HTMLToken::EndTagToken) {
1563 |         if (!token->data.isEmpty()) {
1564 |             Q_EMIT q->parserError(QStringLiteral("attributes-in-end-tag"));
1565 |         }
1566 | 
1567 |         if (token->selfClosing) {
1568 |             Q_EMIT q->parserError(QStringLiteral("self-closing-flag-on-end-tag"));
1569 |         }
1570 |     }
1571 |     Q_EMIT q->token(token);
1572 | 
1573 |     currentToken = 0;
1574 | }
1575 | 
1576 | QMap<QString, QString> HTMLToken::dataItems()
1577 | {
1578 |     QMap<QString, QString> ret;
1579 |     for (const std::pair<QString,QString> &pair : data) {
1580 |         ret.insertMulti(pair.first, pair.second);
1581 |     }
1582 |     return ret;
1583 | }
1584 | 


--------------------------------------------------------------------------------
/html-qt/htmltokenizer.h:
--------------------------------------------------------------------------------
  1 | #ifndef HTMLTOKENIZER_H
  2 | #define HTMLTOKENIZER_H
  3 | 
  4 | #include <QObject>
  5 | 
  6 | class HTMLParser;
  7 | class HTMLToken;
  8 | class HTMLTokenizerPrivate;
// Public face of the HTML tokenizer. The state functions live in
// HTMLTokenizerPrivate (reached through d_ptr) and report back through the
// protected parserError() and token() methods below.
class HTMLTokenizer : public QObject
{
    Q_OBJECT
    Q_DECLARE_PRIVATE(HTMLTokenizer)
public:
    // Tokenizer states. The names follow the states of the HTML tokenization
    // algorithm (data, tag, attribute, comment, DOCTYPE, CDATA, ...).
    enum State {
        DataState,
        CharacterReferenceInDataState,
        RCDataState,
        CharacterReferenceInRCDataState,
        RawTextState,
        ScriptDataState,
        PlainTextState,
        TagOpenState,
        EndTagOpenState,
        TagNameState,
        RCDataLessThanSignState,
        RCDataEndTagOpenState,
        RCDataEndTagNameState,
        RawTextLessThanSignState,
        RawTextEndTagOpenState,
        RawTextEndTagNameState,
        ScriptDataLessThanSignState,
        ScriptDataEndTagOpenState,
        ScriptDataEndTagNameState,
        ScriptDataEscapeStartState,
        ScriptDataEscapeStartDashState,
        ScriptDataEscapedState,
        ScriptDataEscapedDashState,
        ScriptDataEscapedDashDashState,
        ScriptDataEscapedLessThanSignState,
        ScriptDataEscapedEndTagOpenState,
        ScriptDataEscapedEndTagNameState,
        ScriptDataDoubleEscapeStartState,
        ScriptDataDoubleEscapedState,
        ScriptDataDoubleEscapedDashState,
        ScriptDataDoubleEscapedDashDashState,
        ScriptDataDoubleEscapedLessThanSignState,
        ScriptDataDoubleEscapeEndState,
        BeforeAttributeNameState,
        AttributeNameState,
        AfterAttributeNameState,
        BeforeAttributeValueState,
        AttributeValueDoubleQuotedState,
        AttributeValueSingleQuotedState,
        AttributeValueUnquotedState,
        CharacterReferenceInAttributeValueState,
        AfterAttributeValueQuotedState,
        SelfClosingStartTagState,
        BogusCommentState,
        MarkupDeclarationOpenState,
        CommentStartState,
        CommentStartDashState,
        CommentState,
        CommentEndDashState,
        CommentEndState,
        CommentEndBangState,
        DocTypeState,
        BeforeDocTypeNameState,
        DocTypeNameState,
        AfterDocTypeNameState,
        AfterDocTypePublicKeywordState,
        BeforeDocTypePublicIdentifierState,
        DocTypePublicIdentifierDoubleQuotedState,
        DocTypePublicIdentifierSingleQuotedState,
        AfterDocTypePublicIdentifierState,
        BetweenDocTypePublicAndSystemIdentifierState,
        AfterDocTypeSystemKeywordState,
        BeforeDocTypeSystemIdentifierState,
        DocTypeSystemIdentifierDoubleQuotedState,
        DocTypeSystemIdentifierSingleQuotedState,
        AfterDocTypeSystemIdentifierState,
        BogusDocTypeState,
        CDataSectionState,
    };
    Q_ENUM(State)
    // NOTE(review): presumably `parser` is the consumer of the produced
    // tokens and errors - confirm in htmltokenizer.cpp.
    HTMLTokenizer(HTMLParser *parser);
    ~HTMLTokenizer();

    // Sets the HTML text to tokenize.
    void setHtmlText(const QString &html);

    // Returns the state the tokenizer is currently in.
    State state() const;

    // Starts tokenizing (presumably the text given to setHtmlText() - confirm).
    void start();

protected:
    // Output hooks. parserError() and token() are invoked by the private
    // implementation for each parse error and each finished token.
    // NOTE(review): character() is presumably the hook for single character
    // output - confirm.
    void character(QChar c);
    void parserError(const QString &error);
    void token(HTMLToken *token);

    // Private implementation (d-pointer), accessed through Q_D.
    HTMLTokenizerPrivate *d_ptr;
};
101 | 
102 | #endif // HTMLTOKENIZER_H
103 | 


--------------------------------------------------------------------------------
/html-qt/htmltokenizer_p.h:
--------------------------------------------------------------------------------
  1 | #ifndef HTMLTOKENIZER_P_H
  2 | #define HTMLTOKENIZER_P_H
  3 | 
  4 | #include "htmltokenizer.h"
  5 | 
  6 | #include <QPair>
  7 | #include <QMap>
  8 | #include <QDebug>
  9 | 
 10 | typedef  bool (HTMLTokenizerPrivate::*HTMLTokenizerPrivateMemFn)();
 11 | 
 12 | class HTMLToken // Plain record describing one token handled by HTMLTokenizer.
 13 | {
 14 |     Q_GADGET
 15 | public:
 16 |     enum Type {
 17 |         CharactersToken,
 18 |         SpaceCharactersToken,
 19 |         StartTagToken,
 20 |         EndTagToken,
 21 |         CommentToken,
 22 |         DocTypeToken,
 23 |         ParserErrorToken
 24 |     };
 25 |     Q_ENUM(Type) // replaces the deprecated Q_ENUMS; same macro as Q_ENUM(State) in HTMLTokenizer
 26 |     // Creates a token of the given type with an empty name and no attributes.
 27 |     HTMLToken(Type tokenType) : type(tokenType) {}
 28 |     // Creates a named token (EndTagToken unless tokenType says otherwise), optionally with attributes.
 29 |     HTMLToken(const QString &_name, Type tokenType = EndTagToken,
 30 |               const QVector<std::pair<QString,QString> > &attributes = QVector<std::pair<QString,QString> >(),
 31 |               bool _selfClosing = false)
 32 |         : name(_name)
 33 |         , type(tokenType)
 34 |         , data(attributes)
 35 |         , selfClosing(_selfClosing)
 36 |     {}
 37 |     // Appends c to the name of the last attribute; starts a first attribute when none exists yet.
 38 |     void appendDataCurrentAttributeName(const QChar &c)
 39 |     {
 40 |         if (data.isEmpty()) {
 41 |             data.append({ c, QString()});
 42 |         } else {
 43 |             data.last().first.append(c);
 44 |         }
 45 |     }
 46 |     // Appends c to the value of the last attribute; starts a first (nameless) attribute when none exists yet.
 47 |     void appendDataCurrentAttributeValue(const QChar &c)
 48 |     {
 49 |         if (data.isEmpty()) {
 50 |             data.append({QString(), c});
 51 |         } else {
 52 |             data.last().second.append(c);
 53 |         }
 54 |     }
 55 |     // String overload of the above: appends s to the value of the last attribute.
 56 |     void appendDataCurrentAttributeValue(const QString &s)
 57 |     {
 58 |         if (data.isEmpty()) {
 59 |             data.push_back({QString(), s});
 60 |         } else {
 61 |             data.last().second.append(s);
 62 |         }
 63 |     }
 64 |     // Defined outside this header; presumably returns the attribute pairs as a map - TODO confirm.
 65 |     QMap<QString, QString> dataItems();
 66 | 
 67 |     QString name; // or data for comment or character types
 68 |     Type type; // kind of token, always set by a constructor
 69 |     QString dataStr;
 70 |     QVector<std::pair<QString,QString> > data; // attributes as (name, value) pairs, in insertion order
 71 |     bool selfClosing = false;
 72 |     bool selfClosingAcknowledged = false;
 73 |     bool forceQuirks = false;
 74 |     QString doctypePublicId;
 75 |     QString doctypeSystemId;
 76 | };
 77 | 
 78 | class HTMLTokenizerPrivate // Private data of HTMLTokenizer: input text, cursor, current state and state handlers.
 79 | {
 80 |     Q_DECLARE_PUBLIC(HTMLTokenizer)
 81 | public:
 82 |     // State methods; the active one is referenced by stateFn (bool result is interpreted by the caller)
 83 |     bool dataState();
 84 |     bool characterReferenceInDataState();
 85 |     bool tagOpenState();
 86 |     bool endTagOpenState();
 87 |     bool tagNameState();
 88 |     // ... RC Raw Script
 89 |     bool beforeAttributeNameState();
 90 |     bool attributeNameState();
 91 |     bool afterAttributeNameState();
 92 |     bool beforeAttributeValueState();
 93 |     bool attributeValueDoubleQuotedState();
 94 |     bool attributeValueSingleQuotedState();
 95 |     bool attributeValueUnquotedState();
 96 |     // This method is special as for simplicity it is directly called by the callers
 97 |     void characterReferenceInAttributeValueState(QChar *additionalAllowedCharacter);
 98 |     bool afterAttributeValueQuotedState();
 99 |     bool selfClosingStartTagState();
100 |     bool bogusCommentState();
101 |     bool markupDeclarationOpenState();
102 |     bool commentStartState();
103 |     bool commentStartDashState();
104 |     bool commentState();
105 |     bool commentEndDashState();
106 |     bool commentEndState();
107 |     bool commentEndBangState();
108 |     bool doctypeState();
109 |     bool beforeDocTypeNameState();
110 |     bool docTypeNameState();
111 |     bool afterDocTypeNameState();
112 |     bool afterDocTypePublicKeywordState();
113 |     bool beforeDocTypePublicIdentifierState();
114 |     bool docTypePublicIdentifierDoubleQuotedState();
115 |     bool docTypePublicIdentifierSingleQuotedState();
116 |     bool afterDocTypePublicIdentifierState();
117 |     bool betweenDocTypePublicAndSystemIdentifierState();
118 |     bool afterDocTypeSystemKeywordState();
119 |     bool beforeDocTypeSystemIdentifierState();
120 |     bool docTypeSystemIdentifierDoubleQuotedState();
121 |     bool docTypeSystemIdentifierSingleQuotedState();
122 |     bool afterDocTypeSystemIdentifierState();
123 |     bool bogusDocTypeState();
124 |     bool cDataSectionState();
125 | 
126 |     // auxiliary methods
127 |     inline bool consumeStream(QChar &c) // advance one character; on success store it in c and return true
128 |     {
129 |         if (++htmlPos >= htmlSize || htmlPos < 0) { // note: htmlPos is advanced even when this check fails
130 |             return false;
131 |         } else {
132 |             c = html.at(htmlPos);
133 |             return true;
134 |         }
135 |     }
136 |     // Index of the most recently consumed character (-1 before anything was consumed).
137 |     inline int streamPos() {
138 |         return htmlPos;
139 |     }
140 |     // Moves the cursor to an absolute index; no bounds check is performed.
141 |     inline void streamSeek(int pos) {
142 |         htmlPos = pos;
143 |     }
144 |     // Steps the cursor back by nChars so that those characters are consumed again.
145 |     inline void streamUnconsume(int nChars = 1) {
146 |         htmlPos -= nChars;
147 |     }
148 |     // True when at least nChars more characters lie after the cursor.
149 |     inline bool streamCanRead(int nChars = 1) {
150 |         return htmlPos + nChars < htmlSize;
151 |     }
152 |     // NOTE(review): '>' means htmlPos == htmlSize (the value after consumeStream() first fails) is not "at end" - confirm.
153 |     inline bool streamAtEnd() {
154 |         return htmlPos > htmlSize;
155 |     }
156 | 
157 |     QString consumeEntity(QChar *allowedChar = nullptr);
158 |     QChar consumeNumberEntity(bool isHex);
159 |     void emitCurrentToken();
160 | 
161 |     // current token (null until one is assigned; previously left uninitialised)
162 |     HTMLToken *currentToken = nullptr;
163 |     QVector<HTMLToken *> tokenQueue;
164 |     // Back-pointer used by Q_DECLARE_PUBLIC, and the owning parser; both now start as null instead of garbage.
165 |     HTMLTokenizer *q_ptr = nullptr;
166 |     HTMLParser *parser = nullptr;
167 |     QString html; // text being tokenized
168 |     int htmlPos = -1; // cursor, see streamPos()
169 |     int htmlSize = 0; // presumably html.size(), assigned outside this header - TODO confirm
170 |     HTMLTokenizer::State state = HTMLTokenizer::DataState;
171 |     HTMLTokenizerPrivateMemFn stateFn = &HTMLTokenizerPrivate::dataState;
172 |     QMap<int,int> replacementCharacters = {
173 |         {0x00, 0xFFFD}, // REPLACEMENT CHARACTER
174 |         {0x80, 0x20AC}, // EURO SIGN (€)
175 |         {0x82, 0x201A}, // SINGLE LOW-9 QUOTATION MARK (‚)
176 |         {0x83, 0x0192}, // LATIN SMALL LETTER F WITH HOOK (ƒ)
177 |         {0x84, 0x201E}, // DOUBLE LOW-9 QUOTATION MARK („)
178 |         {0x85, 0x2026}, // HORIZONTAL ELLIPSIS (…)
179 |         {0x86, 0x2020}, // DAGGER (†)
180 |         {0x87, 0x2021}, // DOUBLE DAGGER (‡)
181 |         {0x88, 0x02C6}, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
182 |         {0x89, 0x2030}, // PER MILLE SIGN (‰)
183 |         {0x8A, 0x0160}, // LATIN CAPITAL LETTER S WITH CARON (Š)
184 |         {0x8B, 0x2039}, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
185 |         {0x8C, 0x0152}, // LATIN CAPITAL LIGATURE OE (Œ)
186 |         {0x8E, 0x017D}, // LATIN CAPITAL LETTER Z WITH CARON (Ž)
187 |         {0x91, 0x2018}, // LEFT SINGLE QUOTATION MARK (‘)
188 |         {0x92, 0x2019}, // RIGHT SINGLE QUOTATION MARK (’)
189 |         {0x93, 0x201C}, // LEFT DOUBLE QUOTATION MARK (“)
190 |         {0x94, 0x201D}, // RIGHT DOUBLE QUOTATION MARK (”)
191 |         {0x95, 0x2022}, // BULLET (•)
192 |         {0x96, 0x2013}, // EN DASH (–)
193 |         {0x97, 0x2014}, // EM DASH (—)
194 |         {0x98, 0x02DC}, // SMALL TILDE (˜)
195 |         {0x99, 0x2122}, // TRADE MARK SIGN (™)
196 |         {0x9A, 0x0161}, // LATIN SMALL LETTER S WITH CARON (š)
197 |         {0x9B, 0x203A}, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
198 |         {0x9C, 0x0153}, // LATIN SMALL LIGATURE OE (œ)
199 |         {0x9E, 0x017E}, // LATIN SMALL LETTER Z WITH CARON (ž)
200 |         {0x9F, 0x0178}, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
201 |     };
202 | 
203 | 
204 | };
205 | 
206 | #endif // HTMLTOKENIZER_P_H
207 | 
208 | 


--------------------------------------------------------------------------------
/html-qt/htmltree.cpp:
--------------------------------------------------------------------------------
  1 | #include "htmltree.h"
  2 | 
  3 | #include "htmltokenizer_p.h"
  4 | 
  5 | #include <QLoggingCategory>
  6 | 
  7 | Q_LOGGING_CATEGORY(HTML_TREE, "htmlqt.tree") // logging category used by the qCDebug() calls in this file
  8 | 
  9 | HTMLTree::HTMLTree(const QString &namespaceHTMLElements)
 10 |     : m_defaultNamespace(namespaceHTMLElements.isEmpty()
 11 |                              ? QStringLiteral("http://www.w3.org/1999/xhtml") // default used for an empty argument
 12 |                              : namespaceHTMLElements)
 13 | {
 14 |     // The namespace is settled by the initialiser above; what remains is
 15 |     // to create the initial empty document node, which reset() does (it
 16 |     // also clears the stack of open elements).
 17 |     reset();
 18 | }
 19 | 
 20 | HTMLTree::~HTMLTree()
 21 | {
 22 |     delete m_document; // allocated by reset(); was leaked when the tree was destroyed (child nodes are not freed here)
 23 | }
 24 | 
 25 | void HTMLTree::reset()
 26 | {
 27 |     // Swap the current document node (if any) for a new, empty one.
 28 |     delete m_document;
 29 |     m_document = new HTMLTreeNode();
 30 |     m_openElements.clear(); // and forget every element that was open
 31 | }
 32 | 
 33 | HTMLTreeNode *HTMLTree::document() // accessor for the document (root) node
 34 | {
 35 |     return m_document; // raw pointer; reset() deletes and replaces the node it points to
 36 | }
 37 | 
 38 | void HTMLTree::insertText(QChar c, HTMLTreeNode *parent)
 39 | {
 40 |     qCDebug(HTML_TREE) << Q_FUNC_INFO << c << m_openElements.size();
 41 |     if (!parent && m_openElements.isEmpty()) { // no target at all: QVector::last() must not be called on an empty vector
 42 |         qCWarning(HTML_TREE) << Q_FUNC_INFO << "no open element to receive text";
 43 |         return;
 44 |     }
 45 |     // TODO (from the original): a disabled `if (!m_insertFromTable)` check sat here. Without a parent, use the last open element.
 46 |     (parent ? parent : m_openElements.last())->insertText(c);
 47 | }
 48 | 
 49 | void HTMLTree::inserRoot(HTMLToken *token)
 50 | {
 51 |     HTMLTreeNode *const root = createElement(token); // new element built from the token
 52 |     m_document->appendChild(root);                   // attach it under the document node
 53 |     m_openElements.append(root);                     // and push it on the open-element stack
 54 | }
 55 | 
 56 | void HTMLTree::insertDoctype(HTMLToken *token)
 57 | {
 58 |     qCDebug(HTML_TREE) << Q_FUNC_INFO << token;
 59 |     document()->token = token; // the doctype is kept as the document node's token (pointer only, no copy)
 60 | }
 61 | 
 62 | void HTMLTree::insertComment(HTMLToken *token, HTMLTreeNode *parent)
 63 | {
 64 |     qCDebug(HTML_TREE) << Q_FUNC_INFO; // stub: only traces the call; token and parent are currently ignored
 65 | }
 66 | 
 67 | HTMLTreeNode *HTMLTree::createElement(HTMLToken *token)
 68 | { // Returns a heap-allocated node named after the token; it is not attached to the tree here.
 69 |     HTMLTreeNode *element = new HTMLTreeNode(token->name);
 70 |     for (auto attr = token->data.constBegin(); attr != token->data.constEnd(); ++attr) {
 71 |         element->attributes.insertMulti(attr->first, attr->second); // insertMulti keeps repeated attribute names
 72 |     }
 73 |     return element;
 74 | }
 75 | 
 76 | void HTMLTree::dump()
 77 | {
 78 |     dumpTree(m_document, 0); // print the tree from the document node, starting at depth 0
 79 | }
 80 | 
 81 | QVector<HTMLTreeNode *> HTMLTree::openElements() const // stack of currently open elements
 82 | {
 83 |     return m_openElements; // returned by value; the node pointers themselves are shared with the tree
 84 | }
 85 | 
 86 | HTMLTreeNode *HTMLTree::createNode(int &pos, int lastPos, bool plainText, HTMLTreeNode *parent)
 87 | {
 88 |     qCDebug(HTML_TREE) << Q_FUNC_INFO; // placeholder: traces the call, none of the arguments is used
 89 |     return nullptr;
 90 | }
 91 | 
 92 | void HTMLTree::dumpTree(HTMLTreeNode *root, int level)
 93 | { // Prints one line per node ('-' repeated `level` times, then '>' and a label) and recurses into the children.
 94 |     qDebug() << QByteArray("-").repeated(level).data() << ">" << (root->token ? root->token->name : root->name); // a node need not have a token: fall back to its own name
 95 |     for (HTMLTreeNode *node : root->children) {
 96 |         dumpTree(node, level + 1);
 97 |     }
 98 | }
 99 | 
100 | HTMLTreeNode::HTMLTreeNode(const QString &name)
101 |     : name(name) // store the element name; every other member keeps its in-class default
102 | {
103 | }
104 | 
105 | // Destructor: nothing is released explicitly. The nodes in `children` and
106 | // the `token` pointer are left untouched.
107 | // NOTE(review): confirm who is responsible for deleting the child nodes.
108 | HTMLTreeNode::~HTMLTreeNode() = default;
109 | 
110 | void HTMLTreeNode::appendChild(HTMLTreeNode *node)
111 | {
112 |     children.append(node); // added at the end; NOTE(review): node->parent is not updated here - confirm intended
113 | }
114 | 
115 | void HTMLTreeNode::insertText(const QString &data)
116 | {
117 |     qDebug() << data; // trace the incoming text on the default debug stream
118 |     text += data;     // text is always added at the end of what is already stored
119 | }
120 | 
121 | void HTMLTreeNode::removeChild(HTMLTreeNode *node)
122 | {
123 |     children.removeOne(node); // drops the first matching pointer from the list; the node itself is not deleted
124 | }
125 | 
126 | void HTMLTreeNode::reparentChildren(HTMLTreeNode *node)
127 | { // Hands every child over to `node` and leaves this node without children.
128 |     if (node && node != this) { // a null target would crash; moving to itself would modify `children` while iterating and then drop them
129 |         for (HTMLTreeNode *child : children) { node->appendChild(child); }
130 |         children.clear();
131 |     }
132 | }
133 | 
134 | bool HTMLTreeNode::hasContent() const
135 | {
136 |     return !(text.isEmpty() && children.isEmpty()); // content means some text or at least one child
137 | }
138 | 
139 | QString HTMLTreeNode::asText() const
140 | {
141 |     // Renders the opening tag as <name attr="value" ...>; values are written verbatim, without escaping.
142 |     QString attributesStr;
143 |     for (auto it = attributes.constBegin(); it != attributes.constEnd(); ++it) { // the iterator was never advanced before, so the loop could not end
144 |         if (it.value().isEmpty()) {
145 |             attributesStr += QLatin1Char(' ') + it.key(); // value-less attribute: emit its name (was it.value(), i.e. nothing)
146 |         } else {
147 |             attributesStr += QLatin1Char(' ') + it.key() + QLatin1String("=\"") + it.value() + QLatin1Char('"');
148 |         }
149 |     }
150 | 
151 |     return QLatin1Char('<') + name + attributesStr + QLatin1Char('>');
152 | }
153 | 


--------------------------------------------------------------------------------
/html-qt/htmltree.h:
--------------------------------------------------------------------------------
 1 | #ifndef HTMLTREE_H
 2 | #define HTMLTREE_H
 3 | 
 4 | #include <QObject>
 5 | #include <QStringList>
 6 | #include <QMap>
 7 | #include <QVector>
 8 | 
 9 | class HTMLToken;
10 | class HTMLTreeNode
11 | {
12 | public:
13 |     HTMLTreeNode(const QString &name = QString());
14 |     virtual ~HTMLTreeNode();
15 | 
16 |     QString name;                      // element name, set by the constructor
17 |     HTMLTreeNode *parent = nullptr;
18 |     QVector<HTMLTreeNode *> children;  // child nodes in insertion order
19 |     QMap<QString, QString> attributes; // attribute name -> value
20 |     HTMLToken *token = nullptr;        // associated token, if any (was left uninitialised)
21 |     QStringRef type;
22 |     QString text;                      // text accumulated through insertText()
23 |     bool end = false;
24 |     bool plainText = true;
25 | 
26 |     /*!
27 |      * Append node to the children of the current node
28 |      */
29 |     virtual void appendChild(HTMLTreeNode *node);
30 | 
31 |     /*!
32 |      * Append data to the text of the current node.
33 |      * TODO: support inserting before a given node (insertBefore)
34 |      * instead of always appending to the end of the node's text.
35 |      */
36 |     virtual void insertText(const QString &data);
37 | 
38 |     /*!
39 |      * Remove node from the children of the current node
40 |      */
41 |     virtual void removeChild(HTMLTreeNode *node);
42 | 
43 |     /*!
44 |      * Move all the children of the current node to node.
45 |      * This is needed so that trees that don't store text as nodes move the
46 |      * text in the correct way
47 |      */
48 |     virtual void reparentChildren(HTMLTreeNode *node);
49 | 
50 |     /*!
51 |      * Return true if the node has children or text, false otherwise
52 |      */
53 |     virtual bool hasContent() const;
54 |     // Returns the node rendered as an opening tag: '<' + name + attributes + '>'.
55 |     QString asText() const;
56 | };
57 | 
58 | class HTMLTree
59 | {
60 | public:
61 |     HTMLTree(const QString &namespaceHTMLElements = QString()); // an empty string selects the XHTML namespace
62 |     virtual ~HTMLTree();
63 |     // Clears the open-element stack and replaces the document node with a new empty one.
64 |     void reset();
65 |     // Returns the document (root) node.
66 |     HTMLTreeNode *document();
67 |     // Adds c as text to parent, or to the last open element when parent is null.
68 |     void insertText(QChar c, HTMLTreeNode *parent = nullptr);
69 |     // Creates an element from token, appends it to the document and pushes it on the open-element stack.
70 |     void inserRoot(HTMLToken *token);
71 |     // Stores token as the document node's token.
72 |     void insertDoctype(HTMLToken *token);
73 |     // Currently a stub that only logs the call.
74 |     void insertComment(HTMLToken *token, HTMLTreeNode *parent = nullptr);
75 |     // Returns a new, unattached node carrying the token's name and attributes.
76 |     HTMLTreeNode *createElement(HTMLToken *token);
77 |     // Prints the tree to the debug output.
78 |     void dump();
79 |     // Returns a copy of the open-element stack.
80 |     QVector<HTMLTreeNode*> openElements() const;
81 | 
82 | private:
83 |     HTMLTreeNode *createNode(int &pos, int lastPos, bool plainText, HTMLTreeNode *parent);
84 |     void dumpTree(HTMLTreeNode *root, int level = 0);
85 | 
86 |     QString m_defaultNamespace;
87 |     bool m_useAllowed = false; // was uninitialised; NOTE(review): not referenced in htmltree.cpp
88 |     bool m_insertFromTable = false;
89 |     QStringList m_allowed;
90 |     QString m_content;
91 |     int m_pos = 0;
92 |     QList<HTMLTreeNode*> m_nodes;
93 |     HTMLTreeNode *m_document = nullptr;    // document node; created by reset()
94 |     QVector<HTMLTreeNode*> m_openElements; // stack of open elements, last() is the current one
95 | };
96 | 
97 | #endif // HTMLTREE_H
98 | 


--------------------------------------------------------------------------------