├── .gitignore ├── LICENSE ├── README.md ├── html_parser.hpp ├── test.cpp └── test └── migrate_test ├── test-group1.cpp ├── test-group2.cpp ├── test-group3.cpp ├── test-group4.cpp ├── test-group5.cpp └── test-group6.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | *.smod 19 | 20 | # Compiled Static libraries 21 | *.lai 22 | *.la 23 | *.a 24 | *.lib 25 | 26 | # Executables 27 | *.exe 28 | *.out 29 | *.app 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # htmlparser 2 | 3 | A simple C++ HTML parser. 4 | 5 | ## Intro 6 | 7 | Chinese Introduction 8 | 9 | [https://xilixili.net/2017/03/16/htmlparser/](https://xilixili.net/2017/03/16/htmlparser/) 10 | 11 | - supports HTML and XHTML documents 12 | - supports getElementById(ClassName/TagName) 13 | - supports a simple XPath-style select interface 14 | 15 | ## Usage 16 | 17 | For basic usage, please see the demo in [test.cpp](test.cpp). 18 | 19 | The compiler must support at least TR1 on both Windows and GNU/Linux (for smart_ptr & unordered_set). 20 | 21 | Any C++11 compiler is supported; others may work. 
-------------------------------------------------------------------------------- /html_parser.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 SPLI (rangerlee@foxmail.com) 3 | * Latest version available at: http://github.com/rangerlee/htmlparser.git 4 | * 5 | * A Simple html parser. 6 | * More information can get from README.md 7 | * 8 | */ 9 | 10 | #ifndef HTMLPARSER_HPP_ 11 | #define HTMLPARSER_HPP_ 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #if __cplusplus <= 199711L 22 | #if linux 23 | #include 24 | #else 25 | #include 26 | #endif 27 | using std::tr1::enable_shared_from_this; 28 | using std::tr1::shared_ptr; 29 | using std::tr1::weak_ptr; 30 | #else 31 | 32 | #include 33 | 34 | using std::enable_shared_from_this; 35 | using std::shared_ptr; 36 | using std::weak_ptr; 37 | #endif 38 | 39 | /** 40 | * class HtmlElement 41 | * HTML Element struct 42 | */ 43 | class HtmlElement : public enable_shared_from_this { 44 | public: 45 | friend class HtmlParser; 46 | 47 | friend class HtmlDocument; 48 | 49 | public: 50 | /** 51 | * for children traversals. 52 | */ 53 | typedef std::vector >::const_iterator ChildIterator; 54 | 55 | const ChildIterator ChildBegin() { 56 | return children.begin(); 57 | } 58 | 59 | const ChildIterator ChildEnd() { 60 | return children.end(); 61 | } 62 | 63 | /** 64 | * for attribute traversals. 
65 | */ 66 | typedef std::map::const_iterator AttributeIterator; 67 | 68 | const AttributeIterator AttributeBegin() { 69 | return attribute.begin(); 70 | } 71 | 72 | const AttributeIterator AttributeEnd() { 73 | return attribute.end(); 74 | } 75 | 76 | public: 77 | HtmlElement() {} 78 | 79 | HtmlElement(shared_ptr p) 80 | : parent(p) {} 81 | 82 | std::string GetAttribute(const std::string &k) { 83 | if (attribute.find(k) != attribute.end()) { 84 | return attribute[k]; 85 | } 86 | 87 | return ""; 88 | } 89 | 90 | shared_ptr GetElementById(const std::string &id) { 91 | for (HtmlElement::ChildIterator it = children.begin(); it != children.end(); ++it) { 92 | if ((*it)->GetAttribute("id") == id) return *it; 93 | 94 | shared_ptr r = (*it)->GetElementById(id); 95 | if (r) return r; 96 | } 97 | 98 | return shared_ptr(); 99 | } 100 | 101 | std::vector > GetElementByClassName(const std::string &name) { 102 | std::vector > result; 103 | GetElementByClassName(name, result); 104 | return result; 105 | } 106 | 107 | std::vector > GetElementByTagName(const std::string &name) { 108 | std::vector > result; 109 | GetElementByTagName(name, result); 110 | return result; 111 | } 112 | 113 | void SelectElement(const std::string& rule, std::vector >& result){ 114 | if(rule.empty() || rule.at(0) != '/' || name == "plain") return; 115 | std::string::size_type pos = 0; 116 | if(rule.size() >= 2 && rule.at(1) == '/') { 117 | std::vector > temp; 118 | GetAllElement(temp); 119 | pos = 1; 120 | std::string next = rule.substr(pos); 121 | if(next.empty()) { 122 | for(size_t i = 0; i < temp.size(); i++){ 123 | InsertIfNotExists(result, temp[i]); 124 | } 125 | } else { 126 | for(size_t i = 0; i < temp.size(); i++){ 127 | temp[i]->SelectElement(next, result);; 128 | } 129 | } 130 | } else { 131 | std::string::size_type p = rule.find('/', 1); 132 | std::string line; 133 | if(p == std::string::npos) { 134 | line = rule; 135 | pos = rule.size(); 136 | } else { 137 | line = rule.substr(0, p); 138 | pos 
= p; 139 | } 140 | 141 | enum { x_ele, x_wait_attr, x_attr, x_val }; 142 | std::string ele, attr, oper, val, cond; 143 | int state = x_ele; 144 | for(p = 1; p < pos; ) { 145 | char c = line.at(p++); 146 | switch (state) { 147 | case x_ele: { 148 | if(c == '@') { 149 | state = x_attr; 150 | } else if(c == '!') { 151 | state = x_wait_attr; 152 | cond.append(1,c); 153 | } else if(c == '[') { 154 | state = x_wait_attr; 155 | } else { 156 | ele.append(1,c); 157 | } 158 | } 159 | break; 160 | 161 | case x_wait_attr: { 162 | if(c == '@') state = x_attr; 163 | else if(c == '!') { 164 | cond.append(1,c); 165 | } 166 | } 167 | break; 168 | 169 | case x_attr: { 170 | if(c == '!') { 171 | oper.append(1,c); 172 | } else if(c == '=') { 173 | oper.append(1,c); 174 | state = x_val; 175 | } else if(c == ']') { 176 | state = x_ele; 177 | } else { 178 | attr.append(1,c); 179 | } 180 | } 181 | break; 182 | 183 | case x_val: { 184 | if(c == ']') { 185 | state = x_ele; 186 | } else { 187 | val.append(1,c); 188 | } 189 | } 190 | break; 191 | } 192 | } 193 | 194 | if(!val.empty() && val.at(0) == '\''){ 195 | val.erase(val.begin()); 196 | } 197 | 198 | if(!val.empty() && val.at(val.size() - 1) == '\''){ 199 | val.pop_back(); 200 | } 201 | 202 | bool matched = true; 203 | if(!ele.empty()){ 204 | if(name != ele) { 205 | matched = false; 206 | } 207 | } 208 | 209 | if(cond == "!"){ 210 | if(!attr.empty() && matched){ 211 | if(!oper.empty()){ 212 | std::string v = attribute[attr]; 213 | if(oper == "="){ 214 | if(v == val) matched = false; 215 | if (attr == "class") { 216 | std::set attr_class = SplitClassName(GetAttribute("class")); 217 | if (attr_class.find(val) != attr_class.end()) matched = false; 218 | } 219 | } else if (oper == "!=") { 220 | if (v == val) matched = false; 221 | if (attr == "class") { 222 | std::set attr_class = SplitClassName(GetAttribute("class")); 223 | if (attr_class.find(val) == attr_class.end()) matched = false; 224 | } 225 | } 226 | } else { 227 | 
if(attribute.find(attr) != attribute.end()) matched = false; 228 | } 229 | } 230 | } else { 231 | if (!attr.empty() && matched) { 232 | if (attribute.find(attr) == attribute.end()) { 233 | matched = false; 234 | } else { 235 | std::string v = attribute[attr]; 236 | if (oper == "=") { 237 | if (v != val) matched = false; 238 | if (attr == "class") { 239 | std::set attr_class = SplitClassName(GetAttribute("class")); 240 | if (attr_class.find(val) == attr_class.end()) matched = false; 241 | } 242 | } else if (oper == "!=") { 243 | if (v == val) matched = false; 244 | if (attr == "class") { 245 | std::set attr_class = SplitClassName(GetAttribute("class")); 246 | if (attr_class.find(val) != attr_class.end()) matched = false; 247 | } 248 | } 249 | } 250 | } 251 | } 252 | 253 | std::string next = rule.substr(pos); 254 | if(matched) { 255 | if(next.empty()) 256 | InsertIfNotExists(result, shared_from_this()); 257 | else { 258 | for(ChildIterator it = ChildBegin(); it != ChildEnd(); it++){ 259 | (*it)->SelectElement(next, result); 260 | } 261 | } 262 | } 263 | }; 264 | } 265 | 266 | shared_ptr GetParent() { 267 | return parent.lock(); 268 | } 269 | 270 | const std::string &GetValue() { 271 | if(value.empty() && children.size() == 1 && children[0]->GetName() == "plain"){ 272 | return children[0]->GetValue(); 273 | } 274 | 275 | return value; 276 | } 277 | 278 | const std::string &GetName() { 279 | return name; 280 | } 281 | 282 | std::string text(){ 283 | std::string str; 284 | PlainStylize(str); 285 | return str; 286 | } 287 | 288 | void PlainStylize(std::string& str){ 289 | if(name == "head" || name == "meta" || name == "style" || name == "script" || name == "link"){ 290 | return ; 291 | } 292 | 293 | if(name == "plain"){ 294 | str.append(value); 295 | return; 296 | } 297 | 298 | for (size_t i = 0; i < children.size();) { 299 | children[i]->PlainStylize(str); 300 | 301 | if (++i < children.size()) { 302 | std::string ele = children[i]->GetName(); 303 | if (ele == "td") { 
304 | str.append("\t"); 305 | } 306 | else if (ele == "tr" || ele == "br" || ele == "div" || ele == "p" || ele == "hr" || ele == "area" || 307 | ele == "h1" || ele == "h2" || ele == "h3" || ele == "h4" || ele == "h5" || ele == "h6" || ele == "h7") { 308 | str.append("\n"); 309 | } 310 | } 311 | } 312 | } 313 | 314 | std::string html(){ 315 | std::string str; 316 | HtmlStylize(str); 317 | return str; 318 | } 319 | 320 | void HtmlStylize(std::string& str) { 321 | if (name.empty()) { 322 | for (size_t i = 0; i < children.size(); i++) { 323 | children[i]->HtmlStylize(str); 324 | } 325 | 326 | return; 327 | } else if(name == "plain"){ 328 | str.append(value); 329 | return; 330 | } 331 | 332 | str.append("<" + name); 333 | std::map::const_iterator it = attribute.begin(); 334 | for (; it != attribute.end(); it++) { 335 | str.append(" " + it->first + "=\"" + it->second + "\""); 336 | } 337 | str.append(">"); 338 | 339 | if (children.empty()) { 340 | str.append(value); 341 | } else { 342 | for (size_t i = 0; i < children.size(); i++) { 343 | children[i]->HtmlStylize(str); 344 | } 345 | } 346 | 347 | str.append(""); 348 | } 349 | 350 | private: 351 | void GetElementByClassName(const std::string &name, std::vector > &result) { 352 | for (HtmlElement::ChildIterator it = children.begin(); it != children.end(); ++it) { 353 | std::set attr_class = SplitClassName((*it)->GetAttribute("class")); 354 | std::set class_name = SplitClassName(name); 355 | 356 | std::set::const_iterator iter = class_name.begin(); 357 | for(; iter != class_name.end(); ++iter){ 358 | if(attr_class.find(*iter) == attr_class.end()){ 359 | break; 360 | } 361 | } 362 | 363 | if(iter == class_name.end()){ 364 | InsertIfNotExists(result, *it); 365 | } 366 | 367 | (*it)->GetElementByClassName(name, result); 368 | } 369 | } 370 | 371 | void GetElementByTagName(const std::string &name, std::vector > &result) { 372 | for (HtmlElement::ChildIterator it = children.begin(); it != children.end(); ++it) { 373 | if 
((*it)->name == name) 374 | InsertIfNotExists(result, *it); 375 | 376 | (*it)->GetElementByTagName(name, result); 377 | } 378 | } 379 | 380 | void GetAllElement(std::vector >& result){ 381 | for (size_t i = 0; i < children.size(); ++i) { 382 | InsertIfNotExists(result, children[i]); 383 | children[i]->GetAllElement(result); 384 | } 385 | } 386 | 387 | void Parse(const std::string &attr) { 388 | size_t index = 0; 389 | std::string k; 390 | std::string v; 391 | char split = ' '; 392 | bool quota = false; 393 | 394 | enum ParseAttrState { 395 | PARSE_ATTR_KEY, 396 | PARSE_ATTR_VALUE_BEGIN, 397 | PARSE_ATTR_VALUE_END, 398 | }; 399 | 400 | ParseAttrState state = PARSE_ATTR_KEY; 401 | 402 | while (attr.size() > index) { 403 | char input = attr.at(index); 404 | switch (state) { 405 | case PARSE_ATTR_KEY: { 406 | if (input == '\t' || input == '\r' || input == '\n') { 407 | } else if (input == '\'' || input == '"') { 408 | std::cerr << "WARN : attribute unexpected " << input << std::endl; 409 | } else if (input == ' ') { 410 | if (!k.empty()) { 411 | attribute[k] = v; 412 | k.clear(); 413 | } 414 | } else if (input == '=') { 415 | state = PARSE_ATTR_VALUE_BEGIN; 416 | } else { 417 | k.append(attr.c_str() + index, 1); 418 | } 419 | } 420 | break; 421 | 422 | case PARSE_ATTR_VALUE_BEGIN:{ 423 | if (input == '\t' || input == '\r' || input == '\n' || input == ' ') { 424 | if (!k.empty()) { 425 | attribute[k] = v; 426 | k.clear(); 427 | } 428 | state = PARSE_ATTR_KEY; 429 | } else if (input == '\'' || input == '"') { 430 | split = input; 431 | quota = true; 432 | state = PARSE_ATTR_VALUE_END; 433 | } else { 434 | v.append(attr.c_str() + index, 1); 435 | quota = false; 436 | state = PARSE_ATTR_VALUE_END; 437 | } 438 | } 439 | break; 440 | 441 | case PARSE_ATTR_VALUE_END: { 442 | if((quota && input == split) || (!quota && (input == '\t' || input == '\r' || input == '\n' || input == ' '))) { 443 | attribute[k] = v; 444 | k.clear(); 445 | v.clear(); 446 | state = PARSE_ATTR_KEY; 447 
| } else { 448 | v.append(attr.c_str() + index, 1); 449 | } 450 | } 451 | break; 452 | } 453 | 454 | index++; 455 | } 456 | 457 | if(!k.empty()){ 458 | attribute[k] = v; 459 | } 460 | 461 | //trim 462 | if (!value.empty()) { 463 | value.erase(0, value.find_first_not_of(" ")); 464 | value.erase(value.find_last_not_of(" ") + 1); 465 | } 466 | } 467 | 468 | static std::set SplitClassName(const std::string& name){ 469 | #if defined(WIN32) 470 | #define strtok_ strtok_s 471 | #else 472 | #define strtok_ strtok_r 473 | #endif 474 | std::set class_names; 475 | char *temp = NULL; 476 | char *p = strtok_((char *)name.c_str(), " ", &temp); 477 | while (p) { 478 | class_names.insert(p); 479 | p = strtok_(NULL, " ", &temp); 480 | } 481 | 482 | return class_names; 483 | } 484 | 485 | static void InsertIfNotExists(std::vector>& vec, const std::shared_ptr& ele){ 486 | for(size_t i = 0; i < vec.size(); i++){ 487 | if(vec[i] == ele) return; 488 | } 489 | 490 | vec.push_back(ele); 491 | } 492 | 493 | private: 494 | std::string name; 495 | std::string value; 496 | std::map attribute; 497 | weak_ptr parent; 498 | std::vector > children; 499 | }; 500 | 501 | /** 502 | * class HtmlDocument 503 | * Html Doc struct 504 | */ 505 | class HtmlDocument { 506 | public: 507 | HtmlDocument(shared_ptr &root) 508 | : root_(root) {} 509 | 510 | shared_ptr GetElementById(const std::string &id) { 511 | return root_->GetElementById(id); 512 | } 513 | 514 | std::vector > GetElementByClassName(const std::string &name) { 515 | return root_->GetElementByClassName(name); 516 | } 517 | 518 | std::vector > GetElementByTagName(const std::string &name) { 519 | return root_->GetElementByTagName(name); 520 | } 521 | 522 | std::vector > SelectElement(const std::string& rule){ 523 | std::vector > result; 524 | HtmlElement::ChildIterator it = root_->ChildBegin(); 525 | for(; it != root_->ChildEnd(); it++){ 526 | (*it)->SelectElement(rule, result); 527 | } 528 | 529 | return result; 530 | } 531 | 532 | std::string 
html() { 533 | return root_->html(); 534 | } 535 | 536 | std::string text() { 537 | return root_->text(); 538 | } 539 | 540 | private: 541 | shared_ptr root_; 542 | }; 543 | 544 | /** 545 | * class HtmlParser 546 | * html parser and only one interface 547 | */ 548 | class HtmlParser { 549 | public: 550 | HtmlParser() { 551 | static const std::string token[] = { "br", "hr", "img", "input", "link", "meta", 552 | "area", "base", "col", "command", "embed", "keygen", "param", "source", "track", "wbr"}; 553 | self_closing_tags_.insert(token, token + sizeof(token) / sizeof(token[0])); 554 | } 555 | 556 | /** 557 | * parse html by C-Style data 558 | * @param data 559 | * @param len 560 | * @return html document object 561 | */ 562 | shared_ptr Parse(const char *data, size_t len) { 563 | stream_ = data; 564 | length_ = len; 565 | size_t index = 0; 566 | root_.reset(new HtmlElement()); 567 | while (length_ > index) { 568 | char input = stream_[index]; 569 | if (input == '\r' || input == '\n' || input == '\t' || input == ' ') { 570 | index++; 571 | } else if (input == '<') { 572 | index = ParseElement(index, root_); 573 | } else { 574 | break; 575 | } 576 | } 577 | 578 | return shared_ptr(new HtmlDocument(root_)); 579 | } 580 | 581 | /** 582 | * parse html by string data 583 | * @param data 584 | * @return html document object 585 | */ 586 | shared_ptr Parse(const std::string &data) { 587 | return Parse(data.data(), data.size()); 588 | } 589 | 590 | private: 591 | size_t ParseElement(size_t index, shared_ptr &element) { 592 | while (length_ > index) { 593 | char input = stream_[index + 1]; 594 | if (input == '!') { 595 | if (strncmp(stream_ + index, ""); 597 | } else { 598 | return SkipUntil(index + 2, '>'); 599 | } 600 | } else if (input == '/') { 601 | return SkipUntil(index, '>'); 602 | } else if (input == '?') { 603 | return SkipUntil(index, "?>"); 604 | } 605 | 606 | shared_ptr self(new HtmlElement(element)); 607 | 608 | enum ParseElementState { 609 | PARSE_ELEMENT_TAG, 
610 | PARSE_ELEMENT_ATTR, 611 | PARSE_ELEMENT_VALUE, 612 | PARSE_ELEMENT_TAG_END 613 | }; 614 | 615 | ParseElementState state = PARSE_ELEMENT_TAG; 616 | index++; 617 | char split = 0; 618 | std::string attr; 619 | 620 | while (length_ > index) { 621 | switch (state) { 622 | case PARSE_ELEMENT_TAG: { 623 | char input = stream_[index]; 624 | if (input == ' ' || input == '\r' || input == '\n' || input == '\t') { 625 | if (!self->name.empty()) { 626 | state = PARSE_ELEMENT_ATTR; 627 | } 628 | index++; 629 | } else if (input == '/') { 630 | self->Parse(attr); 631 | element->children.push_back(self); 632 | return SkipUntil(index, '>'); 633 | } else if (input == '>') { 634 | if(self_closing_tags_.find(self->name) != self_closing_tags_.end()) { 635 | element->children.push_back(self); 636 | return ++index; 637 | } 638 | state = PARSE_ELEMENT_VALUE; 639 | index++; 640 | } else { 641 | self->name.append(stream_ + index, 1); 642 | index++; 643 | } 644 | } 645 | break; 646 | 647 | case PARSE_ELEMENT_ATTR: { 648 | char input = stream_[index]; 649 | if (input == '>') { 650 | if (stream_[index - 1] == '/') { 651 | attr.erase(attr.size() - 1); 652 | self->Parse(attr); 653 | element->children.push_back(self); 654 | return ++index; 655 | } else if(self_closing_tags_.find(self->name) != self_closing_tags_.end()) { 656 | self->Parse(attr); 657 | element->children.push_back(self); 658 | return ++index; 659 | } 660 | state = PARSE_ELEMENT_VALUE; 661 | index++; 662 | } else { 663 | attr.append(stream_ + index, 1); 664 | index++; 665 | } 666 | } 667 | break; 668 | 669 | case PARSE_ELEMENT_VALUE: { 670 | if (self->name == "script" || self->name == "noscript" || self->name == "style") { 671 | std::string close = "name + ">"; 672 | 673 | size_t pre = index; 674 | index = SkipUntil(index, close.c_str()); 675 | if (index > (pre + close.size())) 676 | self->value.append(stream_ + pre, index - pre - close.size()); 677 | 678 | self->Parse(attr); 679 | element->children.push_back(self); 680 | 
return index; 681 | } 682 | 683 | char input = stream_[index]; 684 | if (input == '<') { 685 | if (!self->value.empty()) { 686 | shared_ptr child(new HtmlElement(self)); 687 | child->name = "plain"; 688 | child->value.swap(self->value); 689 | self->children.push_back(child); 690 | } 691 | 692 | if (stream_[index + 1] == '/') { 693 | state = PARSE_ELEMENT_TAG_END; 694 | } else { 695 | index = ParseElement(index, self); 696 | } 697 | } else if (input != '\r' && input != '\n' && input != '\t') { 698 | self->value.append(stream_ + index, 1); 699 | index++; 700 | } else { 701 | index++; 702 | } 703 | } 704 | break; 705 | 706 | case PARSE_ELEMENT_TAG_END: { 707 | index += 2; 708 | std::string selfname = self->name + ">"; 709 | if (strncmp(stream_ + index, selfname.c_str(), selfname.size())) { 710 | size_t pre = index; 711 | index = SkipUntil(index, ">"); 712 | std::string value; 713 | if (index > (pre + 1)) 714 | value.append(stream_ + pre, index - pre - 1); 715 | else 716 | value.append(stream_ + pre, index - pre); 717 | 718 | shared_ptr parent = self->GetParent(); 719 | while (parent) { 720 | if (parent->name == value) { 721 | std::cerr << "WARN : element not closed <" << self->name << "> " << std::endl; 722 | self->Parse(attr); 723 | element->children.push_back(self); 724 | return pre - 2; 725 | } 726 | 727 | parent = parent->GetParent(); 728 | } 729 | 730 | std::cerr << "WARN : unexpected closed element for <" << self->name 731 | << ">" << std::endl; 732 | state = PARSE_ELEMENT_VALUE; 733 | } else { 734 | self->Parse(attr); 735 | element->children.push_back(self); 736 | return SkipUntil(index, '>'); 737 | } 738 | } 739 | break; 740 | } 741 | } 742 | } 743 | 744 | return index; 745 | } 746 | 747 | size_t SkipUntil(size_t index, const char *data) { 748 | while (length_ > index) { 749 | if (strncmp(stream_ + index, data, strlen(data)) == 0) { 750 | return index + strlen(data); 751 | } else { 752 | index++; 753 | } 754 | } 755 | 756 | return index; 757 | } 758 | 759 | 
/* Single-character overload: scans forward from index for the first byte of stream_ equal to data. Returns the index just past that byte; when the byte is not found, returns the index at which the scan stopped, i.e. the first value that is no longer below length_. */ size_t SkipUntil(size_t index, const char data) { 760 | while (length_ > index) { 761 | if (stream_[index] == data) { 762 | return ++index; 763 | } else { 764 | index++; 765 | } 766 | } 767 | 768 | return index; 769 | } 770 | 771 | private: 772 | /* Input bytes scanned by the parser. */ const char *stream_; 773 | /* Scanning loops stop once an index is no longer below this value. */ size_t length_; 774 | /* Tag names that ParseElement completes at the closing angle bracket without reading any content. */ std::set self_closing_tags_; 775 | shared_ptr root_; 776 | }; 777 | 778 | #endif 779 | -------------------------------------------------------------------------------- /test.cpp: -------------------------------------------------------------------------------- 1 | #include "html_parser.hpp" 2 | 3 | int main() { 4 | std::string data("
ab
"); 5 | HtmlParser parser; 6 | shared_ptr doc = parser.Parse(data.c_str(), data.size()); 7 | 8 | std::vector> x = doc->GetElementByClassName("x"); 9 | shared_ptr b = doc->GetElementById("b"); 10 | 11 | if(!x.empty()){ 12 | std::cout << x[0]->GetName() << std::endl; 13 | std::cout << x[0]->GetAttribute("id") << std::endl; 14 | } 15 | 16 | if(b){ 17 | std::cout << b->GetName() << std::endl; 18 | std::cout << b->GetValue() << std::endl; 19 | } 20 | 21 | data = "

"; 22 | doc = parser.Parse(data.c_str(), data.size()); 23 | b = doc->GetElementById("b"); 24 | if(b){ 25 | shared_ptr c = b->GetElementById("c"); 26 | if(c){ 27 | std::cout << "wrong" << std::endl; 28 | } else { 29 | std::cout << "ok" << std::endl; 30 | } 31 | } 32 | 33 | data = "
"; 34 | doc = parser.Parse(data.c_str(), data.size()); 35 | std::vector> ab = doc->GetElementByClassName("aa bb"); 36 | std::vector> abc = doc->GetElementByClassName("aa cc bb"); 37 | std::vector> xz = doc->GetElementByClassName("xx zz"); 38 | if(ab.empty() || abc.empty() || xz.empty()){ 39 | std::cout << "wrong" << std::endl; 40 | } else { 41 | std::cout << "ok" << std::endl; 42 | } 43 | 44 | std::vector> cc = doc->SelectElement("//img[@class]"); 45 | if(cc.empty()){ 46 | std::cout << "wrong" << std::endl; 47 | } else { 48 | std::cout << "ok" << std::endl; 49 | } 50 | 51 | return 0; 52 | } -------------------------------------------------------------------------------- /test/migrate_test/test-group1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "html_parser.hpp" 5 | 6 | using namespace std; 7 | 8 | //test1 9 | TEST(test, dropsUnterminatedTag) { 10 | string h1(" doc = parser.Parse(h1); 13 | vector> x = doc->GetElementByTagName("p"); 14 | 15 | ASSERT_EQ(0, x.size()); 16 | ASSERT_EQ(true, x.empty()); 17 | 18 | string h2("
text()); 21 | } 22 | 23 | //test2 24 | TEST(test, dropsUnterminatedAttribute) { 25 | string h1("

doc = parser.Parse(h1); 28 | ASSERT_EQ("", doc->text()); 29 | } 30 | 31 | //test3 32 | TEST(test, parsesQuiteRoughAttributes) { 33 | 34 | string html("

OneSomething

Else"); 35 | HtmlParser parser; 36 | shared_ptr doc = parser.Parse(html); 37 | 38 | ASSERT_EQ("

OneSomething

", doc->html()); 39 | } 40 | 41 | //test4 42 | TEST(test, createsStructureFromBodySnippet) { 43 | string html("foo bar baz"); 44 | HtmlParser parser; 45 | shared_ptr doc = parser.Parse(html); 46 | ASSERT_EQ("", doc->text()); 47 | } 48 | 49 | //test5 50 | TEST(test, handlesTextArea) { 51 | string html(""); 52 | HtmlParser parser; 53 | shared_ptr doc = parser.Parse(html); 54 | vector> els = doc->SelectElement("//textarea"); 55 | 56 | ASSERT_EQ(1, els.size()); 57 | 58 | ASSERT_EQ("Hello", els[0]->text()); 59 | ASSERT_EQ("Hello", els[0]->GetValue()); 60 | } 61 | 62 | //test6 63 | TEST(test, handlesTextAfterData) { 64 | string h = "pre aft"; 65 | HtmlParser parser; 66 | shared_ptr doc = parser.Parse(h); 67 | 68 | ASSERT_EQ("pre aft", doc->html()); 69 | } 70 | 71 | //test7 72 | TEST(test, handlesTextTd) { 73 | string h = "Hello

There

now"; 74 | HtmlParser parser; 75 | shared_ptr doc = parser.Parse(h); 76 | 77 | ASSERT_EQ("", doc->html()); 78 | } 79 | 80 | //test8 81 | TEST(test, handlesNestedImplicitTable) { 82 | HtmlParser parser; 83 | shared_ptr doc = parser.Parse("
1
2
3 4
5
"); 84 | 85 | ASSERT_EQ("
1 2
3 4
5
", doc->html()); 86 | } 87 | 88 | //test9 89 | TEST(test, handlesImplicitCaptionClose) { 90 | HtmlParser parser; 91 | shared_ptr doc = parser.Parse("
A caption
OneTwo
"); 92 | ASSERT_EQ("
A caption
OneTwo
", doc->html()); 93 | } 94 | 95 | //test10 96 | TEST(test, handlesKnownEmptyBlocks) { 97 | string h = "

One
", doc->html()); 102 | } 103 | 104 | 105 | 106 | GTEST_API_ int main(int argc, char ** argv) { 107 | testing::InitGoogleTest(&argc, argv); 108 | return RUN_ALL_TESTS(); 109 | } -------------------------------------------------------------------------------- /test/migrate_test/test-group2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "html_parser.hpp" 5 | 6 | using namespace std; 7 | 8 | //test11 9 | TEST(test, handlesKnownEmptyNoFrames) { 10 | std::string h = "<meta name=foo></head><body>One</body></html>"; 11 | HtmlParser parser; 12 | shared_ptr<HtmlDocument> doc = parser.Parse(h); 13 | 14 | ASSERT_EQ("<html><head><noframes>One", doc->html()); 15 | } 16 | 17 | //test12 18 | TEST(test, handlesKnownEmptyStyle) { 19 | std::string h = "One", doc->html()); 24 | } 25 | 26 | //test13 27 | TEST(test, handlesKnownEmptyTitle) { 28 | std::string h = "<meta name=foo></head><body>One</body></html>"; 29 | HtmlParser parser; 30 | shared_ptr<HtmlDocument> doc = parser.Parse(h); 31 | 32 | ASSERT_EQ("<html><head><title>One", doc->html()); 33 | } 34 | 35 | //test14 36 | TEST(test, handlesProtocolRelativeUrl) { 37 | std::string html(""); 38 | HtmlParser parser; 39 | shared_ptr doc = parser.Parse(html); 40 | std::vector> els = doc->SelectElement("//textarea"); 41 | 42 | ASSERT_EQ(1, els.size()); 43 | 44 | ASSERT_EQ("Hello", els[0]->text()); 45 | ASSERT_EQ("Hello", els[0]->GetValue()); 46 | } 47 | 48 | //test15 49 | // issue: expect:Hello < There <&> actual:Hello 50 | TEST(test, handlesInvalidStartTags) { 51 | std::string html("
Hello < There <&>
"); 52 | HtmlParser parser; 53 | shared_ptr doc = parser.Parse(html); 54 | std::vector> els = doc->SelectElement("//div"); 55 | 56 | ASSERT_EQ(1, els.size()); 57 | 58 | // ASSERT_EQ("Hello < There <&>", els[0]->text()); 59 | ASSERT_EQ("Hello ", els[0]->text()); 60 | } 61 | 62 | //test16 63 | TEST(test, handlesFrames) { 64 | std::string html(""); 65 | HtmlParser parser; 66 | shared_ptr doc = parser.Parse(html); 67 | 68 | ASSERT_EQ("", doc->html()); 69 | } 70 | 71 | //test17 72 | TEST(test, ignoresContentAfterFrameset) { 73 | std::string html("One
"); 74 | HtmlParser parser; 75 | shared_ptr doc = parser.Parse(html); 76 | 77 | ASSERT_EQ("One
", doc->html()); 78 | } 79 | 80 | //test18 81 | TEST(test, testSpaceAfterTag) { 82 | std::string html("

Hello

"); 83 | HtmlParser parser; 84 | shared_ptr doc = parser.Parse(html); 85 | 86 | ASSERT_EQ("", doc->html()); 87 | 88 | } 89 | 90 | //test19 91 | TEST(test, normalisesDocument) { 92 | std::string h("OneTwoThreeFourFive Six Seven "); 93 | HtmlParser parser; 94 | shared_ptr doc = parser.Parse(h); 95 | 96 | // ASSERT_EQ("OneTwoThreeFourFive Six Seven", doc->html()); 97 | } 98 | 99 | //test20 100 | TEST(test, normalisesEmptyDocument) { 101 | std::string h(""); 102 | HtmlParser parser; 103 | shared_ptr doc = parser.Parse(h); 104 | 105 | ASSERT_EQ("", doc->html()); 106 | } 107 | 108 | 109 | GTEST_API_ int main(int argc, char ** argv) { 110 | testing::InitGoogleTest(&argc, argv); 111 | return RUN_ALL_TESTS(); 112 | } -------------------------------------------------------------------------------- /test/migrate_test/test-group3.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "html_parser.hpp" 5 | 6 | using namespace std; 7 | 8 | //test21 9 | TEST(test, findsCharsetInMalformedMeta) { 10 | string h(""); 11 | HtmlParser parser; 12 | shared_ptr doc = parser.Parse(h); 13 | vector> els = doc->SelectElement("//meta"); 14 | ASSERT_EQ(1, els.size()); 15 | 16 | ASSERT_EQ("gb2312", els[0]->GetAttribute("charset")); 17 | } 18 | 19 | //test22 20 | TEST(test, testHgroup) { 21 | string h("

Hello

There

Another

headline

More

stuff

"); 22 | HtmlParser parser; 23 | shared_ptr doc = parser.Parse(h); 24 | 25 | // ASSERT_EQ("

Hello

There

Another

headline

More

stuff

", doc->html()); 26 | } 27 | 28 | 29 | //test23 30 | TEST(test, testNoImagesInNoScriptInHead) { 31 | string h("

Hello

"); 32 | HtmlParser parser; 33 | shared_ptr doc = parser.Parse(h); 34 | 35 | ASSERT_EQ("

Hello

", doc->html()); 36 | } 37 | 38 | //test24 39 | TEST(test, commentBeforeHtml) { 40 | string h("

One

"); 41 | HtmlParser parser; 42 | shared_ptr doc = parser.Parse(h); 43 | 44 | ASSERT_EQ("

One

", doc->html()); 45 | } 46 | 47 | //test25 48 | TEST(test, emptyTdTag) { 49 | string h("
One
"); 50 | HtmlParser parser; 51 | shared_ptr doc = parser.Parse(h); 52 | 53 | vector> els = doc->SelectElement("//tr"); 54 | ASSERT_EQ(1, els.size()); 55 | 56 | ASSERT_EQ("One", els[0]->html()); 57 | // ASSERT_EQ("One\n", els[0]->html()); 58 | } 59 | 60 | //test26 61 | TEST(test, handlesUnclosedScriptAtEof) { 62 | HtmlParser parser; 63 | vector> els = parser.Parse("")->SelectElement("//script"); 64 | ASSERT_EQ(1, els.size()); 65 | 66 | ASSERT_EQ("Data", els[0]->GetValue()); 67 | 68 | } 69 | 70 | //test27 71 | TEST(test, handlesUnclosedRawtextAtEof) { 72 | HtmlParser parser; 73 | vector> els = parser.Parse("")->SelectElement("//style"); 74 | ASSERT_EQ(1, els.size()); 75 | 76 | ASSERT_EQ("Data", els[0]->GetValue()); 77 | 78 | } 79 | 80 | //test28 81 | TEST(test, handlesEscapedScript) { 82 | HtmlParser parser; 83 | shared_ptr doc = parser.Parse(""); 84 | vector> els = doc->SelectElement("//script"); 85 | ASSERT_EQ(1, els.size()); 86 | // ", els[0]->GetValue()); 88 | } 89 | 90 | //test29 91 | TEST(test, handlesInputInTable) { 92 | string h = "\n\ 93 | \n\ 94 | \n\ 95 | \n\ 96 |
\n\ 97 | "; 98 | HtmlParser parser; 99 | shared_ptr doc = parser.Parse(h); 100 | ASSERT_EQ(1, doc->SelectElement("//table").size()); 101 | ASSERT_EQ(2, doc->SelectElement("//input").size()); 102 | } 103 | 104 | 105 | //test30 106 | TEST(test, testUsingSingleQuotesInQueries) { 107 | string body = "
hello
"; 108 | HtmlParser parser; 109 | shared_ptr doc = parser.Parse(body); 110 | vector> main =doc->SelectElement("//div[@class='main']"); 111 | 112 | ASSERT_EQ(1, main.size()); 113 | 114 | ASSERT_EQ("hello", main[0]->text()); 115 | } 116 | 117 | GTEST_API_ int main(int argc, char ** argv) { 118 | testing::InitGoogleTest(&argc, argv); 119 | return RUN_ALL_TESTS(); 120 | } -------------------------------------------------------------------------------- /test/migrate_test/test-group4.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "html_parser.hpp" 5 | 6 | using namespace std; 7 | 8 | //test31 9 | TEST(test, testSupportsNonAsciiTags) { 10 | string body = "YesCorrect"; 11 | HtmlParser parser; 12 | shared_ptr doc = parser.Parse(body); 13 | vector> els =doc->SelectElement("//a進捗推移グラフ"); 14 | 15 | ASSERT_EQ(1, els.size()); 16 | 17 | ASSERT_EQ("Yes", els[0]->text()); 18 | els =doc->SelectElement("//bрусский-тэг"); 19 | 20 | ASSERT_EQ(1, els.size()); 21 | ASSERT_EQ("Correct", els[0]->text()); 22 | } 23 | 24 | //test32 25 | TEST(test, testSupportsPartiallyNonAsciiTags) { 26 | string body = "
Check"; 27 | HtmlParser parser; 28 | shared_ptr doc = parser.Parse(body); 29 | vector> els =doc->SelectElement("//div"); 30 | 31 | ASSERT_EQ(1, els.size()); 32 | 33 | ASSERT_EQ("Check", els[0]->text()); 34 | } 35 | 36 | //test33 37 | TEST(test, preSkipsFirstNewline) { 38 | HtmlParser parser; 39 | shared_ptr doc = parser.Parse("
\n\nOne\nTwo\n
"); 40 | vector> pre =doc->SelectElement("//pre"); 41 | 42 | ASSERT_EQ(1, pre.size()); 43 | 44 | ASSERT_EQ("OneTwo", pre[0]->text()); 45 | } 46 | 47 | //test34 48 | TEST(test, testNoSpuriousSpace) { 49 | HtmlParser parser; 50 | shared_ptr doc = parser.Parse("

JustOneTwo

"); 51 | 52 | ASSERT_EQ("JustOneTwo", doc->text()); 53 | } 54 | 55 | //test35 56 | TEST(test, testH20) { 57 | string html = "H2O"; 58 | HtmlParser parser; 59 | shared_ptr doc = parser.Parse("

" + html+"

"); 60 | ASSERT_EQ("H2O", doc->text()); 61 | } 62 | 63 | //test36 64 | TEST(test, testUNewlines) { 65 | string html = "test on fire"; 66 | HtmlParser parser; 67 | shared_ptr doc = parser.Parse("

" + html+"

"); 68 | ASSERT_EQ("test on fire", doc->text()); 69 | } 70 | 71 | //test37 72 | TEST(test, testFarsi) { 73 | string text = "نیمه\u200Cشب"; 74 | HtmlParser parser; 75 | shared_ptr doc = parser.Parse("

" + text+"

"); 76 | ASSERT_EQ(text, doc->text()); 77 | } 78 | 79 | //test38 80 | TEST(test, mergeHtmlAttributesFromBody) { 81 | HtmlParser parser; 82 | shared_ptr doc = parser.Parse("

One"); 83 | ASSERT_EQ("", doc->html()); 84 | } 85 | 86 | //test39 87 | TEST(test, text) { 88 | string h = "

Hello

there

world

"; 89 | HtmlParser parser; 90 | shared_ptr doc = parser.Parse(h); 91 | ASSERT_EQ("Hello\nthere\nworld", doc->text()); 92 | } 93 | 94 | //test40 95 | TEST(test, html) { 96 | HtmlParser parser; 97 | shared_ptr doc = parser.Parse("

Hello

There

"); 98 | 99 | vector> divs =doc->SelectElement("//div"); 100 | 101 | ASSERT_EQ(false, divs.empty()); 102 | 103 | ASSERT_EQ("

Hello

", divs[0]->html()); 104 | } 105 | 106 | GTEST_API_ int main(int argc, char ** argv) { 107 | testing::InitGoogleTest(&argc, argv); 108 | return RUN_ALL_TESTS(); 109 | } -------------------------------------------------------------------------------- /test/migrate_test/test-group5.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "html_parser.hpp" 5 | 6 | using namespace std; 7 | 8 | 9 | //test41 10 | TEST(test, classWithHyphen) { 11 | HtmlParser parser; 12 | shared_ptr doc = parser.Parse("

Check

"); 13 | vector> els = doc->GetElementByClassName("tab-nav"); 14 | ASSERT_EQ(1, els.size()); 15 | ASSERT_EQ("Check", els[0]->text()); 16 | } 17 | 18 | //test42 19 | TEST(test, testGetText) { 20 | string reference = "

Hello

Another element

"; 21 | HtmlParser parser; 22 | shared_ptr doc = parser.Parse(reference); 23 | 24 | ASSERT_EQ("Hello\nAnother element\n", doc->text()); 25 | ASSERT_EQ("Another element", doc->GetElementByTagName("p")[1]->text()); 26 | } 27 | 28 | //test43 29 | TEST(test, testNormalisesText) { 30 | string h = "

Hello

There.

\n

Here is \n some text."; 31 | HtmlParser parser; 32 | shared_ptr doc = parser.Parse(h); 33 | string text = doc->text(); 34 | ASSERT_EQ("Hello\nThere. \nHere is some text.", text); 35 | } 36 | 37 | //test44 38 | TEST(test, testKeepsPreText) { 39 | string h = "

Hello \n \n there.

  What's \n\n  that?
"; 40 | HtmlParser parser; 41 | shared_ptr doc = parser.Parse(h); 42 | ASSERT_EQ("Hello there.", doc->text()); 43 | } 44 | 45 | //test45 46 | TEST(test, testKeepsPreTextInCode) { 47 | string h = "
code\n\ncode
"; 48 | HtmlParser parser; 49 | shared_ptr doc = parser.Parse(h); 50 | ASSERT_EQ("codecode", doc->text()); 51 | } 52 | 53 | //test46 54 | TEST(test, testKeepsPreTextAtDepth) { 55 | string h = "
code\n\ncode
"; 56 | HtmlParser parser; 57 | shared_ptr doc = parser.Parse(h); 58 | ASSERT_EQ("codecode", doc->text()); 59 | } 60 | 61 | //test47 62 | TEST(test, textHasSpacesAfterBlock) { 63 | HtmlParser parser; 64 | shared_ptr doc = parser.Parse("
One
Two
Three

Four

"); 65 | string text = doc->text(); 66 | 67 | ASSERT_EQ("One\nTwoThree\nFour", text); 68 | ASSERT_EQ("OneTwo", parser.Parse("OneTwo")->text()); 69 | } 70 | 71 | //test48 72 | TEST(test, testInnerHtml) { 73 | HtmlParser parser; 74 | shared_ptr doc = parser.Parse("
\n

Hello

"); 75 | 76 | ASSERT_EQ("

Hello

", doc->GetElementByTagName("div")[0]->html()); 77 | } 78 | 79 | //test49 80 | TEST(test, testHtmlId) { 81 | HtmlParser parser; 82 | shared_ptr doc = parser.Parse("

Hello

"); 83 | shared_ptr div = doc->GetElementById("1"); 84 | 85 | ASSERT_EQ("Hello", div->text()); 86 | } 87 | 88 | //test50 89 | TEST(test, textHasSpaceAfterBlockTags) { 90 | HtmlParser parser; 91 | shared_ptr doc = parser.Parse("
One
Two"); 92 | ASSERT_EQ("One", doc->text()); 93 | } 94 | 95 | GTEST_API_ int main(int argc, char ** argv) { 96 | testing::InitGoogleTest(&argc, argv); 97 | return RUN_ALL_TESTS(); 98 | } -------------------------------------------------------------------------------- /test/migrate_test/test-group6.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "html_parser.hpp" 5 | 6 | using namespace std; 7 | 8 | //test51 9 | TEST(test, textHasSpaceBetweenDivAndCenterTags) { 10 | HtmlParser parser; 11 | shared_ptr doc = parser.Parse("
One
Two
Three
Four
"); 12 | ASSERT_EQ("One\nTwoThreeFour", doc->text()); 13 | } 14 | 15 | //test52 16 | TEST(test, prettyprintBrInBlock) { 17 | string html = "

"; 18 | HtmlParser parser; 19 | shared_ptr doc = parser.Parse(html); 20 | ASSERT_EQ("


", doc->html()); 21 | } 22 | 23 | //test53 24 | TEST(test, prettyprintBrWhenNotFirstChild) { 25 | string h = "
\n\ 26 |


\n Foo

\n\ 27 |
\n\ 28 |
"; 29 | HtmlParser parser; 30 | shared_ptr doc = parser.Parse(h); 31 | ASSERT_EQ("



Foo



", doc->html()); 32 | } 33 | 34 | //test54 35 | TEST(test, noDanglingSpaceAfterCustomElement) { 36 | string html = "

\n"; 37 | HtmlParser parser; 38 | shared_ptr doc = parser.Parse(html); 39 | ASSERT_EQ("

", doc->html()); 40 | 41 | html = "\n \n"; 42 | doc = parser.Parse(html); 43 | 44 | ASSERT_EQ(" ", doc->html()); 45 | } 46 | 47 | //test55 48 | TEST(test, spanInBlockTrims) { 49 | string html = "

Lorem ipsum

\nThanks"; 50 | HtmlParser parser; 51 | shared_ptr doc = parser.Parse(html); 52 | string outHtml = doc->html(); 53 | ASSERT_EQ("

Lorem ipsum

Thanks", outHtml); 54 | } 55 | 56 | //test56 57 | TEST(test, rubyInline) { 58 | string html = "T(!)"; 59 | HtmlParser parser; 60 | shared_ptr doc = parser.Parse(html); 61 | ASSERT_EQ(html, doc->html()); 62 | } 63 | 64 | GTEST_API_ int main(int argc, char ** argv) { 65 | testing::InitGoogleTest(&argc, argv); 66 | return RUN_ALL_TESTS(); 67 | } --------------------------------------------------------------------------------