├── README.md └── src ├── Makefile ├── pugixml ├── pugiconfig.hpp ├── pugixml.cpp └── pugixml.hpp ├── spider_common.h ├── spider_config.cpp ├── spider_config.h ├── spider_cookie.cpp ├── spider_cookie.h ├── spider_database.cpp ├── spider_database.h ├── spider_executor.cpp ├── spider_executor.h ├── spider_http_client.cpp ├── spider_http_client.h ├── spider_main.cpp ├── spider_md5.cpp ├── spider_md5.h ├── spider_porting.cpp ├── spider_porting.h ├── spider_seed.cpp ├── spider_seed.h ├── spider_storage.cpp ├── spider_storage.h ├── spider_thread_pool.cpp ├── spider_thread_pool.h ├── spider_url.cpp ├── spider_url.h ├── spider_url_rinse.cpp ├── spider_url_rinse.h ├── spider_utils.cpp ├── spider_utils.h ├── spider_website.cpp └── spider_website.h /README.md: -------------------------------------------------------------------------------- 1 | # Spider 2 | 3 | 高性能爬虫引擎。 4 | 5 | 已用于GIF库中,从微博/主流网站抓取图片和点评。 6 | 7 | 在单核1G内存的服务器上抓取GIF(含下载图片)速度为每小时下载1万条(受带宽影响)。 8 | 9 | 在单核1G内存的服务器上抓取图片(不下载图片)速度为每小时80万条。 10 | 11 | 包含线程池、网页去重、历史记录、网页分析、epoll/select异步请求管理、Cookie管理、通用Http请求、异步DNS解析等模块。 12 | 13 | 14 | # Build 15 | 16 | 依赖库: 17 | 18 | boost_1_57_0 提供智能指针 19 | 20 | crypto_5_60 提供加密 21 | 22 | libevent-2.0.22-stable 提供异步DNS解析 23 | 24 | mpir-2.7.0 提供大数的处理 25 | 26 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | DIR_SRC=./ 2 | DIR_3RD=../3rd/ 3 | DIR_MYSQL_LIB=/usr/lib64/mysql/ 4 | TARGET=spider 5 | 6 | 7 | CPPS=$(wildcard $(DIR_SRC)*.cpp) 8 | CPPS+=$(wildcard $(DIR_SRC)/pugixml/*.cpp) 9 | OBJS=$(patsubst %.cpp,%.o,$(CPPS)) 10 | 11 | CFLAGS=-g -O2 -DLINUX -D_DEBUG 12 | CFLAGS+=-I $(DIR_3RD)jsoncpp/ 13 | CFLAGS+=-I $(DIR_3RD)mpir/include/ 14 | CFLAGS+=-I $(DIR_3RD)libevent/include/ 15 | CFLAGS+=-I $(DIR_3RD)crypto/include/ 16 | CFLAGS+=-I $(DIR_3RD)boost/ 17 | 18 | CC=g++ 19 | LIB=-L $(DIR_3RD)jsoncpp/ -ljsoncpp -pthread 20 | LIB+=-L $(DIR_3RD)mpir/lib/ -lmpir 21 | 
LIB+=-L $(DIR_3RD)libevent/lib/ -levent 22 | LIB+=-L $(DIR_3RD)crypto/lib/ -lcryptopp 23 | LIB+=-L $(DIR_3RD)boost/lib/ -lboost_filesystem -lboost_system 24 | LIB+=-L $(DIR_MYSQL_LIB) -lmysqlclient 25 | #LIB+=-L $(DIR_3RD)/hiredis -lhiredis 26 | .PHONY: all clean install # command targets, not files 27 | all:$(TARGET) 28 | $(TARGET):$(OBJS) 29 | $(CC) $(CFLAGS) $(OBJS) $(LIB) -o $(TARGET) 30 | # Static pattern rule: build each object from the .cpp of the same name 31 | $(OBJS):%.o:%.cpp 32 | $(CC) $(CFLAGS) -c $< -o $@ 33 | # -f keeps clean from failing when nothing has been built yet 34 | clean: 35 | rm -f $(OBJS) $(TARGET) 36 | # Create ../bin/ on demand, then copy the binary into it 37 | install: 38 | mkdir -p ../bin/ && cp $(TARGET) ../bin/ 39 | 40 | 41 | -------------------------------------------------------------------------------- /src/pugixml/pugiconfig.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * pugixml parser - version 1.2 3 | * -------------------------------------------------------- 4 | * Copyright (C) 2006-2012, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) 5 | * Report bugs and download new versions at http://pugixml.org/ 6 | * 7 | * This library is distributed under the MIT License. See notice at the end 8 | * of this file. 
9 | * 10 | * This work is based on the pugxml parser, which is: 11 | * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) 12 | */ 13 | 14 | #ifndef HEADER_PUGICONFIG_HPP 15 | #define HEADER_PUGICONFIG_HPP 16 | 17 | // Uncomment this to enable wchar_t mode 18 | //#define PUGIXML_WCHAR_MODE 19 | 20 | // Uncomment this to disable XPath 21 | // #define PUGIXML_NO_XPATH 22 | 23 | // Uncomment this to disable STL 24 | // #define PUGIXML_NO_STL 25 | 26 | // Uncomment this to disable exceptions 27 | // #define PUGIXML_NO_EXCEPTIONS 28 | 29 | // Set this to control attributes for public classes/functions, i.e.: 30 | // #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL 31 | // #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL 32 | // #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall 33 | // In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead 34 | 35 | // Header-only mode is switched ON here (the two lines below are active), so including pugixml.hpp also pulls in pugixml.cpp; comment both lines out to switch header-only mode off 36 | #define PUGIXML_HEADER_ONLY 37 | #include "pugixml.cpp" 38 | 39 | // Tune these constants to adjust memory-related behavior 40 | // #define PUGIXML_MEMORY_PAGE_SIZE 32768 41 | // #define PUGIXML_MEMORY_OUTPUT_STACK 10240 42 | // #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096 43 | 44 | #endif 45 | 46 | /** 47 | * Copyright (c) 2006-2012 Arseny Kapoulkine 48 | * 49 | * Permission is hereby granted, free of charge, to any person 50 | * obtaining a copy of this software and associated documentation 51 | * files (the "Software"), to deal in the Software without 52 | * restriction, including without limitation the rights to use, 53 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 54 | * copies of the Software, and to permit persons to whom the 55 | * Software is furnished to do so, subject to the following 56 | * conditions: 57 | * 58 | * The above copyright notice and this permission notice shall be 59 | * 
included in all copies or substantial portions of the Software. 60 | * 61 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 62 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 63 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 64 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 65 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 66 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 67 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 68 | * OTHER DEALINGS IN THE SOFTWARE. 69 | */ 70 | -------------------------------------------------------------------------------- /src/pugixml/pugixml.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * pugixml parser - version 1.2 3 | * -------------------------------------------------------- 4 | * Copyright (C) 2006-2012, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) 5 | * Report bugs and download new versions at http://pugixml.org/ 6 | * 7 | * This library is distributed under the MIT License. See notice at the end 8 | * of this file. 
9 | * 10 | * This work is based on the pugxml parser, which is: 11 | * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) 12 | */ 13 | 14 | #ifndef PUGIXML_VERSION 15 | // Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons 16 | # define PUGIXML_VERSION 120 17 | #endif 18 | 19 | // Include user configuration file (this can define various configuration macros) 20 | #include "pugiconfig.hpp" 21 | 22 | #ifndef HEADER_PUGIXML_HPP 23 | #define HEADER_PUGIXML_HPP 24 | 25 | // Include stddef.h for size_t and ptrdiff_t 26 | #include <stddef.h> 27 | 28 | // Include exception header for XPath 29 | #if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS) 30 | # include <exception> 31 | #endif 32 | 33 | // Include STL headers (iterator tags, forward-declared streams, std::basic_string) 34 | #ifndef PUGIXML_NO_STL 35 | # include <iterator> 36 | # include <iosfwd> 37 | # include <string> 38 | #endif 39 | 40 | // Macro for deprecated features 41 | #ifndef PUGIXML_DEPRECATED 42 | # if defined(__GNUC__) 43 | # define PUGIXML_DEPRECATED __attribute__((deprecated)) 44 | # elif defined(_MSC_VER) && _MSC_VER >= 1300 45 | # define PUGIXML_DEPRECATED __declspec(deprecated) 46 | # else 47 | # define PUGIXML_DEPRECATED 48 | # endif 49 | #endif 50 | 51 | // If no API is defined, assume default 52 | #ifndef PUGIXML_API 53 | # define PUGIXML_API 54 | #endif 55 | 56 | // If no API for classes is defined, assume default 57 | #ifndef PUGIXML_CLASS 58 | # define PUGIXML_CLASS PUGIXML_API 59 | #endif 60 | 61 | // If no API for functions is defined, assume default 62 | #ifndef PUGIXML_FUNCTION 63 | # define PUGIXML_FUNCTION PUGIXML_API 64 | #endif 65 | 66 | // Character interface macros 67 | #ifdef PUGIXML_WCHAR_MODE 68 | # define PUGIXML_TEXT(t) L ## t 69 | # define PUGIXML_CHAR wchar_t 70 | #else 71 | # define PUGIXML_TEXT(t) t 72 | # define PUGIXML_CHAR char 73 | #endif 74 | 75 | namespace pugi 76 | { 77 | // Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE 78 | typedef PUGIXML_CHAR char_t; 79 | 80 
| #ifndef PUGIXML_NO_STL 81 | // String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE 82 | typedef std::basic_string<PUGIXML_CHAR, std::char_traits<PUGIXML_CHAR>, std::allocator<PUGIXML_CHAR> > string_t; 83 | #endif 84 | } 85 | 86 | // The PugiXML namespace 87 | namespace pugi 88 | { 89 | // Tree node types 90 | enum xml_node_type 91 | { 92 | node_null, // Empty (null) node handle 93 | node_document, // A document tree's absolute root 94 | node_element, // Element tag, i.e. '<node/>' 95 | node_pcdata, // Plain character data, i.e. 'text' 96 | node_cdata, // Character data, i.e. '<![CDATA[text]]>' 97 | node_comment, // Comment tag, i.e. '<!-- text -->' 98 | node_pi, // Processing instruction, i.e. '<?name?>' 99 | node_declaration, // Document declaration, i.e. '<?xml version="1.0"?>' 100 | node_doctype // Document type declaration, i.e. '<!DOCTYPE doc>' 101 | }; 102 | 103 | // Parsing options 104 | 105 | // Minimal parsing mode (equivalent to turning all other flags off). 106 | // Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed. 107 | const unsigned int parse_minimal = 0x0000; 108 | 109 | // This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default. 110 | const unsigned int parse_pi = 0x0001; 111 | 112 | // This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default. 113 | const unsigned int parse_comments = 0x0002; 114 | 115 | // This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default. 116 | const unsigned int parse_cdata = 0x0004; 117 | 118 | // This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree. 119 | // This flag is off by default; turning it on usually results in slower parsing and more memory consumption. 120 | const unsigned int parse_ws_pcdata = 0x0008; 121 | 122 | // This flag determines if character and entity references are expanded during parsing. This flag is on by default. 
123 | const unsigned int parse_escapes = 0x0010; 124 | 125 | // This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default. 126 | const unsigned int parse_eol = 0x0020; 127 | 128 | // This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default. 129 | const unsigned int parse_wconv_attribute = 0x0040; 130 | 131 | // This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default. 132 | const unsigned int parse_wnorm_attribute = 0x0080; 133 | 134 | // This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default. 135 | const unsigned int parse_declaration = 0x0100; 136 | 137 | // This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default. 138 | const unsigned int parse_doctype = 0x0200; 139 | 140 | // This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only 141 | // of whitespace is added to the DOM tree. 142 | // This flag is off by default; turning it on may result in slower parsing and more memory consumption. 143 | const unsigned int parse_ws_pcdata_single = 0x0400; 144 | 145 | // The default parsing mode. 146 | // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, 147 | // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. 148 | const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol; 149 | 150 | // The full parsing mode. 151 | // Nodes of all types are added to the DOM tree, character/reference entities are expanded, 152 | // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. 
153 | const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype; 154 | 155 | // These flags determine the encoding of input data for XML document 156 | enum xml_encoding 157 | { 158 | encoding_auto, // Auto-detect input encoding using BOM or < / class xml_object_range 217 | { 218 | public: 219 | typedef It const_iterator; 220 | 221 | xml_object_range(It b, It e): _begin(b), _end(e) 222 | { 223 | } 224 | 225 | It begin() const { return _begin; } 226 | It end() const { return _end; } 227 | 228 | private: 229 | It _begin, _end; 230 | }; 231 | 232 | // Writer interface for node printing (see xml_node::print) 233 | class PUGIXML_CLASS xml_writer 234 | { 235 | public: 236 | virtual ~xml_writer() {} 237 | 238 | // Write memory chunk into stream/file/whatever 239 | virtual void write(const void* data, size_t size) = 0; 240 | }; 241 | 242 | // xml_writer implementation for FILE* 243 | class PUGIXML_CLASS xml_writer_file: public xml_writer 244 | { 245 | public: 246 | // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio 247 | xml_writer_file(void* file); 248 | 249 | virtual void write(const void* data, size_t size); 250 | 251 | private: 252 | void* file; 253 | }; 254 | 255 | #ifndef PUGIXML_NO_STL 256 | // xml_writer implementation for streams 257 | class PUGIXML_CLASS xml_writer_stream: public xml_writer 258 | { 259 | public: 260 | // Construct writer from an output stream object 261 | xml_writer_stream(std::basic_ostream >& stream); 262 | xml_writer_stream(std::basic_ostream >& stream); 263 | 264 | virtual void write(const void* data, size_t size); 265 | 266 | private: 267 | std::basic_ostream >* narrow_stream; 268 | std::basic_ostream >* wide_stream; 269 | }; 270 | #endif 271 | 272 | // A light-weight handle for manipulating attributes in DOM tree 273 | class PUGIXML_CLASS xml_attribute 274 | { 275 | friend class xml_attribute_iterator; 276 | friend class xml_node; 277 | 278 | 
private: 279 | xml_attribute_struct* _attr; 280 | 281 | typedef void (*unspecified_bool_type)(xml_attribute***); 282 | 283 | public: 284 | // Default constructor. Constructs an empty attribute. 285 | xml_attribute(); 286 | 287 | // Constructs attribute from internal pointer 288 | explicit xml_attribute(xml_attribute_struct* attr); 289 | 290 | // Safe bool conversion operator 291 | operator unspecified_bool_type() const; 292 | 293 | // Borland C++ workaround 294 | bool operator!() const; 295 | 296 | // Comparison operators (compares wrapped attribute pointers) 297 | bool operator==(const xml_attribute& r) const; 298 | bool operator!=(const xml_attribute& r) const; 299 | bool operator<(const xml_attribute& r) const; 300 | bool operator>(const xml_attribute& r) const; 301 | bool operator<=(const xml_attribute& r) const; 302 | bool operator>=(const xml_attribute& r) const; 303 | 304 | // Check if attribute is empty 305 | bool empty() const; 306 | 307 | // Get attribute name/value, or "" if attribute is empty 308 | const char_t* name() const; 309 | const char_t* value() const; 310 | 311 | // Get attribute value, or the default value if attribute is empty 312 | const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const; 313 | 314 | // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty 315 | int as_int(int def = 0) const; 316 | unsigned int as_uint(unsigned int def = 0) const; 317 | double as_double(double def = 0) const; 318 | float as_float(float def = 0) const; 319 | 320 | // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty 321 | bool as_bool(bool def = false) const; 322 | 323 | // Set attribute name/value (returns false if attribute is empty or there is not enough memory) 324 | bool set_name(const char_t* rhs); 325 | bool set_value(const char_t* rhs); 326 | 327 | // Set attribute value with type conversion (numbers are converted to 
strings, boolean is converted to "true"/"false") 328 | bool set_value(int rhs); 329 | bool set_value(unsigned int rhs); 330 | bool set_value(double rhs); 331 | bool set_value(bool rhs); 332 | 333 | // Set attribute value (equivalent to set_value without error checking) 334 | xml_attribute& operator=(const char_t* rhs); 335 | xml_attribute& operator=(int rhs); 336 | xml_attribute& operator=(unsigned int rhs); 337 | xml_attribute& operator=(double rhs); 338 | xml_attribute& operator=(bool rhs); 339 | 340 | // Get next/previous attribute in the attribute list of the parent node 341 | xml_attribute next_attribute() const; 342 | xml_attribute previous_attribute() const; 343 | 344 | // Get hash value (unique for handles to the same object) 345 | size_t hash_value() const; 346 | 347 | // Get internal pointer 348 | xml_attribute_struct* internal_object() const; 349 | }; 350 | 351 | #ifdef __BORLANDC__ 352 | // Borland C++ workaround 353 | bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs); 354 | bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs); 355 | #endif 356 | 357 | // A light-weight handle for manipulating nodes in DOM tree 358 | class PUGIXML_CLASS xml_node 359 | { 360 | friend class xml_attribute_iterator; 361 | friend class xml_node_iterator; 362 | friend class xml_named_node_iterator; 363 | 364 | protected: 365 | xml_node_struct* _root; 366 | 367 | typedef void (*unspecified_bool_type)(xml_node***); 368 | 369 | public: 370 | // Default constructor. Constructs an empty node. 
371 | xml_node(); 372 | 373 | // Constructs node from internal pointer 374 | explicit xml_node(xml_node_struct* p); 375 | 376 | // Safe bool conversion operator 377 | operator unspecified_bool_type() const; 378 | 379 | // Borland C++ workaround 380 | bool operator!() const; 381 | 382 | // Comparison operators (compares wrapped node pointers) 383 | bool operator==(const xml_node& r) const; 384 | bool operator!=(const xml_node& r) const; 385 | bool operator<(const xml_node& r) const; 386 | bool operator>(const xml_node& r) const; 387 | bool operator<=(const xml_node& r) const; 388 | bool operator>=(const xml_node& r) const; 389 | 390 | // Check if node is empty. 391 | bool empty() const; 392 | 393 | // Get node type 394 | xml_node_type type() const; 395 | 396 | // Get node name/value, or "" if node is empty or it has no name/value 397 | const char_t* name() const; 398 | const char_t* value() const; 399 | 400 | // Get attribute list 401 | xml_attribute first_attribute() const; 402 | xml_attribute last_attribute() const; 403 | 404 | // Get children list 405 | xml_node first_child() const; 406 | xml_node last_child() const; 407 | 408 | // Get next/previous sibling in the children list of the parent node 409 | xml_node next_sibling() const; 410 | xml_node previous_sibling() const; 411 | 412 | // Get parent node 413 | xml_node parent() const; 414 | 415 | // Get root of DOM tree this node belongs to 416 | xml_node root() const; 417 | 418 | // Get text object for the current node 419 | xml_text text() const; 420 | 421 | // Get child, attribute or next/previous sibling with the specified name 422 | xml_node child(const char_t* name) const; 423 | xml_attribute attribute(const char_t* name) const; 424 | xml_node next_sibling(const char_t* name) const; 425 | xml_node previous_sibling(const char_t* name) const; 426 | 427 | // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA 428 | const char_t* child_value() const; 429 | 430 | // Get 
child value of child with specified name. Equivalent to child(name).child_value(). 431 | const char_t* child_value(const char_t* name) const; 432 | 433 | // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value) 434 | bool set_name(const char_t* rhs); 435 | bool set_value(const char_t* rhs); 436 | 437 | // Add attribute with specified name. Returns added attribute, or empty attribute on errors. 438 | xml_attribute append_attribute(const char_t* name); 439 | xml_attribute prepend_attribute(const char_t* name); 440 | xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr); 441 | xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr); 442 | 443 | // Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors. 444 | xml_attribute append_copy(const xml_attribute& proto); 445 | xml_attribute prepend_copy(const xml_attribute& proto); 446 | xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr); 447 | xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr); 448 | 449 | // Add child node with specified type. Returns added node, or empty node on errors. 450 | xml_node append_child(xml_node_type type = node_element); 451 | xml_node prepend_child(xml_node_type type = node_element); 452 | xml_node insert_child_after(xml_node_type type, const xml_node& node); 453 | xml_node insert_child_before(xml_node_type type, const xml_node& node); 454 | 455 | // Add child element with specified name. Returns added node, or empty node on errors. 456 | xml_node append_child(const char_t* name); 457 | xml_node prepend_child(const char_t* name); 458 | xml_node insert_child_after(const char_t* name, const xml_node& node); 459 | xml_node insert_child_before(const char_t* name, const xml_node& node); 460 | 461 | // Add a copy of the specified node as a child. 
Returns added node, or empty node on errors. 462 | xml_node append_copy(const xml_node& proto); 463 | xml_node prepend_copy(const xml_node& proto); 464 | xml_node insert_copy_after(const xml_node& proto, const xml_node& node); 465 | xml_node insert_copy_before(const xml_node& proto, const xml_node& node); 466 | 467 | // Remove specified attribute 468 | bool remove_attribute(const xml_attribute& a); 469 | bool remove_attribute(const char_t* name); 470 | 471 | // Remove specified child 472 | bool remove_child(const xml_node& n); 473 | bool remove_child(const char_t* name); 474 | 475 | // Find attribute using predicate. Returns first attribute for which predicate returned true, or an empty attribute if none matched or this node is empty. 476 | template <typename Predicate> xml_attribute find_attribute(Predicate pred) const 477 | { 478 | if (!_root) return xml_attribute(); 479 | 480 | for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute()) 481 | if (pred(attrib)) 482 | return attrib; 483 | 484 | return xml_attribute(); 485 | } 486 | 487 | // Find child node using predicate. Returns first direct child for which predicate returned true, or an empty node if none matched or this node is empty. 488 | template <typename Predicate> xml_node find_child(Predicate pred) const 489 | { 490 | if (!_root) return xml_node(); 491 | 492 | for (xml_node node = first_child(); node; node = node.next_sibling()) 493 | if (pred(node)) 494 | return node; 495 | 496 | return xml_node(); 497 | } 498 | 499 | // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true. 
500 | template <typename Predicate> xml_node find_node(Predicate pred) const 501 | { 502 | if (!_root) return xml_node(); 503 | // Walk starts at the first child, so this node itself is never passed to pred 504 | xml_node cur = first_child(); 505 | 506 | while (cur._root && cur._root != _root) 507 | { 508 | if (pred(cur)) return cur; 509 | // Descend first, then move sideways; otherwise climb until a sibling exists or we are back at this node 510 | if (cur.first_child()) cur = cur.first_child(); 511 | else if (cur.next_sibling()) cur = cur.next_sibling(); 512 | else 513 | { 514 | while (!cur.next_sibling() && cur._root != _root) cur = cur.parent(); 515 | 516 | if (cur._root != _root) cur = cur.next_sibling(); 517 | } 518 | } 519 | // Subtree exhausted without a match 520 | return xml_node(); 521 | } 522 | 523 | // Find child node by attribute name/value 524 | xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const; 525 | xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const; 526 | 527 | #ifndef PUGIXML_NO_STL 528 | // Get the absolute node path from root as a text string. 529 | string_t path(char_t delimiter = '/') const; 530 | #endif 531 | 532 | // Search for a node by path consisting of node names and . or .. elements. 533 | xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const; 534 | 535 | // Recursively traverse subtree with xml_tree_walker 536 | bool traverse(xml_tree_walker& walker); 537 | 538 | #ifndef PUGIXML_NO_XPATH 539 | // Select single node by evaluating XPath query. Returns first node from the resulting node set. 
540 | xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const; 541 | xpath_node select_single_node(const xpath_query& query) const; 542 | 543 | // Select node set by evaluating XPath query 544 | xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const; 545 | xpath_node_set select_nodes(const xpath_query& query) const; 546 | #endif 547 | 548 | // Print subtree using a writer object 549 | void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; 550 | 551 | #ifndef PUGIXML_NO_STL 552 | // Print subtree to stream (the narrow-stream overload takes an output encoding, the wide-stream overload does not) 553 | void print(std::basic_ostream<char, std::char_traits<char> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; 554 | void print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const; 555 | #endif 556 | 557 | // Child nodes iterators 558 | typedef xml_node_iterator iterator; 559 | 560 | iterator begin() const; 561 | iterator end() const; 562 | 563 | // Attribute iterators 564 | typedef xml_attribute_iterator attribute_iterator; 565 | 566 | attribute_iterator attributes_begin() const; 567 | attribute_iterator attributes_end() const; 568 | 569 | // Range-based for support (begin/end iterator pairs wrapped in xml_object_range) 570 | xml_object_range<xml_node_iterator> children() const; 571 | xml_object_range<xml_named_node_iterator> children(const char_t* name) const; 572 | xml_object_range<xml_attribute_iterator> attributes() const; 573 | 574 | // Get node offset in parsed file/string (in char_t units) for debugging purposes 575 | ptrdiff_t offset_debug() const; 576 | 577 | // Get hash value (unique for handles to the same object) 578 | size_t hash_value() const; 579 | 580 | // Get internal pointer 581 | xml_node_struct* internal_object() const; 582 | }; 583 | 584 | #ifdef __BORLANDC__ 585 | // Borland C++ workaround 586 | bool 
PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs); 587 | bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs); 588 | #endif 589 | 590 | // A helper for working with text inside PCDATA nodes 591 | class PUGIXML_CLASS xml_text 592 | { 593 | friend class xml_node; 594 | 595 | xml_node_struct* _root; 596 | 597 | typedef void (*unspecified_bool_type)(xml_text***); 598 | 599 | explicit xml_text(xml_node_struct* root); 600 | 601 | xml_node_struct* _data_new(); 602 | xml_node_struct* _data() const; 603 | 604 | public: 605 | // Default constructor. Constructs an empty object. 606 | xml_text(); 607 | 608 | // Safe bool conversion operator 609 | operator unspecified_bool_type() const; 610 | 611 | // Borland C++ workaround 612 | bool operator!() const; 613 | 614 | // Check if text object is empty 615 | bool empty() const; 616 | 617 | // Get text, or "" if object is empty 618 | const char_t* get() const; 619 | 620 | // Get text, or the default value if object is empty 621 | const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const; 622 | 623 | // Get text as a number, or the default value if conversion did not succeed or object is empty 624 | int as_int(int def = 0) const; 625 | unsigned int as_uint(unsigned int def = 0) const; 626 | double as_double(double def = 0) const; 627 | float as_float(float def = 0) const; 628 | 629 | // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty 630 | bool as_bool(bool def = false) const; 631 | 632 | // Set text (returns false if object is empty or there is not enough memory) 633 | bool set(const char_t* rhs); 634 | 635 | // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") 636 | bool set(int rhs); 637 | bool set(unsigned int rhs); 638 | bool set(double rhs); 639 | bool set(bool rhs); 640 | 641 | // Set text (equivalent to set without error checking) 642 | xml_text& operator=(const char_t* rhs); 643 | 
xml_text& operator=(int rhs); 644 | xml_text& operator=(unsigned int rhs); 645 | xml_text& operator=(double rhs); 646 | xml_text& operator=(bool rhs); 647 | 648 | // Get the data node (node_pcdata or node_cdata) for this object 649 | xml_node data() const; 650 | }; 651 | 652 | #ifdef __BORLANDC__ 653 | // Borland C++ workaround 654 | bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs); 655 | bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs); 656 | #endif 657 | 658 | // Child node iterator (a bidirectional iterator over a collection of xml_node) 659 | class PUGIXML_CLASS xml_node_iterator 660 | { 661 | friend class xml_node; 662 | 663 | private: 664 | mutable xml_node _wrap; 665 | xml_node _parent; 666 | 667 | xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent); 668 | 669 | public: 670 | // Iterator traits 671 | typedef ptrdiff_t difference_type; 672 | typedef xml_node value_type; 673 | typedef xml_node* pointer; 674 | typedef xml_node& reference; 675 | 676 | #ifndef PUGIXML_NO_STL 677 | typedef std::bidirectional_iterator_tag iterator_category; 678 | #endif 679 | 680 | // Default constructor 681 | xml_node_iterator(); 682 | 683 | // Construct an iterator which points to the specified node 684 | xml_node_iterator(const xml_node& node); 685 | 686 | // Iterator operators 687 | bool operator==(const xml_node_iterator& rhs) const; 688 | bool operator!=(const xml_node_iterator& rhs) const; 689 | 690 | xml_node& operator*() const; 691 | xml_node* operator->() const; 692 | 693 | const xml_node_iterator& operator++(); 694 | xml_node_iterator operator++(int); 695 | 696 | const xml_node_iterator& operator--(); 697 | xml_node_iterator operator--(int); 698 | }; 699 | 700 | // Attribute iterator (a bidirectional iterator over a collection of xml_attribute) 701 | class PUGIXML_CLASS xml_attribute_iterator 702 | { 703 | friend class xml_node; 704 | 705 | private: 706 | mutable xml_attribute _wrap; 707 | xml_node _parent; 708 | 709 | 
xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent); 710 | 711 | public: 712 | // Iterator traits 713 | typedef ptrdiff_t difference_type; 714 | typedef xml_attribute value_type; 715 | typedef xml_attribute* pointer; 716 | typedef xml_attribute& reference; 717 | 718 | #ifndef PUGIXML_NO_STL 719 | typedef std::bidirectional_iterator_tag iterator_category; 720 | #endif 721 | 722 | // Default constructor 723 | xml_attribute_iterator(); 724 | 725 | // Construct an iterator which points to the specified attribute 726 | xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent); 727 | 728 | // Iterator operators 729 | bool operator==(const xml_attribute_iterator& rhs) const; 730 | bool operator!=(const xml_attribute_iterator& rhs) const; 731 | 732 | xml_attribute& operator*() const; 733 | xml_attribute* operator->() const; 734 | 735 | const xml_attribute_iterator& operator++(); 736 | xml_attribute_iterator operator++(int); 737 | 738 | const xml_attribute_iterator& operator--(); 739 | xml_attribute_iterator operator--(int); 740 | }; 741 | 742 | // Named node range helper 743 | class xml_named_node_iterator 744 | { 745 | public: 746 | // Iterator traits 747 | typedef ptrdiff_t difference_type; 748 | typedef xml_node value_type; 749 | typedef xml_node* pointer; 750 | typedef xml_node& reference; 751 | 752 | #ifndef PUGIXML_NO_STL 753 | typedef std::forward_iterator_tag iterator_category; 754 | #endif 755 | 756 | // Default constructor 757 | xml_named_node_iterator(); 758 | 759 | // Construct an iterator which points to the specified node 760 | xml_named_node_iterator(const xml_node& node, const char_t* name); 761 | 762 | // Iterator operators 763 | bool operator==(const xml_named_node_iterator& rhs) const; 764 | bool operator!=(const xml_named_node_iterator& rhs) const; 765 | 766 | xml_node& operator*() const; 767 | xml_node* operator->() const; 768 | 769 | const xml_named_node_iterator& operator++(); 770 | xml_named_node_iterator 
operator++(int); 771 | 772 | private: 773 | mutable xml_node _node; 774 | const char_t* _name; 775 | }; 776 | 777 | // Abstract tree walker class (see xml_node::traverse) 778 | class PUGIXML_CLASS xml_tree_walker 779 | { 780 | friend class xml_node; 781 | 782 | private: 783 | int _depth; 784 | 785 | protected: 786 | // Get current traversal depth 787 | int depth() const; 788 | 789 | public: 790 | xml_tree_walker(); 791 | virtual ~xml_tree_walker(); 792 | 793 | // Callback that is called when traversal begins 794 | virtual bool begin(xml_node& node); 795 | 796 | // Callback that is called for each node traversed 797 | virtual bool for_each(xml_node& node) = 0; 798 | 799 | // Callback that is called when traversal ends 800 | virtual bool end(xml_node& node); 801 | }; 802 | 803 | // Parsing status, returned as part of xml_parse_result object 804 | enum xml_parse_status 805 | { 806 | status_ok = 0, // No error 807 | 808 | status_file_not_found, // File was not found during load_file() 809 | status_io_error, // Error reading from file/stream 810 | status_out_of_memory, // Could not allocate memory 811 | status_internal_error, // Internal error occurred 812 | 813 | status_unrecognized_tag, // Parser could not determine tag type 814 | 815 | status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction 816 | status_bad_comment, // Parsing error occurred while parsing comment 817 | status_bad_cdata, // Parsing error occurred while parsing CDATA section 818 | status_bad_doctype, // Parsing error occurred while parsing document type declaration 819 | status_bad_pcdata, // Parsing error occurred while parsing PCDATA section 820 | status_bad_start_element, // Parsing error occurred while parsing start element tag 821 | status_bad_attribute, // Parsing error occurred while parsing element attribute 822 | status_bad_end_element, // Parsing error occurred while parsing end element tag 823 | status_end_element_mismatch // There was a mismatch of 
start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag) 824 | }; 825 | 826 | // Parsing result 827 | struct PUGIXML_CLASS xml_parse_result 828 | { 829 | // Parsing status (see xml_parse_status) 830 | xml_parse_status status; 831 | 832 | // Last parsed offset (in char_t units from start of input data) 833 | ptrdiff_t offset; 834 | 835 | // Source document encoding 836 | xml_encoding encoding; 837 | 838 | // Default constructor, initializes object to failed state 839 | xml_parse_result(); 840 | 841 | // Cast to bool operator 842 | operator bool() const; 843 | 844 | // Get error description 845 | const char* description() const; 846 | }; 847 | 848 | // Document class (DOM tree root) 849 | class PUGIXML_CLASS xml_document: public xml_node 850 | { 851 | private: 852 | char_t* _buffer; 853 | 854 | char _memory[192]; 855 | 856 | // Non-copyable semantics 857 | xml_document(const xml_document&); 858 | const xml_document& operator=(const xml_document&); 859 | 860 | void create(); 861 | void destroy(); 862 | 863 | xml_parse_result load_buffer_impl(void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own); 864 | 865 | public: 866 | // Default constructor, makes empty document 867 | xml_document(); 868 | 869 | // Destructor, invalidates all node/attribute handles to this document 870 | ~xml_document(); 871 | 872 | // Removes all nodes, leaving the empty document 873 | void reset(); 874 | 875 | // Removes all nodes, then copies the entire contents of the specified document 876 | void reset(const xml_document& proto); 877 | 878 | #ifndef PUGIXML_NO_STL 879 | // Load document from stream. 
880 | xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 881 | xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default); 882 | #endif 883 | 884 | // Load document from zero-terminated string. No encoding conversions are applied. 885 | xml_parse_result load(const char_t* contents, unsigned int options = parse_default); 886 | 887 | // Load document from file 888 | xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 889 | xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 890 | 891 | // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns. 892 | xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 893 | 894 | // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data). 895 | // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed. 896 | xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 897 | 898 | // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data). 899 | // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore). 
900 | xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); 901 | 902 | // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details). 903 | void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; 904 | 905 | #ifndef PUGIXML_NO_STL 906 | // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details). 907 | void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; 908 | void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const; 909 | #endif 910 | 911 | // Save XML to file 912 | bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; 913 | bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; 914 | 915 | // Get document element 916 | xml_node document_element() const; 917 | }; 918 | 919 | #ifndef PUGIXML_NO_XPATH 920 | // XPath query return type 921 | enum xpath_value_type 922 | { 923 | xpath_type_none, // Unknown type (query failed to compile) 924 | xpath_type_node_set, // Node set (xpath_node_set) 925 | xpath_type_number, // Number 926 | xpath_type_string, // String 927 | xpath_type_boolean // Boolean 928 | }; 929 | 930 | // XPath parsing result 931 | struct PUGIXML_CLASS xpath_parse_result 932 | { 933 | // Error message (0 if no error) 934 | const char* error; 935 | 936 | // Last parsed offset (in char_t units from string start) 937 | ptrdiff_t offset; 938 | 939 | // Default constructor, 
initializes object to failed state 940 | xpath_parse_result(); 941 | 942 | // Cast to bool operator 943 | operator bool() const; 944 | 945 | // Get error description 946 | const char* description() const; 947 | }; 948 | 949 | // A single XPath variable 950 | class PUGIXML_CLASS xpath_variable 951 | { 952 | friend class xpath_variable_set; 953 | 954 | protected: 955 | xpath_value_type _type; 956 | xpath_variable* _next; 957 | 958 | xpath_variable(); 959 | 960 | // Non-copyable semantics 961 | xpath_variable(const xpath_variable&); 962 | xpath_variable& operator=(const xpath_variable&); 963 | 964 | public: 965 | // Get variable name 966 | const char_t* name() const; 967 | 968 | // Get variable type 969 | xpath_value_type type() const; 970 | 971 | // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error 972 | bool get_boolean() const; 973 | double get_number() const; 974 | const char_t* get_string() const; 975 | const xpath_node_set& get_node_set() const; 976 | 977 | // Set variable value; no type conversion is performed, false is returned on type mismatch error 978 | bool set(bool value); 979 | bool set(double value); 980 | bool set(const char_t* value); 981 | bool set(const xpath_node_set& value); 982 | }; 983 | 984 | // A set of XPath variables 985 | class PUGIXML_CLASS xpath_variable_set 986 | { 987 | private: 988 | xpath_variable* _data[64]; 989 | 990 | // Non-copyable semantics 991 | xpath_variable_set(const xpath_variable_set&); 992 | xpath_variable_set& operator=(const xpath_variable_set&); 993 | 994 | xpath_variable* find(const char_t* name) const; 995 | 996 | public: 997 | // Default constructor/destructor 998 | xpath_variable_set(); 999 | ~xpath_variable_set(); 1000 | 1001 | // Add a new variable or get the existing one, if the types match 1002 | xpath_variable* add(const char_t* name, xpath_value_type type); 1003 | 1004 | // Set value of an existing variable; no type 
conversion is performed, false is returned if there is no such variable or if types mismatch 1005 | bool set(const char_t* name, bool value); 1006 | bool set(const char_t* name, double value); 1007 | bool set(const char_t* name, const char_t* value); 1008 | bool set(const char_t* name, const xpath_node_set& value); 1009 | 1010 | // Get existing variable by name 1011 | xpath_variable* get(const char_t* name); 1012 | const xpath_variable* get(const char_t* name) const; 1013 | }; 1014 | 1015 | // A compiled XPath query object 1016 | class PUGIXML_CLASS xpath_query 1017 | { 1018 | private: 1019 | void* _impl; 1020 | xpath_parse_result _result; 1021 | 1022 | typedef void (*unspecified_bool_type)(xpath_query***); 1023 | 1024 | // Non-copyable semantics 1025 | xpath_query(const xpath_query&); 1026 | xpath_query& operator=(const xpath_query&); 1027 | 1028 | public: 1029 | // Construct a compiled object from XPath expression. 1030 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors. 1031 | explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0); 1032 | 1033 | // Destructor 1034 | ~xpath_query(); 1035 | 1036 | // Get query expression return type 1037 | xpath_value_type return_type() const; 1038 | 1039 | // Evaluate expression as boolean value in the specified context; performs type conversion if necessary. 1040 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. 1041 | bool evaluate_boolean(const xpath_node& n) const; 1042 | 1043 | // Evaluate expression as double value in the specified context; performs type conversion if necessary. 1044 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. 1045 | double evaluate_number(const xpath_node& n) const; 1046 | 1047 | #ifndef PUGIXML_NO_STL 1048 | // Evaluate expression as string value in the specified context; performs type conversion if necessary. 
1049 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. 1050 | string_t evaluate_string(const xpath_node& n) const; 1051 | #endif 1052 | 1053 | // Evaluate expression as string value in the specified context; performs type conversion if necessary. 1054 | // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero). 1055 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors. 1056 | // If PUGIXML_NO_EXCEPTIONS is defined, returns empty set instead. 1057 | size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const; 1058 | 1059 | // Evaluate expression as node set in the specified context. 1060 | // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors. 1061 | // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead. 1062 | xpath_node_set evaluate_node_set(const xpath_node& n) const; 1063 | 1064 | // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode) 1065 | const xpath_parse_result& result() const; 1066 | 1067 | // Safe bool conversion operator 1068 | operator unspecified_bool_type() const; 1069 | 1070 | // Borland C++ workaround 1071 | bool operator!() const; 1072 | }; 1073 | 1074 | #ifndef PUGIXML_NO_EXCEPTIONS 1075 | // XPath exception class 1076 | class PUGIXML_CLASS xpath_exception: public std::exception 1077 | { 1078 | private: 1079 | xpath_parse_result _result; 1080 | 1081 | public: 1082 | // Construct exception from parse result 1083 | explicit xpath_exception(const xpath_parse_result& result); 1084 | 1085 | // Get error message 1086 | virtual const char* what() const throw(); 1087 | 1088 | // Get parse result 1089 | const xpath_parse_result& result() const; 1090 | }; 1091 | #endif 1092 | 1093 | // XPath node class (either xml_node or xml_attribute) 1094 | class PUGIXML_CLASS 
xpath_node 1095 | { 1096 | private: 1097 | xml_node _node; 1098 | xml_attribute _attribute; 1099 | 1100 | typedef void (*unspecified_bool_type)(xpath_node***); 1101 | 1102 | public: 1103 | // Default constructor; constructs empty XPath node 1104 | xpath_node(); 1105 | 1106 | // Construct XPath node from XML node/attribute 1107 | xpath_node(const xml_node& node); 1108 | xpath_node(const xml_attribute& attribute, const xml_node& parent); 1109 | 1110 | // Get node/attribute, if any 1111 | xml_node node() const; 1112 | xml_attribute attribute() const; 1113 | 1114 | // Get parent of contained node/attribute 1115 | xml_node parent() const; 1116 | 1117 | // Safe bool conversion operator 1118 | operator unspecified_bool_type() const; 1119 | 1120 | // Borland C++ workaround 1121 | bool operator!() const; 1122 | 1123 | // Comparison operators 1124 | bool operator==(const xpath_node& n) const; 1125 | bool operator!=(const xpath_node& n) const; 1126 | }; 1127 | 1128 | #ifdef __BORLANDC__ 1129 | // Borland C++ workaround 1130 | bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs); 1131 | bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs); 1132 | #endif 1133 | 1134 | // A fixed-size collection of XPath nodes 1135 | class PUGIXML_CLASS xpath_node_set 1136 | { 1137 | public: 1138 | // Collection type 1139 | enum type_t 1140 | { 1141 | type_unsorted, // Not ordered 1142 | type_sorted, // Sorted by document order (ascending) 1143 | type_sorted_reverse // Sorted by document order (descending) 1144 | }; 1145 | 1146 | // Constant iterator type 1147 | typedef const xpath_node* const_iterator; 1148 | 1149 | // Default constructor. Constructs empty set. 
1150 | xpath_node_set(); 1151 | 1152 | // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful 1153 | xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted); 1154 | 1155 | // Destructor 1156 | ~xpath_node_set(); 1157 | 1158 | // Copy constructor/assignment operator 1159 | xpath_node_set(const xpath_node_set& ns); 1160 | xpath_node_set& operator=(const xpath_node_set& ns); 1161 | 1162 | // Get collection type 1163 | type_t type() const; 1164 | 1165 | // Get collection size 1166 | size_t size() const; 1167 | 1168 | // Indexing operator 1169 | const xpath_node& operator[](size_t index) const; 1170 | 1171 | // Collection iterators 1172 | const_iterator begin() const; 1173 | const_iterator end() const; 1174 | 1175 | // Sort the collection in ascending/descending order by document order 1176 | void sort(bool reverse = false); 1177 | 1178 | // Get first node in the collection by document order 1179 | xpath_node first() const; 1180 | 1181 | // Check if collection is empty 1182 | bool empty() const; 1183 | 1184 | private: 1185 | type_t _type; 1186 | 1187 | xpath_node _storage; 1188 | 1189 | xpath_node* _begin; 1190 | xpath_node* _end; 1191 | 1192 | void _assign(const_iterator begin, const_iterator end); 1193 | }; 1194 | #endif 1195 | 1196 | #ifndef PUGIXML_NO_STL 1197 | // Convert wide string to UTF8 1198 | std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const wchar_t* str); 1199 | std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const std::basic_string, std::allocator >& str); 1200 | 1201 | // Convert UTF8 to wide string 1202 | std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const char* str); 1203 | std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const std::basic_string, std::allocator >& str); 1204 | #endif 1205 | 1206 | // Memory allocation function interface; returns pointer to allocated memory or NULL on 
failure 1207 | typedef void* (*allocation_function)(size_t size); 1208 | 1209 | // Memory deallocation function interface 1210 | typedef void (*deallocation_function)(void* ptr); 1211 | 1212 | // Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions. 1213 | void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate); 1214 | 1215 | // Get current memory management functions 1216 | allocation_function PUGIXML_FUNCTION get_memory_allocation_function(); 1217 | deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function(); 1218 | } 1219 | 1220 | #if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC)) 1221 | namespace std 1222 | { 1223 | // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier) 1224 | std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&); 1225 | std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&); 1226 | std::forward_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&); 1227 | } 1228 | #endif 1229 | 1230 | #if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC) 1231 | namespace std 1232 | { 1233 | // Workarounds for (non-standard) iterator category detection 1234 | std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&); 1235 | std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&); 1236 | std::forward_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&); 1237 | } 1238 | #endif 1239 | 1240 | #endif 1241 | 1242 | /** 1243 | * Copyright (c) 2006-2012 Arseny Kapoulkine 1244 | * 1245 | * Permission is hereby granted, free of charge, to any person 1246 | * obtaining a copy of this software and associated documentation 1247 | * 
files (the "Software"), to deal in the Software without 1248 | * restriction, including without limitation the rights to use, 1249 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 1250 | * copies of the Software, and to permit persons to whom the 1251 | * Software is furnished to do so, subject to the following 1252 | * conditions: 1253 | * 1254 | * The above copyright notice and this permission notice shall be 1255 | * included in all copies or substantial portions of the Software. 1256 | * 1257 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 1258 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 1259 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 1260 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 1261 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 1262 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 1263 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 1264 | * OTHER DEALINGS IN THE SOFTWARE. 
1265 | */ 1266 | -------------------------------------------------------------------------------- /src/spider_common.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_common.h -------------------------------------------------------------------------------- /src/spider_config.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "spider_utils.h" 3 | #include "spider_config.h" 4 | #include "spider_common.h" 5 | 6 | const char* kSubject[]={"control","storage","cookie", "mysql" ,NULL}; 7 | enum SUBJECT 8 | { 9 | CONTROL, 10 | STORAGE, 11 | COOKIE, 12 | MYSQL 13 | }; 14 | 15 | Spider_Config::Spider_Config() 16 | { 17 | load_history_=false; 18 | load_cookie_=false; 19 | load_dns_=false; 20 | 21 | storage_path_=""; 22 | dns_path_=""; 23 | cookie_path_=""; 24 | history_path_=""; 25 | } 26 | 27 | 28 | 29 | Spider_Config::~Spider_Config() 30 | { 31 | 32 | } 33 | 34 | int Spider_Config::load() 35 | { 36 | module_path_=GetExePath(); 37 | current_date_=get_date(); 38 | std::string config_path=module_path_+kConfigFileName; 39 | FILE* file=fopen(config_path.c_str(), "r"); 40 | if ( file!=NULL ) 41 | { 42 | SUBJECT subject; 43 | std::string site; 44 | char line[1024]; 45 | while (fgets(line,1024,file)) 46 | { 47 | if ( line[strlen(line)-1]=='\n' ) 48 | { 49 | line[strlen(line)-1]='\0'; 50 | } 51 | char* c=line; 52 | while (isspace(*c))c++; 53 | if ( *c=='[' ) 54 | { 55 | char sub[100]; 56 | int ret=prase_subject(line,sub); 57 | if ( ret==0 ) 58 | { 59 | int i=0; 60 | for (; kSubject[i]!=NULL ;i++) 61 | { 62 | if ( strcmp(sub,kSubject[i])==0 ) 63 | break; 64 | } 65 | subject=(SUBJECT)i; 66 | } 67 | } 68 | else if ( *c!='\0' ) 69 | { 70 | std::string key, value; 71 | int ret=prase_key_value(line, key,value); 72 | if ( ret==0 ) 73 | { 74 | if ( subject==CONTROL ) 75 | set_control(key,value); 76 
| else if ( subject==STORAGE ) 77 | set_storage(key,value); 78 | else if ( subject==COOKIE ) 79 | set_token(site,key,value); 80 | else if (subject==MYSQL) 81 | set_mysql(key, value); 82 | } 83 | } 84 | } 85 | } 86 | else 87 | { 88 | LLOG(L_ERROR,"open config file error."); 89 | return -1; 90 | } 91 | return 0; 92 | } 93 | 94 | int Spider_Config::prase_subject(char* line, char* subject) 95 | { 96 | int ret=-1; 97 | char* c=line; 98 | while (isspace(*c))c++; 99 | if ( *c=='[' ) 100 | { 101 | char* s=c; 102 | s++; 103 | c=line+strlen(line)-1; 104 | while (isspace(*c))c--; 105 | if ( *c==']' ) 106 | { 107 | ret=0; 108 | strncpy(subject, s, c-s); 109 | subject[c-s]='\0'; 110 | } 111 | } 112 | return ret; 113 | } 114 | 115 | 116 | int Spider_Config::prase_key_value(char* line, std::string& key, std::string& value) 117 | { 118 | int ret=-1; 119 | char* c=line; 120 | while(isspace(*c))c++; 121 | char* s=c; 122 | while (*c!='='&&*c!='\0')c++; 123 | if ( *c!='\0'&&*c=='=' ) 124 | { 125 | char* k=c; 126 | while(isspace(*k))k--; 127 | key.assign(s, k-s); 128 | 129 | s=line+strlen(line)-1; 130 | while ( isspace(*s) )s--; 131 | c++; 132 | while (isspace(*c))c++; 133 | 134 | if ( s>=c ) 135 | { 136 | value.assign(c, s+1-c); 137 | ret=0; 138 | } 139 | } 140 | return ret; 141 | } 142 | 143 | int Spider_Config::set_control(std::string& key, std::string& value) 144 | { 145 | if ( key=="loadhistory" ) 146 | { 147 | if ( stricmp(value.c_str(),"yes")==0 ) 148 | { 149 | load_history_=true; 150 | } 151 | if ( stricmp(value.c_str(),"no")==0 ) 152 | { 153 | load_history_=false; 154 | } 155 | } 156 | else if ( key=="loadcookie" ) 157 | { 158 | if ( stricmp(value.c_str(),"yes")==0 ) 159 | { 160 | load_cookie_=true; 161 | } 162 | if ( stricmp(value.c_str(),"no")==0 ) 163 | { 164 | load_cookie_=false; 165 | } 166 | } 167 | else if ( key=="loaddns" ) 168 | { 169 | if ( stricmp(value.c_str(),"yes")==0 ) 170 | { 171 | load_dns_=true; 172 | } 173 | if ( stricmp(value.c_str(),"no")==0 ) 174 | { 175 
| load_dns_=false; 176 | } 177 | } 178 | return 0; 179 | } 180 | 181 | int Spider_Config::set_storage(std::string& key, std::string& value) 182 | { 183 | if ( key=="storage_path" ) 184 | { 185 | storage_path_=value; 186 | } 187 | else if ( key=="dns_path" ) 188 | { 189 | dns_path_=value; 190 | } 191 | else if ( key=="cookie_path" ) 192 | { 193 | cookie_path_=value; 194 | } 195 | else if ( key=="history_path" ) 196 | { 197 | history_path_=value; 198 | } 199 | return 0; 200 | } 201 | 202 | int Spider_Config::set_token(std::string& current_site, std::string& key, std::string& value) 203 | { 204 | if ( current_site.empty()&&key!="site" ) 205 | { 206 | return -1; 207 | } 208 | else 209 | { 210 | if ( key=="site" ) 211 | { 212 | current_site=value; 213 | Token* token=new Token; 214 | tokens_[current_site]=token; 215 | } 216 | else if ( key=="user" ) 217 | { 218 | Token* token=tokens_[current_site]; 219 | strcpy(token->account, value.c_str()); 220 | } 221 | else if ( key=="password") 222 | { 223 | Token* token=tokens_[current_site]; 224 | strcpy(token->password, value.c_str()); 225 | } 226 | } 227 | return 0; 228 | } 229 | 230 | 231 | int Spider_Config::set_mysql(std::string& key, std::string& value) 232 | { 233 | if ( key=="host" ) 234 | { 235 | mysql_host_=value; 236 | } 237 | else if ( key=="db" ) 238 | { 239 | mysql_db_=value; 240 | } 241 | else if ( key=="port" ) 242 | { 243 | mysql_port_=atoi(value.c_str()); 244 | } 245 | else if ( key=="user" ) 246 | { 247 | mysql_user_=value; 248 | } 249 | else if (key=="password") 250 | { 251 | mysql_password_=value; 252 | } 253 | 254 | return 0; 255 | } 256 | -------------------------------------------------------------------------------- /src/spider_config.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_config.h -------------------------------------------------------------------------------- 
/src/spider_cookie.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_cookie.cpp -------------------------------------------------------------------------------- /src/spider_cookie.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_cookie.h -------------------------------------------------------------------------------- /src/spider_database.cpp: -------------------------------------------------------------------------------- 1 | #include "spider_database.h" 2 | 3 | 4 | Spider_Database::Spider_Database() 5 | { 6 | m_exit=false; 7 | } 8 | 9 | Spider_Database::~Spider_Database() 10 | { 11 | 12 | } 13 | 14 | int Spider_Database::initialize() 15 | { 16 | if (!mysql_init(&m_mysql) ) 17 | { 18 | LLOG(L_ERROR,"mysql_init error."); 19 | return -1; 20 | } 21 | else 22 | { 23 | int ret=connect(); 24 | if ( ret==0 ) 25 | { 26 | LLOG(L_TRACE, "connect mysql ok."); 27 | } 28 | m_queue_mutex=recursivemutex_create(); 29 | m_thread=thread_create(NULL,0,(THREAD_FUN)thread_proc, this, 0,0); 30 | } 31 | return 0; 32 | } 33 | 34 | int Spider_Database::uninitialize() 35 | { 36 | m_exit=true; 37 | thread_waitforend(m_thread,INFINITE); 38 | recursivemutex_destory(m_queue_mutex); 39 | close(); 40 | return 0; 41 | } 42 | 43 | int Spider_Database::thread_aid() 44 | { 45 | while ( !m_exit ) 46 | { 47 | if ( m_commands.size()>0 ) 48 | { 49 | std::string command; 50 | { 51 | Recursive_Lock lock(m_queue_mutex); 52 | command=m_commands.front(); 53 | m_commands.pop(); 54 | } 55 | int ret=query(command.c_str()); 56 | if ( ret!=0 ) 57 | { 58 | LLOG(L_ERROR,"insert_record error. 
"); 59 | return -1; 60 | } 61 | } 62 | else 63 | { 64 | Sleep(500); 65 | } 66 | } 67 | return 0; 68 | } 69 | 70 | int Spider_Database::thread_proc(void* param) 71 | { 72 | Spider_Database* pthis=(Spider_Database*)param; 73 | if ( pthis!=NULL) 74 | { 75 | pthis->thread_aid(); 76 | } 77 | return 0; 78 | } 79 | 80 | int Spider_Database::insert_record(const char* website, const char* albums, UrlPtr url_ptr) 81 | { 82 | Recursive_Lock lock(m_queue_mutex); 83 | 84 | long len=strlen(url_ptr->comment); 85 | char* es_comment=new char[2*len+1]; 86 | len=mysql_real_escape_string(&m_mysql, es_comment , url_ptr->comment, len); 87 | if( len>1648 ) 88 | { 89 | LLOG(L_ERROR, "comment is too long."); 90 | return -1; 91 | } 92 | 93 | char command[2048]; 94 | sprintf(command, "INSERT INTO hd_paints (file_name,date_added,header,comment,source_url,parent_url,source_website) VALUES ('%s', NOW(), '%s', '%s', '%s', '%s' ,'%s')", 95 | url_ptr->filename, 96 | "", 97 | es_comment, 98 | url_ptr->url, 99 | url_ptr->parent, 100 | website ); 101 | 102 | delete[] es_comment; 103 | m_commands.push(std::string(command)); 104 | return 0; 105 | } 106 | 107 | int Spider_Database::connect() 108 | { 109 | Spider_Config::instance(); 110 | MYSQL* ret=mysql_real_connect(&m_mysql, 111 | Spider_Config::instance().mysql_host_.c_str(), 112 | Spider_Config::instance().mysql_user_.c_str(), 113 | Spider_Config::instance().mysql_password_.c_str(), 114 | Spider_Config::instance().mysql_db_.c_str(), 115 | Spider_Config::instance().mysql_port_,NULL,0); 116 | if ( ret==NULL ) 117 | { 118 | LLOG(L_ERROR,"mysql: connect server error!!"); 119 | return -1; 120 | } 121 | mysql_set_character_set(&m_mysql , "utf8"); 122 | return 0; 123 | } 124 | 125 | int Spider_Database::query(const char* command) 126 | { 127 | int ret=mysql_real_query(&m_mysql, command, strlen(command)); 128 | if ( ret !=0) 129 | { 130 | LLOG(L_ERROR, "query error, code: %s ", mysql_error(&m_mysql)); 131 | return -1; 132 | } 133 | return 0; 134 | } 135 | 
136 | int Spider_Database::close() 137 | { 138 | mysql_close(&m_mysql); 139 | return 0; 140 | } 141 | 142 | -------------------------------------------------------------------------------- /src/spider_database.h: -------------------------------------------------------------------------------- 1 | #ifndef __CROTON_SPIDER_DATABASE_H__ 2 | #define __CROTON_SPIDER_DATABASE_H__ 3 | #include "spider_utils.h" 4 | #include "spider_config.h" 5 | #include "spider_url.h" 6 | #include 7 | 8 | #ifdef WIN32 9 | #pragma comment(lib,"libmysql.lib") 10 | #endif 11 | 12 | class Spider_Database 13 | { 14 | public: 15 | Spider_Database(); 16 | ~Spider_Database(); 17 | 18 | int initialize(); 19 | int uninitialize(); 20 | 21 | int insert_record(const char* website, const char* albums, UrlPtr url_ptr); 22 | 23 | 24 | private: 25 | int thread_aid(); 26 | static int thread_proc(void* param); 27 | int connect(); 28 | int query(const char* command); 29 | int close(); 30 | 31 | handle_thread m_thread; 32 | handle_recursivemutex m_queue_mutex; 33 | std::queue m_commands; 34 | MYSQL m_mysql; 35 | 36 | volatile int m_exit; 37 | }; 38 | 39 | 40 | 41 | 42 | #endif 43 | 44 | -------------------------------------------------------------------------------- /src/spider_executor.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_executor.cpp -------------------------------------------------------------------------------- /src/spider_executor.h: -------------------------------------------------------------------------------- 1 | #ifndef __CROTON_SPIDER_EXECUTOR_H__ 2 | #define __CROTON_SPIDER_EXECUTOR_H__ 3 | #include "spider_url.h" 4 | #include "spider_thread_pool.h" 5 | 6 | class Spider_Executor 7 | { 8 | public: 9 | static Spider_Executor& instance() 10 | { 11 | static Spider_Executor _instance; 12 | return _instance; 13 | }; 14 | ~Spider_Executor(); 15 | int 
initialize(); 16 | int uninitialize(); 17 | 18 | int put_url(UrlPtr url_ptr); 19 | int put_urls(UrlPtrVec& url_ptrs); 20 | 21 | int execute_loop(); 22 | 23 | private: 24 | Spider_Executor(); 25 | int main_thread_aid(); 26 | static int main_thread(void* param); 27 | static int worker_work(void* param); 28 | 29 | handle_recursivemutex m_queue_mutex; 30 | std::queue m_task_queue; 31 | handle_thread m_thread_handle; 32 | handle_semaphore m_complete; 33 | 34 | Spider_Thread_Pool* m_thread_pool; 35 | 36 | #ifdef WIN32 37 | fd_set m_all_fdset; 38 | #else 39 | int m_epoll_fd; 40 | #endif 41 | 42 | volatile bool m_exit; 43 | }; 44 | 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /src/spider_http_client.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_http_client.cpp -------------------------------------------------------------------------------- /src/spider_http_client.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_http_client.h -------------------------------------------------------------------------------- /src/spider_main.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_main.cpp -------------------------------------------------------------------------------- /src/spider_md5.cpp: -------------------------------------------------------------------------------- 1 | #include "spider_md5.h" 2 | #include 3 | #include 4 | #include 5 | const int kBufferSize=1024*1024*5; 6 | 7 | unsigned char PADDING[]={0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 8 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 9 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 10 | 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; 11 | 12 | void MD5Init(MD5_CTX *context) 13 | { 14 | context->count[0] = 0; 15 | context->count[1] = 0; 16 | context->state[0] = 0x67452301; 17 | context->state[1] = 0xEFCDAB89; 18 | context->state[2] = 0x98BADCFE; 19 | context->state[3] = 0x10325476; 20 | } 21 | 22 | void MD5Update(MD5_CTX *context,unsigned char *input,unsigned int inputlen) 23 | { 24 | unsigned int i = 0,index = 0,partlen = 0; 25 | index = (context->count[0] >> 3) & 0x3F; 26 | partlen = 64 - index; 27 | context->count[0] += inputlen << 3; 28 | if(context->count[0] < (inputlen << 3)) 29 | context->count[1]++; 30 | context->count[1] += inputlen >> 29; 31 | 32 | if(inputlen >= partlen) 33 | { 34 | memcpy(&context->buffer[index],input,partlen); 35 | MD5Transform(context->state,context->buffer); 36 | for(i = partlen;i+64 <= inputlen;i+=64) 37 | MD5Transform(context->state,&input[i]); 38 | index = 0; 39 | } 40 | else 41 | { 42 | i = 0; 43 | } 44 | memcpy(&context->buffer[index],&input[i],inputlen-i); 45 | } 46 | 47 | void MD5Final(MD5_CTX *context,unsigned char* digest) 48 | { 49 | unsigned int index = 0,padlen = 0; 50 | unsigned char bits[8]; 51 | index = (context->count[0] >> 3) & 0x3F; 52 | padlen = (index < 56)?(56-index):(120-index); 53 | MD5Encode(bits,context->count,8); 54 | MD5Update(context,PADDING,padlen); 55 | MD5Update(context,bits,8); 56 | MD5Encode(digest,context->state,16); 57 | } 58 | 59 | void MD5Encode(unsigned char *output,unsigned int *input,unsigned int len) 60 | { 61 | unsigned int i = 0,j = 0; 62 | while(j < len) 63 | { 64 | output[j] = input[i] & 0xFF; 65 | output[j+1] = (input[i] >> 8) & 0xFF; 66 | output[j+2] = (input[i] >> 16) & 0xFF; 67 | output[j+3] = (input[i] >> 24) & 0xFF; 68 | i++; 69 | j+=4; 70 | } 71 | } 72 | 73 | void MD5Decode(unsigned int *output,unsigned char *input,unsigned int len) 74 | { 75 | unsigned int i = 0,j = 0; 76 | while(j < len) 77 | { 78 | output[i] = (input[j]) | 79 | (input[j+1] << 8) | 80 | (input[j+2] << 16) | 81 
| (input[j+3] << 24); 82 | i++; 83 | j+=4; 84 | } 85 | } 86 | 87 | void MD5Transform(unsigned int state[4],unsigned char block[64]) 88 | { 89 | unsigned int a = state[0]; 90 | unsigned int b = state[1]; 91 | unsigned int c = state[2]; 92 | unsigned int d = state[3]; 93 | unsigned int x[64]; 94 | MD5Decode(x,block,64); 95 | FF(a, b, c, d, x[ 0], 7, 0xd76aa478); /* 1 */ 96 | FF(d, a, b, c, x[ 1], 12, 0xe8c7b756); /* 2 */ 97 | FF(c, d, a, b, x[ 2], 17, 0x242070db); /* 3 */ 98 | FF(b, c, d, a, x[ 3], 22, 0xc1bdceee); /* 4 */ 99 | FF(a, b, c, d, x[ 4], 7, 0xf57c0faf); /* 5 */ 100 | FF(d, a, b, c, x[ 5], 12, 0x4787c62a); /* 6 */ 101 | FF(c, d, a, b, x[ 6], 17, 0xa8304613); /* 7 */ 102 | FF(b, c, d, a, x[ 7], 22, 0xfd469501); /* 8 */ 103 | FF(a, b, c, d, x[ 8], 7, 0x698098d8); /* 9 */ 104 | FF(d, a, b, c, x[ 9], 12, 0x8b44f7af); /* 10 */ 105 | FF(c, d, a, b, x[10], 17, 0xffff5bb1); /* 11 */ 106 | FF(b, c, d, a, x[11], 22, 0x895cd7be); /* 12 */ 107 | FF(a, b, c, d, x[12], 7, 0x6b901122); /* 13 */ 108 | FF(d, a, b, c, x[13], 12, 0xfd987193); /* 14 */ 109 | FF(c, d, a, b, x[14], 17, 0xa679438e); /* 15 */ 110 | FF(b, c, d, a, x[15], 22, 0x49b40821); /* 16 */ 111 | 112 | /* Round 2 */ 113 | GG(a, b, c, d, x[ 1], 5, 0xf61e2562); /* 17 */ 114 | GG(d, a, b, c, x[ 6], 9, 0xc040b340); /* 18 */ 115 | GG(c, d, a, b, x[11], 14, 0x265e5a51); /* 19 */ 116 | GG(b, c, d, a, x[ 0], 20, 0xe9b6c7aa); /* 20 */ 117 | GG(a, b, c, d, x[ 5], 5, 0xd62f105d); /* 21 */ 118 | GG(d, a, b, c, x[10], 9, 0x2441453); /* 22 */ 119 | GG(c, d, a, b, x[15], 14, 0xd8a1e681); /* 23 */ 120 | GG(b, c, d, a, x[ 4], 20, 0xe7d3fbc8); /* 24 */ 121 | GG(a, b, c, d, x[ 9], 5, 0x21e1cde6); /* 25 */ 122 | GG(d, a, b, c, x[14], 9, 0xc33707d6); /* 26 */ 123 | GG(c, d, a, b, x[ 3], 14, 0xf4d50d87); /* 27 */ 124 | GG(b, c, d, a, x[ 8], 20, 0x455a14ed); /* 28 */ 125 | GG(a, b, c, d, x[13], 5, 0xa9e3e905); /* 29 */ 126 | GG(d, a, b, c, x[ 2], 9, 0xfcefa3f8); /* 30 */ 127 | GG(c, d, a, b, x[ 7], 14, 0x676f02d9); /* 31 */ 128 | 
GG(b, c, d, a, x[12], 20, 0x8d2a4c8a); /* 32 */ 129 | 130 | /* Round 3 */ 131 | HH(a, b, c, d, x[ 5], 4, 0xfffa3942); /* 33 */ 132 | HH(d, a, b, c, x[ 8], 11, 0x8771f681); /* 34 */ 133 | HH(c, d, a, b, x[11], 16, 0x6d9d6122); /* 35 */ 134 | HH(b, c, d, a, x[14], 23, 0xfde5380c); /* 36 */ 135 | HH(a, b, c, d, x[ 1], 4, 0xa4beea44); /* 37 */ 136 | HH(d, a, b, c, x[ 4], 11, 0x4bdecfa9); /* 38 */ 137 | HH(c, d, a, b, x[ 7], 16, 0xf6bb4b60); /* 39 */ 138 | HH(b, c, d, a, x[10], 23, 0xbebfbc70); /* 40 */ 139 | HH(a, b, c, d, x[13], 4, 0x289b7ec6); /* 41 */ 140 | HH(d, a, b, c, x[ 0], 11, 0xeaa127fa); /* 42 */ 141 | HH(c, d, a, b, x[ 3], 16, 0xd4ef3085); /* 43 */ 142 | HH(b, c, d, a, x[ 6], 23, 0x4881d05); /* 44 */ 143 | HH(a, b, c, d, x[ 9], 4, 0xd9d4d039); /* 45 */ 144 | HH(d, a, b, c, x[12], 11, 0xe6db99e5); /* 46 */ 145 | HH(c, d, a, b, x[15], 16, 0x1fa27cf8); /* 47 */ 146 | HH(b, c, d, a, x[ 2], 23, 0xc4ac5665); /* 48 */ 147 | 148 | /* Round 4 */ 149 | II(a, b, c, d, x[ 0], 6, 0xf4292244); /* 49 */ 150 | II(d, a, b, c, x[ 7], 10, 0x432aff97); /* 50 */ 151 | II(c, d, a, b, x[14], 15, 0xab9423a7); /* 51 */ 152 | II(b, c, d, a, x[ 5], 21, 0xfc93a039); /* 52 */ 153 | II(a, b, c, d, x[12], 6, 0x655b59c3); /* 53 */ 154 | II(d, a, b, c, x[ 3], 10, 0x8f0ccc92); /* 54 */ 155 | II(c, d, a, b, x[10], 15, 0xffeff47d); /* 55 */ 156 | II(b, c, d, a, x[ 1], 21, 0x85845dd1); /* 56 */ 157 | II(a, b, c, d, x[ 8], 6, 0x6fa87e4f); /* 57 */ 158 | II(d, a, b, c, x[15], 10, 0xfe2ce6e0); /* 58 */ 159 | II(c, d, a, b, x[ 6], 15, 0xa3014314); /* 59 */ 160 | II(b, c, d, a, x[13], 21, 0x4e0811a1); /* 60 */ 161 | II(a, b, c, d, x[ 4], 6, 0xf7537e82); /* 61 */ 162 | II(d, a, b, c, x[11], 10, 0xbd3af235); /* 62 */ 163 | II(c, d, a, b, x[ 2], 15, 0x2ad7d2bb); /* 63 */ 164 | II(b, c, d, a, x[ 9], 21, 0xeb86d391); /* 64 */ 165 | state[0] += a; 166 | state[1] += b; 167 | state[2] += c; 168 | state[3] += d; 169 | } 170 | 171 | 172 | int Spider_MD5::get_file_md5( const char* filepath, char* filemd5 ) 
173 | { 174 | MD5_CTX md5; 175 | MD5Init(&md5); 176 | 177 | unsigned char* buffer=new unsigned char[kBufferSize]; 178 | FILE* file=fopen(filepath,"rb"); 179 | if ( file!=NULL ) 180 | { 181 | int length=0; 182 | while ( (length=fread(buffer,1, kBufferSize, file))>0 ) 183 | { 184 | MD5Update(&md5, buffer ,length ); 185 | } 186 | 187 | unsigned char digest[16]={0}; 188 | MD5Final(&md5, digest); 189 | 190 | char hex[3]; 191 | for(int i=0;i<16;i++) 192 | { 193 | sprintf(hex,"%02x",digest[i]); 194 | strcat(filemd5,hex ); 195 | } 196 | fclose(file); 197 | } 198 | 199 | delete[] buffer; 200 | return 0; 201 | } 202 | 203 | int Spider_MD5::get_file_md5(const char* buffer, uint length, char* filemd5) 204 | { 205 | MD5_CTX md5; 206 | MD5Init(&md5); 207 | 208 | MD5Update(&md5, (unsigned char*)buffer ,length ); 209 | 210 | unsigned char digest[16]={0}; 211 | MD5Final(&md5, digest); 212 | 213 | char hex[3]; 214 | for(int i=0;i<16;i++) 215 | { 216 | sprintf(hex,"%02x",digest[i]); 217 | strcat(filemd5,hex ); 218 | } 219 | return 0; 220 | } 221 | 222 | uint Spider_MD5::get_buffer_md5_code( const char* buffer ,int length, uint max ) 223 | { 224 | MD5_CTX md5; 225 | MD5Init(&md5); 226 | MD5Update(&md5, (unsigned char*)buffer ,length ); 227 | unsigned char digest[16]={0}; 228 | MD5Final(&md5, digest); 229 | 230 | uint code=md5.state[1]%max; //(md5.state[1]^md5.state[2]^md5.state[3]^md5.state[4])%max; 231 | 232 | return code; 233 | } 234 | 235 | 236 | -------------------------------------------------------------------------------- /src/spider_md5.h: -------------------------------------------------------------------------------- 1 | #ifndef _FILE_MD5_H_ 2 | #define _FILE_MD5_H_ 3 | 4 | #include "spider_common.h" 5 | 6 | typedef struct 7 | { 8 | unsigned int count[2]; 9 | unsigned int state[4]; 10 | unsigned char buffer[64]; 11 | }MD5_CTX; 12 | 13 | #define F(x,y,z) ((x & y) | (~x & z)) 14 | #define G(x,y,z) ((x & z) | (y & ~z)) 15 | #define H(x,y,z) (x^y^z) 16 | #define I(x,y,z) (y ^ (x 
| ~z)) 17 | #define ROTATE_LEFT(x,n) ((x << n) | (x >> (32-n))) 18 | #define FF(a,b,c,d,x,s,ac) \ 19 | { \ 20 | a += F(b,c,d) + x + ac; \ 21 | a = ROTATE_LEFT(a,s); \ 22 | a += b; \ 23 | } 24 | 25 | #define GG(a,b,c,d,x,s,ac) \ 26 | { \ 27 | a += G(b,c,d) + x + ac; \ 28 | a = ROTATE_LEFT(a,s); \ 29 | a += b; \ 30 | } 31 | 32 | #define HH(a,b,c,d,x,s,ac) \ 33 | { \ 34 | a += H(b,c,d) + x + ac; \ 35 | a = ROTATE_LEFT(a,s); \ 36 | a += b; \ 37 | } 38 | 39 | #define II(a,b,c,d,x,s,ac) \ 40 | { \ 41 | a += I(b,c,d) + x + ac; \ 42 | a = ROTATE_LEFT(a,s); \ 43 | a += b; \ 44 | } 45 | 46 | void MD5Init(MD5_CTX *context); 47 | void MD5Update(MD5_CTX *context,unsigned char *input,unsigned int inputlen); 48 | void MD5Final(MD5_CTX *context,unsigned char* digest); 49 | void MD5Transform(unsigned int state[4],unsigned char block[64]); 50 | void MD5Encode(unsigned char *output,unsigned int *input,unsigned int len); 51 | void MD5Decode(unsigned int *output,unsigned char *input,unsigned int len); 52 | 53 | 54 | class Spider_MD5 55 | { 56 | public: 57 | static int get_file_md5(const char* filepath, char* filemd5 ); 58 | static int get_file_md5(const char* buffer, uint length, char* filemd5); 59 | 60 | static uint get_buffer_md5_code(const char* buffer, int length, uint max=0xfffffffe); 61 | }; 62 | 63 | 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /src/spider_porting.cpp: -------------------------------------------------------------------------------- 1 | #include "spider_porting.h" 2 | 3 | #ifdef _DEBUG 4 | LOG_LEVEL g_LogLevel=L_DEBUG; 5 | #else 6 | LOG_LEVEL g_LogLevel=L_WARN; 7 | #endif 8 | 9 | 10 | #ifdef WIN32 11 | //thread function. 
12 | handle_thread thread_create(void* security, unsigned stack_size, THREAD_FUN start_address, void* arglist, unsigned initflag /*= 0*/, unsigned* thraddr /*= NULL*/) 13 | { 14 | return (handle_thread)_beginthreadex(security, stack_size, start_address, arglist, initflag, thraddr); 15 | } 16 | 17 | void thread_end(unsigned retval) 18 | { 19 | _endthreadex(retval); 20 | } 21 | 22 | int thread_waitforend(handle_thread hThread, unsigned long dwMilliseconds) 23 | { 24 | DWORD dwWaitResult = WaitForSingleObject(hThread, dwMilliseconds); 25 | switch (dwWaitResult) 26 | { 27 | case WAIT_OBJECT_0: 28 | return 0; 29 | case WAIT_TIMEOUT: 30 | case WAIT_ABANDONED: 31 | default: 32 | return -1; 33 | } 34 | 35 | } 36 | 37 | bool thread_close(handle_thread hThread) 38 | { 39 | if (hThread != NULL) 40 | { 41 | if (CloseHandle(hThread)) 42 | return true; 43 | else 44 | return false; 45 | } 46 | return true; 47 | } 48 | 49 | std::string GetMacAddress() 50 | { 51 | std::string macaddr = ""; 52 | 53 | IP_ADAPTER_INFO AdapterInfo[20]; // up to 20 NICs... 
54 | DWORD dwBufLen = sizeof(AdapterInfo); 55 | 56 | DWORD dwStatus = GetAdaptersInfo(AdapterInfo, &dwBufLen); 57 | if(dwStatus == ERROR_SUCCESS) 58 | { 59 | PIP_ADAPTER_INFO pAdapterInfo = AdapterInfo; 60 | 61 | while(pAdapterInfo) 62 | { 63 | if(pAdapterInfo->Type == MIB_IF_TYPE_LOOPBACK) 64 | { 65 | pAdapterInfo = pAdapterInfo->Next; 66 | continue; 67 | } 68 | 69 | if(strstr(pAdapterInfo->Description, "VMware") > 0 70 | || strstr(pAdapterInfo->Description, "Loopback")> 0 71 | || strstr(pAdapterInfo->Description, "Bluetooth")> 0 72 | || strstr(pAdapterInfo->Description, "Virtual")> 0 73 | ) 74 | { 75 | pAdapterInfo = pAdapterInfo->Next; 76 | continue; 77 | } 78 | 79 | char temp[10] = {0}; 80 | for(int i = 0 ; i < 6 ; i++ ) 81 | { 82 | macaddr += _itoa((int)pAdapterInfo->Address[i], temp, 16); 83 | } 84 | break; 85 | } 86 | } 87 | return macaddr; 88 | } 89 | 90 | handle_mutex mutex_create() 91 | { 92 | return CreateMutex( NULL, FALSE, NULL); 93 | } 94 | 95 | bool mutex_destroy(handle_mutex handle) 96 | { 97 | if (handle == NULL) 98 | { 99 | return true; 100 | } 101 | if (CloseHandle(handle)) 102 | { 103 | return true; 104 | } 105 | return false; 106 | } 107 | 108 | bool mutex_lock(handle_mutex handle) 109 | { 110 | bool bres = false; 111 | if (handle == NULL) 112 | { 113 | return true; 114 | } 115 | 116 | DWORD dwWaitResult = WaitForSingleObject( 117 | handle, // handle to mutex 118 | INFINITE); // five-second time-out interval 119 | 120 | switch (dwWaitResult) 121 | { 122 | // The thread got mutex ownership. 123 | case WAIT_OBJECT_0: 124 | bres = true; 125 | break; 126 | // Cannot get mutex ownership due to time-out. 127 | case WAIT_TIMEOUT: 128 | // Got ownership of the abandoned mutex object. 
129 | case WAIT_ABANDONED: 130 | break; 131 | } 132 | 133 | return bres; 134 | } 135 | 136 | bool mutex_unlock(handle_mutex handle) 137 | { 138 | if (handle == NULL) 139 | { 140 | return true; 141 | } 142 | if (ReleaseMutex(handle)) 143 | { 144 | handle=NULL; 145 | return true; 146 | } 147 | return false; 148 | } 149 | 150 | handle_recursivemutex recursivemutex_create() 151 | { 152 | CRITICAL_SECTION* pSect=new CRITICAL_SECTION(); 153 | InitializeCriticalSection(pSect); 154 | return pSect; 155 | } 156 | 157 | void recursivemutex_destory(handle_recursivemutex handle) 158 | { 159 | DeleteCriticalSection(handle); 160 | if (handle!=NULL) 161 | { 162 | delete handle; 163 | handle=NULL; 164 | } 165 | } 166 | 167 | void recursivemutex_lock(handle_recursivemutex handle) 168 | { 169 | EnterCriticalSection(handle); 170 | } 171 | 172 | void recursivemutex_unlock(handle_recursivemutex handle) 173 | { 174 | LeaveCriticalSection(handle); 175 | } 176 | 177 | handle_semaphore semaphore_create(long init_count, long max_count) 178 | { 179 | handle_semaphore sem=CreateSemaphore(NULL,init_count,max_count,NULL); 180 | return sem; 181 | } 182 | 183 | void semaphore_destory(handle_semaphore handle) 184 | { 185 | if (handle!=NULL) 186 | { 187 | CloseHandle(handle); 188 | handle=NULL; 189 | } 190 | } 191 | 192 | bool semaphore_wait(handle_semaphore handle) 193 | { 194 | bool bret=false; 195 | if ( handle!=NULL ) 196 | { 197 | WaitForSingleObject(handle,INFINITE); 198 | bret=true; 199 | } 200 | return bret; 201 | } 202 | 203 | bool semaphore_release(handle_semaphore handle) 204 | { 205 | return ReleaseSemaphore(handle,1,NULL); 206 | } 207 | 208 | 209 | int SetSockNoblock(int sock, int mode) 210 | { 211 | u_long mm=mode; 212 | return ioctlsocket(sock,FIONBIO , &mm ); 213 | } 214 | 215 | std::string GetExePath() 216 | { 217 | char cstr_path[MAX_PATH+1] = {0}; 218 | TCHAR path[MAX_PATH+1] = {0}; 219 | GetModuleFileName(NULL, path, MAX_PATH); 220 | 
WideCharToMultiByte(CP_ACP,0,path,MAX_PATH,cstr_path, MAX_PATH, NULL, 0); 221 | std::string full_path = cstr_path; 222 | int pos=full_path.find_last_of(PATH_SEPARATOR); 223 | return full_path.substr(0, pos+1); 224 | } 225 | 226 | 227 | #else 228 | 229 | //thread function. 230 | handle_thread thread_create(void* security, unsigned stack_size, THREAD_FUN start_address, void* arglist, unsigned initflag /*= 0*/, unsigned* thraddr /*= NULL*/) 231 | { 232 | pthread_attr_t attr; 233 | pthread_attr_init(&attr); 234 | pthread_attr_setstacksize(&attr, stack_size); 235 | 236 | handle_thread hThread = new (std::nothrow) pthread_t; 237 | if(0 == pthread_create(hThread, &attr, start_address, arglist)) 238 | { 239 | return hThread; 240 | } 241 | else 242 | { 243 | delete hThread; 244 | return NULL; 245 | } 246 | } 247 | 248 | void thread_end(unsigned retval) 249 | { 250 | pthread_exit(&retval); 251 | } 252 | 253 | int thread_waitforend(handle_thread hThread, unsigned long dwMilliseconds) 254 | { 255 | if(hThread == NULL) 256 | return 0; 257 | 258 | return pthread_join(*hThread, NULL); 259 | } 260 | 261 | bool thread_close(handle_thread hThread) 262 | { 263 | if (hThread != NULL) 264 | { 265 | delete hThread; 266 | } 267 | 268 | return true; 269 | } 270 | 271 | ulong GetTickCount() 272 | { 273 | timeval tv; 274 | gettimeofday(&tv, NULL); 275 | return tv.tv_sec*1000L+tv.tv_usec/1000L; 276 | } 277 | 278 | void Sleep(ulong millisecond) 279 | { 280 | usleep(millisecond*1000); 281 | } 282 | 283 | int closesocket(int sock) 284 | { 285 | return close(sock); 286 | } 287 | 288 | handle_mutex mutex_create() 289 | { 290 | handle_mutex handle = new (std::nothrow) pthread_mutex_t; 291 | if (pthread_mutex_init(handle, NULL) != 0) 292 | { 293 | delete handle; 294 | return NULL; 295 | } 296 | 297 | return handle; 298 | } 299 | 300 | bool mutex_destroy(handle_mutex handle) 301 | { 302 | if (handle == NULL) 303 | { 304 | return true; 305 | } 306 | 307 | int iRes = pthread_mutex_destroy(handle); 308 
| delete handle; 309 | 310 | if (iRes != 0) 311 | { 312 | return false; 313 | } 314 | 315 | return true; 316 | } 317 | 318 | bool mutex_lock(handle_mutex handle) 319 | { 320 | if (handle == NULL) 321 | { 322 | return true; 323 | } 324 | 325 | int iRes = pthread_mutex_lock(handle); 326 | 327 | if (iRes != 0) 328 | { 329 | return false; 330 | } 331 | 332 | return true; 333 | } 334 | 335 | bool mutex_unlock(handle_mutex handle) 336 | { 337 | if (handle == NULL) 338 | { 339 | return true; 340 | } 341 | 342 | int iRes = pthread_mutex_unlock(handle); 343 | 344 | if (iRes != 0) 345 | { 346 | return false; 347 | } 348 | 349 | return true; 350 | } 351 | 352 | handle_recursivemutex recursivemutex_create() 353 | { 354 | pthread_mutex_t* pMutex=new pthread_mutex_t(); 355 | pthread_mutexattr_t attr; 356 | pthread_mutexattr_init(&attr); 357 | pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE); 358 | pthread_mutex_init(pMutex, &attr); 359 | return pMutex; 360 | } 361 | 362 | void recursivemutex_destory(handle_recursivemutex handle) 363 | { 364 | if (handle!=NULL) 365 | { 366 | pthread_mutex_destroy(handle); 367 | delete handle; 368 | handle=NULL; 369 | } 370 | } 371 | 372 | void recursivemutex_lock(handle_recursivemutex handle) 373 | { 374 | if (handle!=NULL) 375 | { 376 | pthread_mutex_lock(handle); 377 | } 378 | } 379 | 380 | void recursivemutex_unlock(handle_recursivemutex handle) 381 | { 382 | if (handle!=NULL) 383 | { 384 | pthread_mutex_unlock(handle); 385 | } 386 | } 387 | 388 | handle_semaphore semaphore_create(long init_count, long max_count) 389 | { 390 | handle_semaphore sem=new sem_t; 391 | int ret=sem_init(sem, 0, init_count ); 392 | if ( ret==0 ) 393 | { 394 | return sem; 395 | } 396 | else 397 | { 398 | return NULL; 399 | } 400 | } 401 | 402 | void semaphore_destory(handle_semaphore handle) 403 | { 404 | if ( handle!=NULL ) 405 | { 406 | int ret=sem_destroy(handle); 407 | if ( ret!=0 ) 408 | { 409 | printf("semaphore_destory error. 
\n"); 410 | } 411 | delete handle; 412 | handle=NULL; 413 | } 414 | } 415 | 416 | bool semaphore_wait(handle_semaphore handle) 417 | { 418 | int bret=false; 419 | int ret=sem_wait(handle); 420 | if ( ret==0 ) 421 | { 422 | bret=true; 423 | } 424 | return bret; 425 | } 426 | 427 | bool semaphore_release(handle_semaphore handle) 428 | { 429 | int bret=false; 430 | int ret=sem_post(handle); 431 | if ( ret==0 ) 432 | { 433 | bret=true; 434 | } 435 | return bret; 436 | } 437 | 438 | 439 | int SetSockNoblock(int sock, int mode) 440 | { 441 | int ret=0; 442 | int flags = fcntl(sock, F_GETFL, 0); 443 | if ( mode==1 ) 444 | { 445 | ret=fcntl(sock, F_SETFL, flags|O_NONBLOCK); 446 | } 447 | else 448 | { 449 | ret=fcntl(sock,F_SETFL, flags|~O_NONBLOCK); 450 | } 451 | return ret; 452 | } 453 | 454 | std::string GetExePath() 455 | { 456 | std::string full_path; 457 | char pidexe[1024]={0}; 458 | snprintf(pidexe, sizeof(pidexe), "/proc/%u/exe", getpid()); 459 | int fd = open(pidexe, O_RDONLY); 460 | if(fd == -1) 461 | return full_path; 462 | 463 | char buf[1024]={0}; 464 | if(readlink(pidexe,buf, 1024)!=-1) 465 | { 466 | full_path = buf; 467 | } 468 | close(fd); 469 | 470 | int pos=full_path.find_last_of(PATH_SEPARATOR); 471 | return full_path.substr(0,pos+1); 472 | } 473 | 474 | 475 | #endif 476 | 477 | 478 | int create_dir(const char* szdir) 479 | { /** so perfect!! 
**/ 480 | std::string dir(szdir); 481 | std::string parent; 482 | int pos=dir.find_last_of(PATH_SEPARATOR); 483 | if ( (pos+1)!=dir.length()) 484 | { 485 | parent=dir.substr(0, pos+1); 486 | } 487 | else 488 | { 489 | pos=dir.find_last_of(PATH_SEPARATOR,pos-1); 490 | parent=dir.substr(0, pos+1); 491 | } 492 | 493 | if ( access(dir.c_str(), 0)!=0 ) 494 | { 495 | create_dir(parent.c_str()); 496 | #ifdef WIN32 497 | mkdir(dir.c_str()); 498 | #else 499 | mkdir(dir.c_str(),'0755'); 500 | #endif 501 | } 502 | else 503 | { 504 | return 0; 505 | } 506 | return 0; 507 | } 508 | 509 | 510 | void gettime(char* sztime) 511 | { 512 | time_t local_time = time(0); 513 | struct tm * newtime = localtime(&local_time); 514 | sprintf(sztime, "%.19s : ", asctime(newtime)); 515 | } 516 | -------------------------------------------------------------------------------- /src/spider_porting.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_porting.h -------------------------------------------------------------------------------- /src/spider_seed.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_seed.cpp -------------------------------------------------------------------------------- /src/spider_seed.h: -------------------------------------------------------------------------------- 1 | #ifndef __CROTON_SPIDER_SEED_H__ 2 | #define __CROTON_SPIDER_SEED_H__ 3 | #include "spider_utils.h" 4 | #include "boost/xpressive/xpressive_dynamic.hpp" 5 | 6 | class Seed 7 | { 8 | public: 9 | void set_pic_size(const char* str); 10 | 11 | struct Index_Regex 12 | { 13 | bool has_comment; 14 | boost::xpressive::cregex index_regex; 15 | boost::xpressive::cregex comment_regex; 16 | }; 17 | 18 | StrVec start_url_; //爬取页面 19 | std::vector 
index_regex_; //second存储content的正则 20 | std::vector pic_regex_; 21 | IntPair pic_size_; 22 | }; 23 | 24 | class Spider_Seed 25 | { 26 | public: 27 | Spider_Seed(); 28 | ~Spider_Seed(); 29 | 30 | int load(); 31 | int get_seed(std::string& website, Seed** seed ); 32 | 33 | private: 34 | std::map m_seeds; 35 | }; 36 | 37 | 38 | #endif 39 | 40 | -------------------------------------------------------------------------------- /src/spider_storage.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_storage.cpp -------------------------------------------------------------------------------- /src/spider_storage.h: -------------------------------------------------------------------------------- 1 | #ifndef __CROTON_SPIDER_STORAGE_H__ 2 | #define __CROTON_SPIDER_STORAGE_H__ 3 | #include "spider_database.h" 4 | 5 | class Spider_Storage 6 | { 7 | public: 8 | static Spider_Storage& instance() 9 | { 10 | static Spider_Storage _instance; 11 | return _instance; 12 | }; 13 | ~Spider_Storage(); 14 | 15 | int initialize(); 16 | int uninitialize(); 17 | 18 | int write_file(const char* website, UrlPtr url_ptr ); 19 | //int write_file(const char* website, const char* albums, UrlPtr url_ptr ); 20 | 21 | int rename_filename_with_md5(UrlPtr url_ptr); 22 | 23 | private: 24 | Spider_Storage(); 25 | Spider_Database* m_database; 26 | }; 27 | 28 | 29 | #endif 30 | 31 | -------------------------------------------------------------------------------- /src/spider_thread_pool.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_thread_pool.cpp -------------------------------------------------------------------------------- /src/spider_thread_pool.h: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_thread_pool.h -------------------------------------------------------------------------------- /src/spider_url.cpp: -------------------------------------------------------------------------------- 1 | #include "spider_url.h" 2 | #include "spider_utils.h" 3 | #include "spider_common.h" 4 | #include "spider_md5.h" 5 | #define NEW_METHOD 1 6 | 7 | std::string url_encode(std::string source_url) 8 | { 9 | std::string result; 10 | for ( unsigned int i=0; i>4)); 26 | result+=dec_to_char((j%16)); 27 | } 28 | } 29 | return result; 30 | } 31 | 32 | std::string url_decode(std::string source_url) 33 | { 34 | std::string result; 35 | for ( unsigned int i=0;iuri) 58 | { 59 | if ( *c=='.' ) 60 | found=true; 61 | c--; 62 | } 63 | if ( found==true&&*c=='/' ) 64 | { 65 | return ++c; 66 | } 67 | return ""; 68 | } 69 | 70 | UrlPtr create_url(std::string url, URLTYPE type) 71 | { 72 | assert(!url.empty()); 73 | std::string host, uri; 74 | int port=80; 75 | prase_url(url,host,uri, port); 76 | UrlPtr object(new URL()); 77 | object->url=strdup(url.c_str()); 78 | object->domain=strdup(host.c_str()); 79 | object->res=strdup(uri.c_str()); 80 | object->filename=strdup(get_filename_from_uri(uri.c_str())); 81 | object->port=port; 82 | object->type=type; 83 | object->comment=strdup(""); 84 | return object; 85 | } 86 | 87 | 88 | #if NEW_METHOD 89 | unsigned int url_hash_code(UrlPtr url) 90 | { 91 | uint md5_code=Spider_MD5::get_buffer_md5_code(url->url, strlen(url->url)); 92 | return md5_code; 93 | } 94 | #else 95 | unsigned int url_hash_code(UrlPtr url) 96 | { 97 | unsigned int ret=0; 98 | if (url!=NULL) 99 | { 100 | unsigned int h=url->port; 101 | unsigned int i=0; 102 | while (url->domain[i] != 0) 103 | { 104 | h = 31*h + url->domain[i]; 105 | i++; 106 | } 107 | i=0; 108 | while (url->res[i] != 0) 109 | { 110 | h = 31*h + url->res[i]; 111 | i++; 112 | } 113 | ret=h%kUrlHashSize; 114 | } 115 | 
return ret; 116 | } 117 | #endif 118 | 119 | 120 | -------------------------------------------------------------------------------- /src/spider_url.h: -------------------------------------------------------------------------------- 1 | #ifndef __CROTON_SPIDER_URL_H__ 2 | #define __CROTON_SPIDER_URL_H__ 3 | 4 | #include "spider_common.h" 5 | #include "boost/shared_ptr.hpp" 6 | 7 | 8 | enum URLTYPE 9 | { 10 | UT_START, //start 11 | UT_INDEX, //index 12 | UT_PICT, //picture 13 | }; 14 | 15 | class URL 16 | { 17 | public: 18 | URL() 19 | { 20 | parent=NULL; 21 | url=NULL; 22 | domain=NULL; 23 | res=NULL; 24 | ip=NULL; 25 | response=NULL; 26 | belong=NULL; 27 | length=0; 28 | comment=NULL; 29 | albums_id=0; 30 | } 31 | ~URL() 32 | { 33 | if (parent!=NULL)free(parent); 34 | if (url!=NULL) free(url); 35 | if (domain!=NULL)free(domain); 36 | if (res!=NULL )free(res); 37 | if (ip!=NULL) free(ip); 38 | if (response!=NULL)free(response); 39 | if (comment!=NULL)free(comment); 40 | } 41 | 42 | char* parent; //parent url for pic 43 | char* url; //url 44 | char* domain; //domain 45 | char* res; //Request Resource 46 | char* ip; //ip 47 | int port; //port default 80 48 | char* filename; //file name 49 | int albums_id; //the id of albums which this url belong. 50 | char* response; //http response 51 | int length; //the length of response. 52 | void* belong ; //the website which url belong. 53 | URLTYPE type; //whether picture or html. 54 | char* comment; //comment of gifs. 
55 | }; 56 | 57 | typedef boost::shared_ptr UrlPtr; 58 | typedef std::vector UrlPtrVec; 59 | 60 | std::string url_encode(std::string source_url); 61 | std::string url_decode(std::string source_url); 62 | UrlPtr create_url(std::string url, URLTYPE type); 63 | 64 | unsigned int url_hash_code(UrlPtr url); 65 | 66 | #endif -------------------------------------------------------------------------------- /src/spider_url_rinse.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_url_rinse.cpp -------------------------------------------------------------------------------- /src/spider_url_rinse.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_url_rinse.h -------------------------------------------------------------------------------- /src/spider_utils.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_utils.cpp -------------------------------------------------------------------------------- /src/spider_utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_utils.h -------------------------------------------------------------------------------- /src/spider_website.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_website.cpp -------------------------------------------------------------------------------- /src/spider_website.h: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhiwuba/Spider/b75d452eeacfc700e6bb76d9d8f2afffdf66c388/src/spider_website.h --------------------------------------------------------------------------------