├── Makefile.am ├── Makefile.in ├── README.md ├── aclocal.m4 ├── aclocal ├── ax_boost.m4 ├── ax_libevent.m4 ├── ax_log4cplus.m4 ├── ax_mongo_client.m4 ├── ax_qt5.m4 └── ax_thrift.m4 ├── conf.xml.demo ├── config.guess ├── config.sub ├── configure ├── configure.ac ├── create.sql ├── depcomp ├── install-sh ├── log.properties.demo ├── lspider.thrift ├── maindomain.list.demo ├── missing ├── readme.txt ├── src ├── CrawlService.cpp ├── CrawlService.h ├── DoubleList.h ├── atomic.h ├── cmd_ctrler.cpp ├── cmd_ctrler.h ├── conf.cpp ├── conf.h ├── controllable.cpp ├── controllable.h ├── crawl_listen_handler.cpp ├── crawl_listen_handler.h ├── defines.h ├── extractor.cpp ├── extractor.h ├── extractor_worker_view.cpp ├── extractor_worker_view.h ├── http_event_engine.cpp ├── http_event_engine.h ├── http_processor.cpp ├── http_processor.h ├── http_protocol.cpp ├── http_protocol.h ├── keyed_queue.h ├── link.h ├── link_scheduler.cpp ├── link_scheduler.h ├── link_table.cpp ├── link_table.h ├── locked_map.h ├── locked_queue.h ├── logger_container.cpp ├── logger_container.h ├── lspider_client.cpp ├── lspider_constants.cpp ├── lspider_constants.h ├── lspider_types.cpp ├── lspider_types.h ├── lthread.cpp ├── lthread.h ├── main.cpp ├── moc_extractor_worker_view.cpp ├── moc_link_scheduler.cpp ├── moc_mysql_dumper.cpp ├── moc_mysql_selector.cpp ├── mongo_dumper.cpp ├── mongo_dumper.h ├── mutex_lock.h ├── mysql_base.cpp ├── mysql_base.h ├── mysql_dumper.cpp ├── mysql_dumper.h ├── mysql_selector.cpp ├── mysql_selector.h ├── parse_url.cpp ├── request_recv.cpp ├── request_recv.h ├── singleton.h ├── synced_queue.h ├── url_context.cpp ├── url_context.h ├── url_tools.cpp ├── url_tools.h ├── util.cpp └── util.h ├── stamp-h1 └── test ├── Makefile ├── test_backtrace.cpp ├── test_bson.cpp ├── test_fetchmaindomain.cpp ├── test_keyedqueue.cpp ├── test_log.cpp ├── test_mongo.cpp ├── test_mysql.cpp ├── test_priorityqueue.cpp ├── test_qtextcodec.cpp ├── test_qweb.cpp ├── test_urlnormalize.cpp └── 
test_util.cpp /Makefile.am: -------------------------------------------------------------------------------- 1 | AUTOMAKE_OPTIONS=foreign 2 | 3 | bin_PROGRAMS=lspider lspider_client parse_url 4 | 5 | lspider_SOURCES=src/main.cpp src/conf.cpp src/extractor_worker_view.cpp src/lspider_constants.cpp src/logger_container.cpp src/moc_link_scheduler.cpp src/mysql_base.cpp src/url_context.cpp \ 6 | src/crawl_listen_handler.cpp src/http_event_engine.cpp src/lspider_types.cpp src/lthread.cpp src/moc_mysql_dumper.cpp src/mysql_dumper.cpp src/url_tools.cpp \ 7 | src/CrawlService.cpp src/http_processor.cpp src/link_scheduler.cpp src/moc_mysql_selector.cpp src/mysql_selector.cpp src/util.cpp \ 8 | src/extractor.cpp src/http_protocol.cpp src/link_table.cpp src/moc_extractor_worker_view.cpp src/mongo_dumper.cpp src/request_recv.cpp \ 9 | src/cmd_ctrler.cpp src/controllable.cpp 10 | lspider_CPPFLAGS=$(QT_CPPFLAGS) $(MONGO_CLIENT_CPPFLAGS) \ 11 | $(BOOST_CPPFLAGS) $(LOG4CPLUS_CPPFLAGS) $(LIBEVENT_CPPFLAGS) \ 12 | $(QT_CPPFLAGS)/QtWebKit $(THRIFT_CPPFLAGS) -fPIC 13 | 14 | lspider_client_SOURCES=src/lspider_client.cpp src/lspider_constants.cpp src/CrawlService.cpp src/lspider_types.cpp src/url_context.cpp src/logger_container.cpp src/url_tools.cpp src/util.cpp 15 | lspider_client_CPPFLAGS=$(QT_CPPFLAGS) $(MONGO_CLIENT_CPPFLAGS) \ 16 | $(BOOST_CPPFLAGS) $(LOG4CPLUS_CPPFLAGS) $(LIBEVENT_CPPFLAGS) \ 17 | $(QT_CPPFLAGS)/QtWebKit $(THRIFT_CPPFLAGS) -fPIC 18 | 19 | parse_url_SOURCES=src/parse_url.cpp src/url_tools.cpp 20 | 21 | LIBS=$(QT_LDFLAGS)/ -lQt5Core -lQt5Widgets -lQt5WebKit -lQt5WebKitWidgets -lQt5Network -lQt5Sql \ 22 | $(MONGO_CLIENT_LDFLAGS) -lmongoclient $(LIBEVENT_LDFLAGS) -levent -levent_core -levent_extra -levent_pthreads \ 23 | $(BOOST_LDFLAGS) -lboost_system -lboost_thread-mt -lboost_filesystem -lboost_program_options -lboost_regex $(LOG4CPLUS_LDFLAGS) -llog4cplus \ 24 | $(THRIFT_LDFLAGS) -lthrift -lz -lpthread -lreadline 25 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | lspider 2 | ===== 3 | Linux下的轻量级网页抓取系统 4 | 5 | [下载地址](https://github.com/warmheartli/lspider/archive/master.zip) 6 | 7 | ### 功能特性 8 | - 将整套spider系统功能浓缩到一个二进制中,部署轻便 9 | - 多线程和并发网络通信,性能高,资源利用充分,单机可同时抓取数百站点并能保持cpu低占用 10 | - 简易配置,指定待抓取站点列表和高质量种子链接即可自适应抓取扩散至整站,参考链接深度的广度优先遍历同时保证时新性 11 | - 通过配置不同站点ip的抓取压力,全异步的压力控制和链接调度,优雅抓取,避免封禁 12 | - 基于WebKit的可定制的页面渲染和链接提取,便于性能和效果之间的权衡 13 | - mysql作为链接存储介质,更高效;mongo作为网页存储介质,便于字段扩展和快速查询 14 | - 天然支持分布式部署(基于站点划分),可横向扩展 15 | 16 | ### 安装依赖 17 | - libevent (>=2.0.22) 18 | - thrift (>=0.9.2) 19 | - qt (>=4.8.6) 20 | - mongo-cxx-driver-legacy (>=1.0.2) 21 | - log4cplus (>=1.2.0) 22 | - boost (>=1.58.0) 23 | 24 | ### 安装方法(RHEL6) 25 | 26 | =================================================================================================================== 27 | 28 | ## 一、安装依赖库(包括libevent、boost、mongo-client、qt、thrift) 29 | 30 | ### 1、安装libevent 31 | 32 | 在http://libevent.org/ 中找最新版(如2.0.22) 33 | 34 | `wget https://sourceforge.net/projects/levent/files/libevent/libevent-2.0/libevent-2.0.22-stable.tar.gz` 35 | 36 | 解压后执行 37 | 38 | `./configure --prefix=${HOME}/libevent` 39 | 40 | `make` 41 | 42 | `make install` 43 | 44 | ### 2、安装boost 45 | 46 | `yum install boost-devel` 47 | 48 | ### 3、安装scons(因为下面编译mongoclient要用到) 49 | 50 | `yum install scons` 51 | 52 | ### 4、安装mongo-client 53 | 54 | 在https://github.com/mongodb/mongo-cxx-driver/releases/ 中找最新版(如1.0.2) 55 | 56 | `wget https://github.com/mongodb/mongo-cxx-driver/archive/legacy-1.0.2.tar.gz` 57 | 58 | 解压后执行 59 | 60 | `cd mongo-cxx-driver-legacy-1.0.2` 61 | 62 | `scons --prefix=$HOME/mongo-client install` 63 | 64 | ### 5、安装log4cplus 65 | 66 | `yum install log4cplus-devel` 67 | 68 | ### 6、安装qt5 69 | 70 | `yum install qt5-qtbase-devel` 71 | 72 | `yum install qt5-qtwebkit-devel` 73 | 74 | ### 7、安装bison(后面编译安装thrift会用到) 75 | 76 | 
从http://ftp.gnu.org/gnu/bison/bison-2.7.tar.gz 下载2.5以上版本的bison,解压后执行 77 | 78 | `cd bison-2.7` 79 | 80 | `./configure` 81 | 82 | `make` 83 | 84 | `make install` 85 | 86 | ### 8、安装thrift 87 | 88 | 从http://thrift.apache.org/download 下载最新版thrift的源代码(如:http://apache.fayea.com/thrift/0.9.0/thrift-0.9.0.tar.gz),解压后执行 89 | 90 | `cd thrift-0.9.0` 91 | 92 | `./configure --prefix=${HOME}/thrift --without-csharp --without-java --without-erlang --without-nodejs --without-lua --without-python --without-perl --without-ruby --without-haskell --without-go --without-d` 93 | 94 | `make` 95 | 96 | `make install` 97 | 98 | ### 9、安装高版本autoconf 99 | 100 | 从ftp://ftp.gnu.org/gnu/autoconf/ 下载最新版autoconf的源代码(如:ftp://ftp.gnu.org/gnu/autoconf/autoconf-2.69.tar.gz),解压后执行 101 | 102 | `./configure` 103 | 104 | `make` 105 | 106 | `make install` 107 | 108 | ### 10、安装高版本automake 109 | 110 | 从ftp://ftp.gnu.org/gnu/automake/ 下载最新版automake的源代码(如:ftp://ftp.gnu.org/gnu/automake/automake-1.13.tar.gz),解压后执行 111 | 112 | `./configure` 113 | 114 | `make` 115 | 116 | `make install` 117 | 118 | ## 二、安装lspider 119 | 120 | ### 1.从github下载源码 121 | 122 | `git clone https://github.com/warmheartli/lspider.git` 123 | 124 | ### 2.编译安装 125 | 126 | `./configure --prefix=${HOME}/lspider --with-libevent-include=${HOME}/libevent/include/ --with-libevent-libdir=${HOME}/libevent/lib/ --with-mongo-client-include=${HOME}/mongo-client/include/ --with-mongo-client-libdir=${HOME}/mongo-client/lib/ --with-thrift-include=${HOME}/thrift/include/ --with-thrift-libdir=${HOME}/thrift/lib/` 127 | 128 | `make` 129 | 130 | `make install` 131 | 132 | ### 3.配置 133 | 134 | `mkdir -p ${HOME}/lspider/conf ${HOME}/lspider/log` 135 | 136 | `cp conf.xml.demo ${HOME}/lspider/conf/conf.xml` 137 | 138 | `cp log.properties.demo ${HOME}/lspider/conf/log.properties` 139 | 140 | `cp maindomain.list.demo ${HOME}/lspider/conf/maindomain.list` 141 | 142 | `export LD_LIBRARY_PATH=${HOME}/libevent/lib/:${HOME}/thrift/lib/:${LD_LIBRARY_PATH}` 
143 | 144 | ### 4.安装xvfb 145 | 146 | `yum install xorg-x11-server-Xvfb` 147 | 148 | ## 三、运行lspider 149 | 150 | ### 1.确定mysql和mongo已经启动并执行 151 | 152 | `xvfb-run --server-args="-screen 0, 1024x768x24" ./bin/lspider` 153 | -------------------------------------------------------------------------------- /aclocal/ax_libevent.m4: -------------------------------------------------------------------------------- 1 | AC_DEFUN([AX_LIBEVENT], 2 | [ 3 | AC_ARG_WITH([libevent-include], 4 | [AS_HELP_STRING([--with-libevent-include=INCLUDE_DIR], 5 | [use libevent])], 6 | [ 7 | if test -d "$withval" 8 | then 9 | ac_libevent_include_path="$withval" 10 | else 11 | AC_MSG_ERROR(--with-libevent-include expected directory name) 12 | fi 13 | ], 14 | [ac_libevent_include_path=""] 15 | ) 16 | 17 | 18 | AC_ARG_WITH([libevent-libdir], 19 | AS_HELP_STRING([--with-libevent-libdir=LIB_DIR], 20 | [Force given directory for libevent libraries. Note that this will override library path detection, so use this parameter only if default library detection fails and you know exactly where your libevent libraries are located.]), 21 | [ 22 | if test -d "$withval" 23 | then 24 | ac_libevent_lib_path="$withval" 25 | else 26 | AC_MSG_ERROR(--with-libevent-libdir expected directory name) 27 | fi 28 | ], 29 | [ac_libevent_lib_path=""] 30 | ) 31 | 32 | libsubdirs="lib64 libx32 lib lib64" 33 | 34 | if test "$ac_libevent_include_path" != ""; then 35 | LIBEVENT_CPPFLAGS="-I$ac_libevent_include_path" 36 | else 37 | ac_libevent_include_path="/usr/include/" 38 | LIBEVENT_CPPFLAGS="-I$ac_libevent_include_path" 39 | fi 40 | # No --with-libevent-libdir given: use the first /usr/$libsubdir that holds the library (falls back to the last entry). 41 | if test "$ac_libevent_lib_path" != ""; then 42 | LIBEVENT_LDFLAGS="-L$ac_libevent_lib_path" 43 | else 44 | for libsubdir in $libsubdirs ; do 45 | if ls "/usr/$libsubdir/libevent"* >/dev/null 2>&1 ; then break; fi 46 | done 47 | ac_libevent_lib_path="/usr/$libsubdir/" 48 | LIBEVENT_LDFLAGS="-L$ac_libevent_lib_path" 49 | fi 50 | 51 | succeeded=no 52 | if ls "$ac_libevent_include_path/event.h" 
>/dev/null 2>&1 ; then 53 | if ls "$ac_libevent_lib_path/libevent"* >/dev/null 2>&1 ; then 54 | succeeded=yes 55 | fi 56 | fi 57 | 58 | AC_SUBST(LIBEVENT_CPPFLAGS) 59 | AC_SUBST(LIBEVENT_LDFLAGS) 60 | 61 | ]) 62 | -------------------------------------------------------------------------------- /aclocal/ax_log4cplus.m4: -------------------------------------------------------------------------------- 1 | AC_DEFUN([AX_LOG4CPLUS], 2 | [ 3 | AC_ARG_WITH([log4cplus-include], 4 | [AS_HELP_STRING([--with-log4cplus-include=INCLUDE_DIR], 5 | [use log4cplus])], 6 | [ 7 | if test -d "$withval" 8 | then 9 | ac_log4cplus_include_path="$withval" 10 | else 11 | AC_MSG_ERROR(--with-log4cplus-include expected directory name) 12 | fi 13 | ], 14 | [ac_log4cplus_include_path=""] 15 | ) 16 | 17 | 18 | AC_ARG_WITH([log4cplus-libdir], 19 | AS_HELP_STRING([--with-log4cplus-libdir=LIB_DIR], 20 | [Force given directory for log4cplus libraries. Note that this will override library path detection, so use this parameter only if default library detection fails and you know exactly where your log4cplus libraries are located.]), 21 | [ 22 | if test -d "$withval" 23 | then 24 | ac_log4cplus_lib_path="$withval" 25 | else 26 | AC_MSG_ERROR(--with-log4cplus-libdir expected directory name) 27 | fi 28 | ], 29 | [ac_log4cplus_lib_path=""] 30 | ) 31 | 32 | libsubdirs="lib64 libx32 lib lib64" 33 | 34 | if test "$ac_log4cplus_include_path" != ""; then 35 | LOG4CPLUS_CPPFLAGS="-I$ac_log4cplus_include_path" 36 | else 37 | ac_log4cplus_include_path="/usr/include/" 38 | LOG4CPLUS_CPPFLAGS="-I$ac_log4cplus_include_path" 39 | fi 40 | # No --with-log4cplus-libdir given: use the first /usr/$libsubdir that holds the library (falls back to the last entry). 41 | if test "$ac_log4cplus_lib_path" != ""; then 42 | LOG4CPLUS_LDFLAGS="-L$ac_log4cplus_lib_path" 43 | else 44 | for libsubdir in $libsubdirs ; do 45 | if ls "/usr/$libsubdir/liblog4cplus"* >/dev/null 2>&1 ; then break; fi 46 | done 47 | ac_log4cplus_lib_path="/usr/$libsubdir/" 48 | LOG4CPLUS_LDFLAGS="-L$ac_log4cplus_lib_path" 49 | fi 50 | 51 | succeeded=no 52 | if ls 
"$ac_log4cplus_include_path/log4cplus" >/dev/null 2>&1 ; then 53 | if ls "$ac_log4cplus_lib_path/liblog4cplus"* >/dev/null 2>&1 ; then 54 | succeeded=yes 55 | fi 56 | fi 57 | 58 | AC_SUBST(LOG4CPLUS_CPPFLAGS) 59 | AC_SUBST(LOG4CPLUS_LDFLAGS) 60 | 61 | ]) 62 | -------------------------------------------------------------------------------- /aclocal/ax_mongo_client.m4: -------------------------------------------------------------------------------- 1 | AC_DEFUN([AX_MONGO_CLIENT], 2 | [ 3 | AC_ARG_WITH([mongo-client-include], 4 | [AS_HELP_STRING([--with-mongo-client-include=INCLUDE_DIR], 5 | [use mongo-client])], 6 | [ 7 | if test -d "$withval" 8 | then 9 | ac_mongo_client_include_path="$withval" 10 | else 11 | AC_MSG_ERROR(--with-mongo-client-include expected directory name) 12 | fi 13 | ], 14 | [ac_mongo_client_include_path=""] 15 | ) 16 | 17 | 18 | AC_ARG_WITH([mongo-client-libdir], 19 | AS_HELP_STRING([--with-mongo-client-libdir=LIB_DIR], 20 | [Force given directory for mongo-client libraries. 
Note that this will override library path detection, so use this parameter only if default library detection fails and you know exactly where your mongo-client libraries are located.]), 21 | [ 22 | if test -d "$withval" 23 | then 24 | ac_mongo_client_lib_path="$withval" 25 | else 26 | AC_MSG_ERROR(--with-mongo-client-libdir expected directory name) 27 | fi 28 | ], 29 | [ac_mongo_client_lib_path=""] 30 | ) 31 | 32 | libsubdirs="lib64 libx32 lib lib64" 33 | 34 | if test "$ac_mongo_client_include_path" != ""; then 35 | MONGO_CLIENT_CPPFLAGS="-I$ac_mongo_client_include_path" 36 | else 37 | ac_mongo_client_include_path="/usr/include/mongo-client/" 38 | MONGO_CLIENT_CPPFLAGS="-I$ac_mongo_client_include_path" 39 | fi 40 | # No --with-mongo-client-libdir given: use the first /usr/$libsubdir that holds the library (falls back to the last entry). 41 | if test "$ac_mongo_client_lib_path" != ""; then 42 | MONGO_CLIENT_LDFLAGS="-L$ac_mongo_client_lib_path" 43 | else 44 | for libsubdir in $libsubdirs ; do 45 | if ls "/usr/$libsubdir/libmongoclient"* >/dev/null 2>&1 ; then break; fi 46 | done 47 | ac_mongo_client_lib_path="/usr/$libsubdir/" 48 | MONGO_CLIENT_LDFLAGS="-L$ac_mongo_client_lib_path" 49 | fi 50 | 51 | succeeded=no 52 | if ls "$ac_mongo_client_include_path/mongo" >/dev/null 2>&1 ; then 53 | if ls "$ac_mongo_client_lib_path/libmongoclient"* >/dev/null 2>&1 ; then 54 | succeeded=yes 55 | fi 56 | fi 57 | 58 | AC_SUBST(MONGO_CLIENT_CPPFLAGS) 59 | AC_SUBST(MONGO_CLIENT_LDFLAGS) 60 | 61 | ]) 62 | -------------------------------------------------------------------------------- /aclocal/ax_qt5.m4: -------------------------------------------------------------------------------- 1 | AC_DEFUN([AX_QT], 2 | [ 3 | AC_ARG_WITH([qt-include], 4 | [AS_HELP_STRING([--with-qt-include=INCLUDE_DIR], 5 | [use qt])], 6 | [ 7 | if test -d "$withval" 8 | then 9 | ac_qt_include_path="$withval" 10 | else 11 | AC_MSG_ERROR(--with-qt-include expected directory name) 12 | fi 13 | ], 14 | [ac_qt_include_path=""] 15 | ) 16 | 17 | 18 | AC_ARG_WITH([qt-libdir], 19 | AS_HELP_STRING([--with-qt-libdir=LIB_DIR], 20 | [Force 
given directory for qt libraries. Note that this will override library path detection, so use this parameter only if default library detection fails and you know exactly where your qt libraries are located.]), 21 | [ 22 | if test -d "$withval" 23 | then 24 | ac_qt_lib_path="$withval" 25 | else 26 | AC_MSG_ERROR(--with-qt-libdir expected directory name) 27 | fi 28 | ], 29 | [ac_qt_lib_path=""] 30 | ) 31 | 32 | libsubdirs="lib64 libx32 lib lib64" 33 | 34 | if test "$ac_qt_include_path" != ""; then 35 | QT_CPPFLAGS="-I$ac_qt_include_path" 36 | else 37 | ac_qt_include_path="/usr/include/qt5/" 38 | QT_CPPFLAGS="-I$ac_qt_include_path" 39 | fi 40 | # No --with-qt-libdir given: use the first /usr/$libsubdir that holds the library (falls back to the last entry). 41 | if test "$ac_qt_lib_path" != ""; then 42 | QT_LDFLAGS="-L$ac_qt_lib_path" 43 | else 44 | for libsubdir in $libsubdirs ; do 45 | if ls "/usr/$libsubdir/libQt"* >/dev/null 2>&1 ; then break; fi 46 | done 47 | ac_qt_lib_path="/usr/$libsubdir/" 48 | QT_LDFLAGS="-L$ac_qt_lib_path" 49 | fi 50 | 51 | succeeded=no 52 | if ls "$ac_qt_include_path/QtGui" >/dev/null 2>&1 ; then 53 | if ls "$ac_qt_lib_path/libQt"* >/dev/null 2>&1 ; then 54 | succeeded=yes 55 | fi 56 | fi 57 | 58 | AC_SUBST(QT_CPPFLAGS) 59 | AC_SUBST(QT_LDFLAGS) 60 | 61 | ]) 62 | -------------------------------------------------------------------------------- /aclocal/ax_thrift.m4: -------------------------------------------------------------------------------- 1 | AC_DEFUN([AX_THRIFT], 2 | [ 3 | AC_ARG_WITH([thrift-include], 4 | [AS_HELP_STRING([--with-thrift-include=INCLUDE_DIR], 5 | [use thrift])], 6 | [ 7 | if test -d "$withval" 8 | then 9 | ac_thrift_include_path="$withval" 10 | else 11 | AC_MSG_ERROR(--with-thrift-include expected directory name) 12 | fi 13 | ], 14 | [ac_thrift_include_path=""] 15 | ) 16 | 17 | 18 | AC_ARG_WITH([thrift-libdir], 19 | AS_HELP_STRING([--with-thrift-libdir=LIB_DIR], 20 | [Force given directory for thrift libraries. 
Note that this will override library path detection, so use this parameter only if default library detection fails and you know exactly where your thrift libraries are located.]), 21 | [ 22 | if test -d "$withval" 23 | then 24 | ac_thrift_lib_path="$withval" 25 | else 26 | AC_MSG_ERROR(--with-thrift-libdir expected directory name) 27 | fi 28 | ], 29 | [ac_thrift_lib_path=""] 30 | ) 31 | 32 | libsubdirs="lib64 libx32 lib lib64" 33 | 34 | if test "$ac_thrift_include_path" != ""; then 35 | THRIFT_CPPFLAGS="-I$ac_thrift_include_path" 36 | else 37 | ac_thrift_include_path="/usr/include/" 38 | THRIFT_CPPFLAGS="-I$ac_thrift_include_path" 39 | fi 40 | # No --with-thrift-libdir given: use the first /usr/$libsubdir that holds the library (falls back to the last entry). 41 | if test "$ac_thrift_lib_path" != ""; then 42 | THRIFT_LDFLAGS="-L$ac_thrift_lib_path" 43 | else 44 | for libsubdir in $libsubdirs ; do 45 | if ls "/usr/$libsubdir/libthrift"* >/dev/null 2>&1 ; then break; fi 46 | done 47 | ac_thrift_lib_path="/usr/$libsubdir/" 48 | THRIFT_LDFLAGS="-L$ac_thrift_lib_path" 49 | fi 50 | 51 | succeeded=no 52 | if ls "$ac_thrift_include_path/thrift" >/dev/null 2>&1 ; then 53 | if ls "$ac_thrift_lib_path/libthrift"* >/dev/null 2>&1 ; then 54 | succeeded=yes 55 | fi 56 | fi 57 | 58 | AC_SUBST(THRIFT_CPPFLAGS) 59 | AC_SUBST(THRIFT_LDFLAGS) 60 | 61 | ]) 62 | -------------------------------------------------------------------------------- /conf.xml.demo: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | localhost 6 | 3306 7 | root 8 | 9 | lspider 10 | link 11 | 12 | 13 | on 14 | 10 15 | 1 16 | ./data/maindomain.list 17 | 5 18 | 19 | 20 | 20 21 | 20 22 | 23 | 24 | 25 | 26 | 5 27 | 20 28 | 1 29 | 5 30 | 2 31 | 512 32 | 5 33 | 524288 34 | 1000 35 | 36 | 37 | 38 | 5 39 | 20 40 | 5 41 | 42 | 43 | 44 | localhost 45 | 46 | 47 | 48 | 300 49 | 10000> 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | # -*- 
Autoconf -*- 2 | # Process this file with autoconf to produce a configure script. 3 | 4 | AC_PREREQ([2.69]) 5 | AC_INIT(lspider, 0.1, whlichuang@126.com) 6 | AC_CONFIG_SRCDIR([src/main.cpp]) 7 | AM_INIT_AUTOMAKE 8 | 9 | # Checks for programs. 10 | AC_PROG_CXX 11 | AC_PROG_CC 12 | 13 | # Checks for libraries. 14 | AX_QT 15 | if test "x$succeeded" != "xyes";then 16 | AC_MSG_ERROR([checking qt fail]) 17 | fi 18 | 19 | AX_MONGO_CLIENT 20 | if test "x$succeeded" != "xyes";then 21 | AC_MSG_ERROR([checking mongo-client fail]) 22 | fi 23 | 24 | AX_LIBEVENT 25 | if test "x$succeeded" != "xyes";then 26 | AC_MSG_ERROR([checking libevent fail]) 27 | fi 28 | 29 | AX_THRIFT 30 | if test "x$succeeded" != "xyes";then 31 | AC_MSG_ERROR([checking thrift fail]) 32 | fi 33 | 34 | AX_BOOST 35 | if test "x$succeeded" != "xyes";then 36 | AC_MSG_ERROR([checking boost fail]) 37 | fi 38 | 39 | AX_LOG4CPLUS 40 | if test "x$succeeded" != "xyes";then 41 | AC_MSG_ERROR([checking log4cplus fail]) 42 | fi 43 | 44 | #if test "x$succeeded" = "xno" ; then 45 | # AC_MSG_ERROR(["Error: thrift required"]) 46 | #fi 47 | 48 | # Checks for header files. 49 | AC_CHECK_HEADERS([arpa/inet.h fcntl.h netdb.h netinet/in.h stdint.h stdlib.h string.h sys/socket.h sys/time.h unistd.h]) 50 | 51 | # Checks for typedefs, structures, and compiler characteristics. 52 | AC_CHECK_HEADER_STDBOOL 53 | AC_C_INLINE 54 | AC_TYPE_INT16_T 55 | AC_TYPE_INT32_T 56 | AC_TYPE_INT64_T 57 | AC_TYPE_SIZE_T 58 | AC_TYPE_UINT32_T 59 | AC_TYPE_UINT64_T 60 | AC_TYPE_UINT8_T 61 | 62 | # Checks for library functions. 
63 | AC_FUNC_MALLOC 64 | AC_CHECK_FUNCS([bzero gettimeofday memset select socket strchr strcspn strerror strncasecmp]) 65 | 66 | AC_OUTPUT([Makefile]) 67 | -------------------------------------------------------------------------------- /create.sql: -------------------------------------------------------------------------------- 1 | create database lspider; 2 | use lspider; 3 | create table link ( 4 | `sign` VARCHAR(24) NOT NULL, 5 | `url` VARCHAR(1024) NOT NULL DEFAULT '', 6 | `maindomain` VARCHAR(80) DEFAULT '', 7 | `ip` VARCHAR(16) DEFAULT '', 8 | `prelink` VARCHAR(1024) DEFAULT '', 9 | `preanchor` VARCHAR(1024) DEFAULT '', 10 | `weight` INT(10) unsigned NOT NULL DEFAULT 1, 11 | `linkdepth` INT(10) unsigned NOT NULL DEFAULT 0, 12 | `crawlstate` INT(10) NOT NULL DEFAULT 0, 13 | `crawlretry` INT(10) NOT NULL DEFAULT 0, 14 | `hub` BOOL NOT NULL DEFAULT FALSE, 15 | `fresh` BOOL NOT NULL DEFAULT FALSE, 16 | `updatetime` TIMESTAMP, 17 | `foundtime` TIMESTAMP, 18 | `crawledtime` TIMESTAMP, 19 | PRIMARY KEY (`sign`) 20 | ) ENGINE=MyISAM DEFAULT character set utf8; 21 | -------------------------------------------------------------------------------- /log.properties.demo: -------------------------------------------------------------------------------- 1 | # http://log4cplus.sourceforge.net/docs/html/classlog4cplus_1_1PropertyConfigurator.html 2 | # http://log4cplus.sourceforge.net/docs/html/classlog4cplus_1_1PatternLayout.html 3 | # %p event level 4 | # %D{%Y-%m-%d %H:%M:%S} 2015-05-16 21:33:43 5 | # %t thread name 6 | # %l test.cpp:12 7 | # %M __FUNCTION__ 8 | # %m message 9 | # %n new line 10 | 11 | log4cplus.rootLogger=TRACE, normal_appender, wf_appender 12 | 13 | log4cplus.appender.normal_appender=log4cplus::RollingFileAppender 14 | log4cplus.appender.normal_appender.File=log/lspider.log 15 | log4cplus.appender.normal_appender.MaxFileSize=500MB 16 | log4cplus.appender.normal_appender.MaxBackupIndex=10 17 | 
log4cplus.appender.normal_appender.layout=log4cplus::PatternLayout 18 | log4cplus.appender.normal_appender.layout.ConversionPattern=%-5p: %D{%Y-%m-%d %H:%M:%S} %t %l %m%n 19 | #log4cplus.appender.normal_appender.filters.1=log4cplus::spi::LogLevelMatchFilter 20 | #log4cplus.appender.normal_appender.filters.1.LogLevelToMatch=TRACE 21 | log4cplus.appender.normal_appender.filters.1=log4cplus::spi::LogLevelRangeFilter 22 | log4cplus.appender.normal_appender.filters.1.LogLevelMin=TRACE 23 | #log4cplus.appender.normal_appender.filters.1.LogLevelMin=INFO 24 | log4cplus.appender.normal_appender.filters.1.LogLevelMax=INFO 25 | log4cplus.appender.normal_appender.filters.1.AcceptOnMatch=true 26 | log4cplus.appender.normal_appender.filters.2=log4cplus::spi::DenyAllFilter 27 | 28 | log4cplus.appender.wf_appender=log4cplus::RollingFileAppender 29 | log4cplus.appender.wf_appender.File=log/lspider.log.wf 30 | log4cplus.appender.wf_appender.MaxFileSize=500MB 31 | log4cplus.appender.wf_appender.MaxBackupIndex=10 32 | log4cplus.appender.wf_appender.layout=log4cplus::PatternLayout 33 | #log4cplus.appender.wf_appender.layout.ConversionPattern=%-5p: %D{%Y-%m-%d %H:%M:%S} %t %l [%M] %m%n 34 | log4cplus.appender.wf_appender.layout.ConversionPattern=%-5p: %D{%Y-%m-%d %H:%M:%S} %t %l %m%n 35 | #log4cplus.appender.wf_appender.filters.1=log4cplus::spi::LogLevelMatchFilter 36 | #log4cplus.appender.wf_appender.filters.1.LogLevelToMatch=TRACE 37 | log4cplus.appender.wf_appender.filters.1=log4cplus::spi::LogLevelRangeFilter 38 | log4cplus.appender.wf_appender.filters.1.LogLevelMin=WARN 39 | log4cplus.appender.wf_appender.filters.1.LogLevelMax=FATAL 40 | log4cplus.appender.wf_appender.filters.1.AcceptOnMatch=true 41 | log4cplus.appender.wf_appender.filters.2=log4cplus::spi::DenyAllFilter 42 | -------------------------------------------------------------------------------- /lspider.thrift: -------------------------------------------------------------------------------- 1 | namespace cpp lspider 2 | 
3 | struct IUrl { 4 | 1: string str, 5 | 2: string ip, 6 | 3: bool hub = false, 7 | } 8 | 9 | service CrawlService { 10 | void request(1: IUrl u), 11 | string exec_cmd(1: string cmd), 12 | } 13 | -------------------------------------------------------------------------------- /maindomain.list.demo: -------------------------------------------------------------------------------- 1 | shareditor.com 2 | -------------------------------------------------------------------------------- /missing: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Common wrapper for a few potentially missing GNU programs. 3 | 4 | scriptversion=2012-06-26.16; # UTC 5 | 6 | # Copyright (C) 1996-2013 Free Software Foundation, Inc. 7 | # Originally written by Fran,cois Pinard , 1996. 8 | 9 | # This program is free software; you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation; either version 2, or (at your option) 12 | # any later version. 13 | 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | 22 | # As a special exception to the GNU General Public License, if you 23 | # distribute this file as part of a program that contains a 24 | # configuration script generated by Autoconf, you may include it under 25 | # the same distribution terms that you use for the rest of that program. 
26 | 27 | if test $# -eq 0; then 28 | echo 1>&2 "Try '$0 --help' for more information" 29 | exit 1 30 | fi 31 | 32 | case $1 in 33 | 34 | --is-lightweight) 35 | # Used by our autoconf macros to check whether the available missing 36 | # script is modern enough. 37 | exit 0 38 | ;; 39 | 40 | --run) 41 | # Back-compat with the calling convention used by older automake. 42 | shift 43 | ;; 44 | 45 | -h|--h|--he|--hel|--help) 46 | echo "\ 47 | $0 [OPTION]... PROGRAM [ARGUMENT]... 48 | 49 | Run 'PROGRAM [ARGUMENT]...', returning a proper advice when this fails due 50 | to PROGRAM being missing or too old. 51 | 52 | Options: 53 | -h, --help display this help and exit 54 | -v, --version output version information and exit 55 | 56 | Supported PROGRAM values: 57 | aclocal autoconf autoheader autom4te automake makeinfo 58 | bison yacc flex lex help2man 59 | 60 | Version suffixes to PROGRAM as well as the prefixes 'gnu-', 'gnu', and 61 | 'g' are ignored when checking the name. 62 | 63 | Send bug reports to ." 64 | exit $? 65 | ;; 66 | 67 | -v|--v|--ve|--ver|--vers|--versi|--versio|--version) 68 | echo "missing $scriptversion (GNU Automake)" 69 | exit $? 70 | ;; 71 | 72 | -*) 73 | echo 1>&2 "$0: unknown '$1' option" 74 | echo 1>&2 "Try '$0 --help' for more information" 75 | exit 1 76 | ;; 77 | 78 | esac 79 | 80 | # Run the given program, remember its exit status. 81 | "$@"; st=$? 82 | 83 | # If it succeeded, we are done. 84 | test $st -eq 0 && exit 0 85 | 86 | # Also exit now if we it failed (or wasn't found), and '--version' was 87 | # passed; such an option is passed most likely to detect whether the 88 | # program is present and works. 89 | case $2 in --version|--help) exit $st;; esac 90 | 91 | # Exit code 63 means version mismatch. This often happens when the user 92 | # tries to use an ancient version of a tool on a file that requires a 93 | # minimum version. 
94 | if test $st -eq 63; then 95 | msg="probably too old" 96 | elif test $st -eq 127; then 97 | # Program was missing. 98 | msg="missing on your system" 99 | else 100 | # Program was found and executed, but failed. Give up. 101 | exit $st 102 | fi 103 | 104 | perl_URL=http://www.perl.org/ 105 | flex_URL=http://flex.sourceforge.net/ 106 | gnu_software_URL=http://www.gnu.org/software 107 | 108 | program_details () 109 | { 110 | case $1 in 111 | aclocal|automake) 112 | echo "The '$1' program is part of the GNU Automake package:" 113 | echo "<$gnu_software_URL/automake>" 114 | echo "It also requires GNU Autoconf, GNU m4 and Perl in order to run:" 115 | echo "<$gnu_software_URL/autoconf>" 116 | echo "<$gnu_software_URL/m4/>" 117 | echo "<$perl_URL>" 118 | ;; 119 | autoconf|autom4te|autoheader) 120 | echo "The '$1' program is part of the GNU Autoconf package:" 121 | echo "<$gnu_software_URL/autoconf/>" 122 | echo "It also requires GNU m4 and Perl in order to run:" 123 | echo "<$gnu_software_URL/m4/>" 124 | echo "<$perl_URL>" 125 | ;; 126 | esac 127 | } 128 | 129 | give_advice () 130 | { 131 | # Normalize program name to check for. 132 | normalized_program=`echo "$1" | sed ' 133 | s/^gnu-//; t 134 | s/^gnu//; t 135 | s/^g//; t'` 136 | 137 | printf '%s\n' "'$1' is $msg." 138 | 139 | configure_deps="'configure.ac' or m4 files included by 'configure.ac'" 140 | case $normalized_program in 141 | autoconf*) 142 | echo "You should only need it if you modified 'configure.ac'," 143 | echo "or m4 files included by it." 144 | program_details 'autoconf' 145 | ;; 146 | autoheader*) 147 | echo "You should only need it if you modified 'acconfig.h' or" 148 | echo "$configure_deps." 149 | program_details 'autoheader' 150 | ;; 151 | automake*) 152 | echo "You should only need it if you modified 'Makefile.am' or" 153 | echo "$configure_deps." 
154 | program_details 'automake' 155 | ;; 156 | aclocal*) 157 | echo "You should only need it if you modified 'acinclude.m4' or" 158 | echo "$configure_deps." 159 | program_details 'aclocal' 160 | ;; 161 | autom4te*) 162 | echo "You might have modified some maintainer files that require" 163 | echo "the 'automa4te' program to be rebuilt." 164 | program_details 'autom4te' 165 | ;; 166 | bison*|yacc*) 167 | echo "You should only need it if you modified a '.y' file." 168 | echo "You may want to install the GNU Bison package:" 169 | echo "<$gnu_software_URL/bison/>" 170 | ;; 171 | lex*|flex*) 172 | echo "You should only need it if you modified a '.l' file." 173 | echo "You may want to install the Fast Lexical Analyzer package:" 174 | echo "<$flex_URL>" 175 | ;; 176 | help2man*) 177 | echo "You should only need it if you modified a dependency" \ 178 | "of a man page." 179 | echo "You may want to install the GNU Help2man package:" 180 | echo "<$gnu_software_URL/help2man/>" 181 | ;; 182 | makeinfo*) 183 | echo "You should only need it if you modified a '.texi' file, or" 184 | echo "any other file indirectly affecting the aspect of the manual." 185 | echo "You might want to install the Texinfo package:" 186 | echo "<$gnu_software_URL/texinfo/>" 187 | echo "The spurious makeinfo call might also be the consequence of" 188 | echo "using a buggy 'make' (AIX, DU, IRIX), in which case you might" 189 | echo "want to install GNU make:" 190 | echo "<$gnu_software_URL/make/>" 191 | ;; 192 | *) 193 | echo "You might have modified some files without having the proper" 194 | echo "tools for further handling them. Check the 'README' file, it" 195 | echo "often tells you about the needed prerequisites for installing" 196 | echo "this package. You may also peek at any GNU archive site, in" 197 | echo "case some other package contains this missing '$1' program." 
198 | ;; 199 | esac 200 | } 201 | 202 | give_advice "$1" | sed -e '1s/^/WARNING: /' \ 203 | -e '2,$s/^/ /' >&2 204 | 205 | # Propagate the correct exit status (expected to be 127 for a program 206 | # not found, 63 for a program that failed due to version mismatch). 207 | exit $st 208 | 209 | # Local variables: 210 | # eval: (add-hook 'write-file-hooks 'time-stamp) 211 | # time-stamp-start: "scriptversion=" 212 | # time-stamp-format: "%:y-%02m-%02d.%02H" 213 | # time-stamp-time-zone: "UTC" 214 | # time-stamp-end: "; # UTC" 215 | # End: 216 | -------------------------------------------------------------------------------- /readme.txt: -------------------------------------------------------------------------------- 1 | ~/thrift/bin/thrift -r -out src/ --gen cpp lspider.thrift 2 | 3 | moc-qt5 src/extractor_worker_view.h -o src/moc_extractor_worker_view.cpp 4 | moc-qt5 src/extractor_worker_view.h -o src/moc_extractor_worker_view.cpp^C 5 | moc-qt5 src/link_scheduler.h -o src/moc_link_scheduler.cpp 6 | moc-qt5 src/mysql_dumper.h -o src/moc_mysql_dumper.cpp 7 | moc-qt5 src/mysql_selector.h -o src/moc_mysql_selector.cpp 8 | -------------------------------------------------------------------------------- /src/DoubleList.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Doubly Linked List. NOT thread safe! Refer to SyncedQ.h for a locked wrapper. 3 | * by Shiding Lin 4 | * 5 | * READ THIS BEFORE YOU USE: 6 | * To prevent a node from being inserted into a list twice, or into 7 | * multiple lists at the same time, the insert-related interfaces check 8 | * that the node's link is initialized. So you must initialize the 9 | * link when you create a node. 
10 | * 11 | * 12 | * TElement must contain the following fields: 13 | * DLINK link; 14 | */ 15 | #ifndef _DOUBLE_LINK_LIST_H_ 16 | #define _DOUBLE_LINK_LIST_H_ 17 | 18 | #include 19 | #include "defines.h" 20 | #include "link.h" 21 | 22 | typedef void* ptr; 23 | 24 | template 25 | static inline 26 | bool dlink_contains(TElement* pHead, TElement* pNode) 27 | { 28 | ASSERT(NULL != pHead); 29 | ASSERT(NULL != pNode); 30 | TElement* pTmp = pHead; 31 | do { 32 | if (pTmp == pNode) 33 | return true; 34 | if (pTmp && pTmp->link._next) { 35 | pTmp = CONTAINING_RECORD(pTmp->link._next, TElement, link); 36 | } 37 | else { 38 | break; 39 | } 40 | } while (NULL!=pTmp && pHead!=pTmp); 41 | return false; 42 | } 43 | 44 | // return next 45 | template 46 | static inline 47 | TElement* dlink_pop_self(TElement* pE) 48 | { 49 | ASSERT(NULL != pE); 50 | DLINK* p = &pE->link; 51 | if (p!=p->_next && NULL!=p->_next) { 52 | pE = CONTAINING_RECORD(p->_next, TElement, link); 53 | DLINK_REMOVE(p); 54 | DLINK_INITIALIZE(p); 55 | return pE; 56 | } 57 | return NULL; 58 | } 59 | 60 | template 61 | static inline 62 | TElement* dlink_get_next(TElement* pE) 63 | { 64 | ASSERT(NULL != pE); 65 | DLINK* p = pE->link._next; 66 | if (NULL != p) { 67 | pE = CONTAINING_RECORD(p, TElement, link); 68 | return pE; 69 | } 70 | return NULL; 71 | } 72 | 73 | template 74 | static inline 75 | TElement* dlink_get_prev(TElement* pE) 76 | { 77 | ASSERT(NULL != pE); 78 | DLINK* p = pE->link._prev; 79 | if (NULL != p) { 80 | pE = CONTAINING_RECORD(p, TElement, link); 81 | return pE; 82 | } 83 | return NULL; 84 | } 85 | 86 | template 87 | static inline 88 | TElement* dlink_get_container(DLINK* pLink) 89 | { 90 | ASSERT(NULL != pLink); 91 | return CONTAINING_RECORD(pLink, TElement, link); 92 | } 93 | 94 | template 95 | class TLinkedList 96 | { 97 | protected: 98 | int m_count; 99 | DLINK m_head; 100 | public: 101 | typedef bool (*ENUMERATOR)(TElement* pE, ptr pData); // return false to stop the enumeration 102 | 103 | 
TLinkedList() { Init(); } 104 | ~TLinkedList() {} 105 | 106 | void Init() { 107 | DLINK_INITIALIZE(&m_head); 108 | m_count = 0; 109 | } 110 | 111 | void Assert() const { 112 | int n = 0; 113 | DLINK* p = m_head._next; 114 | 115 | ASSERT_EQUAL(p->_prev->_next, p); 116 | while (p != &m_head) { 117 | ASSERT_EQUAL(p->_prev->_next, p); 118 | p = p->_next; 119 | n ++; 120 | } 121 | ASSERT_EQUAL(m_count, n); 122 | } 123 | 124 | void Print() const { 125 | const DLINK *p = &m_head; 126 | 127 | do { 128 | p = p->_next; 129 | } while (p != &m_head); 130 | } 131 | 132 | // Notice that it is O(n), be careful! 133 | bool contains(TElement* pNode) const { 134 | ASSERT(NULL != pNode); 135 | #ifndef _UNITTEST 136 | return true; 137 | #endif 138 | for (DLINK* p=m_head._next; p && p!=&m_head; p=p->_next) { 139 | if (p == &pNode->link) 140 | return true; 141 | } 142 | return false; 143 | } 144 | 145 | int size() const { 146 | return m_count; 147 | } 148 | 149 | void merge(TLinkedList* pList) { 150 | m_head._prev->_next = pList->m_head._next; 151 | pList->m_head._next->_prev = m_head._prev; 152 | m_head._prev = pList->m_head._prev; 153 | pList->m_head._prev->_next = &m_head; 154 | DLINK_INITIALIZE(&pList->m_head); 155 | m_count += pList->m_count; 156 | pList->m_count = 0; 157 | } 158 | 159 | int push_back(TElement* e) { 160 | ASSERT(NULL != e); 161 | ASSERT(DLINK_IS_STANDALONE(&e->link)); 162 | DLINK_INSERT_PREV(&m_head, &e->link); 163 | return ++ m_count; 164 | } 165 | 166 | TElement* pop_back() { 167 | DLINK* p = m_head._prev; 168 | if (p != &m_head) { 169 | DLINK_REMOVE(p); 170 | DLINK_INITIALIZE(p); 171 | m_count --; 172 | return CONTAINING_RECORD(p, TElement, link); 173 | } 174 | ASSERT(0 == m_count); 175 | return NULL; 176 | } 177 | 178 | TElement* get_back() { 179 | DLINK* p = m_head._prev; 180 | if (p != &m_head) { 181 | return CONTAINING_RECORD(p, TElement, link); 182 | } 183 | ASSERT(0 == m_count); 184 | return NULL; 185 | } 186 | 187 | int push_front(TElement* e) { 188 | 
ASSERT(NULL != e); 189 | 190 | ASSERT(DLINK_IS_STANDALONE(&e->link)); 191 | DLINK_INSERT_NEXT(&m_head, &e->link); 192 | return ++ m_count; 193 | } 194 | 195 | TElement* pop_front() { 196 | DLINK* p = m_head._next; 197 | if (p != &m_head) { 198 | DLINK_REMOVE(p); 199 | DLINK_INITIALIZE(p); 200 | m_count --; 201 | return CONTAINING_RECORD(p, TElement, link); 202 | } 203 | ASSERT_EQUAL(0, m_count); 204 | return NULL; 205 | } 206 | 207 | TElement* get_front() { 208 | DLINK* p = m_head._next; 209 | if (p != &m_head) { 210 | ASSERT(0 < m_count); 211 | return CONTAINING_RECORD(p, TElement, link); 212 | } 213 | ASSERT_EQUAL(0, m_count); 214 | return NULL; 215 | } 216 | 217 | TElement* get_next(TElement* pElement) { 218 | ASSERT(contains(pElement)); 219 | DLINK* p = pElement->link._next; 220 | if (p != &m_head) { 221 | return CONTAINING_RECORD(p, TElement, link); 222 | } 223 | return NULL; 224 | } 225 | 226 | TElement* get_prev(TElement* pElement) { 227 | ASSERT(contains(pElement)); 228 | DLINK* p = pElement->link._prev; 229 | if (p != &m_head) { 230 | return CONTAINING_RECORD(p, TElement, link); 231 | } 232 | return NULL; 233 | } 234 | 235 | int remove(TElement* pElement) { 236 | ASSERT(contains(pElement)); 237 | DLINK_REMOVE(&pElement->link); 238 | DLINK_INITIALIZE(&pElement->link); 239 | return -- m_count; 240 | } 241 | 242 | bool Enumerate(ENUMERATOR f, ptr pData) { 243 | TElement* pE; 244 | DLINK* p = m_head._next; 245 | while (p != &m_head) { 246 | pE = CONTAINING_RECORD(p, TElement, link); 247 | p = p->_next; 248 | if (!f(pE, pData)) 249 | return false; 250 | } 251 | return true; 252 | } 253 | 254 | bool Eat(ENUMERATOR f, ptr pData) { 255 | TElement* pE; 256 | 257 | while (NULL != (pE=pop_front())) { 258 | if (!f(pE, pData)) { 259 | push_front(pE); 260 | return false; 261 | } 262 | } 263 | return true; 264 | } 265 | 266 | TElement* FindIf(ENUMERATOR f, ptr pData) { 267 | TElement* pE; 268 | DLINK* p = m_head._next; 269 | while (p != &m_head) { 270 | pE = 
CONTAINING_RECORD(p, TElement, link); 271 | p = p->_next; 272 | if (f(pE, pData)) 273 | return pE; 274 | } 275 | return NULL; 276 | } 277 | }; 278 | 279 | #endif 280 | -------------------------------------------------------------------------------- /src/atomic.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Fri May 15 00:25:46 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __ATOMIC_H__ 9 | #define __ATOMIC_H__ 10 | 11 | static inline int atomic_add(volatile int *count, int add) 12 | { 13 | __asm__ __volatile__( 14 | "lock xadd %0, (%1);" 15 | : "=a"(add) 16 | : "r"(count), "a"(add) 17 | : "memory" 18 | ); 19 | return add; 20 | } 21 | 22 | #define __atomic_add64__(mem, add) \ 23 | do { \ 24 | asm volatile ( "lock addq %2, %0" \ 25 | : "=m" (*mem) \ 26 | : "m" (*mem), "ir" (add)); \ 27 | } while (0) 28 | 29 | #define __atomic_sub64__(mem, sub) \ 30 | do { \ 31 | asm volatile ( "lock subq %2, %0" \ 32 | : "=m" (*mem) \ 33 | : "m" (*mem), "ir" (sub)); \ 34 | } while (0) 35 | 36 | static inline int // return old value 37 | atomic_swap(volatile void *lockword, int value) 38 | { 39 | __asm__ __volatile__( 40 | "lock xchg %0, (%1);" 41 | : "=a"(value) 42 | : "r"(lockword), "a"(value) 43 | : "memory" 44 | ); 45 | return value; 46 | } 47 | 48 | static inline int // return old value 49 | atomic_comp_swap(volatile void *lockword, 50 | int exchange, 51 | int comperand) 52 | { 53 | __asm__ __volatile__( 54 | "lock cmpxchg %1, (%2)" 55 | :"=a"(comperand) 56 | :"d"(exchange), "r"(lockword), "a"(comperand) 57 | ); 58 | return comperand; 59 | } 60 | 61 | static inline int // return old value 62 | atomic_comp_swap64(volatile void *lockword, 63 | int64_t exchange, 64 | int64_t 
comperand) 65 | { 66 | __asm__ __volatile__( 67 | "lock cmpxchg %1, (%2)" 68 | :"=a"(comperand) 69 | :"d"(exchange), "r"(lockword), "a"(comperand) 70 | ); 71 | return comperand; 72 | } 73 | 74 | #define nop() __asm__ ("pause" ) 75 | #define sfence() __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory") 76 | 77 | #define AtomicGetValue(x) (atomic_comp_swap(&(x), 0, 0)) 78 | #define AtomicSetValue(x, v) (atomic_swap(&(x), (v))) 79 | #define AtomicSetValueIf(x, v, ifn) (atomic_comp_swap(&(x), (v), ifn)) 80 | 81 | // return new value 82 | #define AtomicDec(c) (atomic_add(&(c), -1) - 1) 83 | #define AtomicInc(c) (atomic_add(&(c), 1) + 1) 84 | 85 | #endif //__ATOMIC_H__ 86 | -------------------------------------------------------------------------------- /src/cmd_ctrler.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Mon 29 Jun 2015 11:26:40 AM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include "logger_container.h" 10 | #include "mysql_selector.h" 11 | #include "mysql_dumper.h" 12 | #include "mongo_dumper.h" 13 | #include "link_scheduler.h" 14 | #include "http_processor.h" 15 | #include "extractor.h" 16 | #include "cmd_ctrler.h" 17 | 18 | using std::istringstream; 19 | 20 | CmdCtrler::CmdCtrler(QApplication *app) 21 | :_app(app) 22 | { 23 | } 24 | 25 | CmdCtrler::~CmdCtrler() 26 | { 27 | } 28 | 29 | void CmdCtrler::control(string& response, const string& cmd) 30 | { 31 | istringstream sscmd(cmd); 32 | string level1cmd; 33 | string level2cmd; 34 | string level3cmd; 35 | sscmd >> level1cmd; 36 | sscmd >> level2cmd; 37 | sscmd >> level3cmd; 38 | 39 | if ("help" == level1cmd) { 40 | if ("" == level2cmd) { 41 | response = "command: help|show"; 42 | } 43 | } else if ("show" == level1cmd) { 44 | if ("RequestRecv" == level2cmd) { 45 | _handlers["RequestRecv"]->control(response, level3cmd); 46 | } else if ("MySqlSelector" == level2cmd) { 47 | _handlers["MySqlSelector"]->control(response, level3cmd); 48 | } else if ("HttpProcessor" == level2cmd) { 49 | _handlers["HttpProcessor"]->control(response, level3cmd); 50 | } else if ("LinkScheduler" == level2cmd) { 51 | _handlers["LinkScheduler"]->control(response, level3cmd); 52 | } else if ("MongoDumper" == level2cmd) { 53 | _handlers["MongoDumper"]->control(response, level3cmd); 54 | } else if ("MySqlDumper" == level2cmd) { 55 | _handlers["MySqlDumper"]->control(response, level3cmd); 56 | } else if ("Extractor" == level2cmd) { 57 | _handlers["Extractor"]->control(response, level3cmd); 58 | } else { 59 | response = "must specific modle: RequestRecv|MySqlSelector|HttpProcessor|LinkScheduler|MongoDumper|MySqlDumper|Extractor"; 60 | } 61 | } 62 | } 63 | 64 | void CmdCtrler::addHandler(const string name, 
Controllable* controllable) 65 | { 66 | LOG_F(DEBUG, "addHandler %s %p", name.c_str(), controllable); 67 | _handlers[name] = controllable; 68 | } 69 | -------------------------------------------------------------------------------- /src/cmd_ctrler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Mon 29 Jun 2015 11:26:13 AM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __CMD_CTRLER_H__ 9 | #define __CMD_CTRLER_H__ 10 | 11 | #include 12 | #include 13 | #include 14 | #include "controllable.h" 15 | 16 | using std::string; 17 | using std::map; 18 | 19 | class CmdCtrler 20 | { 21 | public: 22 | CmdCtrler(QApplication *app); 23 | ~CmdCtrler(); 24 | 25 | void control(string& response, const string& cmd); 26 | void addHandler(const string name, Controllable* controllable); 27 | 28 | private: 29 | map _handlers; 30 | QApplication *_app; 31 | }; 32 | 33 | #endif //__CMD_CTRLER_H__ 34 | -------------------------------------------------------------------------------- /src/conf.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Wed 03 Jun 2015 02:58:02 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include "logger_container.h" 12 | #include "util.h" 13 | #include "conf.h" 14 | 15 | IMPLEMENT_SINGLETON(Conf); 16 | 17 | bool Conf::load(const char *confFile) 18 | { 19 | boost::property_tree::ptree pt; 20 | try { 21 | read_xml(confFile, pt); 22 | mysqlHost = pt.get("lspider.mysql.host"); 23 | mysqlPort = pt.get("lspider.mysql.port"); 24 | mysqlUser = pt.get("lspider.mysql.user"); 25 | mysqlPassword = pt.get("lspider.mysql.password"); 26 | mysqlDatabase = pt.get("lspider.mysql.database"); 27 | mysqlTableName = pt.get("lspider.mysql.tableName"); 28 | mysqlSelectInterval = pt.get("lspider.mysql.selector.selectInterval"); 29 | mysqlSelectMaxLinkDepth = pt.get("lspider.mysql.selector.maxLinkDepth"); 30 | mysqlMaxCrawlRetry = pt.get("lspider.mysql.selector.maxCrawlRetry"); 31 | mysqlSelectorSwitch = pt.get("lspider.mysql.selector.switch"); 32 | maindomainListFile = pt.get("lspider.mysql.selector.maindomainListFile"); 33 | mysqlDumperInsertBatch = pt.get("lspider.mysql.dumper.insertBatch"); 34 | mysqlDumperUpdateBatch = pt.get("lspider.mysql.dumper.updateBatch"); 35 | httpEventPrintStateInterval = pt.get("lspider.http.eventPrintStateInterval"); 36 | httpConnectTimeout = pt.get("lspider.http.connectTimeout"); 37 | httpSendTimeout = pt.get("lspider.http.sendTimeout"); 38 | httpRecvTimeout = pt.get("lspider.http.recvTimeout"); 39 | httpMaxRetry = pt.get("lspider.http.maxRetry"); 40 | httpMaxConcurrence = pt.get("lspider.http.maxConcurrence"); 41 | httpMaxRedirectDepth = pt.get("lspider.http.maxRedirectDepth"); 42 | httpMaxPageSize = pt.get("lspider.http.maxPageSize"); 43 | httpConnectQueueSize = pt.get("lspider.http.connectQueueSize"); 44 | schedulerPrintStateInterval = pt.get("lspider.scheduler.printStateInterval"); 45 | 
schedulerIpScheduleInterval = pt.get("lspider.scheduler.ipScheduleInterval"); 46 | schedulerMaxUrlTableSize = pt.get("lspider.scheduler.maxUrlTableSize"); 47 | mongoHostAndPort = pt.get("lspider.mongo.hostAndPort"); 48 | extractorMaxQueueSize = pt.get("lspider.extractor.maxQueueSize"); 49 | extractorTimeout = pt.get("lspider.extractor.timeout"); 50 | 51 | if (false == loadMaindomainList(maindomainListFile)) { 52 | LOG_F(FATAL, "%s must exist and has at least one maindomain", maindomainListFile.c_str()); 53 | return false; 54 | } 55 | } catch (boost::property_tree::xml_parser::xml_parser_error &e) { 56 | LOG_F(FATAL, "read_xml %s fail line:%ld message:%s what:%s", 57 | confFile, e.line(), e.filename().c_str(), e.what()); 58 | return false; 59 | } catch (boost::property_tree::ptree_bad_path &e) { 60 | LOG_F(FATAL, "read_xml %s fail what:%s", 61 | confFile, e.what()); 62 | return false; 63 | } catch (std::exception &e) { 64 | LOG_F(FATAL, "read_xml %s fail what:%s", 65 | confFile, e.what()); 66 | return false; 67 | } 68 | 69 | return true; 70 | } 71 | 72 | bool Conf::loadMaindomainList(const string maindomainListFile) 73 | { 74 | maindomainList.clear(); 75 | std::fstream fin(maindomainListFile.c_str()); 76 | std::string line; 77 | while (getline(fin, line)) { 78 | ltrim(rtrim(line)); 79 | if (line.length() == 0) { 80 | continue; 81 | } 82 | maindomainList.push_back(line); 83 | } 84 | 85 | if (maindomainList.size() > 0) { 86 | return true; 87 | } else { 88 | return false; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/conf.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Wed 03 Jun 2015 02:57:09 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __CONF_H__ 9 | #define __CONF_H__ 10 | 11 | #include 12 | #include 13 | #include "singleton.h" 14 | 15 | using std::string; 16 | using std::vector; 17 | 18 | class Conf 19 | { 20 | DECLARE_SINGLETON(Conf); 21 | public: 22 | bool load(const char *confFile); 23 | bool loadMaindomainList(const string maindomainListFile); 24 | 25 | string mysqlHost; 26 | int mysqlPort; 27 | string mysqlUser; 28 | string mysqlPassword; 29 | string mysqlDatabase; 30 | string mysqlTableName; 31 | int mysqlMaxCrawlRetry; 32 | int mysqlSelectInterval; 33 | int mysqlSelectMaxLinkDepth; 34 | string mysqlSelectorSwitch; 35 | int mysqlDumperInsertBatch; 36 | int mysqlDumperUpdateBatch; 37 | int httpEventPrintStateInterval; 38 | int httpConnectTimeout; 39 | int httpSendTimeout; 40 | int httpRecvTimeout; 41 | int httpMaxRetry; 42 | int httpMaxConcurrence; 43 | int httpMaxRedirectDepth; 44 | int httpMaxPageSize; 45 | int httpConnectQueueSize; 46 | int schedulerPrintStateInterval; 47 | int schedulerIpScheduleInterval; 48 | size_t schedulerMaxUrlTableSize; 49 | string mongoHostAndPort; 50 | int extractorMaxQueueSize; 51 | // 解析超时时间,单位毫秒 52 | int extractorTimeout; 53 | string maindomainListFile; 54 | vector maindomainList; 55 | }; 56 | 57 | #endif //__CONF_H__ 58 | -------------------------------------------------------------------------------- /src/controllable.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
 * Author: lichuang(whlichuang@126.com)
 * Created Time: Mon 29 Jun 2015 11:30:06 AM CST
 * Description:
 ************************************************************************/

#include "controllable.h"

// Out-of-line definition of the virtual destructor declared in controllable.h.
Controllable::~Controllable()
{
}

--------------------------------------------------------------------------------
/src/controllable.h:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015, LiChuang. All rights reserved.
 * Author: lichuang(whlichuang@126.com)
 * Created Time: Mon 29 Jun 2015 11:29:20 AM CST
 * Description:
 ************************************************************************/

#ifndef __CONTROLLABLE_H__
#define __CONTROLLABLE_H__

// NOTE(review): the header name of this #include was lost in extraction;
// presumably <string>, given the using-declaration below - restore it.
#include

using std::string;

/**
 * Interface of a module that can answer textual control commands.
 * CmdCtrler stores Controllable pointers by module name and forwards the
 * third word of a "show <module> <arg>" command to control().
 */
class Controllable
{
public:
    virtual ~Controllable();
    // Handles `cmd` and is expected to write its reply into `response`.
    virtual void control(string& response, const string& cmd) = 0;
};

#endif //__CONTROLLABLE_H__

--------------------------------------------------------------------------------
/src/crawl_listen_handler.cpp:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015, LiChuang. All rights reserved.
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu May 14 15:55:32 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | 9 | #include "logger_container.h" 10 | #include "http_processor.h" 11 | #include "url_context.h" 12 | #include "request_recv.h" 13 | #include "crawl_listen_handler.h" 14 | 15 | CrawlListenHandler::CrawlListenHandler(HttpProcessor *httpProcessor, 16 | RequestRecv *requestRecv, 17 | CmdCtrler *cmdCtroler) 18 | :_httpProcessor(httpProcessor), _requestRecv(requestRecv), _cmdCtrler(cmdCtroler) 19 | 20 | { 21 | } 22 | 23 | void CrawlListenHandler::request(const IUrl& u) 24 | { 25 | UrlContext *urlContext = new UrlContext; 26 | char normalurl[MAX_URL_LEN]; 27 | if (1 == normalize_url(u.str.c_str(), normalurl, MAX_URL_LEN)) { 28 | urlContext->url = string("http://") + string(normalurl); 29 | } else { 30 | urlContext->url = u.str; 31 | } 32 | urlContext->linkDepth = 0; 33 | urlContext->hub = u.hub; 34 | 35 | uint64_t sign = get_url_sign64(u.str.c_str()); 36 | snprintf(urlContext->sign, MAX_SIGN_LEN, "%lu", sign); 37 | 38 | _requestRecv->addRequestCount(); 39 | _httpProcessor->pushConnectQueue(urlContext); 40 | } 41 | 42 | void CrawlListenHandler::exec_cmd(string& response, const string& cmd) 43 | { 44 | LOG_F(DEBUG, "cmd=%s", cmd.c_str()); 45 | assert(NULL != _cmdCtrler); 46 | _cmdCtrler->control(response, cmd); 47 | } 48 | -------------------------------------------------------------------------------- /src/crawl_listen_handler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu May 14 15:55:40 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __CRAWL_LISTEN_HANDLER_H__ 9 | #define __CRAWL_LISTEN_HANDLER_H__ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "CrawlService.h" 18 | #include "cmd_ctrler.h" 19 | 20 | using namespace ::apache::thrift; 21 | using namespace ::apache::thrift::protocol; 22 | using namespace ::apache::thrift::transport; 23 | using namespace ::apache::thrift::server; 24 | using namespace ::apache::thrift::concurrency; 25 | 26 | using boost::shared_ptr; 27 | 28 | using namespace ::lspider; 29 | 30 | class HttpProcessor; 31 | class RequestRecv; 32 | 33 | /** 34 | * 抓取器抓取服务的网络请求处理逻辑 35 | * 基于thrift RPC 36 | */ 37 | class CrawlListenHandler : virtual public CrawlServiceIf { 38 | public: 39 | CrawlListenHandler(HttpProcessor *httpProcessor, 40 | RequestRecv *requestRecv, 41 | CmdCtrler *cmdCtrler); 42 | 43 | /** 44 | * RPC接口 45 | */ 46 | void request(const IUrl& u); 47 | void exec_cmd(string& response, const string& cmd); 48 | 49 | private: 50 | HttpProcessor *_httpProcessor; // 传递指针 51 | RequestRecv *_requestRecv; 52 | CmdCtrler *_cmdCtrler; 53 | }; 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /src/defines.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define ASSERT(c) if (!(c)) { \ 4 | abort(); \ 5 | } 6 | #define ASSERT_EQUAL(x, y) if ((x) != (y)) { \ 7 | abort(); \ 8 | } 9 | -------------------------------------------------------------------------------- /src/extractor.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu May 14 15:55:32 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include "extractor_worker_view.h" 10 | #include "logger_container.h" 11 | #include "conf.h" 12 | #include "url_context.h" 13 | #include "extractor.h" 14 | 15 | Extractor::Extractor(ExtractorWorkerView *extractorWorkerView) 16 | { 17 | _extractorWorkerView = extractorWorkerView; 18 | } 19 | 20 | void Extractor::start() 21 | { 22 | LOG(INFO, "start"); 23 | _extractorWorkerView->doWithOneUrl(); 24 | } 25 | 26 | void Extractor::push(UrlContext *urlContext) 27 | { 28 | while (_waitExtractQueue.size() > Conf::instance()->extractorMaxQueueSize) { 29 | usleep(10000); 30 | } 31 | _waitExtractQueue.push_back(urlContext); 32 | } 33 | 34 | UrlContext * Extractor::pop() 35 | { 36 | return _waitExtractQueue.pop_front(); 37 | } 38 | 39 | int Extractor::queueSize() 40 | { 41 | return _waitExtractQueue.size(); 42 | } 43 | 44 | void Extractor::control(string& response, const string& cmd) 45 | { 46 | LOG(DEBUG, ""); 47 | } 48 | -------------------------------------------------------------------------------- /src/extractor.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
 * Author: lichuang(whlichuang@126.com)
 * Created Time: Thu May 14 15:55:40 2015
 * Description:
 ************************************************************************/

#ifndef __EXTRACTOR_H__
#define __EXTRACTOR_H__

#include "synced_queue.h"
#include "controllable.h"
#include "DoubleList.h"

class ExtractorWorkerView;
class UrlContext;

/**
 * Holds the queue of UrlContext objects waiting for extraction and starts
 * the ExtractorWorkerView that consumes them.
 */
class Extractor : public Controllable
{
public:
    Extractor(ExtractorWorkerView *workerView);

    // Logs and asks the worker view to process its first URL.
    void start();

    // Enqueues a context, polling while the queue is above the configured size.
    void push(UrlContext *urlContext);
    // Returns the result of the queue's pop_front().
    UrlContext * pop();
    // Current number of queued contexts.
    int queueSize();
    virtual void control(string& response, const string& cmd);

private:
    ExtractorWorkerView *_extractorWorkerView;
    // NOTE(review): the template arguments of this declaration were lost in
    // extraction; presumably TSyncedQueue<UrlContext, TLinkedList<UrlContext> >
    // since pop() returns UrlContext* - restore from the repository.
    // Original comment (translated): "queue of requests waiting to connect".
    TSyncedQueue > _waitExtractQueue;
};


#endif

--------------------------------------------------------------------------------
/src/extractor_worker_view.cpp:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015, LiChuang. All rights reserved.
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Wed 27 May 2015 10:01:38 AM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "logger_container.h" 13 | #include "util.h" 14 | #include "mongo_dumper.h" 15 | #include "url_context.h" 16 | #include "extractor.h" 17 | #include "conf.h" 18 | #include "extractor_worker_view.h" 19 | 20 | ExtractorWorkerView::ExtractorWorkerView(QWidget *parent) 21 | :QWebView(parent) 22 | { 23 | setPage(new MyWebPage()); 24 | _extractor = NULL; 25 | _mongoDumper = NULL; 26 | connect(this, SIGNAL(loadFinished(bool)), SLOT(onLoadFinish(bool))); 27 | 28 | this->page()->networkAccessManager()->setNetworkAccessible(QNetworkAccessManager::NotAccessible); 29 | this->settings()->setAttribute(QWebSettings::AutoLoadImages, false); 30 | this->settings()->setAttribute(QWebSettings::JavascriptEnabled, false); 31 | this->settings()->setMaximumPagesInCache(0); 32 | this->settings()->setObjectCacheCapacities(0, 0, 0); 33 | this->settings()->setOfflineWebApplicationCacheQuota(0); 34 | } 35 | 36 | void ExtractorWorkerView::setExtractor(Extractor *extractor) 37 | { 38 | _extractor = extractor; 39 | } 40 | 41 | void ExtractorWorkerView::setMongoDumper(MongoDumper *mongoDumper) 42 | { 43 | _mongoDumper = mongoDumper; 44 | } 45 | 46 | void ExtractorWorkerView::doWithOneUrl() 47 | { 48 | assert(NULL != _extractor); 49 | _urlContext = _extractor->pop(); 50 | QString body = QString(_urlContext->body.c_str()); 51 | QString url = QString(_urlContext->url.c_str()); 52 | // 设置解析超时时间 53 | _timerID = this->startTimer(Conf::instance()->extractorTimeout); 54 | this->setHtml(body, url); 55 | } 56 | 57 | void ExtractorWorkerView::getTitle() 58 | { 59 | // 获取标题 60 | QString title = this->title(); 61 | if (title.length() > 0) { 62 | _urlContext->title = title.toStdString(); 63 | } 64 | to_utf8(_urlContext->title); 65 | LOG_F(DEBUG, "%d [%s] 
title=[%s]", 66 | _urlContext->uuid, 67 | _urlContext->url.c_str(), 68 | _urlContext->title.c_str()); 69 | } 70 | 71 | void ExtractorWorkerView::getLinks() 72 | { 73 | // 找所有超链接 74 | QWebFrame *frame = this->page()->mainFrame(); 75 | QWebElementCollection collection = frame->documentElement().findAll("a[href]"); 76 | foreach (QWebElement element, collection) { 77 | std::string anchor = element.toPlainText().toStdString(); 78 | amendAnchor(anchor); 79 | std::string source_anchor = anchor; 80 | 81 | std::string href = element.attribute(QString("href")).toLocal8Bit().constData(); 82 | to_utf8(href); 83 | ltrim(rtrim(href)); 84 | 85 | if (href[0] != '#' && href != "" && href != "/" 86 | && (href.find("javascript") == std::string::npos) 87 | && (href.find("mailto") == std::string::npos)) { 88 | // 如果链接是相对路径 89 | if (std::string::npos == href.find("://")) { 90 | if (href[0] == '/') { 91 | href = std::string("http://") + std::string(_urlContext->host) + href; 92 | } else { 93 | href = std::string("http://") + std::string(_urlContext->host) + "/" + href; 94 | } 95 | } else if (0 != href.find("http")) { 96 | // 如果是其他特殊协议则不保存 97 | LOG_F(DEBUG, "%d [%s] ignore link[%s]", 98 | _urlContext->uuid, 99 | _urlContext->url.c_str(), 100 | href.c_str()); 101 | continue; 102 | } 103 | 104 | char normalurl[MAX_URL_LEN]; 105 | if (1 == normalize_url(href.c_str(), normalurl, MAX_URL_LEN)) { 106 | href = string("http://") + string(normalurl); 107 | } else { 108 | LOG_F(DEBUG, "%d [%s] normalurl fail [%s]", 109 | _urlContext->uuid, 110 | _urlContext->url.c_str(), 111 | href.c_str()); 112 | } 113 | 114 | if (href.length() >= MAX_URL_LEN-1) { 115 | continue; 116 | } 117 | if (href == _urlContext->url) { 118 | continue; 119 | } 120 | 121 | LOG_F(DEBUG, "%d [%s] href=[%s] anchor=[%s]", 122 | _urlContext->uuid, 123 | _urlContext->url.c_str(), 124 | href.c_str(), 125 | anchor.c_str()); 126 | _urlContext->links[href] = anchor; 127 | } 128 | } 129 | } 130 | 131 | void 
ExtractorWorkerView::amendAnchor(string &anchor) 132 | { 133 | to_utf8(anchor); 134 | ltrim(rtrim(anchor)); 135 | if (anchor.length() > MAX_ANCHOR_LEN) { 136 | anchor = anchor.substr(0, MAX_ANCHOR_LEN); 137 | } 138 | size_t pos = string::npos; 139 | if ((pos = anchor.find('\n')) != string::npos) { 140 | anchor = anchor.substr(0, pos); 141 | } 142 | if ((pos = anchor.find('\r')) != string::npos) { 143 | anchor = anchor.substr(0, pos); 144 | } 145 | } 146 | 147 | void ExtractorWorkerView::onLoadFinish(bool ok) 148 | { 149 | assert(NULL != _mongoDumper); 150 | getTitle(); 151 | getLinks(); 152 | _mongoDumper->waitDumpQueue.push_back(_urlContext); 153 | killTimer(_timerID); 154 | doWithOneUrl(); 155 | } 156 | 157 | void ExtractorWorkerView::timerEvent(QTimerEvent *event) 158 | { 159 | LOG_F(WARN, "%d [%s] extract timeout", _urlContext->uuid, _urlContext->url.c_str()); 160 | // 这里还不确定超时后重试能否就成功了,所以继续观察 161 | //doWithOneUrl(); 162 | QString body = QString(_urlContext->body.c_str()); 163 | QString url = QString(_urlContext->url.c_str()); 164 | _timerID = this->startTimer(Conf::instance()->extractorTimeout); 165 | this->setHtml(body, url); 166 | } 167 | -------------------------------------------------------------------------------- /src/extractor_worker_view.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
 * Author: lichuang(whlichuang@126.com)
 * Created Time: Wed 27 May 2015 10:01:35 AM CST
 * Description:
 ************************************************************************/

#ifndef __EXTRACTOR_WORKER_VIEW_H__
#define __EXTRACTOR_WORKER_VIEW_H__

// NOTE(review): the header names of the two #include lines below were lost in
// extraction (presumably the QWebView / QWebPage and <string> headers) -
// restore them from the repository.
#include
#include
#include "DoubleList.h"

using std::string;

class Extractor;
class MongoDumper;
class UrlContext;

// QWebPage whose JavaScript alert handler does nothing.
class MyWebPage : public QWebPage {
protected:
    virtual void javaScriptAlert (QWebFrame*, const QString &) {}
};

/**
 * QWebView used as the extraction worker: loads one page body at a time,
 * extracts its title and links, and passes the context on to the MongoDumper.
 */
class ExtractorWorkerView : public QWebView
{
    Q_OBJECT
public:
    explicit ExtractorWorkerView(QWidget *parent = 0);


    void setExtractor(Extractor *extractor);
    void setMongoDumper(MongoDumper *mongoDumper);

    /**
     * Take one url from the queue and start loading it
     */
    void doWithOneUrl();

private:
    /**
     * Extract the title
     */
    void getTitle();
    /**
     * Extract the outgoing links
     */
    void getLinks();
    /**
     * Fix up the anchor text
     */
    void amendAnchor(string &anchor);

protected slots:
    /**
     * Callback triggered when the url has finished loading
     */
    void onLoadFinish(bool ok);

protected:
    // Fired by the timer started in doWithOneUrl(), i.e. on extraction timeout.
    void timerEvent(QTimerEvent *event);

private:
    UrlContext *_urlContext; // temporarily holds the urlContext being processed
    Extractor *_extractor;
    MongoDumper *_mongoDumper;
    int _timerID; // id returned by the most recent startTimer() call

    const static int MAX_ANCHOR_LEN = 64; // anchor texts longer than this are cut
};


#endif //__EXTRACTOR_WORKER_VIEW_H__

--------------------------------------------------------------------------------
/src/http_event_engine.cpp:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015, LiChuang. All rights reserved.
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu May 14 15:59:05 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include "logger_container.h" 10 | #include "http_processor.h" 11 | #include "conf.h" 12 | #include "http_event_engine.h" 13 | 14 | HttpEventEngine::HttpEventEngine(HttpProcessor *httpProcessor) 15 | { 16 | base = NULL; 17 | _isStop = false; 18 | _httpProcessor = httpProcessor; 19 | } 20 | 21 | HttpEventEngine::~HttpEventEngine() 22 | { 23 | if (NULL != base) { 24 | event_base_free(base); 25 | } 26 | stop(); 27 | } 28 | 29 | void HttpEventEngine::event_log_callback(int severity, const char *msg) 30 | { 31 | LOG_F(DEBUG, "%d %s", severity, msg); 32 | } 33 | 34 | void HttpEventEngine::event_fatal_callback(int err) 35 | { 36 | LOG_F(DEBUG, "%d", err); 37 | } 38 | 39 | /** 40 | * 线程执行函数 41 | */ 42 | void HttpEventEngine::run() 43 | { 44 | LOG(INFO, "start"); 45 | evthread_use_pthreads(); 46 | event_set_log_callback(HttpEventEngine::event_log_callback); 47 | event_set_fatal_callback(HttpEventEngine::event_fatal_callback); 48 | base = event_base_new(); 49 | 50 | _printStateEvent = evtimer_new(base, on_print_state, this); 51 | struct timeval t = {Conf::instance()->httpEventPrintStateInterval, 0 }; 52 | evtimer_add(_printStateEvent, &t); 53 | 54 | event_base_dispatch(base); 55 | abort(); 56 | } 57 | 58 | void HttpEventEngine::stop() 59 | { 60 | _isStop = true; 61 | this->wait(); 62 | } 63 | 64 | void HttpEventEngine::on_print_state(evutil_socket_t sock, short event, void *arg) 65 | { 66 | HttpEventEngine *httpEventEngine = (HttpEventEngine*)arg; 67 | LOG(DEBUG, "on_print_state"); 68 | httpEventEngine->_httpProcessor->printState(); 69 | struct timeval t = {Conf::instance()->httpEventPrintStateInterval, 0 }; 70 | evtimer_add(httpEventEngine->_printStateEvent, &t); 71 | } 72 | -------------------------------------------------------------------------------- 
/src/http_event_engine.h:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015, LiChuang. All rights reserved.
 * Author: lichuang(whlichuang@126.com)
 * Created Time: Thu May 14 15:58:37 2015
 * Description:
 ************************************************************************/

#ifndef __HTTP_EVENT_ENGINE_H__
#define __HTTP_EVENT_ENGINE_H__

#include /* NOTE(review): header name lost when this dump was extracted; restore from the repository */
#include /* NOTE(review): header name lost when this dump was extracted; restore from the repository */
#include "lthread.h"

class HttpProcessor;

/**
 * Event engine: maintains the event_base used for http requests.
 * (The original comment read "time engine", presumably a typo.)
 */
class HttpEventEngine : public lthread::LThread
{
public:
    HttpEventEngine(HttpProcessor *httpProcessor);
    ~HttpEventEngine();

    /**
     * Thread entry point.
     */
    virtual void run();

    void stop();

    /** libevent timer callback; arg is the HttpEventEngine instance. */
    static void on_print_state(evutil_socket_t sock, short event, void *arg);

    struct event_base* base; // event_base

private:
    static void event_log_callback(int severity, const char *msg);
    static void event_fatal_callback(int err);

    bool _isStop;
    struct event *_printStateEvent;   // periodic print-state timer
    HttpProcessor *_httpProcessor;
};


#endif //__HTTP_EVENT_ENGINE_H__
--------------------------------------------------------------------------------
/src/http_processor.h:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015, LiChuang. All rights reserved.
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu May 14 15:56:04 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __HTTP_PROCESSOR_H__ 9 | #define __HTTP_PROCESSOR_H__ 10 | 11 | #include 12 | #include "lthread.h" 13 | #include "synced_queue.h" 14 | #include "locked_queue.h" 15 | #include "DoubleList.h" 16 | #include "controllable.h" 17 | #include "mysql_base.h" 18 | 19 | using std::string; 20 | 21 | class HttpEventEngine; 22 | class MongoDumper; 23 | class Extractor; 24 | class UrlContext; 25 | 26 | /** 27 | * http协议处理,网络收发数据 28 | */ 29 | class HttpProcessor : public lthread::LThread, public MySqlBase, public Controllable 30 | { 31 | public: 32 | typedef enum { 33 | E_ERR = -1, // socket读写异常 34 | E_FINISH = 0, // 发送或接收完成 35 | E_AGAIN = 1, // 读写正常,buffer为空 36 | } SndRcvRetType; 37 | 38 | HttpProcessor(MongoDumper* mongoDumper, Extractor *extractor, string connectionName); 39 | ~HttpProcessor(); 40 | 41 | TSyncedQueue > waitConnectQueue; // 待连接请求队列 42 | 43 | void pushConnectQueue(UrlContext *urlContext); 44 | 45 | virtual void run(); 46 | void stop(); 47 | virtual void control(string& response, const string& cmd); 48 | 49 | /** 50 | * 收到完整网页包后回调 51 | */ 52 | virtual void onRecvOnePage(UrlContext *urlContext); 53 | /** 54 | * 抓取超时回调 55 | */ 56 | virtual void onTimeout(UrlContext *urlContext); 57 | /** 58 | * 对方关闭连接 59 | */ 60 | virtual void onBroken(UrlContext *urlContext); 61 | 62 | void printState(); 63 | 64 | 65 | private: 66 | // libevent 回调事件 67 | static void on_connected(int sock, short /*event*/, void* arg); 68 | static void on_send(int sock, short /*event*/, void* arg); 69 | static void on_recv(int sock, short /*event*/, void* arg); 70 | 71 | int connect(UrlContext *urlContext); 72 | bool getServAddr(UrlContext *urlContext, struct sockaddr_in &serv_addr); 73 | int createNonblockingSocket(); 74 | SndRcvRetType tryRecv(UrlContext *urlContext); 75 | SndRcvRetType 
trySend(UrlContext *urlContext); 76 | void doWithSockTimeout(UrlContext *urlContext); 77 | void doWithSend(UrlContext *urlContext); 78 | void doWithRecv(UrlContext *urlContext); 79 | void doWithCrawlDone(UrlContext *urlContext); 80 | bool doWithRedirect(UrlContext *urlContext); 81 | 82 | HttpEventEngine *_engine; 83 | MongoDumper *_mongoDumper; 84 | Extractor *_extractor; 85 | bool _isStop; 86 | 87 | volatile int _concCount; 88 | volatile int _tryProcCount; 89 | volatile int _procFinishCount; 90 | }; 91 | 92 | #endif 93 | -------------------------------------------------------------------------------- /src/http_protocol.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Tue 19 May 2015 10:53:24 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __HTTP_PROTOCOL_H__ 9 | #define __HTTP_PROTOCOL_H__ 10 | 11 | class UrlContext; 12 | 13 | class HttpProtocol 14 | { 15 | public: 16 | typedef enum { 17 | HA_OK, 18 | HA_FAIL 19 | } HARetType; 20 | 21 | static bool buildHttpRequestHeader(UrlContext *urlContext); 22 | static HARetType analyse(UrlContext *urlContext); 23 | static HARetType analyseFirstLine(UrlContext *urlContext); 24 | static HARetType analyseHeader(UrlContext *urlContext); 25 | static HARetType analyseCookie(UrlContext *urlContext); 26 | static HARetType analyseBody(UrlContext *urlContext); 27 | static HARetType analyseChunk(UrlContext *urlContext); 28 | const static std::string DOUBLECRLF; 29 | const static std::string CRLF; 30 | }; 31 | 32 | #endif //__HTTP_PROTOCOL_H__ 33 | -------------------------------------------------------------------------------- /src/keyed_queue.h: -------------------------------------------------------------------------------- 1 | 
/*************************************************************************
 * Copyright (c) 2015, LiChuang. All rights reserved.
 * Author: lichuang(whlichuang@126.com)
 * Created Time: Fri 29 May 2015 04:48:53 PM CST
 * Description:
 ************************************************************************/

#ifndef __KEYED_QUEUE_H__
#define __KEYED_QUEUE_H__

#include /* NOTE(review): header name lost when this dump was extracted; restore from the repository */
#include "DoubleList.h"

using std::map;
using std::pair;
using std::less;

/*
 * Queue of elements that is also indexed by key: a linked list keeps the
 * order and a map from key to element gives keyed lookup.
 *
 * TElement must contain the following fields:
 *  DLINK link;
 *  TKey key;
 *
 * NOTE(review): the template parameter list and the template arguments of
 * the typedefs / `pair` locals below were lost when this dump was
 * extracted; restore them from the repository.
 */
template >
class TKeyedQueue
{
protected:
    typedef map Indexer;
    typedef TLinkedList Queue;
    typedef typename Indexer::value_type value_type;
public:
    typedef typename Indexer::iterator Iterator;
    typedef bool (*ENUMERATOR)(TElement* pE, ptr pData); // return false to stop the enumeration

    TKeyedQueue() {}
    ~TKeyedQueue() {}

    /*
     * Append pElement unless its key is already indexed.
     * Returns true when appended. Otherwise returns false and, if
     * ppOldElement is given, stores the element already holding the key.
     */
    bool push_back(TElement* pElement, TElement** ppOldElement=NULL) {
        ASSERT(NULL != pElement);
        ASSERT(DLINK_IS_STANDALONE(&pElement->link));

        pair pr;
        pr = m_Indexer.insert(value_type(pElement->key, pElement));
        if (pr.second) {
            m_Queue.push_back(pElement);
            return true;
        }
        if (ppOldElement) {
            *ppOldElement = pr.first->second;
        }
        return false;
    }

    /*
     * Append pNew, or put it in the list position of the element that
     * already holds the same key. Returns the replaced element, or NULL
     * when nothing was replaced.
     */
    TElement* push_back_and_replace(TElement* pNew) {
        ASSERT(NULL != pNew);
        ASSERT(DLINK_IS_STANDALONE(&pNew->link));

        pair pr;
        pr = m_Indexer.insert(value_type(pNew->key, pNew));
        if (pr.second) {
            m_Queue.push_back(pNew);
        }
        else {
            TElement* pOld = pr.first->second;
            if (pOld != pNew) { // prevent double insertion
                pr.first->second = pNew;
                // splice pNew into pOld's place, then detach pOld
                DLINK_REPLACE(&pOld->link, &pNew->link);
                DLINK_INITIALIZE(&pOld->link);
                return pOld;
            }
        }
        return NULL;
    }

    /* Remove the first element from both the list and the index. */
    TElement* pop_front() {
        TElement* pE = m_Queue.pop_front();
        if (pE) {
            Iterator it = m_Indexer.find(pE->key);
            ASSERT(it != m_Indexer.end());
            m_Indexer.erase(it);
        }
        return pE;
    }

    /* Move the first element to the back; the index is not touched. */
    TElement* front_to_back() {
        TElement* pE = m_Queue.pop_front();
        if (pE) {
            m_Queue.push_back(pE);
        }
        return pE;
    }

    /* Move pElement to the back if it is the element indexed under its key. */
    bool move_to_back(TElement* pElement) {
        ASSERT(NULL != pElement);
        Iterator it = m_Indexer.find(pElement->key);
        if (m_Indexer.end()!=it && pElement==it->second) {
            m_Queue.remove(pElement);
            m_Queue.push_back(pElement);
            return true;
        }
        return false;
    }

    /* Move pElement to the front if it is the element indexed under its key. */
    bool move_to_front(TElement* pElement) {
        ASSERT(NULL != pElement);
        Iterator it = m_Indexer.find(pElement->key);
        if (m_Indexer.end()!=it && pElement==it->second) {
            m_Queue.remove(pElement);
            m_Queue.push_front(pElement);
            return true;
        }
        return false;
    }

    TElement* get_front() {
        return m_Queue.get_front();
    }

    /* Look up an element by key; NULL when the key is not indexed. */
    TElement* find(Key k) {
        Iterator it = m_Indexer.find(k);
        if (it != m_Indexer.end()) {
            return it->second;
        }
        return NULL;
    }

    /* Remove by key; returns the removed element or NULL. */
    TElement* remove(Key k) {
        Iterator it = m_Indexer.find(k);
        if (it != m_Indexer.end()) {
            TElement* pE = it->second;
            m_Queue.remove(pE);
            m_Indexer.erase(it);
            return pE;
        }
        return NULL;
    }

    /* Remove pElement if it is the element indexed under its key. */
    bool remove(TElement* pElement) {
        ASSERT(NULL != pElement);
        Iterator it = m_Indexer.find(pElement->key);
        if (m_Indexer.end()!=it && pElement==it->second) {
            m_Indexer.erase(it);
            m_Queue.remove(pElement);
            return true;
        }
        return false;
    }

    bool Enumerate(ENUMERATOR f, ptr pData) {
        return m_Queue.Enumerate(f, pData);
    }

    int size() const { return m_Queue.size(); }

protected:
    Indexer m_Indexer;  // indexer
    Queue m_Queue;      // the queue
};

#endif //__KEYED_QUEUE_H__
--------------------------------------------------------------------------------
/src/link.h:
--------------------------------------------------------------------------------
#ifndef __LINK_H__
#define __LINK_H__

/*
    SLINK - single link
*/
typedef struct _SLINK{
    _SLINK* _next;
} SLINK, *PSLINK;

//initialize the link
#define SLINK_INITIALIZE(_head) ((_head)->_next = NULL)

//check if list is empty
#define SLINK_IS_EMPTY(_head) ((_head)->_next == NULL)

//pop the first item in list
//NOTE(review): expands to two statements (written for `p = SLINK_POP(h);`);
//not safe as the body of an unbraced if/else.
#define SLINK_POP(_head) (_head)->_next;\
    (_head)->_next = (_head)->_next->_next;
//push an item to the head of list
#define SLINK_PUSH(_head,_link) (_link)->_next = (_head)->_next;\
    (_head)->_next = (_link)

/*
    DLINK - double link
*/
typedef struct _DLINK{
    _DLINK* _prev;
    _DLINK* _next;
} DLINK, *PDLINK;

//initialize the link (an empty list head points at itself)
#define DLINK_INITIALIZE(_head) ((_head)->_next = (_head)->_prev = (_head))

//check if list is empty
#define DLINK_IS_EMPTY(_head) ((_head)->_next == (_head))

//insert item after the _head item
#define DLINK_INSERT_NEXT(_head,_dlink) (_dlink)->_next = (_head)->_next; \
    (_dlink)->_prev = (_head); \
    (_head)->_next->_prev = (_dlink); \
    (_head)->_next = (_dlink)

//insert item previous to the _head item
#define DLINK_INSERT_PREV(_head,_dlink) (_dlink)->_prev = (_head)->_prev; \
    (_dlink)->_next = (_head); \
    (_head)->_prev->_next = (_dlink); \
    (_head)->_prev = (_dlink)
//remove the item from list (the removed item's own pointers are left as they were)
#define DLINK_REMOVE(_dlink) (_dlink)->_prev->_next = (_dlink)->_next;\
    (_dlink)->_next->_prev = (_dlink)->_prev
//extract the item previous to _head
#define DLINK_EXTRUCT_PREV(_head) (_head)->_prev; \
    DLINK_REMOVE((_head)->_prev)
//extract the item after the _head item
#define DLINK_EXTRUCT_NEXT(_head) (_head)->_next; \
    DLINK_REMOVE((_head)->_next)

// replace node _old with node _new in the list
#define DLINK_REPLACE(_old, _new) (_old)->_next->_prev = (_new); \
    (_old)->_prev->_next = (_new); \
    (_new)->_next = (_old)->_next; \
    (_new)->_prev = (_old)->_prev

#ifdef _DEBUG
#if defined(__x86_64__)
#define INVALID_POINTER (DLINK*)0xCCCCCCCCCCCCCCCCULL
#else
#define INVALID_POINTER (DLINK*)0xCCCCCCCCUL
#endif

//a link is standalone when it points at itself, is all NULL, or (debug
//builds only) still holds the 0xCC fill pattern
#define DLINK_IS_STANDALONE(_dlink) (((_dlink)==(_dlink)->_next \
    && (_dlink)==(_dlink)->_prev) \
    || (NULL==(_dlink)->_next \
    && NULL==(_dlink)->_prev) \
    || (INVALID_POINTER==(_dlink)->_next \
    && INVALID_POINTER==(_dlink)->_prev) \
    )
#define PTR_IS_NULL(_ptr) (NULL==(_ptr) || INVALID_POINTER==(DLINK*)(_ptr))

#else
#define DLINK_IS_STANDALONE(_dlink) (((_dlink)==(_dlink)->_next \
    && (_dlink)==(_dlink)->_prev) \
    || (NULL==(_dlink)->_next \
    && NULL==(_dlink)->_prev) \
    )
#define PTR_IS_NULL(_ptr) (NULL==(_ptr))

#endif

//get the object of type "type" that contains the field "field" stating in address "address"
#ifndef CONTAINING_RECORD
#define CONTAINING_RECORD(address, type, field) ((type *)( \
    (char*)(address) - \
    (long)(&((type *)0)->field)))
#endif

#endif
--------------------------------------------------------------------------------
/src/link_scheduler.cpp:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015, LiChuang. All rights reserved.
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Fri 29 May 2015 11:45:03 AM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include "logger_container.h" 9 | #include "http_processor.h" 10 | #include "conf.h" 11 | #include "link_scheduler.h" 12 | 13 | LinkScheduler::LinkScheduler(HttpProcessor *httpProcessor) 14 | :_linkTable(this) 15 | { 16 | _base = NULL; 17 | _isStop = false; 18 | _httpProcessor = httpProcessor; 19 | } 20 | 21 | LinkScheduler::~LinkScheduler() 22 | { 23 | if (NULL != _base) { 24 | event_base_free(_base); 25 | } 26 | stop(); 27 | } 28 | 29 | void LinkScheduler::event_log_callback(int severity, const char *msg) 30 | { 31 | LOG_F(DEBUG, "%d %s", severity, msg); 32 | } 33 | 34 | void LinkScheduler::event_fatal_callback(int err) 35 | { 36 | LOG_F(DEBUG, "%d", err); 37 | } 38 | 39 | void LinkScheduler::run() 40 | { 41 | LOG(INFO, "start"); 42 | evthread_use_pthreads(); 43 | event_set_log_callback(LinkScheduler::event_log_callback); 44 | event_set_fatal_callback(LinkScheduler::event_fatal_callback); 45 | _base = event_base_new(); 46 | 47 | _printStateEvent = evtimer_new(_base, on_print_state, this); 48 | struct timeval t = {Conf::instance()->schedulerPrintStateInterval, 0 }; 49 | evtimer_add(_printStateEvent, &t); 50 | 51 | event_base_dispatch(_base); 52 | abort(); 53 | } 54 | 55 | void LinkScheduler::stop() 56 | { 57 | _isStop = true; 58 | this->wait(); 59 | } 60 | 61 | LinkTable::RetType LinkScheduler::addUrl(UrlContext *urlContext) 62 | { 63 | return _linkTable.addUrl(urlContext); 64 | } 65 | 66 | bool LinkScheduler::addIpSchedule(IpContext *ipContext) 67 | { 68 | struct event *ipScheduleEvent = evtimer_new(_base, on_ip_schedule, ipContext); 69 | ipContext->ipScheduleEvent = ipScheduleEvent; 70 | struct timeval t = {ipContext->scheduleInterval, 0 }; 71 | evtimer_add(ipScheduleEvent, &t); 72 | return true; 73 | } 74 | 75 | void LinkScheduler::on_ip_schedule(evutil_socket_t 
sock, short event, void *arg) 76 | { 77 | IpContext *ipContext = (IpContext*)arg; 78 | HttpProcessor *httpProcessor = ipContext->linkTable->getLinkScheduler()->_httpProcessor; 79 | Ip ip; 80 | Site site; 81 | UrlContext *urlContext = NULL; 82 | bool isIpContextObsolete = ipContext->linkTable->select(ipContext, ip, site, urlContext); 83 | LOG_F(DEBUG, "pop ip=%s site=%s url=%s urlContext:%p", 84 | ip.c_str(), site.c_str(), urlContext->url.c_str(), urlContext); 85 | 86 | httpProcessor->pushConnectQueue(urlContext); 87 | 88 | // select()可能delete掉ipContext 89 | if (!isIpContextObsolete) { 90 | struct event *ipScheduleEvent = ipContext->ipScheduleEvent; 91 | struct timeval t = {ipContext->scheduleInterval, 0 }; 92 | evtimer_add(ipScheduleEvent, &t); 93 | } 94 | } 95 | 96 | void LinkScheduler::on_print_state(evutil_socket_t sock, short event, void *arg) 97 | { 98 | LinkScheduler *linkScheduler = (LinkScheduler*)arg; 99 | LOG_F(INFO, "on_print_state table count=%d", linkScheduler->_linkTable.count()); 100 | linkScheduler->_linkTable.printState(); 101 | struct timeval t = {Conf::instance()->schedulerPrintStateInterval, 0 }; 102 | evtimer_add(linkScheduler->_printStateEvent, &t); 103 | } 104 | 105 | void LinkScheduler::control(string& response, const string& cmd) 106 | { 107 | response = _linkTable.getState(); 108 | } 109 | -------------------------------------------------------------------------------- /src/link_scheduler.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
 * Author: lichuang(whlichuang@126.com)
 * Created Time: Fri 29 May 2015 11:44:41 AM CST
 * Description:
 ************************************************************************/

#ifndef __LINK_SCHEDULER_H__
#define __LINK_SCHEDULER_H__

#include /* NOTE(review): header name lost when this dump was extracted; restore from the repository */
#include /* NOTE(review): header name lost when this dump was extracted; restore from the repository */
#include /* NOTE(review): header name lost when this dump was extracted; restore from the repository */
#include "controllable.h"
#include "link_table.h"

/**
 * Thread that owns the link table and a libevent loop whose timers pick
 * urls per IP and pass them to the http processor.
 */
class LinkScheduler : public QThread, public Controllable
{
    Q_OBJECT
public:
    LinkScheduler(HttpProcessor *httpProcessor);
    virtual ~LinkScheduler();

    virtual void run();
    void stop();
    virtual void control(string& response, const string& cmd);

    LinkTable::RetType addUrl(UrlContext *urlContext);
    bool addIpSchedule(IpContext *ipContext);

private:
    static void event_log_callback(int severity, const char *msg);
    static void event_fatal_callback(int err);

    // libevent timer callbacks
    static void on_ip_schedule(evutil_socket_t sock, short event, void *arg);
    static void on_print_state(evutil_socket_t sock, short event, void *arg);

    LinkTable _linkTable;
    HttpProcessor *_httpProcessor;      // pointer passed in by the creator
    struct event_base* _base;           // event_base
    struct event *_printStateEvent;     // periodic print-state timer
    bool _isStop;
};

#endif //__LINK_SCHEDULER_H__
--------------------------------------------------------------------------------
/src/link_table.cpp:
--------------------------------------------------------------------------------
/*************************************************************************
 * Copyright (c) 2015, LiChuang. All rights reserved.
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Fri 29 May 2015 05:35:15 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include "url_tools.h" 9 | #include "util.h" 10 | #include "logger_container.h" 11 | #include "url_context.h" 12 | #include "link_scheduler.h" 13 | #include "conf.h" 14 | #include "link_table.h" 15 | 16 | LinkTable::LinkTable(LinkScheduler *linkScheduler) 17 | { 18 | pthread_mutex_init(&_mutex, NULL); 19 | _linkScheduler = linkScheduler; 20 | _count = 0; 21 | } 22 | 23 | LinkTable::~LinkTable() 24 | { 25 | pthread_mutex_destroy(&_mutex); 26 | } 27 | 28 | LinkTable::RetType LinkTable::addUrl(UrlContext *urlContext) 29 | { 30 | if (false == urlContext->parseUrl()) { 31 | return RT_PARSE_ERROR; 32 | } 33 | if (urlContext->site.length() == 0 || urlContext->ip.length() == 0) { 34 | return RT_PARSE_ERROR; 35 | } 36 | 37 | pthread_mutex_lock(&_mutex); 38 | IpContext *ipContext = getIpContext(&_ipTable, urlContext->ip); 39 | SiteContext *siteContext = getSiteContext(ipContext, urlContext->site); 40 | urlContext->key = urlContext->url; 41 | if (siteContext->urlTable.size() < Conf::instance()->schedulerMaxUrlTableSize) { 42 | siteContext->urlTable.push(urlContext); 43 | LOG_F(DEBUG, "%s", urlContext->url.c_str()); 44 | atomic_add(&_count, 1); 45 | pthread_mutex_unlock(&_mutex); 46 | return RT_OK; 47 | } else { 48 | pthread_mutex_unlock(&_mutex); 49 | return RT_TABLE_FULL; 50 | } 51 | } 52 | 53 | bool LinkTable::select(IpContext *ipContext, 54 | Ip &ip, 55 | Site &site, 56 | UrlContext* &urlContext) 57 | { 58 | bool isIpContextObsolete = false; 59 | pthread_mutex_lock(&_mutex); 60 | ip = ipContext->ip; 61 | SiteContext *siteContext = ipContext->select(); 62 | site = siteContext->site; 63 | assert(NULL != siteContext); 64 | urlContext = siteContext->urlTable.top(); 65 | siteContext->urlTable.pop(); 66 | atomic_add(&_count, -1); 67 | if (siteContext->urlTable.size() == 0) { 68 | 
ipContext->siteTable.erase(siteContext->site); 69 | delete siteContext; 70 | } 71 | if (ipContext->siteTable.size() == 0) { 72 | ipContext->linkTable->_ipTable.erase(ipContext->ip); 73 | delete ipContext; 74 | isIpContextObsolete = true; 75 | } else { 76 | isIpContextObsolete = false; 77 | } 78 | 79 | pthread_mutex_unlock(&_mutex); 80 | return isIpContextObsolete; 81 | } 82 | 83 | IpContext* LinkTable::getIpContext(IpTable *ipTable, Ip ip, bool allocAtFail) 84 | { 85 | IpTable::iterator iter = ipTable->find(ip); 86 | if (ipTable->end() == iter) { 87 | if (allocAtFail) { 88 | IpContext *ipContext = new IpContext; 89 | ipContext->ip = ip; 90 | ipContext->scheduleInterval = Conf::instance()->schedulerIpScheduleInterval; 91 | ipContext->linkTable = this; 92 | ipTable->insert(pair(ip, ipContext)); 93 | _linkScheduler->addIpSchedule(ipContext); 94 | return ipContext; 95 | } else { 96 | return NULL; 97 | } 98 | } else { 99 | return iter->second; 100 | } 101 | } 102 | 103 | SiteContext* LinkTable::getSiteContext(IpContext *ipContext, Site site, bool allocAtFail) 104 | { 105 | SiteTable::iterator iter = ipContext->siteTable.find(site); 106 | if (ipContext->siteTable.end() == iter) { 107 | if (allocAtFail) { 108 | SiteContext *siteContext = new SiteContext; 109 | siteContext->site = site; 110 | ipContext->siteTable.insert(pair(site, siteContext)); 111 | return siteContext; 112 | } else { 113 | return NULL; 114 | } 115 | } else { 116 | return iter->second; 117 | } 118 | } 119 | 120 | int LinkTable::count() const 121 | { 122 | return _count; 123 | } 124 | 125 | void LinkTable::printState() 126 | { 127 | LOG(DEBUG, "---------------- begin state ------------------"); 128 | pthread_mutex_lock(&_mutex); 129 | IpTable::const_iterator table_iter = _ipTable.begin(); 130 | for (; _ipTable.end() != table_iter; ++table_iter) { 131 | LOG_F(DEBUG, "ip:%s", table_iter->first.c_str()); 132 | IpContext *ipContext = table_iter->second; 133 | SiteTable::const_iterator ip_iter = 
ipContext->siteTable.begin(); 134 | for (; ipContext->siteTable.end() != ip_iter; ++ip_iter) { 135 | SiteContext *siteContext = ip_iter->second; 136 | LOG_F(DEBUG, "\tsite:%s urlcount=%lu", ip_iter->first.c_str(), siteContext->urlTable.size()); 137 | } 138 | } 139 | pthread_mutex_unlock(&_mutex); 140 | LOG(DEBUG, "---------------- end state ------------------"); 141 | } 142 | 143 | string LinkTable::getState() 144 | { 145 | string state; 146 | pthread_mutex_lock(&_mutex); 147 | IpTable::const_iterator table_iter = _ipTable.begin(); 148 | for (; _ipTable.end() != table_iter; ++table_iter) { 149 | state += "ip:"; 150 | state += table_iter->first; 151 | state += "\n"; 152 | IpContext *ipContext = table_iter->second; 153 | SiteTable::const_iterator ip_iter = ipContext->siteTable.begin(); 154 | for (; ipContext->siteTable.end() != ip_iter; ++ip_iter) { 155 | SiteContext *siteContext = ip_iter->second; 156 | state += "\tsite:"; 157 | state += ip_iter->first; 158 | state += " urlcount="; 159 | char tableSizeStr[32]; 160 | snprintf(tableSizeStr, 32, "%lu", siteContext->urlTable.size()); 161 | state += tableSizeStr; 162 | state += "\n"; 163 | } 164 | } 165 | pthread_mutex_unlock(&_mutex); 166 | return state; 167 | } 168 | 169 | LinkScheduler* LinkTable::getLinkScheduler() 170 | { 171 | return _linkScheduler; 172 | } 173 | -------------------------------------------------------------------------------- /src/link_table.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Fri 29 May 2015 05:33:10 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __LINK_TABLE_H__ 9 | #define __LINK_TABLE_H__ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "keyed_queue.h" 16 | #include "url_context.h" 17 | 18 | // 每秒选一个,10年的选取总量作为最大值 19 | #define MAX_SCALE (86400*365*10*1.0) 20 | #define DEFAULT_SCHEDULE_INTERVAL 1 21 | 22 | using std::string; 23 | using std::map; 24 | using std::priority_queue; 25 | 26 | class LinkScheduler; 27 | class IpContext; 28 | class LinkTable; 29 | 30 | typedef priority_queue, UrlContextComp> UrlTable; 31 | struct SiteContext { 32 | SiteContext() { 33 | concurrenceCount = 0; 34 | quota = 1; 35 | selectCountDivQuota = 0.0; 36 | } 37 | Site site; 38 | UrlTable urlTable; 39 | volatile int concurrenceCount; 40 | int quota; 41 | float selectCountDivQuota; 42 | }; 43 | 44 | typedef map SiteTable; 45 | struct IpContext { 46 | IpContext() { 47 | concurrenceCount = 0; 48 | scheduleInterval = DEFAULT_SCHEDULE_INTERVAL; 49 | ipScheduleEvent = NULL; 50 | } 51 | 52 | SiteContext *select() { 53 | float minScale = MAX_SCALE; 54 | SiteContext *selectedContext = NULL; 55 | SiteTable::iterator iter = siteTable.begin(); 56 | // 找到最小的selectCountDivQuota 57 | for (; siteTable.end() != iter; ++iter) { 58 | SiteContext *siteContext = iter->second; 59 | if (siteContext->selectCountDivQuota < minScale) { 60 | minScale = siteContext->selectCountDivQuota; 61 | selectedContext = siteContext; 62 | } 63 | } 64 | selectedContext->selectCountDivQuota += 1.0 / selectedContext->quota; 65 | return selectedContext; 66 | } 67 | 68 | Ip ip; 69 | SiteTable siteTable; 70 | volatile int concurrenceCount; 71 | int scheduleInterval; 72 | struct event *ipScheduleEvent; 73 | LinkTable *linkTable; 74 | }; 75 | 76 | typedef map IpTable; 77 | 78 | class LinkTable 79 | { 80 | public: 81 | typedef enum { 82 | 
RT_PARSE_ERROR = -2, 83 | RT_TABLE_FULL = -1, 84 | RT_OK = 0, 85 | } RetType; 86 | 87 | LinkTable(LinkScheduler *linkScheduler); 88 | virtual ~LinkTable(); 89 | 90 | RetType addUrl(UrlContext *urlContext); 91 | bool select(IpContext *ipContext, 92 | Ip &ip, 93 | Site &site, 94 | UrlContext* &urlContext); 95 | int count() const; 96 | void printState(); 97 | string getState(); 98 | 99 | LinkScheduler* getLinkScheduler(); 100 | 101 | private: 102 | IpContext* getIpContext(IpTable *ipTable, Ip ip, bool allocAtFail = true); 103 | SiteContext* getSiteContext(IpContext *ipContext, Site site, bool allocAtFail = true); 104 | 105 | IpTable _ipTable; 106 | LinkScheduler *_linkScheduler; 107 | pthread_mutex_t _mutex; 108 | volatile int _count; 109 | }; 110 | 111 | #endif //__LINK_TABLE_H__ 112 | -------------------------------------------------------------------------------- /src/locked_map.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | class LockedMap : public std::map 6 | { 7 | public: 8 | LockedMap() { 9 | pthread_mutex_init(&_mutex, NULL); 10 | } 11 | 12 | ~LockedMap() { 13 | pthread_mutex_destroy(&_mutex); 14 | } 15 | 16 | bool set(KEY_TYPE key, VALUE_TYPE & value) { 17 | pthread_mutex_lock(&_mutex); 18 | bool ret = insert(std::pair(key, value)); 19 | pthread_mutex_unlock(&_mutex); 20 | return ret; 21 | } 22 | 23 | VALUE_TYPE* get(KEY_TYPE key) { 24 | VALUE_TYPE *ret = NULL; 25 | pthread_mutex_lock(&_mutex); 26 | typename std::map::iterator iter; 27 | if (std::end != (iter = find(key))) { 28 | ret = &iter->second; 29 | } 30 | pthread_mutex_unlock(&_mutex); 31 | return ret; 32 | } 33 | 34 | private: 35 | pthread_mutex_t _mutex; 36 | }; 37 | -------------------------------------------------------------------------------- /src/locked_queue.h: -------------------------------------------------------------------------------- 1 | #ifndef __LOCKED_QUEUE_H__ 2 | #define __LOCKED_QUEUE_H__ 3 | 4 | 
#include "mutex_lock.h" 5 | 6 | 7 | /* Opaque pointer type; used below for the pData argument of Enumerate(). */ 8 | typedef void* ptr; 9 | /* TLockedQueue wraps a TQueue and brackets every forwarded operation with m_lock.Lock() / m_lock.Unlock(). NOTE(review): the template parameter list was stripped from this dump; the body uses the names TElement, TQueue and TLock -- restore the list from the repository. */ 10 | template 11 | class TLockedQueue 12 | { 13 | public: 14 | typedef typename TQueue::ENUMERATOR ENUMERATOR; 15 | TLockedQueue() { Init(); } 16 | ~TLockedQueue() { } 17 | /* Forwards to m_queue.Init(); unlike the other members this call does not take m_lock. */ 18 | void Init() { 19 | m_queue.Init(); 20 | } 21 | /* Each member below returns whatever the wrapped queue's member of the same name returns. */ 22 | int push_back(TElement* e) { 23 | int n; 24 | m_lock.Lock(); 25 | n = m_queue.push_back(e); 26 | m_lock.Unlock(); 27 | return n; 28 | } 29 | 30 | int push_front(TElement* e) { 31 | int n; 32 | m_lock.Lock(); 33 | n = m_queue.push_front(e); 34 | m_lock.Unlock(); 35 | return n; 36 | } 37 | 38 | TElement* pop_front() { 39 | TElement* e; 40 | 41 | m_lock.Lock(); 42 | e = m_queue.pop_front(); 43 | m_lock.Unlock(); 44 | return e; 45 | } 46 | 47 | TElement* pop_back() { 48 | TElement* e; 49 | 50 | m_lock.Lock(); 51 | e = m_queue.pop_back(); 52 | m_lock.Unlock(); 53 | return e; 54 | } 55 | 56 | TElement* get_back() { 57 | TElement* e; 58 | 59 | m_lock.Lock(); 60 | e = m_queue.get_back(); 61 | m_lock.Unlock(); 62 | return e; 63 | } 64 | 65 | TElement* get_front() { 66 | TElement* e; 67 | 68 | m_lock.Lock(); 69 | e = m_queue.get_front(); 70 | m_lock.Unlock(); 71 | return e; 72 | } 73 | /* The callback f runs while m_lock is held. */ 74 | void Enumerate(ENUMERATOR f, ptr pData) { 75 | m_lock.Lock(); 76 | m_queue.Enumerate(f, pData); 77 | m_lock.Unlock(); 78 | } 79 | 80 | int remove(TElement* p) { 81 | int n; 82 | m_lock.Lock(); 83 | n = m_queue.remove(p); 84 | m_lock.Unlock(); 85 | return n; 86 | } 87 | 88 | int size() { 89 | int n; 90 | m_lock.Lock(); 91 | n = m_queue.size(); 92 | m_lock.Unlock(); 93 | return n; 94 | } 95 | 96 | private: 97 | TQueue m_queue; 98 | TLock m_lock; 99 | }; 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /src/logger_container.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved.
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Sat May 16 23:18:47 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include "logger_container.h" 9 | 10 | log4cplus::Logger LoggerContainer::logger; 11 | -------------------------------------------------------------------------------- /src/logger_container.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Sat May 16 22:24:08 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __LOGGER_CONTAINER_H__ 9 | #define __LOGGER_CONTAINER_H__ 10 | 11 | #define LOG4CPLUS_HAVE_GNU_VARIADIC_MACROS 1 12 | #undef LOG4CPLUS_HAVE_C99_VARIADIC_MACROS 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define LOG4CPLUS_HAVE_GNU_VARIADIC_MACROS 1 20 | #undef LOG4CPLUS_HAVE_C99_VARIADIC_MACROS 21 | 22 | class LoggerContainer 23 | { 24 | public: 25 | static void init() { 26 | logger = log4cplus::Logger::getRoot(); 27 | log4cplus::PropertyConfigurator::doConfigure("./conf/log.properties"); 28 | } 29 | static log4cplus::Logger logger; 30 | }; 31 | 32 | // flag: INFO TRACE DEBUG WARN ERROR FATAL 33 | #define LOG_F_INTERNAL(flag, logFmt, ...) \ 34 | LOG4CPLUS_##flag##_FMT(LoggerContainer::logger, logFmt, __VA_ARGS__) 35 | 36 | #define LOG(flag, logEvent) \ 37 | LOG_F_INTERNAL(flag, "[%s] * "logEvent, __FUNCTION__) 38 | 39 | #define LOG_F(flag, logFmt, ...) 
\ 40 | LOG_F_INTERNAL(flag, "[%s] * "logFmt, __FUNCTION__, __VA_ARGS__) 41 | 42 | #endif //__LOGGER_CONTAINER_H__ 43 | -------------------------------------------------------------------------------- /src/lspider_client.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu May 14 15:56:17 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "CrawlService.h" 25 | #include "util.h" 26 | 27 | using std::string; 28 | using std::istringstream; 29 | using namespace ::apache::thrift; 30 | using namespace ::apache::thrift::protocol; 31 | using namespace ::apache::thrift::transport; 32 | using namespace ::apache::thrift::server; 33 | 34 | using boost::shared_ptr; 35 | 36 | using namespace lspider; 37 | 38 | int crawl(string file) 39 | { 40 | boost::shared_ptr socket(new TSocket("localhost", 9090)); 41 | boost::shared_ptr transport(new TBufferedTransport(socket)); 42 | boost::shared_ptr protocol(new TBinaryProtocol(transport)); 43 | 44 | try { 45 | transport->open(); 46 | } catch (apache::thrift::transport::TTransportException e) { 47 | return -1; 48 | } 49 | 50 | 51 | CrawlServiceClient client(protocol); 52 | 53 | IUrl u; 54 | 55 | for (int i = 0; i < 1; i++) { 56 | std::fstream fin(file.c_str()); 57 | std::string ReadLine; 58 | while(getline(fin,ReadLine)) { 59 | printf("%s\n", ReadLine.c_str()); 60 | u.__set_str(ReadLine); 61 | u.__set_hub(true); 62 | client.request(u); 63 | } 64 | } 65 | 66 | 67 | transport->close(); 68 | } 69 | 70 | bool getline(const char* prompt, 
string& line) 71 | { 72 | char* line_buffer = readline(prompt); 73 | if (line_buffer == NULL) 74 | { 75 | return false; 76 | } 77 | add_history(line_buffer); 78 | line = line_buffer; 79 | free(line_buffer); 80 | return true; 81 | } 82 | 83 | int processCmd() 84 | { 85 | boost::shared_ptr socket(new TSocket("localhost", 9090)); 86 | boost::shared_ptr transport(new TBufferedTransport(socket)); 87 | boost::shared_ptr protocol(new TBinaryProtocol(transport)); 88 | 89 | try { 90 | transport->open(); 91 | } catch (apache::thrift::transport::TTransportException e) { 92 | return -1; 93 | } 94 | 95 | 96 | CrawlServiceClient client(protocol); 97 | string response; 98 | string cmd_line; 99 | string cmd; 100 | const char* prompt = ">"; 101 | while (getline(prompt, cmd_line)) 102 | { 103 | istringstream istr(cmd_line); 104 | istr >> cmd; 105 | try { 106 | client.exec_cmd(response, rtrim(ltrim(cmd_line))); 107 | } catch (apache::thrift::transport::TTransportException e) { 108 | fprintf(stderr, "TTransportException : %s\n", e.what()); 109 | break; 110 | } 111 | fprintf(stdout, "%s\n", response.c_str()); 112 | } 113 | 114 | transport->close(); 115 | } 116 | 117 | void printUsage(char *program) 118 | { 119 | fprintf(stderr, "Usage: %s -tfh\n"); 120 | fprintf(stderr, " -t type(crawl|cmd)\n"); 121 | fprintf(stderr, " -f urllist use when type is crawl\n"); 122 | fprintf(stderr, " -h help message\n"); 123 | } 124 | 125 | int main(int argc, char *argv[]) 126 | { 127 | char c; 128 | string type; 129 | string file; 130 | while ((c = getopt(argc, argv, "t:f:h")) != -1) 131 | { 132 | switch (c) 133 | { 134 | case 't': 135 | type = optarg; 136 | break; 137 | case 'f': 138 | file = optarg; 139 | break; 140 | case 'h': 141 | default: 142 | printUsage(argv[0]); 143 | break; 144 | } 145 | } 146 | 147 | if ("crawl" == type) { 148 | if (file != "") { 149 | crawl(file); 150 | } else { 151 | return -1; 152 | } 153 | } else if ("cmd" == type) { 154 | processCmd(); 155 | } 156 | return 0; 157 | } 158 
| -------------------------------------------------------------------------------- /src/lspider_constants.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Autogenerated by Thrift Compiler (0.9.0) 3 | * 4 | * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | * @generated 6 | */ 7 | #include "lspider_constants.h" 8 | 9 | namespace lspider { 10 | 11 | const lspiderConstants g_lspider_constants; 12 | 13 | lspiderConstants::lspiderConstants() { 14 | } 15 | 16 | } // namespace 17 | 18 | -------------------------------------------------------------------------------- /src/lspider_constants.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Autogenerated by Thrift Compiler (0.9.0) 3 | * 4 | * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | * @generated 6 | */ 7 | #ifndef lspider_CONSTANTS_H 8 | #define lspider_CONSTANTS_H 9 | 10 | #include "lspider_types.h" 11 | 12 | namespace lspider { 13 | 14 | class lspiderConstants { 15 | public: 16 | lspiderConstants(); 17 | 18 | }; 19 | 20 | extern const lspiderConstants g_lspider_constants; 21 | 22 | } // namespace 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /src/lspider_types.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Autogenerated by Thrift Compiler (0.9.0) 3 | * 4 | * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | * @generated 6 | */ 7 | #include "lspider_types.h" 8 | 9 | #include 10 | 11 | namespace lspider { 12 | 13 | const char* IUrl::ascii_fingerprint = "7F21FB535884165D6350077C7B970E93"; 14 | const uint8_t IUrl::binary_fingerprint[16] = {0x7F,0x21,0xFB,0x53,0x58,0x84,0x16,0x5D,0x63,0x50,0x07,0x7C,0x7B,0x97,0x0E,0x93}; 15 | 16 | uint32_t IUrl::read(::apache::thrift::protocol::TProtocol* iprot) { 17 | 18 | uint32_t xfer = 0; 19 | std::string fname; 20 | 
::apache::thrift::protocol::TType ftype; 21 | int16_t fid; 22 | 23 | xfer += iprot->readStructBegin(fname); 24 | 25 | using ::apache::thrift::protocol::TProtocolException; 26 | 27 | 28 | while (true) 29 | { 30 | xfer += iprot->readFieldBegin(fname, ftype, fid); 31 | if (ftype == ::apache::thrift::protocol::T_STOP) { 32 | break; 33 | } 34 | switch (fid) 35 | { 36 | case 1: 37 | if (ftype == ::apache::thrift::protocol::T_STRING) { 38 | xfer += iprot->readString(this->str); 39 | this->__isset.str = true; 40 | } else { 41 | xfer += iprot->skip(ftype); 42 | } 43 | break; 44 | case 2: 45 | if (ftype == ::apache::thrift::protocol::T_STRING) { 46 | xfer += iprot->readString(this->ip); 47 | this->__isset.ip = true; 48 | } else { 49 | xfer += iprot->skip(ftype); 50 | } 51 | break; 52 | case 3: 53 | if (ftype == ::apache::thrift::protocol::T_BOOL) { 54 | xfer += iprot->readBool(this->hub); 55 | this->__isset.hub = true; 56 | } else { 57 | xfer += iprot->skip(ftype); 58 | } 59 | break; 60 | default: 61 | xfer += iprot->skip(ftype); 62 | break; 63 | } 64 | xfer += iprot->readFieldEnd(); 65 | } 66 | 67 | xfer += iprot->readStructEnd(); 68 | 69 | return xfer; 70 | } 71 | 72 | uint32_t IUrl::write(::apache::thrift::protocol::TProtocol* oprot) const { 73 | uint32_t xfer = 0; 74 | xfer += oprot->writeStructBegin("IUrl"); 75 | 76 | xfer += oprot->writeFieldBegin("str", ::apache::thrift::protocol::T_STRING, 1); 77 | xfer += oprot->writeString(this->str); 78 | xfer += oprot->writeFieldEnd(); 79 | 80 | xfer += oprot->writeFieldBegin("ip", ::apache::thrift::protocol::T_STRING, 2); 81 | xfer += oprot->writeString(this->ip); 82 | xfer += oprot->writeFieldEnd(); 83 | 84 | xfer += oprot->writeFieldBegin("hub", ::apache::thrift::protocol::T_BOOL, 3); 85 | xfer += oprot->writeBool(this->hub); 86 | xfer += oprot->writeFieldEnd(); 87 | 88 | xfer += oprot->writeFieldStop(); 89 | xfer += oprot->writeStructEnd(); 90 | return xfer; 91 | } 92 | 93 | void swap(IUrl &a, IUrl &b) { 94 | using 
::std::swap; 95 | swap(a.str, b.str); 96 | swap(a.ip, b.ip); 97 | swap(a.hub, b.hub); 98 | swap(a.__isset, b.__isset); 99 | } 100 | 101 | } // namespace 102 | -------------------------------------------------------------------------------- /src/lspider_types.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Autogenerated by Thrift Compiler (0.9.0) 3 | * 4 | * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | * @generated 6 | */ 7 | #ifndef lspider_TYPES_H 8 | #define lspider_TYPES_H 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | 16 | 17 | namespace lspider { 18 | 19 | typedef struct _IUrl__isset { 20 | _IUrl__isset() : str(false), ip(false), hub(true) {} 21 | bool str; 22 | bool ip; 23 | bool hub; 24 | } _IUrl__isset; 25 | 26 | class IUrl { 27 | public: 28 | 29 | static const char* ascii_fingerprint; // = "7F21FB535884165D6350077C7B970E93"; 30 | static const uint8_t binary_fingerprint[16]; // = {0x7F,0x21,0xFB,0x53,0x58,0x84,0x16,0x5D,0x63,0x50,0x07,0x7C,0x7B,0x97,0x0E,0x93}; 31 | 32 | IUrl() : str(), ip(), hub(false) { 33 | } 34 | 35 | virtual ~IUrl() throw() {} 36 | 37 | std::string str; 38 | std::string ip; 39 | bool hub; 40 | 41 | _IUrl__isset __isset; 42 | 43 | void __set_str(const std::string& val) { 44 | str = val; 45 | } 46 | 47 | void __set_ip(const std::string& val) { 48 | ip = val; 49 | } 50 | 51 | void __set_hub(const bool val) { 52 | hub = val; 53 | } 54 | 55 | bool operator == (const IUrl & rhs) const 56 | { 57 | if (!(str == rhs.str)) 58 | return false; 59 | if (!(ip == rhs.ip)) 60 | return false; 61 | if (!(hub == rhs.hub)) 62 | return false; 63 | return true; 64 | } 65 | bool operator != (const IUrl &rhs) const { 66 | return !(*this == rhs); 67 | } 68 | 69 | bool operator < (const IUrl & ) const; 70 | 71 | uint32_t read(::apache::thrift::protocol::TProtocol* iprot); 72 | uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const; 73 | 74 | }; 75 | 76 | 
void swap(IUrl &a, IUrl &b); 77 | 78 | } // namespace 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /src/lthread.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Sat May 16 23:18:47 2015 5 | * Description: 6 | **************************************************************************/ 7 | 8 | #include "lthread.h" 9 | 10 | using lthread::LThreadInstance; 11 | using lthread::LThread; 12 | using lthread::auto_lock_t; 13 | 14 | /// 设置pthread_key只需执行一次的锁 15 | static pthread_once_t storage_key_once = PTHREAD_ONCE_INIT; 16 | 17 | /// pthread_key用来存储LThreadInstance指针 18 | static pthread_key_t storage_key; 19 | 20 | /// 创建pthread_key 21 | static void create_storage_key() 22 | { 23 | pthread_key_create( &storage_key, NULL ); 24 | } 25 | 26 | // 静态变量,用来存储LThreadInstance实例 27 | static LThreadInstance main_instance = 28 | { 29 | { 0, &main_instance }, 0, 1, PTHREAD_COND_INITIALIZER, 0, PTHREAD_MUTEX_INITIALIZER 30 | }; 31 | 32 | //////// LThreadInstance ///////////////// 33 | 34 | void LThreadInstance::init() 35 | { 36 | args[0] = args[1] = 0; 37 | finished = false; 38 | running = false; 39 | pthread_cond_init(&thread_done, NULL); 40 | pthread_mutex_init(&mutex, NULL); 41 | thread_id = 0; 42 | LThread::initialize(); 43 | } 44 | 45 | void LThreadInstance::deinit() 46 | { 47 | pthread_cond_destroy(&thread_done); 48 | pthread_mutex_destroy(&mutex); 49 | } 50 | 51 | void LThreadInstance::terminate() 52 | { 53 | if (!thread_id) 54 | { 55 | return; 56 | } 57 | pthread_cancel(thread_id); 58 | } 59 | 60 | void *LThreadInstance::start(void *_arg) 61 | { 62 | void **arg = (void **) _arg; 63 | pthread_once( &storage_key_once, create_storage_key ); 64 | pthread_setspecific( storage_key, arg[1] ); 65 
| pthread_cleanup_push( LThreadInstance::finish, arg[1] ); 66 | pthread_testcancel(); 67 | 68 | ((LThread *)arg[0])->run(); 69 | 70 | pthread_cleanup_pop(true); 71 | return 0; 72 | } 73 | 74 | void LThreadInstance::finish(void *) 75 | { 76 | LThreadInstance *d = current(); 77 | if (!d) 78 | { 79 | std::cerr << "LThread: internal error: zero data for running thread." << std::endl; 80 | return; 81 | } 82 | auto_lock_t(&d->mutex); 83 | d->args[0] = d->args[1] = 0; 84 | d->finished = true; 85 | d->running = false; 86 | d->thread_id = 0; 87 | pthread_cond_broadcast(&d->thread_done); 88 | } 89 | 90 | LThreadInstance* LThreadInstance::current() 91 | { 92 | pthread_once( &storage_key_once, create_storage_key ); 93 | LThreadInstance *ret = (LThreadInstance *) pthread_getspecific( storage_key ); 94 | return ret; 95 | } 96 | 97 | ///////////////// LThread //////////////// 98 | 99 | LThread::LThread() 100 | { 101 | d = new LThreadInstance; 102 | d->init(); 103 | } 104 | LThread::~LThread() 105 | { 106 | delete d; 107 | } 108 | 109 | int LThread::start() 110 | { 111 | auto_lock_t(&d->mutex); 112 | if (d->running) 113 | { 114 | pthread_cond_wait(&d->thread_done, &d->mutex); 115 | } 116 | d->running = true; 117 | d->finished = false; 118 | d->args[0] = this; 119 | d->args[1] = d; 120 | pthread_attr_t attr; 121 | pthread_attr_init(&attr); 122 | pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); 123 | int ret = pthread_create(&d->thread_id, NULL, LThreadInstance::start, d->args); 124 | pthread_attr_destroy( &attr ); 125 | return ret; 126 | } 127 | 128 | void LThread::terminate() 129 | { 130 | auto_lock_t(&d->mutex); 131 | if (d->finished || !d->running) 132 | { 133 | return; 134 | } 135 | d->terminate(); 136 | } 137 | 138 | pthread_t LThread::current_thread() 139 | { 140 | return pthread_self(); 141 | } 142 | 143 | bool LThread::wait(unsigned long time) 144 | { 145 | // 这里就奇怪了,用auto_lock_t之后锁不会释放 146 | pthread_mutex_lock(&d->mutex); 147 | if (d->thread_id == 
pthread_self()) 148 | { 149 | std::cerr << "LThread::wait: thread tried to wait on itself" << std::endl; 150 | pthread_mutex_unlock(&d->mutex); 151 | return false; 152 | } 153 | if (d->finished || ! d->running) 154 | { 155 | pthread_mutex_unlock(&d->mutex); 156 | return true; 157 | } 158 | int ret; 159 | if (time != (unsigned long)-1) 160 | { 161 | struct timeval tv; 162 | gettimeofday(&tv, 0); 163 | 164 | timespec ti; 165 | ti.tv_nsec = (tv.tv_usec + (time % 1000) * 1000) * 1000; 166 | ti.tv_sec = tv.tv_sec + (time / 1000) + (ti.tv_nsec / 1000000000); 167 | ti.tv_nsec %= 1000000000; 168 | ret = pthread_cond_timedwait(&d->thread_done, &d->mutex, &ti); 169 | } 170 | else 171 | { 172 | ret = pthread_cond_wait(&d->thread_done, &d->mutex); 173 | } 174 | pthread_mutex_unlock(&d->mutex); 175 | return (ret == 0); 176 | } 177 | 178 | void LThread::initialize() 179 | { 180 | pthread_once(&storage_key_once, create_storage_key); 181 | pthread_setspecific(storage_key, &main_instance); 182 | } 183 | 184 | void LThread::exit() 185 | { 186 | pthread_exit(0); 187 | } 188 | 189 | bool LThread::finished() const 190 | { 191 | auto_lock_t(&d->mutex); 192 | return d->finished; 193 | } 194 | 195 | bool LThread::running() const 196 | { 197 | auto_lock_t(&d->mutex); 198 | return d->running; 199 | } 200 | 201 | /* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ 202 | -------------------------------------------------------------------------------- /src/lthread.h: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Sat May 16 22:24:08 2015 5 | * Description: 6 | **************************************************************************/ 7 | 8 | /** 9 | 10 | #include "lthread.h" 11 | using namespace lthread; 12 | 13 | class MyThread : public LThread 14 | { 15 | public: 16 | void run() 17 | { 18 | printf("the thread [%lu] is running.\n", current_thread()); 19 | sleep(1); 20 | } 21 | }; 22 | 23 | int main(int argc, char* argv[]) 24 | { 25 | MyThread t1; 26 | MyThread t2; 27 | t1.start(); 28 | t2.start(); 29 | t1.wait(); 30 | t2.wait(); 31 | printf("main thread [%lu] exit\n", pthread_self()); 32 | return 0; 33 | } 34 | 35 | * 36 | **/ 37 | 38 | #ifndef BUILD_SEA_LTHREAD_H 39 | #define BUILD_SEA_LTHREAD_H 40 | 41 | 42 | #include 43 | //using namespace std; 44 | #include 45 | #include 46 | #include 47 | 48 | namespace lthread 49 | { 50 | 51 | /** 52 | * 自动锁类 53 | */ 54 | struct auto_lock_t 55 | { 56 | auto_lock_t(pthread_mutex_t* mutex) : m_mutex(mutex) 57 | { 58 | pthread_mutex_lock(m_mutex); 59 | } 60 | ~auto_lock_t() 61 | { 62 | pthread_mutex_unlock(m_mutex); 63 | } 64 | pthread_mutex_t* m_mutex; 65 | }; 66 | 67 | /** 68 | * 线程执行的真正实例类,封装了线程的实现细节,可以实现不同平台的不同实现 69 | */ 70 | class LThreadInstance 71 | { 72 | public: 73 | 74 | /// 参数数组,分别存储LThread指针和LThreadInstance指针 75 | void *args[2]; 76 | /// 线程执行是否完成 77 | bool finished : 1; 78 | /// 线程是否正在执行 79 | bool running : 1; 80 | /// 线程执行完成condition信号 81 | pthread_cond_t thread_done; 82 | /// 线程id 83 | pthread_t thread_id; 84 | /// 互斥锁 85 | pthread_mutex_t mutex; 86 | 87 | /** 88 | * @brief 初始化成员变量 89 | * @return void 90 | * @author lichuang 91 | * @date 2012/06/03 12:41:00 92 | **/ 93 | void init(); 94 | /** 95 | * @brief 销毁成员变量 96 | * @return void 97 | * @author lichuang 98 | * @date 2012/06/03 12:41:18 99 | **/ 100 | void deinit(); 101 | /** 102 | * @brief 终止线程 103 | * @return void 104 | * @author lichuang 105 | * @date 2012/06/03 12:41:40 106 | **/ 107 | void terminate(); 
108 | 109 | /** 110 | * @brief 启动静态函数,启动的线程中调用 111 | * @param [in/out] _arg : void* 112 | * @return void* 113 | * @author lichuang 114 | * @date 2012/06/03 12:41:54 115 | **/ 116 | static void *start(void *_arg); 117 | /** 118 | * @brief 线程执行结束后执行,唤醒等待信号 119 | * @return void 120 | * @author lichuang 121 | * @date 2012/06/03 12:42:30 122 | **/ 123 | static void finish(void *); 124 | /** 125 | * @brief 获取当前所在线程的LThreadInstance指针 126 | * @return LThreadInstance* 127 | * @author lichuang 128 | * @date 2012/06/03 12:43:01 129 | **/ 130 | static LThreadInstance* current(); 131 | }; 132 | 133 | /** 134 | * 线程类基类,需要继承并实现run()方法 135 | */ 136 | class LThread 137 | { 138 | public: 139 | /** 140 | * @brief 构造函数 141 | * @author lichuang 142 | * @date 2012/06/03 12:43:50 143 | **/ 144 | LThread(); 145 | /** 146 | * @brief 析构函数 147 | * @author lichuang 148 | * @date 2012/06/03 12:43:59 149 | **/ 150 | virtual ~LThread(); 151 | /** 152 | * @brief 启动线程 153 | * @return void 154 | * @author lichuang 155 | * @date 2012/06/02 14:12:47 156 | **/ 157 | int start(); 158 | /** 159 | * @brief 终止线程,非逼不得已不要用,因操作系统调度原因,可能不会立即终止线程, 160 | * 因此调用后最好接着调用bool wait(unsigned long time)来确定线程结束,否则后果自负 161 | * @return void 162 | * @author lichuang 163 | * @date 2012/06/02 14:13:04 164 | **/ 165 | void terminate(); 166 | /** 167 | * @brief 返回当前线程号 168 | * @return pthread_t 169 | * @author lichuang 170 | * @date 2012/06/03 10:48:36 171 | **/ 172 | pthread_t current_thread(); 173 | /** 174 | * @brief 等待线程执行结束,单位毫秒,默认为一直等待到结束 175 | * @param [in/out] time : unsigned long 176 | * @return bool 177 | * @author lichuang 178 | * @date 2012/06/03 10:54:15 179 | **/ 180 | bool wait(unsigned long time = (unsigned long)-1); 181 | /** 182 | * @brief 判断线程是否执行完成 183 | * @return bool 184 | * @author lichuang 185 | * @date 2012/06/06 10:01:18 186 | **/ 187 | bool finished() const; 188 | /** 189 | * @brief 判断线程是否正在运行 190 | * @return bool 191 | * @author lichuang 192 | * @date 2012/06/06 10:01:34 193 | **/ 194 | bool 
running() const; 195 | /** 196 | * @brief 初始化main_instance静态变量 197 | * @return void 198 | * @author lichuang 199 | * @date 2012/06/03 12:45:42 200 | **/ 201 | static void initialize(); 202 | /** 203 | * @brief Ends the execution of the calling thread and wakes up any threads 204 | * waiting for its termination. 205 | * @return void 206 | * @author lichuang 207 | * @date 2012/06/03 10:47:31 208 | **/ 209 | static void exit(); 210 | protected: 211 | /** 212 | * @brief 线程执行代码,由用户实现 213 | * @return void 214 | * @author lichuang 215 | * @date 2012/06/02 14:13:22 216 | **/ 217 | virtual void run() = 0; 218 | 219 | private: 220 | /// 线程类的实际实现类指针 221 | LThreadInstance * d; 222 | friend class LThreadInstance; 223 | }; 224 | 225 | }// namespace lthread 226 | 227 | #endif //BUILD_SEA_LTHREAD_H 228 | 229 | /* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ 230 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu May 14 15:56:17 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include "logger_container.h" 11 | #include "request_recv.h" 12 | #include "http_processor.h" 13 | #include "mongo_dumper.h" 14 | #include "extractor.h" 15 | #include "extractor_worker_view.h" 16 | #include "mysql_dumper.h" 17 | #include "mysql_selector.h" 18 | #include "link_scheduler.h" 19 | #include "conf.h" 20 | #include "cmd_ctrler.h" 21 | 22 | int main(int argc, char ** argv) 23 | { 24 | QTextCodec::setCodecForLocale(QTextCodec::codecForName("UTF-8")); 25 | // Qt application object; provides the event loop used for WebKit parsing 26 | QApplication app(argc, argv); 27 | 28 | // Initialise logging 29 | LoggerContainer::init(); 30 | 31 | if (false == Conf::instance()->load("./conf/conf.xml")) { 32 | return -1; 33 | } 34 | 35 | event_enable_debug_mode(); 36 | 37 | CmdCtrler *cmdCtrler = new CmdCtrler(&app); 38 | 39 | // MySQL writer (and selector) 40 | MySqlDumper *mySqlDumper = new MySqlDumper("linkdump"); 41 | MySqlSelector *mySqlSelector = new MySqlSelector("linkselect"); 42 | 43 | // Initialise the mongo client and connect to MongoDB 44 | MongoDumper *mongoDumper = new MongoDumper(mySqlDumper, Conf::instance()->mongoHostAndPort); 45 | if (!mongoDumper->initMongo()) { 46 | LOG(FATAL, "mongoDumper initMongo fail"); 47 | return -1; 48 | } 49 | 50 | // WebKit view that simulates a browser 51 | ExtractorWorkerView *workerView = new ExtractorWorkerView; 52 | // Controller for the WebKit parsing 53 | Extractor *extractor = new Extractor(workerView); 54 | 55 | workerView->setMongoDumper(mongoDumper); 56 | workerView->setExtractor(extractor); 57 | 58 | // HTTP network processing 59 | HttpProcessor *httpProcessor = new HttpProcessor(mongoDumper, extractor, "httpprocessor"); 60 | // Listener that receives incoming requests 61 | RequestRecv *recv = new RequestRecv(httpProcessor, cmdCtrler); 62 | 63 | // Link scheduling 64 | LinkScheduler *linkScheduler = new LinkScheduler(httpProcessor); 65 | mySqlSelector->setLinkScheduler(linkScheduler); 66 | 67 | 
cmdCtrler->addHandler("RequestRecv", recv); 68 | cmdCtrler->addHandler("MySqlSelector", mySqlSelector); 69 | cmdCtrler->addHandler("HttpProcessor", httpProcessor); 70 | cmdCtrler->addHandler("LinkScheduler", linkScheduler); 71 | cmdCtrler->addHandler("MongoDumper", mongoDumper); 72 | cmdCtrler->addHandler("MySqlDumper", mySqlDumper); 73 | cmdCtrler->addHandler("Extractor", extractor); 74 | 75 | mongoDumper->start(); 76 | httpProcessor->start(); 77 | recv->start(); 78 | mySqlDumper->start(); 79 | usleep(100000); // Pause briefly so the two MySQL connections do not both fail to come up at the same time 80 | mySqlSelector->start(); 81 | linkScheduler->start(); 82 | extractor->start(); // "blocking" per the original author's note -- NOTE(review): app.exec() below is still reached, confirm 83 | 84 | app.exec(); 85 | return 0; 86 | } 87 | -------------------------------------------------------------------------------- /src/moc_extractor_worker_view.cpp: -------------------------------------------------------------------------------- 1 | /**************************************************************************** 2 | ** Meta object code from reading C++ file 'extractor_worker_view.h' 3 | ** 4 | ** Created by: The Qt Meta Object Compiler version 67 (Qt 5.4.1) 5 | ** 6 | ** WARNING! All changes made in this file will be lost! 7 | *****************************************************************************/ 8 | 9 | #include "extractor_worker_view.h" 10 | #include 11 | #include 12 | #if !defined(Q_MOC_OUTPUT_REVISION) 13 | #error "The header file 'extractor_worker_view.h' doesn't include ." 14 | #elif Q_MOC_OUTPUT_REVISION != 67 15 | #error "This file was generated using the moc from 5.4.1. It" 16 | #error "cannot be used with the include files from this version of Qt."
17 | #error "(The moc has changed too much.)" 18 | #endif 19 | 20 | QT_BEGIN_MOC_NAMESPACE 21 | struct qt_meta_stringdata_ExtractorWorkerView_t { 22 | QByteArrayData data[4]; 23 | char stringdata[37]; 24 | }; 25 | #define QT_MOC_LITERAL(idx, ofs, len) \ 26 | Q_STATIC_BYTE_ARRAY_DATA_HEADER_INITIALIZER_WITH_OFFSET(len, \ 27 | qptrdiff(offsetof(qt_meta_stringdata_ExtractorWorkerView_t, stringdata) + ofs \ 28 | - idx * sizeof(QByteArrayData)) \ 29 | ) 30 | static const qt_meta_stringdata_ExtractorWorkerView_t qt_meta_stringdata_ExtractorWorkerView = { 31 | { 32 | QT_MOC_LITERAL(0, 0, 19), // "ExtractorWorkerView" 33 | QT_MOC_LITERAL(1, 20, 12), // "onLoadFinish" 34 | QT_MOC_LITERAL(2, 33, 0), // "" 35 | QT_MOC_LITERAL(3, 34, 2) // "ok" 36 | 37 | }, 38 | "ExtractorWorkerView\0onLoadFinish\0\0ok" 39 | }; 40 | #undef QT_MOC_LITERAL 41 | 42 | static const uint qt_meta_data_ExtractorWorkerView[] = { 43 | 44 | // content: 45 | 7, // revision 46 | 0, // classname 47 | 0, 0, // classinfo 48 | 1, 14, // methods 49 | 0, 0, // properties 50 | 0, 0, // enums/sets 51 | 0, 0, // constructors 52 | 0, // flags 53 | 0, // signalCount 54 | 55 | // slots: name, argc, parameters, tag, flags 56 | 1, 1, 19, 2, 0x09 /* Protected */, 57 | 58 | // slots: parameters 59 | QMetaType::Void, QMetaType::Bool, 3, 60 | 61 | 0 // eod 62 | }; 63 | 64 | void ExtractorWorkerView::qt_static_metacall(QObject *_o, QMetaObject::Call _c, int _id, void **_a) 65 | { 66 | if (_c == QMetaObject::InvokeMetaMethod) { 67 | ExtractorWorkerView *_t = static_cast(_o); 68 | switch (_id) { 69 | case 0: _t->onLoadFinish((*reinterpret_cast< bool(*)>(_a[1]))); break; 70 | default: ; 71 | } 72 | } 73 | } 74 | 75 | const QMetaObject ExtractorWorkerView::staticMetaObject = { 76 | { &QWebView::staticMetaObject, qt_meta_stringdata_ExtractorWorkerView.data, 77 | qt_meta_data_ExtractorWorkerView, qt_static_metacall, Q_NULLPTR, Q_NULLPTR} 78 | }; 79 | 80 | 81 | const QMetaObject *ExtractorWorkerView::metaObject() const 82 | { 83 | 
return QObject::d_ptr->metaObject ? QObject::d_ptr->dynamicMetaObject() : &staticMetaObject; 84 | } 85 | 86 | void *ExtractorWorkerView::qt_metacast(const char *_clname) 87 | { 88 | if (!_clname) return Q_NULLPTR; 89 | if (!strcmp(_clname, qt_meta_stringdata_ExtractorWorkerView.stringdata)) 90 | return static_cast(const_cast< ExtractorWorkerView*>(this)); 91 | return QWebView::qt_metacast(_clname); 92 | } 93 | 94 | int ExtractorWorkerView::qt_metacall(QMetaObject::Call _c, int _id, void **_a) 95 | { 96 | _id = QWebView::qt_metacall(_c, _id, _a); 97 | if (_id < 0) 98 | return _id; 99 | if (_c == QMetaObject::InvokeMetaMethod) { 100 | if (_id < 1) 101 | qt_static_metacall(this, _c, _id, _a); 102 | _id -= 1; 103 | } else if (_c == QMetaObject::RegisterMethodArgumentMetaType) { 104 | if (_id < 1) 105 | *reinterpret_cast(_a[0]) = -1; 106 | _id -= 1; 107 | } 108 | return _id; 109 | } 110 | QT_END_MOC_NAMESPACE 111 | -------------------------------------------------------------------------------- /src/moc_link_scheduler.cpp: -------------------------------------------------------------------------------- 1 | /**************************************************************************** 2 | ** Meta object code from reading C++ file 'link_scheduler.h' 3 | ** 4 | ** Created by: The Qt Meta Object Compiler version 67 (Qt 5.4.1) 5 | ** 6 | ** WARNING! All changes made in this file will be lost! 7 | *****************************************************************************/ 8 | 9 | #include "link_scheduler.h" 10 | #include 11 | #include 12 | #if !defined(Q_MOC_OUTPUT_REVISION) 13 | #error "The header file 'link_scheduler.h' doesn't include ." 14 | #elif Q_MOC_OUTPUT_REVISION != 67 15 | #error "This file was generated using the moc from 5.4.1. It" 16 | #error "cannot be used with the include files from this version of Qt." 
17 | #error "(The moc has changed too much.)" 18 | #endif 19 | 20 | QT_BEGIN_MOC_NAMESPACE 21 | struct qt_meta_stringdata_LinkScheduler_t { 22 | QByteArrayData data[1]; 23 | char stringdata[14]; 24 | }; 25 | #define QT_MOC_LITERAL(idx, ofs, len) \ 26 | Q_STATIC_BYTE_ARRAY_DATA_HEADER_INITIALIZER_WITH_OFFSET(len, \ 27 | qptrdiff(offsetof(qt_meta_stringdata_LinkScheduler_t, stringdata) + ofs \ 28 | - idx * sizeof(QByteArrayData)) \ 29 | ) 30 | static const qt_meta_stringdata_LinkScheduler_t qt_meta_stringdata_LinkScheduler = { 31 | { 32 | QT_MOC_LITERAL(0, 0, 13) // "LinkScheduler" 33 | 34 | }, 35 | "LinkScheduler" 36 | }; 37 | #undef QT_MOC_LITERAL 38 | 39 | static const uint qt_meta_data_LinkScheduler[] = { 40 | 41 | // content: 42 | 7, // revision 43 | 0, // classname 44 | 0, 0, // classinfo 45 | 0, 0, // methods 46 | 0, 0, // properties 47 | 0, 0, // enums/sets 48 | 0, 0, // constructors 49 | 0, // flags 50 | 0, // signalCount 51 | 52 | 0 // eod 53 | }; 54 | 55 | void LinkScheduler::qt_static_metacall(QObject *_o, QMetaObject::Call _c, int _id, void **_a) 56 | { 57 | Q_UNUSED(_o); 58 | Q_UNUSED(_id); 59 | Q_UNUSED(_c); 60 | Q_UNUSED(_a); 61 | } 62 | 63 | const QMetaObject LinkScheduler::staticMetaObject = { 64 | { &QThread::staticMetaObject, qt_meta_stringdata_LinkScheduler.data, 65 | qt_meta_data_LinkScheduler, qt_static_metacall, Q_NULLPTR, Q_NULLPTR} 66 | }; 67 | 68 | 69 | const QMetaObject *LinkScheduler::metaObject() const 70 | { 71 | return QObject::d_ptr->metaObject ? 
QObject::d_ptr->dynamicMetaObject() : &staticMetaObject; 72 | } 73 | 74 | void *LinkScheduler::qt_metacast(const char *_clname) 75 | { 76 | if (!_clname) return Q_NULLPTR; 77 | if (!strcmp(_clname, qt_meta_stringdata_LinkScheduler.stringdata)) 78 | return static_cast(const_cast< LinkScheduler*>(this)); 79 | return QThread::qt_metacast(_clname); 80 | } 81 | 82 | int LinkScheduler::qt_metacall(QMetaObject::Call _c, int _id, void **_a) 83 | { 84 | _id = QThread::qt_metacall(_c, _id, _a); 85 | if (_id < 0) 86 | return _id; 87 | return _id; 88 | } 89 | QT_END_MOC_NAMESPACE 90 | -------------------------------------------------------------------------------- /src/moc_mysql_dumper.cpp: -------------------------------------------------------------------------------- 1 | /**************************************************************************** 2 | ** Meta object code from reading C++ file 'mysql_dumper.h' 3 | ** 4 | ** Created by: The Qt Meta Object Compiler version 67 (Qt 5.4.1) 5 | ** 6 | ** WARNING! All changes made in this file will be lost! 7 | *****************************************************************************/ 8 | 9 | #include "mysql_dumper.h" 10 | #include 11 | #include 12 | #if !defined(Q_MOC_OUTPUT_REVISION) 13 | #error "The header file 'mysql_dumper.h' doesn't include ." 14 | #elif Q_MOC_OUTPUT_REVISION != 67 15 | #error "This file was generated using the moc from 5.4.1. It" 16 | #error "cannot be used with the include files from this version of Qt." 
17 | #error "(The moc has changed too much.)" 18 | #endif 19 | 20 | QT_BEGIN_MOC_NAMESPACE 21 | struct qt_meta_stringdata_MySqlDumper_t { 22 | QByteArrayData data[1]; 23 | char stringdata[12]; 24 | }; 25 | #define QT_MOC_LITERAL(idx, ofs, len) \ 26 | Q_STATIC_BYTE_ARRAY_DATA_HEADER_INITIALIZER_WITH_OFFSET(len, \ 27 | qptrdiff(offsetof(qt_meta_stringdata_MySqlDumper_t, stringdata) + ofs \ 28 | - idx * sizeof(QByteArrayData)) \ 29 | ) 30 | static const qt_meta_stringdata_MySqlDumper_t qt_meta_stringdata_MySqlDumper = { 31 | { 32 | QT_MOC_LITERAL(0, 0, 11) // "MySqlDumper" 33 | 34 | }, 35 | "MySqlDumper" 36 | }; 37 | #undef QT_MOC_LITERAL 38 | 39 | static const uint qt_meta_data_MySqlDumper[] = { 40 | 41 | // content: 42 | 7, // revision 43 | 0, // classname 44 | 0, 0, // classinfo 45 | 0, 0, // methods 46 | 0, 0, // properties 47 | 0, 0, // enums/sets 48 | 0, 0, // constructors 49 | 0, // flags 50 | 0, // signalCount 51 | 52 | 0 // eod 53 | }; 54 | 55 | void MySqlDumper::qt_static_metacall(QObject *_o, QMetaObject::Call _c, int _id, void **_a) 56 | { 57 | Q_UNUSED(_o); 58 | Q_UNUSED(_id); 59 | Q_UNUSED(_c); 60 | Q_UNUSED(_a); 61 | } 62 | 63 | const QMetaObject MySqlDumper::staticMetaObject = { 64 | { &QThread::staticMetaObject, qt_meta_stringdata_MySqlDumper.data, 65 | qt_meta_data_MySqlDumper, qt_static_metacall, Q_NULLPTR, Q_NULLPTR} 66 | }; 67 | 68 | 69 | const QMetaObject *MySqlDumper::metaObject() const 70 | { 71 | return QObject::d_ptr->metaObject ? 
QObject::d_ptr->dynamicMetaObject() : &staticMetaObject; 72 | } 73 | 74 | void *MySqlDumper::qt_metacast(const char *_clname) 75 | { 76 | if (!_clname) return Q_NULLPTR; 77 | if (!strcmp(_clname, qt_meta_stringdata_MySqlDumper.stringdata)) 78 | return static_cast(const_cast< MySqlDumper*>(this)); 79 | if (!strcmp(_clname, "MySqlBase")) 80 | return static_cast< MySqlBase*>(const_cast< MySqlDumper*>(this)); 81 | return QThread::qt_metacast(_clname); 82 | } 83 | 84 | int MySqlDumper::qt_metacall(QMetaObject::Call _c, int _id, void **_a) 85 | { 86 | _id = QThread::qt_metacall(_c, _id, _a); 87 | if (_id < 0) 88 | return _id; 89 | return _id; 90 | } 91 | QT_END_MOC_NAMESPACE 92 | -------------------------------------------------------------------------------- /src/moc_mysql_selector.cpp: -------------------------------------------------------------------------------- 1 | /**************************************************************************** 2 | ** Meta object code from reading C++ file 'mysql_selector.h' 3 | ** 4 | ** Created by: The Qt Meta Object Compiler version 67 (Qt 5.4.1) 5 | ** 6 | ** WARNING! All changes made in this file will be lost! 7 | *****************************************************************************/ 8 | 9 | #include "mysql_selector.h" 10 | #include 11 | #include 12 | #if !defined(Q_MOC_OUTPUT_REVISION) 13 | #error "The header file 'mysql_selector.h' doesn't include ." 14 | #elif Q_MOC_OUTPUT_REVISION != 67 15 | #error "This file was generated using the moc from 5.4.1. It" 16 | #error "cannot be used with the include files from this version of Qt." 
17 | #error "(The moc has changed too much.)" 18 | #endif 19 | 20 | QT_BEGIN_MOC_NAMESPACE 21 | struct qt_meta_stringdata_MySqlSelector_t { 22 | QByteArrayData data[1]; 23 | char stringdata[14]; 24 | }; 25 | #define QT_MOC_LITERAL(idx, ofs, len) \ 26 | Q_STATIC_BYTE_ARRAY_DATA_HEADER_INITIALIZER_WITH_OFFSET(len, \ 27 | qptrdiff(offsetof(qt_meta_stringdata_MySqlSelector_t, stringdata) + ofs \ 28 | - idx * sizeof(QByteArrayData)) \ 29 | ) 30 | static const qt_meta_stringdata_MySqlSelector_t qt_meta_stringdata_MySqlSelector = { 31 | { 32 | QT_MOC_LITERAL(0, 0, 13) // "MySqlSelector" 33 | 34 | }, 35 | "MySqlSelector" 36 | }; 37 | #undef QT_MOC_LITERAL 38 | 39 | static const uint qt_meta_data_MySqlSelector[] = { 40 | 41 | // content: 42 | 7, // revision 43 | 0, // classname 44 | 0, 0, // classinfo 45 | 0, 0, // methods 46 | 0, 0, // properties 47 | 0, 0, // enums/sets 48 | 0, 0, // constructors 49 | 0, // flags 50 | 0, // signalCount 51 | 52 | 0 // eod 53 | }; 54 | 55 | void MySqlSelector::qt_static_metacall(QObject *_o, QMetaObject::Call _c, int _id, void **_a) 56 | { 57 | Q_UNUSED(_o); 58 | Q_UNUSED(_id); 59 | Q_UNUSED(_c); 60 | Q_UNUSED(_a); 61 | } 62 | 63 | const QMetaObject MySqlSelector::staticMetaObject = { 64 | { &QThread::staticMetaObject, qt_meta_stringdata_MySqlSelector.data, 65 | qt_meta_data_MySqlSelector, qt_static_metacall, Q_NULLPTR, Q_NULLPTR} 66 | }; 67 | 68 | 69 | const QMetaObject *MySqlSelector::metaObject() const 70 | { 71 | return QObject::d_ptr->metaObject ? 
QObject::d_ptr->dynamicMetaObject() : &staticMetaObject; 72 | } 73 | 74 | void *MySqlSelector::qt_metacast(const char *_clname) 75 | { 76 | if (!_clname) return Q_NULLPTR; 77 | if (!strcmp(_clname, qt_meta_stringdata_MySqlSelector.stringdata)) 78 | return static_cast(const_cast< MySqlSelector*>(this)); 79 | if (!strcmp(_clname, "MySqlBase")) 80 | return static_cast< MySqlBase*>(const_cast< MySqlSelector*>(this)); 81 | return QThread::qt_metacast(_clname); 82 | } 83 | 84 | int MySqlSelector::qt_metacall(QMetaObject::Call _c, int _id, void **_a) 85 | { 86 | _id = QThread::qt_metacall(_c, _id, _a); 87 | if (_id < 0) 88 | return _id; 89 | return _id; 90 | } 91 | QT_END_MOC_NAMESPACE 92 | -------------------------------------------------------------------------------- /src/mongo_dumper.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Tue 19 May 2015 03:32:22 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include "logger_container.h" 10 | #include "mysql_dumper.h" 11 | #include "mongo_dumper.h" 12 | 13 | using namespace mongo; 14 |
// Namespace string handed to every update()/getLastError() call below.
const std::string MongoDumper::_db = "lspider.pages";

/**
 * Stores the downstream MySQL dumper and the MongoDB "host:port" string.
 * No connection is opened here; call initMongo() for that.
 */
MongoDumper::MongoDumper(MySqlDumper *mysqlDumper, std::string hostAndPort)
    : _mysqlDumper(mysqlDumper),
      _hostAndPort(hostAndPort),
      _isStop(false),
      _dumpSuccCount(0),
      _dumpFailCount(0)
{
}

/** Stops the worker thread (see stop()). */
MongoDumper::~MongoDumper()
{
    stop();
}

/**
 * Thread body: pops one UrlContext at a time from waitDumpQueue and stores it.
 * A stored context is handed on to the MySQL dumper's queue; a context that
 * could not be stored is deleted here.
 */
void MongoDumper::run()
{
    LOG(INFO, "start");
    while (!_isStop) {
        // Take the next pending page from the queue.
        UrlContext *urlContext = waitDumpQueue.pop_front();
        if (!dump(urlContext)) {
            delete urlContext;
            continue;
        }
        _mysqlDumper->waitDumpQueue.push_back(urlContext);
    }
}

/** Raises the stop flag and terminates the thread. */
void MongoDumper::stop()
{
    _isStop = true;
    this->terminate();
}

/**
 * Connects to the configured MongoDB server.
 * @return the result of DBClientConnection::connect(); the error text it
 *         produces is discarded.
 */
bool MongoDumper::initMongo()
{
    std::string errmsg;
    return _conn.connect(_hostAndPort.c_str(), errmsg);
}

/**
 * Upserts one crawled page, keyed by its url.
 *
 * @param urlContext page to store; not freed here.
 * @return true when the update reported no error; false when the document is
 *         larger than the server's BSON limit or the update failed.
 */
bool MongoDumper::dump(UrlContext *urlContext)
{
    // Key used to look the page up in MongoDB.
    mongo::BSONObjBuilder query;
    query.append("url", urlContext->url);

    mongo::BSONObjBuilder data;
    makeData(urlContext, data);

    // BSONObjBuilder::obj() hands the builder's buffer over to the returned
    // object, so it must be called exactly once per builder. The previous code
    // called it for the size check, again for the update and again for the
    // log message.
    const mongo::BSONObj dataObj = data.obj();
    const int dataSize = dataObj.objsize();

    if (dataSize > _conn.getMaxBsonObjectSize()) {
        LOG_F(WARN, "%d [%s] objsize(%d) > MaxBsonObjectSize ",
                urlContext->uuid,
                urlContext->url.c_str(), dataSize);
        return false;
    }

    _conn.update(_db,
            query.obj(),
            dataObj,
            true);
    if ("" == _conn.getLastError(_db)) {
        atomic_add(&_dumpSuccCount, 1);
        LOG_F(INFO, "%d [%s] dump mongo success [sign=%s]",
                urlContext->uuid,
                urlContext->url.c_str(),
                urlContext->sign);
        return true;
    }

    atomic_add(&_dumpFailCount, 1);
    LOG_F(WARN, "%d [%s] dump mongo fail",
            urlContext->uuid,
            urlContext->url.c_str());
    return false;
}

/**
 * Fills `data` with the document stored for one page.
 *
 * @param urlContext source of every field.
 * @param data       builder that receives the fields; it is not finalised here.
 */
void MongoDumper::makeData(UrlContext *urlContext, mongo::BSONObjBuilder & data)
{
    // Fields stored in MongoDB.
    data.append("url", urlContext->url);
    // 16 bytes hold any int (sign + 10 digits + NUL) without truncation.
    char linkDepthStr[16] = {'\0'};
    snprintf(linkDepthStr, sizeof(linkDepthStr), "%d", urlContext->linkDepth);
    data.append("linkDepth", linkDepthStr);
    data.append("hub", urlContext->hub);

    data.append("host", urlContext->host);
    data.append("strport", urlContext->strport);
    data.append("file", urlContext->file);
    data.append("ip", urlContext->ip);
    data.append("site", urlContext->site);
    data.append("sign", urlContext->sign);

    data.append("redirectUrl", urlContext->redirectUrl);
    data.append("redirectDepth", urlContext->redirectDepth);

    char crawlElapseStr[16] = {'\0'};
    snprintf(crawlElapseStr, sizeof(crawlElapseStr), "%ld", urlContext->crawlElapse);
    data.append("crawlElapse", crawlElapseStr);

    // ctime_r writes into a caller-owned buffer (at least 26 bytes), unlike
    // ctime(), whose static buffer is shared by every thread. Copying tv_sec
    // into a real time_t also removes the pointer cast the old code needed.
    const time_t finishedAt = urlContext->finishTime.tv_sec;
    char crawlTimeStr[32] = {'\0'};
    if (NULL == ::ctime_r(&finishedAt, crawlTimeStr)) {
        crawlTimeStr[0] = '\0';
    }
    data.append("crawlTime", crawlTimeStr);

    if (urlContext->headerLen > 0) {
        data.append("header", urlContext->header);
    } else {
        data.append("header", "");
    }
    if (urlContext->bodyPos > 0) {
        data.append("body", urlContext->body);
    } else {
        data.append("body", "");
    }
    data.append("title", urlContext->title);

    BSONObjBuilder link_array;
    makeLinkArray(urlContext, link_array);
    data.appendArray("links", link_array.obj());
}

/**
 * Appends every extracted link as {"href", "anchor"} under the keys
 * "0", "1", "2", ... in iteration order.
 */
void MongoDumper::makeLinkArray(UrlContext *urlContext, mongo::BSONObjBuilder & link_array)
{
    LINK_TYPE & links = urlContext->links;
    int index = 0;
    for (LINK_TYPE::iterator iter = links.begin(); iter != links.end(); ++iter, ++index) {
        char indexStr[16];
        snprintf(indexStr, sizeof(indexStr), "%d", index);

        mongo::BSONObjBuilder link;
        link.append("href", iter->first);
        link.append("anchor", iter->second);
        link_array.append(indexStr, link.obj());
    }
}

/** @return the MySQL dumper that receives successfully stored pages. */
MySqlDumper * MongoDumper::mysqlDumper()
{
    return _mysqlDumper;
}

/**
 * Control hook: writes the success/failure counters into `response`.
 * `cmd` is not inspected.
 */
void MongoDumper::control(string& response, const string& cmd)
{
    char msg[1024] = {'\0'};
    snprintf(msg, sizeof(msg), "MongoDumper successCount:%d failCount:%d",
            _dumpSuccCount, _dumpFailCount);
    response = msg;
}
157 | -------------------------------------------------------------------------------- /src/mongo_dumper.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Tue 19 May 2015 03:27:05 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __MONGO_DUMPER_H__ 9 | #define __MONGO_DUMPER_H__ 10 | 11 | #include "mongo/client/dbclient.h" 12 | #include "lthread.h" 13 | #include "synced_queue.h" 14 | #include "DoubleList.h" 15 | #include "controllable.h" 16 | #include "url_context.h" 17 | 18 | class MySqlDumper; 19 | 20 | class MongoDumper : public lthread::LThread, public Controllable 21 | { 22 | public: 23 | MongoDumper(MySqlDumper *mysqlDumper, std::string hostAndPort); 24 | ~MongoDumper(); 25 | 26 | bool initMongo(); 27 | void stop(); 28 | 29 | virtual void run(); 30 | virtual void control(string& response, const string& cmd); 31 | 32 | MySqlDumper * mysqlDumper(); 33 | 34 | TSyncedQueue > waitDumpQueue; // 待写mongo队列 35 | 36 | private: 37 | bool dump(UrlContext *urlContext); 38 | void makeData(UrlContext *urlContext, mongo::BSONObjBuilder & data); 39 | void makeLinkArray(UrlContext *urlContext, mongo::BSONObjBuilder & link_array); 40 | 41 | MySqlDumper *_mysqlDumper; 42
| mongo::DBClientConnection _conn; 43 | std::string _hostAndPort; 44 | bool _isStop; 45 | static const std::string _db; 46 | volatile int _dumpSuccCount; 47 | volatile int _dumpFailCount; 48 | }; 49 | 50 | #endif //__MONGO_DUMPER_H__ 51 | -------------------------------------------------------------------------------- /src/mutex_lock.h: -------------------------------------------------------------------------------- 1 | #ifndef __MUTEX_LOCK_H__ 2 | #define __MUTEX_LOCK_H__ 3 | 4 | 5 | #include 6 | 7 | class CMutexLock 8 | { 9 | public: 10 | CMutexLock() { 11 | pthread_mutex_init(&_mutex, NULL); 12 | } 13 | 14 | ~CMutexLock() { 15 | pthread_mutex_destroy(&_mutex); 16 | } 17 | 18 | void Init() { 19 | } 20 | 21 | void Lock() { 22 | pthread_mutex_lock(&_mutex); 23 | } 24 | 25 | bool Unlock() 26 | { 27 | pthread_mutex_unlock(&_mutex); 28 | return true; 29 | } 30 | 31 | private: 32 | pthread_mutex_t _mutex; 33 | }; 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /src/mysql_base.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Tue 19 May 2015 03:32:22 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include "logger_container.h" 12 | #include "mysql_dumper.h" 13 | 14 | const string MySqlBase::TABLE_NAME = "link"; 15 | 16 | MySqlBase::MySqlBase(string connectionName, 17 | string hostName, 18 | int port, 19 | string userName, 20 | string passWord, 21 | string databaseName) 22 | :_connectionName(QString(connectionName.c_str())), 23 | _hostName(QString(hostName.c_str())), 24 | _port(port), 25 | _userName(QString(userName.c_str())), 26 | _passWord(QString(passWord.c_str())), 27 | _databaseName(QString(databaseName.c_str())) 28 | { 29 | } 30 | 31 | 32 | MySqlBase::~MySqlBase() 33 | { 34 | stop(); 35 | } 36 | 37 | void MySqlBase::stop() 38 | { 39 | _db.close(); 40 | QSqlDatabase::removeDatabase(_databaseName); 41 | } 42 | 43 | bool MySqlBase::initMySql() 44 | { 45 | _db = QSqlDatabase::addDatabase("QMYSQL", _connectionName); 46 | _db.setDatabaseName(_databaseName); 47 | _db.setHostName(_hostName); 48 | _db.setPort(_port); 49 | _db.setUserName(_userName); 50 | _db.setPassword(_passWord); 51 | if (!_db.open()) { 52 | QSqlError err = _db.lastError(); 53 | LOG_F(FATAL, "QSqlDatabase open fail errtype=%d text=%s [%s][%s][%s:%d][%s][%s]", 54 | err.type(), err.databaseText().toStdString().c_str(), 55 | _connectionName.toStdString().c_str(), _databaseName.toStdString().c_str(), 56 | _hostName.toStdString().c_str(), _port, 57 | _userName.toStdString().c_str(), _passWord.toStdString().c_str()); 58 | return false; 59 | } 60 | return true; 61 | } 62 | -------------------------------------------------------------------------------- /src/mysql_base.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. 
All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Tue 19 May 2015 03:27:05 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __MYSQL_BASE_H__ 9 | #define __MYSQL_BASE_H__ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "conf.h" 16 | 17 | using std::string; 18 | 19 | class MySqlBase 20 | { 21 | public: 22 | typedef enum { 23 | NEW = 0, 24 | SELECTED = 1, 25 | CRAWLED = 2 26 | } CrawlState; 27 | 28 | MySqlBase(string connectionName, 29 | string hostName = Conf::instance()->mysqlHost, 30 | int port = Conf::instance()->mysqlPort, 31 | string userName = Conf::instance()->mysqlUser, 32 | string passWord = Conf::instance()->mysqlPassword, 33 | string databaseName = Conf::instance()->mysqlDatabase); 34 | 35 | virtual ~MySqlBase(); 36 | 37 | virtual bool initMySql(); 38 | virtual void stop(); 39 | 40 | protected: 41 | QString _connectionName; 42 | QString _hostName; 43 | int _port; 44 | QString _userName; 45 | QString _passWord; 46 | QString _databaseName; 47 | QSqlDatabase _db; 48 | 49 | static const string TABLE_NAME; 50 | }; 51 | 52 | #endif //__MYSQL_BASE_H__ 53 | -------------------------------------------------------------------------------- /src/mysql_dumper.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Tue 19 May 2015 03:32:22 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "logger_container.h" 14 | #include "util.h" 15 | #include "url_tools.h" 16 | #include "mysql_dumper.h" 17 | #include "url_context.h" 18 |
namespace {

/**
 * Escapes text for use inside a single-quoted MySQL string literal.
 * NOTE(review): assumes the server runs without NO_BACKSLASH_ESCAPES -
 * confirm against the deployment's sql_mode.
 */
QString escapeSqlText(const QString &raw)
{
    QString escaped(raw);
    // Backslashes first, otherwise the ones added for quotes get doubled.
    escaped.replace(QLatin1Char('\\'), QLatin1String("\\\\"));
    escaped.replace(QLatin1Char('\''), QLatin1String("\\'"));
    return escaped;
}

/** Decimal text of the 64-bit signature of `url`. */
QString urlSignText(const char *url)
{
    return QString::number(static_cast<qulonglong>(get_url_sign64(url)));
}

} // namespace

/** Prepares empty INSERT and UPDATE batches; the connection opens in run(). */
MySqlDumper::MySqlDumper(string connectionName)
    : MySqlBase(connectionName), _dumpCount(0)
{
    _isStop = false;
    // Both reset helpers also zero their value counters.
    resetInsertLinksCmd();
    resetUpdateLinksCmd();
}


/** Stops the worker thread and closes the connection (see stop()). */
MySqlDumper::~MySqlDumper()
{
    stop();
}

/**
 * Thread body: opens the MySQL connection, then pops UrlContexts from
 * waitDumpQueue, writes them and deletes them. The process exits when the
 * connection cannot be opened.
 */
void MySqlDumper::run()
{
    LOG(INFO, "start");
    if (!initMySql()) {
        LOG(FATAL, "mysqlDumper initMySql fail");
        _exit(-1);
        return;
    }

    while (!_isStop) {
        // Flush whenever the queue runs dry; otherwise a partial batch
        // would sit unwritten until more data arrives.
        if (0 == waitDumpQueue.size()) {
            flushUpdate();
            flushInsert();
        }
        // Take the next page from the queue.
        UrlContext *urlContext = waitDumpQueue.pop_front();
        dump(urlContext);
        delete urlContext;
    }
}

/** Raises the stop flag, terminates the thread and closes the connection. */
void MySqlDumper::stop()
{
    _isStop = true;
    this->terminate();
    MySqlBase::stop();
}

/**
 * Records one crawled page: refreshes its hub row when it is a hub, queues its
 * crawl-state update and queues one INSERT row per extracted link.
 * @return always true.
 */
bool MySqlDumper::dump(UrlContext *urlContext)
{
    if (urlContext->hub) {
        dumpHub(urlContext);
    }
    updateCrawlState(urlContext);

    // Add the links found on this page.
    LINK_TYPE & links = urlContext->links;
    for (LINK_TYPE::iterator iter = links.begin(); iter != links.end(); ++iter) {
        string href = iter->first;
        to_utf8(href);
        string anchor = iter->second;
        to_utf8(anchor);

        dumpOneLink(href, anchor, urlContext);
    }

    LOG_F(DEBUG, "%d [%s] dump mysql success",
            urlContext->uuid,
            urlContext->url.c_str());

    return true;
}

/**
 * Adds this page's signature to the batched "mark as CRAWLED" statement and
 * flushes the batch once it reaches the configured size.
 * @return always true.
 */
bool MySqlDumper::updateCrawlState(UrlContext *urlContext)
{
    // QString::number needs no fixed-size buffer; the old code wrote a
    // terminator at the hard-coded index 23 whatever MAX_SIGN_LEN was.
    _updateLinksCmd += urlSignText(urlContext->url.c_str());
    _updateLinksCmd += QString(",");

    _updateValueCount++;

    if (_updateValueCount >= Conf::instance()->mysqlDumperUpdateBatch) {
        flushUpdate();
    }

    return true;
}

/**
 * Appends one "(sign, url, maindomain, prelink, preanchor, linkdepth,
 * crawlstate, foundtime)" row to the batched INSERT and flushes the batch once
 * it reaches the configured size.
 *
 * @return false when no main domain could be derived from href (nothing is
 *         queued); true otherwise.
 */
bool MySqlDumper::dumpOneLink(const string href, const string anchor, UrlContext *urlContext)
{
    // Derive the main domain first; links without one are skipped.
    char maindomain[MAX_SITE_LEN] = {'\0'};
    if (false == fetch_url_maindomain(href.c_str(), maindomain, MAX_SITE_LEN)) {
        return false;
    }

    // Every text column comes from crawled pages, so all of them are escaped
    // (previously only the anchor was, and only its quotes).
    _insertLinksCmd += QString("(");
    _insertLinksCmd += urlSignText(href.c_str()) + ",\'";
    _insertLinksCmd += escapeSqlText(QString(href.c_str())) + "\',\'";
    _insertLinksCmd += escapeSqlText(QString(maindomain)) + "\',\'";
    _insertLinksCmd += escapeSqlText(QString(urlContext->url.c_str())) + "\',\'";
    _insertLinksCmd += escapeSqlText(QString(anchor.c_str())) + "\',";
    _insertLinksCmd += QString::number(urlContext->linkDepth + 1) + ",";
    _insertLinksCmd += QString::number(MySqlBase::NEW) + ",";
    _insertLinksCmd += QString("current_timestamp()") + "),";

    _insertValueCount++;

    if (_insertValueCount >= Conf::instance()->mysqlDumperInsertBatch) {
        flushInsert();
    }
    return true;
}

/**
 * Resets the row of a hub page (depth 0, state NEW, hub flag set, fresh
 * crawledtime).
 *
 * @return false when no main domain could be derived or the statement failed.
 */
bool MySqlDumper::dumpHub(UrlContext *urlContext)
{
    const QString signText = urlSignText(urlContext->url.c_str());

    char maindomain[MAX_SITE_LEN] = {'\0'};
    if (false == fetch_url_maindomain(urlContext->url.c_str(), maindomain, MAX_SITE_LEN)) {
        return false;
    }

    // The statement used to have no WHERE clause and therefore addressed
    // every row of the table; it is now limited to the hub's own row.
    // NOTE(review): if a missing hub row is supposed to be created here, this
    // needs INSERT ... ON DUPLICATE KEY UPDATE instead - confirm the intent.
    QString cmd = QString("UPDATE IGNORE ") + QString(TABLE_NAME.c_str())
        + " SET sign=" + signText
        + ", url=\'" + escapeSqlText(QString(urlContext->url.c_str()))
        + "\', maindomain=\'" + escapeSqlText(QString(maindomain))
        + "\', linkdepth=0, crawlstate=" + QString::number(MySqlBase::NEW)
        + ", hub=true, crawledtime=current_timestamp()"
        + " WHERE sign=" + signText;

    QSqlQuery updateQuery(_db);
    updateQuery.prepare(cmd);
    // This statement is independent of the batched crawl-state UPDATE, so the
    // batch is left alone; the old code reset it here and lost the pending
    // signatures.
    if (false == updateQuery.exec()) {
        QSqlError err = updateQuery.lastError();
        LOG_F(WARN, "QSqlQuery fail errtype=%d cmd=%s text=%s",
                err.type(), updateQuery.lastQuery().toUtf8().constData(), err.text().toStdString().c_str());
        return false;
    }

    LOG_F(INFO, "QSqlQuery success [cmd=%s]",
            updateQuery.lastQuery().toUtf8().constData());
    return true;
}

/**
 * Executes the batched INSERT and starts a new, empty batch.
 * @return false when the batch was empty or the statement failed.
 */
bool MySqlDumper::flushInsert()
{
    if (0 == _insertValueCount) {
        return false;
    }

    QSqlQuery query(_db);
    // Drop the trailing comma left by the last appended row.
    _insertLinksCmd.remove(_insertLinksCmd.length()-1, 1);
    query.prepare(_insertLinksCmd);
    atomic_add(&_dumpCount, _insertValueCount);

    const bool succeeded = query.exec();
    if (succeeded) {
        LOG_F(INFO, "QSqlQuery success [cmd=%s]",
                query.lastQuery().toUtf8().constData());
    } else {
        QSqlError err = query.lastError();
        // Duplicate-key errors are expected and not logged.
        if (!err.text().contains("Duplicate entry")) {
            LOG_F(WARN, "QSqlQuery fail errtype=%d cmd=%s text=%s",
                    err.type(), query.lastQuery().toUtf8().constData(), err.text().toStdString().c_str());
        }
    }
    resetInsertLinksCmd();
    return succeeded;
}

/**
 * Executes the batched crawl-state UPDATE and starts a new, empty batch.
 * @return false when the batch was empty or the statement failed.
 */
bool MySqlDumper::flushUpdate()
{
    if (0 == _updateValueCount) {
        return false;
    }

    QSqlQuery updateQuery(_db);
    // Replace the trailing comma with the bracket that closes "IN (".
    _updateLinksCmd.remove(_updateLinksCmd.length()-1, 1);
    _updateLinksCmd += QString(")");
    updateQuery.prepare(_updateLinksCmd);

    const bool succeeded = updateQuery.exec();
    if (succeeded) {
        LOG_F(INFO, "QSqlQuery success [cmd=%s]",
                updateQuery.lastQuery().toUtf8().constData());
    } else {
        QSqlError err = updateQuery.lastError();
        LOG_F(WARN, "QSqlQuery fail errtype=%d cmd=%s text=%s",
                err.type(), updateQuery.lastQuery().toUtf8().constData(), err.text().toStdString().c_str());
    }
    resetUpdateLinksCmd();
    return succeeded;
}

/** Starts a new INSERT batch: statement prefix only, zero rows. */
void MySqlDumper::resetInsertLinksCmd()
{
    _insertLinksCmd = QString("INSERT IGNORE INTO ") + QString(TABLE_NAME.c_str()) +
        "(sign, url, maindomain, prelink, preanchor, linkdepth, crawlstate, foundtime)"
        " VALUES";
    _insertValueCount = 0;
}

/** Starts a new crawl-state UPDATE batch: statement prefix only, zero signatures. */
void MySqlDumper::resetUpdateLinksCmd()
{
    _updateLinksCmd = QString("UPDATE ") + QString(TABLE_NAME.c_str())
        + " SET crawlstate="
        + QString::number(MySqlBase::CRAWLED)
        + ",crawledtime=current_timestamp()"
        + " WHERE sign IN (";
    _updateValueCount = 0;
}

/**
 * Control hook: writes the number of link rows submitted so far into
 * `response`. `cmd` is not inspected.
 */
void MySqlDumper::control(string& response, const string& cmd)
{
    char msg[1024] = {'\0'};
    snprintf(msg, sizeof(msg), "MySqlDumper dumpCount:%d",
            _dumpCount);
    response = msg;
}
253 | -------------------------------------------------------------------------------- /src/mysql_dumper.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved.
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Tue 19 May 2015 03:27:05 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __MYSQL_DUMPER_H__ 9 | #define __MYSQL_DUMPER_H__ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "synced_queue.h" 16 | #include "DoubleList.h" 17 | #include "controllable.h" 18 | #include "mysql_base.h" 19 | 20 | using std::string; 21 | using std::map; 22 | 23 | class UrlContext; 24 | 25 | class MySqlDumper : public QThread, public MySqlBase, public Controllable 26 | { 27 | Q_OBJECT 28 | public: 29 | MySqlDumper(string connectionName); 30 | 31 | virtual ~MySqlDumper(); 32 | 33 | virtual void stop(); 34 | 35 | virtual void run(); 36 | 37 | virtual void control(string& response, const string& cmd); 38 | 39 | TSyncedQueue > waitDumpQueue; // 待写mongo队列 40 | 41 | private: 42 | bool dump(UrlContext *urlContext); 43 | bool updateCrawlState(UrlContext *urlContext); 44 | bool dumpOneLink(const string href, const string anchor, UrlContext *urlContext); 45 | bool dumpHub(UrlContext *urlContext); 46 | /** 47 | * 这里有个缺陷,就是flush是由数据触发的,每n个一触发,如果没数据可能剩一些触发不了 48 | */ 49 | bool flushInsert(); 50 | bool flushUpdate(); 51 | void resetInsertLinksCmd(); 52 | void resetUpdateLinksCmd(); 53 | 54 | bool _isStop; 55 | QString _insertLinksCmd; 56 | QString _updateLinksCmd; 57 | int _insertValueCount; 58 | int _updateValueCount; 59 | volatile int _dumpCount; 60 | }; 61 | 62 | #endif //__MYSQL_DUMPER_H__ 63 | -------------------------------------------------------------------------------- /src/mysql_selector.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Fri 29 May 2015 09:55:01 AM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "logger_container.h" 15 | #include "url_context.h" 16 | #include "link_scheduler.h" 17 | #include "conf.h" 18 | #include "mysql_selector.h" 19 |
using std::string;

namespace {

/**
 * Joins the main domains into the body of an SQL IN (...) list:
 * 'a','b','c'. Returns an empty string for an empty list.
 */
string quotedMaindomainList(const vector<string> &maindomains)
{
    string joined;
    for (size_t i = 0; i < maindomains.size(); i++) {
        if (i > 0) {
            joined += ",";
        }
        joined += string("'") + maindomains[i] + string("'");
    }
    return joined;
}

} // namespace

/** Creates a selector with no scheduler attached; see setLinkScheduler(). */
MySqlSelector::MySqlSelector(string connectionName)
    : MySqlBase(connectionName)
{
    _isStop = false;
    _linkScheduler = NULL;
}

/** Stops the worker thread and closes the connection (see stop()). */
MySqlSelector::~MySqlSelector()
{
    stop();
}

/** Sets the scheduler that receives every selected url. */
void MySqlSelector::setLinkScheduler(LinkScheduler *linkScheduler)
{
    _linkScheduler = linkScheduler;
}

/** Raises the stop flag, terminates the thread and closes the connection. */
void MySqlSelector::stop()
{
    _isStop = true;
    this->terminate();
    MySqlBase::stop();
}

/**
 * Thread body. Does nothing when the selector is switched off in the
 * configuration. Otherwise opens the connection (exiting the process on
 * failure), recovers rows left in SELECTED state, then runs select() at the
 * configured interval until stopped.
 */
void MySqlSelector::run()
{
    if ("off" == Conf::instance()->mysqlSelectorSwitch) {
        return;
    }

    LOG(INFO, "start");
    if (!initMySql()) {
        LOG(FATAL, "mysqlSelector initMySql fail");
        _exit(-1);
        return;
    }

    recoverCrawlState();

    while (!_isStop) {
        LOG(DEBUG, "select once");
        select();
        sleep(Conf::instance()->mysqlSelectInterval);
    }
}

/**
 * Puts rows that are still marked SELECTED (and below the retry limit) back
 * to NEW for the configured main domains.
 */
void MySqlSelector::recoverCrawlState()
{
    // Build the list of main domains to crawl.
    vector<string> & maindomainList = Conf::instance()->maindomainList;
    // A runtime check instead of assert(): with NDEBUG the assert vanished
    // and element 0 of an empty vector was read.
    if (maindomainList.empty()) {
        LOG(FATAL, "maindomainList.size == 0");
        return;
    }
    const string inPhraseList = quotedMaindomainList(maindomainList);

    QSqlQuery updateQuery(_db);
    updateQuery.prepare(QString("UPDATE ") + QString(TABLE_NAME.c_str())
            + " SET crawlstate="
            + QString::number(MySqlBase::NEW)
            + " WHERE crawlstate=" + QString::number(MySqlBase::SELECTED)
            + " AND crawlretry < "
            + QString::number(Conf::instance()->mysqlMaxCrawlRetry)
            + " AND maindomain IN (" + inPhraseList.c_str() + ")");
    if (false == updateQuery.exec()) {
        QSqlError err = updateQuery.lastError();
        LOG_F(WARN, "QSqlQuery fail errtype=%d cmd=%s text=%s",
                err.type(), updateQuery.lastQuery().toStdString().c_str(), err.text().toStdString().c_str());
    } else {
        LOG_F(DEBUG, "updateCmd: [%s]", updateQuery.lastQuery().toStdString().c_str());
    }
}


/**
 * Selects new links and hub links of the configured main domains, marks them
 * SELECTED and hands each one to the link scheduler.
 *
 * @return false when no scheduler is attached, the main-domain list is empty
 *         or the SELECT did not execute; true otherwise.
 */
bool MySqlSelector::select()
{
    if (NULL == _linkScheduler) {
        LOG(FATAL, "linkScheduler not set");
        return false;
    }

    // The main-domain list is reloaded from its file on every pass.
    Conf::instance()->loadMaindomainList(Conf::instance()->maindomainListFile);
    vector<string> & maindomainList = Conf::instance()->maindomainList;
    if (maindomainList.empty()) {
        // The old code logged this and then read maindomainList[0] anyway.
        LOG(FATAL, "maindomainList.size == 0");
        return false;
    }
    const string inPhraseList = quotedMaindomainList(maindomainList);

    // Pick new links and seed (hub) links.
    QString selectCmdStr = "SELECT sign, url, maindomain, ip, prelink, preanchor, weight, linkdepth, crawlstate, crawlretry, hub, fresh, foundtime, crawledtime FROM "
        + QString(TABLE_NAME.c_str())
        + " WHERE ((crawlstate="
        + QString::number(MySqlBase::NEW)
        + " AND crawlretry < "
        + QString::number(Conf::instance()->mysqlMaxCrawlRetry)
        + " AND linkdepth <= " + QString::number(Conf::instance()->mysqlSelectMaxLinkDepth)
        + ") OR hub=true)"
        + " AND maindomain IN (" + inPhraseList.c_str() + ")"
        " ORDER BY linkdepth";
    LOG_F(DEBUG, "selectCmdStr: [%s]", selectCmdStr.toStdString().c_str());
    QSqlQuery query(selectCmdStr, _db);
    LOG_F(INFO, "selectResultNum=%d", query.size());

    if (!query.isActive()) {
        return false;
    }

    // Mark the candidate rows as SELECTED.
    // NOTE(review): this WHERE clause has no linkdepth limit, so it can mark
    // rows the SELECT above did not return - confirm whether that is intended.
    QSqlQuery updateQuery(_db);
    updateQuery.prepare("UPDATE " + QString(TABLE_NAME.c_str())
            + " SET crawlstate="
            + QString::number(MySqlBase::SELECTED)
            + " WHERE (crawlstate=" + QString::number(MySqlBase::NEW)
            + " OR hub=true) AND crawlretry < "
            + QString::number(Conf::instance()->mysqlMaxCrawlRetry)
            + " AND maindomain IN (" + inPhraseList.c_str() + ")");
    if (false == updateQuery.exec()) {
        QSqlError err = updateQuery.lastError();
        LOG_F(WARN, "QSqlQuery fail errtype=%d cmd=%s text=%s",
                err.type(), updateQuery.lastQuery().toStdString().c_str(), err.text().toStdString().c_str());
    }

    while (query.next()) {
        QSqlRecord rec = query.record();
        UrlContext *urlContext = new UrlContext;
        strncpy(urlContext->sign, rec.value("sign").toString().toStdString().c_str(), MAX_SIGN_LEN);
        // strncpy leaves the buffer unterminated when the source fills it.
        urlContext->sign[MAX_SIGN_LEN - 1] = '\0';
        urlContext->url = rec.value("url").toString().toUtf8().constData();
        urlContext->linkDepth = rec.value("linkdepth").toInt();
        urlContext->hub = rec.value("hub").toBool();

        // Statement that puts a rejected row back to NEW; stays empty when
        // the scheduler accepted the url.
        QString revertCmd;
        switch (_linkScheduler->addUrl(urlContext)) {
            case LinkTable::RT_PARSE_ERROR:
                // Count the failed attempt and reset the crawl state.
                revertCmd = "UPDATE " + QString(TABLE_NAME.c_str())
                    + " SET crawlretry=crawlretry+1,crawlstate=" + QString::number(MySqlBase::NEW)
                    + " WHERE sign="
                    + QString(urlContext->sign);
                break;

            case LinkTable::RT_TABLE_FULL:
                // Only reset the crawl state.
                revertCmd = "UPDATE " + QString(TABLE_NAME.c_str())
                    + " SET crawlstate=" + QString::number(MySqlBase::NEW)
                    + " WHERE sign="
                    + QString(urlContext->sign);
                break;

            case LinkTable::RT_OK:
            default:
                // NOTE(review): the context is not freed here, presumably
                // because the scheduler keeps it - confirm for return codes
                // other than RT_OK.
                break;
        }

        if (!revertCmd.isEmpty()) {
            updateQuery.prepare(revertCmd);
            if (false == updateQuery.exec()) {
                QSqlError err = updateQuery.lastError();
                LOG_F(WARN, "QSqlQuery fail errtype=%d cmd=%s text=%s",
                        err.type(), updateQuery.lastQuery().toStdString().c_str(), err.text().toStdString().c_str());
            }
            delete urlContext;
        }
    }

    return true;
}

/** Control hook; this component reports nothing, `response` is left untouched. */
void MySqlSelector::control(string& response, const string& cmd)
{
}
195 | -------------------------------------------------------------------------------- /src/mysql_selector.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Fri 29 May 2015 09:54:47 AM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __MYSQL_SELECTOR_H__ 9 | #define __MYSQL_SELECTOR_H__ 10 | 11 | #include 12 | #include 13 | #include "controllable.h" 14 | #include "mysql_base.h" 15 | 16 | using std::string; 17 | 18 | class LinkScheduler; 19 | 20 | class MySqlSelector : public QThread, public MySqlBase, public Controllable 21 | { 22 | Q_OBJECT 23 | public: 24 | MySqlSelector(string connectionName); 25 | virtual ~MySqlSelector(); 26 | 27 | void setLinkScheduler(LinkScheduler *linkScheduler); 28 | 29 | void recoverCrawlState(); 30 | 31 | virtual void stop(); 32 | 33 | virtual void control(string& response, const string& cmd); 34 | 35 | /** 36 | * 线程执行函数 37 | */ 38 | virtual void run(); 39 | 40 | private: 41 | bool select(); 42 | 43 | bool _isStop; 44 | LinkScheduler *_linkScheduler; 45 | }; 46 | 47 | #endif //__MYSQL_SELECTOR_H__ 48 | -------------------------------------------------------------------------------- /src/parse_url.cpp: -------------------------------------------------------------------------------- 1 |
/************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu 04 Jun 2015 11:06:18 AM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "url_tools.h" 14 | 15 | int main(int argc, char *argv[]) 16 | { 17 | std::string line; 18 | while (getline(std::cin, line)) { 19 | const char *url = line.c_str(); 20 | char host[1024]; 21 | char strport[16]; 22 | char file[1024]; 23 | parse_url(url, host, 1024, strport, 16, file, 1024); 24 | if (strport[0] == '\0') { 25 | sprintf(strport, "80"); 26 | } 27 | 28 | char trunk[64]; 29 | const char* res = fetch_maindomain(host, trunk, 64); 30 | printf("%s\t%s\t%s\t%s\t%s\n", host, strport, file, res, trunk); 31 | } 32 | return 0; 33 | } 34 | -------------------------------------------------------------------------------- /src/request_recv.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu May 14 15:55:00 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | 9 | #include "url_context.h" 10 | #include "crawl_listen_handler.h" 11 | #include "logger_container.h" 12 | #include "http_processor.h" 13 | #include "cmd_ctrler.h" 14 | #include "request_recv.h" 15 | 16 | RequestRecv::RequestRecv(HttpProcessor *httpProcessor, CmdCtrler *cmdCtrler) 17 | :_httpProcessor(httpProcessor), _cmdCtrler(cmdCtrler), _requestCount(0) 18 | { 19 | } 20 | 21 | void RequestRecv::run() 22 | { 23 | LOG(INFO, "start"); 24 | int port = 9090; 25 | shared_ptr handler(new CrawlListenHandler(_httpProcessor, this, _cmdCtrler)); 26 | shared_ptr processor(new CrawlServiceProcessor(handler)); 27 | shared_ptr serverTransport(new TServerSocket(port)); 28 | shared_ptr transportFactory(new TBufferedTransportFactory()); 29 | shared_ptr protocolFactory(new TBinaryProtocolFactory()); 30 | shared_ptr threadManager = ThreadManager::newSimpleThreadManager(1); 31 | shared_ptr threadFactory = shared_ptr (new PosixThreadFactory()); //PosixThreadFactory可以自定义(继承于ThreadFactory) 32 | threadManager->threadFactory(threadFactory); 33 | threadManager->start(); 34 | 35 | TThreadPoolServer server(processor, serverTransport, transportFactory, protocolFactory, threadManager); 36 | 37 | try { 38 | server.serve(); 39 | } catch (apache::thrift::transport::TTransportException e) { 40 | LOG_F(FATAL, "TTransportException %s", e.what()); 41 | } 42 | } 43 | 44 | void RequestRecv::stop() 45 | { 46 | this->terminate(); 47 | } 48 | 49 | void RequestRecv::addRequestCount() 50 | { 51 | atomic_add(&_requestCount, 1); 52 | } 53 | 54 | void RequestRecv::control(string& response, const string& cmd) 55 | { 56 | char msg[1024] = {'\0'}; 57 | snprintf(msg, 1024, "requestCount:%d", _requestCount); 58 | response = string(msg); 59 | } 60 | 
-------------------------------------------------------------------------------- /src/request_recv.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu May 14 15:54:43 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __REQUEST_RECV_H__ 9 | #define __REQUEST_RECV_H__ 10 | 11 | #include "lthread.h" 12 | #include "controllable.h" 13 | 14 | class HttpProcessor; 15 | class CmdCtrler; 16 | 17 | class RequestRecv : public lthread::LThread, public Controllable 18 | { 19 | public: 20 | RequestRecv(HttpProcessor *httpProcessor, CmdCtrler *cmdCtrler); 21 | virtual void run(); 22 | void stop(); 23 | void addRequestCount(); 24 | virtual void control(string& response, const string& cmd); 25 | 26 | private: 27 | HttpProcessor *_httpProcessor; 28 | CmdCtrler *_cmdCtrler; 29 | volatile int _requestCount; 30 | }; 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /src/singleton.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Wed 03 Jun 2015 02:53:51 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __SINGLETON_H__ 9 | #define __SINGLETON_H__ 10 | 11 | #define DECLARE_SINGLETON(classname) \ 12 | public: \ 13 | static classname * instance(){ \ 14 | if (NULL == _instance) \ 15 | _instance = new classname(); \ 16 | return _instance; \ 17 | } \ 18 | static bool isCreated(){ \ 19 | return NULL != _instance; \ 20 | } \ 21 | protected: \ 22 | classname(); \ 23 | static classname * _instance; \ 24 | private: 25 | 26 | #define IMPLEMENT_SINGLETON(classname) \ 27 | classname* classname::_instance = NULL; \ 28 | classname::classname(){} 29 | 30 | #define IMPLEMENT_SINGLETON_NO_CONSTRACTOR(classname) \ 31 | classname * classname::_instance = NULL; 32 | 33 | #endif //__SINGLETON_H__ 34 | -------------------------------------------------------------------------------- /src/synced_queue.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYNCED_Q_H_ 2 | #define _SYNCED_Q_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "mutex_lock.h" 11 | 12 | template 13 | class TSyncedQueue 14 | { 15 | public: 16 | TSyncedQueue() { Init(); } 17 | ~TSyncedQueue() { sem_destroy(&_sem); } 18 | 19 | void Init() { 20 | m_queue.Init(); 21 | m_lock.Init(); 22 | if (0 != sem_init(&_sem, 0, (unsigned int)0)) { 23 | perror("sem_init fail\n"); 24 | abort(); 25 | } 26 | } 27 | 28 | void push_back(TElement* e) { 29 | m_lock.Lock(); 30 | m_queue.push_back(e); 31 | m_lock.Unlock(); 32 | sem_post(&_sem); 33 | } 34 | 35 | void push_front(TElement* e) { 36 | m_lock.Lock(); 37 | m_queue.push_front(e); 38 | m_lock.Unlock(); 39 | sem_post(&_sem); 40 | } 41 | 42 | TElement* pop_front() { 43 | TElement* e; 44 | sem_wait(&_sem); 45 | m_lock.Lock(); 46 | e = m_queue.pop_front(); 47 | m_lock.Unlock(); 48 | assert(NULL 
!= e); 49 | return e; 50 | } 51 | 52 | TElement* pop_back() { 53 | TElement* e; 54 | sem_wait(&_sem); 55 | m_lock.Lock(); 56 | e = m_queue.pop_back(); 57 | m_lock.Unlock(); 58 | 59 | assert(NULL != e); 60 | return e; 61 | } 62 | 63 | int size() { 64 | int n; 65 | m_lock.Lock(); 66 | n = m_queue.size(); 67 | m_lock.Unlock(); 68 | return n; 69 | } 70 | 71 | private: 72 | TQueue m_queue; 73 | TLock m_lock; 74 | sem_t _sem; 75 | }; 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /src/url_context.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Sat 30 May 2015 10:41:48 AM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include "util.h" 9 | #include "logger_container.h" 10 | #include "url_context.h" 11 | 12 | UrlContext::UrlContext() 13 | { 14 | uuid = atomic_add(&g_uuid, 1); 15 | retry = 0; 16 | redirectDepth = 0; 17 | hub = false; 18 | 19 | httpProcessor = NULL; 20 | event = NULL; 21 | base = NULL; 22 | 23 | sign[0] = '\0'; 24 | linkDepth = 0; 25 | 26 | init(); 27 | LOG_F(DEBUG, "%d [%s] construct", uuid, url.c_str()); 28 | } 29 | 30 | UrlContext::~UrlContext() 31 | { 32 | LOG_F(DEBUG, "%d [%s] deconstruct", uuid, url.c_str()); 33 | if (DELETED == status) { 34 | string backtrace = get_backtrace(); 35 | LOG_F(FATAL, "%d [%s] double free %s", uuid, url.c_str(), backtrace.c_str()); 36 | } 37 | status = DELETED; 38 | } 39 | 40 | void UrlContext::init() 41 | { 42 | host[0] = '\0'; 43 | strport[0] = '\0'; 44 | file[0] = '\0'; 45 | ip = ""; 46 | hasParsed = false; 47 | 48 | status = CONNECTING; 49 | 50 | requestLen = 0; 51 | writeLen = 0; 52 | recvData = ""; 53 | 54 | firstLineLen = -1; 55 | headerLen = -1; 56 | 
uncompressedBodyLen = -1; 57 | bodyPos = -1; 58 | fields.clear(); 59 | finishAnalyseFirstLine = false; 60 | finishAnalyseHeader = false; 61 | finishAnalyseBody = false; 62 | 63 | crawlElapse = -1; 64 | 65 | DLINK_INITIALIZE(&link); 66 | } 67 | 68 | bool UrlContext::parseUrl() 69 | { 70 | // 不重复解析, init()之后会复位 71 | if (hasParsed) { 72 | return true; 73 | } 74 | 75 | // 第一次执行走这个分支,第二次执行重定向走下一个分支 76 | if (redirectDepth == 0) { 77 | if (1 == parse_url(url.c_str(), 78 | host, 79 | MAX_SITE_LEN, 80 | strport, 81 | MAX_PORT_LEN, 82 | file, 83 | MAX_PATH_LEN)) { 84 | // 这里简单起见,site先赋值为host 85 | site = host; 86 | // 可能上游已经指定了ip 87 | if (0 == ip.length()) { 88 | vector ips = getip(host); 89 | size_t ipsSize = ips.size(); 90 | if (ipsSize > 0) { 91 | ip = ips[rand() % ipsSize]; 92 | } 93 | if (0 == ip.length()) { 94 | LOG_F(WARN, "%d [%s] parse_url getip fail", uuid, url.c_str()); 95 | return false; 96 | } 97 | } 98 | hasParsed = true; 99 | return true; 100 | } else { 101 | LOG_F(WARN, "%d [%s] parse_url fail", uuid, url.c_str()); 102 | return false; 103 | } 104 | } else { 105 | // 如果是重定向,则可以确保在linktable调用过parseUrl,site一定有值 106 | assert(site.length() != 0); 107 | if (1 == parse_url(redirectUrl.c_str(), 108 | host, 109 | MAX_SITE_LEN, 110 | strport, 111 | MAX_PORT_LEN, 112 | file, 113 | MAX_PATH_LEN)) { 114 | vector ips = getip(host); 115 | size_t ipsSize = ips.size(); 116 | if (ipsSize > 0) { 117 | ip = ips[rand() % ipsSize]; 118 | } 119 | if (0 == ip.length()) { 120 | LOG_F(WARN, "%d [%s] parse_url redirectUrl:%s getip fail", uuid, url.c_str(), redirectUrl.c_str()); 121 | return false; 122 | } 123 | hasParsed = true; 124 | return true; 125 | } else { 126 | LOG_F(WARN, "%d [%s] parse_url redirectUrl:%s fail", uuid, url.c_str(), redirectUrl.c_str()); 127 | return false; 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/url_context.h: 
-------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu May 14 15:56:35 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __URL_CONTEXT_H__ 9 | #define __URL_CONTEXT_H__ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "link.h" 17 | #include "atomic.h" 18 | #include "http_protocol.h" 19 | #include "url_tools.h" 20 | 21 | // 最大http请求头长度 22 | #define MAX_HEAD_CMD_LEN 8192 23 | #define MAX_SIGN_LEN 24 24 | 25 | using std::string; 26 | using std::map; 27 | using std::vector; 28 | using std::pair; 29 | 30 | typedef string Url; 31 | typedef string Site; 32 | typedef string Ip; 33 | 34 | //typedef vector > LINK_TYPE; 35 | typedef map LINK_TYPE; 36 | 37 | static volatile int g_uuid = 0; 38 | 39 | class HttpProcessor; 40 | 41 | /** 42 | * url请求的上下文信息,包含基本信息和中间状态,生命周期从请求过来到抓取结束 43 | */ 44 | class UrlContext 45 | { 46 | public: 47 | // 网络通信状态类型 48 | typedef enum { 49 | CONNECTING, // 正在连接 50 | SENDING, // 正在发送请求 51 | RECVING, // 正在接受数据 52 | DONE, // 抓取完成 53 | TIMEOUT, // 抓取超时 54 | PROTERR, // 协议错误 55 | DELETED // 已delete 56 | } StatusType; 57 | 58 | UrlContext(); 59 | ~UrlContext(); 60 | 61 | void init(); 62 | bool parseUrl(); 63 | 64 | public: 65 | // 直接赋值的变量 66 | Url url; // url地址 67 | int uuid; // 全局唯一id 68 | int linkDepth; // 链接深度,以hub页为准 69 | bool hub; // 是否是索引页 70 | 71 | // url解析生成的变量 72 | char host[MAX_SITE_LEN]; // url解析出的域名 73 | char strport[MAX_PORT_LEN]; // url解析出的端口号 74 | char file[MAX_PATH_LEN]; // url解析出的路径部分 75 | Ip ip; // 点分十进制ip地址 76 | Site site; // 站点名,可能和host相同,也可能不同 77 | bool hasParsed; // 是否已经解析过了 78 | char sign[MAX_SIGN_LEN]; // url签名 79 | 80 | // 动态变化的变量 81 | Url redirectUrl; // 重定向url 82 | int redirectDepth; // 重定向次数 83 | 
int retry; // 抓取重试次数 84 | StatusType status; 85 | 86 | // 网络通信的变量 87 | char requestData[MAX_HEAD_CMD_LEN]; // http请求头 88 | int requestLen; // http请求头长度 89 | int writeLen; // 已经发送的请求头长度 90 | int sock; // 和网站的连接socket 91 | string recvData; // 接收的http返回结果数据 92 | 93 | // protocol处理结果 94 | int firstLineLen; // response第一行长度 95 | int headerLen; // response的header部分长度 96 | int uncompressedBodyLen; // response的body部分长度 97 | int bodyPos; // response的body首位置 98 | string httpVersion; // response里提取的HTTP版本信息 99 | int httpStatus; // response里提取的返回状态信息 100 | string httpReason; // response里提取的返回码描述信息 101 | map fields; // response的header部分域kv数据 102 | string header; // header 103 | string body; // body部分数据 104 | string cookie; // cookie 105 | bool finishAnalyseFirstLine; // 是否完成第一行解析 106 | bool finishAnalyseHeader; // 是否完成header解析 107 | bool finishAnalyseBody; // 是否完成body解析 108 | 109 | // 传递指针 110 | HttpProcessor *httpProcessor; // 上下文传递指针 111 | struct event *event; // 上下文传递指针 112 | event_base *base; // 上下文传递指针 113 | 114 | // 统计信息 115 | struct timeval beginConnectTime; // 开始连接时间 116 | struct timeval beginSendTime; // 开始发送请求时间 117 | struct timeval beginRecvTime; // 开始接收结果时间 118 | struct timeval finishTime; // 完成接收时间 119 | long crawlElapse; // 抓取历经多久,单位毫秒 120 | 121 | // extractor解析生成的变量 122 | string title; 123 | LINK_TYPE links; // 后链,包含链接和anchor,有重复 124 | 125 | // 特殊数据结构需要的变量 126 | DLINK link; // 队列数据结构要用到的变量 127 | Url key; // keyedQueue数据结构要用的变量 128 | }; 129 | 130 | struct UrlContextComp { 131 | bool operator() (UrlContext *& u1, UrlContext *& u2) 132 | { 133 | return u1->linkDepth > u2->linkDepth; 134 | } 135 | }; 136 | 137 | 138 | #endif 139 | -------------------------------------------------------------------------------- /src/url_tools.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu May 14 15:58:37 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __URL_TOOLS_H__ 9 | #define __URL_TOOLS_H__ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #define MAX_URL_LEN 1024 17 | #define MAX_SITE_LEN 256 18 | #define MAX_PATH_LEN 1024 19 | #define MAX_PORT_LEN 7 20 | 21 | int delete_inter(char * str); 22 | 23 | int parse_url(const char *input, char *site,size_t site_size, 24 | char *port, size_t port_size,char *path, size_t max_path_size); 25 | uint64_t MurmurHash64A(const void * key, int len, unsigned int seed); 26 | 27 | uint64_t get_url_sign64(const char *url); 28 | 29 | int normalize_url(const char* url, char* buf,size_t buf_size); 30 | int isnormalized_url(const char *url); 31 | 32 | const char* fetch_maindomain(const char* site, char* trunk, int trunk_size, 33 | bool recoveryMode = true); 34 | 35 | bool fetch_url_maindomain(const char *url, char *maindomain, int len); 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/util.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Sun 31 May 2015 12:30:57 AM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include "util.h" 10 | 11 | /** 12 | * 将一个IP字符串转换为unsigned int 13 | * @param ip 转换后的数值 14 | * @retval 非0 转换成功 15 | * @retval 0 转换失败 16 | */ 17 | int ip_aton (const char *ipstring, unsigned int *ip) 18 | { 19 | in_addr l_inp; 20 | int ret; 21 | 22 | if (strcmp (ipstring, "0") == 0) 23 | { 24 | *ip = 0; 25 | return 1; 26 | } 27 | 28 | ret = inet_pton (AF_INET, ipstring, &l_inp); 29 | 30 | *ip = l_inp.s_addr; 31 | 32 | if (ret <= 0) 33 | { 34 | return 0; 35 | } 36 | else 37 | { 38 | return ret; 39 | } 40 | } 41 | 42 | /** 43 | * 将整数型IP转换成字符串 44 | * @param ip 45 | * @retval 非0 转换成功 46 | * @retval 0 转换失败 47 | */ 48 | int ip_ntoa (const unsigned int ip, char * ipstring, int len) 49 | { 50 | if ( inet_ntop (AF_INET, &ip, ipstring, len) == NULL) 51 | return 1; 52 | else 53 | return 0; 54 | } 55 | 56 | vector getSockAddr(const char *host) 57 | { 58 | vector addrs; 59 | struct addrinfo hints, *res; 60 | 61 | memset(&hints, 0, sizeof(hints)); 62 | hints.ai_family = AF_INET; 63 | hints.ai_socktype = SOCK_STREAM; 64 | if (0 == getaddrinfo(host, NULL, &hints, &res)) { 65 | if (NULL != res) { 66 | for (struct addrinfo *rp = res; rp != NULL; rp = rp->ai_next) { 67 | unsigned long result = ((sockaddr_in*)rp->ai_addr)->sin_addr.s_addr; 68 | addrs.push_back(result); 69 | } 70 | freeaddrinfo(res); 71 | } 72 | } 73 | return addrs; 74 | } 75 | 76 | vector getip(const char *host) 77 | { 78 | vector result; 79 | vector addrs = getSockAddr(host); 80 | if (0 == addrs.size()) { 81 | fprintf (stderr, "warn: %s:%d getSockAddr %s\n", __FILE__, __LINE__, host); 82 | } else { 83 | for (int i = 0; i < addrs.size(); i++) { 84 | char ipstring[16] = {'\0'}; 85 | if (0 == ip_ntoa(addrs[i], ipstring, 16)) { 86 | result.push_back(ipstring); 87 | } 88 | } 89 | } 90 | return result; 91 | } 92 | 
93 | /** 94 | * 设置socket为非阻塞 95 | */ 96 | int setnonblocking (int sockfd) 97 | { 98 | // non-blocking 99 | int flags = fcntl (sockfd, F_GETFL, 0); 100 | 101 | if (flags == -1) 102 | { 103 | fprintf (stderr, "error: %s:%d\n", __FILE__, __LINE__); 104 | return -2; 105 | } 106 | if (fcntl (sockfd, F_SETFL, flags | O_NONBLOCK) == -1) 107 | { 108 | fprintf (stderr, "error: %s:%d\n", __FILE__, __LINE__); 109 | return -3; 110 | } 111 | return 0; 112 | } 113 | 114 | std::string& rtrim(std::string &s) 115 | { 116 | return s.erase(s.find_last_not_of(" \t\n\r") + 1); 117 | } 118 | 119 | std::string& ltrim(std::string &s) 120 | { 121 | return s.erase(0, s.find_first_not_of(" \t\n\r")); 122 | } 123 | 124 | int gzcompress(Bytef *data, uLong ndata, 125 | Bytef *zdata, uLong *nzdata) 126 | { 127 | z_stream c_stream; 128 | int err = 0; 129 | 130 | if(data && ndata > 0) { 131 | c_stream.zalloc = NULL; 132 | c_stream.zfree = NULL; 133 | c_stream.opaque = NULL; 134 | //只有设置为MAX_WBITS + 16才能在在压缩文本中带header和trailer 135 | if(deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 136 | MAX_WBITS + 16, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; 137 | c_stream.next_in = data; 138 | c_stream.avail_in = ndata; 139 | c_stream.next_out = zdata; 140 | c_stream.avail_out = *nzdata; 141 | while(c_stream.avail_in != 0 && c_stream.total_out < *nzdata) { 142 | if(deflate(&c_stream, Z_NO_FLUSH) != Z_OK) return -1; 143 | } 144 | if(c_stream.avail_in != 0) return c_stream.avail_in; 145 | for(;;) { 146 | if((err = deflate(&c_stream, Z_FINISH)) == Z_STREAM_END) break; 147 | if(err != Z_OK) return -1; 148 | } 149 | if(deflateEnd(&c_stream) != Z_OK) return -1; 150 | *nzdata = c_stream.total_out; 151 | return 0; 152 | } 153 | return -1; 154 | } 155 | 156 | int gzdecompress(Byte *zdata, uLong nzdata, 157 | Byte *data, uLong *ndata) 158 | { 159 | int err = 0; 160 | z_stream d_stream = {0}; /* decompression stream */ 161 | static char dummy_head[2] = { 162 | 0x8 + 0x7 * 0x10, 163 | (((0x8 + 0x7 * 0x10) * 
0x100 + 30) / 31 * 31) & 0xFF, 164 | }; 165 | d_stream.zalloc = NULL; 166 | d_stream.zfree = NULL; 167 | d_stream.opaque = NULL; 168 | d_stream.next_in = zdata; 169 | d_stream.avail_in = 0; 170 | d_stream.next_out = data; 171 | //只有设置为MAX_WBITS + 16才能在解压带header和trailer的文本 172 | if(inflateInit2(&d_stream, MAX_WBITS + 16) != Z_OK) { 173 | return -1; 174 | } 175 | while(d_stream.total_out < *ndata && d_stream.total_in < nzdata) { 176 | d_stream.avail_in = d_stream.avail_out = 1; /* force small buffers */ 177 | if((err = inflate(&d_stream, Z_NO_FLUSH)) == Z_STREAM_END) break; 178 | if(err != Z_OK) { 179 | if(err == Z_DATA_ERROR) { 180 | d_stream.next_in = (Bytef*) dummy_head; 181 | d_stream.avail_in = sizeof(dummy_head); 182 | if((err = inflate(&d_stream, Z_NO_FLUSH)) != Z_OK) { 183 | inflateEnd(&d_stream); 184 | return -2; 185 | } 186 | } else { 187 | inflateEnd(&d_stream); 188 | return -3; 189 | } 190 | } 191 | } 192 | if(inflateEnd(&d_stream) != Z_OK) return -4; 193 | *ndata = d_stream.total_out; 194 | return 0; 195 | } 196 | 197 | void hex2dec(const char *hexStr, char *decStr) 198 | { 199 | int a; 200 | sscanf(hexStr, "%x", &a); 201 | sprintf(decStr, "%d", a); 202 | } 203 | 204 | bool isNonAsciiUTF8(const char* str, int &len) { 205 | if (*str == 0) { 206 | return false; 207 | } 208 | 209 | unsigned char chr = *str; 210 | 211 | if(chr>=0x80) 212 | { 213 | if(chr>=0xFC&&chr<=0xFD) 214 | len=6; 215 | else if(chr>=0xF8) 216 | len=5; 217 | else if(chr>=0xF0) 218 | len=4; 219 | else if(chr>=0xE0) 220 | len=3; 221 | else if(chr>=0xC0) 222 | len=2; 223 | else 224 | { 225 | return false; 226 | } 227 | } 228 | 229 | 230 | for(int i=1; i(source_str); 274 | size_t outlen = 4 * inlen; 275 | char *out = (char*)malloc(outlen); 276 | memset(out, '\0', outlen); 277 | if (NULL == out) { 278 | iconv_close(cd); 279 | return false; 280 | } 281 | char *pout = out; 282 | if ((size_t)-1 == iconv(cd, &in, &inlen, &pout, &outlen)) { 283 | iconv_close(cd); 284 | free(out); 285 | return 
false; 286 | } else { 287 | target_str = std::string(out); 288 | } 289 | free(out); 290 | iconv_close(cd); 291 | return true; 292 | } else { 293 | return false; 294 | } 295 | } 296 | } 297 | 298 | bool to_utf8(std::string &str) 299 | { 300 | return to_utf8(str.c_str(), str.length(), str); 301 | } 302 | 303 | size_t get_executable_path( char* processdir,char* processname, size_t len) 304 | { 305 | char* path_end; 306 | if(readlink("/proc/self/exe", processdir,len) <=0) 307 | return -1; 308 | path_end = strrchr(processdir, '/'); 309 | if(path_end == NULL) 310 | return -1; 311 | ++path_end; 312 | strcpy(processname, path_end); 313 | *path_end = '\0'; 314 | return (size_t)(path_end - processdir); 315 | } 316 | 317 | string get_backtrace_line(int nptrs, void *buffer[100], const char *program) 318 | { 319 | string result; 320 | char cmd[512]; 321 | int len = snprintf(cmd, sizeof(cmd), 322 | "addr2line -ifsC -e %s", program); 323 | char *p = cmd + len; 324 | size_t s = sizeof(cmd) - len; 325 | for(int i = 0; i < nptrs; ++i) { 326 | if(s > 0) { 327 | len = snprintf(p, s, " %p", buffer[i]); 328 | p += len; 329 | s -= len; 330 | } 331 | } 332 | FILE *fp; 333 | char buf[128]; 334 | if((fp = popen(cmd, "r"))) { 335 | while(fgets(buf, sizeof(buf), fp)) { 336 | buf[strlen(buf)-1] = '\0'; 337 | result += "["; 338 | result += buf; 339 | result += "] "; 340 | } 341 | pclose(fp); 342 | } 343 | 344 | return result; 345 | } 346 | 347 | string get_backtrace() 348 | { 349 | char path[MAX_PATH_LEN] = {'\0'}; 350 | char processname[1024] = {'\0'}; 351 | get_executable_path(path, processname, sizeof(path)); 352 | string processpath = string(path) + string(processname); 353 | 354 | int nptrs; 355 | void *buffer[100]; 356 | char **strings; 357 | nptrs = backtrace(buffer, 100); 358 | strings = backtrace_symbols(buffer, nptrs); 359 | string result = get_backtrace_line(nptrs, buffer, processpath.c_str()); 360 | free(strings); 361 | return result; 362 | } 363 | 
-------------------------------------------------------------------------------- /src/util.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu May 14 15:58:37 2015 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #ifndef __UTIL_H__ 9 | #define __UTIL_H__ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #define MAX_PATH_LEN 1024 25 | 26 | using std::string; 27 | using std::vector; 28 | 29 | // 求时间差,单位毫秒 30 | #define TIME_DIFF(start, end) \ 31 | ((end.tv_sec-start.tv_sec)*1000+(end.tv_usec-start.tv_usec)/1000) 32 | 33 | int ip_aton (const char *ipstring, unsigned int *ip); 34 | 35 | int ip_ntoa (const unsigned int ip, char * ipstring, int len); 36 | 37 | vector getSockAddr(const char *host); 38 | 39 | vector getip(const char *host); 40 | 41 | int setnonblocking (int sockfd); 42 | 43 | string& rtrim(string &s); 44 | 45 | string& ltrim(string &s); 46 | 47 | int gzcompress(Bytef *data, uLong ndata, 48 | Bytef *zdata, uLong *nzdata); 49 | 50 | int gzdecompress(Byte *zdata, uLong nzdata, 51 | Byte *data, uLong *ndata); 52 | 53 | void hex2dec(const char *hexStr, char *decStr); 54 | 55 | bool isNonAsciiUTF8(const char* str, int &len); 56 | 57 | bool is_utf8(const char* str); 58 | 59 | bool to_utf8(const char* source_str, const size_t len, string & target_str); 60 | 61 | bool to_utf8(string &str); 62 | 63 | size_t get_executable_path( char* processdir,char* processname, size_t len); 64 | 65 | string get_backtrace_line(int nptrs, void *buffer[100], const char *program); 66 | 67 | string get_backtrace(); 68 | 69 | #endif 70 | 
-------------------------------------------------------------------------------- /stamp-h1: -------------------------------------------------------------------------------- 1 | timestamp for config.h 2 | -------------------------------------------------------------------------------- /test/Makefile: -------------------------------------------------------------------------------- 1 | all : test_util test_qtextcodec test_backtrace 2 | 3 | test_util : test_util.o util.o 4 | g++ -g -o test_util test_util.o util.o -lz 5 | 6 | test_qtextcodec : test_qtextcodec.o 7 | g++ -g -o test_qtextcodec test_qtextcodec.o -L /usr/lib64/ -lQt5Core 8 | 9 | test_backtrace : test_backtrace.o 10 | g++ -g -o test_backtrace test_backtrace.o -rdynamic -ldl 11 | 12 | 13 | 14 | 15 | test_util.o : test_util.cpp 16 | g++ -c -g -o test_util.o test_util.cpp -I ../src/ 17 | 18 | util.o : ../src/util.cpp 19 | g++ -c -g -o util.o ../src/util.cpp 20 | 21 | test_qtextcodec.o : test_qtextcodec.cpp 22 | g++ -c -g -o test_qtextcodec.o test_qtextcodec.cpp -I /usr/include/qt5/ -fPIC 23 | 24 | test_backtrace.o : test_backtrace.cpp 25 | g++ -c -g -o test_backtrace.o test_backtrace.cpp 26 | 27 | clean : 28 | rm -f *.o test_util test_qtextcodec test_backtrace 29 | -------------------------------------------------------------------------------- /test/test_backtrace.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Sun 05 Jul 2015 08:20:24 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | using std::string; 14 | 15 | string getLine(int nptrs, void *buffer[100], char *argv[]) 16 | { 17 | string result; 18 | char cmd[512]; 19 | int len = snprintf(cmd, sizeof(cmd), 20 | "addr2line -ifsC -e %s", argv[0]); 21 | char *p = cmd + len; 22 | size_t s = sizeof(cmd) - len; 23 | for(int i = 0; i < nptrs; ++i) { 24 | if(s > 0) { 25 | len = snprintf(p, s, " %p", buffer[i]); 26 | p += len; 27 | s -= len; 28 | } 29 | } 30 | FILE *fp; 31 | char buf[128]; 32 | printf("%s\n", cmd); 33 | if((fp = popen(cmd, "r"))) { 34 | while(fgets(buf, sizeof(buf), fp)) { 35 | result += buf; 36 | } 37 | pclose(fp); 38 | } 39 | 40 | return result; 41 | } 42 | 43 | string myfunc1(char *argv[]) 44 | { 45 | int nptrs; 46 | void *buffer[100]; 47 | char **strings; 48 | string result; 49 | 50 | nptrs = backtrace(buffer, 100); 51 | 52 | strings = backtrace_symbols(buffer, nptrs); 53 | /* 54 | if (NULL != strings) { 55 | for (int i = 0; i < nptrs; i++) { 56 | result += "{"; 57 | result += strings[i]; 58 | result += "} "; 59 | } 60 | free(strings); 61 | } 62 | */ 63 | 64 | result += getLine(nptrs, buffer, argv); 65 | free(strings); 66 | 67 | return result; 68 | } 69 | 70 | int main(int argc, char *argv[]) 71 | { 72 | string result = myfunc1(argv); 73 | printf("%s\n", result.c_str()); 74 | exit(EXIT_SUCCESS); 75 | } 76 | 77 | -------------------------------------------------------------------------------- /test/test_bson.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Wed 27 May 2015 09:39:35 AM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include "mongo/bson/bson.h" 10 | 11 | using namespace mongo; 12 | 13 | int main(int argc, char *argv[]) 14 | { 15 | BSONObjBuilder data; 16 | data.append("title", "this is title"); 17 | BSONObjBuilder arr; 18 | BSONObjBuilder subdata1; 19 | subdata1.append("anchor", "this is anchor"); 20 | subdata1.append("href", "this is href"); 21 | BSONObjBuilder subdata2; 22 | subdata2.append("anchor", "this is anchor"); 23 | subdata2.append("href", "this is href"); 24 | arr.append("link1", subdata1.obj()); 25 | arr.append("link2", subdata2.obj()); 26 | data.appendArray("links", arr.obj()); 27 | printf("%s\n", data.obj().toString().c_str()); 28 | return 0; 29 | } 30 | -------------------------------------------------------------------------------- /test/test_fetchmaindomain.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Sun 31 May 2015 03:50:09 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include "url_tools.h" 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | std::fstream fin(argv[1]); 15 | std::string ReadLine; 16 | while(getline(fin,ReadLine)) { 17 | const char *url = ReadLine.c_str(); 18 | char host[1024]; 19 | char strport[16]; 20 | char file[1024]; 21 | parse_url(url, host, 1024, strport, 16, file, 1024); 22 | 23 | char trunk[64]; 24 | const char* res = fetch_maindomain(host, trunk, 64); 25 | printf("%s\t\t%s\t\t%s\n", host, res, trunk); 26 | } 27 | return 0; 28 | } 29 | -------------------------------------------------------------------------------- /test/test_keyedqueue.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Fri 29 May 2015 04:51:48 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include "keyed_queue.h" 11 | 12 | struct ElementType { 13 | ElementType() { 14 | DLINK_INITIALIZE(&link); 15 | } 16 | DLINK link; 17 | int key; 18 | }; 19 | 20 | bool func(ElementType *pe, ptr data) 21 | { 22 | printf("%d\n", pe->key); 23 | return true; 24 | } 25 | 26 | int main(int argc, char *argv[]) 27 | { 28 | ptr p; 29 | TKeyedQueue kq; 30 | ElementType e; 31 | ElementType e2; 32 | 33 | e.key = 1; 34 | e2.key = 2; 35 | printf("%d\n", __LINE__); 36 | kq.Enumerate(func, p); 37 | printf("pushbackret=%d\n", kq.push_back(&e)); 38 | printf("pushbackret=%d\n", kq.push_back(&e2)); 39 | printf("%d\n", __LINE__); 40 | kq.Enumerate(func, p); 41 | ElementType *pe = kq.pop_front(); 42 | printf("%d\n", __LINE__); 43 | kq.Enumerate(func, p); 44 | ElementType *e3 = kq.find(1); 45 | kq.remove(2); 46 | printf("%d\n", __LINE__); 47 | kq.Enumerate(func, p); 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /test/test_log.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Wed 27 May 2015 12:53:16 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include "logger_container.h" 10 | 11 | int main(int argc, char *argv[]) 12 | { 13 | LoggerContainer::init(); 14 | std::string str = "hello"; 15 | LOG_F(DEBUG, "%lu", str.length()); 16 | return 0; 17 | } 18 | -------------------------------------------------------------------------------- /test/test_mongo.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Tue 19 May 2015 02:27:33 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include "mongo/client/dbclient.h" 9 | 10 | using namespace mongo; 11 | using namespace std; 12 | 13 | int main(int argc, char *argv[]) 14 | { 15 | DBClientConnection conn; 16 | try { 17 | 18 | // 连接 19 | conn.connect("localhost:27017"); 20 | 21 | // 查询sql 22 | auto_ptr cursor = conn.query("mydb.user"); 23 | while ( cursor->more() ) { 24 | BSONObj obj = cursor->next(); 25 | if( !obj.isEmpty() ) { 26 | printf("%s\n", obj.toString().c_str()); 27 | } 28 | } 29 | 30 | // 查询指定id 31 | BSONObj obj = conn.findOne( "mydb.user", BSONObjBuilder().append( "_id" , "1234" ).obj()); 32 | if( !obj.isEmpty() ){ 33 | string value = obj.getStringField("value"); 34 | printf("findOne value=%s\n", value.c_str()); 35 | } 36 | 37 | // 删除 38 | conn.remove( "mydb.user", BSONObjBuilder().append( "_id" , "1234" ).obj()); 39 | 40 | // 插入 41 | mongo::BSONObjBuilder b; 42 | b.append( "_id", "1234" ); 43 | b.append( "value", "hello" ); 44 | conn.update( "mydb.user", BSONObjBuilder().append( "_id" , "1234" ).obj(), b.obj(), true ); 45 | } catch (DBException &e) { 46 | 
printf("%s\n", e.what()); 47 | } catch ( std::exception& e ) { 48 | printf("MONGO Exception(set): %s", e.what()); 49 | return -1; 50 | } catch( ... ){ 51 | printf("MONGO Exception(set): NULL"); 52 | return -1; 53 | } 54 | return 0; 55 | } 56 | -------------------------------------------------------------------------------- /test/test_mysql.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Thu 28 May 2015 02:03:54 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | int main(int argc, char *argv[]) 18 | { 19 | uint64_t v = 18142234485787147589; 20 | char value[24]; 21 | sprintf(value, "%llu", v); 22 | printf("%llu %s\n", v, value); 23 | return 0; 24 | QSqlDatabase db = QSqlDatabase::addDatabase("QMYSQL"); 25 | db.setDatabaseName("lspider"); 26 | db.setHostName("localhost"); 27 | db.setUserName("root"); 28 | db.setPassword(""); 29 | if (!db.open()) { 30 | QSqlError err = db.lastError(); 31 | printf("db.open fail errtype=%d\n", err.type()); 32 | return -1; 33 | } 34 | 35 | QString selectCmdStr = "SELECT sign, url, maindomain, ip, prelink, preanchor, weight, linkdepth, crawlstate, crawlretry, hub, fresh, foundtime, crawledtime FROM link" 36 | " WHERE link.crawlstate=0 OR hub=true" 37 | " ORDER BY link.linkdepth"; 38 | QSqlQuery query(selectCmdStr, db); 39 | QSqlRecord rec = query.record(); 40 | 41 | if (query.isActive()) { 42 | while (query.next()) { 43 | QSqlRecord rec = query.record(); 44 | printf("%llu\t%f\t%s\t%s\n", 45 | (uint64_t)rec.value("sign").value(), 46 | rec.value("sign").toDouble(), 47 | rec.value("sign").toString().toStdString().c_str(), 48 | 
rec.value("url").toString().toStdString().c_str()); 49 | } 50 | } 51 | 52 | db.close(); 53 | QSqlDatabase::removeDatabase("QMYSQL"); 54 | 55 | 56 | return 0; 57 | } 58 | -------------------------------------------------------------------------------- /test/test_priorityqueue.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Wed 03 Jun 2015 11:20:51 AM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | 14 | struct node { 15 | int priority; 16 | float value; 17 | }; 18 | 19 | struct comp 20 | { 21 | bool operator() (node *& n1, node *& n2) 22 | { 23 | return n1->priority < n2->priority; 24 | } 25 | }; 26 | 27 | 28 | int main(int argc, char *argv[]) 29 | { 30 | //priority_queue, mygreater > q; 31 | priority_queue, comp> q; 32 | node n1 = {3, 13.0}; 33 | node n2 = {2, 2.0}; 34 | node n3 = {5, 35.0}; 35 | node n4 = {9, 19.0}; 36 | node n5 = {2, 992.0}; 37 | q.push(&n1); 38 | q.push(&n2); 39 | q.push(&n3); 40 | q.push(&n4); 41 | q.push(&n5); 42 | while (!q.empty()) 43 | { 44 | node *t = q.top(); 45 | printf("%d %f\n", t->priority, t->value); 46 | q.pop(); 47 | } 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /test/test_qtextcodec.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Wed 17 Jun 2015 04:28:25 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | 11 | int main(int argc, char *argv[]) 12 | { 13 | QString str = "What's your name"; 14 | QString str2 = str.replace('\'', "\'"); 15 | printf("%s\n", str2.toStdString().c_str()); 16 | return 0; 17 | } 18 | -------------------------------------------------------------------------------- /test/test_qweb.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Sun 24 May 2015 06:01:49 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | class MainWindow : public QMainWindow 17 | { 18 | Q_OBJECT 19 | 20 | public: 21 | MainWindow(const QUrl& url) { 22 | //QNetworkProxyFactory::setUseSystemConfiguration(true); 23 | view = new QWebView(this); 24 | view->load(url); 25 | connect(view, SIGNAL(loadFinished(bool)), SLOT(adjustLocation())); 26 | } 27 | 28 | virtual ~MainWindow(){} 29 | 30 | protected slots: 31 | 32 | void adjustLocation() { 33 | printf("haha\n"); 34 | } 35 | 36 | private: 37 | QWebView *view; 38 | }; 39 | 40 | int main(int argc, char * argv[]) 41 | { 42 | QApplication app(argc, argv); 43 | QUrl url; 44 | if (argc > 1) 45 | url = QUrl(argv[1]); 46 | else 47 | url = QUrl("http://www.google.com/ncr"); 48 | MainWindow *browser = new MainWindow(url); 49 | browser->show(); 50 | return app.exec(); 51 | } 52 | -------------------------------------------------------------------------------- /test/test_urlnormalize.cpp: 
-------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Sun 31 May 2015 10:02:16 AM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include "url_tools.h" 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | std::fstream fin(argv[1]); 15 | std::string ReadLine; 16 | while(getline(fin,ReadLine)) { 17 | const char *url = ReadLine.c_str(); 18 | //printf("%s\n", url); 19 | //printf("%d\n", isnormalized_url(url)); 20 | //if (0 == isnormalized_url(url)) { 21 | char buf[4096]; 22 | normalize_url(url, buf, 4096); 23 | if (strcmp(url, buf) != 0) { 24 | printf("%s to %s %d to %d\n", url, buf, isnormalized_url(url), isnormalized_url(buf)); 25 | } 26 | // } 27 | } 28 | /* 29 | char host[64]; 30 | char strport[64]; 31 | char file[64]; 32 | if (1 == parse_url("https://www.baidu.com:333/index.html", 33 | host, 34 | MAX_SITE_LEN, 35 | strport, 36 | MAX_PORT_LEN, 37 | file, 38 | MAX_PATH_LEN)) { 39 | } 40 | printf("host=%s strport=%s file=%s\n", host, strport, file); 41 | */ 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /test/test_util.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015, LiChuang. All rights reserved. 
3 | * Author: lichuang(whlichuang@126.com) 4 | * Created Time: Sat 13 Jun 2015 12:55:53 PM CST 5 | * Description: 6 | ************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | //#include "util.h" 23 | 24 | using std::string; 25 | using std::vector; 26 | 27 | int ip_ntoaTest (const unsigned int ip, char * ipstring, int len) 28 | { 29 | if ( inet_ntop (AF_INET, &ip, ipstring, len) == NULL) 30 | return 1; 31 | else 32 | return 0; 33 | } 34 | 35 | vector getSockAddrTest(const char *host) 36 | { 37 | vector addrs; 38 | struct addrinfo hints, *res; 39 | 40 | memset(&hints, 0, sizeof(hints)); 41 | hints.ai_family = AF_INET; 42 | hints.ai_socktype = SOCK_STREAM; 43 | if (0 == getaddrinfo(host, NULL, &hints, &res)) { 44 | if (NULL != res) { 45 | for (struct addrinfo *rp = res; rp != NULL; rp = rp->ai_next) { 46 | unsigned long result = ((sockaddr_in*)rp->ai_addr)->sin_addr.s_addr; 47 | addrs.push_back(result); 48 | } 49 | freeaddrinfo(res); 50 | } 51 | } 52 | return addrs; 53 | } 54 | 55 | vector getipTest(const char *host) 56 | { 57 | vector result; 58 | vector addrs = getSockAddrTest(host); 59 | if (0 == addrs.size()) { 60 | fprintf (stderr, "warn: %s:%d getSockAddrTest %s\n", __FILE__, __LINE__, host); 61 | } else { 62 | for (int i = 0; i < addrs.size(); i++) { 63 | char ipstring[16] = {'\0'}; 64 | if (0 == ip_ntoaTest(addrs[i], ipstring, 16)) { 65 | result.push_back(ipstring); 66 | } 67 | } 68 | } 69 | return result; 70 | } 71 | 72 | int main(int argc, char *argv[]) 73 | { 74 | vector ips = getipTest("facebook.cn"); 75 | for (int i = 0; i < ips.size(); ++i) { 76 | printf("ip=%s\n", ips[i].c_str()); 77 | } 78 | return 0; 79 | } 80 | --------------------------------------------------------------------------------