├── .cproject ├── .project ├── Debug ├── makefile ├── module │ └── subdir.mk ├── objects.mk ├── sources.mk ├── src │ └── subdir.mk └── subdir.mk ├── module ├── HtmlParser.cpp ├── HtmlParser.h ├── HtmlParserMod.cpp ├── HtmlParserMod.h ├── SaveBase.cpp ├── SaveBase.h ├── SaveHtml.cpp ├── SaveHtml.h ├── SaveImage.cpp └── SaveImage.h ├── spider.conf └── src ├── DownLoader.cpp ├── DownLoader.h ├── DsoManager.cpp ├── DsoManager.h ├── EpollManager.cpp ├── EpollManager.h ├── HttpParser.cpp ├── HttpParser.h ├── Socket.cpp ├── Socket.h ├── SpiderApp.cpp ├── SpiderApp.h ├── Url.cpp ├── Url.h ├── UrlManager.cpp ├── UrlManager.h ├── confparser.cpp ├── confparser.h ├── spider.cpp ├── spider.h └── testmodule.cpp /.cproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 30 | 31 | 35 | 36 | 37 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 84 | 85 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | spider 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.cdt.managedbuilder.core.genmakebuilder 10 | clean,full,incremental, 11 | 12 | 13 | 14 | 15 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder 16 | full,incremental, 17 | 18 | 19 | 20 | 21 | 22 | org.eclipse.cdt.core.cnature 23 | org.eclipse.cdt.core.ccnature 24 | org.eclipse.cdt.managedbuilder.core.managedBuildNature 25 
| org.eclipse.cdt.managedbuilder.core.ScannerConfigNature 26 | 27 | 28 |
--------------------------------------------------------------------------------
/Debug/makefile:
--------------------------------------------------------------------------------
################################################################################
# Automatically-generated file. Do not edit!
# (Produced by the Eclipse CDT managed builder named in .project; mirror any
# manual change in the project's build settings or regeneration will undo it.)
################################################################################

-include ../makefile.init

RM := rm -rf

# All of the sources participating in the build are defined here
-include sources.mk
-include src/subdir.mk
-include module/subdir.mk
-include subdir.mk
-include objects.mk

# Pull in the compiler-generated dependency files (*.d) unless the only goal is
# "clean", in which case they are about to be removed anyway.
ifneq ($(MAKECMDGOALS),clean)
ifneq ($(strip $(CC_DEPS)),)
-include $(CC_DEPS)
endif
ifneq ($(strip $(C++_DEPS)),)
-include $(C++_DEPS)
endif
ifneq ($(strip $(C_UPPER_DEPS)),)
-include $(C_UPPER_DEPS)
endif
ifneq ($(strip $(CXX_DEPS)),)
-include $(CXX_DEPS)
endif
ifneq ($(strip $(CPP_DEPS)),)
-include $(CPP_DEPS)
endif
ifneq ($(strip $(C_DEPS)),)
-include $(C_DEPS)
endif
endif

-include ../makefile.defs

# Add inputs and outputs from these tool invocations to the build variables

# All Target
all: spider

# Tool invocations
# The libraries are listed AFTER the objects: the linker resolves symbols left
# to right, so a -l option placed before the objects that need it can be
# discarded (notably with --as-needed toolchains), leaving dlopen/libevent
# symbols undefined.
spider: $(OBJS) $(USER_OBJS)
	@echo 'Building target: $@'
	@echo 'Invoking: GCC C++ Linker'
	g++ -o "spider" $(OBJS) $(USER_OBJS) $(LIBS) -levent -ldl
	@echo 'Finished building target: $@'
	@echo ' '

# Other Targets
# Each list is a separate word so that the last file of one variable can never
# be fused with the first file of the next (e.g. "./EpollManager.o./src/DownLoader.d"),
# which would leave both files behind. The leading '-' keeps "clean" going when
# there is nothing to delete.
clean:
	-$(RM) $(CC_DEPS) $(C++_DEPS) $(EXECUTABLES) $(C_UPPER_DEPS) $(CXX_DEPS) $(OBJS) $(CPP_DEPS) $(C_DEPS) spider
	-@echo ' '

.PHONY: all clean dependents
.SECONDARY:

-include ../makefile.targets
--------------------------------------------------------------------------------
/Debug/module/subdir.mk:
--------------------------------------------------------------------------------
################################################################################
# Automatically-generated file. Do not edit!
################################################################################

# Add inputs and outputs from these tool invocations to the build variables
CPP_SRCS += \
../module/HtmlParser.cpp \
../module/HtmlParserMod.cpp \
../module/SaveBase.cpp \
../module/SaveHtml.cpp \
../module/SaveImage.cpp

OBJS += \
./module/HtmlParser.o \
./module/HtmlParserMod.o \
./module/SaveBase.o \
./module/SaveHtml.o \
./module/SaveImage.o

CPP_DEPS += \
./module/HtmlParser.d \
./module/HtmlParserMod.d \
./module/SaveBase.d \
./module/SaveHtml.d \
./module/SaveImage.d


# Each subdirectory must supply rules for building sources it contributes
# Compile one module source into module/<name>.o and emit module/<name>.d next
# to it (-MMD -MF). -MT names the OBJECT as the target of the generated
# dependency rule, so editing a header rebuilds the object; naming the .d file
# there would make header changes invisible to make. -MP adds phony targets for
# the headers so a deleted header does not break the build. No -l options here:
# this is a compile-only (-c) step and libraries belong on the link line.
module/%.o: ../module/%.cpp
	@echo 'Building file: $<'
	@echo 'Invoking: GCC C++ Compiler'
	g++ -O0 -g3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<"
	@echo 'Finished building: $<'
	@echo ' '


--------------------------------------------------------------------------------
/Debug/objects.mk:
--------------------------------------------------------------------------------
1 | ################################################################################ 2 | # Automatically-generated file. Do not edit!
3 | ################################################################################ 4 | 5 | USER_OBJS := 6 | 7 | LIBS := 8 | 9 | -------------------------------------------------------------------------------- /Debug/sources.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | C_UPPER_SRCS := 6 | CXX_SRCS := 7 | C++_SRCS := 8 | OBJ_SRCS := 9 | CC_SRCS := 10 | ASM_SRCS := 11 | CPP_SRCS := 12 | C_SRCS := 13 | O_SRCS := 14 | S_UPPER_SRCS := 15 | CC_DEPS := 16 | C++_DEPS := 17 | EXECUTABLES := 18 | C_UPPER_DEPS := 19 | CXX_DEPS := 20 | OBJS := 21 | CPP_DEPS := 22 | C_DEPS := 23 | 24 | # Every subdirectory with source files must be described here 25 | SUBDIRS := \ 26 | src \ 27 | module \ 28 | 29 | -------------------------------------------------------------------------------- /Debug/src/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 
################################################################################

# Add inputs and outputs from these tool invocations to the build variables
CPP_SRCS += \
../src/DownLoader.cpp \
../src/DsoManager.cpp \
../src/EpollManager.cpp \
../src/HttpParser.cpp \
../src/Socket.cpp \
../src/SpiderApp.cpp \
../src/Url.cpp \
../src/UrlManager.cpp \
../src/confparser.cpp \
../src/spider.cpp \
../src/testmodule.cpp

OBJS += \
./src/DownLoader.o \
./src/DsoManager.o \
./src/EpollManager.o \
./src/HttpParser.o \
./src/Socket.o \
./src/SpiderApp.o \
./src/Url.o \
./src/UrlManager.o \
./src/confparser.o \
./src/spider.o \
./src/testmodule.o

CPP_DEPS += \
./src/DownLoader.d \
./src/DsoManager.d \
./src/EpollManager.d \
./src/HttpParser.d \
./src/Socket.d \
./src/SpiderApp.d \
./src/Url.d \
./src/UrlManager.d \
./src/confparser.d \
./src/spider.d \
./src/testmodule.d


# Each subdirectory must supply rules for building sources it contributes
# Compile one source from ../src into src/<name>.o and emit src/<name>.d next to
# it (-MMD -MF). -MT names the OBJECT as the target of the generated dependency
# rule, so editing a header rebuilds the object; naming the .d file there would
# make header changes invisible to make. -MP adds phony targets for the headers
# so a deleted header does not break the build. No -l options here: this is a
# compile-only (-c) step and libraries belong on the link line.
src/%.o: ../src/%.cpp
	@echo 'Building file: $<'
	@echo 'Invoking: GCC C++ Compiler'
	g++ -O0 -g3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<"
	@echo 'Finished building: $<'
	@echo ' '


--------------------------------------------------------------------------------
/Debug/subdir.mk:
--------------------------------------------------------------------------------
1 | ################################################################################ 2 | # Automatically-generated file. Do not edit!
3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../EpollManager.cpp 8 | 9 | OBJS += \ 10 | ./EpollManager.o 11 | 12 | CPP_DEPS += \ 13 | ./EpollManager.d 14 | 15 | 16 | # Each subdirectory must supply rules for building sources it contributes 17 | %.o: ../%.cpp 18 | @echo 'Building file: $<' 19 | @echo 'Invoking: GCC C++ Compiler' 20 | g++ -O0 -g3 -Wall -c -fmessage-length=0 -std=c++0x -levent -ldl -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o "$@" "$<" 21 | @echo 'Finished building: $<' 22 | @echo ' ' 23 | 24 | 25 | -------------------------------------------------------------------------------- /module/HtmlParser.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * HtmlParser.cpp 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "../module/HtmlParser.h" 9 | 10 | HtmlParser::HtmlParser() 11 | { 12 | // TODO Auto-generated constructor stub 13 | 14 | } 15 | 16 | HtmlParser::~HtmlParser() 17 | { 18 | // TODO Auto-generated destructor stub 19 | } 20 | 21 | std::list HtmlParser::getUrls(std::string page) 22 | { 23 | 24 | } 25 | -------------------------------------------------------------------------------- /module/HtmlParser.h: -------------------------------------------------------------------------------- 1 | /* 2 | * HtmlParser.h 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef HTMLPARSER_H_ 9 | #define HTMLPARSER_H_ 10 | 11 | #include 12 | #include 13 | 14 | class HtmlParser 15 | { 16 | public: 17 | HtmlParser(); 18 | ~HtmlParser(); 19 | 20 | public: //公有接口 21 | std::list getUrls(std::string page); 22 | 23 | private: //私有接口 24 | }; 25 | 26 | #endif /* HTMLPARSER_H_ */ 27 | -------------------------------------------------------------------------------- /module/HtmlParserMod.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * HtmlParserMod.cpp 3 | * 4 | * Created on: 2015年6月16日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "HtmlParserMod.h" 9 | #include "HtmlParser.h" 10 | 11 | void init(Module *module) 12 | { 13 | 14 | } 15 | 16 | int handle(void *data) 17 | { 18 | htmlParserParam *lparam = (htmlParserParam *)data; 19 | HtmlParser parser; 20 | lparam->urls = parser.getUrls(lparam->url); 21 | if (lparam->urls.size() == 0) 22 | return MODULE_ERR; 23 | return MODULE_OK; 24 | } 25 | -------------------------------------------------------------------------------- /module/HtmlParserMod.h: -------------------------------------------------------------------------------- 1 | /* 2 | * HtmlParserMod.h 3 | * 4 | * Created on: 2015年6月16日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef HTMLPARSERMOD_H_ 9 | #define HTMLPARSERMOD_H_ 10 | 11 | #include "../src/DsoManager.h" 12 | //#include "../src" 13 | 14 | extern void init(Module *module); //初始化模块操作 15 | extern int handle(void *data); //模块功能函数 16 | 17 | #endif /* HTMLPARSERMOD_H_ */ 18 | -------------------------------------------------------------------------------- /module/SaveBase.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SaveBase.cpp 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "../module/SaveBase.h" 9 | 10 | SaveBase::SaveBase() 11 | { 12 | // TODO Auto-generated constructor stub 13 | 14 | } 15 | 16 | SaveBase::~SaveBase() 17 | { 18 | // TODO Auto-generated destructor stub 19 | cout << "test git" << endl; 20 | } 21 | 22 | -------------------------------------------------------------------------------- /module/SaveBase.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SaveBase.h 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef SAVEBASE_H_ 9 | #define SAVEBASE_H_ 10 | 11 | class SaveBase 12 | { 
13 | public: 14 | SaveBase(); 15 | virtual ~SaveBase(); 16 | 17 | public: 18 | int setFilePath(); 19 | virtual int save() = 0; //子类重写 20 | int setFileData(); 21 | }; 22 | 23 | #endif /* SAVEBASE_H_ */ 24 | -------------------------------------------------------------------------------- /module/SaveHtml.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SaveHtml.cpp 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "../module/SaveHtml.h" 9 | 10 | SaveHtml::SaveHtml() 11 | { 12 | // TODO Auto-generated constructor stub 13 | 14 | } 15 | 16 | SaveHtml::~SaveHtml() 17 | { 18 | // TODO Auto-generated destructor stub 19 | } 20 | 21 | -------------------------------------------------------------------------------- /module/SaveHtml.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SaveHtml.h 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef SAVEHTML_H_ 9 | #define SAVEHTML_H_ 10 | 11 | #include "../module/SaveBase.h" 12 | 13 | class SaveHtml: public SaveBase 14 | { 15 | public: 16 | SaveHtml(); 17 | virtual ~SaveHtml(); 18 | 19 | public: 20 | 21 | private: 22 | 23 | }; 24 | 25 | #endif /* SAVEHTML_H_ */ 26 | -------------------------------------------------------------------------------- /module/SaveImage.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SaveImage.cpp 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "../module/SaveImage.h" 9 | 10 | SaveImage::SaveImage() 11 | { 12 | // TODO Auto-generated constructor stub 13 | 14 | } 15 | 16 | SaveImage::~SaveImage() 17 | { 18 | // TODO Auto-generated destructor stub 19 | } 20 | 21 | -------------------------------------------------------------------------------- /module/SaveImage.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SaveImage.h 3 | * 
4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef SAVEIMAGE_H_ 9 | #define SAVEIMAGE_H_ 10 | 11 | #include "../module/SaveBase.h" 12 | 13 | class SaveImage: public SaveBase 14 | { 15 | public: 16 | SaveImage(); 17 | virtual ~SaveImage(); 18 | }; 19 | 20 | #endif /* SAVEIMAGE_H_ */ 21 | -------------------------------------------------------------------------------- /spider.conf: -------------------------------------------------------------------------------- 1 | # Max number of task_threads in parallel. Each thread fetches an ourl from ourl_queue 2 | # and crawls webpage and generates more threads according to cur_thread_num. 3 | # To adapt depending on your network 4 | max_job_num=1 5 | 6 | # From which urls to start job. 7 | # Comma separated if you have more than one seed. 8 | seeds=http://www.imeiding.com 9 | #seeds=http://hi.baidu.com/qteqpid_pku 10 | 11 | # If include_prefixes is set, we only crawl the urls that match 12 | #include_prefixes=hi.baidu.com/qteqpid_pku/item 13 | 14 | # If exclude_prefixes is set, the urls that match will NOT be crawled 15 | #exclude_prefixes=www.imeiding.com/user 16 | 17 | # When daemonized, the process's output will be logged in logfile rather than console 18 | logfile=spiderq.log 19 | 20 | # Set the level to log. The possible values are as follows: 21 | # 0 DEBUG 22 | # 1 INFO 23 | # 2 WARN 24 | # 3 ERROR 25 | # 4 CRIT 26 | # Spider only logs messages whose level is greater than or equal to log_level. 27 | # That means if you set log_level 0 here, you will get all logs. 28 | log_level=0 29 | 30 | # How deep do you want to go from seeds. If 0, we only crawl seeds and exit. 31 | # Comment out the following line if you want to go as deep as possible 32 | max_depth=0 33 | 34 | # The interval (in seconds) at which to print stat data. 35 | # If you need it, just uncomment the following line 36 | #stat_interval=2 37 | 38 | # How to save the crawled pages. Yes means respect sites hierarchy. 39 | # NOT supported yet!!! 
40 | #make_hostdir=yes 41 | 42 | # Dynamic Shared Object (DSO) Support 43 | # The directory where the modules (.so files) are located. 44 | module_path=/etc/spider/modules/ 45 | 46 | # Which modules to load, one per line. 47 | # The source code of all available modules is in the modules directory. 48 | # They are all compiled to .so files and copied to ${module_path} during make 49 | load_module=savehtml 50 | load_module=saveimage 51 | load_module=maxdepth 52 | load_module=domainlimit 53 | load_module=headerfilter 54 | 55 | 56 | # Specify which types of resources we accept, one per line. 57 | # text/html is accepted by default 58 | accept_types=image/jpeg 59 | -------------------------------------------------------------------------------- /src/DownLoader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * DownLoader.cpp 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "DownLoader.h" 9 | #include "EpollManager.h" 10 | 11 | DownLoader::DownLoader() 12 | { 13 | // TODO Auto-generated constructor stub 14 | 15 | } 16 | 17 | DownLoader::~DownLoader() 18 | { 19 | // TODO Auto-generated destructor stub 20 | } 21 | 22 | -------------------------------------------------------------------------------- /src/DownLoader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * DownLoader.h 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef DOWNLOADER_H_ 9 | #define DOWNLOADER_H_ 10 | 11 | #include 12 | #include 13 | #include "Url.h" 14 | #include "Socket.h" 15 | #include "HttpParser.h" 16 | 17 | class DownLoader 18 | { 19 | public: 20 | DownLoader(); 21 | ~DownLoader(); 22 | 23 | public: 24 | int getResource(URL *urls); //下载资源内容 动作 25 | void *getResContent(); //下载资源内容 26 | 27 | int init(); //初始化 28 | int reinit(); //重新初始化 29 | 30 | int getSockHandle(); //得到Socket句柄 31 | 32 | private: 33 | //socket 分装模块 34 | Socket m_sock; 35 | //http 协议处理模块 36 | HttpParser 
m_httpParser; 37 | 38 | URL *m_url; 39 | }; 40 | 41 | #endif /* DOWNLOADER_H_ */ 42 | -------------------------------------------------------------------------------- /src/DsoManager.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Dso.cpp 3 | * 4 | * Created on: 2015年6月9日 5 | * Author: yjlong 6 | */ 7 | 8 | #include 9 | #include 10 | #include "DsoManager.h" 11 | 12 | DsoManager::DsoManager() 13 | { 14 | // TODO Auto-generated constructor stub 15 | 16 | } 17 | 18 | DsoManager::~DsoManager() 19 | { 20 | // TODO Auto-generated destructor stub 21 | } 22 | 23 | //动态加载编译好的模块 24 | int DsoManager::load(const std::string &path, const std::string &name) 25 | { 26 | Module *module = new Module(); 27 | void *handle = NULL; 28 | std::string filepath = path + name +".so"; 29 | handle = dlopen(filepath.c_str(), RTLD_GLOBAL| RTLD_NOW); 30 | if (handle == NULL) 31 | return MODULE_ERR; 32 | //module->handle = handle; 33 | //int (*phandle)(void *, const char *) = dlsym; 34 | //module->init = phandle; 35 | //module->handle = (int)(*handle)(dlsym(handle , "handle")); 36 | if (module->handle == NULL) 37 | return MODULE_ERR; 38 | dlclose(handle); 39 | 40 | m_modules.insert(std::pair(name, module)); 41 | //m_modules[name] = module; 42 | return 0; 43 | } 44 | 45 | //从模块列表中获得需要的模块 46 | Module * DsoManager::getModule(const std::string &name) 47 | { 48 | //std::map::iterator it; 49 | auto it = m_modules.find(name); 50 | //Module test; 51 | if (it != m_modules.end()) 52 | { 53 | return (*it).second; 54 | } 55 | else 56 | { 57 | return NULL; 58 | } 59 | //return &test; 60 | //return 0; 61 | } 62 | -------------------------------------------------------------------------------- /src/DsoManager.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Dso.h 3 | * 4 | * Created on: 2015年6月9日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef DSOMANAGER_H_ 9 | #define DSOMANAGER_H_ 10 | 11 | #include 12 | #include 13 
| #include 14 | #include 15 | 16 | #define MODULE_OK 0 17 | #define MODULE_ERR 1 18 | 19 | #define MODULE_MAIN_VERSION 1 20 | #define MODULE_SUB_VERSION 0 21 | 22 | //模块描述结构 23 | class Module 24 | { 25 | public: 26 | int version; //主版本号 27 | int subversion; //次版本号 28 | std::string name; //模块名 29 | void (*init)(Module *); //初始化函数指针 30 | int (*handle)(void *); //入口函数指针 31 | }; 32 | 33 | class DsoManager 34 | { 35 | public: 36 | DsoManager(); 37 | ~DsoManager(); 38 | 39 | public: 40 | int load(const std::string &path, const std::string &name); //动态加载编译好的模块 41 | Module * getModule(const std::string &name); //从模块列表中获得需要的模块 42 | 43 | private: 44 | std::map m_modules; 45 | }; 46 | 47 | class htmlParserParam 48 | { 49 | public: 50 | std::string url; //in 51 | std::list urls; //out 52 | }; 53 | 54 | #endif /* DSOMANAGER_H_ */ 55 | -------------------------------------------------------------------------------- /src/EpollManager.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * EpollManager.cpp 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "EpollManager.h" 9 | 10 | EpollManager::EpollManager() 11 | { 12 | // TODO Auto-generated constructor stub 13 | 14 | } 15 | 16 | EpollManager::~EpollManager() 17 | { 18 | // TODO Auto-generated destructor stub 19 | } 20 | 21 | //注册 Socket 句柄 22 | int EpollManager::registHandle(int handle) 23 | { 24 | return 0; 25 | } 26 | //注销句柄 27 | int EpollManager::unregistHandle(int handle) 28 | { 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /src/EpollManager.h: -------------------------------------------------------------------------------- 1 | /* 2 | * EpollManager.h 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef EPOLLMANAGER_H_ 9 | #define EPOLLMANAGER_H_ 10 | 11 | #include 12 | 13 | class EpollManager 14 | { 15 | public: 16 | EpollManager(); //构造函数初始化 Epoll ,得到句柄 17 | 
~EpollManager(); 18 | 19 | public: 20 | //注册 Socket 句柄 21 | int registHandle(int handle); 22 | //注销句柄 23 | int unregistHandle(int handle); 24 | 25 | private: 26 | int m_eHandle; //epool 句柄 27 | int m_tasknum; //维护抓取的最大任务数 28 | }; 29 | 30 | #endif /* EPOLLMANAGER_H_ */ 31 | -------------------------------------------------------------------------------- /src/HttpParser.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Http.cpp 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "HttpParser.h" 9 | 10 | HttpParser::HttpParser() 11 | { 12 | // TODO Auto-generated constructor stub 13 | 14 | } 15 | 16 | HttpParser::~HttpParser() 17 | { 18 | // TODO Auto-generated destructor stub 19 | } 20 | 21 | -------------------------------------------------------------------------------- /src/HttpParser.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Http.h 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef HTTPPARSER_H_ 9 | #define HTTPPARSER_H_ 10 | 11 | #include "Url.h" 12 | 13 | class HttpParser 14 | { 15 | public: 16 | HttpParser(); 17 | ~HttpParser(); 18 | 19 | public: //公共接口 20 | int parserHeader(); //解析头 21 | int createHeader(); 22 | 23 | int getContent(void *buf); //得到下载的内容,传入缓冲区 动态分配内存 24 | int getUrl(URL *url); 25 | 26 | int init(); 27 | int reinit(); 28 | 29 | int updateUrl(); //把从 http 中获取的信息进行更新 30 | 31 | private: 32 | URL m_url; 33 | }; 34 | 35 | #endif /* HTTPPARSER_H_ */ 36 | -------------------------------------------------------------------------------- /src/Socket.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Socket.cpp 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "Socket.h" 9 | 10 | Socket::Socket() 11 | { 12 | // TODO Auto-generated constructor stub 13 | 14 | } 15 | 16 | Socket::~Socket() 17 | { 18 | // TODO Auto-generated 
destructor stub 19 | } 20 | 21 | -------------------------------------------------------------------------------- /src/Socket.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Socket.h 3 | * 4 | * Created on: 2015年6月14日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef SOCKET_H_ 9 | #define SOCKET_H_ 10 | 11 | #include "Url.h" 12 | 13 | class Socket 14 | { 15 | public: 16 | Socket(); 17 | ~Socket(); 18 | 19 | public: //外部接口 20 | int Request(URL *urls); //发送请求 21 | int Response(); //接收反馈 22 | int setSocket(); //设置Socket 23 | int connect(); //连接 24 | int disConnect(); //断开连接 25 | void getData(); 26 | 27 | private: //内部接口 28 | 29 | private: 30 | int m_sockHandle; //socket 句柄 31 | char *buf; // 32 | }; 33 | 34 | #endif /* SOCKET_H_ */ 35 | -------------------------------------------------------------------------------- /src/SpiderApp.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SpiderApp.cpp 3 | * 4 | * Created on: 2015年6月13日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "SpiderApp.h" 9 | #include "confparser.h" 10 | 11 | SpiderApp::SpiderApp() 12 | { 13 | // TODO Auto-generated constructor stub 14 | isDeamon = false; 15 | 16 | } 17 | 18 | SpiderApp::~SpiderApp() 19 | { 20 | // TODO Auto-generated destructor stub 21 | } 22 | 23 | int SpiderApp::initArgc(int argc, char *argv[]) 24 | { 25 | 26 | } 27 | 28 | int SpiderApp::initApp(int argc, char *argv[]) 29 | { 30 | /*处理参数*/ 31 | if (initArgc(argc, argv) == 0) 32 | { 33 | // 34 | return 0; 35 | } 36 | 37 | /*读取配置文件*/ 38 | ConfigParser::GetInstance()->loader(CONF_PATH); 39 | 40 | /*处理是否以守护进程运行*/ 41 | if (isDeamon) 42 | { 43 | deamon(); 44 | } 45 | 46 | /*载入所有模块*/ 47 | //待完成 48 | // 49 | 50 | 51 | } 52 | 53 | int SpiderApp::run() 54 | { 55 | 56 | } 57 | 58 | int SpiderApp::deamon() 59 | { 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/SpiderApp.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * SpiderApp.h 3 | * 4 | * Created on: 2015年6月13日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef SPIDERAPP_H_ 9 | #define SPIDERAPP_H_ 10 | 11 | #define CONF_PATH "spider.conf" 12 | 13 | class SpiderApp 14 | { 15 | public: 16 | SpiderApp(); 17 | ~SpiderApp(); 18 | 19 | public: //业务成员函数 20 | //初始化环境 21 | //返回值: 1.成功 0.失败 与C语言相反 22 | int initApp(int argc, char *argv[]); 23 | //主处理流程 24 | //返回值: 1.成功 0.失败 25 | int run(); 26 | int deamon(); //守护进程 27 | 28 | private: //内部接口 29 | //处理参数 30 | int initArgc(int argc, char *argv[]); 31 | 32 | private: //类私有成员变量 33 | bool isDeamon; //是否以守护进程运行 34 | 35 | 36 | 37 | }; 38 | 39 | #endif /* SPIDERAPP_H_ */ 40 | -------------------------------------------------------------------------------- /src/Url.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Url.cpp 3 | * 4 | * Created on: 2015年6月8日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "Url.h" 9 | 10 | Url::Url() 11 | { 12 | // TODO Auto-generated constructor stub 13 | 14 | } 15 | 16 | Url::~Url() 17 | { 18 | // TODO Auto-generated destructor stub 19 | } 20 | -------------------------------------------------------------------------------- /src/Url.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Url.h 3 | * 4 | * Created on: 2015年6月8日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef URL_H_ 9 | #define URL_H_ 10 | 11 | #include 12 | #include 13 | 14 | typedef struct stru_url 15 | { 16 | std::string url; 17 | std::string protocol; 18 | std::string siteName; 19 | std::string path; 20 | std::string fileName; 21 | bool state; 22 | int deep; 23 | std::string fileType; 24 | }URL; 25 | 26 | class Url 27 | { 28 | public: 29 | Url(); 30 | Url(std::string url_str); 31 | ~Url(); 32 | 33 | public: 34 | int parseUrl(); //解析URL,并填充内部URL结构 35 | struct stru_url * getUrlData(); //获得URL数据描述结构 36 | 37 | private: 38 | struct 
stru_url m_url; 39 | }; 40 | #endif /* URL_H_ */ 41 | -------------------------------------------------------------------------------- /src/UrlManager.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * UrlManager.cpp 3 | * 4 | * Created on: 2015年6月8日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "UrlManager.h" 9 | 10 | UrlManager::UrlManager() 11 | { 12 | // TODO Auto-generated constructor stub 13 | 14 | } 15 | 16 | UrlManager::~UrlManager() 17 | { 18 | // TODO Auto-generated destructor stub 19 | } 20 | -------------------------------------------------------------------------------- /src/UrlManager.h: -------------------------------------------------------------------------------- 1 | /* 2 | * UrlManager.h 3 | * 4 | * Created on: 2015年6月8日 5 | * Author: yjlong 6 | */ 7 | 8 | #ifndef URLMANAGER_H_ 9 | #define URLMANAGER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "Url.h" 16 | 17 | class UrlManager 18 | { 19 | public: 20 | UrlManager(); 21 | ~UrlManager(); 22 | public: //外部接口 23 | bool addUrl(std::string urls); //添加一个新的URL,加入 list 和 queue 中,并生成索引 24 | int addUrlList(std::list urls); //批量添加 25 | stru_url * getUrlForQueue(); //取一个未处理的对象指针 26 | int setUrlState(stru_url *urls); //设置url处理状态 27 | 28 | private: //内部借口 29 | stru_url *findurl(std::string urls); 30 | // removeUrlForQueue(); 31 | 32 | 33 | private: 34 | std::list m_urls; //所有URL 35 | std::map m_urlMap; //所有URL索引 36 | std::queue m_urlQue; //等待抓取的URL 37 | }; 38 | #endif /* URLMANAGER_H_ */ 39 | -------------------------------------------------------------------------------- /src/confparser.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * File Name: confparser.cpp 3 | * Author: yanjinlong 4 | * Mail: yjlxaut@126.com 5 | * Created Time: 2015年06月07日 星期日 22时44分04秒 6 | 
************************************************************************/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include "confparser.h" 12 | 13 | ConfigParser *ConfigParser::self = NULL; //静态成员变量类外初始化 14 | 15 | //构造函数 16 | ConfigParser::ConfigParser() 17 | { 18 | job_num = 0; 19 | seed = new char[40]; 20 | memset(seed, 0, 40); 21 | depth = 1; 22 | log_level = 0; 23 | //module_path = (char *)malloc(20 * sizeof(char)); 24 | module_path = new char[40]; 25 | memset(module_path, 0, 20); 26 | } 27 | 28 | /* 29 | ConfigParser::~ConfigParser() 30 | { 31 | 32 | if (seed != nullptr) 33 | { 34 | delete seed; 35 | seed = nullptr; 36 | } 37 | if (module_path != nullptr) 38 | { 39 | delete module_path; 40 | module_path = nullptr; 41 | } 42 | } */ 43 | 44 | ConfigParser * ConfigParser::GetInstance() 45 | { 46 | if (self == NULL) 47 | { 48 | self = new ConfigParser(); 49 | } 50 | 51 | return self; 52 | } 53 | 54 | int ConfigParser::loader(const char *conf_filepath) 55 | { 56 | const int MAX_LEN = 200; 57 | char buf[MAX_LEN] = {0}; 58 | char *res = NULL; 59 | char left[20] = {0}; 60 | char right[40] = {0}; 61 | FILE *fp; 62 | fp = fopen(conf_filepath, "r"); 63 | if (fp == NULL) 64 | { 65 | return -1; 66 | } 67 | 68 | //逐行解析 69 | while (fgets(buf, MAX_LEN, fp) != NULL) 70 | { 71 | if (buf[0]=='#' || buf[0]=='\0' || buf[0]==' ') //忽略注释和空行 72 | continue; 73 | 74 | //分割字符串 75 | res = strtok(buf, "="); // = 左边 76 | memset(left, 0, sizeof(left)); 77 | strcpy(left, res); 78 | while (res != NULL) 79 | { 80 | memset(right, 0, sizeof(right)); 81 | strcpy(right, res); 82 | res = strtok(NULL, "="); // = 右边 83 | } 84 | 85 | //解析字符串 86 | if (strcmp(left, "max_job_num") == 0) 87 | { 88 | job_num = atoi(right); 89 | }else if (strcmp(left, "seeds") == 0) 90 | { 91 | strcpy(seed, right); 92 | }else if (strcmp(left, "log_level") == 0) 93 | { 94 | log_level = atoi(right); 95 | }else if (strcmp(left, "module_path") == 0) 96 | { 97 | strcpy(module_path, right); 98 | }else if (strcmp(left, 
"max_depth") == 0) 99 | { 100 | depth = atoi(right); 101 | }else if (strcmp(left, "load_module") == 0) 102 | { 103 | module_name.push_back(right); 104 | }else if (strcmp(left, "accept_types") == 0) 105 | { 106 | file_type.push_back(right); 107 | } 108 | else 109 | continue; 110 | } 111 | return 0; 112 | } 113 | 114 | int ConfigParser::getJobNum() //任务数 115 | { 116 | printf("job_num = %d\n", job_num); 117 | return job_num; 118 | } 119 | 120 | char *ConfigParser::getUrlSeed() //URL 种子 121 | { 122 | printf("seed = %s\n", seed); 123 | return seed; 124 | } 125 | 126 | int ConfigParser::getLogLevel() //日志等级 127 | { 128 | printf("log_level = %d\n", log_level); 129 | return log_level; 130 | } 131 | 132 | int ConfigParser::getDepth() //深度 133 | { 134 | printf("depth = %d\n", depth); 135 | return depth; 136 | } 137 | 138 | char * ConfigParser::getModulePath() //模块路径 139 | { 140 | printf("module_path = %s\n", module_path); 141 | return module_path; 142 | } 143 | 144 | std::list ConfigParser::getModules() //模块 145 | { 146 | std::list::iterator ibegin, iend; 147 | ibegin = module_name.begin(); 148 | iend = module_name.end(); 149 | for(; ibegin!=iend; ibegin++) 150 | std::cout << "module_name = " <<*ibegin << std::endl; 151 | return module_name; 152 | } 153 | 154 | std::list ConfigParser::getFileTypes() //文件类型 155 | { 156 | std::list::iterator ibegin, iend; 157 | ibegin = file_type.begin(); 158 | iend = file_type.end(); 159 | for(; ibegin!=iend; ibegin++) 160 | std::cout << "file_type = " <<*ibegin << std::endl; 161 | return file_type; 162 | } 163 | 164 | /* 165 | int main() 166 | { 167 | ConfigParser *conf = ConfigParser::GetInstance(); 168 | if (conf->loader("./spider.conf") < 0) 169 | { 170 | printf("Load Config File Failed!\n"); 171 | return -1; 172 | } 173 | conf->getJobNum(); 174 | conf->getLogLevel(); 175 | conf->getUrlSeed(); 176 | conf->getDepth(); 177 | conf->getModulePath(); 178 | conf->getModules(); 179 | conf->getFileTypes(); 180 | 181 | std::cout << "Module Test 
Success!" << std::endl; 182 | return 0; 183 | } 184 | */ 185 | -------------------------------------------------------------------------------- /src/confparser.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * File Name: confparser.h 3 | * Author: yanjinlong 4 | * Mail: yjlxaut@126.com 5 | * Created Time: 2015年06月07日 星期日 21时17分17秒 6 | ************************************************************************/ 7 | 8 | #ifndef CONFPARSER_H 9 | #define CONFPARSER_H 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | class ConfigParser 16 | { 17 | public: 18 | //~ConfigParser(); 19 | //单例模式 : 保证一个类只有一个实例,并提供一个访问他的全局访问点 20 | //1.构造函数私有 2.静态成员变量 3.静态成员函数获取实例 21 | static ConfigParser *GetInstance(); 22 | 23 | //操作 24 | int loader(const char *conf_filepath); //加载配置文件 25 | int getJobNum(); //任务数 26 | char * getUrlSeed(); //URL 种子 27 | int getLogLevel(); //日志等级 28 | int getDepth(); //深度 29 | char * getModulePath(); //模块路径 30 | std::list getModules(); //模块 31 | std::list getFileTypes(); //文件类型 32 | 33 | private: 34 | ConfigParser(); 35 | static ConfigParser *self; 36 | 37 | private: 38 | int job_num; 39 | char *seed; 40 | int depth; 41 | int log_level; 42 | char *module_path; 43 | std::list module_name; 44 | std::list file_type; 45 | }; 46 | #endif // 47 | -------------------------------------------------------------------------------- /src/spider.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * spider.cpp 3 | * 4 | * Created on: 2015年6月13日 5 | * Author: yjlong 6 | */ 7 | 8 | #include "spider.h" 9 | #include "SpiderApp.h" 10 | #include "EpollManager.h" 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | 15 | //对程序初始化 16 | SpiderApp app; 17 | 18 | /*初始化*/ 19 | if (app.initApp(argc, argv) == 0) //失败 20 | { 21 | //写入日志 22 | SPIDER_LOG(SPIDER_LEVEL_ERROR, "App init failed! 
app has exit!"); 23 | return 0; 24 | } 25 | /*主流程*/ 26 | if (app.run() == 0) 27 | { 28 | SPIDER_LOG(SPIDER_LEVEL_ERROR, "App Run Error!"); 29 | return 0; 30 | } 31 | 32 | return 0; 33 | } 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/spider.h: -------------------------------------------------------------------------------- 1 | #ifndef SPIDER_H 2 | #define SPIDER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | //#include 9 | //#include "url.h" 10 | //#include "socket.h" 11 | //#include "threads.h" 12 | //#include "confparser.h" 13 | //#include "dso.h" 14 | 15 | /* macros */ 16 | #define MAX_MESG_LEN 1024 17 | 18 | #define SPIDER_LEVEL_DEBUG 0 19 | #define SPIDER_LEVEL_INFO 1 20 | #define SPIDER_LEVEL_WARN 2 21 | #define SPIDER_LEVEL_ERROR 3 22 | #define SPIDER_LEVEL_CRIT 4 23 | 24 | static const char * LOG_STR[] = { 25 | "DEBUG", 26 | "INFO", 27 | "WARN", 28 | "ERROR", 29 | "CRIT" 30 | }; 31 | 32 | //extern Config *g_conf; 33 | //strftime(buf, sizeof(buf), "%Y%m%d %H:%M:%S", localtime(&now)); 34 | #define SPIDER_LOG(level, format, ...) 
do{ \ 35 | if (level >= 0) {\ 36 | time_t now = time(NULL); \ 37 | char msg[MAX_MESG_LEN]; \ 38 | char buf[32]; \ 39 | sprintf(msg, format, ##__VA_ARGS__); \ 40 | strftime(buf, sizeof(buf), "%Y%m%d %H:%M:%S", localtime(&now)); \ 41 | fprintf(stdout, "[%s] [%s] %s\n", buf, LOG_STR[level], msg); \ 42 | fflush(stdout); \ 43 | } \ 44 | if (level == SPIDER_LEVEL_ERROR) {\ 45 | exit(-1); \ 46 | } \ 47 | } while(0) 48 | 49 | 50 | //extern int attach_epoll_task(); 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /src/testmodule.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * testmodule.cpp 3 | * 4 | * Created on: 2015年6月12日 5 | * Author: yjlong 6 | */ 7 | 8 | #include 9 | #include "confparser.h" 10 | #include "Url.h" 11 | #include "UrlManager.h" 12 | #include "DsoManager.h" 13 | /* 14 | int main() 15 | { 16 | return 0; 17 | } 18 | */ 19 | 20 | --------------------------------------------------------------------------------