├── .cproject
├── .project
├── Debug
├── makefile
├── module
│ └── subdir.mk
├── objects.mk
├── sources.mk
├── src
│ └── subdir.mk
└── subdir.mk
├── module
├── HtmlParser.cpp
├── HtmlParser.h
├── HtmlParserMod.cpp
├── HtmlParserMod.h
├── SaveBase.cpp
├── SaveBase.h
├── SaveHtml.cpp
├── SaveHtml.h
├── SaveImage.cpp
└── SaveImage.h
├── spider.conf
└── src
├── DownLoader.cpp
├── DownLoader.h
├── DsoManager.cpp
├── DsoManager.h
├── EpollManager.cpp
├── EpollManager.h
├── HttpParser.cpp
├── HttpParser.h
├── Socket.cpp
├── Socket.h
├── SpiderApp.cpp
├── SpiderApp.h
├── Url.cpp
├── Url.h
├── UrlManager.cpp
├── UrlManager.h
├── confparser.cpp
├── confparser.h
├── spider.cpp
├── spider.h
└── testmodule.cpp
/.cproject:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | spider
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.cdt.managedbuilder.core.genmakebuilder
10 | clean,full,incremental,
11 |
12 |
13 |
14 |
15 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder
16 | full,incremental,
17 |
18 |
19 |
20 |
21 |
22 | org.eclipse.cdt.core.cnature
23 | org.eclipse.cdt.core.ccnature
24 | org.eclipse.cdt.managedbuilder.core.managedBuildNature
25 | org.eclipse.cdt.managedbuilder.core.ScannerConfigNature
26 |
27 |
28 |
--------------------------------------------------------------------------------
/Debug/makefile:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 |
5 | -include ../makefile.init
6 |
7 | RM := rm -rf
8 |
9 | # All of the sources participating in the build are defined here
10 | -include sources.mk
11 | -include src/subdir.mk
12 | -include module/subdir.mk
13 | -include subdir.mk
14 | -include objects.mk
15 |
16 | # Pull in the compiler-generated dependency files (*.d) of every source flavour,
17 | # except when the goal is "clean". "-include" with an empty list is a no-op, so
18 | # the individual lists need no emptiness guard.
19 | ifneq ($(MAKECMDGOALS),clean)
20 | -include $(CC_DEPS) $(C++_DEPS) $(C_UPPER_DEPS) $(CXX_DEPS) $(CPP_DEPS) $(C_DEPS)
21 | endif
36 |
37 | -include ../makefile.defs
38 |
39 | # Add inputs and outputs from these tool invocations to the build variables
40 |
41 | # Default goal. The link step lists the libraries AFTER the objects: the linker
42 | # resolves symbols left to right, so -levent/-ldl given first can be discarded.
43 | all: spider
44 |
45 | spider: $(OBJS) $(USER_OBJS)
46 | 	@echo 'Building target: $@'
47 | 	@echo 'Invoking: GCC C++ Linker'
48 | 	g++ -o "$@" $(OBJS) $(USER_OBJS) $(LIBS) -levent -ldl
49 | 	@echo 'Finished building target: $@'
50 | 	@echo ' '
51 |
52 | # clean: remove generated files; the lists are space-separated so their words cannot fuse.
53 | clean:
54 | 	-$(RM) $(CC_DEPS) $(C++_DEPS) $(EXECUTABLES) $(C_UPPER_DEPS) $(CXX_DEPS) $(OBJS) $(CPP_DEPS) $(C_DEPS) spider
55 | 	-@echo ' '
56 |
57 | .PHONY: all clean dependents
58 | .SECONDARY:
59 |
60 | -include ../makefile.targets
61 |
--------------------------------------------------------------------------------
/Debug/module/subdir.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 |
5 | # Translation units contributed by ../module. Source, object and dependency
6 | # file names are all derived from this single list of base names.
7 | module_units := HtmlParser HtmlParserMod SaveBase SaveHtml SaveImage
8 |
9 | CPP_SRCS += $(patsubst %,../module/%.cpp,$(module_units))
10 | OBJS += $(patsubst %,./module/%.o,$(module_units))
11 | CPP_DEPS += $(patsubst %,./module/%.d,$(module_units))
26 |
27 | # Compile rule for ../module. -MT names the object ($@) so the generated .d file makes
28 | # the .o depend on its headers; -levent/-ldl are omitted because -c never links.
29 | module/%.o: ../module/%.cpp
30 | 	@echo 'Building file: $<'
31 | 	@echo 'Invoking: GCC C++ Compiler'
32 | 	g++ -O0 -g3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<"
33 | 	@echo 'Finished building: $<'
34 | 	@echo ' '
35 |
36 |
37 |
--------------------------------------------------------------------------------
/Debug/objects.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 |
5 | USER_OBJS :=
6 |
7 | LIBS :=
8 |
9 |
--------------------------------------------------------------------------------
/Debug/sources.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 |
5 | C_UPPER_SRCS :=
6 | CXX_SRCS :=
7 | C++_SRCS :=
8 | OBJ_SRCS :=
9 | CC_SRCS :=
10 | ASM_SRCS :=
11 | CPP_SRCS :=
12 | C_SRCS :=
13 | O_SRCS :=
14 | S_UPPER_SRCS :=
15 | CC_DEPS :=
16 | C++_DEPS :=
17 | EXECUTABLES :=
18 | C_UPPER_DEPS :=
19 | CXX_DEPS :=
20 | OBJS :=
21 | CPP_DEPS :=
22 | C_DEPS :=
23 |
24 | # Every subdirectory with source files must be described here
25 | SUBDIRS := \
26 | src \
27 | module \
28 |
29 |
--------------------------------------------------------------------------------
/Debug/src/subdir.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 |
5 | # Translation units contributed by ../src. Source, object and dependency
6 | # file names are all derived from this single list of base names.
7 | src_units := DownLoader DsoManager EpollManager HttpParser Socket SpiderApp \
8 |              Url UrlManager confparser spider testmodule
9 |
10 | CPP_SRCS += $(patsubst %,../src/%.cpp,$(src_units))
11 | OBJS += $(patsubst %,./src/%.o,$(src_units))
12 | CPP_DEPS += $(patsubst %,./src/%.d,$(src_units))
44 |
45 | # Compile rule for ../src. -MT names the object ($@) so the generated .d file makes
46 | # the .o depend on its headers; -levent/-ldl are omitted because -c never links.
47 | src/%.o: ../src/%.cpp
48 | 	@echo 'Building file: $<'
49 | 	@echo 'Invoking: GCC C++ Compiler'
50 | 	g++ -O0 -g3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<"
51 | 	@echo 'Finished building: $<'
52 | 	@echo ' '
53 |
54 |
55 |
--------------------------------------------------------------------------------
/Debug/subdir.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 | # NOTE(review): the project tree lists EpollManager.cpp only under src/, and the root is not in SUBDIRS — confirm whether these root-level entries are stale.
5 | # Add inputs and outputs from these tool invocations to the build variables
6 | CPP_SRCS += \
7 | ../EpollManager.cpp
8 |
9 | OBJS += \
10 | ./EpollManager.o
11 |
12 | CPP_DEPS += \
13 | ./EpollManager.d
14 |
15 | # Compile rule for sources in the project root. -MT names the object ($@) so the generated .d file
16 | # makes the .o depend on its headers; -levent/-ldl are omitted because -c never links.
17 | %.o: ../%.cpp
18 | 	@echo 'Building file: $<'
19 | 	@echo 'Invoking: GCC C++ Compiler'
20 | 	g++ -O0 -g3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<"
21 | 	@echo 'Finished building: $<'
22 | 	@echo ' '
23 |
24 |
25 |
--------------------------------------------------------------------------------
/module/HtmlParser.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * HtmlParser.cpp
3 | *
4 | * Created on: 2015年6月14日
5 | * Author: yjlong
6 | */
7 |
8 | #include "../module/HtmlParser.h"
9 |
10 | HtmlParser::HtmlParser()
11 | {
12 | // TODO Auto-generated constructor stub
13 |
14 | }
15 |
16 | HtmlParser::~HtmlParser()
17 | {
18 | // TODO Auto-generated destructor stub
19 | }
20 |
21 | std::list HtmlParser::getUrls(std::string page)
22 | {
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/module/HtmlParser.h:
--------------------------------------------------------------------------------
1 | /*
2 | * HtmlParser.h
3 | *
4 | * Created on: 2015年6月14日
5 | * Author: yjlong
6 | */
7 |
8 | #ifndef HTMLPARSER_H_
9 | #define HTMLPARSER_H_
10 |
11 | #include
12 | #include
13 | // HTML parser class; its only operation is getUrls(), which takes a page string and returns a list.
14 | class HtmlParser
15 | {
16 | public:
17 | HtmlParser();
18 | ~HtmlParser();
19 |
20 | public: // public interface
21 | std::list getUrls(std::string page);
22 |
23 | private: // private interface
24 | };
25 |
26 | #endif /* HTMLPARSER_H_ */
27 |
--------------------------------------------------------------------------------
/module/HtmlParserMod.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * HtmlParserMod.cpp
3 | *
4 | * Created on: 2015年6月16日
5 | * Author: yjlong
6 | */
7 |
8 | #include "HtmlParserMod.h"
9 | #include "HtmlParser.h"
10 |
11 | void init(Module *module)
12 | {
13 |
14 | }
15 | // Module work function: fills the request's `urls` from its `url`; returns MODULE_ERR when no URL is found, MODULE_OK otherwise.
16 | int handle(void *data)
17 | {
18 |     htmlParserParam *lparam = (htmlParserParam *)data;
19 |     if (!lparam)
20 |         return MODULE_ERR; // guard: a null request would be dereferenced below
21 |     HtmlParser parser;
22 |     lparam->urls = parser.getUrls(lparam->url);
23 |     return lparam->urls.empty() ? MODULE_ERR : MODULE_OK;
24 | }
25 |
--------------------------------------------------------------------------------
/module/HtmlParserMod.h:
--------------------------------------------------------------------------------
1 | /*
2 | * HtmlParserMod.h
3 | *
4 | * Created on: 2015年6月16日
5 | * Author: yjlong
6 | */
7 |
8 | #ifndef HTMLPARSERMOD_H_
9 | #define HTMLPARSERMOD_H_
10 |
11 | #include "../src/DsoManager.h"
12 | //#include "../src"
13 |
14 | extern void init(Module *module); // module initialisation hook
15 | extern int handle(void *data); // module work function
16 |
17 | #endif /* HTMLPARSERMOD_H_ */
18 |
--------------------------------------------------------------------------------
/module/SaveBase.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * SaveBase.cpp
3 | *
4 | * Created on: 2015年6月14日
5 | * Author: yjlong
6 | */
7 |
8 | #include "../module/SaveBase.h"
9 |
10 | SaveBase::SaveBase()
11 | {
12 | // TODO Auto-generated constructor stub
13 |
14 | }
15 |
16 | SaveBase::~SaveBase()
17 | {
18 |     // Nothing to release. No debug output here: neither this file nor SaveBase.h
19 |     // includes <iostream>, so an unqualified cout/endl statement cannot compile.
20 | }
21 |
22 |
--------------------------------------------------------------------------------
/module/SaveBase.h:
--------------------------------------------------------------------------------
1 | /*
2 | * SaveBase.h
3 | *
4 | * Created on: 2015年6月14日
5 | * Author: yjlong
6 | */
7 |
8 | #ifndef SAVEBASE_H_
9 | #define SAVEBASE_H_
10 | // Abstract base class for savers: save() is pure virtual, so it cannot be instantiated directly.
11 | class SaveBase
12 | {
13 | public:
14 | SaveBase();
15 | virtual ~SaveBase();
16 |
17 | public:
18 | int setFilePath();
19 | virtual int save() = 0; // to be overridden by subclasses
20 | int setFileData();
21 | };
22 |
23 | #endif /* SAVEBASE_H_ */
24 |
--------------------------------------------------------------------------------
/module/SaveHtml.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * SaveHtml.cpp
3 | *
4 | * Created on: 2015年6月14日
5 | * Author: yjlong
6 | */
7 |
8 | #include "../module/SaveHtml.h"
9 |
10 | SaveHtml::SaveHtml()
11 | {
12 | // TODO Auto-generated constructor stub
13 |
14 | }
15 |
16 | SaveHtml::~SaveHtml()
17 | {
18 | // TODO Auto-generated destructor stub
19 | }
20 |
21 |
--------------------------------------------------------------------------------
/module/SaveHtml.h:
--------------------------------------------------------------------------------
1 | /*
2 | * SaveHtml.h
3 | *
4 | * Created on: 2015年6月14日
5 | * Author: yjlong
6 | */
7 |
8 | #ifndef SAVEHTML_H_
9 | #define SAVEHTML_H_
10 |
11 | #include "../module/SaveBase.h"
12 |
13 | class SaveHtml: public SaveBase
14 | {
15 | public:
16 | SaveHtml();
17 | virtual ~SaveHtml();
18 |
19 | public:
20 |
21 | private:
22 |
23 | };
24 |
25 | #endif /* SAVEHTML_H_ */
26 |
--------------------------------------------------------------------------------
/module/SaveImage.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * SaveImage.cpp
3 | *
4 | * Created on: 2015年6月14日
5 | * Author: yjlong
6 | */
7 |
8 | #include "../module/SaveImage.h"
9 |
10 | SaveImage::SaveImage()
11 | {
12 | // TODO Auto-generated constructor stub
13 |
14 | }
15 |
16 | SaveImage::~SaveImage()
17 | {
18 | // TODO Auto-generated destructor stub
19 | }
20 |
21 |
--------------------------------------------------------------------------------
/module/SaveImage.h:
--------------------------------------------------------------------------------
1 | /*
2 | * SaveImage.h
3 | *
4 | * Created on: 2015年6月14日
5 | * Author: yjlong
6 | */
7 |
8 | #ifndef SAVEIMAGE_H_
9 | #define SAVEIMAGE_H_
10 |
11 | #include "../module/SaveBase.h"
12 |
13 | class SaveImage: public SaveBase
14 | {
15 | public:
16 | SaveImage();
17 | virtual ~SaveImage();
18 | };
19 |
20 | #endif /* SAVEIMAGE_H_ */
21 |
--------------------------------------------------------------------------------
/spider.conf:
--------------------------------------------------------------------------------
1 | # Max number of task_threads in parallel. Each thread fetches an ourl from ourl_queue
2 | # and crawls webpage and generates more threads according to cur_thread_num.
3 | # To adapt depending on your network
4 | max_job_num=1
5 |
6 | # From which urls to start job.
7 | # Comma-separated if you have more than one seed.
8 | seeds=http://www.imeiding.com
9 | #seeds=http://hi.baidu.com/qteqpid_pku
10 |
11 | # If include_prefixes is set, We only crawl the urls that match
12 | #include_prefixes=hi.baidu.com/qteqpid_pku/item
13 |
14 | # If exclude_prefixes is set, the urls that match will NOT be crawled
15 | #exclude_prefixes=www.imeiding.com/user
16 |
17 | # When daemonized, the process's output will be logged in logfile rather than console
18 | logfile=spiderq.log
19 |
20 | # Set the level to log. The probable values list as follow:
21 | # 0 DEBUG
22 | # 1 INFO
23 | # 2 WARN
24 | # 3 ERROR
25 | # 4 CRIT
26 | # Spider only logs messages whose level is greater than or equal to the log_level set here.
27 | # That means if you set log_level 0 here, You will get all logs.
28 | log_level=0
29 |
30 | # How deep do you want to go from seeds. If 0, we only crawl seeds and exit.
31 | # Comment the following line if You want to go as deep as possible
32 | max_depth=0
33 |
34 | # The interval time(in seconds) to print stat data.
35 | # If you need it, just uncomment the following line
36 | #stat_interval=2
37 |
38 | # How to save the crawled pages. Yes means respect sites hierarchy.
39 | # NOT supported yet!!!
40 | #make_hostdir=yes
41 |
42 | # Dynamic Shared Object (DSO) Support
43 | # The path where modules(.so) will locate.
44 | module_path=/etc/spider/modules/
45 |
46 | # Which module to load. Each one a line.
47 | # The available modules' source codes are all in modules directory.
48 | # They will all be compiled to .so and copied to ${module_path} during make
49 | load_module=savehtml
50 | load_module=saveimage
51 | load_module=maxdepth
52 | load_module=domainlimit
53 | load_module=headerfilter
54 |
55 |
56 | # specify which type of resource we accept. Each one a line.
57 | # text/html is accepted default
58 | accept_types=image/jpeg
59 |
--------------------------------------------------------------------------------
/src/DownLoader.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * DownLoader.cpp
3 | *
4 | * Created on: 2015年6月14日
5 | * Author: yjlong
6 | */
7 |
8 | #include "DownLoader.h"
9 | #include "EpollManager.h"
10 |
11 | DownLoader::DownLoader()
12 | {
13 | // TODO Auto-generated constructor stub
14 |
15 | }
16 |
17 | DownLoader::~DownLoader()
18 | {
19 | // TODO Auto-generated destructor stub
20 | }
21 |
22 |
--------------------------------------------------------------------------------
/src/DownLoader.h:
--------------------------------------------------------------------------------
1 | /*
2 | * DownLoader.h
3 | *
4 | * Created on: 2015年6月14日
5 | * Author: yjlong
6 | */
7 |
8 | #ifndef DOWNLOADER_H_
9 | #define DOWNLOADER_H_
10 |
11 | #include
12 | #include
13 | #include "Url.h"
14 | #include "Socket.h"
15 | #include "HttpParser.h"
16 | // Downloader class: holds a Socket, an HttpParser and a pointer to a URL.
17 | class DownLoader
18 | {
19 | public:
20 | DownLoader();
21 | ~DownLoader();
22 |
23 | public:
24 | int getResource(URL *urls); // download the resource content (the action)
25 | void *getResContent(); // the downloaded resource content
26 |
27 | int init(); // initialise
28 | int reinit(); // re-initialise
29 |
30 | int getSockHandle(); // get the socket handle
31 |
32 | private:
33 | // socket wrapper module
34 | Socket m_sock;
35 | // HTTP protocol handling module
36 | HttpParser m_httpParser;
37 |
38 | URL *m_url;
39 | };
40 |
41 | #endif /* DOWNLOADER_H_ */
42 |
--------------------------------------------------------------------------------
/src/DsoManager.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Dso.cpp
3 | *
4 | * Created on: 2015年6月9日
5 | * Author: yjlong
6 | */
7 |
8 | #include
9 | #include
10 | #include "DsoManager.h"
11 |
12 | DsoManager::DsoManager()
13 | {
14 | // TODO Auto-generated constructor stub
15 |
16 | }
17 |
18 | DsoManager::~DsoManager()
19 | {
20 | // TODO Auto-generated destructor stub
21 | }
22 |
23 | // Dynamically load the compiled module "<path><name>.so" and register a Module under `name`; returns 0 on success, MODULE_ERR otherwise.
24 | int DsoManager::load(const std::string &path, const std::string &name)
25 | {
26 |     const std::string filepath = path + name + ".so";
27 |     void *handle = dlopen(filepath.c_str(), RTLD_GLOBAL | RTLD_NOW);
28 |     if (handle == NULL)
29 |         return MODULE_ERR; // library could not be opened
30 |
31 |     // Allocated only after dlopen succeeded, so the early return above leaks nothing.
32 |     Module *module = new Module();
33 |     // NOTE(review): the dlsym() lookups are not wired up yet; confirm what Module() leaves in `handle`.
34 |     if (module->handle == NULL)
35 |     {
36 |         delete module;   // release both resources instead of leaking them on this path
37 |         dlclose(handle);
38 |         return MODULE_ERR;
39 |     }
40 |     dlclose(handle); // NOTE(review): unloading here would invalidate any symbol taken from the library
41 |     // make_pair deduces the pair type, which a bare std::pair(...) cannot do under -std=c++0x.
42 |     m_modules.insert(std::make_pair(name, module));
43 |     return 0;
44 | }
44 |
45 | // Look up a module in the module table by name; NULL when no entry matches.
46 | Module * DsoManager::getModule(const std::string &name)
47 | {
48 |     auto found = m_modules.find(name);
49 |     if (found == m_modules.end())
50 |         return NULL;
51 |     return found->second;
52 | }
62 |
--------------------------------------------------------------------------------
/src/DsoManager.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Dso.h
3 | *
4 | * Created on: 2015年6月9日
5 | * Author: yjlong
6 | */
7 |
8 | #ifndef DSOMANAGER_H_
9 | #define DSOMANAGER_H_
10 |
11 | #include
12 | #include
13 | #include