├── .gitattributes ├── .gitignore ├── README.md ├── analyzer └── analyzer.go ├── basic ├── check.go ├── config.go ├── items.go ├── links.go ├── logging.go ├── request.go └── response.go ├── controller ├── controller.go └── signal.go ├── downloader └── downloader.go ├── main.go ├── middleware ├── channel.go └── workpool.go └── processor └── processor.go /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | 26 | # ========================= 27 | # Operating System Files 28 | # ========================= 29 | 30 | # OSX 31 | # ========================= 32 | 33 | .DS_Store 34 | .AppleDouble 35 | .LSOverride 36 | 37 | # Thumbnails 38 | ._* 39 | 40 | # Files that might appear on external disk 41 | .Spotlight-V100 42 | .Trashes 43 | 44 | # Directories potentially created on remote AFP share 45 | .AppleDB 46 | .AppleDesktop 47 | Network Trash Folder 48 | Temporary Items 49 | .apdisk 50 | 51 | # Windows 52 | # ========================= 53 | 54 | # Windows 
image file caches 55 | Thumbs.db 56 | ehthumbs.db 57 | 58 | # Folder config file 59 | Desktop.ini 60 | 61 | # Recycle Bin used on file shares 62 | $RECYCLE.BIN/ 63 | 64 | # Windows Installer files 65 | *.cab 66 | *.msi 67 | *.msm 68 | *.msp 69 | 70 | # Windows shortcuts 71 | *.lnk 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scrago 2 | 一个简单的go并发爬虫框架 A simple concurrent spider framework in Go 3 | 4 | ## 介绍 5 | 6 | ***[scrago](http://z3lion.com/article/165)*** 7 | 8 | ## 简单使用 9 | 10 | func main() { 11 | //创建一个控制器,这里有4个必须给与的参数: 12 | //爬取的初始url,爬取深度,解析函数,储存函数 13 | //basic.InitConfig() //配置Config需要先初始化 14 | //basic.Config.HttpHeader["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) " + 15 | // "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36" 16 | controller := controller.NewController("http://z3lion.com/", 1, Parser, Store) 17 | //启动爬虫 18 | controller.Go() 19 | } 20 | 21 | //解析函数定义,声明必须相同,下同 22 | func Parser(httpRes *http.Response) ([]string, []basic.Item) { 23 | //两个需要返回的列表 24 | linklist := make([]string, 0) //下一步需要请求的链接 25 | itemlist := make([]basic.Item, 0) //保存的数据类型为 map[string]interface{} 26 | 27 | //自定义部分 28 | //抓取所有链接 29 | doc, _ := goquery.NewDocumentFromResponse(httpRes) 30 | doc.Find("a").Each(func(i int, s *goquery.Selection) { 31 | link, exits := s.Attr("href") 32 | if exits { 33 | link = basic.CheckLink(link) 34 | if link != "" { 35 | linklist = append(linklist, link) 36 | } 37 | } 38 | }) 39 | //保存每个页面的标题 40 | title := strings.TrimSpace(doc.Find("head title").Text()) 41 | if title != "" { 42 | item := make(map[string]interface{}) 43 | item["标题"] = title 44 | itemlist = append(itemlist, item) 45 | } 46 | 47 | return linklist, itemlist 48 | } 49 | 50 | //储存函数定义 51 | func Store(item basic.Item) { 52 | //这里只打印抓取的数据 53 | fmt.Println(item) 54 | } 
-------------------------------------------------------------------------------- /analyzer/analyzer.go: -------------------------------------------------------------------------------- 1 | package analyzer 2 | 3 | import ( 4 | "basic" 5 | //"fmt" 6 | "net/http" 7 | ) 8 | 9 | type GenAnalyzer interface { 10 | Analyze(httpRes *http.Response, parser Parser) ([]string, []basic.Item) 11 | } 12 | 13 | type Parser func(httpRes *http.Response) ([]string, []basic.Item) 14 | 15 | type Analyzer struct { 16 | linklist []string 17 | itemlist []basic.Item 18 | } 19 | 20 | func NewAnalyzer() GenAnalyzer { 21 | return &Analyzer{ 22 | make([]string, 0), 23 | make([]basic.Item, 0), 24 | } 25 | } 26 | 27 | //用于解析页面 28 | func (self *Analyzer) Analyze(httpRes *http.Response, parser Parser) ([]string, []basic.Item) { 29 | defer httpRes.Body.Close() 30 | if parser == nil { 31 | panic("xxx") 32 | } 33 | return parser(httpRes) 34 | } 35 | -------------------------------------------------------------------------------- /basic/check.go: -------------------------------------------------------------------------------- 1 | package basic 2 | 3 | import ( 4 | "net/url" 5 | "strings" 6 | ) 7 | 8 | func Check(e error) { 9 | if e != nil { 10 | panic(e) 11 | } 12 | } 13 | 14 | func CheckBaseurl(Url string) string { 15 | u, _ := url.Parse(Url) 16 | if u.Scheme == "" { 17 | Url = "http://" + Url 18 | } 19 | if flag := strings.HasSuffix(Url, "/"); flag != true { 20 | Url = Url + "/" 21 | } 22 | return Url 23 | } 24 | 25 | func CheckLink(link string) string { 26 | u, _ := url.Parse(link) 27 | if u.Scheme != "" { 28 | return "" 29 | } 30 | if u.Scheme == "http" || u.Scheme == "https" { 31 | return link 32 | } 33 | if flag := strings.HasPrefix(link, Config.StartUrl); flag != true { 34 | link = strings.Join([]string{Config.StartUrl, link}, "") 35 | return link 36 | } 37 | return "" 38 | } 39 | -------------------------------------------------------------------------------- /basic/config.go: 
-------------------------------------------------------------------------------- 1 | package basic 2 | 3 | import () 4 | 5 | type config struct { 6 | flag bool //配置是否初始化过的标志 7 | Name string //爬虫名:) 8 | StartUrl string //初始Url,会从创建控制器是给与的参数添加 9 | RequestMethod string //http请求的方法 10 | HttpHeader map[string]string //http请求的header 11 | DownloaderNumber int //下载器数目 12 | AnalyzerNumber int //分析器数目 13 | ProcessorNumber int //处理器数目 14 | ReqChanLength int //请求通道长度 15 | ResChanLength int //响应通道长度 16 | LinkChanLength int //链接通道长度 17 | ItemChanLength int //数据通道长度 18 | } 19 | 20 | var Config *config = new(config) 21 | 22 | func InitConfig() { 23 | if flag { 24 | return 25 | } 26 | Config.HttpHeader = make(map[string]string) 27 | if Config.Name == "" { 28 | Config.Name = "scrago" 29 | } 30 | if Config.RequestMethod == "" { 31 | Config.RequestMethod = "GET" 32 | } 33 | if Config.DownloaderNumber == 0 { 34 | Config.DownloaderNumber = 5 35 | } 36 | if Config.AnalyzerNumber == 0 { 37 | Config.AnalyzerNumber = 2 38 | } 39 | if Config.ProcessorNumber == 0 { 40 | Config.ProcessorNumber = 3 41 | } 42 | if Config.ReqChanLength == 0 { 43 | Config.ReqChanLength = 500 44 | } 45 | if Config.ResChanLength == 0 { 46 | Config.ResChanLength = 200 47 | } 48 | if Config.LinkChanLength == 0 { 49 | Config.LinkChanLength = 1000 50 | } 51 | if Config.ItemChanLength == 0 { 52 | Config.ItemChanLength = 200 53 | } 54 | flag = true 55 | } 56 | -------------------------------------------------------------------------------- /basic/items.go: -------------------------------------------------------------------------------- 1 | package basic 2 | 3 | type Item map[string]interface{} 4 | 5 | func NewItems() Item { 6 | it := make(map[string]interface{}) 7 | return it 8 | } 9 | -------------------------------------------------------------------------------- /basic/links.go: -------------------------------------------------------------------------------- 1 | package basic 2 | 3 | import () 4 | 5 | type Link struct 
{ 6 | link string 7 | index uint32 8 | } 9 | 10 | func NewLinks(link string, index uint32) Link { 11 | return Link{link, index} 12 | } 13 | 14 | func (self *Link) GetLink() string { 15 | return self.link 16 | } 17 | 18 | func (self *Link) GetIndex() uint32 { 19 | return self.index 20 | } 21 | -------------------------------------------------------------------------------- /basic/logging.go: -------------------------------------------------------------------------------- 1 | package basic 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "runtime" 7 | "strings" 8 | ) 9 | 10 | type Position uint 11 | 12 | const ( 13 | POSITION_SINGLE Position = 1 14 | POSITION_IN_MANAGER Position = 2 15 | ) 16 | 17 | func init() { 18 | log.SetFlags(log.LstdFlags) 19 | } 20 | 21 | type Logger interface { 22 | GetPosition() Position 23 | SetPosition(pos Position) 24 | Error(v ...interface{}) string 25 | Errorf(format string, v ...interface{}) string 26 | Errorln(v ...interface{}) string 27 | Fatal(v ...interface{}) string 28 | Fatalf(format string, v ...interface{}) string 29 | Fatalln(v ...interface{}) string 30 | Info(v ...interface{}) string 31 | Infof(format string, v ...interface{}) string 32 | Infoln(v ...interface{}) string 33 | Panic(v ...interface{}) string 34 | Panicf(format string, v ...interface{}) string 35 | Panicln(v ...interface{}) string 36 | Warn(v ...interface{}) string 37 | Warnf(format string, v ...interface{}) string 38 | Warnln(v ...interface{}) string 39 | } 40 | 41 | func getInvokerLocation(skipNumber int) string { 42 | pc, file, line, ok := runtime.Caller(skipNumber) 43 | if !ok { 44 | return "" 45 | } 46 | simpleFileName := "" 47 | if index := strings.LastIndex(file, "/"); index > 0 { 48 | simpleFileName = file[index+1 : len(file)] 49 | } 50 | funcPath := "" 51 | funcPtr := runtime.FuncForPC(pc) 52 | if funcPtr != nil { 53 | funcPath = funcPtr.Name() 54 | } 55 | return fmt.Sprintf("%s : (%s:%d)", funcPath, simpleFileName, line) 56 | } 57 | 58 | func 
generateLogContent( 59 | logTag LogTag, 60 | pos Position, 61 | format string, 62 | v ...interface{}) string { 63 | skipNumber := int(pos) + 2 64 | baseInfo := 65 | fmt.Sprintf("%s %s - ", logTag.Prefix(), getInvokerLocation(skipNumber)) 66 | var result string 67 | if len(format) > 0 { 68 | result = fmt.Sprintf((baseInfo + format), v...) 69 | } else { 70 | vLen := len(v) 71 | params := make([]interface{}, (vLen + 1)) 72 | params[0] = baseInfo 73 | for i := 1; i <= vLen; i++ { 74 | params[i] = v[i-1] 75 | } 76 | result = fmt.Sprint(params...) 77 | } 78 | return result 79 | } 80 | 81 | func NewSimpleLogger() Logger { 82 | logger := &ConsoleLogger{} 83 | logger.SetPosition(POSITION_SINGLE) 84 | return logger 85 | } 86 | 87 | func NewLogger(loggers []Logger) Logger { 88 | for _, logger := range loggers { 89 | logger.SetPosition(POSITION_IN_MANAGER) 90 | } 91 | return &LogManager{loggers: loggers} 92 | } 93 | 94 | type ConsoleLogger struct { 95 | position Position 96 | } 97 | 98 | func (logger *ConsoleLogger) GetPosition() Position { 99 | return logger.position 100 | } 101 | 102 | func (logger *ConsoleLogger) SetPosition(pos Position) { 103 | logger.position = pos 104 | } 105 | 106 | func (logger *ConsoleLogger) Error(v ...interface{}) string { 107 | content := generateLogContent(getErrorLogTag(), logger.GetPosition(), "", v...) 108 | log.Print(content) 109 | return content 110 | } 111 | 112 | func (logger *ConsoleLogger) Errorf(format string, v ...interface{}) string { 113 | content := generateLogContent(getErrorLogTag(), logger.GetPosition(), format, v...) 114 | log.Print(content) 115 | return content 116 | } 117 | 118 | func (logger *ConsoleLogger) Errorln(v ...interface{}) string { 119 | content := generateLogContent(getErrorLogTag(), logger.GetPosition(), "", v...) 
120 | log.Println(content) 121 | return content 122 | } 123 | 124 | func (logger *ConsoleLogger) Fatal(v ...interface{}) string { 125 | content := generateLogContent(getFatalLogTag(), logger.GetPosition(), "", v...) 126 | log.Print(content) 127 | return content 128 | } 129 | 130 | func (logger *ConsoleLogger) Fatalf(format string, v ...interface{}) string { 131 | content := generateLogContent(getFatalLogTag(), logger.GetPosition(), format, v...) 132 | log.Print(content) 133 | return content 134 | } 135 | 136 | func (logger *ConsoleLogger) Fatalln(v ...interface{}) string { 137 | content := generateLogContent(getFatalLogTag(), logger.GetPosition(), "", v...) 138 | log.Println(content) 139 | return content 140 | } 141 | 142 | func (logger *ConsoleLogger) Info(v ...interface{}) string { 143 | content := generateLogContent(getInfoLogTag(), logger.GetPosition(), "", v...) 144 | log.Print(content) 145 | return content 146 | } 147 | 148 | func (logger *ConsoleLogger) Infof(format string, v ...interface{}) string { 149 | content := generateLogContent(getInfoLogTag(), logger.GetPosition(), format, v...) 150 | log.Print(content) 151 | return content 152 | } 153 | 154 | func (logger *ConsoleLogger) Infoln(v ...interface{}) string { 155 | content := generateLogContent(getInfoLogTag(), logger.GetPosition(), "", v...) 156 | log.Println(content) 157 | return content 158 | } 159 | 160 | func (logger *ConsoleLogger) Panic(v ...interface{}) string { 161 | content := generateLogContent(getPanicLogTag(), logger.GetPosition(), "", v...) 162 | log.Print(content) 163 | return content 164 | } 165 | 166 | func (logger *ConsoleLogger) Panicf(format string, v ...interface{}) string { 167 | content := generateLogContent(getPanicLogTag(), logger.GetPosition(), format, v...) 168 | log.Print(content) 169 | return content 170 | } 171 | 172 | func (logger *ConsoleLogger) Panicln(v ...interface{}) string { 173 | content := generateLogContent(getPanicLogTag(), logger.GetPosition(), "", v...) 
174 | log.Println(content) 175 | return content 176 | } 177 | 178 | func (logger *ConsoleLogger) Warn(v ...interface{}) string { 179 | content := generateLogContent(getWarnLogTag(), logger.GetPosition(), "", v...) 180 | log.Print(content) 181 | return content 182 | } 183 | 184 | func (logger *ConsoleLogger) Warnf(format string, v ...interface{}) string { 185 | content := generateLogContent(getWarnLogTag(), logger.GetPosition(), format, v...) 186 | log.Print(content) 187 | return content 188 | } 189 | 190 | func (logger *ConsoleLogger) Warnln(v ...interface{}) string { 191 | content := generateLogContent(getWarnLogTag(), logger.GetPosition(), "", v...) 192 | log.Println(content) 193 | return content 194 | } 195 | 196 | type LogManager struct { 197 | loggers []Logger 198 | } 199 | 200 | func (logger *LogManager) GetPosition() Position { 201 | return POSITION_SINGLE 202 | } 203 | 204 | func (logger *LogManager) SetPosition(pos Position) {} 205 | 206 | func (self *LogManager) Error(v ...interface{}) string { 207 | var content string 208 | for _, logger := range self.loggers { 209 | content = logger.Error(v...) 210 | } 211 | return content 212 | } 213 | 214 | func (self *LogManager) Errorf(format string, v ...interface{}) string { 215 | var content string 216 | for _, logger := range self.loggers { 217 | content = logger.Errorf(format, v...) 218 | } 219 | return content 220 | } 221 | 222 | func (self *LogManager) Errorln(v ...interface{}) string { 223 | var content string 224 | for _, logger := range self.loggers { 225 | content = logger.Errorln(v...) 226 | } 227 | return content 228 | } 229 | 230 | func (self *LogManager) Fatal(v ...interface{}) string { 231 | var content string 232 | for _, logger := range self.loggers { 233 | content = logger.Fatal(v...) 
234 | } 235 | return content 236 | } 237 | 238 | func (self *LogManager) Fatalf(format string, v ...interface{}) string { 239 | var content string 240 | for _, logger := range self.loggers { 241 | content = logger.Fatalf(format, v...) 242 | } 243 | return content 244 | } 245 | 246 | func (self *LogManager) Fatalln(v ...interface{}) string { 247 | var content string 248 | for _, logger := range self.loggers { 249 | content = logger.Fatalln(v...) 250 | } 251 | return content 252 | } 253 | 254 | func (self *LogManager) Info(v ...interface{}) string { 255 | var content string 256 | for _, logger := range self.loggers { 257 | content = logger.Info(v...) 258 | } 259 | return content 260 | } 261 | 262 | func (self *LogManager) Infof(format string, v ...interface{}) string { 263 | var content string 264 | for _, logger := range self.loggers { 265 | content = logger.Infof(format, v...) 266 | } 267 | return content 268 | } 269 | 270 | func (self *LogManager) Infoln(v ...interface{}) string { 271 | var content string 272 | for _, logger := range self.loggers { 273 | content = logger.Infoln(v...) 274 | } 275 | return content 276 | } 277 | 278 | func (self *LogManager) Panic(v ...interface{}) string { 279 | var content string 280 | for _, logger := range self.loggers { 281 | content = logger.Panic(v...) 282 | } 283 | return content 284 | } 285 | 286 | func (self *LogManager) Panicf(format string, v ...interface{}) string { 287 | var content string 288 | for _, logger := range self.loggers { 289 | content = logger.Panicf(format, v...) 290 | } 291 | return content 292 | } 293 | 294 | func (self *LogManager) Panicln(v ...interface{}) string { 295 | var content string 296 | for _, logger := range self.loggers { 297 | content = logger.Panicln(v...) 298 | } 299 | return content 300 | } 301 | 302 | func (self *LogManager) Warn(v ...interface{}) string { 303 | var content string 304 | for _, logger := range self.loggers { 305 | content = logger.Warn(v...) 
306 | } 307 | return content 308 | } 309 | 310 | func (self *LogManager) Warnf(format string, v ...interface{}) string { 311 | var content string 312 | for _, logger := range self.loggers { 313 | content = logger.Warnf(format, v...) 314 | } 315 | return content 316 | } 317 | 318 | func (self *LogManager) Warnln(v ...interface{}) string { 319 | var content string 320 | for _, logger := range self.loggers { 321 | content = logger.Warnln(v...) 322 | } 323 | return content 324 | } 325 | 326 | const ( 327 | ERROR_LOG_KEY = "ERROR" 328 | FATAL_LOG_KEY = "FATAL" 329 | INFO_LOG_KEY = "INFO" 330 | PANIC_LOG_KEY = "PANIC" 331 | WARN_LOG_KEY = "WARN" 332 | ) 333 | 334 | type LogTag struct { 335 | name string 336 | prefix string 337 | } 338 | 339 | func (self *LogTag) Name() string { 340 | return self.name 341 | } 342 | 343 | func (self *LogTag) Prefix() string { 344 | return self.prefix 345 | } 346 | 347 | var logTagMap map[string]LogTag = map[string]LogTag{ 348 | ERROR_LOG_KEY: LogTag{name: ERROR_LOG_KEY, prefix: "[" + ERROR_LOG_KEY + "]"}, 349 | FATAL_LOG_KEY: LogTag{name: FATAL_LOG_KEY, prefix: "[" + FATAL_LOG_KEY + "]"}, 350 | INFO_LOG_KEY: LogTag{name: INFO_LOG_KEY, prefix: "[" + INFO_LOG_KEY + "]"}, 351 | PANIC_LOG_KEY: LogTag{name: PANIC_LOG_KEY, prefix: "[" + PANIC_LOG_KEY + "]"}, 352 | WARN_LOG_KEY: LogTag{name: WARN_LOG_KEY, prefix: "[" + WARN_LOG_KEY + "]"}, 353 | } 354 | 355 | func getErrorLogTag() LogTag { 356 | return logTagMap[ERROR_LOG_KEY] 357 | } 358 | 359 | func getFatalLogTag() LogTag { 360 | return logTagMap[FATAL_LOG_KEY] 361 | } 362 | 363 | func getInfoLogTag() LogTag { 364 | return logTagMap[INFO_LOG_KEY] 365 | } 366 | 367 | func getPanicLogTag() LogTag { 368 | return logTagMap[PANIC_LOG_KEY] 369 | } 370 | 371 | func getWarnLogTag() LogTag { 372 | return logTagMap[WARN_LOG_KEY] 373 | } 374 | -------------------------------------------------------------------------------- /basic/request.go: 
-------------------------------------------------------------------------------- 1 | package basic 2 | 3 | import ( 4 | "net/http" 5 | ) 6 | 7 | type Request struct { 8 | httpReq *http.Request 9 | index uint32 10 | } 11 | 12 | func NewRequest(httpReq *http.Request, index uint32) *Request { 13 | return &Request{httpReq: httpReq, index: index} 14 | } 15 | 16 | func (req *Request) GetReq() *http.Request { 17 | return req.httpReq 18 | } 19 | 20 | func (req *Request) GetIndex() uint32 { 21 | return req.index 22 | } 23 | -------------------------------------------------------------------------------- /basic/response.go: -------------------------------------------------------------------------------- 1 | package basic 2 | 3 | import ( 4 | "net/http" 5 | ) 6 | 7 | type Response struct { 8 | httpRes *http.Response 9 | index uint32 10 | } 11 | 12 | func NewResponse(httpRes *http.Response, index uint32) *Response { 13 | return &Response{httpRes: httpRes, index: index} 14 | } 15 | 16 | func (res *Response) GetRes() *http.Response { 17 | return res.httpRes 18 | } 19 | 20 | func (res *Response) GetIndex() uint32 { 21 | return res.index 22 | } 23 | -------------------------------------------------------------------------------- /controller/controller.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "analyzer" 5 | "basic" 6 | "downloader" 7 | //"fmt" 8 | "middleware" 9 | "net/http" 10 | "processor" 11 | "sync" 12 | "time" 13 | ) 14 | 15 | var wg sync.WaitGroup //全局wait锁 16 | 17 | var logger basic.Logger = basic.NewSimpleLogger() // 日志记录器 18 | 19 | type Controller struct { 20 | Downloader downloader.GenDownloader //下载器 21 | Analyzer analyzer.GenAnalyzer //分析器 22 | Processor processor.GenProcessor //处理器 23 | Channel *middleware.Channel //管道 24 | WorkPool *middleware.WorkPool //工作池 25 | StopSignal *StopSignal //停止信号 26 | StartUrl string //初始爬行Url 27 | Depth uint32 //爬行深度 28 | Parser analyzer.Parser 
//解析页面函数 29 | Store processor.Store 30 | } 31 | 32 | func NewController(StartUrl string, Depth uint32, Parser analyzer.Parser, Store processor.Store) *Controller { 33 | return &Controller{StartUrl: StartUrl, Depth: Depth, Parser: Parser, Store: Store} 34 | } 35 | 36 | func (ctrl *Controller) Go() { 37 | basic.Config.StartUrl = ctrl.StartUrl 38 | basic.InitConfig() 39 | ctrl.Downloader = downloader.NewDownloader() //初始化各组件,下同 40 | ctrl.Analyzer = analyzer.NewAnalyzer() 41 | ctrl.Processor = processor.NewProcessor() 42 | ctrl.Channel = middleware.NewChannel() 43 | ctrl.WorkPool = middleware.NewWorkPool() 44 | ctrl.StopSignal = NewStopSignal() 45 | 46 | //准备第一次请求 47 | primaryreq, err := http.NewRequest(basic.Config.RequestMethod, basic.Config.StartUrl, nil) 48 | basic.Check(err) 49 | basereq := basic.NewRequest(primaryreq, 0) 50 | ctrl.Channel.ReqChan() <- *basereq 51 | 52 | //利用goroutine使三个组件同时工作,启动监视器监视 53 | wg.Add(4) 54 | go ctrl.DownloaderManager() 55 | go ctrl.AnalyzerManager() 56 | go ctrl.ProcessorManager() 57 | go ctrl.Monitors() 58 | 59 | wg.Wait() 60 | //fmt.Println(len(ctrl.Processor.GetVurl())) 61 | //fmt.Println(ctrl.Processor.GetVurl()) 62 | } 63 | 64 | func (ctrl *Controller) DownloaderManager() { 65 | defer wg.Done() 66 | dwg := new(sync.WaitGroup) 67 | dwg.Add(basic.Config.DownloaderNumber) 68 | //工作池机制 69 | ctrl.WorkPool.Pool(basic.Config.DownloaderNumber, func() { 70 | for req := range ctrl.Channel.ReqChan() { 71 | res := ctrl.Downloader.Download(&req) //res为构造请求类型 72 | if res != nil { 73 | ctrl.Channel.ResChan() <- *res //放入响应通道 74 | } 75 | } 76 | dwg.Done() 77 | }) 78 | dwg.Wait() 79 | //请求通道关闭后关闭响应通道 80 | close(ctrl.Channel.ResChan()) 81 | logger.Infoln("download quit!") 82 | } 83 | 84 | func (ctrl *Controller) AnalyzerManager() { 85 | defer wg.Done() 86 | awg := new(sync.WaitGroup) 87 | awg.Add(basic.Config.AnalyzerNumber) 88 | ctrl.WorkPool.Pool(basic.Config.AnalyzerNumber, func() { 89 | for res := range ctrl.Channel.ResChan() { 90 | Links, 
Items := ctrl.Analyzer.Analyze(res.GetRes(), ctrl.Parser) //解析函数解析html页面 91 | //将item放入通道传至持久储存函数 92 | for _, item := range Items { 93 | ctrl.Channel.ItemChan() <- item 94 | } 95 | //如果停止信号发出,不再向链接通道传输数据 96 | if ctrl.StopSignal.status() { 97 | continue 98 | } 99 | //发送至链接通道后续处理 100 | for _, link := range Links { 101 | ctrl.Channel.LinkChan() <- basic.NewLinks(link, res.GetIndex()+1) 102 | } 103 | } 104 | awg.Done() 105 | }) 106 | awg.Wait() 107 | //同上,发送完毕后关闭 108 | close(ctrl.Channel.LinkChan()) 109 | close(ctrl.Channel.ItemChan()) 110 | //运行流程的最后一步,所以发送已结束信号,为监视器提供信号 111 | ctrl.StopSignal.finish() 112 | logger.Infoln("Analyzer quit!") 113 | } 114 | 115 | func (ctrl *Controller) ProcessorManager() { 116 | defer wg.Done() 117 | pwg := new(sync.WaitGroup) 118 | pwg.Add(basic.Config.ProcessorNumber) 119 | ctrl.WorkPool.Pool(1, func() { //分析link速度很快,基本上不会阻塞,只需分配一个goroutine 120 | for link := range ctrl.Channel.LinkChan() { 121 | //flag为判重标志 122 | req, flag := ctrl.Processor.DealLink(link) 123 | if !flag { 124 | continue 125 | } 126 | //判断深度(索引) 127 | if req.GetIndex() <= ctrl.Depth { 128 | ctrl.Channel.ReqChan() <- *req 129 | } else { 130 | //避免重复关闭通道 131 | if ctrl.StopSignal.status() { 132 | continue 133 | } 134 | close(ctrl.Channel.ReqChan()) 135 | ctrl.StopSignal.sign() 136 | } 137 | } 138 | pwg.Done() 139 | }) 140 | ctrl.WorkPool.Pool(basic.Config.ProcessorNumber-1, func() { 141 | for item := range ctrl.Channel.ItemChan() { 142 | ctrl.Processor.DealItem(item, ctrl.Store) 143 | } 144 | pwg.Done() 145 | }) 146 | pwg.Wait() 147 | logger.Infoln("Processor quit!") 148 | } 149 | 150 | func (ctrl *Controller) Monitors() { 151 | defer wg.Done() 152 | logger.Infoln("Spider start! 
") 153 | for { 154 | logger.Infof("Spider Status: \n"+ 155 | "reqchan length: %d\n"+ 156 | "reschan length: %d\n"+ 157 | "linkchan length: %d\n"+ 158 | "itemchan length: %d\n", 159 | len(ctrl.Channel.ReqChan()), len(ctrl.Channel.ResChan()), len(ctrl.Channel.LinkChan()), len(ctrl.Channel.ItemChan())) 160 | time.Sleep(time.Second * 3) 161 | if ctrl.StopSignal.ended() { 162 | logger.Infof("Spider is Stoped!") 163 | break 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /controller/signal.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "sync" 5 | ) 6 | 7 | type GenStopSignal interface { 8 | sign() bool 9 | status() bool 10 | finish() bool 11 | ended() bool 12 | } 13 | 14 | func NewStopSignal() *StopSignal { 15 | return &StopSignal{} 16 | } 17 | 18 | type StopSignal struct { 19 | rwmutex sync.RWMutex //读写锁 20 | flag bool //停止信号是否发出的标志 21 | stop bool //模块是否停止完全的标志 22 | } 23 | 24 | func (self *StopSignal) sign() bool { 25 | self.rwmutex.Lock() 26 | defer self.rwmutex.Unlock() 27 | if self.flag { 28 | return false 29 | } 30 | self.flag = true 31 | return true 32 | } 33 | 34 | func (self *StopSignal) status() bool { 35 | return self.flag 36 | } 37 | 38 | func (self *StopSignal) finish() bool { 39 | self.rwmutex.Lock() 40 | defer self.rwmutex.Unlock() 41 | if self.stop { 42 | return false 43 | } 44 | self.stop = true 45 | return true 46 | } 47 | 48 | func (self *StopSignal) ended() bool { 49 | return self.stop 50 | } 51 | -------------------------------------------------------------------------------- /downloader/downloader.go: -------------------------------------------------------------------------------- 1 | /* 2 | 下载器,接受请求并返回请求 3 | */ 4 | 5 | package downloader 6 | 7 | import ( 8 | "basic" 9 | "fmt" 10 | "net/http" 11 | ) 12 | 13 | type GenDownloader interface { 14 | Download(req *basic.Request) *basic.Response 15 | } 16 | 17 | 
type Downloader struct { 18 | //用于处理http请求 19 | client *http.Client 20 | } 21 | 22 | func NewDownloader() GenDownloader { 23 | return &Downloader{&http.Client{}} 24 | } 25 | 26 | //接受构造请求,返回构造响应 27 | func (self *Downloader) Download(req *basic.Request) *basic.Response { 28 | for k, v := range basic.Config.HttpHeader { 29 | fmt.Println(k, v) 30 | req.GetReq().Header.Set(k, v) 31 | } 32 | httpRes, err := self.client.Do(req.GetReq()) 33 | if err != nil { 34 | return nil 35 | } 36 | response := basic.NewResponse(httpRes, req.GetIndex()) 37 | return response 38 | } 39 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "basic" 5 | "controller" 6 | "fmt" 7 | "github.com/PuerkitoBio/goquery" 8 | "net/http" 9 | "strings" 10 | ) 11 | 12 | func main() { 13 | //创建一个控制器,这里有4个必须给与的参数: 14 | //爬取的初始url,爬取深度,解析函数,储存函数 15 | //basic.InitConfig() //配置Config需要先初始化 16 | //basic.Config.HttpHeader["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) " + 17 | // "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36" 18 | controller := controller.NewController("http://z3lion.com/", 1, Parser, Store) 19 | //启动爬虫 20 | controller.Go() 21 | } 22 | 23 | //解析函数定义,声明必须相同,下同 24 | func Parser(httpRes *http.Response) ([]string, []basic.Item) { 25 | //两个需要返回的列表 26 | linklist := make([]string, 0) //下一步需要请求的链接 27 | itemlist := make([]basic.Item, 0) //保存的数据类型为 map[string]interface{} 28 | 29 | //自定义部分 30 | //抓取所有链接 31 | doc, _ := goquery.NewDocumentFromResponse(httpRes) 32 | doc.Find("a").Each(func(i int, s *goquery.Selection) { 33 | link, exits := s.Attr("href") 34 | if exits { 35 | link = basic.CheckLink(link) 36 | if link != "" { 37 | linklist = append(linklist, link) 38 | } 39 | } 40 | }) 41 | //保存每个页面的标题 42 | title := strings.TrimSpace(doc.Find("head title").Text()) 43 | if title != "" { 44 | item := 
make(map[string]interface{}) 45 | item["标题"] = title 46 | itemlist = append(itemlist, item) 47 | } 48 | 49 | return linklist, itemlist 50 | } 51 | 52 | //储存函数定义 53 | func Store(item basic.Item) { 54 | //这里只打印抓取的数据 55 | fmt.Println(item) 56 | } 57 | -------------------------------------------------------------------------------- /middleware/channel.go: -------------------------------------------------------------------------------- 1 | package middleware 2 | 3 | import ( 4 | "basic" 5 | ) 6 | 7 | type Channel struct { 8 | reqchan chan basic.Request 9 | reschan chan basic.Response 10 | linkchan chan basic.Link 11 | itemchan chan basic.Item 12 | } 13 | 14 | func NewChannel() *Channel { 15 | return &Channel{ 16 | make(chan basic.Request, basic.Config.ReqChanLength), 17 | make(chan basic.Response, basic.Config.ResChanLength), 18 | make(chan basic.Link, basic.Config.LinkChanLength), 19 | make(chan basic.Item, basic.Config.ItemChanLength), 20 | } 21 | } 22 | 23 | func (self *Channel) ReqChan() chan basic.Request { 24 | return self.reqchan 25 | } 26 | 27 | func (self *Channel) ResChan() chan basic.Response { 28 | return self.reschan 29 | } 30 | 31 | func (self *Channel) LinkChan() chan basic.Link { 32 | return self.linkchan 33 | } 34 | 35 | func (self *Channel) ItemChan() chan basic.Item { 36 | return self.itemchan 37 | } 38 | -------------------------------------------------------------------------------- /middleware/workpool.go: -------------------------------------------------------------------------------- 1 | package middleware 2 | 3 | import () 4 | 5 | type WorkPool struct { 6 | } 7 | 8 | func NewWorkPool() *WorkPool { 9 | return &WorkPool{} 10 | } 11 | 12 | //以num个goroutine执行函数 13 | func (self *WorkPool) Pool(num int, work func()) { 14 | for w := 0; w < num; w++ { 15 | go work() 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /processor/processor.go: 
-------------------------------------------------------------------------------- 1 | package processor 2 | 3 | import ( 4 | "basic" 5 | "net/http" 6 | ) 7 | 8 | type GenProcessor interface { 9 | DealLink(link basic.Link) (*basic.Request, bool) 10 | DealItem(item basic.Item, Store Store) 11 | GetVurl() map[string]bool 12 | } 13 | 14 | type Store func(item basic.Item) 15 | 16 | type Processor struct { 17 | Vurl map[string]bool //已访问过的url字典 18 | } 19 | 20 | func NewProcessor() *Processor { 21 | return &Processor{make(map[string]bool)} 22 | } 23 | 24 | func (self *Processor) DealLink(link basic.Link) (*basic.Request, bool) { 25 | //url判重 26 | if _, visited := self.Vurl[link.GetLink()]; visited { 27 | return nil, false 28 | } 29 | self.Vurl[link.GetLink()] = true //放入字典 30 | httpReq, err := http.NewRequest(basic.Config.RequestMethod, link.GetLink(), nil) 31 | basic.Check(err) 32 | request := basic.NewRequest(httpReq, link.GetIndex()) //转化为构造请求 33 | return request, true 34 | } 35 | 36 | func (self *Processor) DealItem(item basic.Item, Store Store) { 37 | Store(item) 38 | } 39 | 40 | func (self *Processor) GetVurl() map[string]bool { 41 | return self.Vurl 42 | } 43 | --------------------------------------------------------------------------------