├── pholcus ├── node │ └── node.go ├── status │ └── status.go ├── gui │ ├── rsrc.syso │ ├── guimain.manifest │ ├── logview.go │ ├── menu.go │ ├── guispider.go │ └── guimain.go ├── crawler │ ├── crawler.go │ └── crawl.go ├── pholcus.go ├── keeper │ └── login.go └── socket │ ├── PhoSocketTest.go │ └── PhoSocket.go ├── doc ├── ICON.ico ├── project.png └── GUI编译命令.txt ├── main.go ├── reporter ├── reporter.go └── report.go ├── spiders ├── spider │ ├── spiderlist.go │ ├── rss.go │ ├── common.go │ └── spider.go ├── kaola.go ├── shunfenghaitao.go ├── baidusearch.go ├── readme.md ├── jdsearch.go ├── miyabaobei.go ├── alibaba.go ├── taobaosearch.go ├── googlesearch.go ├── hollandandbarrett.go ├── baidunews.go └── wangyi.go ├── README.md ├── downloader ├── downloader.go ├── context │ ├── response.go │ └── request.go └── downloader_http.go ├── pipeline ├── collector │ ├── datacell.go │ ├── output.go │ ├── docker.go │ ├── collector.go │ └── output_lib.go └── pipeline.go ├── main.manifest ├── common ├── deduplicate │ └── deduplicate.go ├── mlog │ ├── mlog.go │ ├── strace.go │ └── filelog.go ├── queue │ └── queue.go ├── etc_config │ └── etc_config.go ├── util │ └── util.go └── config │ └── config.go ├── scheduler ├── scheduler.go └── src_manage.go └── config └── config.go /pholcus/node/node.go: -------------------------------------------------------------------------------- 1 | package node 2 | 3 | import () 4 | -------------------------------------------------------------------------------- /pholcus/status/status.go: -------------------------------------------------------------------------------- 1 | package status 2 | 3 | import () 4 | -------------------------------------------------------------------------------- /doc/ICON.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lichanglu/pholcus/HEAD/doc/ICON.ico -------------------------------------------------------------------------------- /doc/project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lichanglu/pholcus/HEAD/doc/project.png -------------------------------------------------------------------------------- /pholcus/gui/rsrc.syso: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lichanglu/pholcus/HEAD/pholcus/gui/rsrc.syso -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/henrylee2cn/pholcus/pholcus/gui" 5 | ) 6 | 7 | func main() { 8 | gui.Run() 9 | } 10 | -------------------------------------------------------------------------------- /doc/GUI编译命令.txt: -------------------------------------------------------------------------------- 1 | go get github.com/akavel/rsrc 2 | rsrc -manifest test.manifest -o rsrc.syso 3 | 4 | go build 5 | 6 | 7 | go build -ldflags="-H windowsgui" -------------------------------------------------------------------------------- /reporter/reporter.go: -------------------------------------------------------------------------------- 1 | package reporter 2 | 3 | type Reporter interface { 4 | Printf(format string, v ...interface{}) 5 | Println(v ...interface{}) 6 | send(string) 7 | } 8 | -------------------------------------------------------------------------------- /pholcus/crawler/crawler.go: 
--------------------------------------------------------------------------------
 1 | package crawler
 2 | 
 3 | import (
 4 | 	"github.com/henrylee2cn/pholcus/spiders/spider"
 5 | )
 6 | 
 7 | type Crawler interface {
 8 | 	Init(*spider.Spider) Crawler
 9 | 	Start()
10 | }
11 | 
--------------------------------------------------------------------------------
/spiders/spider/spiderlist.go:
--------------------------------------------------------------------------------
 1 | package spider
 2 | 
 3 | type Spiders []*Spider
 4 | 
 5 | var SpiderList = Spiders{}
 6 | 
 7 | func (Spiders) Init() {
 8 | 	SpiderList = Spiders{}
 9 | }
10 | 
11 | func (Spiders) Add(sp *Spider) {
12 | 	SpiderList = append(SpiderList, sp)
13 | }
14 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## pholcus
 2 | Pholcus (幽灵蛛) is a crawler framework written in Go (GUI included). It offers elegant crawl rules, controllable high concurrency, arbitrary batch tasks, multiple output formats and plenty of demos, and its design allows for distributed deployment.
 3 | 
 4 | ![image](https://github.com/henrylee2cn/pholcus/blob/master/doc/project.png)
 5 | 
 6 | 
 7 | 
 8 | **Install Pholcus**
 9 | ```
10 | go get github.com/henrylee2cn/pholcus
11 | ```
12 | 
--------------------------------------------------------------------------------
/downloader/downloader.go:
--------------------------------------------------------------------------------
 1 | package downloader
 2 | 
 3 | import (
 4 | 	"github.com/henrylee2cn/pholcus/downloader/context"
 5 | )
 6 | 
 7 | // The Downloader interface.
 8 | // You can implement the interface by implementing the function Download.
 9 | // Download must return a *context.Response carrying the result fetched for the given Request.
10 | type Downloader interface {
11 | 	Download(req *context.Request) *context.Response
12 | }
13 | 
--------------------------------------------------------------------------------
/pipeline/collector/datacell.go:
--------------------------------------------------------------------------------
 1 | // 数据存储单元
 2 | package collector
 3 | 
 4 | type DataCell map[string]interface{}
 5 | 
 6 | func NewDataCell(ruleName string, data map[string]string, url string, parentUrl string, downloadTime string) DataCell {
 7 | 	return DataCell{
 8 | 		"RuleName":     ruleName,  //规定Data中的key
 9 | 		"Data":         data,      //数据存储,key须与Rule的Fields保持一致
10 | 		"Url":          url,       //用于索引
11 | 		"ParentUrl":    parentUrl, //DataCell的上级url
12 | 		"DownloadTime": downloadTime,
13 | 	}
14 | }
15 | 
--------------------------------------------------------------------------------
/main.manifest:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | 
 5 | 
 6 | 
 7 | 
 8 | 
 9 | 
10 | 
--------------------------------------------------------------------------------
/pholcus/gui/guimain.manifest:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | 
 5 | 
 6 | 
 7 | 
 8 | 
 9 | 
10 | 
--------------------------------------------------------------------------------
/reporter/report.go:
--------------------------------------------------------------------------------
 1 | package reporter
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | )
 7 | 
 8 | type Report struct{}
 9 | 
10 | func (self *Report) send(str string) {
11 | 	if true { // TODO: 占位实现,远程上报尚未实现
12 | 
13 | 	}
14 | }
15 | 
16 | func (self *Report) Printf(format string, v ...interface{}) {
17 | 	log.Printf(format, v...)
18 | 	self.send(fmt.Sprintf(format, v...))
19 | }
20 | 
21 | func (self *Report) Println(v ...interface{}) {
22 | 	log.Println(v...)
23 | self.send(fmt.Sprintln(v...)) 24 | } 25 | 26 | var Log Reporter 27 | 28 | func init() { 29 | Log = &Report{} 30 | } 31 | -------------------------------------------------------------------------------- /common/deduplicate/deduplicate.go: -------------------------------------------------------------------------------- 1 | package deduplicate 2 | 3 | import ( 4 | "github.com/henrylee2cn/pholcus/common/util" 5 | ) 6 | 7 | type Deduplicate interface { 8 | // 采集非重复样本并返回对比结果,重复为true 9 | Compare(obj interface{}) bool 10 | } 11 | 12 | type Deduplication struct { 13 | sampling map[string]bool 14 | } 15 | 16 | func New() Deduplicate { 17 | return &Deduplication{ 18 | sampling: make(map[string]bool), 19 | } 20 | } 21 | 22 | // 对比是否已存在,不存在则采样 23 | func (self *Deduplication) Compare(obj interface{}) bool { 24 | s := util.MakeUnique(obj) 25 | if !self.sampling[s] { 26 | self.sampling[s] = true 27 | return false 28 | } 29 | return true 30 | } 31 | -------------------------------------------------------------------------------- /common/mlog/mlog.go: -------------------------------------------------------------------------------- 1 | // Package mlog implements log operations. 2 | package mlog 3 | 4 | import ( 5 | "runtime" 6 | ) 7 | 8 | // The plog is a public function combiation for other log objects. 9 | type plog struct { 10 | isopen bool 11 | } 12 | 13 | // GetCaller returns file name and line number at the third step of runtime. 14 | func (*plog) getCaller() (string, int) { 15 | _, file, line, ok := runtime.Caller(3) 16 | if !ok { 17 | file = "???" 18 | line = 0 19 | } 20 | return file, line 21 | } 22 | 23 | // Open makes log open. 24 | func (this *plog) Open() { 25 | this.isopen = true 26 | } 27 | 28 | // Close makes log close. 29 | func (this *plog) Close() { 30 | this.isopen = false 31 | } 32 | -------------------------------------------------------------------------------- /pipeline/collector/output.go: -------------------------------------------------------------------------------- 1 | //数据输出管理 2 | package collector 3 | 4 | import ( 5 | "github.com/henrylee2cn/pholcus/config" 6 | "log" 7 | // "fmt" 8 | "time" 9 | ) 10 | 11 | func (self *Collector) Output(dataIndex int) { 12 | defer func() { 13 | err := recover() 14 | if err != nil { 15 | log.Printf("输出时出错!\n") 16 | } else { 17 | // 正常情况下回收内存 18 | self.DockerQueue.Recover(dataIndex) 19 | } 20 | }() 21 | 22 | dataLen := len(self.DockerQueue.Dockers[dataIndex]) 23 | if dataLen == 0 { 24 | // log.Println("没有抓到结果!!!") 25 | return 26 | } 27 | 28 | // 输出数据统计 29 | self.setSum(dataLen) 30 | 31 | // 选择执行输出 32 | switch self.outType { 33 | case "excel": 34 | self.excel(dataIndex) 35 | case "csv": 36 | self.csv(dataIndex) 37 | case "mongoDB": 38 | self.mgo(dataIndex) 39 | default: 40 | self.excel(dataIndex) 41 | } 42 | log.Printf("[任务:%v | 关键词:%v | 批次:%v] 输出 %v 条数据,用时 %.5f 分钟!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), self.outCount[1]+1, dataLen, time.Since(config.StartTime).Minutes()) 43 | } 44 | -------------------------------------------------------------------------------- /common/mlog/strace.go: -------------------------------------------------------------------------------- 1 | package mlog 2 | 3 | import ( 4 | "log" 5 | "os" 6 | ) 7 | 8 | // Strace represents an active object that strace the processing of spider. 9 | // The strace info is output to os.Stderr. 10 | // The loginst is an point of logger in Std-Packages. 11 | // The isopen is a label represents whether open strace or not. 
12 | type strace struct { 13 | plog 14 | 15 | loginst *log.Logger 16 | } 17 | 18 | var pstrace *strace 19 | 20 | // StraceInst get the singleton strace object. 21 | func StraceInst() *strace { 22 | if pstrace == nil { 23 | pstrace = newStrace() 24 | } 25 | return pstrace 26 | } 27 | 28 | // The newStrace returns initialized strace object. 29 | func newStrace() *strace { 30 | pstrace := &strace{} 31 | pstrace.loginst = log.New(os.Stderr, "", log.LstdFlags) 32 | pstrace.isopen = true 33 | return pstrace 34 | } 35 | 36 | // Println output the str to os.Stderr. 37 | func (this *strace) Println(str string) { 38 | if !this.isopen { 39 | return 40 | } 41 | this.loginst.Printf("%s\n", str) 42 | } 43 | -------------------------------------------------------------------------------- /common/queue/queue.go: -------------------------------------------------------------------------------- 1 | package queue 2 | 3 | type Queue struct { 4 | PoolSize int 5 | PoolChan chan interface{} 6 | } 7 | 8 | func NewQueue(size int) *Queue { 9 | return &Queue{ 10 | PoolSize: size, 11 | PoolChan: make(chan interface{}, size), 12 | } 13 | } 14 | 15 | func (this *Queue) Init(size int) *Queue { 16 | this.PoolSize = size 17 | this.PoolChan = make(chan interface{}, size) 18 | return this 19 | } 20 | 21 | func (this *Queue) Push(i interface{}) bool { 22 | if len(this.PoolChan) == this.PoolSize { 23 | return false 24 | } 25 | this.PoolChan <- i 26 | return true 27 | } 28 | 29 | func (this *Queue) PushSlice(s []interface{}) { 30 | for _, i := range s { 31 | this.Push(i) 32 | } 33 | } 34 | 35 | func (this *Queue) Pull() interface{} { 36 | return <-this.PoolChan 37 | } 38 | 39 | // 二次使用Queue实例时,根据容量需求进行高效转换 40 | func (this *Queue) Exchange(num int) (add int) { 41 | last := len(this.PoolChan) 42 | 43 | if last >= num { 44 | add = int(0) 45 | return 46 | } 47 | 48 | if this.PoolSize < num { 49 | pool := []interface{}{} 50 | for i := 0; i < last; i++ { 51 | pool = append(pool, <-this.PoolChan) 52 | } 53 | // 重新定义、赋值 54 | this.Init(num).PushSlice(pool) 55 | } 56 | 57 | add = num - last 58 | return 59 | } 60 | -------------------------------------------------------------------------------- /scheduler/scheduler.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "github.com/henrylee2cn/pholcus/common/deduplicate" 5 | "github.com/henrylee2cn/pholcus/downloader/context" 6 | ) 7 | 8 | type Scheduler interface { 9 | // 采集非重复url并返回对比结果,重复为true 10 | Compare(string) bool 11 | 12 | SrcManager 13 | // 以下为具体方法列表 14 | // 存入 15 | // Push(*context.Request) 16 | // 取出 17 | // Use(string) *context.Request 18 | // 释放一个资源 19 | // Free() 20 | // 资源队列是否闲置 21 | // IsEmpty(string) bool 22 | // IsAllEmpty() bool 23 | 24 | } 25 | 26 | type scheduler struct { 27 | *SrcManage 28 | *deduplicate.Deduplication 29 | } 30 | 31 | func New(capacity uint) Scheduler { 32 | return &scheduler{ 33 | SrcManage: NewSrcManage(capacity).(*SrcManage), 34 | Deduplication: deduplicate.New().(*deduplicate.Deduplication), 35 | } 36 | } 37 | 38 | func (self *scheduler) Push(req *context.Request) { 39 | is := self.Compare(req.GetUrl()) 40 | // 有重复则返回 41 | if is { 42 | return 43 | } 44 | self.SrcManage.Push(req) 45 | } 46 | 47 | func (self *scheduler) Compare(url string) bool { 48 | return self.Deduplication.Compare(url) 49 | } 50 | 51 | // 定义全局调度 52 | var Self Scheduler 53 | 54 | func Init(capacity uint) Scheduler { 55 | Self = New(capacity) 56 | return Self 57 | } 58 | 
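Taken together, `Deduplication.Compare` and `SrcManage.Push` give `scheduler.Push` its dedup-then-enqueue behavior: a URL is sampled on first sight and silently dropped on every later push. A minimal, self-contained sketch of that pattern (the `miniScheduler` type and the URLs in `main` are invented here for illustration; they are not part of the repository):

```go
package main

import "fmt"

// miniScheduler 仅作演示:复刻 scheduler “先查重、后入队”的核心模式。
type miniScheduler struct {
	sampling map[string]bool // 已采样url的指纹集合(对应 Deduplication.sampling)
	queue    []string        // 简化版资源队列(对应 SrcManage.queue)
}

// Compare 与 Deduplication.Compare 语义一致:不存在则采样并返回 false,重复返回 true。
func (s *miniScheduler) Compare(url string) bool {
	if !s.sampling[url] {
		s.sampling[url] = true
		return false
	}
	return true
}

// Push 与 scheduler.Push 相同:有重复则直接返回。
func (s *miniScheduler) Push(url string) {
	if s.Compare(url) {
		return
	}
	s.queue = append(s.queue, url)
}

func main() {
	s := &miniScheduler{sampling: make(map[string]bool)}
	s.Push("http://www.baidu.com")
	s.Push("http://www.baidu.com") // 重复,被丢弃
	s.Push("http://news.163.com/rank")
	fmt.Println(len(s.queue)) // 输出:2
}
```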
--------------------------------------------------------------------------------
/pholcus/pholcus.go:
--------------------------------------------------------------------------------
 1 | package pholcus
 2 | 
 3 | import (
 4 | 	"github.com/henrylee2cn/pholcus/downloader/context"
 5 | 	// "github.com/henrylee2cn/pholcus/pholcus/node"
 6 | 	// "github.com/henrylee2cn/pholcus/pholcus/status"
 7 | 	"github.com/henrylee2cn/pholcus/scheduler"
 8 | 	"sync"
 9 | )
10 | 
11 | type Pholcus struct {
12 | 	// *node.Node
13 | 	// *Status
14 | 	isOutsource bool
15 | }
16 | 
17 | var pushMutex sync.Mutex
18 | 
19 | func (self *Pholcus) Push(req *context.Request) {
20 | 	pushMutex.Lock()
21 | 	defer pushMutex.Unlock()
22 | 	if !self.TryOutsource(req) {
23 | 		scheduler.Self.Push(req)
24 | 	}
25 | }
26 | 
27 | func (self *Pholcus) TryOutsource(req *context.Request) bool {
28 | 	if self.IsOutsource() && req.TryOutsource() {
29 | 		self.Send(*req)
30 | 		return true
31 | 	}
32 | 	return false
33 | }
34 | 
35 | func (self *Pholcus) SetOutsource(serve bool) {
36 | 	self.isOutsource = serve
37 | }
38 | 
39 | func (self *Pholcus) IsOutsource() bool {
40 | 	return self.isOutsource
41 | }
42 | 
43 | // Send 尚未实现:分布式模式下将请求外包给其他节点
44 | func (self *Pholcus) Send(req context.Request) {
45 | 
46 | }
47 | 
48 | func (self *Pholcus) Receive(req context.Request) {
49 | 	scheduler.Self.Push(&req)
50 | }
51 | 
52 | // 初始化
53 | var Self *Pholcus
54 | 
55 | func init() {
56 | 	Self = &Pholcus{}
57 | }
58 | 
--------------------------------------------------------------------------------
/common/etc_config/etc_config.go:
--------------------------------------------------------------------------------
 1 | // Package etc_config implements config initialization of one spider.
 2 | package etc_config
 3 | 
 4 | import (
 5 | 	"github.com/henrylee2cn/pholcus/common/config"
 6 | 	"github.com/henrylee2cn/pholcus/common/util"
 7 | 	"os"
 8 | )
 9 | 
10 | // Configpath gets the default config path like "GOPATH/etc/main.conf".
11 | func configpath() string {
12 | 	//wd, _ := os.Getwd()
13 | 	wd := os.Getenv("GOPATH")
14 | 	if wd == "" {
15 | 		panic("GOPATH is not set in env.")
16 | 	}
17 | 	logpath := wd + "/etc/"
18 | 	filename := "main.conf"
19 | 	err := os.MkdirAll(logpath, 0755)
20 | 	if err != nil {
21 | 		panic("logpath error : " + logpath + "\n")
22 | 	}
23 | 	return logpath + filename
24 | }
25 | 
26 | // Config is a config singleton object for one spider.
27 | var conf *config.Config
28 | var path string
29 | 
30 | // StartConf is used in Spider for initialization at first time.
31 | func StartConf(configFilePath string) *config.Config {
32 | 	if configFilePath != "" && !util.IsFileExists(configFilePath) {
33 | 		panic("config path is not valid: " + configFilePath)
34 | 	}
35 | 
36 | 	path = configFilePath
37 | 	return Conf()
38 | }
39 | 
40 | // Conf gets the singleton instance of Config.
41 | func Conf() *config.Config {
42 | 	if conf == nil {
43 | 		if path == "" {
44 | 			path = configpath()
45 | 		}
46 | 		conf = config.NewConfig().Load(path)
47 | 	}
48 | 	return conf
49 | }
50 | 
--------------------------------------------------------------------------------
/pipeline/pipeline.go:
--------------------------------------------------------------------------------
 1 | // 数据收集
 2 | package pipeline
 3 | 
 4 | import (
 5 | 	"github.com/henrylee2cn/pholcus/common/deduplicate"
 6 | 	"github.com/henrylee2cn/pholcus/pipeline/collector"
 7 | 	// "github.com/henrylee2cn/pholcus/reporter"
 8 | 	"github.com/henrylee2cn/pholcus/spiders/spider"
 9 | )
10 | 
11 | type Pipeline interface {
12 | 	Start()
13 | 	//接收控制通知
14 | 	CtrlR()
15 | 	//发送控制通知
16 | 	CtrlS()
17 | 	// 收集数据单元
18 | 	Collect(ruleName string, data map[string]string, url string, parentUrl string, downloadTime string)
19 | 	// 对比Url的fingerprint,返回是否有重复
20 | 	Deduplicate(string) bool
21 | 	// 重置
22 | 	Init(*spider.Spider)
23 | }
24 | 
25 | type pipeline struct {
26 | 	*collector.Collector
27 | 	*deduplicate.Deduplication
28 | }
29 | 
30 | func New() Pipeline {
31 | 	return &pipeline{
32 | 		Collector:     collector.NewCollector(),
33 | 		Deduplication: deduplicate.New().(*deduplicate.Deduplication),
34 | 	}
35 | }
36 | 
37 | func (self *pipeline) Collect(ruleName string, data map[string]string, url string, parentUrl string, downloadTime string) {
38 | 	dataCell := collector.NewDataCell(ruleName, data, url, parentUrl, downloadTime)
39 | 	self.Collector.Collect(dataCell)
40 | }
41 | 
42 | func (self *pipeline) Init(sp *spider.Spider) {
43 | 	self.Collector.Init(sp)
44 | }
45 | 
46 | func (self *pipeline) Deduplicate(s string) bool {
47 | 	return self.Deduplication.Compare(s)
48 | }
49 | 
50 | func (self *pipeline) Start() {
51 | 	go self.Collector.Manage()
52 | 	// reporter.Log.Println("**************开启输出管道************")
53 | }
54 | 
--------------------------------------------------------------------------------
/config/config.go:
--------------------------------------------------------------------------------
 1 | package config
 2 | 
 3 | import (
 4 | 	"github.com/henrylee2cn/pholcus/common/queue"
 5 | 	"time"
 6 | )
 7 | 
 8 | const (
 9 | 	//软件名
10 | 	APP_NAME = "幽灵蛛数据采集 V0.1 (by Henry)"
11 | 	// 蜘蛛池容量
12 | 	CRAWLER_CAP = 50
13 | 
14 | 	// 收集器容量
15 | 	DATA_CAP = 2 << 14 // 即32768
16 | 
17 | 	// mongodb数据库服务器
18 | 	DB_URL = "127.0.0.1:27017"
19 | 
20 | 	//mongodb数据库名称
21 | 	DB_NAME = "temp-collection-tentinet"
22 | 
23 | 	//mongodb数据库集合
24 | 	DB_COLLECTION = "news"
25 | )
26 | 
27 | type Report struct {
28 | 	SpiderName string
29 | 	Keyword    string
30 | 	Num        string
31 | 	Time       string
32 | }
33 | 
34 | var (
35 | 	// 点击开始按钮的时间点
36 | 	StartTime time.Time
37 | 	// 小结报告通道
38 | 	ReportChan chan *Report
39 | 	// 请求页面计数
40 | 	ReqSum uint
41 | 	// 创建默认爬行队列
42 | 	CrawlerQueue *queue.Queue
43 | 
44 | 	ThreadNum uint
45 | 
46 | 	OutType string
47 | 
48 | 	// 分段转储容器容量
49 | 	DOCKER_CAP uint
50 | 
51 | 	// 分段输出池容量,最小为2
52 | 	DOCKER_QUEUE_CAP uint
53 | )
54 | 
55 | func init() {
56 | 
57 | 	ReportChan = make(chan *Report)
58 | 
59 | 	CrawlerQueue = queue.NewQueue(0)
60 | 
61 | 	InitDockerParam(50000)
62 | 
63 | }
64 | 
65 | func InitDockerParam(dockercap uint) {
66 | 	DOCKER_CAP = dockercap
67 | 	switch {
68 | 	case dockercap <= 10:
69 | 		DOCKER_QUEUE_CAP = 500
70 | 	case dockercap <= 500:
71 | 		DOCKER_QUEUE_CAP = 200
72 | 	case dockercap <= 1000:
73 | 		DOCKER_QUEUE_CAP = 100
74 | 	case dockercap <= 10000:
75 | 		DOCKER_QUEUE_CAP = 50
76 | 	case dockercap <= 100000:
77 | 		DOCKER_QUEUE_CAP = 10
78 | 	default:
79 | 		DOCKER_QUEUE_CAP = 4
80 | 	}
81 | }
82 | 
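For a concrete feel of the sizing above: `DATA_CAP = 2 << 14` is 32768 cells, and `InitDockerParam` trades per-batch size against queue depth. A small, self-contained sketch (the `dockerQueueCap` helper is an invented stand-in that mirrors the `switch` in `InitDockerParam`):

```go
package main

import "fmt"

// dockerQueueCap 复刻 config.InitDockerParam 中的 switch:
// 单个docker(分段转储容器)越大,输出池中保留的docker越少。
func dockerQueueCap(dockercap uint) uint {
	switch {
	case dockercap <= 10:
		return 500
	case dockercap <= 500:
		return 200
	case dockercap <= 1000:
		return 100
	case dockercap <= 10000:
		return 50
	case dockercap <= 100000:
		return 10
	default:
		return 4
	}
}

func main() {
	for _, c := range []uint{10, 500, 10000, 50000, 200000} {
		fmt.Printf("DOCKER_CAP=%6d -> DOCKER_QUEUE_CAP=%d\n", c, dockerQueueCap(c))
	}
}
```

With the default `InitDockerParam(50000)`, this yields `DOCKER_QUEUE_CAP = 10`, i.e. up to 50000 × 10 = 500000 cells can be buffered across the docker queue before output.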
--------------------------------------------------------------------------------
/pipeline/collector/docker.go:
--------------------------------------------------------------------------------
 1 | // 负责从收集通道接受数据并临时存储
 2 | package collector
 3 | 
 4 | import (
 5 | 	"github.com/henrylee2cn/pholcus/config"
 6 | 	"sync"
 7 | 	"time"
 8 | )
 9 | 
10 | type DockerQueue struct {
11 | 	Curr    int
12 | 	Cap     uint
13 | 	Using   map[int]bool
14 | 	Dockers [][]DataCell
15 | }
16 | 
17 | func NewDocker() []DataCell {
18 | 	return make([]DataCell, 0, config.DOCKER_CAP)
19 | }
20 | 
21 | func NewDockerQueue() *DockerQueue {
22 | 	var queueCap uint = config.DOCKER_QUEUE_CAP
23 | 	if config.DOCKER_QUEUE_CAP < 2 {
24 | 		queueCap = 2
25 | 	}
26 | 
27 | 	dockerQueue := &DockerQueue{
28 | 		Curr:    0,
29 | 		Cap:     queueCap,
30 | 		Using:   make(map[int]bool, queueCap),
31 | 		Dockers: make([][]DataCell, 0),
32 | 	}
33 | 
34 | 	dockerQueue.Using[0] = true
35 | 
36 | 	dockerQueue.Dockers = append(dockerQueue.Dockers, NewDocker())
37 | 
38 | 	return dockerQueue
39 | }
40 | 
41 | var ChangeMutex sync.Mutex
42 | 
43 | func (self *DockerQueue) Change() {
44 | 	ChangeMutex.Lock()
45 | 	defer ChangeMutex.Unlock()
46 | getLabel:
47 | 	for {
48 | 		for k, v := range self.Using {
49 | 			if !v {
50 | 				self.Curr = k
51 | 				self.Using[k] = true
52 | 				break getLabel
53 | 			}
54 | 		}
55 | 		self.AutoAdd()
56 | 		time.Sleep(5e8) // 0.5秒
57 | 	}
58 | }
59 | 
60 | func (self *DockerQueue) Recover(index int) {
61 | 	self.Dockers[index] = NewDocker()
62 | 	self.Using[index] = false
63 | }
64 | 
65 | // 根据情况自动动态增加Docker
66 | func (self *DockerQueue) AutoAdd() {
67 | 	count := len(self.Dockers)
68 | 	if uint(count) < self.Cap {
69 | 		self.Dockers = append(self.Dockers, NewDocker())
70 | 		self.Using[count] = false
71 | 	}
72 | }
--------------------------------------------------------------------------------
/spiders/spider/rss.go:
--------------------------------------------------------------------------------
 1 | package spider
 2 | 
 3 | import (
 4 | 	"github.com/henrylee2cn/pholcus/reporter"
 5 | 	"math"
 6 | 	"time"
 7 | )
 8 | 
 9 | type RSS struct {
10 | 	// RSS爬虫重新访问的5个级别(分钟)
11 | 	Level []int
12 | 	//RSS源的权重, self.T[src] {
39 | 			if k == 0 {
40 | 				k = 1
41 | 			}
42 | 			reporter.Log.Printf("************************ ……当前RSS <%s> 的更新周期为 %v 分钟……************************", src, self.Level[k-1])
43 | 			time.Sleep(time.Minute * time.Duration(self.Level[k-1]))
44 | 			break
45 | 		}
46 | 	}
47 | 	if self.Flag[src] {
48 | 		self.T[src] = int(math.Floor(float64(self.T[src]) / 1.2))
49 | 		if self.T[src] < self.Level[0] {
50 | 			self.T[src] = self.Level[0]
51 | 		}
52 | 	} else {
53 | 		self.T[src] = int(math.Floor(float64(self.T[src]) * 1.2))
54 | 		if self.T[src] > self.Level[len(self.Level)-1] {
55 | 			self.T[src] = self.Level[len(self.Level)-1]
56 | 		}
57 | 	}
58 | 	self.Flag[src] = false
59 | }
60 | 
61 | func (self *RSS) Updata(src string) {
62 | 	self.Flag[src] = true
63 | }
64 | 
--------------------------------------------------------------------------------
/scheduler/src_manage.go:
--------------------------------------------------------------------------------
 1 | package scheduler
 2 | 
 3 | import (
 4 | 	"github.com/henrylee2cn/pholcus/downloader/context"
 5 | 	// "github.com/henrylee2cn/pholcus/reporter"
 6 | )
 7 | 
 8 | // SrcManager is the interface that a request-source management object must implement.
 9 | type SrcManager interface {
10 | 	// 存入
11 | 	Push(*context.Request)
12 | 	// 取出
13 | 	Use(int) *context.Request
14 | 	// 释放一个资源
15 | 	Free()
16 | 	// 资源队列是否闲置
17 | 	IsEmpty(int) bool
18 | 	IsAllEmpty() bool
19 | }
20 | 
21 | type SrcManage struct {
22 | 	count chan bool
23 | 	queue map[int][]*context.Request
24 | }
25 | 
26 | func NewSrcManage(capacity uint) SrcManager {
27 | 	return &SrcManage{
28 | 		count: make(chan bool, int(capacity)),
29 | 		queue: make(map[int][]*context.Request),
30 | 	}
31 | }
32 | 
33 | func (self *SrcManage) Push(req *context.Request) {
34 | 	if spiderId, ok := req.GetSpiderId(); ok {
35 | 		self.queue[spiderId] = append(self.queue[spiderId], req)
36 | 	}
37 | }
38 | 
39 | func (self *SrcManage) Use(spiderId int) *context.Request {
40 | 	if len(self.queue[spiderId]) == 0 {
41 | 		return nil
42 | 	}
43 | 	req := self.queue[spiderId][0]
44 | 	self.queue[spiderId] = self.queue[spiderId][1:]
45 | 	self.count <- true
46 | 	return req
47 | }
48 | 
49 | func (self *SrcManage) Free() {
50 | 	<-self.count
51 | }
52 | 
53 | func (self *SrcManage) IsEmpty(spiderId int) bool {
54 | 	if len(self.queue[spiderId]) == 0 {
55 | 		return true
56 | 	}
57 | 	return false
58 | }
59 | 
60 | func (self *SrcManage) IsAllEmpty() bool {
61 | 	if len(self.count) == 0 {
62 | 		for _, v := range self.queue {
63 | 			if len(v) != 0 {
64 | 				return false
65 | 			}
66 | 		}
67 | 		return true
68 | 	}
69 | 	return false
70 | }
71 | 
--------------------------------------------------------------------------------
/spiders/spider/common.go:
--------------------------------------------------------------------------------
 1 | package spider
 2 | 
 3 | import (
 4 | 	// "bytes"
 5 | 	"code.google.com/p/mahonia"
 6 | 	// "golang.org/x/text/encoding/simplifiedchinese"
 7 | 	// "golang.org/x/text/transform"
 8 | 	// "io/ioutil"
 9 | 	// "github.com/henrylee2cn/pholcus/downloader/context"
10 | 	"regexp"
11 | 	"strings"
12 | )
13 | 
14 | func CleanHtml(str string, depth int) string {
15 | 	if depth > 0 {
16 | 		//将HTML标签全转换成小写
17 | 		re, _ := regexp.Compile("\\<[\\S\\s]+?\\>")
18 | 		str = re.ReplaceAllStringFunc(str, strings.ToLower)
19 | 	}
20 | 	if depth > 1 {
21 | 		//去除STYLE
22 | 		re, _ := regexp.Compile("\\<style[\\S\\s]+?\\</style\\>")
23 | 		str = re.ReplaceAllString(str, "")
24 | 	}
25 | 	if depth > 2 {
26 | 		//去除SCRIPT
27 | 		re, _ := regexp.Compile("\\<script[\\S\\s]+?\\</script\\>")
28 | 		str = re.ReplaceAllString(str, "")
29 | 	}
30 | 	if depth > 3 {
31 | 		//去除所有尖括号内的HTML代码,并换成换行符
32 | 		re, _ := regexp.Compile("\\<[\\S\\s]+?\\>")
33 | 		str = re.ReplaceAllString(str, "\n")
34 | 	}
35 | 	if depth > 4 {
36 | 		//去除连续的换行符
37 | 		re, _ := regexp.Compile("\\s{2,}")
38 | 		str = re.ReplaceAllString(str, "\n")
39 | 	}
40 | 	return str
41 | }
42 | 
43 | // func Encode(src string) (dst string) {
44 | // 	data, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(src)), simplifiedchinese.GBK.NewEncoder()))
45 | // 	if err == nil {
46 | // 		dst = string(data)
47 | // 	}
48 | // 	return
49 | // }
50 | // func Decode(src string) (dst string) {
51 | // 	data, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(src)), simplifiedchinese.GBK.NewDecoder()))
52 | // 	if err == nil {
53 | // 		dst = string(data)
54 | // 	}
55 | // 	return
56 | // }
57 | 
58 | func DecodeString(src, charset string) string {
59 | 	return mahonia.NewDecoder(charset).ConvertString(src)
60 | }
61 | 
62 | func EncodeString(src, charset string) string {
63 | 	return mahonia.NewEncoder(charset).ConvertString(src)
64 | }
65 | 
66 | func ConvertToString(src string, srcCode string, tagCode string) string {
67 | 	srcCoder := mahonia.NewDecoder(srcCode)
68 | 	srcResult := srcCoder.ConvertString(src)
69 | 	tagCoder := mahonia.NewDecoder(tagCode)
70 | 	_, cdata, _ := tagCoder.Translate([]byte(srcResult), true)
71 | 	result := string(cdata)
72 | 	return result
73 | }
74 | 
75 | func GBKToUTF8(src string) string {
76 | 	return DecodeString(EncodeString(src, "ISO-8859-1"), "GBK")
77 | }
78 | 
--------------------------------------------------------------------------------
/common/mlog/filelog.go:
--------------------------------------------------------------------------------
 1 | package mlog
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"os"
 6 | 	"os/exec"
 7 | 	"path/filepath"
 8 | 	"strconv"
 9 | 	"time"
10 | )
11 | 
12 | // Filelog represents an active object that logs on file to record error or other useful info.
13 | // The filelog info is written to a log file.
14 | // The loginst is a pointer to a logger from the standard library.
15 | // The isopen label represents whether the filelog is open or not.
16 | type filelog struct {
17 | 	plog
18 | 
19 | 	loginst *log.Logger
20 | }
21 | 
22 | var flog *filelog
23 | 
24 | // LogInst gets the singleton filelog object.
25 | func LogInst() *filelog {
26 | 	if flog == nil {
27 | 		InitFilelog(false, "")
28 | 	}
29 | 	return flog
30 | }
31 | 
32 | // InitFilelog initializes flog.
33 | func InitFilelog(isopen bool, fp string) {
34 | 	if !isopen {
35 | 		flog = &filelog{}
36 | 		flog.loginst = nil
37 | 		flog.isopen = isopen
38 | 		return
39 | 	}
40 | 	if fp == "" {
41 | 		wd := os.Getenv("GOPATH")
42 | 		if wd == "" {
43 | 			//panic("GOPATH is not set in env.")
44 | 			file, _ := exec.LookPath(os.Args[0])
45 | 			path := filepath.Dir(file)
46 | 			wd = path
47 | 		}
48 | 		if wd == "" {
49 | 			panic("GOPATH is not set in env or can not get exe path.")
50 | 		}
51 | 		fp = wd + "/log/"
52 | 	}
53 | 	flog = newFilelog(isopen, fp)
54 | }
55 | 
56 | // The newFilelog returns an initialized filelog object.
57 | // The default file path is "WORKDIR/log/log.2011-01-01".
58 | func newFilelog(isopen bool, logpath string) *filelog {
59 | 	year, month, day := time.Now().Date()
60 | 	filename := "log." + strconv.Itoa(year) + "-" + strconv.Itoa(int(month)) + "-" + strconv.Itoa(day)
61 | 	err := os.MkdirAll(logpath, 0755)
62 | 	if err != nil {
63 | 		panic("logpath error : " + logpath + "\n")
64 | 	}
65 | 
66 | 	f, err := os.OpenFile(logpath+"/"+filename, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
67 | 	if err != nil {
68 | 		panic("log file open error : " + logpath + "/" + filename + "\n")
69 | 	}
70 | 
71 | 	pfilelog := &filelog{}
72 | 	pfilelog.loginst = log.New(f, "", log.LstdFlags)
73 | 	pfilelog.isopen = isopen
74 | 	return pfilelog
75 | }
76 | 
77 | func (this *filelog) log(label string, str string) {
78 | 	if !this.isopen {
79 | 		return
80 | 	}
81 | 	file, line := this.getCaller()
82 | 	this.loginst.Printf("%s:%d: %s %s\n", file, line, label, str)
83 | }
84 | 
85 | // LogError logs error info.
86 | func (this *filelog) LogError(str string) {
87 | 	this.log("[ERROR]", str)
88 | }
89 | 
90 | // LogInfo logs normal info.
91 | func (this *filelog) LogInfo(str string) {
92 | 	this.log("[INFO]", str)
93 | }
94 | 
--------------------------------------------------------------------------------
/pholcus/keeper/login.go:
--------------------------------------------------------------------------------
 1 | // craw master module
 2 | package keeper
 3 | 
 4 | import (
 5 | 	"errors"
 6 | 	"net/http"
 7 | 	"net/http/cookiejar"
 8 | 	"strings"
 9 | )
10 | 
11 | //具体的获取cookie的方法,return出一个[]*http.Cookie
12 | //函数分为两步获取cookie,当有302跳转执行IsRedirectFunc
13 | //没有302跳转执行NoRedirectFunc,都是返回[]*http.Cookie
14 | func GetCookie(url string, postParam string, IsRedirect bool) []*http.Cookie {
15 | 	if IsRedirect {
16 | 		return IsRedirectFunc(url, postParam)
17 | 	}
18 | 	return NoRedirectFunc(url, postParam)
19 | }
20 | 
21 | //这是一个GetCookie函数的分支,当有302跳转的时候执行此函数
22 | func IsRedirectFunc(url string, postParam string) []*http.Cookie {
23 | 
24 | 	gCookieJar, _ := cookiejar.New(nil)
25 | 	client := &http.Client{
26 | 		CheckRedirect: noCheckRedirect, //调用noCheckRedirect,不跳转,直接返回Location
27 | 		Jar:           gCookieJar,
28 | 	}
29 | 
30 | 	req1, err := http.NewRequest("POST", url, strings.NewReader(postParam))
31 | 	if err != nil {
32 | 		return nil //构造请求失败
33 | 	}
34 | 
35 | 	resp1 := getResponse(client, req1, url)
36 | 	if resp1 == nil {
37 | 		return nil
38 | 	}
39 | 
40 | 	//获取第一次请求的Location,发起第二次请求
41 | 	req2, err := http.NewRequest("GET", resp1.Header.Get("Location"), nil)
42 | 	if err != nil {
43 | 		return nil //构造请求失败
44 | 	}
45 | 
46 | 	resp2 := getResponse(client, req2, url)
47 | 	if resp2 == nil {
48 | 		return nil
49 | 	}
50 | 	return resp2.Cookies()
51 | }
52 | 
53 | //IsRedirectFunc的一个分支函数,阻止302自动跳转,以便读取Location
54 | func noCheckRedirect(req *http.Request, via []*http.Request) error {
55 | 	//始终返回错误,令client不跟随任何跳转
56 | 	return errors.New("redirect blocked: read Location from the 302 response")
57 | }
58 | 
59 | //这是一个GetCookie函数的分支,当没有302跳转的时候执行此函数
60 | func NoRedirectFunc(url string, postParam string) []*http.Cookie {
61 | 	gCookieJar, _ := cookiejar.New(nil)
62 | 
63 | 	client := &http.Client{
64 | 		Jar: gCookieJar,
65 | 	}
66 | 
67 | 	req1, err := http.NewRequest("POST", url, strings.NewReader(postParam))
68 | 	if err != nil {
69 | 		return nil //构造请求失败
70 | 	}
71 | 
72 | 	resp := getResponse(client, req1, url)
73 | 	if resp == nil {
74 | 		return nil
75 | 	}
76 | 	return resp.Cookies()
77 | }
78 | 
79 | // 请求获取响应流
80 | func getResponse(client *http.Client, req *http.Request, url string) *http.Response {
81 | 	req.Header.Set("Proxy-Connection", "keep-alive")
82 | 	req.Header.Set("Cache-Control", "max-age=0")
83 | 	req.Header.Set("Accept", "*/*")
84 | 	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36")
85 | 	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
86 | 	req.Header.Set("Referer", url)
87 | 	req.Header.Set("Accept-Encoding", "gzip, deflate")
88 | 	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.8")
89 | 	resp, err := client.Do(req)
90 | 	//CheckRedirect返回错误时,Do会同时返回最后一次响应和该错误,此处保留响应以读取Location
91 | 	if resp == nil && err != nil {
92 | 		return nil
93 | 	}
94 | 	return resp
95 | }
96 | 
--------------------------------------------------------------------------------
/pholcus/socket/PhoSocketTest.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	// "bufio"
 5 | 	"fmt"
 6 | 	// "io/ioutil"
 7 | 	"log"
 8 | 	"net"
 9 | )
10 | 
11 | const (
12 | 	PhoSocketServer = "127.0.0.1:6010"
13 | )
14 | 
15 | //建立连接
16 | 
17 | //func 接受
18 | 
19 | //func 发送
20 | 
21 | /*
22 |  *@服务端用
23 |  *
24 |  *PhoSoketLisent()为幽灵蛛socket监听函数
25 |  *PhoSocketServer为预定义常量:server:port
26 |  *输出类型为net.Listener,一个监听句柄
27 |  */
28 | func PhoSoketLisent() net.Listener {
29 | 	ln, err := 
net.Listen("tcp", PhoSocketServer) 30 | if err != nil { 31 | panic(err) 32 | } 33 | return ln 34 | } 35 | 36 | /* 37 | *@客户端用 38 | * 39 | *PhoSoketDial()为幽灵蛛socket拨号函数,请求服务端 40 | *PhoSocketServer为预定义常量:server:port 41 | *输出类型为net.Conn,一个握手连接,下一步可以进行接收,发送 42 | */ 43 | func PhoSoketDial() net.Conn { 44 | conn, err := net.Dial("tcp", PhoSocketServer) 45 | if err != nil { 46 | panic(err) 47 | } 48 | return conn 49 | } 50 | 51 | /* 52 | *@服务端用 53 | * 54 | *PhoSocketAccept()为幽灵蛛socket同意连接函数 55 | *ln为一个监听句柄 56 | *输出类型为net.Conn,一个握手连接,下一步可以进行接收,发送 57 | */ 58 | func PhoSocketAccept(ln net.Listener) net.Conn { 59 | for { 60 | conn, err := ln.Accept() 61 | if err != nil { 62 | log.Fatal("get client connection error: ", err) 63 | } 64 | return conn 65 | } 66 | } 67 | 68 | /* 69 | *@服务端用 70 | * 71 | *PhoSocketSendDataClose()为幽灵蛛socket数据发送函数,并且关闭连接 72 | *conn为握手连接,sendData为要发送的数据 73 | *通过conn给client发送Data 74 | */ 75 | func PhoSocketSendDataClose(conn net.Conn, sendData string) { 76 | // fmt.Fprintf(conn, sendData) 77 | conn.Write([]byte(sendData)) 78 | conn.Close() 79 | } 80 | 81 | /* 82 | *@共用 83 | * 84 | *PhoSocketSendData()为幽灵蛛socket数据发送函数,但是不关闭 85 | *conn为握手连接,sendData为要发送的数据 86 | *通过conn给client发送Data 87 | */ 88 | func PhoSocketSendData(conn net.Conn, sendData string) { 89 | // fmt.Fprintf(conn, sendData) 90 | conn.Write([]byte(sendData)) 91 | } 92 | 93 | /* 94 | *@共用 95 | * 96 | *PhoSocketAcceptData()为幽灵蛛socket数据接收函数 97 | *conn为握手连接 98 | *通过conn接收client发送来的Data 99 | */ 100 | func PhoSocketAcceptData(conn net.Conn) { 101 | // data, err := bufio.NewReader(conn).ReadString('\n') 102 | databuf := make([]byte, 4096) 103 | n, err := conn.Read(databuf) 104 | // data, err := ioutil.ReadAll(conn) 105 | if err != nil { 106 | log.Fatal("get client data error: ", err) 107 | } 108 | fmt.Printf("%#v\n", string(databuf[:n])) 109 | } 110 | 111 | //接收并发送,完关闭 112 | func AcceptAndSendClose(conn net.Conn) { 113 | PhoSocketAcceptData(conn) 114 | PhoSocketSendDataClose(conn, "this is server\n") 115 | } 116 | 117 | //接收并发送,不关闭 118 | func AcceptAndSend(conn net.Conn) { 119 | PhoSocketAcceptData(conn) 120 | PhoSocketSendData(conn, "this is server\n") 121 | } 122 | func main() { 123 | conn := PhoSoketDial() 124 | PhoSocketSendData(conn, "hello server\n") 125 | PhoSocketAcceptData(conn) 126 | 127 | } 128 | -------------------------------------------------------------------------------- /pholcus/socket/PhoSocket.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | // "bufio" 5 | "fmt" 6 | // "io/ioutil" 7 | "log" 8 | "net" 9 | ) 10 | 11 | const ( 12 | PhoSocketServer = "127.0.0.1:6010" 13 | ) 14 | 15 | //建立连接 16 | 17 | //func 接受 18 | 19 | //func 发送 20 | 21 | /* 22 | *@服务端用 23 | * 24 | *PhoSoketLisent()为幽灵蛛socket监听函数 25 | *PhoSocketServer为预定义常量:server:port 26 | *输出类型为net.Listener,一个监听句柄 27 | */ 28 | func PhoSoketLisent() net.Listener { 29 | ln, err := net.Listen("tcp", PhoSocketServer) 30 | if err != nil { 31 | panic(err) 32 | } 33 | return ln 34 | } 35 | 36 | /* 37 | *@客户端用 38 | * 39 | *PhoSoketDial()为幽灵蛛socket拨号函数,请求服务端 40 | *PhoSocketServer为预定义常量:server:port 41 | *输出类型为net.Conn,一个握手连接,下一步可以进行接收,发送 42 | */ 43 | func PhoSoketDial() net.Conn { 44 | conn, err := net.Dial("tcp", PhoSocketServer) 45 | if err != nil { 46 | panic(err) 47 | } 48 | return conn 49 | } 50 | 51 | /* 52 | *@服务端用 53 | * 54 | *PhoSocketAccept()为幽灵蛛socket同意连接函数 55 | *ln为一个监听句柄 56 | *输出类型为net.Conn,一个握手连接,下一步可以进行接收,发送 57 | */ 58 | func PhoSocketAccept(ln net.Listener) net.Conn { 59 | for { 
60 | conn, err := ln.Accept() 61 | if err != nil { 62 | log.Fatal("get client connection error: ", err) 63 | } 64 | return conn 65 | } 66 | } 67 | 68 | /* 69 | *@服务端用 70 | * 71 | *PhoSocketSendDataClose()为幽灵蛛socket数据发送函数,并且关闭连接 72 | *conn为握手连接,sendData为要发送的数据 73 | *通过conn给client发送Data 74 | */ 75 | func PhoSocketSendDataClose(conn net.Conn, sendData string) { 76 | // fmt.Fprintf(conn, sendData) 77 | conn.Write([]byte(sendData)) 78 | conn.Close() 79 | } 80 | 81 | /* 82 | *@共用 83 | * 84 | *PhoSocketSendData()为幽灵蛛socket数据发送函数,但是不关闭 85 | *conn为握手连接,sendData为要发送的数据 86 | *通过conn给client发送Data 87 | */ 88 | func PhoSocketSendData(conn net.Conn, sendData string) { 89 | // fmt.Fprintf(conn, sendData) 90 | conn.Write([]byte(sendData)) 91 | } 92 | 93 | /* 94 | *@共用 95 | * 96 | *PhoSocketAcceptData()为幽灵蛛socket数据接收函数 97 | *conn为握手连接 98 | *通过conn接收client发送来的Data 99 | */ 100 | func PhoSocketAcceptData(conn net.Conn) { 101 | // data, err := bufio.NewReader(conn).ReadString('\n') 102 | databuf := make([]byte, 4096) 103 | n, err := conn.Read(databuf) 104 | // data, err := ioutil.ReadAll(conn) 105 | if err != nil { 106 | log.Fatal("get client data error: ", err) 107 | } 108 | fmt.Printf("%#v\n", string(databuf[:n])) 109 | } 110 | 111 | //接收并发送,完关闭 112 | func AcceptAndSendClose(conn net.Conn) { 113 | PhoSocketAcceptData(conn) 114 | PhoSocketSendDataClose(conn, "this is server\n") 115 | } 116 | 117 | //接收并发送,不关闭 118 | func AcceptAndSend(conn net.Conn) { 119 | PhoSocketAcceptData(conn) 120 | PhoSocketSendData(conn, "this is server\n") 121 | } 122 | 123 | func main() { 124 | ln := PhoSoketLisent() 125 | for { 126 | conn := PhoSocketAccept(ln) 127 | go AcceptAndSendClose(conn) 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /pholcus/gui/logview.go: -------------------------------------------------------------------------------- 1 | // Copyright 2012 The Walk Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | package gui 5 | 6 | import ( 7 | "errors" 8 | "github.com/lxn/walk" 9 | "github.com/lxn/win" 10 | "syscall" 11 | "unsafe" 12 | ) 13 | 14 | type LogView struct { 15 | walk.WidgetBase 16 | logChan chan string 17 | } 18 | 19 | const ( 20 | TEM_APPENDTEXT = win.WM_USER + 6 21 | ) 22 | 23 | func NewLogView(parent walk.Container) (*LogView, error) { 24 | lc := make(chan string, 1024) 25 | lv := &LogView{logChan: lc} 26 | 27 | if err := walk.InitWidget( 28 | lv, 29 | parent, 30 | "EDIT", 31 | win.WS_TABSTOP|win.WS_VISIBLE|win.WS_VSCROLL|win.ES_MULTILINE|win.ES_WANTRETURN, 32 | win.WS_EX_CLIENTEDGE); err != nil { 33 | return nil, err 34 | } 35 | lv.setReadOnly(true) 36 | lv.SendMessage(win.EM_SETLIMITTEXT, 4294967295, 0) 37 | return lv, nil 38 | } 39 | 40 | func (*LogView) LayoutFlags() walk.LayoutFlags { 41 | return walk.ShrinkableHorz | walk.ShrinkableVert | walk.GrowableHorz | walk.GrowableVert | walk.GreedyHorz | walk.GreedyVert 42 | } 43 | 44 | func (*LogView) MinSizeHint() walk.Size { 45 | return walk.Size{20, 12} 46 | } 47 | 48 | func (*LogView) SizeHint() walk.Size { 49 | return walk.Size{100, 100} 50 | } 51 | 52 | func (lv *LogView) setTextSelection(start, end int) { 53 | lv.SendMessage(win.EM_SETSEL, uintptr(start), uintptr(end)) 54 | } 55 | 56 | func (lv *LogView) textLength() int { 57 | return int(lv.SendMessage(0x000E, uintptr(0), uintptr(0))) 58 | } 59 | 60 | func (lv *LogView) AppendText(value string) { 61 | textLength := lv.textLength() 62 | lv.setTextSelection(textLength, textLength) 63 | lv.SendMessage(win.EM_REPLACESEL, 0, uintptr(unsafe.Pointer(syscall.StringToUTF16Ptr(value)))) 64 | } 65 | 66 | func (lv *LogView) setReadOnly(readOnly bool) error { 67 | if 0 == lv.SendMessage(win.EM_SETREADONLY, uintptr(win.BoolToBOOL(readOnly)), 0) { 68 | return errors.New("fail to call EM_SETREADONLY") 69 | } 70 | 71 | return nil 72 | } 73 | 74 | func (lv *LogView) PostAppendText(value string) { 75 | lv.logChan <- value 76 | win.PostMessage(lv.Handle(), TEM_APPENDTEXT, 0, 0) 77 | } 78 | 79 | func (lv *LogView) Write(p []byte) (int, error) { 80 | lv.PostAppendText(string(p) + "\r\n") 81 | return len(p), nil 82 | } 83 | 84 | func (lv *LogView) WndProc(hwnd win.HWND, msg uint32, wParam, lParam uintptr) uintptr { 85 | switch msg { 86 | case win.WM_GETDLGCODE: 87 | if wParam == win.VK_RETURN { 88 | return win.DLGC_WANTALLKEYS 89 | } 90 | 91 | return win.DLGC_HASSETSEL | win.DLGC_WANTARROWS | win.DLGC_WANTCHARS 92 | case TEM_APPENDTEXT: 93 | select { 94 | case value := <-lv.logChan: 95 | lv.AppendText(value) 96 | default: 97 | return 0 98 | } 99 | } 100 | 101 | return lv.WidgetBase.WndProc(hwnd, msg, wParam, lParam) 102 | } 103 | -------------------------------------------------------------------------------- /pholcus/gui/menu.go: -------------------------------------------------------------------------------- 1 | package gui 2 | 3 | import ( 4 | . 
"github.com/henrylee2cn/pholcus/spiders" 5 | ) 6 | 7 | // GUI输入 8 | type Inputor struct { 9 | ThreadNum uint 10 | OutType string 11 | BaseSleeptime uint 12 | RandomSleepPeriod uint //随机暂停最大增益时长 13 | MaxPage int 14 | Keywords string //后期split()为slice 15 | Spiders []*GUISpider 16 | DockerCap uint 17 | } 18 | 19 | var Input = &Inputor{ 20 | // 默认值 21 | ThreadNum: 20, 22 | OutType: "excel", 23 | BaseSleeptime: 1000, 24 | RandomSleepPeriod: 3000, 25 | MaxPage: 100, 26 | DockerCap: 10000, 27 | } 28 | 29 | // GUI内容 30 | // 下拉菜单辅助结构体 31 | type KV struct { 32 | Key string 33 | Uint uint 34 | String string 35 | } 36 | 37 | var ( 38 | // 任务选项 39 | SpiderModel = NewGUISpiderModel([]*GUISpiderCore{ 40 | &GUISpiderCore{ 41 | Spider: BaiduSearch, 42 | Description: "百度搜索结果 [www.baidu.com]", 43 | }, 44 | &GUISpiderCore{ 45 | Spider: GoogleSearch, 46 | Description: "谷歌搜索结果 [www.google.com镜像]", 47 | }, 48 | &GUISpiderCore{ 49 | Spider: TaobaoSearch, 50 | Description: "淘宝宝贝搜索结果 [s.taobao.com]", 51 | }, 52 | &GUISpiderCore{ 53 | Spider: JDSearch, 54 | Description: "京东搜索结果 [search.jd.com]", 55 | }, 56 | &GUISpiderCore{ 57 | Spider: AlibabaProduct, 58 | Description: "阿里巴巴产品搜索 [s.1688.com/selloffer/offer_search.htm]", 59 | }, 60 | &GUISpiderCore{ 61 | Spider: Wangyi, 62 | Description: "网易排行榜新闻,含点击/跟帖排名 [Auto Page] [news.163.com/rank]", 63 | }, 64 | &GUISpiderCore{ 65 | Spider: BaiduNews, 66 | Description: "百度RSS新闻,实现轮询更新 [Auto Page] [news.baidu.com]", 67 | }, 68 | &GUISpiderCore{ 69 | Spider: Kaola, 70 | Description: "考拉海淘商品数据 [Auto Page] [www.kaola.com]", 71 | }, 72 | &GUISpiderCore{ 73 | Spider: Shunfenghaitao, 74 | Description: "顺丰海淘商品数据 [Auto Page] [www.sfht.com]", 75 | }, 76 | &GUISpiderCore{ 77 | Spider: Miyabaobei, 78 | Description: "蜜芽宝贝商品数据 [Auto Page] [www.miyabaobei.com]", 79 | }, 80 | &GUISpiderCore{ 81 | Spider: Hollandandbarrett, 82 | Description: "Hollandand&Barrett商品数据 [Auto Page] [www.Hollandandbarrett.com]", 83 | }, 84 | }) 85 | 86 | // 暂停时间选项及输出类型选项 87 | GUIOpt = struct { 88 | OutType []*KV 89 | SleepTime []*KV 90 | }{ 91 | OutType: []*KV{ 92 | {Key: "excel", String: "excel"}, 93 | {Key: "csv", String: "csv"}, 94 | {Key: "mongoDB", String: "mongoDB"}, 95 | }, 96 | SleepTime: []*KV{ 97 | {Key: "无暂停", Uint: 0}, 98 | {Key: "0.1 秒", Uint: 100}, 99 | {Key: "0.3 秒", Uint: 300}, 100 | {Key: "0.5 秒", Uint: 500}, 101 | {Key: "1 秒", Uint: 1000}, 102 | {Key: "3 秒", Uint: 3000}, 103 | {Key: "5 秒", Uint: 5000}, 104 | {Key: "10 秒", Uint: 10000}, 105 | {Key: "15 秒", Uint: 15000}, 106 | {Key: "20 秒", Uint: 20000}, 107 | {Key: "30 秒", Uint: 30000}, 108 | {Key: "60 秒", Uint: 60000}, 109 | }, 110 | } 111 | ) 112 | -------------------------------------------------------------------------------- /spiders/kaola.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // 基础包 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/downloader/context" //必需 7 | // "github.com/henrylee2cn/pholcus/reporter" //信息输出 8 | . "github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | // "regexp" 25 | // "strconv" 26 | // "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | // 考拉海淘,海外直采,7天无理由退货,售后无忧!考拉网放心的海淘网站! 
36 | var Kaola = &Spider{ 37 | Name: "考拉海淘", 38 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 39 | // Optional: &Optional{}, 40 | RuleTree: &RuleTree{ 41 | // Spread: []string{}, 42 | Root: func(self *Spider) { 43 | self.AddQueue(map[string]interface{}{"url": "http://www.kaola.com", "rule": "获取版块URL"}) 44 | }, 45 | 46 | Nodes: map[string]*Rule{ 47 | 48 | "获取版块URL": &Rule{ 49 | ParseFunc: func(self *Spider, resp *context.Response) { 50 | query := resp.GetHtmlParser() 51 | lis := query.Find("#funcTab li a") 52 | lis.Each(func(i int, s *goquery.Selection) { 53 | if i == 0 { 54 | return 55 | } 56 | if url, ok := s.Attr("href"); ok { 57 | self.AddQueue(map[string]interface{}{"url": url, "rule": "商品列表", "temp": map[string]interface{}{"goodsType": s.Text()}}) 58 | } 59 | }) 60 | }, 61 | }, 62 | 63 | "商品列表": &Rule{ 64 | ParseFunc: func(self *Spider, resp *context.Response) { 65 | query := resp.GetHtmlParser() 66 | query.Find(".proinfo").Each(func(i int, s *goquery.Selection) { 67 | if url, ok := s.Find("a").Attr("href"); ok { 68 | self.AddQueue(map[string]interface{}{ 69 | "url": "http://www.kaola.com" + url, 70 | "rule": "商品详情", 71 | "temp": map[string]interface{}{"goodsType": resp.GetTemp("goodsType").(string)}, 72 | }) 73 | } 74 | }) 75 | }, 76 | }, 77 | 78 | "商品详情": &Rule{ 79 | //注意:有无字段语义和是否输出数据必须保持一致 80 | OutFeild: []string{ 81 | "标题", 82 | "价格", 83 | "品牌", 84 | "采购地", 85 | "评论数", 86 | "类别", 87 | }, 88 | ParseFunc: func(self *Spider, resp *context.Response) { 89 | query := resp.GetHtmlParser() 90 | // 获取标题 91 | title := query.Find(".product-title").Text() 92 | 93 | // 获取价格 94 | price := query.Find("#js_currentPrice span").Text() 95 | 96 | // 获取品牌 97 | brand := query.Find(".goods_parameter li").Eq(0).Text() 98 | 99 | // 获取采购地 100 | from := query.Find(".goods_parameter li").Eq(1).Text() 101 | 102 | // 获取评论数 103 | discussNum := query.Find("#commentCounts").Text() 104 | 105 | // 结果存入Response中转 106 | resp.AddItem(map[string]string{ 107 | self.GetOutFeild(resp, 0): title, 108 | self.GetOutFeild(resp, 1): price, 109 | self.GetOutFeild(resp, 2): brand, 110 | self.GetOutFeild(resp, 3): from, 111 | self.GetOutFeild(resp, 4): discussNum, 112 | self.GetOutFeild(resp, 5): resp.GetTemp("goodsType").(string), 113 | }) 114 | }, 115 | }, 116 | }, 117 | }, 118 | } 119 | -------------------------------------------------------------------------------- /spiders/shunfenghaitao.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // 基础包 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/downloader/context" //必需 7 | // "github.com/henrylee2cn/pholcus/reporter" //信息输出 8 | . 
"github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | "regexp" 25 | // "strconv" 26 | // "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | // 进口母婴专区,买进口奶粉、尿裤尿布、辅食、营养、洗护、日用、母婴用品 - 顺丰海淘 36 | var Shunfenghaitao = &Spider{ 37 | Name: "顺丰海淘", 38 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 39 | // Optional: &Optional{}, 40 | RuleTree: &RuleTree{ 41 | // Spread: []string{}, 42 | Root: func(self *Spider) { 43 | self.AddQueue(map[string]interface{}{"url": "http://www.sfht.com", "rule": "获取版块URL"}) 44 | }, 45 | 46 | Nodes: map[string]*Rule{ 47 | 48 | "获取版块URL": &Rule{ 49 | ParseFunc: func(self *Spider, resp *context.Response) { 50 | query := resp.GetHtmlParser() 51 | 52 | lis := query.Find(".nav-c1").First().Find("li a") 53 | 54 | lis.Each(func(i int, s *goquery.Selection) { 55 | if i == 0 { 56 | return 57 | } 58 | if url, ok := s.Attr("href"); ok { 59 | self.AddQueue(map[string]interface{}{"url": url, "rule": "商品列表", "temp": map[string]interface{}{"goodsType": s.Text()}}) 60 | } 61 | }) 62 | }, 63 | }, 64 | 65 | "商品列表": &Rule{ 66 | ParseFunc: func(self *Spider, resp *context.Response) { 67 | query := resp.GetHtmlParser() 68 | 69 | query.Find(".cms-src-item").Each(func(i int, s *goquery.Selection) { 70 | if url, ok := s.Find("a").Attr("href"); ok { 71 | self.AddQueue(map[string]interface{}{ 72 | "url": url, 73 | "rule": "商品详情", 74 | "temp": map[string]interface{}{"goodsType": resp.GetTemp("goodsType").(string)}, 75 | }) 76 | } 77 | }) 78 | }, 79 | }, 80 | 81 | "商品详情": &Rule{ 82 | //注意:有无字段语义和是否输出数据必须保持一致 83 | OutFeild: []string{ 84 | "标题", 85 | "品牌", 86 | "原产地", 87 | "货源地", 88 | "类别", 89 | }, 90 | ParseFunc: func(self *Spider, resp *context.Response) { 91 | query := resp.GetHtmlParser() 92 | 93 | // 获取标题 94 | title := query.Find("#titleInfo h1").Text() 95 | 96 | // 获取品牌 97 | brand := query.Find(".goods-c2 ul").Eq(0).Find("li").Eq(2).Text() 98 | re, _ := regexp.Compile(`品 牌`) 99 | brand = re.ReplaceAllString(brand, "") 100 | 101 | // 获取原产地 102 | from1 := query.Find("#detailattributes li").Eq(0).Text() 103 | 104 | // 获取货源地 105 | from2 := query.Find("#detailattributes li").Eq(1).Text() 106 | 107 | // 结果存入Response中转 108 | resp.AddItem(map[string]string{ 109 | self.GetOutFeild(resp, 0): title, 110 | self.GetOutFeild(resp, 1): brand, 111 | self.GetOutFeild(resp, 2): from1, 112 | self.GetOutFeild(resp, 3): from2, 113 | self.GetOutFeild(resp, 4): resp.GetTemp("goodsType").(string), 114 | }) 115 | }, 116 | }, 117 | }, 118 | }, 119 | } 120 | -------------------------------------------------------------------------------- /spiders/baidusearch.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // 基础包 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/downloader/context" //必需 7 | "github.com/henrylee2cn/pholcus/reporter" //信息输出 8 | . 
"github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | "regexp" 25 | "strconv" 26 | "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | "math" 33 | ) 34 | 35 | var BaiduSearch = &Spider{ 36 | Name: "百度搜索", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AidRule("生成请求", []interface{}{[2]int{0, 1}, "生成请求"}) 43 | }, 44 | 45 | Nodes: map[string]*Rule{ 46 | 47 | "生成请求": &Rule{ 48 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 49 | self.LoopAddQueue( 50 | aid[0].([2]int), 51 | func(i int) []string { 52 | return []string{"http://www.baidu.com/s?ie=utf-8&wd=" + self.GetKeyword() + "&rn=50&pn=" + strconv.Itoa(50*i)} 53 | }, 54 | map[string]interface{}{ 55 | "rule": aid[1].(string), 56 | }, 57 | ) 58 | return nil 59 | }, 60 | ParseFunc: func(self *Spider, resp *context.Response) { 61 | query := resp.GetHtmlParser() 62 | total1 := query.Find(".nums").Text() 63 | re, _ := regexp.Compile(`[\D]*`) 64 | total1 = re.ReplaceAllString(total1, "") 65 | total2, _ := strconv.Atoi(total1) 66 | total := int(math.Ceil(float64(total2) / 50)) 67 | if total > self.MaxPage { 68 | total = self.MaxPage 69 | } else if total == 0 { 70 | reporter.Log.Printf("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", self.GetName(), self.GetKeyword(), resp.GetRuleName()) 71 | return 72 | } 73 | // 调用指定规则下辅助函数 74 | self.AidRule("生成请求", []interface{}{[2]int{1, total}, "搜索结果"}) 75 | // 用指定规则解析响应流 76 | self.CallRule("搜索结果", resp) 77 | }, 78 | }, 79 | 80 | "搜索结果": &Rule{ 81 | //注意:有无字段语义和是否输出数据必须保持一致 82 | OutFeild: []string{ 83 | "标题", 84 | "内容", 85 | "不完整URL", 86 | "百度跳转", 87 | }, 88 | ParseFunc: func(self *Spider, resp *context.Response) { 89 | query := resp.GetHtmlParser() 90 | query.Find("#content_left .c-container").Each(func(i int, s *goquery.Selection) { 91 | 92 | title := s.Find(".t").Text() 93 | content := s.Find(".c-abstract").Text() 94 | href, _ := s.Find(".t >a").Attr("href") 95 | tar := s.Find(".g").Text() 96 | 97 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 98 | // title = re.ReplaceAllStringFunc(title, strings.ToLower) 99 | // content = re.ReplaceAllStringFunc(content, strings.ToLower) 100 | 101 | title = re.ReplaceAllString(title, "") 102 | content = re.ReplaceAllString(content, "") 103 | 104 | // 结果存入Response中转 105 | resp.AddItem(map[string]string{ 106 | self.GetOutFeild(resp, 0): strings.Trim(title, " \t\n"), 107 | self.GetOutFeild(resp, 1): strings.Trim(content, " \t\n"), 108 | self.GetOutFeild(resp, 2): tar, 109 | self.GetOutFeild(resp, 3): href, 110 | }) 111 | }) 112 | }, 113 | }, 114 | }, 115 | }, 116 | } 117 | -------------------------------------------------------------------------------- /pipeline/collector/collector.go: -------------------------------------------------------------------------------- 1 | // 数据收集 2 | package collector 3 | 4 | import ( 5 | "fmt" 6 | "github.com/henrylee2cn/pholcus/config" 7 | // "github.com/henrylee2cn/pholcus/reporter" 8 | "github.com/henrylee2cn/pholcus/spiders/spider" 9 | "strconv" 10 | "time" 11 | ) 12 | 13 | // 每个爬取任务的数据容器 14 | type Collector struct { 15 | *spider.Spider 16 | *DockerQueue 17 | DataChan chan DataCell 18 | ctrl chan bool //长度为零时退出并输出 19 | sum [2]int //收集的数据总数[过去,现在],非并发安全 20 | outType string 21 | 
	outCount [2]int
22 | }
23 | 
24 | func NewCollector() *Collector {
25 | 	self := &Collector{
26 | 		DataChan:    make(chan DataCell, config.DATA_CAP),
27 | 		DockerQueue: NewDockerQueue(),
28 | 		ctrl:        make(chan bool, 1),
29 | 	}
30 | 	return self
31 | }
32 | 
33 | func (self *Collector) Init(sp *spider.Spider) {
34 | 	self.Spider = sp
35 | 	self.outType = config.OutType
36 | 	self.DataChan = make(chan DataCell, config.DATA_CAP)
37 | 	self.DockerQueue = NewDockerQueue()
38 | 	self.ctrl = make(chan bool, 1)
39 | 	self.sum = [2]int{}
40 | 	self.outCount = [2]int{}
41 | }
42 | 
43 | func (self *Collector) Collect(dataCell DataCell) {
44 | 	// reporter.Log.Println("**************断点 6 ***********")
45 | 	self.DataChan <- dataCell
46 | 	// reporter.Log.Println("**************断点 7 ***********")
47 | }
48 | 
49 | func (self *Collector) CtrlS() {
50 | 	self.ctrl <- true
51 | 	// reporter.Log.Println("**************断点 10 ***********")
52 | }
53 | 
54 | func (self *Collector) CtrlR() {
55 | 	<-self.ctrl
56 | 	// reporter.Log.Println("**************断点 9 ***********")
57 | }
58 | 
59 | func (self *Collector) CtrlLen() int {
60 | 	return len(self.ctrl)
61 | }
62 | 
63 | // 数据转储输出
64 | func (self *Collector) Manage() {
65 | 	// reporter.Log.Println("**************开启输出管道************")
66 | 
67 | 	// 令self.ctrl长度不为零
68 | 	self.CtrlS()
69 | 	// 只有当收到退出通知并且通道内无数据时,才退出循环
70 | 	for !(self.CtrlLen() == 0 && len(self.DataChan) == 0) {
71 | 		// reporter.Log.Println("**************断点 8 ***********")
72 | 		select {
73 | 		case data := <-self.DataChan:
74 | 
75 | 			self.dockerOne(data)
76 | 		default:
77 | 			time.Sleep(1e7) // 0.01秒
78 | 		}
79 | 	}
80 | 
81 | 	// 将剩余收集到但未输出的数据输出
82 | 	self.goOutput(self.Curr)
83 | 
84 | 	// 等待所有输出完成
85 | 	for self.outCount[0] > self.outCount[1] {
86 | 		time.Sleep(5e8) // 0.5秒
87 | 	}
88 | 
89 | 	// 返回报告
90 | 	self.Report()
91 | }
92 | 
93 | func (self *Collector) dockerOne(data DataCell) {
94 | 
95 | 	self.Dockers[self.Curr] = append(self.Dockers[self.Curr], data)
96 | 
97 | 	if uint(len(self.Dockers[self.Curr])) >= config.DOCKER_CAP {
98 | 		// curDocker存满后输出
99 | 		self.goOutput(self.Curr)
100 | 		// 更换一个空Docker用于curDocker
101 | 		self.Change()
102 | 	}
103 | }
104 | 
105 | func (self *Collector) goOutput(dataIndex int) {
106 | 	self.outCount[0]++
107 | 	go func() {
108 | 		self.Output(dataIndex)
109 | 		self.outCount[1]++
110 | 	}()
111 | }
112 | 
113 | // 统计数据总量
114 | func (self *Collector) Sum() int {
115 | 	return self.sum[1]
116 | }
117 | 
118 | // 累加数据总量
119 | func (self *Collector) setSum(add int) {
120 | 	self.sum[0], self.sum[1] = self.sum[1], self.sum[1]+add
121 | }
122 | 
123 | // 返回报告
124 | func (self *Collector) Report() {
125 | 	// reporter.Log.Println("**************", self.Sum(), " ***********")
126 | 	config.ReportChan <- &config.Report{
127 | 		SpiderName: self.Spider.GetName(),
128 | 		Keyword:    self.GetKeyword(),
129 | 		Num:        strconv.Itoa(self.Sum()),
130 | 		Time:       fmt.Sprintf("%.5f", time.Since(config.StartTime).Minutes()),
131 | 	}
132 | }
133 | 
--------------------------------------------------------------------------------
/common/util/util.go:
--------------------------------------------------------------------------------
 1 | // Package util contains some common functions of the pholcus project.
2 | package util 3 | 4 | import ( 5 | "crypto/md5" 6 | "encoding/hex" 7 | "encoding/json" 8 | "encoding/xml" 9 | "fmt" 10 | "golang.org/x/net/html/charset" 11 | "hash/crc32" 12 | "hash/fnv" 13 | "io" 14 | "os" 15 | "regexp" 16 | "strconv" 17 | "strings" 18 | ) 19 | 20 | // JsonpToJson converts a jsonp string to a json string. 21 | // Example: foobar({a:"1",b:2}) to {"a":"1","b":2} 22 | func JsonpToJson(json string) string { 23 | start := strings.Index(json, "{") 24 | end := strings.LastIndex(json, "}") 25 | start1 := strings.Index(json, "[") 26 | if start1 != -1 && (start == -1 || start > start1) { 27 | start = start1 28 | end = strings.LastIndex(json, "]") 29 | } 30 | if end > start && end != -1 && start != -1 { 31 | json = json[start : end+1] 32 | } 33 | json = strings.Replace(json, "\\'", "", -1) 34 | regDetail, _ := regexp.Compile("([^\\s\\:\\{\\,\\d\"]+|[a-z][a-z\\d]*)\\s*\\:") 35 | return regDetail.ReplaceAllString(json, "\"$1\":") 36 | } 37 | 38 | // GetWDPath gets the work directory path from GOPATH. 39 | func GetWDPath() string { 40 | wd := os.Getenv("GOPATH") 41 | if wd == "" { 42 | panic("GOPATH is not set in env.") 43 | } 44 | return wd 45 | } 46 | 47 | // IsDirExists reports whether path exists and is a directory. 48 | func IsDirExists(path string) bool { 49 | fi, err := os.Stat(path) 50 | 51 | if err != nil { 52 | return os.IsExist(err) 53 | } 54 | return fi.IsDir() 55 | } 56 | 57 | 58 | 59 | 60 | // IsFileExists reports whether path exists and is a regular file. 61 | func IsFileExists(path string) bool { 62 | fi, err := os.Stat(path) 63 | 64 | if err != nil { 65 | return os.IsExist(err) 66 | } 67 | return !fi.IsDir() 68 | } 69 | 70 | 71 | 72 | 73 | // IsNum reports whether the string consists solely of digits. 74 | func IsNum(a string) bool { 75 | reg, _ := regexp.Compile("^\\d+$") 76 | return reg.MatchString(a) 77 | } 78 | 79 | // XML2mapstr converts simple XML to a map of strings, with charset (incl. utf8) support. 80 | func XML2mapstr(xmldoc string) map[string]string { 81 | var t xml.Token 82 | var err error 83 | inputReader := strings.NewReader(xmldoc) 84 | decoder := xml.NewDecoder(inputReader) 85 | decoder.CharsetReader = func(s string, r io.Reader) (io.Reader, error) { 86 | return charset.NewReader(r, s) 87 | } 88 | m := make(map[string]string, 32) 89 | key := "" 90 | for t, err = decoder.Token(); err == nil; t, err = decoder.Token() { 91 | switch token := t.(type) { 92 | case xml.StartElement: 93 | key = token.Name.Local 94 | case xml.CharData: 95 | content := string([]byte(token)) 96 | m[key] = content 97 | default: 98 | // ...
99 | } 100 | } 101 | 102 | return m 103 | } 104 | 105 | // MakeHash hashes a string with CRC32 (IEEE). 106 | func MakeHash(s string) string { 107 | const IEEE = 0xedb88320 108 | var IEEETable = crc32.MakeTable(IEEE) 109 | hash := fmt.Sprintf("%x", crc32.Checksum([]byte(s), IEEETable)) 110 | return hash 111 | } 112 | 113 | func HashString(encode string) uint64 { 114 | hash := fnv.New64() 115 | hash.Write([]byte(encode)) 116 | return hash.Sum64() 117 | } 118 | 119 | // make a fingerprint, method 1: FNV-64 over the JSON encoding 120 | func MakeUnique(obj interface{}) string { 121 | baseString, _ := json.Marshal(obj) 122 | return strconv.FormatUint(HashString(string(baseString)), 10) 123 | } 124 | 125 | // make a fingerprint, method 2: truncated MD5 over the JSON encoding 126 | func MakeMd5(obj interface{}, length int) string { 127 | if length > 32 { 128 | length = 32 129 | } 130 | h := md5.New() 131 | baseString, _ := json.Marshal(obj) 132 | h.Write([]byte(baseString)) 133 | s := hex.EncodeToString(h.Sum(nil)) 134 | return s[:length] 135 | } 136 | -------------------------------------------------------------------------------- /pholcus/gui/guispider.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Walk Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package gui 6 | 7 | import ( 8 | "github.com/henrylee2cn/pholcus/spiders/spider" 9 | "github.com/lxn/walk" 10 | "sort" 11 | // . "github.com/lxn/walk/declarative" 12 | ) 13 | 14 | type GUISpiderCore struct { 15 | Spider *spider.Spider 16 | Description string 17 | } 18 | 19 | type GUISpider struct { 20 | *GUISpiderCore 21 | Index int 22 | Title string 23 | checked bool 24 | } 25 | 26 | type GUISpiderModel struct { 27 | walk.TableModelBase 28 | walk.SorterBase 29 | sortColumn int 30 | sortOrder walk.SortOrder 31 | // evenBitmap *walk.Bitmap 32 | // oddIcon *walk.Icon 33 | items []*GUISpider 34 | } 35 | 36 | func NewGUISpiderModel(list []*GUISpiderCore) *GUISpiderModel { 37 | m := new(GUISpiderModel) 38 | // m.evenBitmap, _ = walk.NewBitmapFromFile("") 39 | // m.oddIcon, _ = walk.NewIconFromFile("img/x.ico") 40 | for i, t := range list { 41 | m.items = append(m.items, &GUISpider{ 42 | Index: i + 1, 43 | Title: t.Spider.GetName(), 44 | GUISpiderCore: &GUISpiderCore{ 45 | Description: t.Description, 46 | Spider: t.Spider, 47 | }, 48 | }) 49 | } 50 | 51 | return m 52 | } 53 | 54 | // Called by the TableView from SetModel and every time the model publishes a 55 | // RowsReset event. 56 | func (m *GUISpiderModel) RowCount() int { 57 | return len(m.items) 58 | } 59 | 60 | // Called by the TableView when it needs the text to display for a given cell. 61 | func (m *GUISpiderModel) Value(row, col int) interface{} { 62 | item := m.items[row] 63 | 64 | switch col { 65 | case 0: 66 | return item.Index 67 | 68 | case 1: 69 | return item.Title 70 | 71 | case 2: 72 | return item.Description 73 | 74 | case 3: 75 | return item.Spider 76 | } 77 | panic("unexpected col") 78 | } 79 | 80 | // Called by the TableView to retrieve if a given row is checked. 81 | func (m *GUISpiderModel) Checked(row int) bool { 82 | return m.items[row].checked 83 | } 84 | 85 | // Called by the TableView when the user toggled the check box of a given row.
86 | func (m *GUISpiderModel) SetChecked(row int, checked bool) error { 87 | m.items[row].checked = checked 88 | 89 | return nil 90 | } 91 | 92 | // GetChecked returns the rows whose check box is ticked. 93 | func (m *GUISpiderModel) GetChecked() []*GUISpider { 94 | rc := []*GUISpider{} 95 | for idx, item := range m.items { 96 | if m.Checked(idx) { 97 | rc = append(rc, item) 98 | } 99 | } 100 | return rc 101 | } 102 | 103 | // Called by the TableView to sort the model. 104 | func (m *GUISpiderModel) Sort(col int, order walk.SortOrder) error { 105 | m.sortColumn, m.sortOrder = col, order 106 | 107 | sort.Sort(m) 108 | 109 | return m.SorterBase.Sort(col, order) 110 | } 111 | 112 | func (m *GUISpiderModel) Len() int { 113 | return len(m.items) 114 | } 115 | 116 | func (m *GUISpiderModel) Less(i, j int) bool { 117 | a, b := m.items[i], m.items[j] 118 | 119 | c := func(ls bool) bool { 120 | if m.sortOrder == walk.SortAscending { 121 | return ls 122 | } 123 | 124 | return !ls 125 | } 126 | 127 | switch m.sortColumn { 128 | case 0: 129 | return c(a.Index < b.Index) 130 | 131 | case 1: 132 | return c(a.Title < b.Title) 133 | 134 | case 2: 135 | return c(a.Description < b.Description) 136 | 137 | // case 3: 138 | // return c(a.Spider < b.Spider) 139 | } 140 | 141 | panic("unreachable") 142 | } 143 | 144 | func (m *GUISpiderModel) Swap(i, j int) { 145 | m.items[i], m.items[j] = m.items[j], m.items[i] 146 | } 147 | 148 | // Called by the TableView to retrieve an item image. 149 | // func (m *GUISpiderModel) Image(row int) interface{} { 150 | // // if m.items[row].Index%2 == 0 { 151 | // // return m.oddIcon 152 | // // } 153 | // return m.evenBitmap 154 | // } 155 | -------------------------------------------------------------------------------- /spiders/readme.md: -------------------------------------------------------------------------------- 1 | // Spider: crawling rules. 2 | package spider 3 | 4 | import ( 5 | "pholcus/downloader/context" 6 | "pholcus/pholcus" 7 | // "pholcus/pholcus/status" 8 | ) 9 | 10 | type Spider struct { 11 | Name string 12 | Pausetime [2]uint // pause interval: Pausetime[0] ~ Pausetime[0]+Pausetime[1] 13 | *RuleTree 14 | // *SpiderStatus 15 | // optional members below 16 | MaxPage int 17 | Keyword string 18 | Depth int 19 | Id int // index within SpiderList 20 | } 21 | 22 | // func NewSpider() *Spider { 23 | // sp := new(Spider) 24 | // sp.RuleTree = &RuleTree{ 25 | // Nodes: make(map[string]*Rule), 26 | // } 27 | // return sp 28 | // } 29 | 30 | func (self *Spider) Start(sp *Spider) { 31 | sp.RuleTree.Root(sp) 32 | } 33 | 34 | func (self *Spider) GetName() string { 35 | return self.Name 36 | } 37 | 38 | func (self *Spider) GetId() int { 39 | return self.Id 40 | } 41 | 42 | func (self *Spider) GetKeyword() string { 43 | return self.Keyword 44 | } 45 | 46 | func (self *Spider) GetRules() map[string]*Rule { 47 | return self.RuleTree.Nodes 48 | } 49 | 50 | // run the parse rule named by the response 51 | func (self *Spider) GoRule(resp *context.Response) { 52 | self.RuleTree.Nodes[resp.GetRuleName()].ParseFunc(self, resp) 53 | } 54 | 55 | // parse the response with the named rule 56 | func (self *Spider) CallRule(ruleName string, resp *context.Response) { 57 | resp.SetRuleName(ruleName) 58 | self.GoRule(resp) 59 | } 60 | 61 | // invoke the aid function under the named rule 62 | func (self *Spider) AidRule(ruleName string, aid []interface{}) interface{} { 63 | rule := self.RuleTree.Nodes[ruleName] 64 | return rule.AidFunc(self, aid) 65 | } 66 | 67 | // get the output field name of the current rule by index 68 | func (self *Spider) GetOutFeild(resp *context.Response, index int) string { 69 | return self.RuleTree.Nodes[resp.GetRuleName()].OutFeild[index] 70 | } 71 | 72 | func (self *Spider)
LoopAddQueue(loop [2]int, urlFn func(int) []string, param map[string]interface{}) { 73 | for ; loop[0] < loop[1]; loop[0]++ { 74 | urls := urlFn(loop[0]) 75 | self.BulkAddQueue(urls, param) 76 | } 77 | } 78 | 79 | func (self *Spider) BulkAddQueue(urls []string, param map[string]interface{}) { 80 | for _, url := range urls { 81 | param["url"] = url 82 | self.AddQueue(param) 83 | } 84 | } 85 | 86 | func (self *Spider) AddQueue(param map[string]interface{}) { 87 | req := self.NewRequest(param) 88 | pholcus.Self.Push(req) 89 | } 90 | 91 | // build a request 92 | // param: the full parameter list 93 | // req := &Request{ 94 | // url: param["url"].(string), //required 95 | // parent: "", //required if present 96 | // rule: param["rule"].(string), //required 97 | // spider: param["spider"].(string), //auto-filled 98 | // respType: param["respType"].(string),//defaultable 99 | // method: param["method"].(string), //defaultable 100 | // header: param["header"],//defaultable 101 | // cookies: param["cookies"].([]*http.Cookie),//defaultable 102 | // postdata: param["postdata"].(string),//defaultable 103 | // canOutsource: param["canOutsource"].(bool),//defaultable 104 | // checkRedirect: param["checkRedirect"].(func(req *http.Request, via []*http.Request) error),//defaultable 105 | // proxyHost: param["proxyHost"].(string),//defaultable 106 | // temp: param["temp"].(map[string]interface{}),//defaultable 107 | // } 108 | 109 | func (self *Spider) NewRequest(param map[string]interface{}) *context.Request { 110 | param["spider"] = self.GetName() 111 | req := context.NewRequest(param) 112 | req.SetSpiderId(self.GetId()) 113 | return req 114 | } 115 | 116 | // the crawling rule tree 117 | type RuleTree struct { 118 | Spread []string // request dispatch points when acting as a server 119 | Root func(*Spider) 120 | Nodes map[string]*Rule 121 | } 122 | 123 | // a single crawling rule 124 | type Rule struct { 125 | OutFeild []string // Note: the field list must stay consistent with the data actually output 126 | // content parse function 127 | ParseFunc func(*Spider, *context.Response) 128 | // general-purpose aid function 129 | AidFunc func(*Spider, []interface{}) interface{} 130 | } 131 | 132 | func (self *Rule) GetOutFeild() []string { 133 | return self.OutFeild 134 | } 135 | -------------------------------------------------------------------------------- /spiders/jdsearch.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // base packages 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM parsing 6 | "github.com/henrylee2cn/pholcus/downloader/context" //required 7 | "github.com/henrylee2cn/pholcus/reporter" //log output 8 | .
"github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | "regexp" 25 | "strconv" 26 | "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | var JDSearch = &Spider{ 36 | Name: "京东搜索", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AidRule("生成请求", []interface{}{[2]int{0, 1}, "生成请求"}) 43 | }, 44 | 45 | Nodes: map[string]*Rule{ 46 | 47 | "生成请求": &Rule{ 48 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 49 | self.LoopAddQueue( 50 | aid[0].([2]int), 51 | func(i int) []string { 52 | return []string{ 53 | "http://search.jd.com/Search?keyword=" + self.GetKeyword() + "&enc=utf-8&qrst=1&rt=1&stop=1&click=&psort=&page=" + strconv.Itoa(2*i+2), 54 | "http://search.jd.com/Search?keyword=" + self.GetKeyword() + "&enc=utf-8&qrst=1&rt=1&stop=1&click=&psort=&page=" + strconv.Itoa(2*i+1), 55 | } 56 | }, 57 | map[string]interface{}{ 58 | "rule": aid[1].(string), 59 | }, 60 | ) 61 | return nil 62 | }, 63 | ParseFunc: func(self *Spider, resp *context.Response) { 64 | query := resp.GetHtmlParser() 65 | 66 | total1 := query.Find("#top_pagi span.text").Text() 67 | 68 | re, _ := regexp.Compile(`[\d]+$`) 69 | total1 = re.FindString(total1) 70 | total, _ := strconv.Atoi(total1) 71 | 72 | if total > self.MaxPage { 73 | total = self.MaxPage 74 | } else if total == 0 { 75 | reporter.Log.Printf("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", self.GetName(), self.GetKeyword(), resp.GetRuleName()) 76 | return 77 | } 78 | // 调用指定规则下辅助函数 79 | self.AidRule("生成请求", []interface{}{[2]int{1, total}, "搜索结果"}) 80 | // 用指定规则解析响应流 81 | self.CallRule("搜索结果", resp) 82 | }, 83 | }, 84 | 85 | "搜索结果": &Rule{ 86 | //注意:有无字段语义和是否输出数据必须保持一致 87 | OutFeild: []string{ 88 | "标题", 89 | "价格", 90 | "评论数", 91 | "星级", 92 | "链接", 93 | }, 94 | ParseFunc: func(self *Spider, resp *context.Response) { 95 | query := resp.GetHtmlParser() 96 | 97 | query.Find("#plist .list-h:nth-child(1) > li").Each(func(i int, s *goquery.Selection) { 98 | // 获取标题 99 | a := s.Find(".p-name a") 100 | title := a.Text() 101 | 102 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 103 | // title = re.ReplaceAllStringFunc(title, strings.ToLower) 104 | title = re.ReplaceAllString(title, " ") 105 | title = strings.Trim(title, " \t\n") 106 | 107 | // 获取价格 108 | price, _ := s.Find("strong[data-price]").First().Attr("data-price") 109 | 110 | // 获取评论数 111 | e := s.Find(".extra").First() 112 | discuss := e.Find("a").First().Text() 113 | re, _ = regexp.Compile(`[\d]+`) 114 | discuss = re.FindString(discuss) 115 | 116 | // 获取星级 117 | level, _ := e.Find(".star span[id]").First().Attr("class") 118 | level = re.FindString(level) 119 | 120 | // 获取URL 121 | url, _ := a.Attr("href") 122 | 123 | // 结果存入Response中转 124 | resp.AddItem(map[string]string{ 125 | self.GetOutFeild(resp, 0): title, 126 | self.GetOutFeild(resp, 1): price, 127 | self.GetOutFeild(resp, 2): discuss, 128 | self.GetOutFeild(resp, 3): level, 129 | self.GetOutFeild(resp, 4): url, 130 | }) 131 | }) 132 | }, 133 | }, 134 | }, 135 | }, 136 | } 137 | -------------------------------------------------------------------------------- /spiders/miyabaobei.go: -------------------------------------------------------------------------------- 1 | package spiders 2 
| 3 | // base packages 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM parsing 6 | "github.com/henrylee2cn/pholcus/downloader/context" //required 7 | // "github.com/henrylee2cn/pholcus/reporter" //log output 8 | . "github.com/henrylee2cn/pholcus/spiders/spider" //required 9 | ) 10 | 11 | // header-setting packages 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // encoding packages 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // string-processing packages 23 | import ( 24 | "regexp" 25 | "strconv" 26 | "strings" 27 | ) 28 | 29 | // other packages 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | var Miyabaobei = &Spider{ 36 | Name: "蜜芽宝贝", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AddQueue(map[string]interface{}{"url": "http://www.miyabaobei.com/", "rule": "获取版块URL"}) 43 | }, 44 | 45 | Nodes: map[string]*Rule{ 46 | 47 | "获取版块URL": &Rule{ 48 | ParseFunc: func(self *Spider, resp *context.Response) { 49 | query := resp.GetHtmlParser() 50 | lis := query.Find(".ccon") 51 | lis.Each(func(i int, s *goquery.Selection) { 52 | s.Find("a").Each(func(n int, ss *goquery.Selection) { 53 | if url, ok := ss.Attr("href"); ok { 54 | if !strings.Contains(url, "http://www.miyabaobei.com") { 55 | url = "http://www.miyabaobei.com" + url 56 | } 57 | self.AidRule("生成请求", []interface{}{ 58 | [2]int{0, 1}, 59 | url, 60 | map[string]interface{}{ 61 | "rule": "生成请求", 62 | "temp": map[string]interface{}{"baseUrl": url}, 63 | }, 64 | }) 65 | } 66 | }) 67 | }) 68 | }, 69 | }, 70 | 71 | "生成请求": &Rule{ 72 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 73 | self.LoopAddQueue( 74 | aid[0].([2]int), 75 | func(i int) []string { 76 | return []string{aid[1].(string) + "&per_page=" + strconv.Itoa(i*40)} 77 | }, 78 | aid[2].(map[string]interface{}), 79 | ) 80 | return nil 81 | }, 82 | ParseFunc: func(self *Spider, resp *context.Response) { 83 | query := resp.GetHtmlParser() 84 | totalPage := "1" 85 | 86 | urls := query.Find(".Lpage.page p a") 87 | 88 | if urls.Length() != 0 { 89 | if urls.Last().Text() == ">" { 90 | totalPage = urls.Eq(urls.Length() - 2).Text() 91 | } else { 92 | totalPage = urls.Last().Text() 93 | } 94 | } 95 | total, _ := strconv.Atoi(totalPage) 96 | 97 | // invoke the aid function under the named rule 98 | self.AidRule("生成请求", []interface{}{ 99 | [2]int{1, total}, 100 | resp.GetTemp("baseUrl").(string), 101 | map[string]interface{}{ 102 | "rule": "商品列表", 103 | }, 104 | }) 105 | // parse the response with the named rule 106 | self.CallRule("商品列表", resp) 107 | }, 108 | }, 109 | 110 | "商品列表": &Rule{ 111 | // Note: the field list must stay consistent with the data actually output 112 | OutFeild: []string{ 113 | "标题", 114 | "价格", 115 | "类别", 116 | }, 117 | ParseFunc: func(self *Spider, resp *context.Response) { 118 | query := resp.GetHtmlParser() 119 | // extract the category 120 | goodsType := query.Find(".crumbs").Text() 121 | re, _ := regexp.Compile("\\s") 122 | goodsType = re.ReplaceAllString(goodsType, "") 123 | re, _ = regexp.Compile("蜜芽宝贝>") 124 | goodsType = re.ReplaceAllString(goodsType, "") 125 | query.Find(".bmfo").Each(func(i int, s *goquery.Selection) { 126 | // extract the title 127 | title, _ := s.Find("p a").First().Attr("title") 128 | 129 | // extract the price 130 | price := s.Find(".f20").Text() 131 | 132 | // stash the result in the Response for the pipeline 133 | resp.AddItem(map[string]string{ 134 | self.GetOutFeild(resp, 0): title, 135 | self.GetOutFeild(resp, 1): price, 136 | self.GetOutFeild(resp, 2): goodsType, 137 | }) 138 | }) 139 | }, 140 | }, 141 | }, 142 | }, 143 | } 144 |
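// Usage sketch (illustrative assumption — the registration step is not shown in this file):
// spiders in this package are plain *Spider values, so they can be added to the global
// list from spiderlist.go, e.g. via the dot-imported helper:
//
//	func init() {
//		SpiderList.Add(Miyabaobei) // hypothetical hook; Add appends to spider.SpiderList
//	}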
-------------------------------------------------------------------------------- /spiders/alibaba.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // base packages 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM parsing 6 | "github.com/henrylee2cn/pholcus/downloader/context" //required 7 | "github.com/henrylee2cn/pholcus/reporter" //log output 8 | . "github.com/henrylee2cn/pholcus/spiders/spider" //required 9 | ) 10 | 11 | // header-setting packages 12 | import ( 13 | "net/http" //http.Header 14 | ) 15 | 16 | // encoding packages 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // string-processing packages 23 | import ( 24 | // "regexp" 25 | "strconv" 26 | "strings" 27 | ) 28 | 29 | // other packages 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | var AlibabaProduct = &Spider{ 36 | Name: "阿里巴巴产品搜索", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AidRule("生成请求", []interface{}{[2]int{0, 1}, "生成请求"}) 43 | }, 44 | 45 | Nodes: map[string]*Rule{ 46 | 47 | "生成请求": &Rule{ 48 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 49 | keyword := EncodeString(self.GetKeyword(), "GBK") 50 | self.LoopAddQueue( 51 | aid[0].([2]int), 52 | func(i int) []string { 53 | return []string{"http://s.1688.com/selloffer/offer_search.htm?enableAsync=false&earseDirect=false&button_click=top&pageSize=60&n=y&offset=3&fromSycm=y&uniqfield=pic_tag_id&keywords=" + keyword + "&beginPage=" + strconv.Itoa(i+1)} 54 | }, 55 | map[string]interface{}{ 56 | "rule": aid[1].(string), 57 | "header": http.Header{"Content-Type": []string{"text/html", "charset=GBK"}}, 58 | }, 59 | ) 60 | return nil 61 | }, 62 | ParseFunc: func(self *Spider, resp *context.Response) { 63 | query := resp.GetHtmlParser() 64 | total1, _ := query.Find("#sm-pagination div[data-total-page]").First().Attr("data-total-page") 65 | total1 = strings.Trim(total1, " \t\n") 66 | total, _ := strconv.Atoi(total1) 67 | if total > self.MaxPage { 68 | total = self.MaxPage 69 | } else if total == 0 { 70 | reporter.Log.Printf("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", self.GetName(), self.GetKeyword(), resp.GetRuleName()) 71 | return 72 | } 73 | 74 | // invoke the aid function under the named rule 75 | self.AidRule("生成请求", []interface{}{[2]int{1, total}, "搜索结果"}) 76 | // parse the response with the named rule 77 | self.CallRule("搜索结果", resp) 78 | }, 79 | }, 80 | 81 | "搜索结果": &Rule{ 82 | // Note: the field list must stay consistent with the data actually output 83 | OutFeild: []string{ 84 | "公司", 85 | "标题", 86 | "价格", 87 | "销量", 88 | "星级", 89 | "地址", 90 | "链接", 91 | }, 92 | ParseFunc: func(self *Spider, resp *context.Response) { 93 | query := resp.GetHtmlParser() 94 | 95 | query.Find("#sm-offer-list > li").Each(func(i int, s *goquery.Selection) { 96 | 97 | // extract the company 98 | company, _ := s.Find("a.sm-offer-companyName").First().Attr("title") 99 | 100 | // extract the title 101 | t := s.Find(".sm-offer-title > a:nth-child(1)") 102 | title, _ := t.Attr("title") 103 | 104 | // extract the URL 105 | url, _ := t.Attr("href") 106 | 107 | // extract the price 108 | price := s.Find(".sm-offer-priceNum").First().Text() 109 | 110 | // extract the sales volume 111 | sales := s.Find("span.sm-offer-trade > em").First().Text() 112 | 113 | // extract the address 114 | address, _ := s.Find(".sm-offer-location").First().Attr("title") 115 | 116 | // extract the credit-age level 117 | level := s.Find("span.sm-offer-companyTag > a.sw-ui-flaticon-cxt16x16").First().Text() 118 | 119 | // stash the result in the Response for the pipeline 120 | resp.AddItem(map[string]string{ 121 | self.GetOutFeild(resp, 0): company, 122 | self.GetOutFeild(resp, 1): title, 123 |
self.GetOutFeild(resp, 2): price, 124 | self.GetOutFeild(resp, 3): sales, 125 | self.GetOutFeild(resp, 4): level, 126 | self.GetOutFeild(resp, 5): address, 127 | self.GetOutFeild(resp, 6): url, 128 | }) 129 | }) 130 | }, 131 | }, 132 | }, 133 | }, 134 | } 135 | -------------------------------------------------------------------------------- /spiders/taobaosearch.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // base packages 4 | import ( 5 | // "github.com/PuerkitoBio/goquery" //DOM parsing 6 | "github.com/henrylee2cn/pholcus/downloader/context" //required 7 | "github.com/henrylee2cn/pholcus/reporter" //log output 8 | . "github.com/henrylee2cn/pholcus/spiders/spider" //required 9 | ) 10 | 11 | // header-setting packages 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // encoding packages 17 | import ( 18 | // "encoding/xml" 19 | "encoding/json" 20 | ) 21 | 22 | // string-processing packages 23 | import ( 24 | "regexp" 25 | "strconv" 26 | "strings" 27 | ) 28 | 29 | // other packages 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | var TaobaoSearch = &Spider{ 36 | Name: "淘宝搜索", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AidRule("生成请求", []interface{}{[2]int{0, 1}, "生成请求"}) 43 | }, 44 | 45 | Nodes: map[string]*Rule{ 46 | 47 | "生成请求": &Rule{ 48 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 49 | self.LoopAddQueue( 50 | aid[0].([2]int), 51 | func(i int) []string { 52 | return []string{"http://s.taobao.com/search?_input_charset=utf-8&q=" + self.GetKeyword() + "&s=" + strconv.Itoa(i*44)} 53 | }, 54 | map[string]interface{}{ 55 | "rule": aid[1].(string), 56 | }, 57 | ) 58 | return nil 59 | }, 60 | ParseFunc: func(self *Spider, resp *context.Response) { 61 | query := resp.GetHtmlParser() 62 | src := query.Find("script").Text() 63 | if strings.Contains(src, "抱歉!没有找到与") { 64 | reporter.Log.Println("搜索结果为 0 !") 65 | return 66 | } 67 | 68 | re, _ := regexp.Compile(`(?U)"totalPage":[\d]+,`) 69 | total := re.FindString(src) 70 | re, _ = regexp.Compile(`[\d]+`) 71 | total = re.FindString(total) 72 | totalPage, _ := strconv.Atoi(total) 73 | 74 | if totalPage > self.MaxPage { 75 | totalPage = self.MaxPage 76 | } else if totalPage == 0 { 77 | reporter.Log.Printf("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", self.GetName(), self.GetKeyword(), resp.GetRuleName()) 78 | return 79 | } 80 | // invoke the aid function under the named rule 81 | self.AidRule("生成请求", []interface{}{[2]int{1, totalPage}, "搜索结果"}) 82 | // parse the response with the named rule 83 | self.CallRule("搜索结果", resp) 84 | }, 85 | }, 86 | 87 | "搜索结果": &Rule{ 88 | // Note: the field list must stay consistent with the data actually output 89 | OutFeild: []string{ 90 | "标题", 91 | "价格", 92 | "销量", 93 | "店铺", 94 | "链接", 95 | }, 96 | ParseFunc: func(self *Spider, resp *context.Response) { 97 | query := resp.GetHtmlParser() 98 | re, _ := regexp.Compile(`"auctions".*,"recommendAuctions"`) 99 | src := query.Find("script").Text() 100 | 101 | src = re.FindString(src) 102 | 103 | re, _ = regexp.Compile(`"auctions":`) 104 | src = re.ReplaceAllString(src, "") 105 | 106 | re, _ = regexp.Compile(`,"recommendAuctions"`) 107 | src = re.ReplaceAllString(src, "") 108 | 109 | re, _ = regexp.Compile("\\<[\\S\\s]+?\\>") 110 | // src = re.ReplaceAllStringFunc(src, strings.ToLower) 111 | src = re.ReplaceAllString(src, " ") 112 | 113 | src = strings.Trim(src, " \t\n") 114 | 115 | infos := []map[string]interface{}{} 116 | 117 | err := json.Unmarshal([]byte(src), &infos) 118 | 119 | if err != nil { 120 | reporter.Log.Printf("error
is %v\n", err) 121 | return 122 | } else { 123 | for _, info := range infos { 124 | 125 | // 结果存入Response中转 126 | resp.AddItem(map[string]string{ 127 | self.GetOutFeild(resp, 0): info["raw_title"].(string), 128 | self.GetOutFeild(resp, 1): info["view_price"].(string), 129 | self.GetOutFeild(resp, 2): info["view_sales"].(string), 130 | self.GetOutFeild(resp, 3): info["nick"].(string), 131 | self.GetOutFeild(resp, 4): info["detail_url"].(string), 132 | }) 133 | } 134 | } 135 | }, 136 | }, 137 | }, 138 | }, 139 | } 140 | -------------------------------------------------------------------------------- /spiders/spider/spider.go: -------------------------------------------------------------------------------- 1 | // 蜘蛛,采集规则。 2 | package spider 3 | 4 | import ( 5 | "github.com/henrylee2cn/pholcus/downloader/context" 6 | "github.com/henrylee2cn/pholcus/pholcus" 7 | // "github.com/henrylee2cn/pholcus/pholcus/status" 8 | ) 9 | 10 | type Spider struct { 11 | Name string 12 | Pausetime [2]uint //暂停区间Pausetime[0]~Pausetime[0]+Pausetime[1] 13 | *RuleTree 14 | // *SpiderStatus 15 | //以下为可选成员 16 | MaxPage int 17 | Keyword string 18 | Depth int 19 | Id int //所在SpiderList的下标编号 20 | } 21 | 22 | // func NewSpider() *Spider { 23 | // sp := new(Spider) 24 | // sp.RuleTree = &RuleTree{ 25 | // Nodes: make(map[string]*Rule), 26 | // } 27 | // return sp 28 | // } 29 | 30 | func (self *Spider) Start(sp *Spider) { 31 | sp.RuleTree.Root(sp) 32 | } 33 | 34 | func (self *Spider) GetName() string { 35 | return self.Name 36 | } 37 | 38 | func (self *Spider) GetId() int { 39 | return self.Id 40 | } 41 | 42 | func (self *Spider) GetKeyword() string { 43 | return self.Keyword 44 | } 45 | 46 | func (self *Spider) GetRules() map[string]*Rule { 47 | return self.RuleTree.Nodes 48 | } 49 | 50 | // 根据响应流运行指定解析规则 51 | func (self *Spider) GoRule(resp *context.Response) { 52 | self.RuleTree.Nodes[resp.GetRuleName()].ParseFunc(self, resp) 53 | } 54 | 55 | // 用指定规则解析响应流 56 | func (self *Spider) CallRule(ruleName string, resp *context.Response) { 57 | resp.SetRuleName(ruleName) 58 | self.GoRule(resp) 59 | } 60 | 61 | // 调用指定规则下辅助函数 62 | func (self *Spider) AidRule(ruleName string, aid []interface{}) interface{} { 63 | rule := self.RuleTree.Nodes[ruleName] 64 | return rule.AidFunc(self, aid) 65 | } 66 | 67 | // 获取任务规则采集语义字段 68 | func (self *Spider) GetOutFeild(resp *context.Response, index int) string { 69 | return self.RuleTree.Nodes[resp.GetRuleName()].OutFeild[index] 70 | } 71 | 72 | // 获取任意规则采集语义字段 73 | func (self *Spider) ShowOutFeild(ruleName string, index int) string { 74 | return self.RuleTree.Nodes[ruleName].OutFeild[index] 75 | } 76 | 77 | func (self *Spider) LoopAddQueue(loop [2]int, urlFn func(int) []string, param map[string]interface{}) { 78 | for ; loop[0] < loop[1]; loop[0]++ { 79 | urls := urlFn(loop[0]) 80 | self.BulkAddQueue(urls, param) 81 | } 82 | } 83 | 84 | func (self *Spider) BulkAddQueue(urls []string, param map[string]interface{}) { 85 | for _, url := range urls { 86 | param["url"] = url 87 | self.AddQueue(param) 88 | } 89 | } 90 | 91 | func (self *Spider) AddQueue(param map[string]interface{}) { 92 | req := self.NewRequest(param) 93 | pholcus.Self.Push(req) 94 | } 95 | 96 | // 生成请求 97 | // param全部参数列表 98 | // req := &Request{ 99 | // url: param["url"].(string), //必填 100 | // parent: "", //若有必填 101 | // rule: param["rule"].(string), //必填 102 | // spider: param["spider"].(string), //自动填写 103 | // respType: param["respType"].(string),//可默认 104 | // method: param["method"].(string), //可默认 105 | // header: 
param["header"],//可默认 106 | // cookies: param["cookies"].([]*http.Cookie),//可默认 107 | // postdata: param["postdata"].(string),//可默认 108 | // canOutsource: param["canOutsource"].(bool),//可默认 109 | // checkRedirect: param["checkRedirect"].(func(req *http.Request, via []*http.Request) error),//可默认 110 | // proxyHost: param["proxyHost"].(string),//可默认 111 | // temp: param["temp"].(map[string]interface{}),//可默认 112 | // } 113 | 114 | func (self *Spider) NewRequest(param map[string]interface{}) *context.Request { 115 | param["spider"] = self.GetName() 116 | req := context.NewRequest(param) 117 | req.SetSpiderId(self.GetId()) 118 | return req 119 | } 120 | 121 | //采集规则树 122 | type RuleTree struct { 123 | Spread []string //作为服务器时的请求分发点 124 | Root func(*Spider) 125 | Nodes map[string]*Rule 126 | } 127 | 128 | // 采集规则单元 129 | type Rule struct { 130 | OutFeild []string //注意:有无字段语义和是否输出数据必须保持一致 131 | // 内容解析函数 132 | ParseFunc func(*Spider, *context.Response) 133 | // 通用辅助函数 134 | AidFunc func(*Spider, []interface{}) interface{} 135 | } 136 | 137 | func (self *Rule) GetOutFeild() []string { 138 | return self.OutFeild 139 | } 140 | -------------------------------------------------------------------------------- /spiders/googlesearch.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // 基础包 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/downloader/context" //必需 7 | "github.com/henrylee2cn/pholcus/reporter" //信息输出 8 | . "github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | "regexp" 25 | "strconv" 26 | "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | "math" 33 | ) 34 | 35 | var googleIp = []string{ 36 | "210.242.125.100", 37 | "210.242.125.96", 38 | "210.242.125.91", 39 | "210.242.125.95", 40 | "64.233.189.163", 41 | "58.123.102.5", 42 | "210.242.125.97", 43 | "210.242.125.115", 44 | "58.123.102.28", 45 | "210.242.125.70", 46 | } 47 | 48 | var GoogleSearch = &Spider{ 49 | Name: "谷歌搜索", 50 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 51 | // Optional: &Optional{}, 52 | RuleTree: &RuleTree{ 53 | // Spread: []string{}, 54 | Root: func(self *Spider) { 55 | var url string 56 | var success bool 57 | reporter.Log.Println("正在查找可用的Google镜像,该过程可能需要几分钟……") 58 | for _, ip := range googleIp { 59 | url = "http://" + ip + "/search?q=" + self.GetKeyword() + "&newwindow=1&biw=1600&bih=398&start=" 60 | if _, err := goquery.NewDocument(url); err == nil { 61 | success = true 62 | break 63 | } 64 | } 65 | if !success { 66 | reporter.Log.Println("没有可用的Google镜像IP!!") 67 | return 68 | } 69 | reporter.Log.Println("开始Google搜索……") 70 | self.AddQueue(map[string]interface{}{ 71 | "url": url, 72 | "rule": "获取总页数", 73 | "temp": map[string]interface{}{ 74 | "baseUrl": url, 75 | }, 76 | }) 77 | }, 78 | 79 | Nodes: map[string]*Rule{ 80 | 81 | "获取总页数": &Rule{ 82 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 83 | self.LoopAddQueue( 84 | aid[0].([2]int), 85 | func(i int) []string { 86 | return []string{aid[1].(string) + strconv.Itoa(10*i)} 87 | }, 88 | aid[2].(map[string]interface{}), 89 | ) 90 | return nil 91 | }, 92 | ParseFunc: func(self *Spider, resp *context.Response) { 93 | query := resp.GetHtmlParser() 94 | txt := query.Find("#resultStats").Text() 95 | reporter.Log.Println("总页数txt:", txt) 
96 | re, _ := regexp.Compile(`,+`) 97 | txt = re.ReplaceAllString(txt, "") 98 | re, _ = regexp.Compile(`[\d]+`) 99 | txt = re.FindString(txt) 100 | num, _ := strconv.Atoi(txt) 101 | reporter.Log.Println("总页数:", num) 102 | total := int(math.Ceil(float64(num) / 10)) 103 | if total > self.MaxPage { 104 | total = self.MaxPage 105 | } else if total == 0 { 106 | reporter.Log.Printf("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", self.GetName(), self.GetKeyword(), resp.GetRuleName()) 107 | return 108 | } 109 | // invoke the aid function under the named rule 110 | self.AidRule("获取总页数", 111 | []interface{}{ 112 | [2]int{1, total}, 113 | resp.GetTemp("baseUrl"), 114 | map[string]interface{}{ 115 | "rule": "搜索结果", 116 | }, 117 | }) 118 | // parse the response with the named rule 119 | self.CallRule("搜索结果", resp) 120 | }, 121 | }, 122 | 123 | "搜索结果": &Rule{ 124 | // Note: the field list must stay consistent with the data actually output 125 | OutFeild: []string{ 126 | "标题", 127 | "内容", 128 | "链接", 129 | }, 130 | ParseFunc: func(self *Spider, resp *context.Response) { 131 | query := resp.GetHtmlParser() 132 | query.Find("#ires li.g").Each(func(i int, s *goquery.Selection) { 133 | t := s.Find(".r > a") 134 | href, _ := t.Attr("href") 135 | href = strings.TrimPrefix(href, "/url?q=") 136 | title := t.Text() 137 | content := s.Find(".st").Text() 138 | resp.AddItem(map[string]string{ 139 | self.GetOutFeild(resp, 0): title, 140 | self.GetOutFeild(resp, 1): content, 141 | self.GetOutFeild(resp, 2): href, 142 | }) 143 | }) 144 | }, 145 | }, 146 | }, 147 | }, 148 | } 149 | -------------------------------------------------------------------------------- /downloader/context/response.go: -------------------------------------------------------------------------------- 1 | package context 2 | 3 | import ( 4 | "github.com/PuerkitoBio/goquery" 5 | "github.com/bitly/go-simplejson" 6 | "github.com/henrylee2cn/pholcus/reporter" 7 | "net/http" 8 | "strings" 9 | ) 10 | 11 | // Response represents an entity to be crawled. 12 | type Response struct { 13 | // isfail is true when the crawl failed, and errormsg holds the failure reason. 14 | isfail bool 15 | 16 | errormsg string 17 | 18 | // The request crawled by the spider, containing the url and relevant information. 19 | *Request 20 | 21 | // The body is the plain text of the crawl result. 22 | body string 23 | 24 | header http.Header 25 | cookies []*http.Cookie 26 | 27 | // The docParser is a pointer to the goquery object holding the html result. 28 | docParser *goquery.Document 29 | 30 | // The jsonMap is the json result. 31 | jsonMap *simplejson.Json 32 | 33 | // The items slice is the container of parsed results. 34 | items []map[string]string 35 | } 36 | 37 | // NewResponse returns an initialized Response object. 38 | func NewResponse(req *Request) *Response { 39 | return &Response{Request: req, items: []map[string]string{}} 40 | } 41 | 42 | // SetHeader saves the header of the http response 43 | func (self *Response) SetHeader(header http.Header) { 44 | self.header = header 45 | } 46 | 47 | // GetHeader returns the header of the http response 48 | func (self *Response) GetHeader() http.Header { 49 | return self.header 50 | } 51 | 52 | // SetCookies saves the cookies of the http response 53 | func (self *Response) SetCookies(cookies []*http.Cookie) { 54 | self.cookies = cookies 55 | } 56 | 57 | // GetCookies returns the cookies of the http response 58 | func (self *Response) GetCookies() []*http.Cookie { 59 | return self.cookies 60 | } 61 | 62 | // IsSucc reports whether the download succeeded.
63 | func (self *Response) IsSucc() bool { 64 | return !self.isfail 65 | } 66 | 67 | // Errormsg shows the download error message. 68 | func (self *Response) Errormsg() string { 69 | return self.errormsg 70 | } 71 | 72 | // SetStatus saves status info about the download process. 73 | func (self *Response) SetStatus(isfail bool, errormsg string) { 74 | self.isfail = isfail 75 | self.errormsg = errormsg 76 | } 77 | 78 | // AddItem appends a parsed key-value map to the response items for the Pipeline 79 | func (self *Response) AddItem(data map[string]string) { 80 | self.items = append(self.items, data) 81 | } 82 | 83 | func (self *Response) GetItem(idx int) map[string]string { 84 | return self.items[idx] 85 | } 86 | 87 | func (self *Response) GetItems() []map[string]string { 88 | return self.items 89 | } 90 | 91 | // SetRequest saves the request object of this page. 92 | func (self *Response) SetRequest(r *Request) *Response { 93 | self.Request = r 94 | return self 95 | } 96 | 97 | // GetRequest returns the request object of this page. 98 | func (self *Response) GetRequest() *Request { 99 | return self.Request 100 | } 101 | 102 | // SetBodyStr saves the crawled plain string in the Response. 103 | func (self *Response) SetBodyStr(body string) *Response { 104 | self.body = body 105 | return self 106 | } 107 | 108 | // GetBodyStr returns the crawled plain string. 109 | func (self *Response) GetBodyStr() string { 110 | return self.body 111 | } 112 | 113 | // SetHtmlParser saves the goquery object bound to the crawl result. 114 | func (self *Response) SetHtmlParser(doc *goquery.Document) *Response { 115 | self.docParser = doc 116 | return self 117 | } 118 | 119 | // GetHtmlParser returns the goquery object bound to the crawl result. 120 | func (self *Response) GetHtmlParser() *goquery.Document { 121 | return self.docParser 122 | } 123 | 124 | // ResetHtmlParser rebuilds the goquery object from the stored body string. 125 | func (self *Response) ResetHtmlParser() *goquery.Document { 126 | r := strings.NewReader(self.body) 127 | var err error 128 | self.docParser, err = goquery.NewDocumentFromReader(r) 129 | if err != nil { 130 | reporter.Log.Println(err.Error()) 131 | panic(err.Error()) 132 | } 133 | return self.docParser 134 | } 135 | 136 | // SetJson saves the json result. 137 | func (self *Response) SetJson(js *simplejson.Json) *Response { 138 | self.jsonMap = js 139 | return self 140 | } 141 | 142 | // GetJson returns the json result.
143 | func (self *Response) GetJson() *simplejson.Json { 144 | return self.jsonMap 145 | } 146 | -------------------------------------------------------------------------------- /pholcus/crawler/crawl.go: -------------------------------------------------------------------------------- 1 | package crawler 2 | 3 | import ( 4 | // "fmt" 5 | "github.com/henrylee2cn/pholcus/config" 6 | "github.com/henrylee2cn/pholcus/downloader" 7 | "github.com/henrylee2cn/pholcus/downloader/context" 8 | "github.com/henrylee2cn/pholcus/pipeline" 9 | "github.com/henrylee2cn/pholcus/reporter" 10 | "github.com/henrylee2cn/pholcus/scheduler" 11 | "github.com/henrylee2cn/pholcus/spiders/spider" 12 | "math/rand" 13 | "sync" 14 | "time" 15 | ) 16 | 17 | type crawler struct { 18 | *spider.Spider 19 | downloader.Downloader 20 | pipeline.Pipeline 21 | srcManage [2]uint 22 | } 23 | 24 | func New() Crawler { 25 | return &crawler{ 26 | Pipeline: pipeline.New(), 27 | Downloader: downloader.NewHttpDownloader(), 28 | srcManage: [2]uint{}, 29 | } 30 | } 31 | 32 | func (self *crawler) Init(sp *spider.Spider) Crawler { 33 | self.Pipeline.Init(sp) 34 | self.Spider = sp 35 | self.Downloader = downloader.NewHttpDownloader() 36 | self.srcManage = [2]uint{} 37 | return self 38 | } 39 | 40 | // entry point for task execution 41 | func (self *crawler) Start() { 42 | // start the output-management goroutine in advance 43 | self.Pipeline.Start() 44 | 45 | // start running 46 | self.Spider.Start(self.Spider) 47 | self.Run() 48 | // reporter.Log.Println("************** breakpoint 8 ***********") 49 | // tell the output module to flush un-output data 50 | self.Pipeline.CtrlR() 51 | // reporter.Log.Println("************** breakpoint 11 ***********") 52 | } 53 | 54 | func (self *crawler) Run() { 55 | for { 56 | // pop one request from the queue 57 | req := self.GetOne() 58 | 59 | // handle queue exit and empty requests 60 | if req == nil { 61 | if self.canStop() { 62 | // reporter.Log.Println("************** leaving the queue ************") 63 | break 64 | } else { 65 | time.Sleep(500 * time.Millisecond) 66 | continue 67 | } 68 | } 69 | 70 | // track this crawler's resource usage 71 | self.RequestIn() 72 | 73 | // global count of downloaded pages 74 | config.ReqSum++ 75 | 76 | go func(req *context.Request) { 77 | defer func() { 78 | self.FreeOne() 79 | self.RequestOut() 80 | }() 81 | reporter.Log.Println("start crawl :", req.GetUrl()) 82 | self.Process(req) 83 | }(req) 84 | } 85 | } 86 | 87 | // core processor 88 | func (self *crawler) Process(req *context.Request) { 89 | // declare the response 90 | var resp *context.Response 91 | 92 | defer func() { 93 | if err := recover(); err != nil { // do not affect others 94 | if strerr, ok := err.(string); ok { 95 | reporter.Log.Println(strerr) 96 | } else { 97 | reporter.Log.Println("Process error:", err) 98 | } 99 | } 100 | }() 101 | // reporter.Log.Println("************** breakpoint 1 ***********") 102 | // download the page, retrying up to 3 times on failure 103 | for i := 0; i < 3; i++ { 104 | self.sleep() 105 | resp = self.Downloader.Download(req) 106 | if resp.IsSucc() { 107 | break 108 | } 109 | } 110 | // reporter.Log.Println("************** breakpoint 2 ***********") 111 | if !resp.IsSucc() { // nothing to process if it still failed 112 | return 113 | } 114 | // reporter.Log.Println("************** breakpoint 3 ***********") 115 | // process the page and distill the data 116 | self.Spider.GoRule(resp) 117 | // reporter.Log.Println("************** breakpoint 5 ***********") 118 | // store this request's results in the pipeline 119 | datas := resp.GetItems() 120 | for i, count := 0, len(datas); i < count; i++ { 121 | self.Pipeline.Collect( 122 | resp.GetRuleName(), //DataCell.RuleName 123 | datas[i], //DataCell.Data 124 | resp.GetUrl(), //DataCell.Url 125 | resp.GetParent(), //DataCell.ParentUrl 126 | time.Now().Format("2006-01-02 15:04:05"), 127 |
) 128 | } 129 | // reporter.Log.Println("************** breakpoint end ***********") 130 | } 131 | 132 | // common helper methods 133 | func (self *crawler) sleep() { 134 | // rand.Intn panics on a zero argument, so only add the random span when Pausetime[1] > 0 135 | sleeptime := int(self.Spider.Pausetime[0]) 136 | if self.Spider.Pausetime[1] > 0 { 137 | sleeptime += rand.Intn(int(self.Spider.Pausetime[1])) 138 | } 139 | time.Sleep(time.Duration(sleeptime) * time.Millisecond) 140 | } 141 | 142 | // fetch one request from the scheduler 143 | func (self *crawler) GetOne() *context.Request { 144 | return scheduler.Self.Use(self.Spider.GetId()) 145 | } 146 | 147 | // free one resource slot in the scheduler 148 | func (self *crawler) FreeOne() { 149 | scheduler.Self.Free() 150 | } 151 | 152 | func (self *crawler) RequestIn() { 153 | self.srcManage[0]++ 154 | } 155 | 156 | var requestOutMutex sync.Mutex 157 | 158 | func (self *crawler) RequestOut() { 159 | requestOutMutex.Lock() 160 | defer func() { 161 | requestOutMutex.Unlock() 162 | }() 163 | self.srcManage[1]++ 164 | } 165 | 166 | // report whether the scheduler still holds resources belonging to this crawler 167 | func (self *crawler) canStop() bool { 168 | // reporter.Log.Println("**************", self.srcManage[0], self.srcManage[1], "***********") 169 | 170 | return self.srcManage[0] == self.srcManage[1] && scheduler.Self.IsEmpty(self.Spider.GetId()) 171 | } 172 | -------------------------------------------------------------------------------- /spiders/hollandandbarrett.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // base packages 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM parsing 6 | "github.com/henrylee2cn/pholcus/downloader/context" //required 7 | "github.com/henrylee2cn/pholcus/reporter" //log output 8 | . "github.com/henrylee2cn/pholcus/spiders/spider" //required 9 | ) 10 | 11 | // header-setting packages 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // encoding packages 17 | import ( 18 | // "encoding/xml" 19 | "encoding/json" 20 | ) 21 | 22 | // string-processing packages 23 | import ( 24 | "regexp" 25 | // "strconv" 26 | "strings" 27 | ) 28 | 29 | // other packages 30 | import ( 31 | "fmt" 32 | // "math" 33 | ) 34 | 35 | var Hollandandbarrett = &Spider{ 36 | Name: "Hollandandbarrett", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AddQueue( 43 | map[string]interface{}{ 44 | "url": "http://www.hollandandbarrett.com/", 45 | "rule": "获取版块URL", 46 | }, 47 | ) 48 | }, 49 | 50 | Nodes: map[string]*Rule{ 51 | 52 | "获取版块URL": &Rule{ 53 | ParseFunc: func(self *Spider, resp *context.Response) { 54 | query := resp.GetHtmlParser() 55 | lis := query.Find(".footer-links nav.l-one-half a") 56 | 57 | lis.Each(func(i int, s *goquery.Selection) { 58 | if url, ok := s.Attr("href"); ok { 59 | tit, _ := s.Attr("title") 60 | self.AddQueue( 61 | map[string]interface{}{ 62 | "url": "http://www.hollandandbarrett.com" + url + "?showAll=1&pageHa=1&es=true&vm=grid&imd=true&format=json&single=true", 63 | "rule": "获取总数", 64 | "temp": map[string]interface{}{ 65 | "type": tit, 66 | "baseUrl": url, 67 | }, 68 | }, 69 | ) 70 | } 71 | }) 72 | }, 73 | }, 74 | 75 | "获取总数": &Rule{ 76 | ParseFunc: func(self *Spider, resp *context.Response) { 77 | 78 | query := resp.GetHtmlParser() 79 | 80 | re, _ := regexp.Compile(`(?U)"totalNumRecs":[\d]+,`) 81 | total := re.FindString(query.Text()) 82 | re, _ = regexp.Compile(`[\d]+`) 83 | total = re.FindString(total) 84 | total = strings.Trim(total, " \t\n") 85 | 86 | if total == "0" { 87 | reporter.Log.Printf("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", self.GetName(), self.GetKeyword(), resp.GetRuleName()) 88 | } else { 89 | 90 | self.AddQueue( 91 | map[string]interface{}{ 92 |
"url": "http://www.hollandandbarrett.com" + resp.GetTemp("baseUrl").(string) + "?showAll=" + total + "&pageHa=1&es=true&vm=grid&imd=true&format=json&single=true", 93 | "rule": "商品详情", 94 | "temp": map[string]interface{}{ 95 | "type": resp.GetTemp("type").(string), 96 | }, 97 | }, 98 | ) 99 | 100 | } 101 | }, 102 | }, 103 | 104 | "商品详情": &Rule{ 105 | //注意:有无字段语义和是否输出数据必须保持一致 106 | OutFeild: []string{ 107 | "标题", 108 | "原价", 109 | "折后价", 110 | "打折", 111 | "星级", 112 | "分类", 113 | }, 114 | ParseFunc: func(self *Spider, resp *context.Response) { 115 | query := resp.GetHtmlParser() 116 | 117 | src := query.Text() 118 | 119 | infos := map[string]interface{}{} 120 | 121 | err := json.Unmarshal([]byte(src), &infos) 122 | 123 | if err != nil { 124 | reporter.Log.Printf("error is %v\n", err) 125 | return 126 | } else { 127 | for _, info1 := range infos["contents"].([]interface{})[0].(map[string]interface{})["mainContent"].([]interface{})[0].(map[string]interface{})["records"].([]interface{}) { 128 | 129 | info2 := info1.(map[string]interface{})["records"].([]interface{})[0].(map[string]interface{})["attributes"].(map[string]interface{}) 130 | 131 | var n, price1, price2, prm, level string 132 | 133 | if info2["Name"] == nil { 134 | n = "" 135 | } else { 136 | n = fmt.Sprint(info2["Name"]) 137 | n = strings.TrimRight(n, "]") 138 | n = strings.TrimLeft(n, "[") 139 | } 140 | 141 | if info2["lp"] == nil { 142 | price1 = "" 143 | } else { 144 | price1 = fmt.Sprint(info2["lp"]) 145 | price1 = strings.TrimRight(price1, "]") 146 | price1 = strings.TrimLeft(price1, "[") 147 | } 148 | 149 | if info2["sp"] == nil { 150 | price2 = "" 151 | } else { 152 | price2 = fmt.Sprint(info2["sp"]) 153 | price2 = strings.TrimRight(price2, "]") 154 | price2 = strings.TrimLeft(price2, "[") 155 | } 156 | 157 | if info2["prm"] == nil { 158 | prm = "" 159 | } else { 160 | prm = fmt.Sprint(info2["prm"]) 161 | prm = strings.TrimRight(prm, "]") 162 | prm = strings.TrimLeft(prm, "[") 163 | } 164 | 165 | if info2["ratingCount"] == nil { 166 | level = "0" 167 | } else { 168 | level = fmt.Sprint(info2["ratingCount"]) 169 | level = strings.TrimRight(level, "]") 170 | level = strings.TrimLeft(level, "[") 171 | } 172 | 173 | // 结果存入Response中转 174 | resp.AddItem(map[string]string{ 175 | self.GetOutFeild(resp, 0): n, 176 | self.GetOutFeild(resp, 1): price1, 177 | self.GetOutFeild(resp, 2): price2, 178 | self.GetOutFeild(resp, 3): prm, 179 | self.GetOutFeild(resp, 4): level, 180 | self.GetOutFeild(resp, 5): resp.GetTemp("type").(string), 181 | }) 182 | } 183 | } 184 | }, 185 | }, 186 | }, 187 | }, 188 | } 189 | -------------------------------------------------------------------------------- /pipeline/collector/output_lib.go: -------------------------------------------------------------------------------- 1 | //数据输出 2 | package collector 3 | 4 | import ( 5 | "github.com/tealeg/xlsx" 6 | "gopkg.in/mgo.v2" 7 | // "gopkg.in/mgo.v2/bson" 8 | "encoding/csv" 9 | "github.com/henrylee2cn/pholcus/config" 10 | "github.com/henrylee2cn/pholcus/reporter" 11 | "os" 12 | "strconv" 13 | "strings" 14 | // "time" 15 | ) 16 | 17 | /************************ excel 输出 ***************************/ 18 | func (self *Collector) excel(dataIndex int) { 19 | defer func() { 20 | if err := recover(); err != nil { 21 | reporter.Log.Println(err) 22 | } 23 | }() 24 | 25 | var file *xlsx.File 26 | var sheet *xlsx.Sheet 27 | var row *xlsx.Row 28 | var cell *xlsx.Cell 29 | var err error 30 | 31 | folder1 := "data" 32 | _folder2 := 
strings.Split(config.StartTime.Format("2006-01-02 15:04:05"), ":") 33 | folder2 := _folder2[0] + "时" + _folder2[1] + "分" + _folder2[2] + "秒" 34 | folder2 = folder1 + "/" + folder2 35 | filename := folder2 + "/" + self.Spider.GetName() + "_" + self.Spider.GetKeyword() + " " + strconv.Itoa(self.sum[0]) + "-" + strconv.Itoa(self.sum[1]) + ".xlsx" 36 | 37 | file = xlsx.NewFile() 38 | 39 | // add one worksheet per rule 40 | for Name, Rule := range self.GetRules() { 41 | // skip rules that output nothing 42 | if len(Rule.GetOutFeild()) == 0 { 43 | continue 44 | } 45 | 46 | sheet = file.AddSheet(Name) 47 | row = sheet.AddRow() 48 | for _, title := range Rule.GetOutFeild() { 49 | cell = row.AddCell() 50 | cell.Value = title 51 | } 52 | cell = row.AddCell() 53 | cell.Value = "当前链接" 54 | cell = row.AddCell() 55 | cell.Value = "上级链接" 56 | cell = row.AddCell() 57 | cell.Value = "下载时间" 58 | 59 | num := 0 //subtotal 60 | for _, datacell := range self.DockerQueue.Dockers[dataIndex] { 61 | if datacell["RuleName"].(string) == Name { 62 | row = sheet.AddRow() 63 | for _, title := range Rule.GetOutFeild() { 64 | cell = row.AddCell() 65 | cell.Value = datacell["Data"].(map[string]string)[title] 66 | } 67 | cell = row.AddCell() 68 | cell.Value = datacell["Url"].(string) 69 | cell = row.AddCell() 70 | cell.Value = datacell["ParentUrl"].(string) 71 | cell = row.AddCell() 72 | cell.Value = datacell["DownloadTime"].(string) 73 | num++ 74 | } 75 | } 76 | 77 | reporter.Log.Printf("[任务:%v | 关键词:%v | 小类:%v] 输出 %v 条数据!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), Name, num) 78 | 79 | } 80 | 81 | // create/open the directory 82 | // f1, err := os.Stat(folder1) 83 | // if err != nil || !f1.IsDir() { 84 | // os.Mkdir(folder1, 0) 85 | // } 86 | 87 | // f2, err := os.Stat(folder2) 88 | // if err != nil || !f2.IsDir() { 89 | // os.Mkdir(folder2, 0) 90 | // } 91 | 92 | f2, err := os.Stat(folder2) 93 | if err != nil || !f2.IsDir() { 94 | if err := os.MkdirAll(folder2, 0777); err != nil { 95 | reporter.Log.Printf("Error: %v\n", err) 96 | } 97 | } 98 | 99 | // save the file 100 | err = file.Save(filename) 101 | 102 | if err != nil { 103 | reporter.Log.Println(err) 104 | } 105 | 106 | } 107 | 108 | /************************ CSV output ***************************/ 109 | func (self *Collector) csv(dataIndex int) { 110 | defer func() { 111 | if err := recover(); err != nil { 112 | reporter.Log.Println(err) 113 | } 114 | }() 115 | 116 | folder1 := "data" 117 | _folder2 := strings.Split(config.StartTime.Format("2006-01-02 15:04:05"), ":") 118 | folder2 := _folder2[0] + "时" + _folder2[1] + "分" + _folder2[2] + "秒" 119 | folder2 = folder1 + "/" + folder2 120 | filenameBase := folder2 + "/" + self.Spider.GetName() + "_" + self.Spider.GetKeyword() + " " + strconv.Itoa(self.sum[0]) + "-" + strconv.Itoa(self.sum[1]) 121 | 122 | // create/open the directory 123 | f2, err := os.Stat(folder2) 124 | if err != nil || !f2.IsDir() { 125 | if err := os.MkdirAll(folder2, 0777); err != nil { 126 | reporter.Log.Printf("Error: %v\n", err) 127 | } 128 | } 129 | 130 | // write one CSV file per rule 131 | for Name, Rule := range self.GetRules() { 132 | // skip rules that output nothing 133 | if len(Rule.GetOutFeild()) == 0 { 134 | continue 135 | } 136 | 137 | file, err := os.Create(filenameBase + " (" + Name + ").csv") 138 | 139 | if err != nil { 140 | reporter.Log.Println(err) 141 | continue 142 | } 143 | 144 | // file.WriteString("\xEF\xBB\xBF") // write a UTF-8 BOM 145 | w := csv.NewWriter(file) 146 | th := Rule.GetOutFeild() 147 | th = append(th, []string{"当前链接", "上级链接", "下载时间"}...)
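// th now holds the rule's output fields plus the three bookkeeping columns
// appended above; the call below writes it as the CSV header row.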
148 | w.Write(th) 149 | 150 | num := 0 //subtotal 151 | for _, datacell := range self.DockerQueue.Dockers[dataIndex] { 152 | if datacell["RuleName"].(string) == Name { 153 | row := []string{} 154 | for _, title := range Rule.GetOutFeild() { 155 | row = append(row, datacell["Data"].(map[string]string)[title]) 156 | } 157 | 158 | row = append(row, datacell["Url"].(string)) 159 | row = append(row, datacell["ParentUrl"].(string)) 160 | row = append(row, datacell["DownloadTime"].(string)) 161 | w.Write(row) 162 | 163 | num++ 164 | } 165 | } 166 | // flush the buffered rows 167 | w.Flush() 168 | // close the file 169 | file.Close() 170 | // log the report 171 | reporter.Log.Printf("[任务:%v | 关键词:%v | 小类:%v] 输出 %v 条数据!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), Name, num) 172 | } 173 | } 174 | 175 | /************************ MongoDB output ***************************/ 176 | 177 | func (self *Collector) mgo(dataIndex int) { 178 | session, err := mgo.Dial(config.DB_URL) // connect to the database 179 | if err != nil { 180 | panic(err) 181 | } 182 | defer session.Close() 183 | session.SetMode(mgo.Monotonic, true) 184 | 185 | db := session.DB(config.DB_NAME) // the database name 186 | collection := db.C(config.DB_COLLECTION) // returns the collection directly if it already exists 187 | 188 | for i, count := 0, len(self.DockerQueue.Dockers[dataIndex]); i < count; i++ { 189 | err = collection.Insert((interface{})(self.DockerQueue.Dockers[dataIndex][i])) 190 | if err != nil { 191 | panic(err) 192 | } 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /spiders/baidunews.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // base packages 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM parsing 6 | "github.com/henrylee2cn/pholcus/downloader/context" //required 7 | "github.com/henrylee2cn/pholcus/reporter" //log output 8 | .
"github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/json" 19 | "encoding/xml" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | "regexp" 25 | // "strconv" 26 | "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | // "math" 33 | "time" 34 | ) 35 | 36 | var rss_BaiduNews = NewRSS(map[string]string{ 37 | "国内最新": "http://news.baidu.com/n?cmd=4&class=civilnews&tn=rss", 38 | "国际最新": "http://news.baidu.com/n?cmd=4&class=internews&tn=rss", 39 | "军事最新": "http://news.baidu.com/n?cmd=4&class=mil&tn=rss", 40 | "财经最新": "http://news.baidu.com/n?cmd=4&class=finannews&tn=rss", 41 | "互联网最新": "http://news.baidu.com/n?cmd=4&class=internet&tn=rss", 42 | "房产最新": "http://news.baidu.com/n?cmd=4&class=housenews&tn=rss", 43 | "汽车最新": "http://news.baidu.com/n?cmd=4&class=autonews&tn=rss", 44 | "体育最新": "http://news.baidu.com/n?cmd=4&class=sportnews&tn=rss", 45 | "娱乐最新": "http://news.baidu.com/n?cmd=4&class=enternews&tn=rss", 46 | "游戏最新": "http://news.baidu.com/n?cmd=4&class=gamenews&tn=rss", 47 | "教育最新": "http://news.baidu.com/n?cmd=4&class=edunews&tn=rss", 48 | "女人最新": "http://news.baidu.com/n?cmd=4&class=healthnews&tn=rss", 49 | "科技最新": "http://news.baidu.com/n?cmd=4&class=technnews&tn=rss", 50 | "社会最新": "http://news.baidu.com/n?cmd=4&class=socianews&tn=rss", 51 | }, 52 | []int{1, 2, 3, 4, 5, 6}, 53 | ) 54 | 55 | type BaiduNewsData struct { 56 | Item []BaiduNewsItem `xml:"item"` 57 | } 58 | 59 | type BaiduNewsItem struct { 60 | Title string `xml:"title"` 61 | Link string `xml:"link"` 62 | Description string `xml:"description"` 63 | PubDate string `xml:"pubDate"` 64 | Author string `xml:"author"` 65 | } 66 | 67 | var BaiduNews = &Spider{ 68 | Name: "百度RSS新闻", 69 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 70 | // Optional: &Optional{}, 71 | RuleTree: &RuleTree{ 72 | // Spread: []string{}, 73 | Root: func(self *Spider) { 74 | for k, _ := range rss_BaiduNews.Src { 75 | self.AidRule("LOOP", []interface{}{k}) 76 | } 77 | }, 78 | 79 | Nodes: map[string]*Rule{ 80 | "LOOP": &Rule{ 81 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 82 | k := aid[0].(string) 83 | v := rss_BaiduNews.Src[k] 84 | 85 | self.AddQueue(map[string]interface{}{ 86 | "url": v + "#" + time.Now().String(), 87 | "rule": "XML", 88 | "header": http.Header{"Content-Type": []string{"text/html", "charset=GB2312"}}, 89 | "respType": "text", 90 | "temp": map[string]interface{}{"src": k}, 91 | }) 92 | return nil 93 | }, 94 | }, 95 | "XML": &Rule{ 96 | ParseFunc: func(self *Spider, resp *context.Response) { 97 | page := GBKToUTF8(resp.GetBodyStr()) 98 | page = strings.TrimLeft(page, ``) 99 | re, _ := regexp.Compile(`\<[\/]?rss\>`) 100 | page = re.ReplaceAllString(page, "") 101 | 102 | content := new(BaiduNewsData) 103 | if err := xml.Unmarshal([]byte(page), content); err != nil { 104 | reporter.Log.Println(err) 105 | return 106 | } 107 | 108 | src := resp.GetTemp("src").(string) 109 | 110 | for _, v := range content.Item { 111 | 112 | self.AddQueue(map[string]interface{}{ 113 | "url": v.Link, 114 | "rule": "新闻详情", 115 | "temp": map[string]interface{}{ 116 | "title": CleanHtml(v.Title, 4), 117 | "description": CleanHtml(v.Description, 4), 118 | "src": src, 119 | "releaseTime": CleanHtml(v.PubDate, 4), 120 | "author": CleanHtml(v.Author, 4), 121 | }, 122 | }) 123 | } 124 | 125 | // 循环请求 126 | rss_BaiduNews.Wait(src) 127 | self.AidRule("LOOP", []interface{}{src}) 128 | }, 129 | }, 130 | 131 | "新闻详情": &Rule{ 
132 | // Note: these output fields must stay consistent with the data actually written below
133 | OutFeild: []string{
134 | "标题",
135 | "描述",
136 | "内容",
137 | "发布时间",
138 | "分类",
139 | "作者",
140 | },
141 | ParseFunc: func(self *Spider, resp *context.Response) {
142 | // mark this RSS source as updated
143 | rss_BaiduNews.Updata(resp.GetTemp("src").(string))
144 | 
145 | query1 := resp.GetHtmlParser()
146 | 
147 | query := query1.Find("body")
148 | 
149 | title := resp.GetTemp("title").(string)
150 | 
151 | var findP func(html *goquery.Selection) *goquery.Selection
152 | findP = func(html *goquery.Selection) *goquery.Selection {
153 | if html.Is("body") {
154 | return html
155 | } else if result := html.Parent().Find("p"); len(result.Nodes) == 0 {
156 | return findP(html.Parent())
157 | } else {
158 | return html.Parent()
159 | }
160 | }
161 | 
162 | var info *goquery.Selection
163 | 
164 | if h1s := query.Find("h1"); len(h1s.Nodes) != 0 {
165 | for i := 0; i < len(h1s.Nodes); i++ {
166 | info = findP(h1s.Eq(i))
167 | }
168 | } else if h2s := query.Find("h2"); len(h2s.Nodes) != 0 {
169 | for i := 0; i < len(h2s.Nodes); i++ {
170 | info = findP(h2s.Eq(i))
171 | }
172 | } else if h3s := query.Find("h3"); len(h3s.Nodes) != 0 {
173 | for i := 0; i < len(h3s.Nodes); i++ {
174 | info = findP(h3s.Eq(i))
175 | }
176 | } else {
177 | info = query.Find("body")
178 | }
179 | // strip unwanted tags
180 | // info.RemoveFiltered("script")
181 | // info.RemoveFiltered("style")
182 | infoStr, _ := info.Html()
183 | 
184 | // clean the HTML
185 | infoStr = CleanHtml(infoStr, 5)
186 | 
187 | // stash the result in the Response for the pipeline
188 | result := map[string]string{
189 | self.GetOutFeild(resp, 0): title,
190 | self.GetOutFeild(resp, 1): resp.GetTemp("description").(string),
191 | self.GetOutFeild(resp, 2): infoStr,
192 | self.GetOutFeild(resp, 3): resp.GetTemp("releaseTime").(string),
193 | self.GetOutFeild(resp, 4): resp.GetTemp("src").(string),
194 | self.GetOutFeild(resp, 5): resp.GetTemp("author").(string),
195 | }
196 | resp.AddItem(result)
197 | },
198 | },
199 | },
200 | },
201 | }
202 | 
--------------------------------------------------------------------------------
/downloader/context/request.go:
--------------------------------------------------------------------------------
1 | package context
2 | 
3 | import (
4 | "github.com/bitly/go-simplejson"
5 | "github.com/henrylee2cn/pholcus/reporter"
6 | "io/ioutil"
7 | "net/http"
8 | "os"
9 | )
10 | 
11 | // Request represents an object waiting to be crawled.
12 | type Request struct {
13 | url string
14 | parent string
15 | rule string
16 | spider string
17 | // Response type: html json jsonp text
18 | respType string
19 | // GET POST
20 | method string
21 | // http header
22 | header http.Header
23 | // http cookies
24 | cookies []*http.Cookie
25 | // POST data
26 | postdata string
27 | // set when generated in Spider, according to ruleTree.Outsource
28 | canOutsource bool
29 | // marked by Pholcus to indicate whether this request is outsourced
30 | isOutsource bool
31 | // Redirect function for the downloader used in http.Client.
32 | // If CheckRedirect returns an error, the Client's Get
33 | // method returns both the previous Response and CheckRedirect's error.
34 | // If CheckRedirect returns errors.New("normal"), the error handling after client.Do will ignore the error.
35 | checkRedirect func(req *http.Request, via []*http.Request) error
36 | // proxy host, e.g. 'localhost:80'
37 | proxyHost string
38 | // temporary data; presence is tested via temp[x]==nil, so store typed values such as []int(nil)
39 | temp map[string]interface{}
40 | }
41 | 
42 | // NewRequest returns an initialized Request object.
43 | // The respType is one of: json, jsonp, html, text.
44 | 
45 | func NewRequest(param map[string]interface{}) *Request {
46 | req := &Request{
47 | url: param["url"].(string), //required
48 | rule: param["rule"].(string), //required
49 | spider: param["spider"].(string), //required
50 | }
51 | 
52 | // optional parameters: fall back to defaults when absent
53 | switch v := param["parent"].(type) {
54 | case string:
55 | req.parent = v
56 | default:
57 | req.parent = ""
58 | }
59 | 
60 | switch v := param["respType"].(type) {
61 | case string:
62 | req.respType = v
63 | default:
64 | req.respType = "html"
65 | }
66 | 
67 | switch v := param["method"].(type) {
68 | case string:
69 | req.method = v
70 | default:
71 | req.method = "GET"
72 | }
73 | 
74 | switch v := param["cookies"].(type) {
75 | case []*http.Cookie:
76 | req.cookies = v
77 | default:
78 | req.cookies = nil
79 | }
80 | 
81 | switch v := param["postdata"].(type) {
82 | case string:
83 | req.postdata = v
84 | default:
85 | req.postdata = ""
86 | }
87 | 
88 | switch v := param["canOutsource"].(type) {
89 | case bool:
90 | req.canOutsource = v
91 | default:
92 | req.canOutsource = false
93 | }
94 | 
95 | switch v := param["checkRedirect"].(type) {
96 | case func(*http.Request, []*http.Request) error:
97 | req.checkRedirect = v
98 | default:
99 | req.checkRedirect = nil
100 | }
101 | 
102 | switch v := param["proxyHost"].(type) {
103 | case string:
104 | req.proxyHost = v
105 | default:
106 | req.proxyHost = ""
107 | }
108 | 
109 | switch v := param["temp"].(type) {
110 | case map[string]interface{}:
111 | req.temp = v
112 | default:
113 | req.temp = map[string]interface{}{}
114 | }
115 | 
116 | switch v := param["header"].(type) {
117 | case string:
118 | _, err := os.Stat(v)
119 | if err == nil {
120 | req.header = readHeaderFromFile(v)
121 | }
122 | case http.Header:
123 | req.header = v
124 | default:
125 | req.header = nil
126 | }
127 | 
128 | return req
129 | }
130 | 
131 | func readHeaderFromFile(headerFile string) http.Header {
132 | // read the file, then parse the header and cookies
133 | b, err := ioutil.ReadFile(headerFile)
134 | if err != nil {
135 | // may be a shared-access error
136 | reporter.Log.Println(err.Error())
137 | return nil
138 | }
139 | js, err := simplejson.NewJson(b)
140 | if err != nil { reporter.Log.Println(err.Error()); return nil }
141 | // construct the header
142 | h := make(http.Header)
143 | h.Add("User-Agent", js.Get("User-Agent").MustString())
144 | h.Add("Referer", js.Get("Referer").MustString())
145 | h.Add("Cookie", js.Get("Cookie").MustString())
146 | h.Add("Cache-Control", "max-age=0")
147 | h.Add("Connection", "keep-alive")
148 | return h
149 | }
150 | 
151 | // headerFile points to a JSON file, e.g.:
152 | /* xxx.json
153 | {
154 | "User-Agent":"curl/7.19.3 (i386-pc-win32) libcurl/7.19.3 OpenSSL/1.0.0d",
155 | "Referer":"http://weixin.sogou.com/gzh?openid=oIWsFt6Sb7aZmuI98AU7IXlbjJps",
156 | "Cookie":""
157 | }
158 | */
159 | func (self *Request) AddHeaderFile(headerFile string) *Request {
160 | _, err := os.Stat(headerFile)
161 | if err != nil {
162 | return self
163 | }
164 | h := readHeaderFromFile(headerFile)
165 | self.header = h
166 | return self
167 | }
168 | 
169 | // @host http://localhost:8765/
170 | func (self *Request) AddProxyHost(host string) *Request {
171 | self.proxyHost = host
172 | return self
173 | }
174 | 
175 | func (self *Request) GetHeader() http.Header {
176 | return self.header
177 | }
178 | 
179 | func (self *Request) GetProxyHost() string {
180 | return self.proxyHost
181 | }
182 | 
183 | func (self *Request) GetRedirectFunc() func(req *http.Request, via []*http.Request) error {
184 | return self.checkRedirect
185 | }
186 | 
187 | func (self
*Request) GetUrl() string {
188 | return self.url
189 | }
190 | 
191 | func (self *Request) SetUrl(url string) {
192 | self.url = url
193 | }
194 | 
195 | func (self *Request) GetParent() string {
196 | return self.parent
197 | }
198 | 
199 | func (self *Request) GetRuleName() string {
200 | return self.rule
201 | }
202 | 
203 | func (self *Request) SetRuleName(ruleName string) {
204 | self.rule = ruleName
205 | }
206 | 
207 | func (self *Request) GetSpiderName() string {
208 | return self.spider
209 | }
210 | 
211 | func (self *Request) GetRespType() string {
212 | return self.respType
213 | }
214 | 
215 | func (self *Request) GetMethod() string {
216 | return self.method
217 | }
218 | 
219 | func (self *Request) GetPostdata() string {
220 | return self.postdata
221 | }
222 | 
223 | func (self *Request) GetCookies() []*http.Cookie {
224 | return self.cookies
225 | }
226 | 
227 | func (self *Request) IsOutsource() bool {
228 | return self.isOutsource
229 | }
230 | 
231 | func (self *Request) TryOutsource() bool {
232 | if self.canOutsource {
233 | self.isOutsource = true
234 | return true
235 | }
236 | return false
237 | }
238 | 
239 | 
240 | func (self *Request) GetTemp(key string) interface{} {
241 | return self.temp[key]
242 | }
243 | 
244 | func (self *Request) GetTemps() interface{} {
245 | return self.temp
246 | }
247 | 
248 | func (self *Request) SetTemp(key string, value interface{}) {
249 | self.temp[key] = value
250 | }
251 | 
252 | func (self *Request) GetSpiderId() (int, bool) {
253 | value, ok := self.temp["__SPIDER_ID__"].(int)
254 | return value, ok
255 | }
256 | 
257 | func (self *Request) SetSpiderId(spiderId int) {
258 | self.temp["__SPIDER_ID__"] = spiderId
259 | }
260 | 
--------------------------------------------------------------------------------
/spiders/wangyi.go:
--------------------------------------------------------------------------------
1 | package spiders
2 | 
3 | // base packages
4 | import (
5 | "github.com/PuerkitoBio/goquery" //DOM parsing
6 | "github.com/henrylee2cn/pholcus/downloader/context" //required
7 | // "github.com/henrylee2cn/pholcus/reporter" //logging
8 | .
"github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | "regexp" 25 | // "strconv" 26 | "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | var Wangyi = &Spider{ 36 | Name: "网易新闻", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AddQueue(map[string]interface{}{"url": "http://news.163.com/rank/", "rule": "排行榜主页"}) 43 | }, 44 | 45 | Nodes: map[string]*Rule{ 46 | 47 | "排行榜主页": &Rule{ 48 | ParseFunc: func(self *Spider, resp *context.Response) { 49 | query := resp.GetHtmlParser() 50 | query.Find(".subNav a").Each(func(i int, s *goquery.Selection) { 51 | if url, ok := s.Attr("href"); ok { 52 | self.AddQueue(map[string]interface{}{"url": url, "rule": "新闻排行榜"}) 53 | } 54 | }) 55 | }, 56 | }, 57 | 58 | "新闻排行榜": &Rule{ 59 | ParseFunc: func(self *Spider, resp *context.Response) { 60 | topTit := []string{ 61 | "1小时前点击排行", 62 | "24小时点击排行", 63 | "本周点击排行", 64 | "今日跟帖排行", 65 | "本周跟帖排行", 66 | "本月跟贴排行", 67 | } 68 | query := resp.GetHtmlParser() 69 | // 获取新闻分类 70 | newsType := query.Find(".titleBar h2").Text() 71 | 72 | urls_top := map[string]string{} 73 | 74 | query.Find(".tabContents").Each(func(n int, t *goquery.Selection) { 75 | t.Find("tr").Each(func(i int, s *goquery.Selection) { 76 | // 跳过标题栏 77 | if i == 0 { 78 | return 79 | } 80 | // 内容链接 81 | url, ok := s.Find("a").Attr("href") 82 | 83 | // 排名 84 | top := s.Find(".cBlue").Text() 85 | 86 | if ok { 87 | urls_top[url] += topTit[n] + ":" + top + "," 88 | } 89 | }) 90 | }) 91 | for k, v := range urls_top { 92 | self.AddQueue(map[string]interface{}{ 93 | "url": k, 94 | "rule": "热点新闻", 95 | "temp": map[string]interface{}{ 96 | "newsType": newsType, 97 | "top": v, 98 | }, 99 | }) 100 | } 101 | }, 102 | }, 103 | 104 | "热点新闻": &Rule{ 105 | //注意:有无字段语义和是否输出数据必须保持一致 106 | OutFeild: []string{ 107 | "标题", 108 | "内容", 109 | "排名", 110 | "类别", 111 | "ReleaseTime", 112 | }, 113 | ParseFunc: func(self *Spider, resp *context.Response) { 114 | query := resp.GetHtmlParser() 115 | 116 | // 若有多页内容,则获取阅读全文的链接并获取内容 117 | if pageAll := query.Find(".ep-pages-all"); len(pageAll.Nodes) != 0 { 118 | if pageAllUrl, ok := pageAll.Attr("href"); ok { 119 | self.AddQueue(map[string]interface{}{ 120 | "url": pageAllUrl, 121 | "rule": "热点新闻", 122 | "temp": resp.GetTemps(), 123 | }) 124 | } 125 | return 126 | } 127 | 128 | // 获取标题 129 | title := query.Find("#h1title").Text() 130 | 131 | // 获取内容 132 | content := query.Find("#endText").Text() 133 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 134 | // content = re.ReplaceAllStringFunc(content, strings.ToLower) 135 | content = re.ReplaceAllString(content, "") 136 | 137 | // 获取发布日期 138 | release := query.Find(".ep-time-soure").Text() 139 | release = strings.Split(release, "来源:")[0] 140 | release = strings.Trim(release, " \t\n") 141 | 142 | // 结果存入Response中转 143 | resp.AddItem(map[string]string{ 144 | self.GetOutFeild(resp, 0): title, 145 | self.GetOutFeild(resp, 1): content, 146 | self.GetOutFeild(resp, 2): resp.GetTemp("top").(string), 147 | self.GetOutFeild(resp, 3): resp.GetTemp("newsType").(string), 148 | self.GetOutFeild(resp, 4): release, 149 | }) 150 | }, 151 | }, 152 | }, 153 | }, 154 | } 155 | 156 | // 不确定因素过多,暂未实现抓取 157 | // &crawler.Rule{ 158 | // Name: "热门跟帖", 
159 | // Semantic: []string{
160 | // "新闻标题",
161 | // "新闻链接",
162 | // "评论者",
163 | // "评论内容",
164 | // "release_data",
165 | // },
166 | // Meta: map[string]int{}, // marks state such as whether the total page count has been fetched
167 | // // URL generation rule; arguments: loop counter, Task instance, urltag, params
168 | // UrlFunc: func(self crawler.Crawler, startEnd [2]int, urltag map[string]string, params []string) {
169 | // baseUrl := strings.Split(params[0], ".html")
170 | // self.AddUrl(
171 | // baseUrl+"_"+i+".html",
172 | // "json",
173 | // urltag,
174 | // )
175 | // return self
176 | // },
177 | // ProcessFunc: func(self crawler.Crawler, p *page.Page) {
178 | // // get the rule name for this request's data
179 | // name := p.GetUrlTag()["RuleName"]
180 | 
181 | // // get the total page count
182 | // if _, ok := self.GetRuleExecPage(name); !ok {
183 | // // dry-run to get the total page count
184 | // self.AddUrl(p.GetUrl(), "html", map[string]string{}).Run(false)
185 | // self.CreatAndAddUrl(1, self, urltag, []string{p.GetUrl()}).Run(false)
186 | 
187 | // // store the news title
188 | // p.AddField(map[string]string{self.GetRuleSemantic(name, 0): p.GetUrlTag()["newsTitle"]})
189 | 
190 | // // store the news link
191 | // p.AddField(map[string]string{self.GetRuleSemantic(name, 1): p.GetUrlTag()["newsUrl"]})
192 | 
193 | // // get this page's data
194 | // query := p.GetHtmlParser()
195 | 
196 | // self.SetRuleTotalPage(name, 0)
197 | 
198 | // total1 := query.Find(".pages").Eq(0).Find("li a").Last().Prev().Text()
199 | 
200 | // total2, _ := strconv.Atoi(total1)
201 | 
202 | // self.SetRuleTotalPage(name, total2)
203 | 
204 | // if total, _ := self.GetRuleExecPage(name); total == 0 {
205 | // log.Printf("[消息提示:%v::%v::%v] 没有抓取到任何数据!!!\n", self.GetTaskName(), self.GetKeyword(), name)
206 | // }
207 | // }
208 | 
209 | // query.Find("#hotReplies .reply.essence").Each(func(i int, s *goquery.Selection) {
210 | 
211 | // re, _ = regexp.Compile("\\<[\\S\\s]+?\\>")
212 | 
213 | // // get and store the author and address
214 | // author := s.Find(".author").Text()
215 | // author = re.ReplaceAllString(author, "")
216 | // p.AddField(map[string]string{self.GetRuleSemantic(name, 2): author})
217 | 
218 | // // get and store the comment body
219 | // body := s.Find(".body").Text()
220 | // body = re.ReplaceAllString(body, "")
221 | // p.AddField(map[string]string{self.GetRuleSemantic(name, 3): body})
222 | 
223 | // // get and store the post time
224 | // postTime := s.Find(".postTime").Text()
225 | // postTime = strings.Split(postTime, " 发表")[0]
226 | // p.AddField(map[string]string{self.GetRuleSemantic(name, 5): postTime})
227 | // })
228 | // },
229 | // }, //end
230 | 
--------------------------------------------------------------------------------
/common/config/config.go:
--------------------------------------------------------------------------------
1 | // Package config provides config-file parsing.
2 | package config
3 | 
4 | import (
5 | "errors"
6 | "io/ioutil"
7 | "strconv"
8 | "strings"
9 | "time"
10 | )
11 | 
12 | type Config struct {
13 | globalContent map[string]string
14 | sectionContents map[string]map[string]string
15 | sections []string
16 | }
17 | 
18 | func NewConfig() *Config {
19 | return &Config{
20 | globalContent: make(map[string]string),
21 | sectionContents: make(map[string]map[string]string),
22 | }
23 | }
24 | 
25 | // Load reads a config file and returns the initialized Config.
26 | func (this *Config) Load(configFile string) *Config {
27 | stream, err := ioutil.ReadFile(configFile)
28 | if err != nil {
29 | panic("config read file error : " + configFile + "\n")
30 | }
31 | this.LoadString(string(stream))
32 | return this
33 | }
34 | 
35 | // Save writes config content to a config file.
36 | func (this *Config) Save(configFile string) error { 37 | return ioutil.WriteFile(configFile, []byte(this.String()), 0777) 38 | } 39 | 40 | func (this *Config) Clear() { 41 | this.globalContent = make(map[string]string) 42 | this.sectionContents = make(map[string]map[string]string) 43 | this.sections = nil 44 | } 45 | 46 | func (this *Config) LoadString(s string) error { 47 | lines := strings.Split(s, "\n") 48 | section := "" 49 | for _, line := range lines { 50 | line = strings.Trim(line, emptyRunes) 51 | if line == "" || line[0] == '#' { 52 | continue 53 | } 54 | if line[0] == '[' { 55 | if lineLen := len(line); line[lineLen-1] == ']' { 56 | section = line[1 : lineLen-1] 57 | sectionAdded := false 58 | for _, oldSection := range this.sections { 59 | if section == oldSection { 60 | sectionAdded = true 61 | break 62 | } 63 | } 64 | if !sectionAdded { 65 | this.sections = append(this.sections, section) 66 | } 67 | continue 68 | } 69 | } 70 | pair := strings.SplitN(line, "=", 2) 71 | if len(pair) != 2 { 72 | return errors.New("bad config file syntax") 73 | } 74 | key := strings.Trim(pair[0], emptyRunes) 75 | value := strings.Trim(pair[1], emptyRunes) 76 | if section == "" { 77 | this.globalContent[key] = value 78 | } else { 79 | if _, ok := this.sectionContents[section]; !ok { 80 | this.sectionContents[section] = make(map[string]string) 81 | } 82 | this.sectionContents[section][key] = value 83 | } 84 | } 85 | return nil 86 | } 87 | 88 | func (this *Config) String() string { 89 | s := "" 90 | for key, value := range this.globalContent { 91 | s += key + "=" + value + "\n" 92 | } 93 | for section, content := range this.sectionContents { 94 | s += "[" + section + "]\n" 95 | for key, value := range content { 96 | s += key + "=" + value + "\n" 97 | } 98 | } 99 | return s 100 | } 101 | 102 | func (this *Config) StringWithMeta() string { 103 | s := "__sections__=" + strings.Join(this.sections, ",") + "\n" 104 | return s + this.String() 105 | } 106 | 107 | func (this *Config) GlobalHas(key string) bool { 108 | if _, ok := this.globalContent[key]; ok { 109 | return true 110 | } 111 | return false 112 | } 113 | 114 | func (this *Config) GlobalGet(key string) string { 115 | return this.globalContent[key] 116 | } 117 | 118 | func (this *Config) GlobalSet(key string, value string) { 119 | this.globalContent[key] = value 120 | } 121 | 122 | func (this *Config) GlobalGetInt(key string) int { 123 | value := this.GlobalGet(key) 124 | if value == "" { 125 | return 0 126 | } 127 | result, err := strconv.Atoi(value) 128 | if err != nil { 129 | return 0 130 | } 131 | return result 132 | } 133 | 134 | func (this *Config) GlobalGetInt64(key string) int64 { 135 | value := this.GlobalGet(key) 136 | if value == "" { 137 | return 0 138 | } 139 | result, err := strconv.ParseInt(value, 10, 64) 140 | if err != nil { 141 | return 0 142 | } 143 | return result 144 | } 145 | 146 | func (this *Config) GlobalGetDuration(key string) time.Duration { 147 | return time.Duration(this.GlobalGetInt(key)) * time.Second 148 | } 149 | 150 | func (this *Config) GlobalGetDeadline(key string) time.Time { 151 | return time.Now().Add(time.Duration(this.GlobalGetInt(key)) * time.Second) 152 | } 153 | 154 | func (this *Config) GlobalGetSlice(key string, separator string) []string { 155 | result := []string{} 156 | value := this.GlobalGet(key) 157 | if value != "" { 158 | for _, part := range strings.Split(value, separator) { 159 | result = append(result, strings.Trim(part, emptyRunes)) 160 | } 161 | } 162 | return result 163 | } 164 | 165 
| func (this *Config) GlobalGetSliceInt(key string, separator string) []int {
166 | result := []int{}
167 | value := this.GlobalGetSlice(key, separator)
168 | for _, part := range value {
169 | n, err := strconv.Atoi(part)
170 | if err != nil {
171 | continue
172 | }
173 | result = append(result, n)
174 | }
175 | return result
176 | }
177 | 
178 | func (this *Config) GlobalContent() map[string]string {
179 | return this.globalContent
180 | }
181 | 
182 | func (this *Config) Sections() []string {
183 | return this.sections
184 | }
185 | 
186 | func (this *Config) HasSection(section string) bool {
187 | if _, ok := this.sectionContents[section]; ok {
188 | return true
189 | }
190 | return false
191 | }
192 | 
193 | func (this *Config) SectionHas(section string, key string) bool {
194 | if !this.HasSection(section) {
195 | return false
196 | }
197 | if _, ok := this.sectionContents[section][key]; ok {
198 | return true
199 | }
200 | return false
201 | }
202 | 
203 | func (this *Config) SectionGet(section string, key string) string {
204 | if content, ok := this.sectionContents[section]; ok {
205 | return content[key]
206 | }
207 | return ""
208 | }
209 | 
210 | func (this *Config) SectionSet(section string, key string, value string) {
211 | if content, ok := this.sectionContents[section]; ok {
212 | content[key] = value
213 | } else {
214 | content = make(map[string]string)
215 | content[key] = value
216 | this.sectionContents[section] = content
217 | }
218 | }
219 | 
220 | func (this *Config) SectionGetInt(section string, key string) int {
221 | value := this.SectionGet(section, key)
222 | if value == "" {
223 | return 0
224 | }
225 | result, err := strconv.Atoi(value)
226 | if err != nil {
227 | return 0
228 | }
229 | return result
230 | }
231 | 
232 | func (this *Config) SectionGetDuration(section string, key string) time.Duration {
233 | return time.Duration(this.SectionGetInt(section, key)) * time.Second
234 | }
235 | 
236 | func (this *Config) SectionGetSlice(section string, key string, separator string) []string {
237 | result := []string{}
238 | value := this.SectionGet(section, key)
239 | if value != "" {
240 | for _, part := range strings.Split(value, separator) {
241 | result = append(result, strings.Trim(part, emptyRunes))
242 | }
243 | }
244 | return result
245 | }
246 | 
247 | func (this *Config) SectionContent(section string) map[string]string {
248 | return this.sectionContents[section]
249 | }
250 | 
251 | func (this *Config) SectionContents() map[string]map[string]string {
252 | return this.sectionContents
253 | }
254 | 
255 | const emptyRunes = " \r\t\v"
256 | 
--------------------------------------------------------------------------------
/pholcus/gui/guimain.go:
--------------------------------------------------------------------------------
1 | package gui
2 | 
3 | import (
4 | "github.com/henrylee2cn/pholcus/config"
5 | "github.com/henrylee2cn/pholcus/pholcus/crawler"
6 | "github.com/henrylee2cn/pholcus/reporter"
7 | "github.com/henrylee2cn/pholcus/scheduler"
8 | "github.com/henrylee2cn/pholcus/spiders/spider"
9 | "github.com/lxn/walk"
10 | .
"github.com/lxn/walk/declarative" 11 | "log" 12 | "strconv" 13 | "strings" 14 | "time" 15 | ) 16 | 17 | var toggleSpecialModePB *walk.PushButton 18 | 19 | func Run() { 20 | var mw *walk.MainWindow 21 | var db *walk.DataBinder 22 | var ep walk.ErrorPresenter 23 | 24 | if err := (MainWindow{ 25 | AssignTo: &mw, 26 | DataBinder: DataBinder{ 27 | AssignTo: &db, 28 | DataSource: Input, 29 | ErrorPresenter: ErrorPresenterRef{&ep}, 30 | }, 31 | Title: config.APP_NAME, 32 | MinSize: Size{1100, 700}, 33 | Layout: VBox{}, 34 | Children: []Widget{ 35 | // 任务列表 36 | HSplitter{ 37 | Children: []Widget{ 38 | TableView{ 39 | MinSize: Size{550, 400}, 40 | AlternatingRowBGColor: walk.RGB(255, 255, 224), 41 | CheckBoxes: true, 42 | ColumnsOrderable: true, 43 | Columns: []TableViewColumn{ 44 | {Title: "#", Width: 45}, 45 | {Title: "任务", Width: 110 /*, Format: "%.2f", Alignment: AlignFar*/}, 46 | {Title: "描述", Width: 370}, 47 | }, 48 | Model: SpiderModel, 49 | }, 50 | // 关键词 51 | VSplitter{ 52 | MinSize: Size{550, 400}, 53 | 54 | Children: []Widget{ 55 | VSplitter{ 56 | Children: []Widget{ 57 | Label{ 58 | Text: "关键词:(多任务之间以 | 隔开,选填)", 59 | }, 60 | LineEdit{ 61 | Text: Bind("Keywords"), 62 | }, 63 | }, 64 | }, 65 | 66 | VSplitter{ 67 | Children: []Widget{ 68 | Label{ 69 | Text: "采集页数:(选填)", 70 | }, 71 | NumberEdit{ 72 | Value: Bind("MaxPage"), 73 | Suffix: "", 74 | Decimals: 0, 75 | }, 76 | }, 77 | }, 78 | 79 | VSplitter{ 80 | Children: []Widget{ 81 | Label{ 82 | Text: "*并发协程:(1~99999)", 83 | }, 84 | NumberEdit{ 85 | Value: Bind("ThreadNum", Range{1, 99999}), 86 | Suffix: "", 87 | Decimals: 0, 88 | }, 89 | }, 90 | }, 91 | 92 | VSplitter{ 93 | Children: []Widget{ 94 | Label{ 95 | Text: "*分批输出大小:(1~5,000,000 条数据)", 96 | }, 97 | NumberEdit{ 98 | Value: Bind("DockerCap", Range{1, 5000000}), 99 | Suffix: "", 100 | Decimals: 0, 101 | }, 102 | }, 103 | }, 104 | 105 | VSplitter{ 106 | Children: []Widget{ 107 | Label{ 108 | Text: "*间隔基准:", 109 | }, 110 | ComboBox{ 111 | Value: Bind("BaseSleeptime", SelRequired{}), 112 | BindingMember: "Uint", 113 | DisplayMember: "Key", 114 | Model: GUIOpt.SleepTime, 115 | }, 116 | }, 117 | }, 118 | 119 | VSplitter{ 120 | Children: []Widget{ 121 | Label{ 122 | Text: "*随机延迟:", 123 | }, 124 | ComboBox{ 125 | Value: Bind("RandomSleepPeriod", SelRequired{}), 126 | BindingMember: "Uint", 127 | DisplayMember: "Key", 128 | Model: GUIOpt.SleepTime, 129 | }, 130 | }, 131 | }, 132 | 133 | RadioButtonGroupBox{ 134 | ColumnSpan: 2, 135 | Title: "*输出方式", 136 | Layout: HBox{}, 137 | DataMember: "OutType", 138 | Buttons: []RadioButton{ 139 | {Text: GUIOpt.OutType[0].Key, Value: GUIOpt.OutType[0].String}, 140 | {Text: GUIOpt.OutType[1].Key, Value: GUIOpt.OutType[1].String}, 141 | {Text: GUIOpt.OutType[2].Key, Value: GUIOpt.OutType[2].String}, 142 | }, 143 | }, 144 | }, 145 | }, 146 | }, 147 | }, 148 | 149 | Composite{ 150 | Layout: HBox{}, 151 | Children: []Widget{ 152 | 153 | // 必填项错误检查 154 | LineErrorPresenter{ 155 | AssignTo: &ep, 156 | ColumnSpan: 2, 157 | }, 158 | 159 | PushButton{ 160 | Text: "开始抓取", 161 | AssignTo: &toggleSpecialModePB, 162 | OnClicked: func() { 163 | if err := db.Submit(); err != nil { 164 | log.Print(err) 165 | return 166 | } 167 | Input.Spiders = SpiderModel.GetChecked() 168 | if len(Input.Spiders) == 0 { 169 | return 170 | } 171 | toggleSpecialModePB.SetEnabled(false) 172 | toggleSpecialModePB.SetText("正在抓取") 173 | SubmitAndRun() 174 | }, 175 | }, 176 | }, 177 | }, 178 | }, 179 | }.Create()); err != nil { 180 | log.Fatal(err) 181 | } 182 | 183 | // 绑定log输出界面 184 | lv, err 
:= NewLogView(mw)
185 | if err != nil {
186 | log.Fatal(err)
187 | }
188 | log.SetOutput(lv)
189 | 
190 | if icon, err := walk.NewIconFromResource("ICON"); err == nil {
191 | mw.SetIcon(icon)
192 | }
193 | 
194 | // run the window program
195 | mw.Run()
196 | }
197 | 
198 | // Initialize the spider list; must be called after the user's front-end input!
199 | func InitSpiders() int {
200 | var sp = spider.Spiders{}
201 | spider.SpiderList.Init()
202 | 
203 | // iterate over the tasks
204 | for i, sps := range Input.Spiders {
205 | sp = append(sp, sps.Spider)
206 | l := len(sp) - 1
207 | sp[l].Id = i
208 | sp[l].Pausetime[0] = Input.BaseSleeptime
209 | sp[l].Pausetime[1] = Input.RandomSleepPeriod
210 | sp[l].MaxPage = Input.MaxPage
211 | }
212 | 
213 | // iterate over the keywords
214 | if Input.Keywords != "" {
215 | keywordSlice := strings.Split(Input.Keywords, "|")
216 | for _, keyword := range keywordSlice {
217 | keyword = strings.Trim(keyword, " ")
218 | if keyword == "" {
219 | continue
220 | }
221 | nowLen := len(spider.SpiderList)
222 | for n := range sp {
223 | sp[n].Keyword = keyword
224 | sp[n].Id = nowLen + n
225 | c := *sp[n]
226 | spider.SpiderList.Add(&c)
227 | }
228 | }
229 | } else {
230 | spider.SpiderList = sp
231 | }
232 | return len(spider.SpiderList)
233 | }
234 | 
235 | // Submit the user input and start the run
236 | func SubmitAndRun() {
237 | // correct the goroutine count
238 | if Input.ThreadNum == 0 {
239 | Input.ThreadNum = 1
240 | }
241 | 
242 | // initialize the config parameters
243 | config.InitDockerParam(Input.DockerCap)
244 | config.ThreadNum = Input.ThreadNum
245 | config.OutType = Input.OutType
246 | config.StartTime = time.Now()
247 | config.ReqSum = 0 // reset the downloaded-page counter
248 | 
249 | count := InitSpiders()
250 | 
251 | // initialize the resource queue
252 | scheduler.Init(Input.ThreadNum)
253 | 
254 | // initialize the crawler queue
255 | CrawlerNum := config.CRAWLER_CAP
256 | if count < config.CRAWLER_CAP {
257 | CrawlerNum = count
258 | }
259 | config.CrawlerQueue.Init(CrawlerNum)
260 | 
261 | reporter.Log.Printf("\n执行任务总数(任务数[*关键词数])为 %v 个...\n", count)
262 | reporter.Log.Printf("\n爬行队列可容纳蜘蛛 %v 只...\n", CrawlerNum)
263 | reporter.Log.Printf("\n并发协程最多 %v 个……\n", Input.ThreadNum)
264 | reporter.Log.Printf("\n随机停顿时间为 %v~%v ms ……\n", Input.BaseSleeptime, Input.BaseSleeptime+Input.RandomSleepPeriod)
265 | reporter.Log.Printf("*********************************************开始抓取,请耐心等候*********************************************")
266 | 
267 | // execute the tasks
268 | go func(count int) {
269 | 
270 | // resize the existing crawler queue to the required capacity; note the queue instance itself stays the same
271 | for s, add := 0, config.CrawlerQueue.Exchange(CrawlerNum); s < add; s++ {
272 | config.CrawlerQueue.Push(crawler.New())
273 | }
274 | 
275 | for i := 0; i < count; i++ {
276 | 
277 | // wait to pull an idle spider from the crawler queue
278 | oneCrawler := config.CrawlerQueue.Pull().(crawler.Crawler)
279 | 
280 | // run the crawl task concurrently
281 | go func(i int, c crawler.Crawler) {
282 | // execute and report the result
283 | c.Init(spider.SpiderList[i]).Start()
284 | // recycle the spider once its task is done
285 | config.CrawlerQueue.Push(c)
286 | 
287 | }(i, oneCrawler)
288 | }
289 | 
290 | // monitor task completion
291 | sum := 0 // total number of data items
292 | for i := 0; i < count; i++ {
293 | s := <-config.ReportChan
294 | reporter.Log.Printf("[结束报告 -> 任务:%v | 关键词:%v] 共输出数据 %v 条,用时 %v 分钟!!!\n", s.SpiderName, s.Keyword, s.Num, s.Time)
295 | if slen, err := strconv.Atoi(s.Num); err == nil {
296 | sum += slen
297 | }
298 | }
299 | reporter.Log.Printf("*****************************!!本次抓取合计 %v 条数据,下载页面 %v 个,耗时:%.5f 分钟!!***************************", sum, config.ReqSum, time.Since(config.StartTime).Minutes())
300 | 
301 | // restore the button state
302 | toggleSpecialModePB.SetText("开始抓取")
303 | toggleSpecialModePB.SetEnabled(true)
304 | }(count)
305 | }
306 | 
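The goroutine above pulls an idle crawler from the pool, runs one spider, and pushes the crawler back. The same pattern works without the GUI; the following is a minimal headless sketch of SubmitAndRun's core loop, relying on this file's existing imports and assuming spider.SpiderList has already been populated (for example via InitSpiders). runHeadless is a hypothetical helper, not part of pholcus, and completion monitoring via config.ReportChan is omitted for brevity:

// runHeadless is a hypothetical helper sketching SubmitAndRun's core loop
// without the GUI. It assumes spider.SpiderList is already populated.
func runHeadless(threadNum int) {
	scheduler.Init(threadNum) // resource queue, as in SubmitAndRun

	count := len(spider.SpiderList)
	crawlerNum := config.CRAWLER_CAP
	if count < config.CRAWLER_CAP {
		crawlerNum = count
	}
	config.CrawlerQueue.Init(crawlerNum)

	// fill the crawler pool
	for s, add := 0, config.CrawlerQueue.Exchange(crawlerNum); s < add; s++ {
		config.CrawlerQueue.Push(crawler.New())
	}

	for i := 0; i < count; i++ {
		// block until a crawler is idle, then run one spider on it
		c := config.CrawlerQueue.Pull().(crawler.Crawler)
		go func(i int, c crawler.Crawler) {
			c.Init(spider.SpiderList[i]).Start()
			config.CrawlerQueue.Push(c) // recycle the crawler
		}(i, c)
	}
	// draining config.ReportChan, as SubmitAndRun does, is omitted here
}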
--------------------------------------------------------------------------------
/downloader/downloader_http.go:
--------------------------------------------------------------------------------
1 | package downloader
2 | 
3 | import (
4 | "bytes"
5 | "github.com/PuerkitoBio/goquery"
6 | "github.com/bitly/go-simplejson"
7 | "github.com/henrylee2cn/pholcus/downloader/context"
8 | // iconv "github.com/djimenez/iconv-go"
9 | "github.com/henrylee2cn/pholcus/common/util"
10 | "github.com/henrylee2cn/pholcus/reporter"
11 | // "golang.org/x/text/encoding/simplifiedchinese"
12 | // "golang.org/x/text/transform"
13 | "io"
14 | "io/ioutil"
15 | "net/http"
16 | "net/url"
17 | //"fmt"
18 | "golang.org/x/net/html/charset"
19 | // "regexp"
20 | // "golang.org/x/net/html"
21 | "strings"
22 | )
23 | 
24 | // HttpDownloader downloads the response via package net/http.
25 | // The "html" content is wrapped in the DOM parser of package goquery.
26 | // The "json" content is saved as-is.
27 | // The "jsonp" content is converted to json.
28 | // The "text" content saves the plain-text body only.
29 | // The response result is saved in Response.
30 | type HttpDownloader struct{}
31 | 
32 | func NewHttpDownloader() *HttpDownloader {
33 | return &HttpDownloader{}
34 | }
35 | 
36 | func (self *HttpDownloader) Download(req *context.Request) *context.Response {
37 | var mtype string
38 | var p = context.NewResponse(req)
39 | mtype = req.GetRespType()
40 | switch mtype {
41 | case "html":
42 | return self.downloadHtml(p, req)
43 | case "json":
44 | fallthrough
45 | case "jsonp":
46 | return self.downloadJson(p, req)
47 | case "text":
48 | return self.downloadText(p, req)
49 | default:
50 | reporter.Log.Println("error request type:" + mtype)
51 | }
52 | return p
53 | }
54 | 
55 | /*
56 | // acceptableCharset tests whether Content-Type is UTF-8 or not
57 | func (self *HttpDownloader) acceptableCharset(contentTypes []string) bool {
58 | // each type is like [text/html; charset=UTF-8]
59 | // we want the UTF-8 only
60 | for _, cType := range contentTypes {
61 | if strings.Index(cType, "UTF-8") != -1 || strings.Index(cType, "utf-8") != -1 {
62 | return true
63 | }
64 | }
65 | return false
66 | }
67 | // getCharset parses the header["Content-Type"] string to get the charset of the page
68 | func (self *HttpDownloader) getCharset(header http.Header) string {
69 | reg, err := regexp.Compile("charset=(.*)$")
70 | if err != nil {
71 | reporter.Log.Println(err.Error())
72 | return ""
73 | }
74 | var charset string
75 | for _, cType := range header["Content-Type"] {
76 | substrings := reg.FindStringSubmatch(cType)
77 | if len(substrings) == 2 {
78 | charset = substrings[1]
79 | }
80 | }
81 | return charset
82 | }
83 | // Use golang.org/x/text/encoding. Get the response body and convert it to UTF-8
84 | func (self *HttpDownloader) changeCharsetEncoding(charset string, sor io.ReadCloser) string {
85 | ischange := true
86 | var tr transform.Transformer
87 | cs := strings.ToLower(charset)
88 | if cs == "gbk" {
89 | tr = simplifiedchinese.GBK.NewDecoder()
90 | } else if cs == "gb18030" {
91 | tr = simplifiedchinese.GB18030.NewDecoder()
92 | } else if cs == "hzgb2312" || cs == "gb2312" || cs == "hz-gb2312" {
93 | tr = simplifiedchinese.HZGB2312.NewDecoder()
94 | } else {
95 | ischange = false
96 | }
97 | var destReader io.Reader
98 | if ischange {
99 | transReader := transform.NewReader(sor, tr)
100 | destReader = transReader
101 | } else {
102 | destReader = sor
103 | }
104 | var sorbody []byte
105 | var err error
106 | if sorbody, err = ioutil.ReadAll(destReader); err != nil {
107 | reporter.Log.Println(err.Error())
108 | return ""
109 | }
110 | bodystr := string(sorbody)
111 | return bodystr
112 | }
113 | // Use go-iconv. Get the response body and convert it to UTF-8
114 | func (self *HttpDownloader) changeCharsetGoIconv(charset string, sor io.ReadCloser) string {
115 | var err error
116 | var converter *iconv.Converter
117 | if charset != "" && strings.ToLower(charset) != "utf-8" && strings.ToLower(charset) != "utf8" {
118 | converter, err = iconv.NewConverter(charset, "utf-8")
119 | if err != nil {
120 | reporter.Log.Println(err.Error())
121 | return ""
122 | }
123 | defer converter.Close()
124 | }
125 | var sorbody []byte
126 | if sorbody, err = ioutil.ReadAll(sor); err != nil {
127 | reporter.Log.Println(err.Error())
128 | return ""
129 | }
130 | bodystr := string(sorbody)
131 | var destbody string
132 | if converter != nil {
133 | // convert to utf8
134 | destbody, err = converter.ConvertString(bodystr)
135 | if err != nil {
136 | reporter.Log.Println(err.Error())
137 | return ""
138 | }
139 | } else {
140 | destbody = bodystr
141 | }
142 | return destbody
143 | }
144 | */
145 | 
146 | // Auto-detect the charset via golang.org/x/net/html/charset, then read the response body and convert it to UTF-8
147 | func (self *HttpDownloader) changeCharsetEncodingAuto(contentTypeStr string, sor io.ReadCloser) string {
148 | var err error
149 | destReader, err := charset.NewReader(sor, contentTypeStr)
150 | 
151 | if err != nil {
152 | reporter.Log.Println(err.Error())
153 | destReader = sor
154 | }
155 | 
156 | var sorbody []byte
157 | if sorbody, err = ioutil.ReadAll(destReader); err != nil {
158 | reporter.Log.Println(err.Error())
159 | // For gb2312, an error will be returned.
160 | // Error like: simplifiedchinese: invalid GBK encoding
161 | // return ""
162 | }
163 | //e,name,certain := charset.DetermineEncoding(sorbody,contentTypeStr)
164 | bodystr := string(sorbody)
165 | 
166 | return bodystr
167 | }
168 | 
169 | // download via plain HTTP using the request's method (GET/POST)
170 | func connectByHttp(p *context.Response, req *context.Request) (*http.Response, error) {
171 | client := &http.Client{
172 | CheckRedirect: req.GetRedirectFunc(),
173 | }
174 | httpreq, err := http.NewRequest(req.GetMethod(), req.GetUrl(), strings.NewReader(req.GetPostdata()))
175 | if err != nil {
176 | return nil, err
177 | }
178 | if header := req.GetHeader(); header != nil {
179 | httpreq.Header = req.GetHeader()
180 | }
181 | if cookies := req.GetCookies(); cookies != nil {
182 | for i := range cookies {
183 | httpreq.AddCookie(cookies[i])
184 | }
185 | }
186 | var resp *http.Response
187 | if resp, err = client.Do(httpreq); err != nil {
188 | if e, ok := err.(*url.Error); ok && e.Err != nil && e.Err.Error() == "normal" {
189 | // normal
190 | } else {
191 | reporter.Log.Println(err.Error())
192 | p.SetStatus(true, err.Error())
193 | //fmt.Printf("client do error %v \r\n", err)
194 | return nil, err
195 | }
196 | }
197 | 
198 | return resp, nil
199 | }
200 | 
201 | // route the HTTP GET download through a proxy server
202 | func connectByHttpProxy(p *context.Response, in_req *context.Request) (*http.Response, error) {
203 | request, _ := http.NewRequest("GET", in_req.GetUrl(), nil)
204 | proxy, err := url.Parse(in_req.GetProxyHost())
205 | if err != nil {
206 | return nil, err
207 | }
208 | client := &http.Client{
209 | Transport: &http.Transport{
210 | Proxy: http.ProxyURL(proxy),
211 | },
212 | }
213 | resp, err := client.Do(request)
214 | if err != nil {
215 | return nil, err
216 | }
217 | return resp, nil
218 | 
219 | }
220 | 
221 | // Download the file and convert the response body to UTF-8.
222 | func (self *HttpDownloader) downloadFile(p *context.Response, req *context.Request) (*context.Response, string) { 223 | var err error 224 | var urlstr string 225 | if urlstr = req.GetUrl(); len(urlstr) == 0 { 226 | reporter.Log.Println("url is empty") 227 | p.SetStatus(true, "url is empty") 228 | return p, "" 229 | } 230 | 231 | var resp *http.Response 232 | 233 | if proxystr := req.GetProxyHost(); len(proxystr) != 0 { 234 | //using http proxy 235 | //fmt.Print("HttpProxy Enter ",proxystr,"\n") 236 | resp, err = connectByHttpProxy(p, req) 237 | } else { 238 | //normal http download 239 | //fmt.Print("Http Normal Enter \n",proxystr,"\n") 240 | resp, err = connectByHttp(p, req) 241 | } 242 | 243 | if err != nil { 244 | return p, "" 245 | } 246 | 247 | //b, _ := ioutil.ReadAll(resp.Body) 248 | //fmt.Printf("Resp body %v \r\n", string(b)) 249 | 250 | p.SetHeader(resp.Header) 251 | p.SetCookies(resp.Cookies()) 252 | 253 | // get converter to utf-8 254 | bodyStr := self.changeCharsetEncodingAuto(resp.Header.Get("Content-Type"), resp.Body) 255 | //fmt.Printf("utf-8 body %v \r\n", bodyStr) 256 | defer resp.Body.Close() 257 | return p, bodyStr 258 | } 259 | 260 | func (self *HttpDownloader) downloadHtml(p *context.Response, req *context.Request) *context.Response { 261 | var err error 262 | p, destbody := self.downloadFile(p, req) 263 | //fmt.Printf("Destbody %v \r\n", destbody) 264 | if !p.IsSucc() { 265 | //fmt.Print("Response error \r\n") 266 | return p 267 | } 268 | bodyReader := bytes.NewReader([]byte(destbody)) 269 | 270 | var doc *goquery.Document 271 | if doc, err = goquery.NewDocumentFromReader(bodyReader); err != nil { 272 | reporter.Log.Println(err.Error()) 273 | p.SetStatus(true, err.Error()) 274 | return p 275 | } 276 | 277 | var body string 278 | if body, err = doc.Html(); err != nil { 279 | reporter.Log.Println(err.Error()) 280 | p.SetStatus(true, err.Error()) 281 | return p 282 | } 283 | 284 | p.SetBodyStr(body).SetHtmlParser(doc).SetStatus(false, "") 285 | 286 | return p 287 | } 288 | 289 | func (self *HttpDownloader) downloadJson(p *context.Response, req *context.Request) *context.Response { 290 | var err error 291 | p, destbody := self.downloadFile(p, req) 292 | if !p.IsSucc() { 293 | return p 294 | } 295 | 296 | var body []byte 297 | body = []byte(destbody) 298 | mtype := req.GetRespType() 299 | if mtype == "jsonp" { 300 | tmpstr := util.JsonpToJson(destbody) 301 | body = []byte(tmpstr) 302 | } 303 | 304 | var r *simplejson.Json 305 | if r, err = simplejson.NewJson(body); err != nil { 306 | reporter.Log.Println(string(body) + "\t" + err.Error()) 307 | p.SetStatus(true, err.Error()) 308 | return p 309 | } 310 | 311 | // json result 312 | p.SetBodyStr(string(body)).SetJson(r).SetStatus(false, "") 313 | 314 | return p 315 | } 316 | 317 | func (self *HttpDownloader) downloadText(p *context.Response, req *context.Request) *context.Response { 318 | p, destbody := self.downloadFile(p, req) 319 | if !p.IsSucc() { 320 | return p 321 | } 322 | p.SetBodyStr(destbody).SetStatus(false, "") 323 | return p 324 | } 325 | --------------------------------------------------------------------------------
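Taken together, request.go and downloader_http.go above define the minimal fetch path: build a Request from a parameter map, hand it to an HttpDownloader, and read the parsed Response. A self-contained sketch, assuming only the packages shown above; the url, rule and spider values are placeholders:

package main

import (
	"fmt"

	"github.com/henrylee2cn/pholcus/downloader"
	"github.com/henrylee2cn/pholcus/downloader/context"
)

func main() {
	// "url", "rule" and "spider" are the required keys of NewRequest;
	// respType "html" makes the downloader attach a goquery DOM parser.
	req := context.NewRequest(map[string]interface{}{
		"url":      "http://news.163.com/rank/",
		"rule":     "排行榜主页", // placeholder rule name
		"spider":   "网易新闻",  // placeholder spider name
		"respType": "html",
	})

	resp := downloader.NewHttpDownloader().Download(req)
	if !resp.IsSucc() {
		fmt.Println("download failed")
		return
	}
	fmt.Println(resp.GetHtmlParser().Find("title").Text())
}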