├── pholcus ├── node │ └── node.go ├── status │ └── status.go ├── gui │ ├── rsrc.syso │ ├── guimain.manifest │ ├── logview.go │ ├── menu.go │ ├── guispider.go │ └── guimain.go ├── crawler │ ├── crawler.go │ └── crawl.go ├── pholcus.go ├── keeper │ └── login.go └── socket │ ├── PhoSocketTest.go │ └── PhoSocket.go ├── doc ├── ICON.ico ├── project.png └── GUI编译命令.txt ├── main.go ├── reporter ├── reporter.go └── report.go ├── spiders ├── spider │ ├── spiderlist.go │ ├── rss.go │ ├── common.go │ └── spider.go ├── kaola.go ├── shunfenghaitao.go ├── baidusearch.go ├── readme.md ├── jdsearch.go ├── miyabaobei.go ├── alibaba.go ├── taobaosearch.go ├── googlesearch.go ├── hollandandbarrett.go ├── baidunews.go └── wangyi.go ├── README.md ├── downloader ├── downloader.go ├── context │ ├── response.go │ └── request.go └── downloader_http.go ├── pipeline ├── collector │ ├── datacell.go │ ├── output.go │ ├── docker.go │ ├── collector.go │ └── output_lib.go └── pipeline.go ├── main.manifest ├── common ├── deduplicate │ └── deduplicate.go ├── mlog │ ├── mlog.go │ ├── strace.go │ └── filelog.go ├── queue │ └── queue.go ├── etc_config │ └── etc_config.go ├── util │ └── util.go └── config │ └── config.go ├── scheduler ├── scheduler.go └── src_manage.go └── config └── config.go /pholcus/node/node.go: -------------------------------------------------------------------------------- 1 | package node 2 | 3 | import () 4 | -------------------------------------------------------------------------------- /pholcus/status/status.go: -------------------------------------------------------------------------------- 1 | package status 2 | 3 | import () 4 | -------------------------------------------------------------------------------- /doc/ICON.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lichanglu/pholcus/HEAD/doc/ICON.ico -------------------------------------------------------------------------------- /doc/project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lichanglu/pholcus/HEAD/doc/project.png -------------------------------------------------------------------------------- /pholcus/gui/rsrc.syso: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lichanglu/pholcus/HEAD/pholcus/gui/rsrc.syso -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/henrylee2cn/pholcus/pholcus/gui" 5 | ) 6 | 7 | func main() { 8 | gui.Run() 9 | } 10 | -------------------------------------------------------------------------------- /doc/GUI编译命令.txt: -------------------------------------------------------------------------------- 1 | go get github.com/akavel/rsrc 2 | rsrc -manifest test.manifest -o rsrc.syso 3 | 4 | go build 5 | 6 | 7 | go build -ldflags="-H windowsgui" -------------------------------------------------------------------------------- /reporter/reporter.go: -------------------------------------------------------------------------------- 1 | package reporter 2 | 3 | type Reporter interface { 4 | Printf(format string, v ...interface{}) 5 | Println(v ...interface{}) 6 | send(string) 7 | } 8 | -------------------------------------------------------------------------------- /pholcus/crawler/crawler.go: 
--------------------------------------------------------------------------------
 1 | package crawler
 2 | 
 3 | import (
 4 | 	"github.com/henrylee2cn/pholcus/spiders/spider"
 5 | )
 6 | 
 7 | type Crawler interface {
 8 | 	Init(*spider.Spider) Crawler
 9 | 	Start()
10 | }
11 | 
--------------------------------------------------------------------------------
/spiders/spider/spiderlist.go:
--------------------------------------------------------------------------------
 1 | package spider
 2 | 
 3 | type Spiders []*Spider
 4 | 
 5 | var SpiderList = Spiders{}
 6 | 
 7 | func (Spiders) Init() {
 8 | 	SpiderList = Spiders{}
 9 | }
10 | 
11 | func (Spiders) Add(sp *Spider) {
12 | 	SpiderList = append(SpiderList, sp)
13 | }
14 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## pholcus
 2 | Pholcus (幽灵蛛) is a crawler framework written in Go (GUI included). It offers elegant crawl rules, controllable high concurrency, arbitrary batch tasks, multiple output formats and plenty of demos, and its design allows for distributed deployment.
 3 | 
 4 | ![image](https://github.com/henrylee2cn/pholcus/blob/master/doc/project.png)
 5 | 
 6 | 
 7 | 
 8 | **Install Pholcus**
 9 | ```
10 | go get github.com/henrylee2cn/pholcus
11 | ```
12 | 
--------------------------------------------------------------------------------
/downloader/downloader.go:
--------------------------------------------------------------------------------
 1 | package downloader
 2 | 
 3 | import (
 4 | 	"github.com/henrylee2cn/pholcus/downloader/context"
 5 | )
 6 | 
 7 | // The Downloader interface.
 8 | // You can implement the interface by implementing the function Download.
 9 | // Download must return a *context.Response carrying the result fetched for the given Request.
10 | type Downloader interface {
11 | 	Download(req *context.Request) *context.Response
12 | }
13 | 
--------------------------------------------------------------------------------
/pipeline/collector/datacell.go:
--------------------------------------------------------------------------------
 1 | // 数据存储单元
 2 | package collector
 3 | 
 4 | type DataCell map[string]interface{}
 5 | 
 6 | func NewDataCell(ruleName string, data map[string]string, url string, parentUrl string, downloadTime string) DataCell {
 7 | 	return DataCell{
 8 | 		"RuleName":     ruleName,  //规定Data中的key
 9 | 		"Data":         data,      //数据存储,key须与Rule的Fields保持一致
10 | 		"Url":          url,       //用于索引
11 | 		"ParentUrl":    parentUrl, //DataCell的上级url
12 | 		"DownloadTime": downloadTime,
13 | 	}
14 | }
15 | 
--------------------------------------------------------------------------------
/main.manifest:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | 
 5 | 
 6 | 
 7 | 
 8 | 
 9 | 
10 | 
--------------------------------------------------------------------------------
/pholcus/gui/guimain.manifest:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | 
 5 | 
 6 | 
 7 | 
 8 | 
 9 | 
10 | 
--------------------------------------------------------------------------------
/reporter/report.go:
--------------------------------------------------------------------------------
 1 | package reporter
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | )
 7 | 
 8 | type Report struct{}
 9 | 
10 | func (self *Report) send(str string) {
11 | 	if true { // TODO: 占位实现,远程上报尚未实现
12 | 
13 | 	}
14 | }
15 | 
16 | func (self *Report) Printf(format string, v ...interface{}) {
17 | 	log.Printf(format, v...)
18 | 	self.send(fmt.Sprintf(format, v...))
19 | }
20 | 
21 | func (self *Report) Println(v ...interface{}) {
22 | 	log.Println(v...)
23 | self.send(fmt.Sprintln(v...)) 24 | } 25 | 26 | var Log Reporter 27 | 28 | func init() { 29 | Log = &Report{} 30 | } 31 | -------------------------------------------------------------------------------- /common/deduplicate/deduplicate.go: -------------------------------------------------------------------------------- 1 | package deduplicate 2 | 3 | import ( 4 | "github.com/henrylee2cn/pholcus/common/util" 5 | ) 6 | 7 | type Deduplicate interface { 8 | // 采集非重复样本并返回对比结果,重复为true 9 | Compare(obj interface{}) bool 10 | } 11 | 12 | type Deduplication struct { 13 | sampling map[string]bool 14 | } 15 | 16 | func New() Deduplicate { 17 | return &Deduplication{ 18 | sampling: make(map[string]bool), 19 | } 20 | } 21 | 22 | // 对比是否已存在,不存在则采样 23 | func (self *Deduplication) Compare(obj interface{}) bool { 24 | s := util.MakeUnique(obj) 25 | if !self.sampling[s] { 26 | self.sampling[s] = true 27 | return false 28 | } 29 | return true 30 | } 31 | -------------------------------------------------------------------------------- /common/mlog/mlog.go: -------------------------------------------------------------------------------- 1 | // Package mlog implements log operations. 2 | package mlog 3 | 4 | import ( 5 | "runtime" 6 | ) 7 | 8 | // The plog is a public function combiation for other log objects. 9 | type plog struct { 10 | isopen bool 11 | } 12 | 13 | // GetCaller returns file name and line number at the third step of runtime. 14 | func (*plog) getCaller() (string, int) { 15 | _, file, line, ok := runtime.Caller(3) 16 | if !ok { 17 | file = "???" 18 | line = 0 19 | } 20 | return file, line 21 | } 22 | 23 | // Open makes log open. 24 | func (this *plog) Open() { 25 | this.isopen = true 26 | } 27 | 28 | // Close makes log close. 29 | func (this *plog) Close() { 30 | this.isopen = false 31 | } 32 | -------------------------------------------------------------------------------- /pipeline/collector/output.go: -------------------------------------------------------------------------------- 1 | //数据输出管理 2 | package collector 3 | 4 | import ( 5 | "github.com/henrylee2cn/pholcus/config" 6 | "log" 7 | // "fmt" 8 | "time" 9 | ) 10 | 11 | func (self *Collector) Output(dataIndex int) { 12 | defer func() { 13 | err := recover() 14 | if err != nil { 15 | log.Printf("输出时出错!\n") 16 | } else { 17 | // 正常情况下回收内存 18 | self.DockerQueue.Recover(dataIndex) 19 | } 20 | }() 21 | 22 | dataLen := len(self.DockerQueue.Dockers[dataIndex]) 23 | if dataLen == 0 { 24 | // log.Println("没有抓到结果!!!") 25 | return 26 | } 27 | 28 | // 输出数据统计 29 | self.setSum(dataLen) 30 | 31 | // 选择执行输出 32 | switch self.outType { 33 | case "excel": 34 | self.excel(dataIndex) 35 | case "csv": 36 | self.csv(dataIndex) 37 | case "mongoDB": 38 | self.mgo(dataIndex) 39 | default: 40 | self.excel(dataIndex) 41 | } 42 | log.Printf("[任务:%v | 关键词:%v | 批次:%v] 输出 %v 条数据,用时 %.5f 分钟!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), self.outCount[1]+1, dataLen, time.Since(config.StartTime).Minutes()) 43 | } 44 | -------------------------------------------------------------------------------- /common/mlog/strace.go: -------------------------------------------------------------------------------- 1 | package mlog 2 | 3 | import ( 4 | "log" 5 | "os" 6 | ) 7 | 8 | // Strace represents an active object that strace the processing of spider. 9 | // The strace info is output to os.Stderr. 10 | // The loginst is an point of logger in Std-Packages. 11 | // The isopen is a label represents whether open strace or not. 
12 | type strace struct { 13 | plog 14 | 15 | loginst *log.Logger 16 | } 17 | 18 | var pstrace *strace 19 | 20 | // StraceInst get the singleton strace object. 21 | func StraceInst() *strace { 22 | if pstrace == nil { 23 | pstrace = newStrace() 24 | } 25 | return pstrace 26 | } 27 | 28 | // The newStrace returns initialized strace object. 29 | func newStrace() *strace { 30 | pstrace := &strace{} 31 | pstrace.loginst = log.New(os.Stderr, "", log.LstdFlags) 32 | pstrace.isopen = true 33 | return pstrace 34 | } 35 | 36 | // Println output the str to os.Stderr. 37 | func (this *strace) Println(str string) { 38 | if !this.isopen { 39 | return 40 | } 41 | this.loginst.Printf("%s\n", str) 42 | } 43 | -------------------------------------------------------------------------------- /common/queue/queue.go: -------------------------------------------------------------------------------- 1 | package queue 2 | 3 | type Queue struct { 4 | PoolSize int 5 | PoolChan chan interface{} 6 | } 7 | 8 | func NewQueue(size int) *Queue { 9 | return &Queue{ 10 | PoolSize: size, 11 | PoolChan: make(chan interface{}, size), 12 | } 13 | } 14 | 15 | func (this *Queue) Init(size int) *Queue { 16 | this.PoolSize = size 17 | this.PoolChan = make(chan interface{}, size) 18 | return this 19 | } 20 | 21 | func (this *Queue) Push(i interface{}) bool { 22 | if len(this.PoolChan) == this.PoolSize { 23 | return false 24 | } 25 | this.PoolChan <- i 26 | return true 27 | } 28 | 29 | func (this *Queue) PushSlice(s []interface{}) { 30 | for _, i := range s { 31 | this.Push(i) 32 | } 33 | } 34 | 35 | func (this *Queue) Pull() interface{} { 36 | return <-this.PoolChan 37 | } 38 | 39 | // 二次使用Queue实例时,根据容量需求进行高效转换 40 | func (this *Queue) Exchange(num int) (add int) { 41 | last := len(this.PoolChan) 42 | 43 | if last >= num { 44 | add = int(0) 45 | return 46 | } 47 | 48 | if this.PoolSize < num { 49 | pool := []interface{}{} 50 | for i := 0; i < last; i++ { 51 | pool = append(pool, <-this.PoolChan) 52 | } 53 | // 重新定义、赋值 54 | this.Init(num).PushSlice(pool) 55 | } 56 | 57 | add = num - last 58 | return 59 | } 60 | -------------------------------------------------------------------------------- /scheduler/scheduler.go: -------------------------------------------------------------------------------- 1 | package scheduler 2 | 3 | import ( 4 | "github.com/henrylee2cn/pholcus/common/deduplicate" 5 | "github.com/henrylee2cn/pholcus/downloader/context" 6 | ) 7 | 8 | type Scheduler interface { 9 | // 采集非重复url并返回对比结果,重复为true 10 | Compare(string) bool 11 | 12 | SrcManager 13 | // 以下为具体方法列表 14 | // 存入 15 | // Push(*context.Request) 16 | // 取出 17 | // Use(string) *context.Request 18 | // 释放一个资源 19 | // Free() 20 | // 资源队列是否闲置 21 | // IsEmpty(string) bool 22 | // IsAllEmpty() bool 23 | 24 | } 25 | 26 | type scheduler struct { 27 | *SrcManage 28 | *deduplicate.Deduplication 29 | } 30 | 31 | func New(capacity uint) Scheduler { 32 | return &scheduler{ 33 | SrcManage: NewSrcManage(capacity).(*SrcManage), 34 | Deduplication: deduplicate.New().(*deduplicate.Deduplication), 35 | } 36 | } 37 | 38 | func (self *scheduler) Push(req *context.Request) { 39 | is := self.Compare(req.GetUrl()) 40 | // 有重复则返回 41 | if is { 42 | return 43 | } 44 | self.SrcManage.Push(req) 45 | } 46 | 47 | func (self *scheduler) Compare(url string) bool { 48 | return self.Deduplication.Compare(url) 49 | } 50 | 51 | // 定义全局调度 52 | var Self Scheduler 53 | 54 | func Init(capacity uint) Scheduler { 55 | Self = New(capacity) 56 | return Self 57 | } 58 | 
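Taken together, `Deduplication.Compare` and `SrcManage.Push` give `scheduler.Push` its dedup-then-enqueue behavior: a URL is sampled on first sight and silently dropped on every later push. A minimal, self-contained sketch of that pattern (the `miniScheduler` type and the URLs in `main` are invented here for illustration; they are not part of the repository):

```go
package main

import "fmt"

// miniScheduler 仅作演示:复刻 scheduler “先查重、后入队”的核心模式。
type miniScheduler struct {
	sampling map[string]bool // 已采样url的指纹集合(对应 Deduplication.sampling)
	queue    []string        // 简化版资源队列(对应 SrcManage.queue)
}

// Compare 与 Deduplication.Compare 语义一致:不存在则采样并返回 false,重复返回 true。
func (s *miniScheduler) Compare(url string) bool {
	if !s.sampling[url] {
		s.sampling[url] = true
		return false
	}
	return true
}

// Push 与 scheduler.Push 相同:有重复则直接返回。
func (s *miniScheduler) Push(url string) {
	if s.Compare(url) {
		return
	}
	s.queue = append(s.queue, url)
}

func main() {
	s := &miniScheduler{sampling: make(map[string]bool)}
	s.Push("http://www.baidu.com")
	s.Push("http://www.baidu.com") // 重复,被丢弃
	s.Push("http://news.163.com/rank")
	fmt.Println(len(s.queue)) // 输出:2
}
```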
--------------------------------------------------------------------------------
/pholcus/pholcus.go:
--------------------------------------------------------------------------------
 1 | package pholcus
 2 | 
 3 | import (
 4 | 	"github.com/henrylee2cn/pholcus/downloader/context"
 5 | 	// "github.com/henrylee2cn/pholcus/pholcus/node"
 6 | 	// "github.com/henrylee2cn/pholcus/pholcus/status"
 7 | 	"github.com/henrylee2cn/pholcus/scheduler"
 8 | 	"sync"
 9 | )
10 | 
11 | type Pholcus struct {
12 | 	// *node.Node
13 | 	// *Status
14 | 	isOutsource bool
15 | }
16 | 
17 | var pushMutex sync.Mutex
18 | 
19 | func (self *Pholcus) Push(req *context.Request) {
20 | 	pushMutex.Lock()
21 | 	defer pushMutex.Unlock()
22 | 	if !self.TryOutsource(req) {
23 | 		scheduler.Self.Push(req)
24 | 	}
25 | }
26 | 
27 | func (self *Pholcus) TryOutsource(req *context.Request) bool {
28 | 	if self.IsOutsource() && req.TryOutsource() {
29 | 		self.Send(*req)
30 | 		return true
31 | 	}
32 | 	return false
33 | }
34 | 
35 | func (self *Pholcus) SetOutsource(serve bool) {
36 | 	self.isOutsource = serve
37 | }
38 | 
39 | func (self *Pholcus) IsOutsource() bool {
40 | 	return self.isOutsource
41 | }
42 | 
43 | // Send 尚未实现:分布式模式下将请求外包给其他节点
44 | func (self *Pholcus) Send(req context.Request) {
45 | 
46 | }
47 | 
48 | func (self *Pholcus) Receive(req context.Request) {
49 | 	scheduler.Self.Push(&req)
50 | }
51 | 
52 | // 初始化
53 | var Self *Pholcus
54 | 
55 | func init() {
56 | 	Self = &Pholcus{}
57 | }
58 | 
--------------------------------------------------------------------------------
/common/etc_config/etc_config.go:
--------------------------------------------------------------------------------
 1 | // Package etc_config implements config initialization of one spider.
 2 | package etc_config
 3 | 
 4 | import (
 5 | 	"github.com/henrylee2cn/pholcus/common/config"
 6 | 	"github.com/henrylee2cn/pholcus/common/util"
 7 | 	"os"
 8 | )
 9 | 
10 | // Configpath gets the default config path like "GOPATH/etc/main.conf".
11 | func configpath() string {
12 | 	//wd, _ := os.Getwd()
13 | 	wd := os.Getenv("GOPATH")
14 | 	if wd == "" {
15 | 		panic("GOPATH is not set in env.")
16 | 	}
17 | 	logpath := wd + "/etc/"
18 | 	filename := "main.conf"
19 | 	err := os.MkdirAll(logpath, 0755)
20 | 	if err != nil {
21 | 		panic("logpath error : " + logpath + "\n")
22 | 	}
23 | 	return logpath + filename
24 | }
25 | 
26 | // Config is a config singleton object for one spider.
27 | var conf *config.Config
28 | var path string
29 | 
30 | // StartConf is used in Spider for initialization at first time.
31 | func StartConf(configFilePath string) *config.Config {
32 | 	if configFilePath != "" && !util.IsFileExists(configFilePath) {
33 | 		panic("config path is not valid: " + configFilePath)
34 | 	}
35 | 
36 | 	path = configFilePath
37 | 	return Conf()
38 | }
39 | 
40 | // Conf gets the singleton instance of Config.
41 | func Conf() *config.Config {
42 | 	if conf == nil {
43 | 		if path == "" {
44 | 			path = configpath()
45 | 		}
46 | 		conf = config.NewConfig().Load(path)
47 | 	}
48 | 	return conf
49 | }
50 | 
--------------------------------------------------------------------------------
/pipeline/pipeline.go:
--------------------------------------------------------------------------------
 1 | // 数据收集
 2 | package pipeline
 3 | 
 4 | import (
 5 | 	"github.com/henrylee2cn/pholcus/common/deduplicate"
 6 | 	"github.com/henrylee2cn/pholcus/pipeline/collector"
 7 | 	// "github.com/henrylee2cn/pholcus/reporter"
 8 | 	"github.com/henrylee2cn/pholcus/spiders/spider"
 9 | )
10 | 
11 | type Pipeline interface {
12 | 	Start()
13 | 	//接收控制通知
14 | 	CtrlR()
15 | 	//发送控制通知
16 | 	CtrlS()
17 | 	// 收集数据单元
18 | 	Collect(ruleName string, data map[string]string, url string, parentUrl string, downloadTime string)
19 | 	// 对比Url的fingerprint,返回是否有重复
20 | 	Deduplicate(string) bool
21 | 	// 重置
22 | 	Init(*spider.Spider)
23 | }
24 | 
25 | type pipeline struct {
26 | 	*collector.Collector
27 | 	*deduplicate.Deduplication
28 | }
29 | 
30 | func New() Pipeline {
31 | 	return &pipeline{
32 | 		Collector:     collector.NewCollector(),
33 | 		Deduplication: deduplicate.New().(*deduplicate.Deduplication),
34 | 	}
35 | }
36 | 
37 | func (self *pipeline) Collect(ruleName string, data map[string]string, url string, parentUrl string, downloadTime string) {
38 | 	dataCell := collector.NewDataCell(ruleName, data, url, parentUrl, downloadTime)
39 | 	self.Collector.Collect(dataCell)
40 | }
41 | 
42 | func (self *pipeline) Init(sp *spider.Spider) {
43 | 	self.Collector.Init(sp)
44 | }
45 | 
46 | func (self *pipeline) Deduplicate(s string) bool {
47 | 	return self.Deduplication.Compare(s)
48 | }
49 | 
50 | func (self *pipeline) Start() {
51 | 	go self.Collector.Manage()
52 | 	// reporter.Log.Println("**************开启输出管道************")
53 | }
54 | 
--------------------------------------------------------------------------------
/config/config.go:
--------------------------------------------------------------------------------
 1 | package config
 2 | 
 3 | import (
 4 | 	"github.com/henrylee2cn/pholcus/common/queue"
 5 | 	"time"
 6 | )
 7 | 
 8 | const (
 9 | 	//软件名
10 | 	APP_NAME = "幽灵蛛数据采集 V0.1 (by Henry)"
11 | 	// 蜘蛛池容量
12 | 	CRAWLER_CAP = 50
13 | 
14 | 	// 收集器容量
15 | 	DATA_CAP = 2 << 14 // 即32768
16 | 
17 | 	// mongodb数据库服务器
18 | 	DB_URL = "127.0.0.1:27017"
19 | 
20 | 	//mongodb数据库名称
21 | 	DB_NAME = "temp-collection-tentinet"
22 | 
23 | 	//mongodb数据库集合
24 | 	DB_COLLECTION = "news"
25 | )
26 | 
27 | type Report struct {
28 | 	SpiderName string
29 | 	Keyword    string
30 | 	Num        string
31 | 	Time       string
32 | }
33 | 
34 | var (
35 | 	// 点击开始按钮的时间点
36 | 	StartTime time.Time
37 | 	// 小结报告通道
38 | 	ReportChan chan *Report
39 | 	// 请求页面计数
40 | 	ReqSum uint
41 | 	// 创建默认爬行队列
42 | 	CrawlerQueue *queue.Queue
43 | 
44 | 	ThreadNum uint
45 | 
46 | 	OutType string
47 | 
48 | 	// 分段转储容器容量
49 | 	DOCKER_CAP uint
50 | 
51 | 	// 分段输出池容量,最小为2
52 | 	DOCKER_QUEUE_CAP uint
53 | )
54 | 
55 | func init() {
56 | 
57 | 	ReportChan = make(chan *Report)
58 | 
59 | 	CrawlerQueue = queue.NewQueue(0)
60 | 
61 | 	InitDockerParam(50000)
62 | 
63 | }
64 | 
65 | func InitDockerParam(dockercap uint) {
66 | 	DOCKER_CAP = dockercap
67 | 	switch {
68 | 	case dockercap <= 10:
69 | 		DOCKER_QUEUE_CAP = 500
70 | 	case dockercap <= 500:
71 | 		DOCKER_QUEUE_CAP = 200
72 | 	case dockercap <= 1000:
73 | 		DOCKER_QUEUE_CAP = 100
74 | 	case dockercap <= 10000:
75 | 		DOCKER_QUEUE_CAP = 50
76 | 	case dockercap <= 100000:
77 | 		DOCKER_QUEUE_CAP = 10
78 | 	default:
79 | 		DOCKER_QUEUE_CAP = 4
80 | 	}
81 | }
82 | 
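For a concrete feel of the sizing above: `DATA_CAP = 2 << 14` is 32768 cells, and `InitDockerParam` trades per-batch size against queue depth. A small, self-contained sketch (the `dockerQueueCap` helper is an invented stand-in that mirrors the `switch` in `InitDockerParam`):

```go
package main

import "fmt"

// dockerQueueCap 复刻 config.InitDockerParam 中的 switch:
// 单个docker(分段转储容器)越大,输出池中保留的docker越少。
func dockerQueueCap(dockercap uint) uint {
	switch {
	case dockercap <= 10:
		return 500
	case dockercap <= 500:
		return 200
	case dockercap <= 1000:
		return 100
	case dockercap <= 10000:
		return 50
	case dockercap <= 100000:
		return 10
	default:
		return 4
	}
}

func main() {
	for _, c := range []uint{10, 500, 10000, 50000, 200000} {
		fmt.Printf("DOCKER_CAP=%6d -> DOCKER_QUEUE_CAP=%d\n", c, dockerQueueCap(c))
	}
}
```

With the default `InitDockerParam(50000)`, this yields `DOCKER_QUEUE_CAP = 10`, i.e. up to 50000 × 10 = 500000 cells can be buffered across the docker queue before output.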
--------------------------------------------------------------------------------
/pipeline/collector/docker.go:
--------------------------------------------------------------------------------
 1 | // 负责从收集通道接受数据并临时存储
 2 | package collector
 3 | 
 4 | import (
 5 | 	"github.com/henrylee2cn/pholcus/config"
 6 | 	"sync"
 7 | 	"time"
 8 | )
 9 | 
10 | type DockerQueue struct {
11 | 	Curr    int
12 | 	Cap     uint
13 | 	Using   map[int]bool
14 | 	Dockers [][]DataCell
15 | }
16 | 
17 | func NewDocker() []DataCell {
18 | 	return make([]DataCell, 0, config.DOCKER_CAP)
19 | }
20 | 
21 | func NewDockerQueue() *DockerQueue {
22 | 	var queueCap uint = config.DOCKER_QUEUE_CAP
23 | 	if config.DOCKER_QUEUE_CAP < 2 {
24 | 		queueCap = 2
25 | 	}
26 | 
27 | 	dockerQueue := &DockerQueue{
28 | 		Curr:    0,
29 | 		Cap:     queueCap,
30 | 		Using:   make(map[int]bool, queueCap),
31 | 		Dockers: make([][]DataCell, 0),
32 | 	}
33 | 
34 | 	dockerQueue.Using[0] = true
35 | 
36 | 	dockerQueue.Dockers = append(dockerQueue.Dockers, NewDocker())
37 | 
38 | 	return dockerQueue
39 | }
40 | 
41 | var ChangeMutex sync.Mutex
42 | 
43 | func (self *DockerQueue) Change() {
44 | 	ChangeMutex.Lock()
45 | 	defer ChangeMutex.Unlock()
46 | getLabel:
47 | 	for {
48 | 		for k, v := range self.Using {
49 | 			if !v {
50 | 				self.Curr = k
51 | 				self.Using[k] = true
52 | 				break getLabel
53 | 			}
54 | 		}
55 | 		self.AutoAdd()
56 | 		time.Sleep(5e8) // 0.5秒
57 | 	}
58 | }
59 | 
60 | func (self *DockerQueue) Recover(index int) {
61 | 	self.Dockers[index] = NewDocker()
62 | 	self.Using[index] = false
63 | }
64 | 
65 | // 根据情况自动动态增加Docker
66 | func (self *DockerQueue) AutoAdd() {
67 | 	count := len(self.Dockers)
68 | 	if uint(count) < self.Cap {
69 | 		self.Dockers = append(self.Dockers, NewDocker())
70 | 		self.Using[count] = false
71 | 	}
72 | }
--------------------------------------------------------------------------------
/spiders/spider/rss.go:
--------------------------------------------------------------------------------
 1 | package spider
 2 | 
 3 | import (
 4 | 	"github.com/henrylee2cn/pholcus/reporter"
 5 | 	"math"
 6 | 	"time"
 7 | )
 8 | 
 9 | type RSS struct {
10 | 	// RSS爬虫重新访问的5个级别(分钟)
11 | 	Level []int
12 | 	//RSS源的权重, self.T[src] {
39 | 			if k == 0 {
40 | 				k = 1
41 | 			}
42 | 			reporter.Log.Printf("************************ ……当前RSS <%s> 的更新周期为 %v 分钟……************************", src, self.Level[k-1])
43 | 			time.Sleep(time.Minute * time.Duration(self.Level[k-1]))
44 | 			break
45 | 		}
46 | 	}
47 | 	if self.Flag[src] {
48 | 		self.T[src] = int(math.Floor(float64(self.T[src]) / 1.2))
49 | 		if self.T[src] < self.Level[0] {
50 | 			self.T[src] = self.Level[0]
51 | 		}
52 | 	} else {
53 | 		self.T[src] = int(math.Floor(float64(self.T[src]) * 1.2))
54 | 		if self.T[src] > self.Level[len(self.Level)-1] {
55 | 			self.T[src] = self.Level[len(self.Level)-1]
56 | 		}
57 | 	}
58 | 	self.Flag[src] = false
59 | }
60 | 
61 | func (self *RSS) Updata(src string) {
62 | 	self.Flag[src] = true
63 | }
64 | 
--------------------------------------------------------------------------------
/scheduler/src_manage.go:
--------------------------------------------------------------------------------
 1 | package scheduler
 2 | 
 3 | import (
 4 | 	"github.com/henrylee2cn/pholcus/downloader/context"
 5 | 	// "github.com/henrylee2cn/pholcus/reporter"
 6 | )
 7 | 
 8 | // SrcManager is the interface that a request-source management object must implement.
 9 | type SrcManager interface {
10 | 	// 存入
11 | 	Push(*context.Request)
12 | 	// 取出
13 | 	Use(int) *context.Request
14 | 	// 释放一个资源
15 | 	Free()
16 | 	// 资源队列是否闲置
17 | 	IsEmpty(int) bool
18 | 	IsAllEmpty() bool
19 | }
20 | 
21 | type SrcManage struct {
22 | 	count chan bool
23 | 	queue map[int][]*context.Request
24 | }
25 | 
26 | func NewSrcManage(capacity uint) SrcManager {
27 | 	return &SrcManage{
28 | 		count: make(chan bool, int(capacity)),
29 | 		queue: make(map[int][]*context.Request),
30 | 	}
31 | }
32 | 
33 | func (self *SrcManage) Push(req *context.Request) {
34 | 	if spiderId, ok := req.GetSpiderId(); ok {
35 | 		self.queue[spiderId] = append(self.queue[spiderId], req)
36 | 	}
37 | }
38 | 
39 | func (self *SrcManage) Use(spiderId int) *context.Request {
40 | 	if len(self.queue[spiderId]) == 0 {
41 | 		return nil
42 | 	}
43 | 	req := self.queue[spiderId][0]
44 | 	self.queue[spiderId] = self.queue[spiderId][1:]
45 | 	self.count <- true
46 | 	return req
47 | }
48 | 
49 | func (self *SrcManage) Free() {
50 | 	<-self.count
51 | }
52 | 
53 | func (self *SrcManage) IsEmpty(spiderId int) bool {
54 | 	if len(self.queue[spiderId]) == 0 {
55 | 		return true
56 | 	}
57 | 	return false
58 | }
59 | 
60 | func (self *SrcManage) IsAllEmpty() bool {
61 | 	if len(self.count) == 0 {
62 | 		for _, v := range self.queue {
63 | 			if len(v) != 0 {
64 | 				return false
65 | 			}
66 | 		}
67 | 		return true
68 | 	}
69 | 	return false
70 | }
71 | 
--------------------------------------------------------------------------------
/spiders/spider/common.go:
--------------------------------------------------------------------------------
 1 | package spider
 2 | 
 3 | import (
 4 | 	// "bytes"
 5 | 	"code.google.com/p/mahonia"
 6 | 	// "golang.org/x/text/encoding/simplifiedchinese"
 7 | 	// "golang.org/x/text/transform"
 8 | 	// "io/ioutil"
 9 | 	// "github.com/henrylee2cn/pholcus/downloader/context"
10 | 	"regexp"
11 | 	"strings"
12 | )
13 | 
14 | func CleanHtml(str string, depth int) string {
15 | 	if depth > 0 {
16 | 		//将HTML标签全转换成小写
17 | 		re, _ := regexp.Compile("\\<[\\S\\s]+?\\>")
18 | 		str = re.ReplaceAllStringFunc(str, strings.ToLower)
19 | 	}
20 | 	if depth > 1 {
21 | 		//去除STYLE
22 | 		re, _ := regexp.Compile("\\<style[\\S\\s]+?\\</style\\>")
23 | 		str = re.ReplaceAllString(str, "")
24 | 	}
25 | 	if depth > 2 {
26 | 		//去除SCRIPT
27 | 		re, _ := regexp.Compile("\\<script[\\S\\s]+?\\</script\\>")
28 | 		str = re.ReplaceAllString(str, "")
29 | 	}
30 | 	if depth > 3 {
31 | 		//去除所有尖括号内的HTML代码,并换成换行符
32 | 		re, _ := regexp.Compile("\\<[\\S\\s]+?\\>")
33 | 		str = re.ReplaceAllString(str, "\n")
34 | 	}
35 | 	if depth > 4 {
36 | 		//去除连续的换行符
37 | 		re, _ := regexp.Compile("\\s{2,}")
38 | 		str = re.ReplaceAllString(str, "\n")
39 | 	}
40 | 	return str
41 | }
42 | 
43 | // func Encode(src string) (dst string) {
44 | // 	data, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(src)), simplifiedchinese.GBK.NewEncoder()))
45 | // 	if err == nil {
46 | // 		dst = string(data)
47 | // 	}
48 | // 	return
49 | // }
50 | // func Decode(src string) (dst string) {
51 | // 	data, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(src)), simplifiedchinese.GBK.NewDecoder()))
52 | // 	if err == nil {
53 | // 		dst = string(data)
54 | // 	}
55 | // 	return
56 | // }
57 | 
58 | func DecodeString(src, charset string) string {
59 | 	return mahonia.NewDecoder(charset).ConvertString(src)
60 | }
61 | 
62 | func EncodeString(src, charset string) string {
63 | 	return mahonia.NewEncoder(charset).ConvertString(src)
64 | }
65 | 
66 | func ConvertToString(src string, srcCode string, tagCode string) string {
67 | 	srcCoder := mahonia.NewDecoder(srcCode)
68 | 	srcResult := srcCoder.ConvertString(src)
69 | 	tagCoder := mahonia.NewDecoder(tagCode)
70 | 	_, cdata, _ := tagCoder.Translate([]byte(srcResult), true)
71 | 	result := string(cdata)
72 | 	return result
73 | }
74 | 
75 | func GBKToUTF8(src string) string {
76 | 	return DecodeString(EncodeString(src, "ISO-8859-1"), "GBK")
77 | }
78 | 
--------------------------------------------------------------------------------
/common/mlog/filelog.go:
--------------------------------------------------------------------------------
 1 | package mlog
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"os"
 6 | 	"os/exec"
 7 | 	"path/filepath"
 8 | 	"strconv"
 9 | 	"time"
10 | )
11 | 
12 | // Filelog represents an active object that logs on file to record error or other useful info.
13 | // The filelog info is written to a log file.
14 | // The loginst is a pointer to a logger from the standard library.
15 | // The isopen label represents whether the filelog is open or not.
16 | type filelog struct {
17 | 	plog
18 | 
19 | 	loginst *log.Logger
20 | }
21 | 
22 | var flog *filelog
23 | 
24 | // LogInst gets the singleton filelog object.
25 | func LogInst() *filelog {
26 | 	if flog == nil {
27 | 		InitFilelog(false, "")
28 | 	}
29 | 	return flog
30 | }
31 | 
32 | // InitFilelog initializes flog.
33 | func InitFilelog(isopen bool, fp string) {
34 | 	if !isopen {
35 | 		flog = &filelog{}
36 | 		flog.loginst = nil
37 | 		flog.isopen = isopen
38 | 		return
39 | 	}
40 | 	if fp == "" {
41 | 		wd := os.Getenv("GOPATH")
42 | 		if wd == "" {
43 | 			//panic("GOPATH is not set in env.")
44 | 			file, _ := exec.LookPath(os.Args[0])
45 | 			path := filepath.Dir(file)
46 | 			wd = path
47 | 		}
48 | 		if wd == "" {
49 | 			panic("GOPATH is not set in env or can not get exe path.")
50 | 		}
51 | 		fp = wd + "/log/"
52 | 	}
53 | 	flog = newFilelog(isopen, fp)
54 | }
55 | 
56 | // The newFilelog returns an initialized filelog object.
57 | // The default file path is "WORKDIR/log/log.2011-01-01".
58 | func newFilelog(isopen bool, logpath string) *filelog {
59 | 	year, month, day := time.Now().Date()
60 | 	filename := "log." + strconv.Itoa(year) + "-" + strconv.Itoa(int(month)) + "-" + strconv.Itoa(day)
61 | 	err := os.MkdirAll(logpath, 0755)
62 | 	if err != nil {
63 | 		panic("logpath error : " + logpath + "\n")
64 | 	}
65 | 
66 | 	f, err := os.OpenFile(logpath+"/"+filename, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
67 | 	if err != nil {
68 | 		panic("log file open error : " + logpath + "/" + filename + "\n")
69 | 	}
70 | 
71 | 	pfilelog := &filelog{}
72 | 	pfilelog.loginst = log.New(f, "", log.LstdFlags)
73 | 	pfilelog.isopen = isopen
74 | 	return pfilelog
75 | }
76 | 
77 | func (this *filelog) log(label string, str string) {
78 | 	if !this.isopen {
79 | 		return
80 | 	}
81 | 	file, line := this.getCaller()
82 | 	this.loginst.Printf("%s:%d: %s %s\n", file, line, label, str)
83 | }
84 | 
85 | // LogError logs error info.
86 | func (this *filelog) LogError(str string) {
87 | 	this.log("[ERROR]", str)
88 | }
89 | 
90 | // LogInfo logs normal info.
91 | func (this *filelog) LogInfo(str string) {
92 | 	this.log("[INFO]", str)
93 | }
94 | 
--------------------------------------------------------------------------------
/pholcus/keeper/login.go:
--------------------------------------------------------------------------------
 1 | // craw master module
 2 | package keeper
 3 | 
 4 | import (
 5 | 	"errors"
 6 | 	"net/http"
 7 | 	"net/http/cookiejar"
 8 | 	"strings"
 9 | )
10 | 
11 | //具体的获取cookie的方法,return出一个[]*http.Cookie
12 | //函数分为两步获取cookie,当有302跳转执行IsRedirectFunc
13 | //没有302跳转执行NoRedirectFunc,都是返回[]*http.Cookie
14 | func GetCookie(url string, postParam string, IsRedirect bool) []*http.Cookie {
15 | 	if IsRedirect {
16 | 		return IsRedirectFunc(url, postParam)
17 | 	}
18 | 	return NoRedirectFunc(url, postParam)
19 | }
20 | 
21 | //这是一个GetCookie函数的分支,当有302跳转的时候执行此函数
22 | func IsRedirectFunc(url string, postParam string) []*http.Cookie {
23 | 
24 | 	gCookieJar, _ := cookiejar.New(nil)
25 | 	client := &http.Client{
26 | 		CheckRedirect: noCheckRedirect, //调用noCheckRedirect,不跳转,直接返回Location
27 | 		Jar:           gCookieJar,
28 | 	}
29 | 
30 | 	req1, err := http.NewRequest("POST", url, strings.NewReader(postParam))
31 | 	if err != nil {
32 | 		return nil //构造请求失败
33 | 	}
34 | 
35 | 	resp1 := getResponse(client, req1, url)
36 | 	if resp1 == nil {
37 | 		return nil
38 | 	}
39 | 
40 | 	//获取第一次请求的Location,发起第二次请求
41 | 	req2, err := http.NewRequest("GET", resp1.Header.Get("Location"), nil)
42 | 	if err != nil {
43 | 		return nil //构造请求失败
44 | 	}
45 | 
46 | 	resp2 := getResponse(client, req2, url)
47 | 	if resp2 == nil {
48 | 		return nil
49 | 	}
50 | 	return resp2.Cookies()
51 | }
52 | 
53 | //IsRedirectFunc的一个分支函数,阻止302自动跳转,以便读取Location
54 | func noCheckRedirect(req *http.Request, via []*http.Request) error {
55 | 	//始终返回错误,令client不跟随任何跳转
56 | 	return errors.New("redirect blocked: read Location from the 302 response")
57 | }
58 | 
59 | //这是一个GetCookie函数的分支,当没有302跳转的时候执行此函数
60 | func NoRedirectFunc(url string, postParam string) []*http.Cookie {
61 | 	gCookieJar, _ := cookiejar.New(nil)
62 | 
63 | 	client := &http.Client{
64 | 		Jar: gCookieJar,
65 | 	}
66 | 
67 | 	req1, err := http.NewRequest("POST", url, strings.NewReader(postParam))
68 | 	if err != nil {
69 | 		return nil //构造请求失败
70 | 	}
71 | 
72 | 	resp := getResponse(client, req1, url)
73 | 	if resp == nil {
74 | 		return nil
75 | 	}
76 | 	return resp.Cookies()
77 | }
78 | 
79 | // 请求获取响应流
80 | func getResponse(client *http.Client, req *http.Request, url string) *http.Response {
81 | 	req.Header.Set("Proxy-Connection", "keep-alive")
82 | 	req.Header.Set("Cache-Control", "max-age=0")
83 | 	req.Header.Set("Accept", "*/*")
84 | 	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36")
85 | 	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
86 | 	req.Header.Set("Referer", url)
87 | 	req.Header.Set("Accept-Encoding", "gzip, deflate")
88 | 	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.8")
89 | 	resp, err := client.Do(req)
90 | 	//CheckRedirect返回错误时,Do会同时返回最后一次响应和该错误,此处保留响应以读取Location
91 | 	if resp == nil && err != nil {
92 | 		return nil
93 | 	}
94 | 	return resp
95 | }
96 | 
--------------------------------------------------------------------------------
/pholcus/socket/PhoSocketTest.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	// "bufio"
 5 | 	"fmt"
 6 | 	// "io/ioutil"
 7 | 	"log"
 8 | 	"net"
 9 | )
10 | 
11 | const (
12 | 	PhoSocketServer = "127.0.0.1:6010"
13 | )
14 | 
15 | //建立连接
16 | 
17 | //func 接受
18 | 
19 | //func 发送
20 | 
21 | /*
22 |  *@服务端用
23 |  *
24 |  *PhoSoketLisent()为幽灵蛛socket监听函数
25 |  *PhoSocketServer为预定义常量:server:port
26 |  *输出类型为net.Listener,一个监听句柄
27 |  */
28 | func PhoSoketLisent() net.Listener {
29 | 	ln, err := 
net.Listen("tcp", PhoSocketServer) 30 | if err != nil { 31 | panic(err) 32 | } 33 | return ln 34 | } 35 | 36 | /* 37 | *@客户端用 38 | * 39 | *PhoSoketDial()为幽灵蛛socket拨号函数,请求服务端 40 | *PhoSocketServer为预定义常量:server:port 41 | *输出类型为net.Conn,一个握手连接,下一步可以进行接收,发送 42 | */ 43 | func PhoSoketDial() net.Conn { 44 | conn, err := net.Dial("tcp", PhoSocketServer) 45 | if err != nil { 46 | panic(err) 47 | } 48 | return conn 49 | } 50 | 51 | /* 52 | *@服务端用 53 | * 54 | *PhoSocketAccept()为幽灵蛛socket同意连接函数 55 | *ln为一个监听句柄 56 | *输出类型为net.Conn,一个握手连接,下一步可以进行接收,发送 57 | */ 58 | func PhoSocketAccept(ln net.Listener) net.Conn { 59 | for { 60 | conn, err := ln.Accept() 61 | if err != nil { 62 | log.Fatal("get client connection error: ", err) 63 | } 64 | return conn 65 | } 66 | } 67 | 68 | /* 69 | *@服务端用 70 | * 71 | *PhoSocketSendDataClose()为幽灵蛛socket数据发送函数,并且关闭连接 72 | *conn为握手连接,sendData为要发送的数据 73 | *通过conn给client发送Data 74 | */ 75 | func PhoSocketSendDataClose(conn net.Conn, sendData string) { 76 | // fmt.Fprintf(conn, sendData) 77 | conn.Write([]byte(sendData)) 78 | conn.Close() 79 | } 80 | 81 | /* 82 | *@共用 83 | * 84 | *PhoSocketSendData()为幽灵蛛socket数据发送函数,但是不关闭 85 | *conn为握手连接,sendData为要发送的数据 86 | *通过conn给client发送Data 87 | */ 88 | func PhoSocketSendData(conn net.Conn, sendData string) { 89 | // fmt.Fprintf(conn, sendData) 90 | conn.Write([]byte(sendData)) 91 | } 92 | 93 | /* 94 | *@共用 95 | * 96 | *PhoSocketAcceptData()为幽灵蛛socket数据接收函数 97 | *conn为握手连接 98 | *通过conn接收client发送来的Data 99 | */ 100 | func PhoSocketAcceptData(conn net.Conn) { 101 | // data, err := bufio.NewReader(conn).ReadString('\n') 102 | databuf := make([]byte, 4096) 103 | n, err := conn.Read(databuf) 104 | // data, err := ioutil.ReadAll(conn) 105 | if err != nil { 106 | log.Fatal("get client data error: ", err) 107 | } 108 | fmt.Printf("%#v\n", string(databuf[:n])) 109 | } 110 | 111 | //接收并发送,完关闭 112 | func AcceptAndSendClose(conn net.Conn) { 113 | PhoSocketAcceptData(conn) 114 | PhoSocketSendDataClose(conn, "this is server\n") 115 | } 116 | 117 | //接收并发送,不关闭 118 | func AcceptAndSend(conn net.Conn) { 119 | PhoSocketAcceptData(conn) 120 | PhoSocketSendData(conn, "this is server\n") 121 | } 122 | func main() { 123 | conn := PhoSoketDial() 124 | PhoSocketSendData(conn, "hello server\n") 125 | PhoSocketAcceptData(conn) 126 | 127 | } 128 | -------------------------------------------------------------------------------- /pholcus/socket/PhoSocket.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | // "bufio" 5 | "fmt" 6 | // "io/ioutil" 7 | "log" 8 | "net" 9 | ) 10 | 11 | const ( 12 | PhoSocketServer = "127.0.0.1:6010" 13 | ) 14 | 15 | //建立连接 16 | 17 | //func 接受 18 | 19 | //func 发送 20 | 21 | /* 22 | *@服务端用 23 | * 24 | *PhoSoketLisent()为幽灵蛛socket监听函数 25 | *PhoSocketServer为预定义常量:server:port 26 | *输出类型为net.Listener,一个监听句柄 27 | */ 28 | func PhoSoketLisent() net.Listener { 29 | ln, err := net.Listen("tcp", PhoSocketServer) 30 | if err != nil { 31 | panic(err) 32 | } 33 | return ln 34 | } 35 | 36 | /* 37 | *@客户端用 38 | * 39 | *PhoSoketDial()为幽灵蛛socket拨号函数,请求服务端 40 | *PhoSocketServer为预定义常量:server:port 41 | *输出类型为net.Conn,一个握手连接,下一步可以进行接收,发送 42 | */ 43 | func PhoSoketDial() net.Conn { 44 | conn, err := net.Dial("tcp", PhoSocketServer) 45 | if err != nil { 46 | panic(err) 47 | } 48 | return conn 49 | } 50 | 51 | /* 52 | *@服务端用 53 | * 54 | *PhoSocketAccept()为幽灵蛛socket同意连接函数 55 | *ln为一个监听句柄 56 | *输出类型为net.Conn,一个握手连接,下一步可以进行接收,发送 57 | */ 58 | func PhoSocketAccept(ln net.Listener) net.Conn { 59 | for { 
60 | conn, err := ln.Accept() 61 | if err != nil { 62 | log.Fatal("get client connection error: ", err) 63 | } 64 | return conn 65 | } 66 | } 67 | 68 | /* 69 | *@服务端用 70 | * 71 | *PhoSocketSendDataClose()为幽灵蛛socket数据发送函数,并且关闭连接 72 | *conn为握手连接,sendData为要发送的数据 73 | *通过conn给client发送Data 74 | */ 75 | func PhoSocketSendDataClose(conn net.Conn, sendData string) { 76 | // fmt.Fprintf(conn, sendData) 77 | conn.Write([]byte(sendData)) 78 | conn.Close() 79 | } 80 | 81 | /* 82 | *@共用 83 | * 84 | *PhoSocketSendData()为幽灵蛛socket数据发送函数,但是不关闭 85 | *conn为握手连接,sendData为要发送的数据 86 | *通过conn给client发送Data 87 | */ 88 | func PhoSocketSendData(conn net.Conn, sendData string) { 89 | // fmt.Fprintf(conn, sendData) 90 | conn.Write([]byte(sendData)) 91 | } 92 | 93 | /* 94 | *@共用 95 | * 96 | *PhoSocketAcceptData()为幽灵蛛socket数据接收函数 97 | *conn为握手连接 98 | *通过conn接收client发送来的Data 99 | */ 100 | func PhoSocketAcceptData(conn net.Conn) { 101 | // data, err := bufio.NewReader(conn).ReadString('\n') 102 | databuf := make([]byte, 4096) 103 | n, err := conn.Read(databuf) 104 | // data, err := ioutil.ReadAll(conn) 105 | if err != nil { 106 | log.Fatal("get client data error: ", err) 107 | } 108 | fmt.Printf("%#v\n", string(databuf[:n])) 109 | } 110 | 111 | //接收并发送,完关闭 112 | func AcceptAndSendClose(conn net.Conn) { 113 | PhoSocketAcceptData(conn) 114 | PhoSocketSendDataClose(conn, "this is server\n") 115 | } 116 | 117 | //接收并发送,不关闭 118 | func AcceptAndSend(conn net.Conn) { 119 | PhoSocketAcceptData(conn) 120 | PhoSocketSendData(conn, "this is server\n") 121 | } 122 | 123 | func main() { 124 | ln := PhoSoketLisent() 125 | for { 126 | conn := PhoSocketAccept(ln) 127 | go AcceptAndSendClose(conn) 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /pholcus/gui/logview.go: -------------------------------------------------------------------------------- 1 | // Copyright 2012 The Walk Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | package gui 5 | 6 | import ( 7 | "errors" 8 | "github.com/lxn/walk" 9 | "github.com/lxn/win" 10 | "syscall" 11 | "unsafe" 12 | ) 13 | 14 | type LogView struct { 15 | walk.WidgetBase 16 | logChan chan string 17 | } 18 | 19 | const ( 20 | TEM_APPENDTEXT = win.WM_USER + 6 21 | ) 22 | 23 | func NewLogView(parent walk.Container) (*LogView, error) { 24 | lc := make(chan string, 1024) 25 | lv := &LogView{logChan: lc} 26 | 27 | if err := walk.InitWidget( 28 | lv, 29 | parent, 30 | "EDIT", 31 | win.WS_TABSTOP|win.WS_VISIBLE|win.WS_VSCROLL|win.ES_MULTILINE|win.ES_WANTRETURN, 32 | win.WS_EX_CLIENTEDGE); err != nil { 33 | return nil, err 34 | } 35 | lv.setReadOnly(true) 36 | lv.SendMessage(win.EM_SETLIMITTEXT, 4294967295, 0) 37 | return lv, nil 38 | } 39 | 40 | func (*LogView) LayoutFlags() walk.LayoutFlags { 41 | return walk.ShrinkableHorz | walk.ShrinkableVert | walk.GrowableHorz | walk.GrowableVert | walk.GreedyHorz | walk.GreedyVert 42 | } 43 | 44 | func (*LogView) MinSizeHint() walk.Size { 45 | return walk.Size{20, 12} 46 | } 47 | 48 | func (*LogView) SizeHint() walk.Size { 49 | return walk.Size{100, 100} 50 | } 51 | 52 | func (lv *LogView) setTextSelection(start, end int) { 53 | lv.SendMessage(win.EM_SETSEL, uintptr(start), uintptr(end)) 54 | } 55 | 56 | func (lv *LogView) textLength() int { 57 | return int(lv.SendMessage(0x000E, uintptr(0), uintptr(0))) 58 | } 59 | 60 | func (lv *LogView) AppendText(value string) { 61 | textLength := lv.textLength() 62 | lv.setTextSelection(textLength, textLength) 63 | lv.SendMessage(win.EM_REPLACESEL, 0, uintptr(unsafe.Pointer(syscall.StringToUTF16Ptr(value)))) 64 | } 65 | 66 | func (lv *LogView) setReadOnly(readOnly bool) error { 67 | if 0 == lv.SendMessage(win.EM_SETREADONLY, uintptr(win.BoolToBOOL(readOnly)), 0) { 68 | return errors.New("fail to call EM_SETREADONLY") 69 | } 70 | 71 | return nil 72 | } 73 | 74 | func (lv *LogView) PostAppendText(value string) { 75 | lv.logChan <- value 76 | win.PostMessage(lv.Handle(), TEM_APPENDTEXT, 0, 0) 77 | } 78 | 79 | func (lv *LogView) Write(p []byte) (int, error) { 80 | lv.PostAppendText(string(p) + "\r\n") 81 | return len(p), nil 82 | } 83 | 84 | func (lv *LogView) WndProc(hwnd win.HWND, msg uint32, wParam, lParam uintptr) uintptr { 85 | switch msg { 86 | case win.WM_GETDLGCODE: 87 | if wParam == win.VK_RETURN { 88 | return win.DLGC_WANTALLKEYS 89 | } 90 | 91 | return win.DLGC_HASSETSEL | win.DLGC_WANTARROWS | win.DLGC_WANTCHARS 92 | case TEM_APPENDTEXT: 93 | select { 94 | case value := <-lv.logChan: 95 | lv.AppendText(value) 96 | default: 97 | return 0 98 | } 99 | } 100 | 101 | return lv.WidgetBase.WndProc(hwnd, msg, wParam, lParam) 102 | } 103 | -------------------------------------------------------------------------------- /pholcus/gui/menu.go: -------------------------------------------------------------------------------- 1 | package gui 2 | 3 | import ( 4 | . 
"github.com/henrylee2cn/pholcus/spiders" 5 | ) 6 | 7 | // GUI输入 8 | type Inputor struct { 9 | ThreadNum uint 10 | OutType string 11 | BaseSleeptime uint 12 | RandomSleepPeriod uint //随机暂停最大增益时长 13 | MaxPage int 14 | Keywords string //后期split()为slice 15 | Spiders []*GUISpider 16 | DockerCap uint 17 | } 18 | 19 | var Input = &Inputor{ 20 | // 默认值 21 | ThreadNum: 20, 22 | OutType: "excel", 23 | BaseSleeptime: 1000, 24 | RandomSleepPeriod: 3000, 25 | MaxPage: 100, 26 | DockerCap: 10000, 27 | } 28 | 29 | // GUI内容 30 | // 下拉菜单辅助结构体 31 | type KV struct { 32 | Key string 33 | Uint uint 34 | String string 35 | } 36 | 37 | var ( 38 | // 任务选项 39 | SpiderModel = NewGUISpiderModel([]*GUISpiderCore{ 40 | &GUISpiderCore{ 41 | Spider: BaiduSearch, 42 | Description: "百度搜索结果 [www.baidu.com]", 43 | }, 44 | &GUISpiderCore{ 45 | Spider: GoogleSearch, 46 | Description: "谷歌搜索结果 [www.google.com镜像]", 47 | }, 48 | &GUISpiderCore{ 49 | Spider: TaobaoSearch, 50 | Description: "淘宝宝贝搜索结果 [s.taobao.com]", 51 | }, 52 | &GUISpiderCore{ 53 | Spider: JDSearch, 54 | Description: "京东搜索结果 [search.jd.com]", 55 | }, 56 | &GUISpiderCore{ 57 | Spider: AlibabaProduct, 58 | Description: "阿里巴巴产品搜索 [s.1688.com/selloffer/offer_search.htm]", 59 | }, 60 | &GUISpiderCore{ 61 | Spider: Wangyi, 62 | Description: "网易排行榜新闻,含点击/跟帖排名 [Auto Page] [news.163.com/rank]", 63 | }, 64 | &GUISpiderCore{ 65 | Spider: BaiduNews, 66 | Description: "百度RSS新闻,实现轮询更新 [Auto Page] [news.baidu.com]", 67 | }, 68 | &GUISpiderCore{ 69 | Spider: Kaola, 70 | Description: "考拉海淘商品数据 [Auto Page] [www.kaola.com]", 71 | }, 72 | &GUISpiderCore{ 73 | Spider: Shunfenghaitao, 74 | Description: "顺丰海淘商品数据 [Auto Page] [www.sfht.com]", 75 | }, 76 | &GUISpiderCore{ 77 | Spider: Miyabaobei, 78 | Description: "蜜芽宝贝商品数据 [Auto Page] [www.miyabaobei.com]", 79 | }, 80 | &GUISpiderCore{ 81 | Spider: Hollandandbarrett, 82 | Description: "Hollandand&Barrett商品数据 [Auto Page] [www.Hollandandbarrett.com]", 83 | }, 84 | }) 85 | 86 | // 暂停时间选项及输出类型选项 87 | GUIOpt = struct { 88 | OutType []*KV 89 | SleepTime []*KV 90 | }{ 91 | OutType: []*KV{ 92 | {Key: "excel", String: "excel"}, 93 | {Key: "csv", String: "csv"}, 94 | {Key: "mongoDB", String: "mongoDB"}, 95 | }, 96 | SleepTime: []*KV{ 97 | {Key: "无暂停", Uint: 0}, 98 | {Key: "0.1 秒", Uint: 100}, 99 | {Key: "0.3 秒", Uint: 300}, 100 | {Key: "0.5 秒", Uint: 500}, 101 | {Key: "1 秒", Uint: 1000}, 102 | {Key: "3 秒", Uint: 3000}, 103 | {Key: "5 秒", Uint: 5000}, 104 | {Key: "10 秒", Uint: 10000}, 105 | {Key: "15 秒", Uint: 15000}, 106 | {Key: "20 秒", Uint: 20000}, 107 | {Key: "30 秒", Uint: 30000}, 108 | {Key: "60 秒", Uint: 60000}, 109 | }, 110 | } 111 | ) 112 | -------------------------------------------------------------------------------- /spiders/kaola.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // 基础包 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/downloader/context" //必需 7 | // "github.com/henrylee2cn/pholcus/reporter" //信息输出 8 | . "github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | // "regexp" 25 | // "strconv" 26 | // "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | // 考拉海淘,海外直采,7天无理由退货,售后无忧!考拉网放心的海淘网站! 
36 | var Kaola = &Spider{ 37 | Name: "考拉海淘", 38 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 39 | // Optional: &Optional{}, 40 | RuleTree: &RuleTree{ 41 | // Spread: []string{}, 42 | Root: func(self *Spider) { 43 | self.AddQueue(map[string]interface{}{"url": "http://www.kaola.com", "rule": "获取版块URL"}) 44 | }, 45 | 46 | Nodes: map[string]*Rule{ 47 | 48 | "获取版块URL": &Rule{ 49 | ParseFunc: func(self *Spider, resp *context.Response) { 50 | query := resp.GetHtmlParser() 51 | lis := query.Find("#funcTab li a") 52 | lis.Each(func(i int, s *goquery.Selection) { 53 | if i == 0 { 54 | return 55 | } 56 | if url, ok := s.Attr("href"); ok { 57 | self.AddQueue(map[string]interface{}{"url": url, "rule": "商品列表", "temp": map[string]interface{}{"goodsType": s.Text()}}) 58 | } 59 | }) 60 | }, 61 | }, 62 | 63 | "商品列表": &Rule{ 64 | ParseFunc: func(self *Spider, resp *context.Response) { 65 | query := resp.GetHtmlParser() 66 | query.Find(".proinfo").Each(func(i int, s *goquery.Selection) { 67 | if url, ok := s.Find("a").Attr("href"); ok { 68 | self.AddQueue(map[string]interface{}{ 69 | "url": "http://www.kaola.com" + url, 70 | "rule": "商品详情", 71 | "temp": map[string]interface{}{"goodsType": resp.GetTemp("goodsType").(string)}, 72 | }) 73 | } 74 | }) 75 | }, 76 | }, 77 | 78 | "商品详情": &Rule{ 79 | //注意:有无字段语义和是否输出数据必须保持一致 80 | OutFeild: []string{ 81 | "标题", 82 | "价格", 83 | "品牌", 84 | "采购地", 85 | "评论数", 86 | "类别", 87 | }, 88 | ParseFunc: func(self *Spider, resp *context.Response) { 89 | query := resp.GetHtmlParser() 90 | // 获取标题 91 | title := query.Find(".product-title").Text() 92 | 93 | // 获取价格 94 | price := query.Find("#js_currentPrice span").Text() 95 | 96 | // 获取品牌 97 | brand := query.Find(".goods_parameter li").Eq(0).Text() 98 | 99 | // 获取采购地 100 | from := query.Find(".goods_parameter li").Eq(1).Text() 101 | 102 | // 获取评论数 103 | discussNum := query.Find("#commentCounts").Text() 104 | 105 | // 结果存入Response中转 106 | resp.AddItem(map[string]string{ 107 | self.GetOutFeild(resp, 0): title, 108 | self.GetOutFeild(resp, 1): price, 109 | self.GetOutFeild(resp, 2): brand, 110 | self.GetOutFeild(resp, 3): from, 111 | self.GetOutFeild(resp, 4): discussNum, 112 | self.GetOutFeild(resp, 5): resp.GetTemp("goodsType").(string), 113 | }) 114 | }, 115 | }, 116 | }, 117 | }, 118 | } 119 | -------------------------------------------------------------------------------- /spiders/shunfenghaitao.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // 基础包 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/downloader/context" //必需 7 | // "github.com/henrylee2cn/pholcus/reporter" //信息输出 8 | . 
"github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | "regexp" 25 | // "strconv" 26 | // "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | // 进口母婴专区,买进口奶粉、尿裤尿布、辅食、营养、洗护、日用、母婴用品 - 顺丰海淘 36 | var Shunfenghaitao = &Spider{ 37 | Name: "顺丰海淘", 38 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 39 | // Optional: &Optional{}, 40 | RuleTree: &RuleTree{ 41 | // Spread: []string{}, 42 | Root: func(self *Spider) { 43 | self.AddQueue(map[string]interface{}{"url": "http://www.sfht.com", "rule": "获取版块URL"}) 44 | }, 45 | 46 | Nodes: map[string]*Rule{ 47 | 48 | "获取版块URL": &Rule{ 49 | ParseFunc: func(self *Spider, resp *context.Response) { 50 | query := resp.GetHtmlParser() 51 | 52 | lis := query.Find(".nav-c1").First().Find("li a") 53 | 54 | lis.Each(func(i int, s *goquery.Selection) { 55 | if i == 0 { 56 | return 57 | } 58 | if url, ok := s.Attr("href"); ok { 59 | self.AddQueue(map[string]interface{}{"url": url, "rule": "商品列表", "temp": map[string]interface{}{"goodsType": s.Text()}}) 60 | } 61 | }) 62 | }, 63 | }, 64 | 65 | "商品列表": &Rule{ 66 | ParseFunc: func(self *Spider, resp *context.Response) { 67 | query := resp.GetHtmlParser() 68 | 69 | query.Find(".cms-src-item").Each(func(i int, s *goquery.Selection) { 70 | if url, ok := s.Find("a").Attr("href"); ok { 71 | self.AddQueue(map[string]interface{}{ 72 | "url": url, 73 | "rule": "商品详情", 74 | "temp": map[string]interface{}{"goodsType": resp.GetTemp("goodsType").(string)}, 75 | }) 76 | } 77 | }) 78 | }, 79 | }, 80 | 81 | "商品详情": &Rule{ 82 | //注意:有无字段语义和是否输出数据必须保持一致 83 | OutFeild: []string{ 84 | "标题", 85 | "品牌", 86 | "原产地", 87 | "货源地", 88 | "类别", 89 | }, 90 | ParseFunc: func(self *Spider, resp *context.Response) { 91 | query := resp.GetHtmlParser() 92 | 93 | // 获取标题 94 | title := query.Find("#titleInfo h1").Text() 95 | 96 | // 获取品牌 97 | brand := query.Find(".goods-c2 ul").Eq(0).Find("li").Eq(2).Text() 98 | re, _ := regexp.Compile(`品 牌`) 99 | brand = re.ReplaceAllString(brand, "") 100 | 101 | // 获取原产地 102 | from1 := query.Find("#detailattributes li").Eq(0).Text() 103 | 104 | // 获取货源地 105 | from2 := query.Find("#detailattributes li").Eq(1).Text() 106 | 107 | // 结果存入Response中转 108 | resp.AddItem(map[string]string{ 109 | self.GetOutFeild(resp, 0): title, 110 | self.GetOutFeild(resp, 1): brand, 111 | self.GetOutFeild(resp, 2): from1, 112 | self.GetOutFeild(resp, 3): from2, 113 | self.GetOutFeild(resp, 4): resp.GetTemp("goodsType").(string), 114 | }) 115 | }, 116 | }, 117 | }, 118 | }, 119 | } 120 | -------------------------------------------------------------------------------- /spiders/baidusearch.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // 基础包 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/downloader/context" //必需 7 | "github.com/henrylee2cn/pholcus/reporter" //信息输出 8 | . 
"github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | "regexp" 25 | "strconv" 26 | "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | "math" 33 | ) 34 | 35 | var BaiduSearch = &Spider{ 36 | Name: "百度搜索", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AidRule("生成请求", []interface{}{[2]int{0, 1}, "生成请求"}) 43 | }, 44 | 45 | Nodes: map[string]*Rule{ 46 | 47 | "生成请求": &Rule{ 48 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 49 | self.LoopAddQueue( 50 | aid[0].([2]int), 51 | func(i int) []string { 52 | return []string{"http://www.baidu.com/s?ie=utf-8&wd=" + self.GetKeyword() + "&rn=50&pn=" + strconv.Itoa(50*i)} 53 | }, 54 | map[string]interface{}{ 55 | "rule": aid[1].(string), 56 | }, 57 | ) 58 | return nil 59 | }, 60 | ParseFunc: func(self *Spider, resp *context.Response) { 61 | query := resp.GetHtmlParser() 62 | total1 := query.Find(".nums").Text() 63 | re, _ := regexp.Compile(`[\D]*`) 64 | total1 = re.ReplaceAllString(total1, "") 65 | total2, _ := strconv.Atoi(total1) 66 | total := int(math.Ceil(float64(total2) / 50)) 67 | if total > self.MaxPage { 68 | total = self.MaxPage 69 | } else if total == 0 { 70 | reporter.Log.Printf("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", self.GetName(), self.GetKeyword(), resp.GetRuleName()) 71 | return 72 | } 73 | // 调用指定规则下辅助函数 74 | self.AidRule("生成请求", []interface{}{[2]int{1, total}, "搜索结果"}) 75 | // 用指定规则解析响应流 76 | self.CallRule("搜索结果", resp) 77 | }, 78 | }, 79 | 80 | "搜索结果": &Rule{ 81 | //注意:有无字段语义和是否输出数据必须保持一致 82 | OutFeild: []string{ 83 | "标题", 84 | "内容", 85 | "不完整URL", 86 | "百度跳转", 87 | }, 88 | ParseFunc: func(self *Spider, resp *context.Response) { 89 | query := resp.GetHtmlParser() 90 | query.Find("#content_left .c-container").Each(func(i int, s *goquery.Selection) { 91 | 92 | title := s.Find(".t").Text() 93 | content := s.Find(".c-abstract").Text() 94 | href, _ := s.Find(".t >a").Attr("href") 95 | tar := s.Find(".g").Text() 96 | 97 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 98 | // title = re.ReplaceAllStringFunc(title, strings.ToLower) 99 | // content = re.ReplaceAllStringFunc(content, strings.ToLower) 100 | 101 | title = re.ReplaceAllString(title, "") 102 | content = re.ReplaceAllString(content, "") 103 | 104 | // 结果存入Response中转 105 | resp.AddItem(map[string]string{ 106 | self.GetOutFeild(resp, 0): strings.Trim(title, " \t\n"), 107 | self.GetOutFeild(resp, 1): strings.Trim(content, " \t\n"), 108 | self.GetOutFeild(resp, 2): tar, 109 | self.GetOutFeild(resp, 3): href, 110 | }) 111 | }) 112 | }, 113 | }, 114 | }, 115 | }, 116 | } 117 | -------------------------------------------------------------------------------- /pipeline/collector/collector.go: -------------------------------------------------------------------------------- 1 | // 数据收集 2 | package collector 3 | 4 | import ( 5 | "fmt" 6 | "github.com/henrylee2cn/pholcus/config" 7 | // "github.com/henrylee2cn/pholcus/reporter" 8 | "github.com/henrylee2cn/pholcus/spiders/spider" 9 | "strconv" 10 | "time" 11 | ) 12 | 13 | // 每个爬取任务的数据容器 14 | type Collector struct { 15 | *spider.Spider 16 | *DockerQueue 17 | DataChan chan DataCell 18 | ctrl chan bool //长度为零时退出并输出 19 | sum [2]int //收集的数据总数[过去,现在],非并发安全 20 | outType string 21 | 
	outCount [2]int
22 | }
23 | 
24 | func NewCollector() *Collector {
25 | 	self := &Collector{
26 | 		DataChan:    make(chan DataCell, config.DATA_CAP),
27 | 		DockerQueue: NewDockerQueue(),
28 | 		ctrl:        make(chan bool, 1),
29 | 	}
30 | 	return self
31 | }
32 | 
33 | func (self *Collector) Init(sp *spider.Spider) {
34 | 	self.Spider = sp
35 | 	self.outType = config.OutType
36 | 	self.DataChan = make(chan DataCell, config.DATA_CAP)
37 | 	self.DockerQueue = NewDockerQueue()
38 | 	self.ctrl = make(chan bool, 1)
39 | 	self.sum = [2]int{}
40 | 	self.outCount = [2]int{}
41 | }
42 | 
43 | func (self *Collector) Collect(dataCell DataCell) {
44 | 	// reporter.Log.Println("**************断点 6 ***********")
45 | 	self.DataChan <- dataCell
46 | 	// reporter.Log.Println("**************断点 7 ***********")
47 | }
48 | 
49 | func (self *Collector) CtrlS() {
50 | 	self.ctrl <- true
51 | 	// reporter.Log.Println("**************断点 10 ***********")
52 | }
53 | 
54 | func (self *Collector) CtrlR() {
55 | 	<-self.ctrl
56 | 	// reporter.Log.Println("**************断点 9 ***********")
57 | }
58 | 
59 | func (self *Collector) CtrlLen() int {
60 | 	return len(self.ctrl)
61 | }
62 | 
63 | // 数据转储输出
64 | func (self *Collector) Manage() {
65 | 	// reporter.Log.Println("**************开启输出管道************")
66 | 
67 | 	// 令self.ctrl长度不为零
68 | 	self.CtrlS()
69 | 	// 只有当收到退出通知并且通道内无数据时,才退出循环
70 | 	for !(self.CtrlLen() == 0 && len(self.DataChan) == 0) {
71 | 		// reporter.Log.Println("**************断点 8 ***********")
72 | 		select {
73 | 		case data := <-self.DataChan:
74 | 
75 | 			self.dockerOne(data)
76 | 		default:
77 | 			time.Sleep(1e7) // 0.01秒
78 | 		}
79 | 	}
80 | 
81 | 	// 将剩余收集到但未输出的数据输出
82 | 	self.goOutput(self.Curr)
83 | 
84 | 	// 等待所有输出完成
85 | 	for self.outCount[0] > self.outCount[1] {
86 | 		time.Sleep(5e8) // 0.5秒
87 | 	}
88 | 
89 | 	// 返回报告
90 | 	self.Report()
91 | }
92 | 
93 | func (self *Collector) dockerOne(data DataCell) {
94 | 
95 | 	self.Dockers[self.Curr] = append(self.Dockers[self.Curr], data)
96 | 
97 | 	if uint(len(self.Dockers[self.Curr])) >= config.DOCKER_CAP {
98 | 		// curDocker存满后输出
99 | 		self.goOutput(self.Curr)
100 | 		// 更换一个空Docker用于curDocker
101 | 		self.Change()
102 | 	}
103 | }
104 | 
105 | func (self *Collector) goOutput(dataIndex int) {
106 | 	self.outCount[0]++
107 | 	go func() {
108 | 		self.Output(dataIndex)
109 | 		self.outCount[1]++
110 | 	}()
111 | }
112 | 
113 | // 统计数据总量
114 | func (self *Collector) Sum() int {
115 | 	return self.sum[1]
116 | }
117 | 
118 | // 累加数据总量
119 | func (self *Collector) setSum(add int) {
120 | 	self.sum[0], self.sum[1] = self.sum[1], self.sum[1]+add
121 | }
122 | 
123 | // 返回报告
124 | func (self *Collector) Report() {
125 | 	// reporter.Log.Println("**************", self.Sum(), " ***********")
126 | 	config.ReportChan <- &config.Report{
127 | 		SpiderName: self.Spider.GetName(),
128 | 		Keyword:    self.GetKeyword(),
129 | 		Num:        strconv.Itoa(self.Sum()),
130 | 		Time:       fmt.Sprintf("%.5f", time.Since(config.StartTime).Minutes()),
131 | 	}
132 | }
133 | 
--------------------------------------------------------------------------------
/common/util/util.go:
--------------------------------------------------------------------------------
 1 | // Package util contains some common functions of the pholcus project.
2 | package util 3 | 4 | import ( 5 | "crypto/md5" 6 | "encoding/hex" 7 | "encoding/json" 8 | "encoding/xml" 9 | "fmt" 10 | "golang.org/x/net/html/charset" 11 | "hash/crc32" 12 | "hash/fnv" 13 | "io" 14 | "os" 15 | "regexp" 16 | "strconv" 17 | "strings" 18 | ) 19 | 20 | // JsonpToJson converts a jsonp string to a json string. 21 | // Example: foobar({a:"1",b:2}) to {"a":"1","b":2} 22 | func JsonpToJson(json string) string { 23 | start := strings.Index(json, "{") 24 | end := strings.LastIndex(json, "}") 25 | start1 := strings.Index(json, "[") 26 | if start1 != -1 && (start == -1 || start > start1) { 27 | start = start1 28 | end = strings.LastIndex(json, "]") 29 | } 30 | if end > start && end != -1 && start != -1 { 31 | json = json[start : end+1] 32 | } 33 | json = strings.Replace(json, "\\'", "", -1) 34 | regDetail, _ := regexp.Compile("([^\\s\\:\\{\\,\\d\"]+|[a-z][a-z\\d]*)\\s*\\:") 35 | return regDetail.ReplaceAllString(json, "\"$1\":") 36 | } 37 | 38 | // GetWDPath gets the work directory path from GOPATH. 39 | func GetWDPath() string { 40 | wd := os.Getenv("GOPATH") 41 | if wd == "" { 42 | panic("GOPATH is not set in env.") 43 | } 44 | return wd 45 | } 46 | 47 | // IsDirExists reports whether path exists and is a directory. 48 | func IsDirExists(path string) bool { 49 | fi, err := os.Stat(path) 50 | 51 | if err != nil { 52 | return os.IsExist(err) 53 | } 54 | return fi.IsDir() 55 | } 56 | 57 | 58 | 59 | 60 | // IsFileExists reports whether path exists and is a regular file. 61 | func IsFileExists(path string) bool { 62 | fi, err := os.Stat(path) 63 | 64 | if err != nil { 65 | return os.IsExist(err) 66 | } 67 | return !fi.IsDir() 68 | } 69 | 70 | 71 | 72 | 73 | // IsNum reports whether the string consists solely of digits. 74 | func IsNum(a string) bool { 75 | reg, _ := regexp.Compile("^\\d+$") 76 | return reg.MatchString(a) 77 | } 78 | 79 | // XML2mapstr converts simple XML to a map of strings, with charset (incl. utf8) support. 80 | func XML2mapstr(xmldoc string) map[string]string { 81 | var t xml.Token 82 | var err error 83 | inputReader := strings.NewReader(xmldoc) 84 | decoder := xml.NewDecoder(inputReader) 85 | decoder.CharsetReader = func(s string, r io.Reader) (io.Reader, error) { 86 | return charset.NewReader(r, s) 87 | } 88 | m := make(map[string]string, 32) 89 | key := "" 90 | for t, err = decoder.Token(); err == nil; t, err = decoder.Token() { 91 | switch token := t.(type) { 92 | case xml.StartElement: 93 | key = token.Name.Local 94 | case xml.CharData: 95 | content := string([]byte(token)) 96 | m[key] = content 97 | default: 98 | // ...
99 | } 100 | } 101 | 102 | return m 103 | } 104 | 105 | // MakeHash hashes a string with CRC32 (IEEE). 106 | func MakeHash(s string) string { 107 | const IEEE = 0xedb88320 108 | var IEEETable = crc32.MakeTable(IEEE) 109 | hash := fmt.Sprintf("%x", crc32.Checksum([]byte(s), IEEETable)) 110 | return hash 111 | } 112 | 113 | func HashString(encode string) uint64 { 114 | hash := fnv.New64() 115 | hash.Write([]byte(encode)) 116 | return hash.Sum64() 117 | } 118 | 119 | // make a fingerprint, method 1: FNV-64 over the JSON encoding 120 | func MakeUnique(obj interface{}) string { 121 | baseString, _ := json.Marshal(obj) 122 | return strconv.FormatUint(HashString(string(baseString)), 10) 123 | } 124 | 125 | // make a fingerprint, method 2: truncated MD5 over the JSON encoding 126 | func MakeMd5(obj interface{}, length int) string { 127 | if length > 32 { 128 | length = 32 129 | } 130 | h := md5.New() 131 | baseString, _ := json.Marshal(obj) 132 | h.Write([]byte(baseString)) 133 | s := hex.EncodeToString(h.Sum(nil)) 134 | return s[:length] 135 | } 136 | -------------------------------------------------------------------------------- /pholcus/gui/guispider.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Walk Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package gui 6 | 7 | import ( 8 | "github.com/henrylee2cn/pholcus/spiders/spider" 9 | "github.com/lxn/walk" 10 | "sort" 11 | // . "github.com/lxn/walk/declarative" 12 | ) 13 | 14 | type GUISpiderCore struct { 15 | Spider *spider.Spider 16 | Description string 17 | } 18 | 19 | type GUISpider struct { 20 | *GUISpiderCore 21 | Index int 22 | Title string 23 | checked bool 24 | } 25 | 26 | type GUISpiderModel struct { 27 | walk.TableModelBase 28 | walk.SorterBase 29 | sortColumn int 30 | sortOrder walk.SortOrder 31 | // evenBitmap *walk.Bitmap 32 | // oddIcon *walk.Icon 33 | items []*GUISpider 34 | } 35 | 36 | func NewGUISpiderModel(list []*GUISpiderCore) *GUISpiderModel { 37 | m := new(GUISpiderModel) 38 | // m.evenBitmap, _ = walk.NewBitmapFromFile("") 39 | // m.oddIcon, _ = walk.NewIconFromFile("img/x.ico") 40 | for i, t := range list { 41 | m.items = append(m.items, &GUISpider{ 42 | Index: i + 1, 43 | Title: t.Spider.GetName(), 44 | GUISpiderCore: &GUISpiderCore{ 45 | Description: t.Description, 46 | Spider: t.Spider, 47 | }, 48 | }) 49 | } 50 | 51 | return m 52 | } 53 | 54 | // Called by the TableView from SetModel and every time the model publishes a 55 | // RowsReset event. 56 | func (m *GUISpiderModel) RowCount() int { 57 | return len(m.items) 58 | } 59 | 60 | // Called by the TableView when it needs the text to display for a given cell. 61 | func (m *GUISpiderModel) Value(row, col int) interface{} { 62 | item := m.items[row] 63 | 64 | switch col { 65 | case 0: 66 | return item.Index 67 | 68 | case 1: 69 | return item.Title 70 | 71 | case 2: 72 | return item.Description 73 | 74 | case 3: 75 | return item.Spider 76 | } 77 | panic("unexpected col") 78 | } 79 | 80 | // Called by the TableView to retrieve if a given row is checked. 81 | func (m *GUISpiderModel) Checked(row int) bool { 82 | return m.items[row].checked 83 | } 84 | 85 | // Called by the TableView when the user toggled the check box of a given row.
86 | func (m *GUISpiderModel) SetChecked(row int, checked bool) error { 87 | m.items[row].checked = checked 88 | 89 | return nil 90 | } 91 | 92 | // GetChecked returns the rows whose check box is ticked. 93 | func (m *GUISpiderModel) GetChecked() []*GUISpider { 94 | rc := []*GUISpider{} 95 | for idx, item := range m.items { 96 | if m.Checked(idx) { 97 | rc = append(rc, item) 98 | } 99 | } 100 | return rc 101 | } 102 | 103 | // Called by the TableView to sort the model. 104 | func (m *GUISpiderModel) Sort(col int, order walk.SortOrder) error { 105 | m.sortColumn, m.sortOrder = col, order 106 | 107 | sort.Sort(m) 108 | 109 | return m.SorterBase.Sort(col, order) 110 | } 111 | 112 | func (m *GUISpiderModel) Len() int { 113 | return len(m.items) 114 | } 115 | 116 | func (m *GUISpiderModel) Less(i, j int) bool { 117 | a, b := m.items[i], m.items[j] 118 | 119 | c := func(ls bool) bool { 120 | if m.sortOrder == walk.SortAscending { 121 | return ls 122 | } 123 | 124 | return !ls 125 | } 126 | 127 | switch m.sortColumn { 128 | case 0: 129 | return c(a.Index < b.Index) 130 | 131 | case 1: 132 | return c(a.Title < b.Title) 133 | 134 | case 2: 135 | return c(a.Description < b.Description) 136 | 137 | // case 3: 138 | // return c(a.Spider < b.Spider) 139 | } 140 | 141 | panic("unreachable") 142 | } 143 | 144 | func (m *GUISpiderModel) Swap(i, j int) { 145 | m.items[i], m.items[j] = m.items[j], m.items[i] 146 | } 147 | 148 | // Called by the TableView to retrieve an item image. 149 | // func (m *GUISpiderModel) Image(row int) interface{} { 150 | // // if m.items[row].Index%2 == 0 { 151 | // // return m.oddIcon 152 | // // } 153 | // return m.evenBitmap 154 | // } 155 | -------------------------------------------------------------------------------- /spiders/readme.md: -------------------------------------------------------------------------------- 1 | // Spider: crawling rules. 2 | package spider 3 | 4 | import ( 5 | "pholcus/downloader/context" 6 | "pholcus/pholcus" 7 | // "pholcus/pholcus/status" 8 | ) 9 | 10 | type Spider struct { 11 | Name string 12 | Pausetime [2]uint // pause interval: Pausetime[0] ~ Pausetime[0]+Pausetime[1] 13 | *RuleTree 14 | // *SpiderStatus 15 | // optional members below 16 | MaxPage int 17 | Keyword string 18 | Depth int 19 | Id int // index within SpiderList 20 | } 21 | 22 | // func NewSpider() *Spider { 23 | // sp := new(Spider) 24 | // sp.RuleTree = &RuleTree{ 25 | // Nodes: make(map[string]*Rule), 26 | // } 27 | // return sp 28 | // } 29 | 30 | func (self *Spider) Start(sp *Spider) { 31 | sp.RuleTree.Root(sp) 32 | } 33 | 34 | func (self *Spider) GetName() string { 35 | return self.Name 36 | } 37 | 38 | func (self *Spider) GetId() int { 39 | return self.Id 40 | } 41 | 42 | func (self *Spider) GetKeyword() string { 43 | return self.Keyword 44 | } 45 | 46 | func (self *Spider) GetRules() map[string]*Rule { 47 | return self.RuleTree.Nodes 48 | } 49 | 50 | // run the parse rule named by the response 51 | func (self *Spider) GoRule(resp *context.Response) { 52 | self.RuleTree.Nodes[resp.GetRuleName()].ParseFunc(self, resp) 53 | } 54 | 55 | // parse the response with the named rule 56 | func (self *Spider) CallRule(ruleName string, resp *context.Response) { 57 | resp.SetRuleName(ruleName) 58 | self.GoRule(resp) 59 | } 60 | 61 | // invoke the aid function under the named rule 62 | func (self *Spider) AidRule(ruleName string, aid []interface{}) interface{} { 63 | rule := self.RuleTree.Nodes[ruleName] 64 | return rule.AidFunc(self, aid) 65 | } 66 | 67 | // get the output field name of the current rule by index 68 | func (self *Spider) GetOutFeild(resp *context.Response, index int) string { 69 | return self.RuleTree.Nodes[resp.GetRuleName()].OutFeild[index] 70 | } 71 | 72 | func (self *Spider)
LoopAddQueue(loop [2]int, urlFn func(int) []string, param map[string]interface{}) { 73 | for ; loop[0] < loop[1]; loop[0]++ { 74 | urls := urlFn(loop[0]) 75 | self.BulkAddQueue(urls, param) 76 | } 77 | } 78 | 79 | func (self *Spider) BulkAddQueue(urls []string, param map[string]interface{}) { 80 | for _, url := range urls { 81 | param["url"] = url 82 | self.AddQueue(param) 83 | } 84 | } 85 | 86 | func (self *Spider) AddQueue(param map[string]interface{}) { 87 | req := self.NewRequest(param) 88 | pholcus.Self.Push(req) 89 | } 90 | 91 | // build a request 92 | // param: the full parameter list 93 | // req := &Request{ 94 | // url: param["url"].(string), //required 95 | // parent: "", //required if present 96 | // rule: param["rule"].(string), //required 97 | // spider: param["spider"].(string), //auto-filled 98 | // respType: param["respType"].(string),//defaultable 99 | // method: param["method"].(string), //defaultable 100 | // header: param["header"],//defaultable 101 | // cookies: param["cookies"].([]*http.Cookie),//defaultable 102 | // postdata: param["postdata"].(string),//defaultable 103 | // canOutsource: param["canOutsource"].(bool),//defaultable 104 | // checkRedirect: param["checkRedirect"].(func(req *http.Request, via []*http.Request) error),//defaultable 105 | // proxyHost: param["proxyHost"].(string),//defaultable 106 | // temp: param["temp"].(map[string]interface{}),//defaultable 107 | // } 108 | 109 | func (self *Spider) NewRequest(param map[string]interface{}) *context.Request { 110 | param["spider"] = self.GetName() 111 | req := context.NewRequest(param) 112 | req.SetSpiderId(self.GetId()) 113 | return req 114 | } 115 | 116 | // the crawling rule tree 117 | type RuleTree struct { 118 | Spread []string // request dispatch points when acting as a server 119 | Root func(*Spider) 120 | Nodes map[string]*Rule 121 | } 122 | 123 | // a single crawling rule 124 | type Rule struct { 125 | OutFeild []string // Note: the field list must stay consistent with the data actually output 126 | // content parse function 127 | ParseFunc func(*Spider, *context.Response) 128 | // general-purpose aid function 129 | AidFunc func(*Spider, []interface{}) interface{} 130 | } 131 | 132 | func (self *Rule) GetOutFeild() []string { 133 | return self.OutFeild 134 | } 135 | -------------------------------------------------------------------------------- /spiders/jdsearch.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // base packages 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM parsing 6 | "github.com/henrylee2cn/pholcus/downloader/context" //required 7 | "github.com/henrylee2cn/pholcus/reporter" //log output 8 | .
"github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | "regexp" 25 | "strconv" 26 | "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | var JDSearch = &Spider{ 36 | Name: "京东搜索", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AidRule("生成请求", []interface{}{[2]int{0, 1}, "生成请求"}) 43 | }, 44 | 45 | Nodes: map[string]*Rule{ 46 | 47 | "生成请求": &Rule{ 48 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 49 | self.LoopAddQueue( 50 | aid[0].([2]int), 51 | func(i int) []string { 52 | return []string{ 53 | "http://search.jd.com/Search?keyword=" + self.GetKeyword() + "&enc=utf-8&qrst=1&rt=1&stop=1&click=&psort=&page=" + strconv.Itoa(2*i+2), 54 | "http://search.jd.com/Search?keyword=" + self.GetKeyword() + "&enc=utf-8&qrst=1&rt=1&stop=1&click=&psort=&page=" + strconv.Itoa(2*i+1), 55 | } 56 | }, 57 | map[string]interface{}{ 58 | "rule": aid[1].(string), 59 | }, 60 | ) 61 | return nil 62 | }, 63 | ParseFunc: func(self *Spider, resp *context.Response) { 64 | query := resp.GetHtmlParser() 65 | 66 | total1 := query.Find("#top_pagi span.text").Text() 67 | 68 | re, _ := regexp.Compile(`[\d]+$`) 69 | total1 = re.FindString(total1) 70 | total, _ := strconv.Atoi(total1) 71 | 72 | if total > self.MaxPage { 73 | total = self.MaxPage 74 | } else if total == 0 { 75 | reporter.Log.Printf("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", self.GetName(), self.GetKeyword(), resp.GetRuleName()) 76 | return 77 | } 78 | // 调用指定规则下辅助函数 79 | self.AidRule("生成请求", []interface{}{[2]int{1, total}, "搜索结果"}) 80 | // 用指定规则解析响应流 81 | self.CallRule("搜索结果", resp) 82 | }, 83 | }, 84 | 85 | "搜索结果": &Rule{ 86 | //注意:有无字段语义和是否输出数据必须保持一致 87 | OutFeild: []string{ 88 | "标题", 89 | "价格", 90 | "评论数", 91 | "星级", 92 | "链接", 93 | }, 94 | ParseFunc: func(self *Spider, resp *context.Response) { 95 | query := resp.GetHtmlParser() 96 | 97 | query.Find("#plist .list-h:nth-child(1) > li").Each(func(i int, s *goquery.Selection) { 98 | // 获取标题 99 | a := s.Find(".p-name a") 100 | title := a.Text() 101 | 102 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 103 | // title = re.ReplaceAllStringFunc(title, strings.ToLower) 104 | title = re.ReplaceAllString(title, " ") 105 | title = strings.Trim(title, " \t\n") 106 | 107 | // 获取价格 108 | price, _ := s.Find("strong[data-price]").First().Attr("data-price") 109 | 110 | // 获取评论数 111 | e := s.Find(".extra").First() 112 | discuss := e.Find("a").First().Text() 113 | re, _ = regexp.Compile(`[\d]+`) 114 | discuss = re.FindString(discuss) 115 | 116 | // 获取星级 117 | level, _ := e.Find(".star span[id]").First().Attr("class") 118 | level = re.FindString(level) 119 | 120 | // 获取URL 121 | url, _ := a.Attr("href") 122 | 123 | // 结果存入Response中转 124 | resp.AddItem(map[string]string{ 125 | self.GetOutFeild(resp, 0): title, 126 | self.GetOutFeild(resp, 1): price, 127 | self.GetOutFeild(resp, 2): discuss, 128 | self.GetOutFeild(resp, 3): level, 129 | self.GetOutFeild(resp, 4): url, 130 | }) 131 | }) 132 | }, 133 | }, 134 | }, 135 | }, 136 | } 137 | -------------------------------------------------------------------------------- /spiders/miyabaobei.go: -------------------------------------------------------------------------------- 1 | package spiders 2 
| 3 | // base packages 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM parsing 6 | "github.com/henrylee2cn/pholcus/downloader/context" //required 7 | // "github.com/henrylee2cn/pholcus/reporter" //log output 8 | . "github.com/henrylee2cn/pholcus/spiders/spider" //required 9 | ) 10 | 11 | // header-setting packages 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // encoding packages 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // string-processing packages 23 | import ( 24 | "regexp" 25 | "strconv" 26 | "strings" 27 | ) 28 | 29 | // other packages 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | var Miyabaobei = &Spider{ 36 | Name: "蜜芽宝贝", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AddQueue(map[string]interface{}{"url": "http://www.miyabaobei.com/", "rule": "获取版块URL"}) 43 | }, 44 | 45 | Nodes: map[string]*Rule{ 46 | 47 | "获取版块URL": &Rule{ 48 | ParseFunc: func(self *Spider, resp *context.Response) { 49 | query := resp.GetHtmlParser() 50 | lis := query.Find(".ccon") 51 | lis.Each(func(i int, s *goquery.Selection) { 52 | s.Find("a").Each(func(n int, ss *goquery.Selection) { 53 | if url, ok := ss.Attr("href"); ok { 54 | if !strings.Contains(url, "http://www.miyabaobei.com") { 55 | url = "http://www.miyabaobei.com" + url 56 | } 57 | self.AidRule("生成请求", []interface{}{ 58 | [2]int{0, 1}, 59 | url, 60 | map[string]interface{}{ 61 | "rule": "生成请求", 62 | "temp": map[string]interface{}{"baseUrl": url}, 63 | }, 64 | }) 65 | } 66 | }) 67 | }) 68 | }, 69 | }, 70 | 71 | "生成请求": &Rule{ 72 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 73 | self.LoopAddQueue( 74 | aid[0].([2]int), 75 | func(i int) []string { 76 | return []string{aid[1].(string) + "&per_page=" + strconv.Itoa(i*40)} 77 | }, 78 | aid[2].(map[string]interface{}), 79 | ) 80 | return nil 81 | }, 82 | ParseFunc: func(self *Spider, resp *context.Response) { 83 | query := resp.GetHtmlParser() 84 | totalPage := "1" 85 | 86 | urls := query.Find(".Lpage.page p a") 87 | 88 | if urls.Length() != 0 { 89 | if urls.Last().Text() == ">" { 90 | totalPage = urls.Eq(urls.Length() - 2).Text() 91 | } else { 92 | totalPage = urls.Last().Text() 93 | } 94 | } 95 | total, _ := strconv.Atoi(totalPage) 96 | 97 | // invoke the aid function under the named rule 98 | self.AidRule("生成请求", []interface{}{ 99 | [2]int{1, total}, 100 | resp.GetTemp("baseUrl").(string), 101 | map[string]interface{}{ 102 | "rule": "商品列表", 103 | }, 104 | }) 105 | // parse the response with the named rule 106 | self.CallRule("商品列表", resp) 107 | }, 108 | }, 109 | 110 | "商品列表": &Rule{ 111 | // Note: the field list must stay consistent with the data actually output 112 | OutFeild: []string{ 113 | "标题", 114 | "价格", 115 | "类别", 116 | }, 117 | ParseFunc: func(self *Spider, resp *context.Response) { 118 | query := resp.GetHtmlParser() 119 | // extract the category 120 | goodsType := query.Find(".crumbs").Text() 121 | re, _ := regexp.Compile("\\s") 122 | goodsType = re.ReplaceAllString(goodsType, "") 123 | re, _ = regexp.Compile("蜜芽宝贝>") 124 | goodsType = re.ReplaceAllString(goodsType, "") 125 | query.Find(".bmfo").Each(func(i int, s *goquery.Selection) { 126 | // extract the title 127 | title, _ := s.Find("p a").First().Attr("title") 128 | 129 | // extract the price 130 | price := s.Find(".f20").Text() 131 | 132 | // stash the result in the Response for the pipeline 133 | resp.AddItem(map[string]string{ 134 | self.GetOutFeild(resp, 0): title, 135 | self.GetOutFeild(resp, 1): price, 136 | self.GetOutFeild(resp, 2): goodsType, 137 | }) 138 | }) 139 | }, 140 | }, 141 | }, 142 | }, 143 | } 144 |
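// Usage sketch (illustrative assumption — the registration step is not shown in this file):
// spiders in this package are plain *Spider values, so they can be added to the global
// list from spiderlist.go, e.g. via the dot-imported helper:
//
//	func init() {
//		SpiderList.Add(Miyabaobei) // hypothetical hook; Add appends to spider.SpiderList
//	}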
-------------------------------------------------------------------------------- /spiders/alibaba.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // base packages 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM parsing 6 | "github.com/henrylee2cn/pholcus/downloader/context" //required 7 | "github.com/henrylee2cn/pholcus/reporter" //log output 8 | . "github.com/henrylee2cn/pholcus/spiders/spider" //required 9 | ) 10 | 11 | // header-setting packages 12 | import ( 13 | "net/http" //http.Header 14 | ) 15 | 16 | // encoding packages 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // string-processing packages 23 | import ( 24 | // "regexp" 25 | "strconv" 26 | "strings" 27 | ) 28 | 29 | // other packages 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | var AlibabaProduct = &Spider{ 36 | Name: "阿里巴巴产品搜索", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AidRule("生成请求", []interface{}{[2]int{0, 1}, "生成请求"}) 43 | }, 44 | 45 | Nodes: map[string]*Rule{ 46 | 47 | "生成请求": &Rule{ 48 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 49 | keyword := EncodeString(self.GetKeyword(), "GBK") 50 | self.LoopAddQueue( 51 | aid[0].([2]int), 52 | func(i int) []string { 53 | return []string{"http://s.1688.com/selloffer/offer_search.htm?enableAsync=false&earseDirect=false&button_click=top&pageSize=60&n=y&offset=3&fromSycm=y&uniqfield=pic_tag_id&keywords=" + keyword + "&beginPage=" + strconv.Itoa(i+1)} 54 | }, 55 | map[string]interface{}{ 56 | "rule": aid[1].(string), 57 | "header": http.Header{"Content-Type": []string{"text/html", "charset=GBK"}}, 58 | }, 59 | ) 60 | return nil 61 | }, 62 | ParseFunc: func(self *Spider, resp *context.Response) { 63 | query := resp.GetHtmlParser() 64 | total1, _ := query.Find("#sm-pagination div[data-total-page]").First().Attr("data-total-page") 65 | total1 = strings.Trim(total1, " \t\n") 66 | total, _ := strconv.Atoi(total1) 67 | if total > self.MaxPage { 68 | total = self.MaxPage 69 | } else if total == 0 { 70 | reporter.Log.Printf("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", self.GetName(), self.GetKeyword(), resp.GetRuleName()) 71 | return 72 | } 73 | 74 | // invoke the aid function under the named rule 75 | self.AidRule("生成请求", []interface{}{[2]int{1, total}, "搜索结果"}) 76 | // parse the response with the named rule 77 | self.CallRule("搜索结果", resp) 78 | }, 79 | }, 80 | 81 | "搜索结果": &Rule{ 82 | // Note: the field list must stay consistent with the data actually output 83 | OutFeild: []string{ 84 | "公司", 85 | "标题", 86 | "价格", 87 | "销量", 88 | "星级", 89 | "地址", 90 | "链接", 91 | }, 92 | ParseFunc: func(self *Spider, resp *context.Response) { 93 | query := resp.GetHtmlParser() 94 | 95 | query.Find("#sm-offer-list > li").Each(func(i int, s *goquery.Selection) { 96 | 97 | // extract the company 98 | company, _ := s.Find("a.sm-offer-companyName").First().Attr("title") 99 | 100 | // extract the title 101 | t := s.Find(".sm-offer-title > a:nth-child(1)") 102 | title, _ := t.Attr("title") 103 | 104 | // extract the URL 105 | url, _ := t.Attr("href") 106 | 107 | // extract the price 108 | price := s.Find(".sm-offer-priceNum").First().Text() 109 | 110 | // extract the sales volume 111 | sales := s.Find("span.sm-offer-trade > em").First().Text() 112 | 113 | // extract the address 114 | address, _ := s.Find(".sm-offer-location").First().Attr("title") 115 | 116 | // extract the credit-age level 117 | level := s.Find("span.sm-offer-companyTag > a.sw-ui-flaticon-cxt16x16").First().Text() 118 | 119 | // stash the result in the Response for the pipeline 120 | resp.AddItem(map[string]string{ 121 | self.GetOutFeild(resp, 0): company, 122 | self.GetOutFeild(resp, 1): title, 123 |
self.GetOutFeild(resp, 2): price, 124 | self.GetOutFeild(resp, 3): sales, 125 | self.GetOutFeild(resp, 4): level, 126 | self.GetOutFeild(resp, 5): address, 127 | self.GetOutFeild(resp, 6): url, 128 | }) 129 | }) 130 | }, 131 | }, 132 | }, 133 | }, 134 | } 135 | -------------------------------------------------------------------------------- /spiders/taobaosearch.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // base packages 4 | import ( 5 | // "github.com/PuerkitoBio/goquery" //DOM parsing 6 | "github.com/henrylee2cn/pholcus/downloader/context" //required 7 | "github.com/henrylee2cn/pholcus/reporter" //log output 8 | . "github.com/henrylee2cn/pholcus/spiders/spider" //required 9 | ) 10 | 11 | // header-setting packages 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // encoding packages 17 | import ( 18 | // "encoding/xml" 19 | "encoding/json" 20 | ) 21 | 22 | // string-processing packages 23 | import ( 24 | "regexp" 25 | "strconv" 26 | "strings" 27 | ) 28 | 29 | // other packages 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | var TaobaoSearch = &Spider{ 36 | Name: "淘宝搜索", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AidRule("生成请求", []interface{}{[2]int{0, 1}, "生成请求"}) 43 | }, 44 | 45 | Nodes: map[string]*Rule{ 46 | 47 | "生成请求": &Rule{ 48 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 49 | self.LoopAddQueue( 50 | aid[0].([2]int), 51 | func(i int) []string { 52 | return []string{"http://s.taobao.com/search?_input_charset=utf-8&q=" + self.GetKeyword() + "&s=" + strconv.Itoa(i*44)} 53 | }, 54 | map[string]interface{}{ 55 | "rule": aid[1].(string), 56 | }, 57 | ) 58 | return nil 59 | }, 60 | ParseFunc: func(self *Spider, resp *context.Response) { 61 | query := resp.GetHtmlParser() 62 | src := query.Find("script").Text() 63 | if strings.Contains(src, "抱歉!没有找到与") { 64 | reporter.Log.Println("搜索结果为 0 !") 65 | return 66 | } 67 | 68 | re, _ := regexp.Compile(`(?U)"totalPage":[\d]+,`) 69 | total := re.FindString(src) 70 | re, _ = regexp.Compile(`[\d]+`) 71 | total = re.FindString(total) 72 | totalPage, _ := strconv.Atoi(total) 73 | 74 | if totalPage > self.MaxPage { 75 | totalPage = self.MaxPage 76 | } else if totalPage == 0 { 77 | reporter.Log.Printf("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", self.GetName(), self.GetKeyword(), resp.GetRuleName()) 78 | return 79 | } 80 | // invoke the aid function under the named rule 81 | self.AidRule("生成请求", []interface{}{[2]int{1, totalPage}, "搜索结果"}) 82 | // parse the response with the named rule 83 | self.CallRule("搜索结果", resp) 84 | }, 85 | }, 86 | 87 | "搜索结果": &Rule{ 88 | // Note: the field list must stay consistent with the data actually output 89 | OutFeild: []string{ 90 | "标题", 91 | "价格", 92 | "销量", 93 | "店铺", 94 | "链接", 95 | }, 96 | ParseFunc: func(self *Spider, resp *context.Response) { 97 | query := resp.GetHtmlParser() 98 | re, _ := regexp.Compile(`"auctions".*,"recommendAuctions"`) 99 | src := query.Find("script").Text() 100 | 101 | src = re.FindString(src) 102 | 103 | re, _ = regexp.Compile(`"auctions":`) 104 | src = re.ReplaceAllString(src, "") 105 | 106 | re, _ = regexp.Compile(`,"recommendAuctions"`) 107 | src = re.ReplaceAllString(src, "") 108 | 109 | re, _ = regexp.Compile("\\<[\\S\\s]+?\\>") 110 | // src = re.ReplaceAllStringFunc(src, strings.ToLower) 111 | src = re.ReplaceAllString(src, " ") 112 | 113 | src = strings.Trim(src, " \t\n") 114 | 115 | infos := []map[string]interface{}{} 116 | 117 | err := json.Unmarshal([]byte(src), &infos) 118 | 119 | if err != nil { 120 | reporter.Log.Printf("error
is %v\n", err) 121 | return 122 | } else { 123 | for _, info := range infos { 124 | 125 | // 结果存入Response中转 126 | resp.AddItem(map[string]string{ 127 | self.GetOutFeild(resp, 0): info["raw_title"].(string), 128 | self.GetOutFeild(resp, 1): info["view_price"].(string), 129 | self.GetOutFeild(resp, 2): info["view_sales"].(string), 130 | self.GetOutFeild(resp, 3): info["nick"].(string), 131 | self.GetOutFeild(resp, 4): info["detail_url"].(string), 132 | }) 133 | } 134 | } 135 | }, 136 | }, 137 | }, 138 | }, 139 | } 140 | -------------------------------------------------------------------------------- /spiders/spider/spider.go: -------------------------------------------------------------------------------- 1 | // 蜘蛛,采集规则。 2 | package spider 3 | 4 | import ( 5 | "github.com/henrylee2cn/pholcus/downloader/context" 6 | "github.com/henrylee2cn/pholcus/pholcus" 7 | // "github.com/henrylee2cn/pholcus/pholcus/status" 8 | ) 9 | 10 | type Spider struct { 11 | Name string 12 | Pausetime [2]uint //暂停区间Pausetime[0]~Pausetime[0]+Pausetime[1] 13 | *RuleTree 14 | // *SpiderStatus 15 | //以下为可选成员 16 | MaxPage int 17 | Keyword string 18 | Depth int 19 | Id int //所在SpiderList的下标编号 20 | } 21 | 22 | // func NewSpider() *Spider { 23 | // sp := new(Spider) 24 | // sp.RuleTree = &RuleTree{ 25 | // Nodes: make(map[string]*Rule), 26 | // } 27 | // return sp 28 | // } 29 | 30 | func (self *Spider) Start(sp *Spider) { 31 | sp.RuleTree.Root(sp) 32 | } 33 | 34 | func (self *Spider) GetName() string { 35 | return self.Name 36 | } 37 | 38 | func (self *Spider) GetId() int { 39 | return self.Id 40 | } 41 | 42 | func (self *Spider) GetKeyword() string { 43 | return self.Keyword 44 | } 45 | 46 | func (self *Spider) GetRules() map[string]*Rule { 47 | return self.RuleTree.Nodes 48 | } 49 | 50 | // 根据响应流运行指定解析规则 51 | func (self *Spider) GoRule(resp *context.Response) { 52 | self.RuleTree.Nodes[resp.GetRuleName()].ParseFunc(self, resp) 53 | } 54 | 55 | // 用指定规则解析响应流 56 | func (self *Spider) CallRule(ruleName string, resp *context.Response) { 57 | resp.SetRuleName(ruleName) 58 | self.GoRule(resp) 59 | } 60 | 61 | // 调用指定规则下辅助函数 62 | func (self *Spider) AidRule(ruleName string, aid []interface{}) interface{} { 63 | rule := self.RuleTree.Nodes[ruleName] 64 | return rule.AidFunc(self, aid) 65 | } 66 | 67 | // 获取任务规则采集语义字段 68 | func (self *Spider) GetOutFeild(resp *context.Response, index int) string { 69 | return self.RuleTree.Nodes[resp.GetRuleName()].OutFeild[index] 70 | } 71 | 72 | // 获取任意规则采集语义字段 73 | func (self *Spider) ShowOutFeild(ruleName string, index int) string { 74 | return self.RuleTree.Nodes[ruleName].OutFeild[index] 75 | } 76 | 77 | func (self *Spider) LoopAddQueue(loop [2]int, urlFn func(int) []string, param map[string]interface{}) { 78 | for ; loop[0] < loop[1]; loop[0]++ { 79 | urls := urlFn(loop[0]) 80 | self.BulkAddQueue(urls, param) 81 | } 82 | } 83 | 84 | func (self *Spider) BulkAddQueue(urls []string, param map[string]interface{}) { 85 | for _, url := range urls { 86 | param["url"] = url 87 | self.AddQueue(param) 88 | } 89 | } 90 | 91 | func (self *Spider) AddQueue(param map[string]interface{}) { 92 | req := self.NewRequest(param) 93 | pholcus.Self.Push(req) 94 | } 95 | 96 | // 生成请求 97 | // param全部参数列表 98 | // req := &Request{ 99 | // url: param["url"].(string), //必填 100 | // parent: "", //若有必填 101 | // rule: param["rule"].(string), //必填 102 | // spider: param["spider"].(string), //自动填写 103 | // respType: param["respType"].(string),//可默认 104 | // method: param["method"].(string), //可默认 105 | // header: 
param["header"],//可默认 106 | // cookies: param["cookies"].([]*http.Cookie),//可默认 107 | // postdata: param["postdata"].(string),//可默认 108 | // canOutsource: param["canOutsource"].(bool),//可默认 109 | // checkRedirect: param["checkRedirect"].(func(req *http.Request, via []*http.Request) error),//可默认 110 | // proxyHost: param["proxyHost"].(string),//可默认 111 | // temp: param["temp"].(map[string]interface{}),//可默认 112 | // } 113 | 114 | func (self *Spider) NewRequest(param map[string]interface{}) *context.Request { 115 | param["spider"] = self.GetName() 116 | req := context.NewRequest(param) 117 | req.SetSpiderId(self.GetId()) 118 | return req 119 | } 120 | 121 | //采集规则树 122 | type RuleTree struct { 123 | Spread []string //作为服务器时的请求分发点 124 | Root func(*Spider) 125 | Nodes map[string]*Rule 126 | } 127 | 128 | // 采集规则单元 129 | type Rule struct { 130 | OutFeild []string //注意:有无字段语义和是否输出数据必须保持一致 131 | // 内容解析函数 132 | ParseFunc func(*Spider, *context.Response) 133 | // 通用辅助函数 134 | AidFunc func(*Spider, []interface{}) interface{} 135 | } 136 | 137 | func (self *Rule) GetOutFeild() []string { 138 | return self.OutFeild 139 | } 140 | -------------------------------------------------------------------------------- /spiders/googlesearch.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // 基础包 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/downloader/context" //必需 7 | "github.com/henrylee2cn/pholcus/reporter" //信息输出 8 | . "github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | "regexp" 25 | "strconv" 26 | "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | "math" 33 | ) 34 | 35 | var googleIp = []string{ 36 | "210.242.125.100", 37 | "210.242.125.96", 38 | "210.242.125.91", 39 | "210.242.125.95", 40 | "64.233.189.163", 41 | "58.123.102.5", 42 | "210.242.125.97", 43 | "210.242.125.115", 44 | "58.123.102.28", 45 | "210.242.125.70", 46 | } 47 | 48 | var GoogleSearch = &Spider{ 49 | Name: "谷歌搜索", 50 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 51 | // Optional: &Optional{}, 52 | RuleTree: &RuleTree{ 53 | // Spread: []string{}, 54 | Root: func(self *Spider) { 55 | var url string 56 | var success bool 57 | reporter.Log.Println("正在查找可用的Google镜像,该过程可能需要几分钟……") 58 | for _, ip := range googleIp { 59 | url = "http://" + ip + "/search?q=" + self.GetKeyword() + "&newwindow=1&biw=1600&bih=398&start=" 60 | if _, err := goquery.NewDocument(url); err == nil { 61 | success = true 62 | break 63 | } 64 | } 65 | if !success { 66 | reporter.Log.Println("没有可用的Google镜像IP!!") 67 | return 68 | } 69 | reporter.Log.Println("开始Google搜索……") 70 | self.AddQueue(map[string]interface{}{ 71 | "url": url, 72 | "rule": "获取总页数", 73 | "temp": map[string]interface{}{ 74 | "baseUrl": url, 75 | }, 76 | }) 77 | }, 78 | 79 | Nodes: map[string]*Rule{ 80 | 81 | "获取总页数": &Rule{ 82 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 83 | self.LoopAddQueue( 84 | aid[0].([2]int), 85 | func(i int) []string { 86 | return []string{aid[1].(string) + strconv.Itoa(10*i)} 87 | }, 88 | aid[2].(map[string]interface{}), 89 | ) 90 | return nil 91 | }, 92 | ParseFunc: func(self *Spider, resp *context.Response) { 93 | query := resp.GetHtmlParser() 94 | txt := query.Find("#resultStats").Text() 95 | reporter.Log.Println("总页数txt:", txt) 
96 | re, _ := regexp.Compile(`,+`) 97 | txt = re.ReplaceAllString(txt, "") 98 | re, _ = regexp.Compile(`[\d]+`) 99 | txt = re.FindString(txt) 100 | num, _ := strconv.Atoi(txt) 101 | reporter.Log.Println("总页数:", num) 102 | total := int(math.Ceil(float64(num) / 10)) 103 | if total > self.MaxPage { 104 | total = self.MaxPage 105 | } else if total == 0 { 106 | reporter.Log.Printf("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", self.GetName(), self.GetKeyword(), resp.GetRuleName()) 107 | return 108 | } 109 | // invoke the aid function under the named rule 110 | self.AidRule("获取总页数", 111 | []interface{}{ 112 | [2]int{1, total}, 113 | resp.GetTemp("baseUrl"), 114 | map[string]interface{}{ 115 | "rule": "搜索结果", 116 | }, 117 | }) 118 | // parse the response with the named rule 119 | self.CallRule("搜索结果", resp) 120 | }, 121 | }, 122 | 123 | "搜索结果": &Rule{ 124 | // Note: the field list must stay consistent with the data actually output 125 | OutFeild: []string{ 126 | "标题", 127 | "内容", 128 | "链接", 129 | }, 130 | ParseFunc: func(self *Spider, resp *context.Response) { 131 | query := resp.GetHtmlParser() 132 | query.Find("#ires li.g").Each(func(i int, s *goquery.Selection) { 133 | t := s.Find(".r > a") 134 | href, _ := t.Attr("href") 135 | href = strings.TrimPrefix(href, "/url?q=") 136 | title := t.Text() 137 | content := s.Find(".st").Text() 138 | resp.AddItem(map[string]string{ 139 | self.GetOutFeild(resp, 0): title, 140 | self.GetOutFeild(resp, 1): content, 141 | self.GetOutFeild(resp, 2): href, 142 | }) 143 | }) 144 | }, 145 | }, 146 | }, 147 | }, 148 | } 149 | -------------------------------------------------------------------------------- /downloader/context/response.go: -------------------------------------------------------------------------------- 1 | package context 2 | 3 | import ( 4 | "github.com/PuerkitoBio/goquery" 5 | "github.com/bitly/go-simplejson" 6 | "github.com/henrylee2cn/pholcus/reporter" 7 | "net/http" 8 | "strings" 9 | ) 10 | 11 | // Response represents an entity to be crawled. 12 | type Response struct { 13 | // isfail is true when the crawl failed, and errormsg holds the failure reason. 14 | isfail bool 15 | 16 | errormsg string 17 | 18 | // The request crawled by the spider, containing the url and relevant information. 19 | *Request 20 | 21 | // The body is the plain text of the crawl result. 22 | body string 23 | 24 | header http.Header 25 | cookies []*http.Cookie 26 | 27 | // The docParser is a pointer to the goquery object holding the html result. 28 | docParser *goquery.Document 29 | 30 | // The jsonMap is the json result. 31 | jsonMap *simplejson.Json 32 | 33 | // The items slice is the container of parsed results. 34 | items []map[string]string 35 | } 36 | 37 | // NewResponse returns an initialized Response object. 38 | func NewResponse(req *Request) *Response { 39 | return &Response{Request: req, items: []map[string]string{}} 40 | } 41 | 42 | // SetHeader saves the header of the http response 43 | func (self *Response) SetHeader(header http.Header) { 44 | self.header = header 45 | } 46 | 47 | // GetHeader returns the header of the http response 48 | func (self *Response) GetHeader() http.Header { 49 | return self.header 50 | } 51 | 52 | // SetCookies saves the cookies of the http response 53 | func (self *Response) SetCookies(cookies []*http.Cookie) { 54 | self.cookies = cookies 55 | } 56 | 57 | // GetCookies returns the cookies of the http response 58 | func (self *Response) GetCookies() []*http.Cookie { 59 | return self.cookies 60 | } 61 | 62 | // IsSucc reports whether the download succeeded.
63 | func (self *Response) IsSucc() bool { 64 | return !self.isfail 65 | } 66 | 67 | // Errormsg shows the download error message. 68 | func (self *Response) Errormsg() string { 69 | return self.errormsg 70 | } 71 | 72 | // SetStatus saves status info about the download process. 73 | func (self *Response) SetStatus(isfail bool, errormsg string) { 74 | self.isfail = isfail 75 | self.errormsg = errormsg 76 | } 77 | 78 | // AddItem appends a parsed key-value map to the response items for the Pipeline 79 | func (self *Response) AddItem(data map[string]string) { 80 | self.items = append(self.items, data) 81 | } 82 | 83 | func (self *Response) GetItem(idx int) map[string]string { 84 | return self.items[idx] 85 | } 86 | 87 | func (self *Response) GetItems() []map[string]string { 88 | return self.items 89 | } 90 | 91 | // SetRequest saves the request object of this page. 92 | func (self *Response) SetRequest(r *Request) *Response { 93 | self.Request = r 94 | return self 95 | } 96 | 97 | // GetRequest returns the request object of this page. 98 | func (self *Response) GetRequest() *Request { 99 | return self.Request 100 | } 101 | 102 | // SetBodyStr saves the crawled plain string in the Response. 103 | func (self *Response) SetBodyStr(body string) *Response { 104 | self.body = body 105 | return self 106 | } 107 | 108 | // GetBodyStr returns the crawled plain string. 109 | func (self *Response) GetBodyStr() string { 110 | return self.body 111 | } 112 | 113 | // SetHtmlParser saves the goquery object bound to the crawl result. 114 | func (self *Response) SetHtmlParser(doc *goquery.Document) *Response { 115 | self.docParser = doc 116 | return self 117 | } 118 | 119 | // GetHtmlParser returns the goquery object bound to the crawl result. 120 | func (self *Response) GetHtmlParser() *goquery.Document { 121 | return self.docParser 122 | } 123 | 124 | // ResetHtmlParser rebuilds the goquery object from the stored body string. 125 | func (self *Response) ResetHtmlParser() *goquery.Document { 126 | r := strings.NewReader(self.body) 127 | var err error 128 | self.docParser, err = goquery.NewDocumentFromReader(r) 129 | if err != nil { 130 | reporter.Log.Println(err.Error()) 131 | panic(err.Error()) 132 | } 133 | return self.docParser 134 | } 135 | 136 | // SetJson saves the json result. 137 | func (self *Response) SetJson(js *simplejson.Json) *Response { 138 | self.jsonMap = js 139 | return self 140 | } 141 | 142 | // GetJson returns the json result.
143 | func (self *Response) GetJson() *simplejson.Json { 144 | return self.jsonMap 145 | } 146 | -------------------------------------------------------------------------------- /pholcus/crawler/crawl.go: -------------------------------------------------------------------------------- 1 | package crawler 2 | 3 | import ( 4 | // "fmt" 5 | "github.com/henrylee2cn/pholcus/config" 6 | "github.com/henrylee2cn/pholcus/downloader" 7 | "github.com/henrylee2cn/pholcus/downloader/context" 8 | "github.com/henrylee2cn/pholcus/pipeline" 9 | "github.com/henrylee2cn/pholcus/reporter" 10 | "github.com/henrylee2cn/pholcus/scheduler" 11 | "github.com/henrylee2cn/pholcus/spiders/spider" 12 | "math/rand" 13 | "sync" 14 | "time" 15 | ) 16 | 17 | type crawler struct { 18 | *spider.Spider 19 | downloader.Downloader 20 | pipeline.Pipeline 21 | srcManage [2]uint 22 | } 23 | 24 | func New() Crawler { 25 | return &crawler{ 26 | Pipeline: pipeline.New(), 27 | Downloader: downloader.NewHttpDownloader(), 28 | srcManage: [2]uint{}, 29 | } 30 | } 31 | 32 | func (self *crawler) Init(sp *spider.Spider) Crawler { 33 | self.Pipeline.Init(sp) 34 | self.Spider = sp 35 | self.Downloader = downloader.NewHttpDownloader() 36 | self.srcManage = [2]uint{} 37 | return self 38 | } 39 | 40 | // entry point for task execution 41 | func (self *crawler) Start() { 42 | // start the output-management goroutine in advance 43 | self.Pipeline.Start() 44 | 45 | // start running 46 | self.Spider.Start(self.Spider) 47 | self.Run() 48 | // reporter.Log.Println("************** breakpoint 8 ***********") 49 | // tell the output module to flush un-output data 50 | self.Pipeline.CtrlR() 51 | // reporter.Log.Println("************** breakpoint 11 ***********") 52 | } 53 | 54 | func (self *crawler) Run() { 55 | for { 56 | // pop one request from the queue 57 | req := self.GetOne() 58 | 59 | // handle queue exit and empty requests 60 | if req == nil { 61 | if self.canStop() { 62 | // reporter.Log.Println("************** leaving the queue ************") 63 | break 64 | } else { 65 | time.Sleep(500 * time.Millisecond) 66 | continue 67 | } 68 | } 69 | 70 | // track this crawler's resource usage 71 | self.RequestIn() 72 | 73 | // global count of downloaded pages 74 | config.ReqSum++ 75 | 76 | go func(req *context.Request) { 77 | defer func() { 78 | self.FreeOne() 79 | self.RequestOut() 80 | }() 81 | reporter.Log.Println("start crawl :", req.GetUrl()) 82 | self.Process(req) 83 | }(req) 84 | } 85 | } 86 | 87 | // core processor 88 | func (self *crawler) Process(req *context.Request) { 89 | // declare the response 90 | var resp *context.Response 91 | 92 | defer func() { 93 | if err := recover(); err != nil { // do not affect others 94 | if strerr, ok := err.(string); ok { 95 | reporter.Log.Println(strerr) 96 | } else { 97 | reporter.Log.Println("Process error:", err) 98 | } 99 | } 100 | }() 101 | // reporter.Log.Println("************** breakpoint 1 ***********") 102 | // download the page, retrying up to 3 times on failure 103 | for i := 0; i < 3; i++ { 104 | self.sleep() 105 | resp = self.Downloader.Download(req) 106 | if resp.IsSucc() { 107 | break 108 | } 109 | } 110 | // reporter.Log.Println("************** breakpoint 2 ***********") 111 | if !resp.IsSucc() { // nothing to process if it still failed 112 | return 113 | } 114 | // reporter.Log.Println("************** breakpoint 3 ***********") 115 | // process the page and distill the data 116 | self.Spider.GoRule(resp) 117 | // reporter.Log.Println("************** breakpoint 5 ***********") 118 | // store this request's results in the pipeline 119 | datas := resp.GetItems() 120 | for i, count := 0, len(datas); i < count; i++ { 121 | self.Pipeline.Collect( 122 | resp.GetRuleName(), //DataCell.RuleName 123 | datas[i], //DataCell.Data 124 | resp.GetUrl(), //DataCell.Url 125 | resp.GetParent(), //DataCell.ParentUrl 126 | time.Now().Format("2006-01-02 15:04:05"), 127 |
) 128 | } 129 | // reporter.Log.Println("************** breakpoint end ***********") 130 | } 131 | 132 | // common helper methods 133 | func (self *crawler) sleep() { 134 | // rand.Intn panics on a zero argument, so only add the random span when Pausetime[1] > 0 135 | sleeptime := int(self.Spider.Pausetime[0]) 136 | if self.Spider.Pausetime[1] > 0 { 137 | sleeptime += rand.Intn(int(self.Spider.Pausetime[1])) 138 | } 139 | time.Sleep(time.Duration(sleeptime) * time.Millisecond) 140 | } 141 | 142 | // fetch one request from the scheduler 143 | func (self *crawler) GetOne() *context.Request { 144 | return scheduler.Self.Use(self.Spider.GetId()) 145 | } 146 | 147 | // free one resource slot in the scheduler 148 | func (self *crawler) FreeOne() { 149 | scheduler.Self.Free() 150 | } 151 | 152 | func (self *crawler) RequestIn() { 153 | self.srcManage[0]++ 154 | } 155 | 156 | var requestOutMutex sync.Mutex 157 | 158 | func (self *crawler) RequestOut() { 159 | requestOutMutex.Lock() 160 | defer func() { 161 | requestOutMutex.Unlock() 162 | }() 163 | self.srcManage[1]++ 164 | } 165 | 166 | // report whether the scheduler still holds resources belonging to this crawler 167 | func (self *crawler) canStop() bool { 168 | // reporter.Log.Println("**************", self.srcManage[0], self.srcManage[1], "***********") 169 | 170 | return self.srcManage[0] == self.srcManage[1] && scheduler.Self.IsEmpty(self.Spider.GetId()) 171 | } 172 | -------------------------------------------------------------------------------- /spiders/hollandandbarrett.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // base packages 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM parsing 6 | "github.com/henrylee2cn/pholcus/downloader/context" //required 7 | "github.com/henrylee2cn/pholcus/reporter" //log output 8 | . "github.com/henrylee2cn/pholcus/spiders/spider" //required 9 | ) 10 | 11 | // header-setting packages 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // encoding packages 17 | import ( 18 | // "encoding/xml" 19 | "encoding/json" 20 | ) 21 | 22 | // string-processing packages 23 | import ( 24 | "regexp" 25 | // "strconv" 26 | "strings" 27 | ) 28 | 29 | // other packages 30 | import ( 31 | "fmt" 32 | // "math" 33 | ) 34 | 35 | var Hollandandbarrett = &Spider{ 36 | Name: "Hollandandbarrett", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AddQueue( 43 | map[string]interface{}{ 44 | "url": "http://www.hollandandbarrett.com/", 45 | "rule": "获取版块URL", 46 | }, 47 | ) 48 | }, 49 | 50 | Nodes: map[string]*Rule{ 51 | 52 | "获取版块URL": &Rule{ 53 | ParseFunc: func(self *Spider, resp *context.Response) { 54 | query := resp.GetHtmlParser() 55 | lis := query.Find(".footer-links nav.l-one-half a") 56 | 57 | lis.Each(func(i int, s *goquery.Selection) { 58 | if url, ok := s.Attr("href"); ok { 59 | tit, _ := s.Attr("title") 60 | self.AddQueue( 61 | map[string]interface{}{ 62 | "url": "http://www.hollandandbarrett.com" + url + "?showAll=1&pageHa=1&es=true&vm=grid&imd=true&format=json&single=true", 63 | "rule": "获取总数", 64 | "temp": map[string]interface{}{ 65 | "type": tit, 66 | "baseUrl": url, 67 | }, 68 | }, 69 | ) 70 | } 71 | }) 72 | }, 73 | }, 74 | 75 | "获取总数": &Rule{ 76 | ParseFunc: func(self *Spider, resp *context.Response) { 77 | 78 | query := resp.GetHtmlParser() 79 | 80 | re, _ := regexp.Compile(`(?U)"totalNumRecs":[\d]+,`) 81 | total := re.FindString(query.Text()) 82 | re, _ = regexp.Compile(`[\d]+`) 83 | total = re.FindString(total) 84 | total = strings.Trim(total, " \t\n") 85 | 86 | if total == "0" { 87 | reporter.Log.Printf("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", self.GetName(), self.GetKeyword(), resp.GetRuleName()) 88 | } else { 89 | 90 | self.AddQueue( 91 | map[string]interface{}{ 92 |
"url": "http://www.hollandandbarrett.com" + resp.GetTemp("baseUrl").(string) + "?showAll=" + total + "&pageHa=1&es=true&vm=grid&imd=true&format=json&single=true", 93 | "rule": "商品详情", 94 | "temp": map[string]interface{}{ 95 | "type": resp.GetTemp("type").(string), 96 | }, 97 | }, 98 | ) 99 | 100 | } 101 | }, 102 | }, 103 | 104 | "商品详情": &Rule{ 105 | //注意:有无字段语义和是否输出数据必须保持一致 106 | OutFeild: []string{ 107 | "标题", 108 | "原价", 109 | "折后价", 110 | "打折", 111 | "星级", 112 | "分类", 113 | }, 114 | ParseFunc: func(self *Spider, resp *context.Response) { 115 | query := resp.GetHtmlParser() 116 | 117 | src := query.Text() 118 | 119 | infos := map[string]interface{}{} 120 | 121 | err := json.Unmarshal([]byte(src), &infos) 122 | 123 | if err != nil { 124 | reporter.Log.Printf("error is %v\n", err) 125 | return 126 | } else { 127 | for _, info1 := range infos["contents"].([]interface{})[0].(map[string]interface{})["mainContent"].([]interface{})[0].(map[string]interface{})["records"].([]interface{}) { 128 | 129 | info2 := info1.(map[string]interface{})["records"].([]interface{})[0].(map[string]interface{})["attributes"].(map[string]interface{}) 130 | 131 | var n, price1, price2, prm, level string 132 | 133 | if info2["Name"] == nil { 134 | n = "" 135 | } else { 136 | n = fmt.Sprint(info2["Name"]) 137 | n = strings.TrimRight(n, "]") 138 | n = strings.TrimLeft(n, "[") 139 | } 140 | 141 | if info2["lp"] == nil { 142 | price1 = "" 143 | } else { 144 | price1 = fmt.Sprint(info2["lp"]) 145 | price1 = strings.TrimRight(price1, "]") 146 | price1 = strings.TrimLeft(price1, "[") 147 | } 148 | 149 | if info2["sp"] == nil { 150 | price2 = "" 151 | } else { 152 | price2 = fmt.Sprint(info2["sp"]) 153 | price2 = strings.TrimRight(price2, "]") 154 | price2 = strings.TrimLeft(price2, "[") 155 | } 156 | 157 | if info2["prm"] == nil { 158 | prm = "" 159 | } else { 160 | prm = fmt.Sprint(info2["prm"]) 161 | prm = strings.TrimRight(prm, "]") 162 | prm = strings.TrimLeft(prm, "[") 163 | } 164 | 165 | if info2["ratingCount"] == nil { 166 | level = "0" 167 | } else { 168 | level = fmt.Sprint(info2["ratingCount"]) 169 | level = strings.TrimRight(level, "]") 170 | level = strings.TrimLeft(level, "[") 171 | } 172 | 173 | // 结果存入Response中转 174 | resp.AddItem(map[string]string{ 175 | self.GetOutFeild(resp, 0): n, 176 | self.GetOutFeild(resp, 1): price1, 177 | self.GetOutFeild(resp, 2): price2, 178 | self.GetOutFeild(resp, 3): prm, 179 | self.GetOutFeild(resp, 4): level, 180 | self.GetOutFeild(resp, 5): resp.GetTemp("type").(string), 181 | }) 182 | } 183 | } 184 | }, 185 | }, 186 | }, 187 | }, 188 | } 189 | -------------------------------------------------------------------------------- /pipeline/collector/output_lib.go: -------------------------------------------------------------------------------- 1 | //数据输出 2 | package collector 3 | 4 | import ( 5 | "github.com/tealeg/xlsx" 6 | "gopkg.in/mgo.v2" 7 | // "gopkg.in/mgo.v2/bson" 8 | "encoding/csv" 9 | "github.com/henrylee2cn/pholcus/config" 10 | "github.com/henrylee2cn/pholcus/reporter" 11 | "os" 12 | "strconv" 13 | "strings" 14 | // "time" 15 | ) 16 | 17 | /************************ excel 输出 ***************************/ 18 | func (self *Collector) excel(dataIndex int) { 19 | defer func() { 20 | if err := recover(); err != nil { 21 | reporter.Log.Println(err) 22 | } 23 | }() 24 | 25 | var file *xlsx.File 26 | var sheet *xlsx.Sheet 27 | var row *xlsx.Row 28 | var cell *xlsx.Cell 29 | var err error 30 | 31 | folder1 := "data" 32 | _folder2 := 
strings.Split(config.StartTime.Format("2006-01-02 15:04:05"), ":") 33 | folder2 := _folder2[0] + "时" + _folder2[1] + "分" + _folder2[2] + "秒" 34 | folder2 = folder1 + "/" + folder2 35 | filename := folder2 + "/" + self.Spider.GetName() + "_" + self.Spider.GetKeyword() + " " + strconv.Itoa(self.sum[0]) + "-" + strconv.Itoa(self.sum[1]) + ".xlsx" 36 | 37 | file = xlsx.NewFile() 38 | 39 | // add one worksheet per rule 40 | for Name, Rule := range self.GetRules() { 41 | // skip rules that output nothing 42 | if len(Rule.GetOutFeild()) == 0 { 43 | continue 44 | } 45 | 46 | sheet = file.AddSheet(Name) 47 | row = sheet.AddRow() 48 | for _, title := range Rule.GetOutFeild() { 49 | cell = row.AddCell() 50 | cell.Value = title 51 | } 52 | cell = row.AddCell() 53 | cell.Value = "当前链接" 54 | cell = row.AddCell() 55 | cell.Value = "上级链接" 56 | cell = row.AddCell() 57 | cell.Value = "下载时间" 58 | 59 | num := 0 //subtotal 60 | for _, datacell := range self.DockerQueue.Dockers[dataIndex] { 61 | if datacell["RuleName"].(string) == Name { 62 | row = sheet.AddRow() 63 | for _, title := range Rule.GetOutFeild() { 64 | cell = row.AddCell() 65 | cell.Value = datacell["Data"].(map[string]string)[title] 66 | } 67 | cell = row.AddCell() 68 | cell.Value = datacell["Url"].(string) 69 | cell = row.AddCell() 70 | cell.Value = datacell["ParentUrl"].(string) 71 | cell = row.AddCell() 72 | cell.Value = datacell["DownloadTime"].(string) 73 | num++ 74 | } 75 | } 76 | 77 | reporter.Log.Printf("[任务:%v | 关键词:%v | 小类:%v] 输出 %v 条数据!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), Name, num) 78 | 79 | } 80 | 81 | // create/open the directory 82 | // f1, err := os.Stat(folder1) 83 | // if err != nil || !f1.IsDir() { 84 | // os.Mkdir(folder1, 0) 85 | // } 86 | 87 | // f2, err := os.Stat(folder2) 88 | // if err != nil || !f2.IsDir() { 89 | // os.Mkdir(folder2, 0) 90 | // } 91 | 92 | f2, err := os.Stat(folder2) 93 | if err != nil || !f2.IsDir() { 94 | if err := os.MkdirAll(folder2, 0777); err != nil { 95 | reporter.Log.Printf("Error: %v\n", err) 96 | } 97 | } 98 | 99 | // save the file 100 | err = file.Save(filename) 101 | 102 | if err != nil { 103 | reporter.Log.Println(err) 104 | } 105 | 106 | } 107 | 108 | /************************ CSV output ***************************/ 109 | func (self *Collector) csv(dataIndex int) { 110 | defer func() { 111 | if err := recover(); err != nil { 112 | reporter.Log.Println(err) 113 | } 114 | }() 115 | 116 | folder1 := "data" 117 | _folder2 := strings.Split(config.StartTime.Format("2006-01-02 15:04:05"), ":") 118 | folder2 := _folder2[0] + "时" + _folder2[1] + "分" + _folder2[2] + "秒" 119 | folder2 = folder1 + "/" + folder2 120 | filenameBase := folder2 + "/" + self.Spider.GetName() + "_" + self.Spider.GetKeyword() + " " + strconv.Itoa(self.sum[0]) + "-" + strconv.Itoa(self.sum[1]) 121 | 122 | // create/open the directory 123 | f2, err := os.Stat(folder2) 124 | if err != nil || !f2.IsDir() { 125 | if err := os.MkdirAll(folder2, 0777); err != nil { 126 | reporter.Log.Printf("Error: %v\n", err) 127 | } 128 | } 129 | 130 | // write one CSV file per rule 131 | for Name, Rule := range self.GetRules() { 132 | // skip rules that output nothing 133 | if len(Rule.GetOutFeild()) == 0 { 134 | continue 135 | } 136 | 137 | file, err := os.Create(filenameBase + " (" + Name + ").csv") 138 | 139 | if err != nil { 140 | reporter.Log.Println(err) 141 | continue 142 | } 143 | 144 | // file.WriteString("\xEF\xBB\xBF") // write a UTF-8 BOM 145 | w := csv.NewWriter(file) 146 | th := Rule.GetOutFeild() 147 | th = append(th, []string{"当前链接", "上级链接", "下载时间"}...)
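// th now holds the rule's output fields plus the three bookkeeping columns
// appended above; the call below writes it as the CSV header row.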
148 | w.Write(th) 149 | 150 | num := 0 //subtotal 151 | for _, datacell := range self.DockerQueue.Dockers[dataIndex] { 152 | if datacell["RuleName"].(string) == Name { 153 | row := []string{} 154 | for _, title := range Rule.GetOutFeild() { 155 | row = append(row, datacell["Data"].(map[string]string)[title]) 156 | } 157 | 158 | row = append(row, datacell["Url"].(string)) 159 | row = append(row, datacell["ParentUrl"].(string)) 160 | row = append(row, datacell["DownloadTime"].(string)) 161 | w.Write(row) 162 | 163 | num++ 164 | } 165 | } 166 | // flush the buffered rows 167 | w.Flush() 168 | // close the file 169 | file.Close() 170 | // log the report 171 | reporter.Log.Printf("[任务:%v | 关键词:%v | 小类:%v] 输出 %v 条数据!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), Name, num) 172 | } 173 | } 174 | 175 | /************************ MongoDB output ***************************/ 176 | 177 | func (self *Collector) mgo(dataIndex int) { 178 | session, err := mgo.Dial(config.DB_URL) // connect to the database 179 | if err != nil { 180 | panic(err) 181 | } 182 | defer session.Close() 183 | session.SetMode(mgo.Monotonic, true) 184 | 185 | db := session.DB(config.DB_NAME) // the database name 186 | collection := db.C(config.DB_COLLECTION) // returns the collection directly if it already exists 187 | 188 | for i, count := 0, len(self.DockerQueue.Dockers[dataIndex]); i < count; i++ { 189 | err = collection.Insert((interface{})(self.DockerQueue.Dockers[dataIndex][i])) 190 | if err != nil { 191 | panic(err) 192 | } 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /spiders/baidunews.go: -------------------------------------------------------------------------------- 1 | package spiders 2 | 3 | // base packages 4 | import ( 5 | "github.com/PuerkitoBio/goquery" //DOM parsing 6 | "github.com/henrylee2cn/pholcus/downloader/context" //required 7 | "github.com/henrylee2cn/pholcus/reporter" //log output 8 | .
"github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/json" 19 | "encoding/xml" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | "regexp" 25 | // "strconv" 26 | "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | // "math" 33 | "time" 34 | ) 35 | 36 | var rss_BaiduNews = NewRSS(map[string]string{ 37 | "国内最新": "http://news.baidu.com/n?cmd=4&class=civilnews&tn=rss", 38 | "国际最新": "http://news.baidu.com/n?cmd=4&class=internews&tn=rss", 39 | "军事最新": "http://news.baidu.com/n?cmd=4&class=mil&tn=rss", 40 | "财经最新": "http://news.baidu.com/n?cmd=4&class=finannews&tn=rss", 41 | "互联网最新": "http://news.baidu.com/n?cmd=4&class=internet&tn=rss", 42 | "房产最新": "http://news.baidu.com/n?cmd=4&class=housenews&tn=rss", 43 | "汽车最新": "http://news.baidu.com/n?cmd=4&class=autonews&tn=rss", 44 | "体育最新": "http://news.baidu.com/n?cmd=4&class=sportnews&tn=rss", 45 | "娱乐最新": "http://news.baidu.com/n?cmd=4&class=enternews&tn=rss", 46 | "游戏最新": "http://news.baidu.com/n?cmd=4&class=gamenews&tn=rss", 47 | "教育最新": "http://news.baidu.com/n?cmd=4&class=edunews&tn=rss", 48 | "女人最新": "http://news.baidu.com/n?cmd=4&class=healthnews&tn=rss", 49 | "科技最新": "http://news.baidu.com/n?cmd=4&class=technnews&tn=rss", 50 | "社会最新": "http://news.baidu.com/n?cmd=4&class=socianews&tn=rss", 51 | }, 52 | []int{1, 2, 3, 4, 5, 6}, 53 | ) 54 | 55 | type BaiduNewsData struct { 56 | Item []BaiduNewsItem `xml:"item"` 57 | } 58 | 59 | type BaiduNewsItem struct { 60 | Title string `xml:"title"` 61 | Link string `xml:"link"` 62 | Description string `xml:"description"` 63 | PubDate string `xml:"pubDate"` 64 | Author string `xml:"author"` 65 | } 66 | 67 | var BaiduNews = &Spider{ 68 | Name: "百度RSS新闻", 69 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 70 | // Optional: &Optional{}, 71 | RuleTree: &RuleTree{ 72 | // Spread: []string{}, 73 | Root: func(self *Spider) { 74 | for k, _ := range rss_BaiduNews.Src { 75 | self.AidRule("LOOP", []interface{}{k}) 76 | } 77 | }, 78 | 79 | Nodes: map[string]*Rule{ 80 | "LOOP": &Rule{ 81 | AidFunc: func(self *Spider, aid []interface{}) interface{} { 82 | k := aid[0].(string) 83 | v := rss_BaiduNews.Src[k] 84 | 85 | self.AddQueue(map[string]interface{}{ 86 | "url": v + "#" + time.Now().String(), 87 | "rule": "XML", 88 | "header": http.Header{"Content-Type": []string{"text/html", "charset=GB2312"}}, 89 | "respType": "text", 90 | "temp": map[string]interface{}{"src": k}, 91 | }) 92 | return nil 93 | }, 94 | }, 95 | "XML": &Rule{ 96 | ParseFunc: func(self *Spider, resp *context.Response) { 97 | page := GBKToUTF8(resp.GetBodyStr()) 98 | page = strings.TrimLeft(page, ``) 99 | re, _ := regexp.Compile(`\<[\/]?rss\>`) 100 | page = re.ReplaceAllString(page, "") 101 | 102 | content := new(BaiduNewsData) 103 | if err := xml.Unmarshal([]byte(page), content); err != nil { 104 | reporter.Log.Println(err) 105 | return 106 | } 107 | 108 | src := resp.GetTemp("src").(string) 109 | 110 | for _, v := range content.Item { 111 | 112 | self.AddQueue(map[string]interface{}{ 113 | "url": v.Link, 114 | "rule": "新闻详情", 115 | "temp": map[string]interface{}{ 116 | "title": CleanHtml(v.Title, 4), 117 | "description": CleanHtml(v.Description, 4), 118 | "src": src, 119 | "releaseTime": CleanHtml(v.PubDate, 4), 120 | "author": CleanHtml(v.Author, 4), 121 | }, 122 | }) 123 | } 124 | 125 | // 循环请求 126 | rss_BaiduNews.Wait(src) 127 | self.AidRule("LOOP", []interface{}{src}) 128 | }, 129 | }, 130 | 131 | "新闻详情": &Rule{ 
132 | // Note: these output fields must stay consistent with the data actually written below
133 | OutFeild: []string{
134 | "标题",
135 | "描述",
136 | "内容",
137 | "发布时间",
138 | "分类",
139 | "作者",
140 | },
141 | ParseFunc: func(self *Spider, resp *context.Response) {
142 | // mark this RSS source as updated
143 | rss_BaiduNews.Updata(resp.GetTemp("src").(string))
144 | 
145 | query1 := resp.GetHtmlParser()
146 | 
147 | query := query1.Find("body")
148 | 
149 | title := resp.GetTemp("title").(string)
150 | 
151 | var findP func(html *goquery.Selection) *goquery.Selection
152 | findP = func(html *goquery.Selection) *goquery.Selection {
153 | if html.Is("body") {
154 | return html
155 | } else if result := html.Parent().Find("p"); len(result.Nodes) == 0 {
156 | return findP(html.Parent())
157 | } else {
158 | return html.Parent()
159 | }
160 | }
161 | 
162 | var info *goquery.Selection
163 | 
164 | if h1s := query.Find("h1"); len(h1s.Nodes) != 0 {
165 | for i := 0; i < len(h1s.Nodes); i++ {
166 | info = findP(h1s.Eq(i))
167 | }
168 | } else if h2s := query.Find("h2"); len(h2s.Nodes) != 0 {
169 | for i := 0; i < len(h2s.Nodes); i++ {
170 | info = findP(h2s.Eq(i))
171 | }
172 | } else if h3s := query.Find("h3"); len(h3s.Nodes) != 0 {
173 | for i := 0; i < len(h3s.Nodes); i++ {
174 | info = findP(h3s.Eq(i))
175 | }
176 | } else {
177 | info = query.Find("body")
178 | }
179 | // strip unwanted tags
180 | // info.RemoveFiltered("script")
181 | // info.RemoveFiltered("style")
182 | infoStr, _ := info.Html()
183 | 
184 | // clean the HTML
185 | infoStr = CleanHtml(infoStr, 5)
186 | 
187 | // stash the result in the Response for the pipeline
188 | result := map[string]string{
189 | self.GetOutFeild(resp, 0): title,
190 | self.GetOutFeild(resp, 1): resp.GetTemp("description").(string),
191 | self.GetOutFeild(resp, 2): infoStr,
192 | self.GetOutFeild(resp, 3): resp.GetTemp("releaseTime").(string),
193 | self.GetOutFeild(resp, 4): resp.GetTemp("src").(string),
194 | self.GetOutFeild(resp, 5): resp.GetTemp("author").(string),
195 | }
196 | resp.AddItem(result)
197 | },
198 | },
199 | },
200 | },
201 | }
202 | 
--------------------------------------------------------------------------------
/downloader/context/request.go:
--------------------------------------------------------------------------------
1 | package context
2 | 
3 | import (
4 | "github.com/bitly/go-simplejson"
5 | "github.com/henrylee2cn/pholcus/reporter"
6 | "io/ioutil"
7 | "net/http"
8 | "os"
9 | )
10 | 
11 | // Request represents an object waiting to be crawled.
12 | type Request struct {
13 | url string
14 | parent string
15 | rule string
16 | spider string
17 | // Response type: html json jsonp text
18 | respType string
19 | // GET POST
20 | method string
21 | // http header
22 | header http.Header
23 | // http cookies
24 | cookies []*http.Cookie
25 | // POST data
26 | postdata string
27 | // set when generated in Spider, according to ruleTree.Outsource
28 | canOutsource bool
29 | // marked by Pholcus to indicate whether this request is outsourced
30 | isOutsource bool
31 | // Redirect function for the downloader used in http.Client.
32 | // If CheckRedirect returns an error, the Client's Get
33 | // method returns both the previous Response and CheckRedirect's error.
34 | // If CheckRedirect returns errors.New("normal"), the error handling after client.Do will ignore the error.
35 | checkRedirect func(req *http.Request, via []*http.Request) error
36 | // proxy host, e.g. 'localhost:80'
37 | proxyHost string
38 | // temporary data; presence is tested via temp[x]==nil, so store typed values such as []int(nil)
39 | temp map[string]interface{}
40 | }
41 | 
42 | // NewRequest returns an initialized Request object.
43 | // The respType is one of: json, jsonp, html, text.
44 | 
45 | func NewRequest(param map[string]interface{}) *Request {
46 | req := &Request{
47 | url: param["url"].(string), //required
48 | rule: param["rule"].(string), //required
49 | spider: param["spider"].(string), //required
50 | }
51 | 
52 | // optional parameters: fall back to defaults when absent
53 | switch v := param["parent"].(type) {
54 | case string:
55 | req.parent = v
56 | default:
57 | req.parent = ""
58 | }
59 | 
60 | switch v := param["respType"].(type) {
61 | case string:
62 | req.respType = v
63 | default:
64 | req.respType = "html"
65 | }
66 | 
67 | switch v := param["method"].(type) {
68 | case string:
69 | req.method = v
70 | default:
71 | req.method = "GET"
72 | }
73 | 
74 | switch v := param["cookies"].(type) {
75 | case []*http.Cookie:
76 | req.cookies = v
77 | default:
78 | req.cookies = nil
79 | }
80 | 
81 | switch v := param["postdata"].(type) {
82 | case string:
83 | req.postdata = v
84 | default:
85 | req.postdata = ""
86 | }
87 | 
88 | switch v := param["canOutsource"].(type) {
89 | case bool:
90 | req.canOutsource = v
91 | default:
92 | req.canOutsource = false
93 | }
94 | 
95 | switch v := param["checkRedirect"].(type) {
96 | case func(*http.Request, []*http.Request) error:
97 | req.checkRedirect = v
98 | default:
99 | req.checkRedirect = nil
100 | }
101 | 
102 | switch v := param["proxyHost"].(type) {
103 | case string:
104 | req.proxyHost = v
105 | default:
106 | req.proxyHost = ""
107 | }
108 | 
109 | switch v := param["temp"].(type) {
110 | case map[string]interface{}:
111 | req.temp = v
112 | default:
113 | req.temp = map[string]interface{}{}
114 | }
115 | 
116 | switch v := param["header"].(type) {
117 | case string:
118 | _, err := os.Stat(v)
119 | if err == nil {
120 | req.header = readHeaderFromFile(v)
121 | }
122 | case http.Header:
123 | req.header = v
124 | default:
125 | req.header = nil
126 | }
127 | 
128 | return req
129 | }
130 | 
131 | func readHeaderFromFile(headerFile string) http.Header {
132 | // read the file, then parse the header and cookies
133 | b, err := ioutil.ReadFile(headerFile)
134 | if err != nil {
135 | // may be a shared-access error
136 | reporter.Log.Println(err.Error())
137 | return nil
138 | }
139 | js, err := simplejson.NewJson(b)
140 | if err != nil { reporter.Log.Println(err.Error()); return nil }
141 | // construct the header
142 | h := make(http.Header)
143 | h.Add("User-Agent", js.Get("User-Agent").MustString())
144 | h.Add("Referer", js.Get("Referer").MustString())
145 | h.Add("Cookie", js.Get("Cookie").MustString())
146 | h.Add("Cache-Control", "max-age=0")
147 | h.Add("Connection", "keep-alive")
148 | return h
149 | }
150 | 
151 | // headerFile points to a JSON file, e.g.:
152 | /* xxx.json
153 | {
154 | "User-Agent":"curl/7.19.3 (i386-pc-win32) libcurl/7.19.3 OpenSSL/1.0.0d",
155 | "Referer":"http://weixin.sogou.com/gzh?openid=oIWsFt6Sb7aZmuI98AU7IXlbjJps",
156 | "Cookie":""
157 | }
158 | */
159 | func (self *Request) AddHeaderFile(headerFile string) *Request {
160 | _, err := os.Stat(headerFile)
161 | if err != nil {
162 | return self
163 | }
164 | h := readHeaderFromFile(headerFile)
165 | self.header = h
166 | return self
167 | }
168 | 
169 | // @host http://localhost:8765/
170 | func (self *Request) AddProxyHost(host string) *Request {
171 | self.proxyHost = host
172 | return self
173 | }
174 | 
175 | func (self *Request) GetHeader() http.Header {
176 | return self.header
177 | }
178 | 
179 | func (self *Request) GetProxyHost() string {
180 | return self.proxyHost
181 | }
182 | 
183 | func (self *Request) GetRedirectFunc() func(req *http.Request, via []*http.Request) error {
184 | return self.checkRedirect
185 | }
186 | 
187 | func (self
*Request) GetUrl() string {
188 | return self.url
189 | }
190 | 
191 | func (self *Request) SetUrl(url string) {
192 | self.url = url
193 | }
194 | 
195 | func (self *Request) GetParent() string {
196 | return self.parent
197 | }
198 | 
199 | func (self *Request) GetRuleName() string {
200 | return self.rule
201 | }
202 | 
203 | func (self *Request) SetRuleName(ruleName string) {
204 | self.rule = ruleName
205 | }
206 | 
207 | func (self *Request) GetSpiderName() string {
208 | return self.spider
209 | }
210 | 
211 | func (self *Request) GetRespType() string {
212 | return self.respType
213 | }
214 | 
215 | func (self *Request) GetMethod() string {
216 | return self.method
217 | }
218 | 
219 | func (self *Request) GetPostdata() string {
220 | return self.postdata
221 | }
222 | 
223 | func (self *Request) GetCookies() []*http.Cookie {
224 | return self.cookies
225 | }
226 | 
227 | func (self *Request) IsOutsource() bool {
228 | return self.isOutsource
229 | }
230 | 
231 | func (self *Request) TryOutsource() bool {
232 | if self.canOutsource {
233 | self.isOutsource = true
234 | return true
235 | }
236 | return false
237 | }
238 | 
239 | 
240 | func (self *Request) GetTemp(key string) interface{} {
241 | return self.temp[key]
242 | }
243 | 
244 | func (self *Request) GetTemps() interface{} {
245 | return self.temp
246 | }
247 | 
248 | func (self *Request) SetTemp(key string, value interface{}) {
249 | self.temp[key] = value
250 | }
251 | 
252 | func (self *Request) GetSpiderId() (int, bool) {
253 | value, ok := self.temp["__SPIDER_ID__"].(int)
254 | return value, ok
255 | }
256 | 
257 | func (self *Request) SetSpiderId(spiderId int) {
258 | self.temp["__SPIDER_ID__"] = spiderId
259 | }
260 | 
--------------------------------------------------------------------------------
/spiders/wangyi.go:
--------------------------------------------------------------------------------
1 | package spiders
2 | 
3 | // base packages
4 | import (
5 | "github.com/PuerkitoBio/goquery" //DOM parsing
6 | "github.com/henrylee2cn/pholcus/downloader/context" //required
7 | // "github.com/henrylee2cn/pholcus/reporter" //logging
8 | .
"github.com/henrylee2cn/pholcus/spiders/spider" //必需 9 | ) 10 | 11 | // 设置header包 12 | import ( 13 | // "net/http" //http.Header 14 | ) 15 | 16 | // 编码包 17 | import ( 18 | // "encoding/xml" 19 | // "encoding/json" 20 | ) 21 | 22 | // 字符串处理包 23 | import ( 24 | "regexp" 25 | // "strconv" 26 | "strings" 27 | ) 28 | 29 | // 其他包 30 | import ( 31 | // "fmt" 32 | // "math" 33 | ) 34 | 35 | var Wangyi = &Spider{ 36 | Name: "网易新闻", 37 | // Pausetime: [2]uint{uint(3000), uint(1000)}, 38 | // Optional: &Optional{}, 39 | RuleTree: &RuleTree{ 40 | // Spread: []string{}, 41 | Root: func(self *Spider) { 42 | self.AddQueue(map[string]interface{}{"url": "http://news.163.com/rank/", "rule": "排行榜主页"}) 43 | }, 44 | 45 | Nodes: map[string]*Rule{ 46 | 47 | "排行榜主页": &Rule{ 48 | ParseFunc: func(self *Spider, resp *context.Response) { 49 | query := resp.GetHtmlParser() 50 | query.Find(".subNav a").Each(func(i int, s *goquery.Selection) { 51 | if url, ok := s.Attr("href"); ok { 52 | self.AddQueue(map[string]interface{}{"url": url, "rule": "新闻排行榜"}) 53 | } 54 | }) 55 | }, 56 | }, 57 | 58 | "新闻排行榜": &Rule{ 59 | ParseFunc: func(self *Spider, resp *context.Response) { 60 | topTit := []string{ 61 | "1小时前点击排行", 62 | "24小时点击排行", 63 | "本周点击排行", 64 | "今日跟帖排行", 65 | "本周跟帖排行", 66 | "本月跟贴排行", 67 | } 68 | query := resp.GetHtmlParser() 69 | // 获取新闻分类 70 | newsType := query.Find(".titleBar h2").Text() 71 | 72 | urls_top := map[string]string{} 73 | 74 | query.Find(".tabContents").Each(func(n int, t *goquery.Selection) { 75 | t.Find("tr").Each(func(i int, s *goquery.Selection) { 76 | // 跳过标题栏 77 | if i == 0 { 78 | return 79 | } 80 | // 内容链接 81 | url, ok := s.Find("a").Attr("href") 82 | 83 | // 排名 84 | top := s.Find(".cBlue").Text() 85 | 86 | if ok { 87 | urls_top[url] += topTit[n] + ":" + top + "," 88 | } 89 | }) 90 | }) 91 | for k, v := range urls_top { 92 | self.AddQueue(map[string]interface{}{ 93 | "url": k, 94 | "rule": "热点新闻", 95 | "temp": map[string]interface{}{ 96 | "newsType": newsType, 97 | "top": v, 98 | }, 99 | }) 100 | } 101 | }, 102 | }, 103 | 104 | "热点新闻": &Rule{ 105 | //注意:有无字段语义和是否输出数据必须保持一致 106 | OutFeild: []string{ 107 | "标题", 108 | "内容", 109 | "排名", 110 | "类别", 111 | "ReleaseTime", 112 | }, 113 | ParseFunc: func(self *Spider, resp *context.Response) { 114 | query := resp.GetHtmlParser() 115 | 116 | // 若有多页内容,则获取阅读全文的链接并获取内容 117 | if pageAll := query.Find(".ep-pages-all"); len(pageAll.Nodes) != 0 { 118 | if pageAllUrl, ok := pageAll.Attr("href"); ok { 119 | self.AddQueue(map[string]interface{}{ 120 | "url": pageAllUrl, 121 | "rule": "热点新闻", 122 | "temp": resp.GetTemps(), 123 | }) 124 | } 125 | return 126 | } 127 | 128 | // 获取标题 129 | title := query.Find("#h1title").Text() 130 | 131 | // 获取内容 132 | content := query.Find("#endText").Text() 133 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 134 | // content = re.ReplaceAllStringFunc(content, strings.ToLower) 135 | content = re.ReplaceAllString(content, "") 136 | 137 | // 获取发布日期 138 | release := query.Find(".ep-time-soure").Text() 139 | release = strings.Split(release, "来源:")[0] 140 | release = strings.Trim(release, " \t\n") 141 | 142 | // 结果存入Response中转 143 | resp.AddItem(map[string]string{ 144 | self.GetOutFeild(resp, 0): title, 145 | self.GetOutFeild(resp, 1): content, 146 | self.GetOutFeild(resp, 2): resp.GetTemp("top").(string), 147 | self.GetOutFeild(resp, 3): resp.GetTemp("newsType").(string), 148 | self.GetOutFeild(resp, 4): release, 149 | }) 150 | }, 151 | }, 152 | }, 153 | }, 154 | } 155 | 156 | // 不确定因素过多,暂未实现抓取 157 | // &crawler.Rule{ 158 | // Name: "热门跟帖", 
159 | // Semantic: []string{
160 | // "新闻标题",
161 | // "新闻链接",
162 | // "评论者",
163 | // "评论内容",
164 | // "release_data",
165 | // },
166 | // Meta: map[string]int{}, // marks state such as whether the total page count has been fetched
167 | // // URL generation rule; arguments: loop counter, Task instance, urltag, params
168 | // UrlFunc: func(self crawler.Crawler, startEnd [2]int, urltag map[string]string, params []string) {
169 | // baseUrl := strings.Split(params[0], ".html")
170 | // self.AddUrl(
171 | // baseUrl+"_"+i+".html",
172 | // "json",
173 | // urltag,
174 | // )
175 | // return self
176 | // },
177 | // ProcessFunc: func(self crawler.Crawler, p *page.Page) {
178 | // // get the rule name for this request's data
179 | // name := p.GetUrlTag()["RuleName"]
180 | 
181 | // // get the total page count
182 | // if _, ok := self.GetRuleExecPage(name); !ok {
183 | // // dry-run to get the total page count
184 | // self.AddUrl(p.GetUrl(), "html", map[string]string{}).Run(false)
185 | // self.CreatAndAddUrl(1, self, urltag, []string{p.GetUrl()}).Run(false)
186 | 
187 | // // store the news title
188 | // p.AddField(map[string]string{self.GetRuleSemantic(name, 0): p.GetUrlTag()["newsTitle"]})
189 | 
190 | // // store the news link
191 | // p.AddField(map[string]string{self.GetRuleSemantic(name, 1): p.GetUrlTag()["newsUrl"]})
192 | 
193 | // // get this page's data
194 | // query := p.GetHtmlParser()
195 | 
196 | // self.SetRuleTotalPage(name, 0)
197 | 
198 | // total1 := query.Find(".pages").Eq(0).Find("li a").Last().Prev().Text()
199 | 
200 | // total2, _ := strconv.Atoi(total1)
201 | 
202 | // self.SetRuleTotalPage(name, total2)
203 | 
204 | // if total, _ := self.GetRuleExecPage(name); total == 0 {
205 | // log.Printf("[消息提示:%v::%v::%v] 没有抓取到任何数据!!!\n", self.GetTaskName(), self.GetKeyword(), name)
206 | // }
207 | // }
208 | 
209 | // query.Find("#hotReplies .reply.essence").Each(func(i int, s *goquery.Selection) {
210 | 
211 | // re, _ = regexp.Compile("\\<[\\S\\s]+?\\>")
212 | 
213 | // // get and store the author and address
214 | // author := s.Find(".author").Text()
215 | // author = re.ReplaceAllString(author, "")
216 | // p.AddField(map[string]string{self.GetRuleSemantic(name, 2): author})
217 | 
218 | // // get and store the comment body
219 | // body := s.Find(".body").Text()
220 | // body = re.ReplaceAllString(body, "")
221 | // p.AddField(map[string]string{self.GetRuleSemantic(name, 3): body})
222 | 
223 | // // get and store the post time
224 | // postTime := s.Find(".postTime").Text()
225 | // postTime = strings.Split(postTime, " 发表")[0]
226 | // p.AddField(map[string]string{self.GetRuleSemantic(name, 5): postTime})
227 | // })
228 | // },
229 | // }, //end
230 | 
--------------------------------------------------------------------------------
/common/config/config.go:
--------------------------------------------------------------------------------
1 | // Package config provides config-file parsing.
2 | package config
3 | 
4 | import (
5 | "errors"
6 | "io/ioutil"
7 | "strconv"
8 | "strings"
9 | "time"
10 | )
11 | 
12 | type Config struct {
13 | globalContent map[string]string
14 | sectionContents map[string]map[string]string
15 | sections []string
16 | }
17 | 
18 | func NewConfig() *Config {
19 | return &Config{
20 | globalContent: make(map[string]string),
21 | sectionContents: make(map[string]map[string]string),
22 | }
23 | }
24 | 
25 | // Load reads a config file and returns the initialized Config.
26 | func (this *Config) Load(configFile string) *Config {
27 | stream, err := ioutil.ReadFile(configFile)
28 | if err != nil {
29 | panic("config read file error : " + configFile + "\n")
30 | }
31 | this.LoadString(string(stream))
32 | return this
33 | }
34 | 
35 | // Save writes config content to a config file.
36 | func (this *Config) Save(configFile string) error { 37 | return ioutil.WriteFile(configFile, []byte(this.String()), 0777) 38 | } 39 | 40 | func (this *Config) Clear() { 41 | this.globalContent = make(map[string]string) 42 | this.sectionContents = make(map[string]map[string]string) 43 | this.sections = nil 44 | } 45 | 46 | func (this *Config) LoadString(s string) error { 47 | lines := strings.Split(s, "\n") 48 | section := "" 49 | for _, line := range lines { 50 | line = strings.Trim(line, emptyRunes) 51 | if line == "" || line[0] == '#' { 52 | continue 53 | } 54 | if line[0] == '[' { 55 | if lineLen := len(line); line[lineLen-1] == ']' { 56 | section = line[1 : lineLen-1] 57 | sectionAdded := false 58 | for _, oldSection := range this.sections { 59 | if section == oldSection { 60 | sectionAdded = true 61 | break 62 | } 63 | } 64 | if !sectionAdded { 65 | this.sections = append(this.sections, section) 66 | } 67 | continue 68 | } 69 | } 70 | pair := strings.SplitN(line, "=", 2) 71 | if len(pair) != 2 { 72 | return errors.New("bad config file syntax") 73 | } 74 | key := strings.Trim(pair[0], emptyRunes) 75 | value := strings.Trim(pair[1], emptyRunes) 76 | if section == "" { 77 | this.globalContent[key] = value 78 | } else { 79 | if _, ok := this.sectionContents[section]; !ok { 80 | this.sectionContents[section] = make(map[string]string) 81 | } 82 | this.sectionContents[section][key] = value 83 | } 84 | } 85 | return nil 86 | } 87 | 88 | func (this *Config) String() string { 89 | s := "" 90 | for key, value := range this.globalContent { 91 | s += key + "=" + value + "\n" 92 | } 93 | for section, content := range this.sectionContents { 94 | s += "[" + section + "]\n" 95 | for key, value := range content { 96 | s += key + "=" + value + "\n" 97 | } 98 | } 99 | return s 100 | } 101 | 102 | func (this *Config) StringWithMeta() string { 103 | s := "__sections__=" + strings.Join(this.sections, ",") + "\n" 104 | return s + this.String() 105 | } 106 | 107 | func (this *Config) GlobalHas(key string) bool { 108 | if _, ok := this.globalContent[key]; ok { 109 | return true 110 | } 111 | return false 112 | } 113 | 114 | func (this *Config) GlobalGet(key string) string { 115 | return this.globalContent[key] 116 | } 117 | 118 | func (this *Config) GlobalSet(key string, value string) { 119 | this.globalContent[key] = value 120 | } 121 | 122 | func (this *Config) GlobalGetInt(key string) int { 123 | value := this.GlobalGet(key) 124 | if value == "" { 125 | return 0 126 | } 127 | result, err := strconv.Atoi(value) 128 | if err != nil { 129 | return 0 130 | } 131 | return result 132 | } 133 | 134 | func (this *Config) GlobalGetInt64(key string) int64 { 135 | value := this.GlobalGet(key) 136 | if value == "" { 137 | return 0 138 | } 139 | result, err := strconv.ParseInt(value, 10, 64) 140 | if err != nil { 141 | return 0 142 | } 143 | return result 144 | } 145 | 146 | func (this *Config) GlobalGetDuration(key string) time.Duration { 147 | return time.Duration(this.GlobalGetInt(key)) * time.Second 148 | } 149 | 150 | func (this *Config) GlobalGetDeadline(key string) time.Time { 151 | return time.Now().Add(time.Duration(this.GlobalGetInt(key)) * time.Second) 152 | } 153 | 154 | func (this *Config) GlobalGetSlice(key string, separator string) []string { 155 | result := []string{} 156 | value := this.GlobalGet(key) 157 | if value != "" { 158 | for _, part := range strings.Split(value, separator) { 159 | result = append(result, strings.Trim(part, emptyRunes)) 160 | } 161 | } 162 | return result 163 | } 164 | 165 
| func (this *Config) GlobalGetSliceInt(key string, separator string) []int {
166 | result := []int{}
167 | value := this.GlobalGetSlice(key, separator)
168 | for _, part := range value {
169 | n, err := strconv.Atoi(part)
170 | if err != nil {
171 | continue
172 | }
173 | result = append(result, n)
174 | }
175 | return result
176 | }
177 | 
178 | func (this *Config) GlobalContent() map[string]string {
179 | return this.globalContent
180 | }
181 | 
182 | func (this *Config) Sections() []string {
183 | return this.sections
184 | }
185 | 
186 | func (this *Config) HasSection(section string) bool {
187 | if _, ok := this.sectionContents[section]; ok {
188 | return true
189 | }
190 | return false
191 | }
192 | 
193 | func (this *Config) SectionHas(section string, key string) bool {
194 | if !this.HasSection(section) {
195 | return false
196 | }
197 | if _, ok := this.sectionContents[section][key]; ok {
198 | return true
199 | }
200 | return false
201 | }
202 | 
203 | func (this *Config) SectionGet(section string, key string) string {
204 | if content, ok := this.sectionContents[section]; ok {
205 | return content[key]
206 | }
207 | return ""
208 | }
209 | 
210 | func (this *Config) SectionSet(section string, key string, value string) {
211 | if content, ok := this.sectionContents[section]; ok {
212 | content[key] = value
213 | } else {
214 | content = make(map[string]string)
215 | content[key] = value
216 | this.sectionContents[section] = content
217 | }
218 | }
219 | 
220 | func (this *Config) SectionGetInt(section string, key string) int {
221 | value := this.SectionGet(section, key)
222 | if value == "" {
223 | return 0
224 | }
225 | result, err := strconv.Atoi(value)
226 | if err != nil {
227 | return 0
228 | }
229 | return result
230 | }
231 | 
232 | func (this *Config) SectionGetDuration(section string, key string) time.Duration {
233 | return time.Duration(this.SectionGetInt(section, key)) * time.Second
234 | }
235 | 
236 | func (this *Config) SectionGetSlice(section string, key string, separator string) []string {
237 | result := []string{}
238 | value := this.SectionGet(section, key)
239 | if value != "" {
240 | for _, part := range strings.Split(value, separator) {
241 | result = append(result, strings.Trim(part, emptyRunes))
242 | }
243 | }
244 | return result
245 | }
246 | 
247 | func (this *Config) SectionContent(section string) map[string]string {
248 | return this.sectionContents[section]
249 | }
250 | 
251 | func (this *Config) SectionContents() map[string]map[string]string {
252 | return this.sectionContents
253 | }
254 | 
255 | const emptyRunes = " \r\t\v"
256 | 
--------------------------------------------------------------------------------
/pholcus/gui/guimain.go:
--------------------------------------------------------------------------------
1 | package gui
2 | 
3 | import (
4 | "github.com/henrylee2cn/pholcus/config"
5 | "github.com/henrylee2cn/pholcus/pholcus/crawler"
6 | "github.com/henrylee2cn/pholcus/reporter"
7 | "github.com/henrylee2cn/pholcus/scheduler"
8 | "github.com/henrylee2cn/pholcus/spiders/spider"
9 | "github.com/lxn/walk"
10 | .
"github.com/lxn/walk/declarative" 11 | "log" 12 | "strconv" 13 | "strings" 14 | "time" 15 | ) 16 | 17 | var toggleSpecialModePB *walk.PushButton 18 | 19 | func Run() { 20 | var mw *walk.MainWindow 21 | var db *walk.DataBinder 22 | var ep walk.ErrorPresenter 23 | 24 | if err := (MainWindow{ 25 | AssignTo: &mw, 26 | DataBinder: DataBinder{ 27 | AssignTo: &db, 28 | DataSource: Input, 29 | ErrorPresenter: ErrorPresenterRef{&ep}, 30 | }, 31 | Title: config.APP_NAME, 32 | MinSize: Size{1100, 700}, 33 | Layout: VBox{}, 34 | Children: []Widget{ 35 | // 任务列表 36 | HSplitter{ 37 | Children: []Widget{ 38 | TableView{ 39 | MinSize: Size{550, 400}, 40 | AlternatingRowBGColor: walk.RGB(255, 255, 224), 41 | CheckBoxes: true, 42 | ColumnsOrderable: true, 43 | Columns: []TableViewColumn{ 44 | {Title: "#", Width: 45}, 45 | {Title: "任务", Width: 110 /*, Format: "%.2f", Alignment: AlignFar*/}, 46 | {Title: "描述", Width: 370}, 47 | }, 48 | Model: SpiderModel, 49 | }, 50 | // 关键词 51 | VSplitter{ 52 | MinSize: Size{550, 400}, 53 | 54 | Children: []Widget{ 55 | VSplitter{ 56 | Children: []Widget{ 57 | Label{ 58 | Text: "关键词:(多任务之间以 | 隔开,选填)", 59 | }, 60 | LineEdit{ 61 | Text: Bind("Keywords"), 62 | }, 63 | }, 64 | }, 65 | 66 | VSplitter{ 67 | Children: []Widget{ 68 | Label{ 69 | Text: "采集页数:(选填)", 70 | }, 71 | NumberEdit{ 72 | Value: Bind("MaxPage"), 73 | Suffix: "", 74 | Decimals: 0, 75 | }, 76 | }, 77 | }, 78 | 79 | VSplitter{ 80 | Children: []Widget{ 81 | Label{ 82 | Text: "*并发协程:(1~99999)", 83 | }, 84 | NumberEdit{ 85 | Value: Bind("ThreadNum", Range{1, 99999}), 86 | Suffix: "", 87 | Decimals: 0, 88 | }, 89 | }, 90 | }, 91 | 92 | VSplitter{ 93 | Children: []Widget{ 94 | Label{ 95 | Text: "*分批输出大小:(1~5,000,000 条数据)", 96 | }, 97 | NumberEdit{ 98 | Value: Bind("DockerCap", Range{1, 5000000}), 99 | Suffix: "", 100 | Decimals: 0, 101 | }, 102 | }, 103 | }, 104 | 105 | VSplitter{ 106 | Children: []Widget{ 107 | Label{ 108 | Text: "*间隔基准:", 109 | }, 110 | ComboBox{ 111 | Value: Bind("BaseSleeptime", SelRequired{}), 112 | BindingMember: "Uint", 113 | DisplayMember: "Key", 114 | Model: GUIOpt.SleepTime, 115 | }, 116 | }, 117 | }, 118 | 119 | VSplitter{ 120 | Children: []Widget{ 121 | Label{ 122 | Text: "*随机延迟:", 123 | }, 124 | ComboBox{ 125 | Value: Bind("RandomSleepPeriod", SelRequired{}), 126 | BindingMember: "Uint", 127 | DisplayMember: "Key", 128 | Model: GUIOpt.SleepTime, 129 | }, 130 | }, 131 | }, 132 | 133 | RadioButtonGroupBox{ 134 | ColumnSpan: 2, 135 | Title: "*输出方式", 136 | Layout: HBox{}, 137 | DataMember: "OutType", 138 | Buttons: []RadioButton{ 139 | {Text: GUIOpt.OutType[0].Key, Value: GUIOpt.OutType[0].String}, 140 | {Text: GUIOpt.OutType[1].Key, Value: GUIOpt.OutType[1].String}, 141 | {Text: GUIOpt.OutType[2].Key, Value: GUIOpt.OutType[2].String}, 142 | }, 143 | }, 144 | }, 145 | }, 146 | }, 147 | }, 148 | 149 | Composite{ 150 | Layout: HBox{}, 151 | Children: []Widget{ 152 | 153 | // 必填项错误检查 154 | LineErrorPresenter{ 155 | AssignTo: &ep, 156 | ColumnSpan: 2, 157 | }, 158 | 159 | PushButton{ 160 | Text: "开始抓取", 161 | AssignTo: &toggleSpecialModePB, 162 | OnClicked: func() { 163 | if err := db.Submit(); err != nil { 164 | log.Print(err) 165 | return 166 | } 167 | Input.Spiders = SpiderModel.GetChecked() 168 | if len(Input.Spiders) == 0 { 169 | return 170 | } 171 | toggleSpecialModePB.SetEnabled(false) 172 | toggleSpecialModePB.SetText("正在抓取") 173 | SubmitAndRun() 174 | }, 175 | }, 176 | }, 177 | }, 178 | }, 179 | }.Create()); err != nil { 180 | log.Fatal(err) 181 | } 182 | 183 | // 绑定log输出界面 184 | lv, err 
:= NewLogView(mw)
185 | if err != nil {
186 | log.Fatal(err)
187 | }
188 | log.SetOutput(lv)
189 | 
190 | if icon, err := walk.NewIconFromResource("ICON"); err == nil {
191 | mw.SetIcon(icon)
192 | }
193 | 
194 | // run the window program
195 | mw.Run()
196 | }
197 | 
198 | // Initialize the spider list; must be called after the user's front-end input!
199 | func InitSpiders() int {
200 | var sp = spider.Spiders{}
201 | spider.SpiderList.Init()
202 | 
203 | // iterate over the tasks
204 | for i, sps := range Input.Spiders {
205 | sp = append(sp, sps.Spider)
206 | l := len(sp) - 1
207 | sp[l].Id = i
208 | sp[l].Pausetime[0] = Input.BaseSleeptime
209 | sp[l].Pausetime[1] = Input.RandomSleepPeriod
210 | sp[l].MaxPage = Input.MaxPage
211 | }
212 | 
213 | // iterate over the keywords
214 | if Input.Keywords != "" {
215 | keywordSlice := strings.Split(Input.Keywords, "|")
216 | for _, keyword := range keywordSlice {
217 | keyword = strings.Trim(keyword, " ")
218 | if keyword == "" {
219 | continue
220 | }
221 | nowLen := len(spider.SpiderList)
222 | for n := range sp {
223 | sp[n].Keyword = keyword
224 | sp[n].Id = nowLen + n
225 | c := *sp[n]
226 | spider.SpiderList.Add(&c)
227 | }
228 | }
229 | } else {
230 | spider.SpiderList = sp
231 | }
232 | return len(spider.SpiderList)
233 | }
234 | 
235 | // Submit the user input and start the run
236 | func SubmitAndRun() {
237 | // correct the goroutine count
238 | if Input.ThreadNum == 0 {
239 | Input.ThreadNum = 1
240 | }
241 | 
242 | // initialize the config parameters
243 | config.InitDockerParam(Input.DockerCap)
244 | config.ThreadNum = Input.ThreadNum
245 | config.OutType = Input.OutType
246 | config.StartTime = time.Now()
247 | config.ReqSum = 0 // reset the downloaded-page counter
248 | 
249 | count := InitSpiders()
250 | 
251 | // initialize the resource queue
252 | scheduler.Init(Input.ThreadNum)
253 | 
254 | // initialize the crawler queue
255 | CrawlerNum := config.CRAWLER_CAP
256 | if count < config.CRAWLER_CAP {
257 | CrawlerNum = count
258 | }
259 | config.CrawlerQueue.Init(CrawlerNum)
260 | 
261 | reporter.Log.Printf("\n执行任务总数(任务数[*关键词数])为 %v 个...\n", count)
262 | reporter.Log.Printf("\n爬行队列可容纳蜘蛛 %v 只...\n", CrawlerNum)
263 | reporter.Log.Printf("\n并发协程最多 %v 个……\n", Input.ThreadNum)
264 | reporter.Log.Printf("\n随机停顿时间为 %v~%v ms ……\n", Input.BaseSleeptime, Input.BaseSleeptime+Input.RandomSleepPeriod)
265 | reporter.Log.Printf("*********************************************开始抓取,请耐心等候*********************************************")
266 | 
267 | // execute the tasks
268 | go func(count int) {
269 | 
270 | // resize the existing crawler queue to the required capacity; note the queue instance itself stays the same
271 | for s, add := 0, config.CrawlerQueue.Exchange(CrawlerNum); s < add; s++ {
272 | config.CrawlerQueue.Push(crawler.New())
273 | }
274 | 
275 | for i := 0; i < count; i++ {
276 | 
277 | // wait to pull an idle spider from the crawler queue
278 | oneCrawler := config.CrawlerQueue.Pull().(crawler.Crawler)
279 | 
280 | // run the crawl task concurrently
281 | go func(i int, c crawler.Crawler) {
282 | // execute and report the result
283 | c.Init(spider.SpiderList[i]).Start()
284 | // recycle the spider once its task is done
285 | config.CrawlerQueue.Push(c)
286 | 
287 | }(i, oneCrawler)
288 | }
289 | 
290 | // monitor task completion
291 | sum := 0 // total number of data items
292 | for i := 0; i < count; i++ {
293 | s := <-config.ReportChan
294 | reporter.Log.Printf("[结束报告 -> 任务:%v | 关键词:%v] 共输出数据 %v 条,用时 %v 分钟!!!\n", s.SpiderName, s.Keyword, s.Num, s.Time)
295 | if slen, err := strconv.Atoi(s.Num); err == nil {
296 | sum += slen
297 | }
298 | }
299 | reporter.Log.Printf("*****************************!!本次抓取合计 %v 条数据,下载页面 %v 个,耗时:%.5f 分钟!!***************************", sum, config.ReqSum, time.Since(config.StartTime).Minutes())
300 | 
301 | // restore the button state
302 | toggleSpecialModePB.SetText("开始抓取")
303 | toggleSpecialModePB.SetEnabled(true)
304 | }(count)
305 | }
306 | 
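The goroutine above pulls an idle crawler from the pool, runs one spider, and pushes the crawler back. The same pattern works without the GUI; the following is a minimal headless sketch of SubmitAndRun's core loop, relying on this file's existing imports and assuming spider.SpiderList has already been populated (for example via InitSpiders). runHeadless is a hypothetical helper, not part of pholcus, and completion monitoring via config.ReportChan is omitted for brevity:

// runHeadless is a hypothetical helper sketching SubmitAndRun's core loop
// without the GUI. It assumes spider.SpiderList is already populated.
func runHeadless(threadNum int) {
	scheduler.Init(threadNum) // resource queue, as in SubmitAndRun

	count := len(spider.SpiderList)
	crawlerNum := config.CRAWLER_CAP
	if count < config.CRAWLER_CAP {
		crawlerNum = count
	}
	config.CrawlerQueue.Init(crawlerNum)

	// fill the crawler pool
	for s, add := 0, config.CrawlerQueue.Exchange(crawlerNum); s < add; s++ {
		config.CrawlerQueue.Push(crawler.New())
	}

	for i := 0; i < count; i++ {
		// block until a crawler is idle, then run one spider on it
		c := config.CrawlerQueue.Pull().(crawler.Crawler)
		go func(i int, c crawler.Crawler) {
			c.Init(spider.SpiderList[i]).Start()
			config.CrawlerQueue.Push(c) // recycle the crawler
		}(i, c)
	}
	// draining config.ReportChan, as SubmitAndRun does, is omitted here
}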
--------------------------------------------------------------------------------
/downloader/downloader_http.go:
--------------------------------------------------------------------------------
1 | package downloader
2 | 
3 | import (
4 | "bytes"
5 | "github.com/PuerkitoBio/goquery"
6 | "github.com/bitly/go-simplejson"
7 | "github.com/henrylee2cn/pholcus/downloader/context"
8 | // iconv "github.com/djimenez/iconv-go"
9 | "github.com/henrylee2cn/pholcus/common/util"
10 | "github.com/henrylee2cn/pholcus/reporter"
11 | // "golang.org/x/text/encoding/simplifiedchinese"
12 | // "golang.org/x/text/transform"
13 | "io"
14 | "io/ioutil"
15 | "net/http"
16 | "net/url"
17 | //"fmt"
18 | "golang.org/x/net/html/charset"
19 | // "regexp"
20 | // "golang.org/x/net/html"
21 | "strings"
22 | )
23 | 
24 | // HttpDownloader downloads the response via package net/http.
25 | // The "html" content is wrapped in the DOM parser of package goquery.
26 | // The "json" content is saved as-is.
27 | // The "jsonp" content is converted to json.
28 | // The "text" content saves the plain-text body only.
29 | // The response result is saved in Response.
30 | type HttpDownloader struct{}
31 | 
32 | func NewHttpDownloader() *HttpDownloader {
33 | return &HttpDownloader{}
34 | }
35 | 
36 | func (self *HttpDownloader) Download(req *context.Request) *context.Response {
37 | var mtype string
38 | var p = context.NewResponse(req)
39 | mtype = req.GetRespType()
40 | switch mtype {
41 | case "html":
42 | return self.downloadHtml(p, req)
43 | case "json":
44 | fallthrough
45 | case "jsonp":
46 | return self.downloadJson(p, req)
47 | case "text":
48 | return self.downloadText(p, req)
49 | default:
50 | reporter.Log.Println("error request type:" + mtype)
51 | }
52 | return p
53 | }
54 | 
55 | /*
56 | // acceptableCharset tests whether Content-Type is UTF-8 or not
57 | func (self *HttpDownloader) acceptableCharset(contentTypes []string) bool {
58 | // each type is like [text/html; charset=UTF-8]
59 | // we want the UTF-8 only
60 | for _, cType := range contentTypes {
61 | if strings.Index(cType, "UTF-8") != -1 || strings.Index(cType, "utf-8") != -1 {
62 | return true
63 | }
64 | }
65 | return false
66 | }
67 | // getCharset parses the header["Content-Type"] string to get the charset of the page
68 | func (self *HttpDownloader) getCharset(header http.Header) string {
69 | reg, err := regexp.Compile("charset=(.*)$")
70 | if err != nil {
71 | reporter.Log.Println(err.Error())
72 | return ""
73 | }
74 | var charset string
75 | for _, cType := range header["Content-Type"] {
76 | substrings := reg.FindStringSubmatch(cType)
77 | if len(substrings) == 2 {
78 | charset = substrings[1]
79 | }
80 | }
81 | return charset
82 | }
83 | // Use golang.org/x/text/encoding. Get the response body and convert it to UTF-8
84 | func (self *HttpDownloader) changeCharsetEncoding(charset string, sor io.ReadCloser) string {
85 | ischange := true
86 | var tr transform.Transformer
87 | cs := strings.ToLower(charset)
88 | if cs == "gbk" {
89 | tr = simplifiedchinese.GBK.NewDecoder()
90 | } else if cs == "gb18030" {
91 | tr = simplifiedchinese.GB18030.NewDecoder()
92 | } else if cs == "hzgb2312" || cs == "gb2312" || cs == "hz-gb2312" {
93 | tr = simplifiedchinese.HZGB2312.NewDecoder()
94 | } else {
95 | ischange = false
96 | }
97 | var destReader io.Reader
98 | if ischange {
99 | transReader := transform.NewReader(sor, tr)
100 | destReader = transReader
101 | } else {
102 | destReader = sor
103 | }
104 | var sorbody []byte
105 | var err error
106 | if sorbody, err = ioutil.ReadAll(destReader); err != nil {
107 | reporter.Log.Println(err.Error())
108 | return ""
109 | }
110 | bodystr := string(sorbody)
111 | return bodystr
112 | }
113 | // Use go-iconv. Get the response body and convert it to UTF-8
114 | func (self *HttpDownloader) changeCharsetGoIconv(charset string, sor io.ReadCloser) string {
115 | var err error
116 | var converter *iconv.Converter
117 | if charset != "" && strings.ToLower(charset) != "utf-8" && strings.ToLower(charset) != "utf8" {
118 | converter, err = iconv.NewConverter(charset, "utf-8")
119 | if err != nil {
120 | reporter.Log.Println(err.Error())
121 | return ""
122 | }
123 | defer converter.Close()
124 | }
125 | var sorbody []byte
126 | if sorbody, err = ioutil.ReadAll(sor); err != nil {
127 | reporter.Log.Println(err.Error())
128 | return ""
129 | }
130 | bodystr := string(sorbody)
131 | var destbody string
132 | if converter != nil {
133 | // convert to utf8
134 | destbody, err = converter.ConvertString(bodystr)
135 | if err != nil {
136 | reporter.Log.Println(err.Error())
137 | return ""
138 | }
139 | } else {
140 | destbody = bodystr
141 | }
142 | return destbody
143 | }
144 | */
145 | 
146 | // Auto-detect the charset via golang.org/x/net/html/charset, then read the response body and convert it to UTF-8
147 | func (self *HttpDownloader) changeCharsetEncodingAuto(contentTypeStr string, sor io.ReadCloser) string {
148 | var err error
149 | destReader, err := charset.NewReader(sor, contentTypeStr)
150 | 
151 | if err != nil {
152 | reporter.Log.Println(err.Error())
153 | destReader = sor
154 | }
155 | 
156 | var sorbody []byte
157 | if sorbody, err = ioutil.ReadAll(destReader); err != nil {
158 | reporter.Log.Println(err.Error())
159 | // For gb2312, an error will be returned.
160 | // Error like: simplifiedchinese: invalid GBK encoding
161 | // return ""
162 | }
163 | //e,name,certain := charset.DetermineEncoding(sorbody,contentTypeStr)
164 | bodystr := string(sorbody)
165 | 
166 | return bodystr
167 | }
168 | 
169 | // download via plain HTTP using the request's method (GET/POST)
170 | func connectByHttp(p *context.Response, req *context.Request) (*http.Response, error) {
171 | client := &http.Client{
172 | CheckRedirect: req.GetRedirectFunc(),
173 | }
174 | httpreq, err := http.NewRequest(req.GetMethod(), req.GetUrl(), strings.NewReader(req.GetPostdata()))
175 | if err != nil {
176 | return nil, err
177 | }
178 | if header := req.GetHeader(); header != nil {
179 | httpreq.Header = req.GetHeader()
180 | }
181 | if cookies := req.GetCookies(); cookies != nil {
182 | for i := range cookies {
183 | httpreq.AddCookie(cookies[i])
184 | }
185 | }
186 | var resp *http.Response
187 | if resp, err = client.Do(httpreq); err != nil {
188 | if e, ok := err.(*url.Error); ok && e.Err != nil && e.Err.Error() == "normal" {
189 | // normal
190 | } else {
191 | reporter.Log.Println(err.Error())
192 | p.SetStatus(true, err.Error())
193 | //fmt.Printf("client do error %v \r\n", err)
194 | return nil, err
195 | }
196 | }
197 | 
198 | return resp, nil
199 | }
200 | 
201 | // route the HTTP GET download through a proxy server
202 | func connectByHttpProxy(p *context.Response, in_req *context.Request) (*http.Response, error) {
203 | request, _ := http.NewRequest("GET", in_req.GetUrl(), nil)
204 | proxy, err := url.Parse(in_req.GetProxyHost())
205 | if err != nil {
206 | return nil, err
207 | }
208 | client := &http.Client{
209 | Transport: &http.Transport{
210 | Proxy: http.ProxyURL(proxy),
211 | },
212 | }
213 | resp, err := client.Do(request)
214 | if err != nil {
215 | return nil, err
216 | }
217 | return resp, nil
218 | 
219 | }
220 | 
221 | // Download the file and convert the response body to UTF-8.
222 | func (self *HttpDownloader) downloadFile(p *context.Response, req *context.Request) (*context.Response, string) { 223 | var err error 224 | var urlstr string 225 | if urlstr = req.GetUrl(); len(urlstr) == 0 { 226 | reporter.Log.Println("url is empty") 227 | p.SetStatus(true, "url is empty") 228 | return p, "" 229 | } 230 | 231 | var resp *http.Response 232 | 233 | if proxystr := req.GetProxyHost(); len(proxystr) != 0 { 234 | //using http proxy 235 | //fmt.Print("HttpProxy Enter ",proxystr,"\n") 236 | resp, err = connectByHttpProxy(p, req) 237 | } else { 238 | //normal http download 239 | //fmt.Print("Http Normal Enter \n",proxystr,"\n") 240 | resp, err = connectByHttp(p, req) 241 | } 242 | 243 | if err != nil { 244 | return p, "" 245 | } 246 | 247 | //b, _ := ioutil.ReadAll(resp.Body) 248 | //fmt.Printf("Resp body %v \r\n", string(b)) 249 | 250 | p.SetHeader(resp.Header) 251 | p.SetCookies(resp.Cookies()) 252 | 253 | // get converter to utf-8 254 | bodyStr := self.changeCharsetEncodingAuto(resp.Header.Get("Content-Type"), resp.Body) 255 | //fmt.Printf("utf-8 body %v \r\n", bodyStr) 256 | defer resp.Body.Close() 257 | return p, bodyStr 258 | } 259 | 260 | func (self *HttpDownloader) downloadHtml(p *context.Response, req *context.Request) *context.Response { 261 | var err error 262 | p, destbody := self.downloadFile(p, req) 263 | //fmt.Printf("Destbody %v \r\n", destbody) 264 | if !p.IsSucc() { 265 | //fmt.Print("Response error \r\n") 266 | return p 267 | } 268 | bodyReader := bytes.NewReader([]byte(destbody)) 269 | 270 | var doc *goquery.Document 271 | if doc, err = goquery.NewDocumentFromReader(bodyReader); err != nil { 272 | reporter.Log.Println(err.Error()) 273 | p.SetStatus(true, err.Error()) 274 | return p 275 | } 276 | 277 | var body string 278 | if body, err = doc.Html(); err != nil { 279 | reporter.Log.Println(err.Error()) 280 | p.SetStatus(true, err.Error()) 281 | return p 282 | } 283 | 284 | p.SetBodyStr(body).SetHtmlParser(doc).SetStatus(false, "") 285 | 286 | return p 287 | } 288 | 289 | func (self *HttpDownloader) downloadJson(p *context.Response, req *context.Request) *context.Response { 290 | var err error 291 | p, destbody := self.downloadFile(p, req) 292 | if !p.IsSucc() { 293 | return p 294 | } 295 | 296 | var body []byte 297 | body = []byte(destbody) 298 | mtype := req.GetRespType() 299 | if mtype == "jsonp" { 300 | tmpstr := util.JsonpToJson(destbody) 301 | body = []byte(tmpstr) 302 | } 303 | 304 | var r *simplejson.Json 305 | if r, err = simplejson.NewJson(body); err != nil { 306 | reporter.Log.Println(string(body) + "\t" + err.Error()) 307 | p.SetStatus(true, err.Error()) 308 | return p 309 | } 310 | 311 | // json result 312 | p.SetBodyStr(string(body)).SetJson(r).SetStatus(false, "") 313 | 314 | return p 315 | } 316 | 317 | func (self *HttpDownloader) downloadText(p *context.Response, req *context.Request) *context.Response { 318 | p, destbody := self.downloadFile(p, req) 319 | if !p.IsSucc() { 320 | return p 321 | } 322 | p.SetBodyStr(destbody).SetStatus(false, "") 323 | return p 324 | } 325 | --------------------------------------------------------------------------------
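Taken together, request.go and downloader_http.go above define the minimal fetch path: build a Request from a parameter map, hand it to an HttpDownloader, and read the parsed Response. A self-contained sketch, assuming only the packages shown above; the url, rule and spider values are placeholders:

package main

import (
	"fmt"

	"github.com/henrylee2cn/pholcus/downloader"
	"github.com/henrylee2cn/pholcus/downloader/context"
)

func main() {
	// "url", "rule" and "spider" are the required keys of NewRequest;
	// respType "html" makes the downloader attach a goquery DOM parser.
	req := context.NewRequest(map[string]interface{}{
		"url":      "http://news.163.com/rank/",
		"rule":     "排行榜主页", // placeholder rule name
		"spider":   "网易新闻",  // placeholder spider name
		"respType": "html",
	})

	resp := downloader.NewHttpDownloader().Download(req)
	if !resp.IsSucc() {
		fmt.Println("download failed")
		return
	}
	fmt.Println(resp.GetHtmlParser().Find("title").Text())
}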