├── pholcus
├── node
│ └── node.go
├── status
│ └── status.go
├── gui
│ ├── rsrc.syso
│ ├── guimain.manifest
│ ├── logview.go
│ ├── menu.go
│ ├── guispider.go
│ └── guimain.go
├── crawler
│ ├── crawler.go
│ └── crawl.go
├── pholcus.go
├── keeper
│ └── login.go
└── socket
│ ├── PhoSocketTest.go
│ └── PhoSocket.go
├── doc
├── ICON.ico
├── project.png
└── GUI编译命令.txt
├── main.go
├── reporter
├── reporter.go
└── report.go
├── spiders
├── spider
│ ├── spiderlist.go
│ ├── rss.go
│ ├── common.go
│ └── spider.go
├── kaola.go
├── shunfenghaitao.go
├── baidusearch.go
├── readme.md
├── jdsearch.go
├── miyabaobei.go
├── alibaba.go
├── taobaosearch.go
├── googlesearch.go
├── hollandandbarrett.go
├── baidunews.go
└── wangyi.go
├── README.md
├── downloader
├── downloader.go
├── context
│ ├── response.go
│ └── request.go
└── downloader_http.go
├── pipeline
├── collector
│ ├── datacell.go
│ ├── output.go
│ ├── docker.go
│ ├── collector.go
│ └── output_lib.go
└── pipeline.go
├── main.manifest
├── common
├── deduplicate
│ └── deduplicate.go
├── mlog
│ ├── mlog.go
│ ├── strace.go
│ └── filelog.go
├── queue
│ └── queue.go
├── etc_config
│ └── etc_config.go
├── util
│ └── util.go
└── config
│ └── config.go
├── scheduler
├── scheduler.go
└── src_manage.go
└── config
└── config.go
/pholcus/node/node.go:
--------------------------------------------------------------------------------
1 | package node
2 |
3 | import ()
4 |
--------------------------------------------------------------------------------
/pholcus/status/status.go:
--------------------------------------------------------------------------------
1 | package status
2 |
3 | import ()
4 |
--------------------------------------------------------------------------------
/doc/ICON.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lichanglu/pholcus/HEAD/doc/ICON.ico
--------------------------------------------------------------------------------
/doc/project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lichanglu/pholcus/HEAD/doc/project.png
--------------------------------------------------------------------------------
/pholcus/gui/rsrc.syso:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lichanglu/pholcus/HEAD/pholcus/gui/rsrc.syso
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "github.com/henrylee2cn/pholcus/pholcus/gui"
5 | )
6 |
// main is the program entry point; it hands control to gui.Run.
func main() {
	gui.Run()
}
10 |
--------------------------------------------------------------------------------
/doc/GUI编译命令.txt:
--------------------------------------------------------------------------------
1 | go get github.com/akavel/rsrc
2 | rsrc -manifest test.manifest -o rsrc.syso
3 |
4 | go build
5 |
6 |
7 | go build -ldflags="-H windowsgui"
--------------------------------------------------------------------------------
/reporter/reporter.go:
--------------------------------------------------------------------------------
1 | package reporter
2 |
// Reporter is the reporting facade of this package. Because it contains the
// unexported method send, only types declared in this package can satisfy it.
type Reporter interface {
	// Printf reports a message described by a format string and its arguments.
	Printf(format string, v ...interface{})
	// Println reports its arguments as one line.
	Println(v ...interface{})
	// send receives an already formatted message.
	send(string)
}
8 |
--------------------------------------------------------------------------------
/pholcus/crawler/crawler.go:
--------------------------------------------------------------------------------
1 | package crawler
2 |
3 | import (
4 | "github.com/henrylee2cn/pholcus/spiders/spider"
5 | )
6 |
// Crawler is the contract of a crawl engine working on a spider.
type Crawler interface {
	// Init takes the spider to work on and returns a Crawler
	// (presumably the receiver itself, for chaining — confirm against implementations).
	Init(*spider.Spider) Crawler
	// Start runs the crawler.
	Start()
}
11 |
--------------------------------------------------------------------------------
/spiders/spider/spiderlist.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
// Spiders is a list of spider pointers.
type Spiders []*Spider

// SpiderList is the package-level list of spiders.
var SpiderList = Spiders{}

// Init replaces the global SpiderList with a new empty list.
// NOTE(review): the receiver is unused; the method always acts on the global.
func (Spiders) Init() {
	SpiderList = Spiders{}
}

// Add appends sp to the global SpiderList; the receiver value is ignored.
func (Spiders) Add(sp *Spider) {
	SpiderList = append(SpiderList, sp)
}
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## pholcus
2 | Pholcus(幽灵蛛)是一款Go语言编写的爬虫软件框架(含GUI界面),优雅的爬虫规则、可控的高并发、任意的批量任务、多种输出方式、大量Demo,并且考虑了支持分布式布局。
3 |
4 | 
5 |
6 |
7 |
8 | **安装幽灵蛛**
9 | ```
10 | go get github.com/henrylee2cn/pholcus
11 | ```
12 |
--------------------------------------------------------------------------------
/downloader/downloader.go:
--------------------------------------------------------------------------------
1 | package downloader
2 |
3 | import (
4 | "github.com/henrylee2cn/pholcus/downloader/context"
5 | )
6 |
// Downloader is the download interface.
// Implement it by providing a Download method, which takes a *context.Request
// and returns a *context.Response.
type Downloader interface {
	Download(req *context.Request) *context.Response
}
13 |
--------------------------------------------------------------------------------
/pipeline/collector/datacell.go:
--------------------------------------------------------------------------------
1 | // 数据存储单元
2 | package collector
3 |
// DataCell is one collected record, held as a loosely typed key/value map.
type DataCell map[string]interface{}

// NewDataCell builds a DataCell with the five fixed keys "RuleName", "Data",
// "Url", "ParentUrl" and "DownloadTime" filled from the arguments.
func NewDataCell(ruleName string, data map[string]string, url string, parentUrl string, downloadTime string) DataCell {
	cell := make(DataCell, 5)
	cell["RuleName"] = ruleName   // names the rule that prescribes the keys in Data
	cell["Data"] = data           // stored values; keys must match the rule's Fields
	cell["Url"] = url             // used for indexing
	cell["ParentUrl"] = parentUrl // URL of the parent of this DataCell
	cell["DownloadTime"] = downloadTime
	return cell
}
15 |
--------------------------------------------------------------------------------
/main.manifest:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/pholcus/gui/guimain.manifest:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/reporter/report.go:
--------------------------------------------------------------------------------
1 | package reporter
2 |
3 | import (
4 | "fmt"
5 | "log"
6 | )
7 |
// Report is a stateless reporter that writes through the standard log
// package and then passes the formatted text to send.
type Report struct{}

// send receives every formatted message. It currently does nothing.
func (r *Report) send(str string) {
	// Intentionally empty: nothing is forwarded.
}

// Printf logs the formatted message with log.Printf and hands the same
// formatted text to send.
func (r *Report) Printf(format string, v ...interface{}) {
	log.Printf(format, v...)
	msg := fmt.Sprintf(format, v...)
	r.send(msg)
}

// Println logs the arguments with log.Println and hands the same line to send.
func (r *Report) Println(v ...interface{}) {
	log.Println(v...)
	line := fmt.Sprintln(v...)
	r.send(line)
}
25 |
// Log is the package-level Reporter.
var Log Reporter

// init installs a *Report as the default Log.
func init() {
	Log = &Report{}
}
31 |
--------------------------------------------------------------------------------
/common/deduplicate/deduplicate.go:
--------------------------------------------------------------------------------
1 | package deduplicate
2 |
3 | import (
4 | "github.com/henrylee2cn/pholcus/common/util"
5 | )
6 |
// Deduplicate detects repeated samples.
type Deduplicate interface {
	// Compare samples obj if it has not been seen yet and returns the
	// comparison result; true means duplicate.
	Compare(obj interface{}) bool
}

// Deduplication is a map-backed implementation of Deduplicate.
type Deduplication struct {
	// sampling holds true for every key that has already been sampled.
	sampling map[string]bool
}

// New returns a Deduplicate backed by a Deduplication with an empty sample map.
func New() Deduplicate {
	return &Deduplication{
		sampling: make(map[string]bool),
	}
}
21 |
22 | // 对比是否已存在,不存在则采样
23 | func (self *Deduplication) Compare(obj interface{}) bool {
24 | s := util.MakeUnique(obj)
25 | if !self.sampling[s] {
26 | self.sampling[s] = true
27 | return false
28 | }
29 | return true
30 | }
31 |
--------------------------------------------------------------------------------
/common/mlog/mlog.go:
--------------------------------------------------------------------------------
1 | // Package mlog implements log operations.
2 | package mlog
3 |
4 | import (
5 | "runtime"
6 | )
7 |
// plog bundles the behaviour shared by the log objects of this package:
// an open/closed switch and a caller lookup.
type plog struct {
	isopen bool
}

// getCaller returns the file name and line number found three frames up the
// call stack, or "???" and 0 when that frame cannot be resolved.
func (*plog) getCaller() (string, int) {
	if _, file, line, ok := runtime.Caller(3); ok {
		return file, line
	}
	return "???", 0
}

// Open switches the log on.
func (p *plog) Open() { p.isopen = true }

// Close switches the log off.
func (p *plog) Close() { p.isopen = false }
32 |
--------------------------------------------------------------------------------
/pipeline/collector/output.go:
--------------------------------------------------------------------------------
1 | //数据输出管理
2 | package collector
3 |
4 | import (
5 | "github.com/henrylee2cn/pholcus/config"
6 | "log"
7 | // "fmt"
8 | "time"
9 | )
10 |
11 | func (self *Collector) Output(dataIndex int) {
12 | defer func() {
13 | err := recover()
14 | if err != nil {
15 | log.Printf("输出时出错!\n")
16 | } else {
17 | // 正常情况下回收内存
18 | self.DockerQueue.Recover(dataIndex)
19 | }
20 | }()
21 |
22 | dataLen := len(self.DockerQueue.Dockers[dataIndex])
23 | if dataLen == 0 {
24 | // log.Println("没有抓到结果!!!")
25 | return
26 | }
27 |
28 | // 输出数据统计
29 | self.setSum(dataLen)
30 |
31 | // 选择执行输出
32 | switch self.outType {
33 | case "excel":
34 | self.excel(dataIndex)
35 | case "csv":
36 | self.csv(dataIndex)
37 | case "mongoDB":
38 | self.mgo(dataIndex)
39 | default:
40 | self.excel(dataIndex)
41 | }
42 | log.Printf("[任务:%v | 关键词:%v | 批次:%v] 输出 %v 条数据,用时 %.5f 分钟!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), self.outCount[1]+1, dataLen, time.Since(config.StartTime).Minutes())
43 | }
44 |
--------------------------------------------------------------------------------
/common/mlog/strace.go:
--------------------------------------------------------------------------------
1 | package mlog
2 |
3 | import (
4 | "log"
5 | "os"
6 | )
7 |
8 | // Strace represents an active object that strace the processing of spider.
9 | // The strace info is output to os.Stderr.
10 | // The loginst is an point of logger in Std-Packages.
11 | // The isopen is a label represents whether open strace or not.
12 | type strace struct {
13 | plog
14 |
15 | loginst *log.Logger
16 | }
17 |
18 | var pstrace *strace
19 |
20 | // StraceInst get the singleton strace object.
21 | func StraceInst() *strace {
22 | if pstrace == nil {
23 | pstrace = newStrace()
24 | }
25 | return pstrace
26 | }
27 |
28 | // The newStrace returns initialized strace object.
29 | func newStrace() *strace {
30 | pstrace := &strace{}
31 | pstrace.loginst = log.New(os.Stderr, "", log.LstdFlags)
32 | pstrace.isopen = true
33 | return pstrace
34 | }
35 |
36 | // Println output the str to os.Stderr.
37 | func (this *strace) Println(str string) {
38 | if !this.isopen {
39 | return
40 | }
41 | this.loginst.Printf("%s\n", str)
42 | }
43 |
--------------------------------------------------------------------------------
/common/queue/queue.go:
--------------------------------------------------------------------------------
1 | package queue
2 |
// Queue is a bounded FIFO pool backed by a buffered channel.
type Queue struct {
	PoolSize int              // capacity the queue was created or re-initialised with
	PoolChan chan interface{} // buffered channel holding the queued items
}

// NewQueue returns a Queue able to hold size items.
func NewQueue(size int) *Queue {
	return &Queue{
		PoolSize: size,
		PoolChan: make(chan interface{}, size),
	}
}

// Init replaces the underlying channel with a new empty one of the given
// size and returns the receiver. Items still in the old channel are dropped.
func (q *Queue) Init(size int) *Queue {
	q.PoolSize = size
	q.PoolChan = make(chan interface{}, size)
	return q
}

// Push appends i and reports whether it was stored. It never blocks: a full
// queue yields false.
func (q *Queue) Push(i interface{}) bool {
	if len(q.PoolChan) >= q.PoolSize {
		return false
	}
	// The length check above is only a snapshot. A non-blocking send keeps
	// Push from hanging when another goroutine fills the last slot in
	// between, or when PoolSize is larger than the channel's capacity.
	select {
	case q.PoolChan <- i:
		return true
	default:
		return false
	}
}

// PushSlice pushes every element of s in order; elements that do not fit
// are silently skipped.
func (q *Queue) PushSlice(s []interface{}) {
	for _, i := range s {
		q.Push(i)
	}
}

// Pull removes and returns the oldest item, blocking while the queue is empty.
func (q *Queue) Pull() interface{} {
	return <-q.PoolChan
}

// Exchange prepares a reused Queue for a demand of num items and returns how
// many items have to be added to reach num (0 when it already holds at least
// num). When the current capacity is below num the channel is re-created
// with capacity num and the items already queued are carried over in order.
func (q *Queue) Exchange(num int) (add int) {
	last := len(q.PoolChan)
	if last >= num {
		return 0
	}

	if q.PoolSize < num {
		// Drain the queued items, then redefine the channel and refill it.
		pool := make([]interface{}, 0, last)
		for i := 0; i < last; i++ {
			pool = append(pool, <-q.PoolChan)
		}
		q.Init(num).PushSlice(pool)
	}

	return num - last
}
60 |
--------------------------------------------------------------------------------
/scheduler/scheduler.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | import (
4 | "github.com/henrylee2cn/pholcus/common/deduplicate"
5 | "github.com/henrylee2cn/pholcus/downloader/context"
6 | )
7 |
// Scheduler combines URL de-duplication with request resource management.
type Scheduler interface {
	// Compare collects a URL that has not been seen and returns the
	// comparison result; true means duplicate.
	Compare(string) bool

	SrcManager
	// The embedded SrcManager contributes the following methods:
	// store a request
	// Push(*context.Request)
	// take a request out
	// Use(int) *context.Request
	// release one resource
	// Free()
	// whether the resource queue is idle
	// IsEmpty(int) bool
	// IsAllEmpty() bool

}
25 |
26 | type scheduler struct {
27 | *SrcManage
28 | *deduplicate.Deduplication
29 | }
30 |
31 | func New(capacity uint) Scheduler {
32 | return &scheduler{
33 | SrcManage: NewSrcManage(capacity).(*SrcManage),
34 | Deduplication: deduplicate.New().(*deduplicate.Deduplication),
35 | }
36 | }
37 |
38 | func (self *scheduler) Push(req *context.Request) {
39 | is := self.Compare(req.GetUrl())
40 | // 有重复则返回
41 | if is {
42 | return
43 | }
44 | self.SrcManage.Push(req)
45 | }
46 |
47 | func (self *scheduler) Compare(url string) bool {
48 | return self.Deduplication.Compare(url)
49 | }
50 |
51 | // 定义全局调度
52 | var Self Scheduler
53 |
54 | func Init(capacity uint) Scheduler {
55 | Self = New(capacity)
56 | return Self
57 | }
58 |
--------------------------------------------------------------------------------
/pholcus/pholcus.go:
--------------------------------------------------------------------------------
1 | package pholcus
2 |
3 | import (
4 | "github.com/henrylee2cn/pholcus/downloader/context"
5 | // "github.com/henrylee2cn/pholcus/pholcus/node"
6 | // "github.com/henrylee2cn/pholcus/pholcus/status"
7 | "github.com/henrylee2cn/pholcus/scheduler"
8 | "sync"
9 | )
10 |
11 | type Pholcus struct {
12 | // *node.Node
13 | // *Status
14 | isOutsource bool
15 | }
16 |
17 | var pushMutex sync.Mutex
18 |
19 | func (self *Pholcus) Push(req *context.Request) {
20 | pushMutex.Lock()
21 | defer func() {
22 | pushMutex.Unlock()
23 | }()
24 | if !self.TryOutsource(req) {
25 | scheduler.Self.Push(req)
26 | }
27 | }
28 |
// TryOutsource passes a copy of req to Send and returns true when
// outsourcing is enabled and req.TryOutsource() also returns true;
// otherwise it returns false and does nothing.
func (self *Pholcus) TryOutsource(req *context.Request) bool {
	if self.IsOutsource() && req.TryOutsource() {
		self.Send(*req)
		return true
	}
	return false
}

// SetOutsource enables or disables outsourcing.
func (self *Pholcus) SetOutsource(serve bool) {
	self.isOutsource = serve
}

// IsOutsource reports whether outsourcing is enabled.
func (self *Pholcus) IsOutsource() bool {
	return self.isOutsource
}

// Send is a stub with an empty body; req is currently discarded.
func (self *Pholcus) Send(req context.Request) {

}

// Receive pushes req into the global scheduler.
func (self *Pholcus) Receive(req context.Request) {
	scheduler.Self.Push(&req)
}

// Self is the package-level Pholcus instance, initialised in init.
var Self *Pholcus

// init creates the package-level Pholcus instance.
func init() {
	Self = &Pholcus{}
}
59 |
--------------------------------------------------------------------------------
/common/etc_config/etc_config.go:
--------------------------------------------------------------------------------
1 | // Package etc_config implements config initialization of one spider.
2 | package etc_config
3 |
4 | import (
5 | "github.com/henrylee2cn/pholcus/common/config"
6 | "github.com/henrylee2cn/pholcus/common/util"
7 | "os"
8 | )
9 |
// configpath returns the default config file path, "$GOPATH/etc/main.conf",
// creating the "$GOPATH/etc/" directory when it does not exist yet.
//
// It panics when GOPATH is not set or when the directory cannot be created;
// in the latter case the panic message carries the underlying error.
func configpath() string {
	//wd, _ := os.Getwd()
	wd := os.Getenv("GOPATH")
	if wd == "" {
		panic("GOPATH is not setted in env.")
	}
	logpath := wd + "/etc/"
	if err := os.MkdirAll(logpath, 0755); err != nil {
		// Report why the directory could not be created, not only where.
		panic("logpath error : " + logpath + " : " + err.Error() + "\n")
	}
	return logpath + "main.conf"
}
25 |
// conf is the package-level Config singleton; it stays nil until Conf loads it.
var conf *config.Config

// path is the config file path used when conf is loaded.
var path string

// StartConf is meant for a spider's first-time initialisation. It panics
// when configFilePath is non-empty but does not exist, stores the path and
// returns Conf().
// NOTE(review): if conf has already been loaded, the new path is stored but
// not re-read.
func StartConf(configFilePath string) *config.Config {
	if configFilePath != "" && !util.IsFileExists(configFilePath) {
		panic("config path is not valiad:" + configFilePath)
	}

	path = configFilePath
	return Conf()
}

// Conf returns the Config singleton, loading it on the first call from path,
// or from configpath() when path is empty.
func Conf() *config.Config {
	if conf == nil {
		if path == "" {
			path = configpath()
		}
		conf = config.NewConfig().Load(path)
	}
	return conf
}
50 |
--------------------------------------------------------------------------------
/pipeline/pipeline.go:
--------------------------------------------------------------------------------
1 | // 数据收集
2 | package pipeline
3 |
4 | import (
5 | "github.com/henrylee2cn/pholcus/common/deduplicate"
6 | "github.com/henrylee2cn/pholcus/pipeline/collector"
7 | // "github.com/henrylee2cn/pholcus/reporter"
8 | "github.com/henrylee2cn/pholcus/spiders/spider"
9 | )
10 |
// Pipeline is the data-collection interface.
type Pipeline interface {
	Start()
	// CtrlR receives a control notification.
	CtrlR()
	// CtrlS sends a control notification.
	CtrlS()
	// Collect gathers one data cell.
	Collect(ruleName string, data map[string]string, url string, parentUrl string, downloadTime string)
	// Deduplicate compares the fingerprint of a URL and reports whether it is a duplicate.
	Deduplicate(string) bool
	// Init resets the pipeline.
	Init(*spider.Spider)
}
24 |
// pipeline implements Pipeline by embedding a collector and a de-duplicator.
type pipeline struct {
	*collector.Collector
	*deduplicate.Deduplication
}

// New returns a Pipeline built from a new collector and an empty de-duplicator.
func New() Pipeline {
	return &pipeline{
		Collector:     collector.NewCollector(),
		Deduplication: deduplicate.New().(*deduplicate.Deduplication),
	}
}

// Collect wraps the arguments in a DataCell and passes it to the collector.
func (self *pipeline) Collect(ruleName string, data map[string]string, url string, parentUrl string, downloadTime string) {
	dataCell := collector.NewDataCell(ruleName, data, url, parentUrl, downloadTime)
	self.Collector.Collect(dataCell)
}

// Init forwards sp to the embedded collector's Init.
func (self *pipeline) Init(sp *spider.Spider) {
	self.Collector.Init(sp)
}

// Deduplicate reports whether s has been seen before, recording it if not.
func (self *pipeline) Deduplicate(s string) bool {
	return self.Deduplication.Compare(s)
}

// Start launches the collector's Manage method in its own goroutine.
func (self *pipeline) Start() {
	go self.Collector.Manage()
	// reporter.Log.Println("**************开启输出管道************")
}
54 |
--------------------------------------------------------------------------------
/config/config.go:
--------------------------------------------------------------------------------
1 | package config
2 |
3 | import (
4 | "github.com/henrylee2cn/pholcus/common/queue"
5 | "time"
6 | )
7 |
const (
	// Application name.
	APP_NAME = "幽灵蛛数据采集 V0.1 (by Henry)"
	// Capacity of the spider (crawler) pool.
	CRAWLER_CAP = 50

	// Capacity of the collector.
	DATA_CAP = 2 << 14 // 32768

	// MongoDB server address.
	DB_URL = "127.0.0.1:27017"

	// MongoDB database name.
	DB_NAME = "temp-collection-tentinet"

	// MongoDB collection name.
	DB_COLLECTION = "news"
)

// Report is the payload carried by ReportChan; all of its fields are strings.
type Report struct {
	SpiderName string
	Keyword    string
	Num        string
	Time       string
}
33 |
var (
	// Moment at which the start button was clicked.
	StartTime time.Time
	// Channel for summary reports.
	ReportChan chan *Report
	// Count of requested pages.
	ReqSum uint
	// Default crawler queue.
	CrawlerQueue *queue.Queue

	ThreadNum uint

	OutType string

	// Capacity of one segmented dump container (docker).
	DOCKER_CAP uint

	// Capacity of the segmented output pool; the minimum is 2.
	DOCKER_QUEUE_CAP uint
)

// init creates the unbuffered report channel and a crawler queue of size 0,
// and sets the docker parameters for a docker capacity of 50000.
func init() {

	ReportChan = make(chan *Report)

	CrawlerQueue = queue.NewQueue(0)

	InitDockerParam(50000)

}
64 |
65 | func InitDockerParam(dockercap uint) {
66 | DOCKER_CAP = dockercap
67 | switch {
68 | case dockercap <= 10:
69 | DOCKER_QUEUE_CAP = 500
70 | case dockercap <= 500:
71 | DOCKER_QUEUE_CAP = 200
72 | case dockercap <= 1000:
73 | DOCKER_QUEUE_CAP = 100
74 | case dockercap <= 10000:
75 | DOCKER_QUEUE_CAP = 50
76 | case dockercap <= 100000:
77 | DOCKER_QUEUE_CAP = 10
78 | default:
79 | DOCKER_QUEUE_CAP = 4
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/pipeline/collector/docker.go:
--------------------------------------------------------------------------------
1 | // 负责从收集通道接受数据并临时存储
2 | package collector
3 |
4 | import (
5 | "github.com/henrylee2cn/pholcus/config"
6 | "sync"
7 | "time"
8 | )
9 |
// DockerQueue is a pool of dockers (slices of DataCell).
type DockerQueue struct {
	Curr    int          // index of the docker currently selected by Change
	Cap     uint         // upper bound on the number of dockers
	Using   map[int]bool // docker index -> whether that docker is in use
	Dockers [][]DataCell // the dockers themselves
}

// NewDocker returns an empty docker with capacity config.DOCKER_CAP.
func NewDocker() []DataCell {
	return make([]DataCell, 0, config.DOCKER_CAP)
}
20 |
21 | func NewDockerQueue() *DockerQueue {
22 | var queueCap uint = config.DOCKER_QUEUE_CAP
23 | if config.DOCKER_QUEUE_CAP < 2 {
24 | queueCap = 2
25 | }
26 |
27 | dockerQueue := &DockerQueue{
28 | Curr: 0,
29 | Cap: queueCap,
30 | Using: make(map[int]bool, queueCap),
31 | Dockers: make([][]DataCell, 0),
32 | }
33 |
34 | dockerQueue.Using[0] = true
35 |
36 | dockerQueue.Dockers = append(dockerQueue.Dockers, NewDocker())
37 |
38 | return dockerQueue
39 | }
40 |
41 | var ChangeMutex sync.Mutex
42 |
43 | func (self *DockerQueue) Change() {
44 | ChangeMutex.Lock()
45 | defer ChangeMutex.Unlock()
46 | getLable:
47 | for {
48 | for k, v := range self.Using {
49 | if !v {
50 | self.Curr = k
51 | self.Using[k] = true
52 | break getLable
53 | }
54 | }
55 | self.AutoAdd()
56 | time.Sleep(5e8)
57 | }
58 | }
59 |
60 | func (self *DockerQueue) Recover(index int) {
61 | self.Dockers[index] = NewDocker()
62 | self.Using[index] = false
63 | }
64 |
65 | // 根据情况自动动态增加Docker
66 | func (self *DockerQueue) AutoAdd() {
67 | count := len(self.Dockers)
68 | if uint(count) < self.Cap {
69 | self.Dockers = append(self.Dockers, NewDocker())
70 | self.Using[count] = false
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/spiders/spider/rss.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "github.com/henrylee2cn/pholcus/reporter"
5 | "math"
6 | "time"
7 | )
8 |
9 | type RSS struct {
10 | // RSS爬虫重新访问的5个级别(分钟)
11 | Level []int
12 | //RSS源的权重, self.T[src] {
39 | if k == 0 {
40 | k = 1
41 | }
42 | reporter.Log.Printf("************************ ……当前RSS <%s> 的更新周期为 %v 分钟……************************", src, self.Level[k-1])
43 | time.Sleep(time.Minute * time.Duration(self.Level[k-1]))
44 | break
45 | }
46 | }
47 | if self.Flag[src] {
48 | self.T[src] = int(math.Floor(float64(self.T[src]) / 1.2))
49 | if self.T[src] < self.Level[0] {
50 | self.T[src] = self.Level[0]
51 | }
52 | } else {
53 | self.T[src] = int(math.Floor(float64(self.T[src]) * 1.2))
54 | if self.T[src] > self.Level[len(self.Level)-1] {
55 | self.T[src] = self.Level[len(self.Level)-1]
56 | }
57 | }
58 | self.Flag[src] = false
59 | }
60 |
// Updata sets Flag[src] to true.
// NOTE(review): presumably this marks the source as having produced new
// content since the last wait — confirm against the rest of RSS.
func (self *RSS) Updata(src string) {
	self.Flag[src] = true
}
64 |
--------------------------------------------------------------------------------
/scheduler/src_manage.go:
--------------------------------------------------------------------------------
1 | package scheduler
2 |
3 | import (
4 | "github.com/henrylee2cn/pholcus/downloader/context"
5 | // "github.com/henrylee2cn/pholcus/reporter"
6 | )
7 |
// SrcManager is the interface to implement for a resource-management object.
type SrcManager interface {
	// Push stores a request.
	Push(*context.Request)
	// Use takes a request out.
	Use(int) *context.Request
	// Free releases one resource.
	Free()
	// IsEmpty reports whether the resource queue is idle.
	IsEmpty(int) bool
	IsAllEmpty() bool
}

// SrcManage is the SrcManager implementation of this package.
type SrcManage struct {
	// count receives one token per request taken out by Use; Free removes one.
	count chan bool
	// queue holds the pending requests per spider id.
	queue map[int][]*context.Request
}

// NewSrcManage returns a SrcManage whose count channel is buffered with the
// given capacity and whose queue map is empty.
func NewSrcManage(capacity uint) SrcManager {
	return &SrcManage{
		count: make(chan bool, int(capacity)),
		queue: make(map[int][]*context.Request),
	}
}

// Push appends req to the queue of its spider. A request for which
// GetSpiderId reports !ok is dropped.
func (self *SrcManage) Push(req *context.Request) {
	if spiderId, ok := req.GetSpiderId(); ok {
		self.queue[spiderId] = append(self.queue[spiderId], req)
	}
}
38 |
39 | func (self *SrcManage) Use(spiderId int) *context.Request {
40 | if len(self.queue[spiderId]) == 0 {
41 | return nil
42 | }
43 | req := self.queue[spiderId][0]
44 | self.queue[spiderId] = self.queue[spiderId][1:]
45 | self.count <- true
46 | return req
47 | }
48 |
49 | func (self *SrcManage) Free() {
50 | <-self.count
51 | }
52 |
53 | func (self *SrcManage) IsEmpty(spiderId int) bool {
54 | if len(self.queue[spiderId]) == 0 {
55 | return true
56 | }
57 | return false
58 | }
59 |
60 | func (self *SrcManage) IsAllEmpty() bool {
61 | if len(self.count) == 0 {
62 | for _, v := range self.queue {
63 | if len(v) != 0 {
64 | return false
65 | }
66 | }
67 | return true
68 | }
69 | return false
70 | }
71 |
--------------------------------------------------------------------------------
/spiders/spider/common.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | // "bytes"
5 | "code.google.com/p/mahonia"
6 | // "golang.org/x/text/encoding/simplifiedchinese"
7 | // "golang.org/x/text/transform"
8 | // "io/ioutil"
9 | // "github.com/henrylee2cn/pholcus/downloader/context"
10 | "regexp"
11 | "strings"
12 | )
13 |
14 | func CleanHtml(str string, depth int) string {
15 | if depth > 0 {
16 | //将HTML标签全转换成小写
17 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>")
18 | str = re.ReplaceAllStringFunc(str, strings.ToLower)
19 | }
20 | if depth > 1 {
21 | //去除STYLE
22 | re, _ := regexp.Compile("\\