├── crawler
│   └── crawler.go
├── download
│   └── download.go
├── main.go
├── page
│   └── page.go
├── pipeline
│   ├── collector.go
│   ├── data.go
│   └── pipeline.go
├── request
│   └── request.go
├── scheduler
│   ├── matrix.go
│   ├── shceduler.go
│   └── status.go
└── spider
    ├── rule.go
    └── spider.go

/crawler/crawler.go:
--------------------------------------------------------------------------------
package crawler

import (
    "time"

    "github.com/liunian1004/go-crawler/download"
    page2 "github.com/liunian1004/go-crawler/page"
    "github.com/liunian1004/go-crawler/pipeline"
    "github.com/liunian1004/go-crawler/request"
    "github.com/liunian1004/go-crawler/spider"
)

// CrawlerInterface is the collector: it drives one spider end to end.
type CrawlerInterface interface {
    Init(spider *spider.Spider) CrawlerInterface // initialize
    Run()
    Stop()
    GetID() int
    CanStop() bool
}

// Crawler coordinates the modules at an abstract level.
type Crawler struct {
    Spider   *spider.Spider      // the spider to run
    Download download.Downloader // downloader
    Pipeline pipeline.Pipeliner  // data output
    ID       int                 // collector ID
}

func New(id int) CrawlerInterface {
    return &Crawler{
        ID:       id,
        Download: download.EasyDownload{},
    }
}

func (c *Crawler) Init(spider *spider.Spider) CrawlerInterface {
    // TODO: initialize the spider's scheduler

    // TODO: initialize the Pipeline (see the sketch after this file)

    return c
}

func (c *Crawler) Run() {
    // Start collecting output.
    c.Pipeline.Start()

    t := make(chan bool)

    go func() {
        c.run()
        close(t)
    }()

    c.Spider.Start()

    // Block until the crawl loop exits.
    <-t

    c.Pipeline.Stop()
}

func (c *Crawler) run() {
    for {
        // Take one request off the queue.
        req := c.GetOneRequest()
        if req == nil {
            // Is the task allowed to stop yet?
            //if c.Spider.CanStop() {
            //	break
            //}
            time.Sleep(20 * time.Millisecond)
            continue
        }

        // Execute the request. Acquiring a slot limits the current
        // concurrency; the call blocks until a slot is free.
        c.UseOneThread()
        go func() {
            defer c.ReleaseOneThread()
            // Slot acquired: run the crawl.
            c.Process(req)
        }()

        // Back off briefly between requests.
        time.Sleep(20 * time.Millisecond)
    }
}

func (c *Crawler) Process(req *request.Request) {
    // the URL being downloaded

    // the spider doing the downloading
    spider1 := c.Spider

    // Fetch the page with the downloader.
    page := c.Download.Download(spider1, req)

    // Return the Page to the pool on every exit path, including the
    // early error returns below.
    defer page2.ReleasePage(page)

    // Parse the fetched page with its rule (every Request carries a rule name).
    page.Parse(req.GetRuleName())

    err := c.Pipeline.CollectItems(page.GetItems())
    if err != nil {
        return
    }
    err = c.Pipeline.CollectFiles(page.GetFiles())
    if err != nil {
        return
    }

    // TODO: record the successful request; decide whether failed ones are
    // re-queued at the tail.

    // TODO: count successfully crawled pages.
}

func (*Crawler) Stop() {
    panic("implement me")
}

func (c *Crawler) GetID() int {
    return c.ID
}

func (c *Crawler) SetID(id int) {
    c.ID = id
}

func (c *Crawler) CanStop() bool {
    panic("implement me")
}

// GetOneRequest pulls the next request from the queue (stub).
func (c *Crawler) GetOneRequest() *request.Request {
    return nil
}

// UseOneThread acquires a concurrency slot (stub).
func (c *Crawler) UseOneThread() {
}

// ReleaseOneThread releases a concurrency slot (stub).
func (c *Crawler) ReleaseOneThread() {
}

// Helper methods
--------------------------------------------------------------------------------
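Init, GetOneRequest, UseOneThread, and ReleaseOneThread are left as stubs in crawler.go above. Below is a minimal sketch of how they could be wired to pieces that already exist in this repo: spider.SpiderMatrixInit for the request queue, a pipeline.Collector for output, and the scheduler's buffered ThreadCount channel as a concurrency semaphore. The channel buffer sizes (64) and the reuse of GlobalScheduler are assumptions, not the author's design.

// Sketch only: possible bodies for the stubs above, assuming crawler.go
// additionally imports "github.com/liunian1004/go-crawler/scheduler".
func (c *Crawler) Init(sp *spider.Spider) CrawlerInterface {
    // Register a request matrix for this spider with the global scheduler.
    c.Spider = sp.SpiderMatrixInit()
    // A *Collector satisfies the Pipeliner interface.
    c.Pipeline = &pipeline.Collector{
        ItemChan: make(chan pipeline.Item, 64), // buffer size is an assumption
        FileChan: make(chan pipeline.File, 64),
    }
    return c
}

// Pull the next request from the spider's matrix; nil means the queue is empty.
func (c *Crawler) GetOneRequest() *request.Request {
    return c.Spider.PullRequest()
}

// Acquire one concurrency slot; blocks once the channel's capacity (10) is used up.
func (c *Crawler) UseOneThread() { scheduler.GlobalScheduler.ThreadCount <- true }

// Give the slot back so another worker can run.
func (c *Crawler) ReleaseOneThread() { <-scheduler.GlobalScheduler.ThreadCount }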
"github.com/liunian1004/go-crawler/pipeline" 7 | "github.com/liunian1004/go-crawler/request" 8 | ) 9 | 10 | type Downloader interface{ 11 | Download(spider *spider.Spider, request *request.Request) *page.Page // 根据抽象规则和请求对象下载页面 12 | } 13 | 14 | type EasyDownload struct { 15 | } 16 | 17 | func (EasyDownload) Download(spider *spider.Spider, request *request.Request) *page.Page { 18 | return &page.Page{ 19 | Spider: spider, 20 | Items: []pipeline.Item{ 21 | map[string]interface{}{ "1": 1}, 22 | }, 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "runtime" 5 | "github.com/liunian1004/go-crawler/crawler" 6 | "github.com/liunian1004/go-crawler/spider" 7 | ) 8 | 9 | func init() { 10 | runtime.GOMAXPROCS(runtime.NumCPU()) 11 | } 12 | 13 | func main() { 14 | c := crawler.Crawler{} 15 | c.Init(&spider.Spider{ 16 | Name: "Test", 17 | }).Run() 18 | } -------------------------------------------------------------------------------- /page/page.go: -------------------------------------------------------------------------------- 1 | package page 2 | 3 | import ( 4 | "github.com/liunian1004/go-crawler/pipeline" 5 | "net/http" 6 | "github.com/liunian1004/go-crawler/request" 7 | "github.com/liunian1004/go-crawler/spider" 8 | "sync" 9 | ) 10 | 11 | type Pager interface{ 12 | Parse(ruleName string) // 解析返回对象中的文件和数据 13 | GetItems() 14 | GetFiles() 15 | } 16 | 17 | type Page struct { 18 | Spider *spider.Spider // 规则 19 | 20 | Request *request.Request // 请求 21 | Response *http.Response // 响应流 22 | 23 | Items []pipeline.Item 24 | Files []pipeline.File 25 | } 26 | 27 | var pagePool = &sync.Pool{ 28 | New: func() interface{} { 29 | return &Page{ 30 | Items: []pipeline.Item{}, 31 | Files: []pipeline.File{}, 32 | } 33 | }, 34 | } 35 | 36 | func GetPage(s *spider.Spider, request *request.Request) *Page { 37 | page := pagePool.Get().(*Page) 38 | page.Spider = s 39 | page.Request = request 40 | return page 41 | } 42 | 43 | func ReleasePage(page *Page) { 44 | if page.Response != nil { 45 | page.Response.Body.Close() 46 | page.Response = nil 47 | } 48 | page.Request = nil 49 | page.Spider = nil 50 | page.Items = page.Items[:0] 51 | page.Files = page.Files[:0] 52 | // 后续还要增加释放的资源 53 | pagePool.Put(page) 54 | } 55 | 56 | func (p *Page) GetItems() []pipeline.Item { 57 | return p.Items 58 | } 59 | 60 | func (p *Page) GetFiles() []pipeline.File { 61 | return p.Files 62 | } 63 | 64 | func (p *Page) Parse(ruleName string) *Page { 65 | // 根据 rule 将数据解析到 Items 和 Files 66 | // get Rule 67 | rule := p.getRule(ruleName) 68 | 69 | if rule.ParseFunc == nil { 70 | panic("解析函数不存在") 71 | } 72 | 73 | // 解析数据 74 | rule.ParseFunc(p) 75 | 76 | return p 77 | } 78 | 79 | func (p *Page) getRule(ruleName string) (rule *spider.Rule) { 80 | rule, b := p.Spider.GetRule(ruleName) 81 | if b == false { 82 | panic("获取规则失败") 83 | } 84 | return 85 | } -------------------------------------------------------------------------------- /pipeline/collector.go: -------------------------------------------------------------------------------- 1 | package pipeline 2 | 3 | // 数据收集器,实现 Pipeline 接口 4 | type Collector struct { 5 | ItemChan chan Item 6 | FileChan chan File 7 | 8 | ItemCache []Item // 分配输出结果的缓存 9 | 10 | outputType string // 输出方式 11 | 12 | ItemCount int 13 | ItemOutputTimes int 14 | FileCount int 15 | 16 | Total int // 收集的数据总数 17 | } 18 | 19 | func (c *Collector) Start() { 20 | go 
/pipeline/collector.go:
--------------------------------------------------------------------------------
package pipeline

// Collector gathers crawled data; it implements the Pipeliner interface.
type Collector struct {
    ItemChan chan Item
    FileChan chan File

    ItemCache []Item // buffer for batching output

    outputType string // output method

    ItemCount       int
    ItemOutputTimes int
    FileCount       int

    Total int // total number of items collected
}

func (c *Collector) Start() {
    go func() {
        // Drain the queue until the channel is closed.
        for data := range c.ItemChan {
            c.ItemCache = append(c.ItemCache, data)

            // Count the item.
            c.ItemCount++

            // Flush when a batch is ready, or on a timeout
            // (not implemented yet).
            if len(c.ItemCache) < 1 {
                continue
            }

            c.ItemOutputTimes++
            // Output; this call blocks.
            c.outputData()
        }
        // Reached once Stop closes ItemChan: flush whatever is left.
        c.ItemOutputTimes++
        c.outputData()
    }()
}

func (c *Collector) Stop() {
    panic("implement me")
}

func (c *Collector) CollectFiles(files []File) error {
    for _, file := range files {
        // Write to the channel.
        c.FileChan <- file
    }
    return nil
}

func (c *Collector) CollectItems(items []Item) error {
    for _, item := range items {
        c.ItemChan <- item
    }
    return nil
}

func (c *Collector) outputData() {
    // TODO: write the data out.
}
--------------------------------------------------------------------------------
/pipeline/data.go:
--------------------------------------------------------------------------------
package pipeline

type Item map[string]interface{}

// The full stored file name is: file/"Dir"/"RuleName"/"time"/"Name"
type File map[string]interface{}
--------------------------------------------------------------------------------
/pipeline/pipeline.go:
--------------------------------------------------------------------------------
package pipeline

type Pipeliner interface {
    Start() // start collecting
    Stop()  // stop collecting
    CollectFiles(files []File) error
    CollectItems(items []Item) error
}
--------------------------------------------------------------------------------
/request/request.go:
--------------------------------------------------------------------------------
package request

type Request struct {
    Rule string // name of the rule used to parse the response
}

func (r *Request) GetRuleName() string {
    return r.Rule
}
--------------------------------------------------------------------------------
/scheduler/matrix.go:
--------------------------------------------------------------------------------
package scheduler

import (
    "github.com/liunian1004/go-crawler/request"
)

// Matrix is one spider's request queue.
type Matrix struct {
    SpiderName string
    //MaxPage int64 // maximum number of pages to crawl
    Requests []*request.Request // request queue
}

func NewMatrix(spiderName string) *Matrix {
    return &Matrix{
        SpiderName: spiderName,
        //MaxPage: int64(maxPage),
        Requests: make([]*request.Request, 0),
    }
}

func (m *Matrix) Push(req *request.Request) {
    // Request cap at which the spider stops:
    //if m.MaxPage >= 0 {
    //	return
    //}

    // TODO: limit the queue length based on the Scheduler's status

    // TODO: deduplicate requests

    m.Requests = append(m.Requests, req)

    //atomic.AddInt64(&m.MaxPage, 1)
}

// Pull pops the oldest request, or returns nil when the queue is empty.
func (m *Matrix) Pull() (req *request.Request) {
    if len(m.Requests) == 0 {
        return nil
    }
    req = m.Requests[0]

    // TODO: if a proxy is in use, set it on the request here.

    m.Requests = m.Requests[1:]
    return
}
--------------------------------------------------------------------------------
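Matrix.Push and Matrix.Pull mutate the Requests slice without any locking, yet the crawler's run loop and its worker goroutines may touch the queue concurrently. Below is a self-contained sketch of a lock-guarded variant; the safeMatrix type and its embedded mutex are assumptions about one way the design could go, not the repo's current code.

package scheduler

import (
    "sync"

    "github.com/liunian1004/go-crawler/request"
)

// safeMatrix is a hypothetical lock-guarded request queue.
type safeMatrix struct {
    mu       sync.Mutex
    requests []*request.Request
}

func (m *safeMatrix) Push(req *request.Request) {
    m.mu.Lock()
    defer m.mu.Unlock()
    m.requests = append(m.requests, req)
}

// Pull pops the oldest request, or returns nil when the queue is empty.
func (m *safeMatrix) Pull() *request.Request {
    m.mu.Lock()
    defer m.mu.Unlock()
    if len(m.requests) == 0 {
        return nil
    }
    req := m.requests[0]
    m.requests = m.requests[1:]
    return req
}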
/scheduler/shceduler.go:
--------------------------------------------------------------------------------
package scheduler

import (
    "sync"
)

type Scheduler struct {
    Status      Status    // run state
    ThreadCount chan bool // concurrency limit (buffered channel used as a semaphore)
    UseProxy    bool      // whether to use a proxy
    //Proxy *Proxy
    Matrices []*Matrix
    sync.RWMutex
}

var GlobalScheduler = &Scheduler{
    Status:      RUN,
    ThreadCount: make(chan bool, 10),
    //Proxy
    Matrices: []*Matrix{},
}

func InsertGlobalSchedulerMatrix(spiderName string) *Matrix {
    matrix := NewMatrix(spiderName)
    // A write lock is required here: the append below mutates Matrices.
    GlobalScheduler.Lock()
    defer GlobalScheduler.Unlock()
    GlobalScheduler.Matrices = append(GlobalScheduler.Matrices, matrix)
    return matrix
}
--------------------------------------------------------------------------------
/scheduler/status.go:
--------------------------------------------------------------------------------
package scheduler

type Status int

// Run states. STOPPED starts at -1 so that STOP is the zero value.
const (
    STOPPED Status = iota - 1
    STOP
    RUN
    PAUSE
)
--------------------------------------------------------------------------------
/spider/rule.go:
--------------------------------------------------------------------------------
package spider

// NOTE: spider imports page while page imports spider, an import cycle the
// Go compiler rejects; one side will need to depend on an interface instead.
import "github.com/liunian1004/go-crawler/page"

// Rule is one node of the crawl rules.
type Rule struct {
    ParseFunc func(page *page.Page)
}

type RuleTree struct {
    // Root is the entry node; the Page passed to it is empty.
    Root  func(page *page.Page)
    Trunk map[string]*Rule
}
--------------------------------------------------------------------------------
/spider/spider.go:
--------------------------------------------------------------------------------
package spider

import (
    "sync"

    "github.com/liunian1004/go-crawler/page"
    "github.com/liunian1004/go-crawler/request"
    "github.com/liunian1004/go-crawler/scheduler"
)

type Spider struct {
    ID              int
    Name            string
    SubName         string
    Description     string
    UseCookie       bool
    NotDefaultField bool // suppress the default output fields Url/ParentUrl/DownloadTime

    RuleTree RuleTree

    Matrix *scheduler.Matrix // stores this spider's requests

    Status scheduler.Status // run state
    lock   sync.RWMutex
    once   sync.Once
}

var GlobalSpiders = map[string]*Spider{}

func (s *Spider) Register() {
    GlobalSpiders[s.Name] = s
}

// GetRule looks up the rule with the given name.
func (s *Spider) GetRule(ruleName string) (rule *Rule, r bool) {
    rule, r = s.RuleTree.Trunk[ruleName]
    return
}

func (s *Spider) Start() {
    s.RuleTree.Root(page.GetPage(s, nil))
    s.lock.Lock()
    s.Status = scheduler.RUN
    s.lock.Unlock()
}

func (s *Spider) Stop() {
    s.lock.Lock()
    s.Status = scheduler.STOP
    s.lock.Unlock()
}

// SpiderMatrixInit initializes this spider's request queue.
func (s *Spider) SpiderMatrixInit() *Spider {
    s.Matrix = scheduler.InsertGlobalSchedulerMatrix(s.Name)
    return s
}

// PushRequest adds a request to the queue.
func (s *Spider) PushRequest(request *request.Request) {
    s.Matrix.Push(request)
}

// PullRequest takes a request off the queue.
func (s *Spider) PullRequest() (request *request.Request) {
    return s.Matrix.Pull()
}

// SaveSuccess records a successful request (stub).
func (s *Spider) SaveSuccess() {

}

// SaveFailure records a failed request (stub).
func (s *Spider) SaveFailure() {

}
--------------------------------------------------------------------------------
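Nothing in the dump shows a spider with real rules attached. Below is a hedged sketch of what main.go could grow into once the page/spider import cycle noted in rule.go is resolved; the rule name "list", the Root callback seeding a single request, and the empty ParseFunc body are all assumptions, not the author's code.

package main

import (
    "github.com/liunian1004/go-crawler/crawler"
    "github.com/liunian1004/go-crawler/page"
    "github.com/liunian1004/go-crawler/request"
    "github.com/liunian1004/go-crawler/spider"
)

func main() {
    sp := &spider.Spider{
        Name: "Test",
        RuleTree: spider.RuleTree{
            // Root runs once at Start with an empty Page and seeds the queue.
            Root: func(p *page.Page) {
                p.Spider.PushRequest(&request.Request{Rule: "list"})
            },
            Trunk: map[string]*spider.Rule{
                "list": {
                    // ParseFunc would fill p.Items / p.Files from p.Response.
                    ParseFunc: func(p *page.Page) {},
                },
            },
        },
    }
    crawler.New(0).Init(sp).Run()
}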