├── crawler
│   └── crawler.go
├── download
│   └── download.go
├── main.go
├── page
│   └── page.go
├── pipeline
│   ├── collector.go
│   ├── data.go
│   └── pipeline.go
├── request
│   └── request.go
├── scheduler
│   ├── matrix.go
│   ├── shceduler.go
│   └── status.go
└── spider
    ├── rule.go
    └── spider.go

/crawler/crawler.go:
--------------------------------------------------------------------------------
package crawler

import (
    "time"

    "github.com/liunian1004/go-crawler/download"
    page2 "github.com/liunian1004/go-crawler/page"
    "github.com/liunian1004/go-crawler/pipeline"
    "github.com/liunian1004/go-crawler/request"
    "github.com/liunian1004/go-crawler/spider"
)

// CrawlerInterface is the collector: it drives one spider end to end.
type CrawlerInterface interface {
    Init(spider *spider.Spider) CrawlerInterface // initialize
    Run()
    Stop()
    GetID() int
    CanStop() bool
}

// Crawler coordinates the modules at an abstract level.
type Crawler struct {
    Spider   *spider.Spider      // the spider to run
    Download download.Downloader // downloader
    Pipeline pipeline.Pipeliner  // data output
    ID       int                 // collector ID
}

func New(id int) CrawlerInterface {
    return &Crawler{
        ID:       id,
        Download: download.EasyDownload{},
    }
}

func (c *Crawler) Init(spider *spider.Spider) CrawlerInterface {
    // TODO: initialize the spider's scheduler

    // TODO: initialize the Pipeline (see the sketch after this file)

    return c
}

func (c *Crawler) Run() {
    // Start collecting output.
    c.Pipeline.Start()

    t := make(chan bool)

    go func() {
        c.run()
        close(t)
    }()

    c.Spider.Start()

    // Block until the crawl loop exits.
    <-t

    c.Pipeline.Stop()
}

func (c *Crawler) run() {
    for {
        // Take one request off the queue.
        req := c.GetOneRequest()
        if req == nil {
            // Is the task allowed to stop yet?
            //if c.Spider.CanStop() {
            //	break
            //}
            time.Sleep(20 * time.Millisecond)
            continue
        }

        // Execute the request. Acquiring a slot limits the current
        // concurrency; the call blocks until a slot is free.
        c.UseOneThread()
        go func() {
            defer c.ReleaseOneThread()
            // Slot acquired: run the crawl.
            c.Process(req)
        }()

        // Back off briefly between requests.
        time.Sleep(20 * time.Millisecond)
    }
}

func (c *Crawler) Process(req *request.Request) {
    // the URL being downloaded

    // the spider doing the downloading
    spider1 := c.Spider

    // Fetch the page with the downloader.
    page := c.Download.Download(spider1, req)

    // Return the Page to the pool on every exit path, including the
    // early error returns below.
    defer page2.ReleasePage(page)

    // Parse the fetched page with its rule (every Request carries a rule name).
    page.Parse(req.GetRuleName())

    err := c.Pipeline.CollectItems(page.GetItems())
    if err != nil {
        return
    }
    err = c.Pipeline.CollectFiles(page.GetFiles())
    if err != nil {
        return
    }

    // TODO: record the successful request; decide whether failed ones are
    // re-queued at the tail.

    // TODO: count successfully crawled pages.
}

func (*Crawler) Stop() {
    panic("implement me")
}

func (c *Crawler) GetID() int {
    return c.ID
}

func (c *Crawler) SetID(id int) {
    c.ID = id
}

func (c *Crawler) CanStop() bool {
    panic("implement me")
}

// GetOneRequest pulls the next request from the queue (stub).
func (c *Crawler) GetOneRequest() *request.Request {
    return nil
}

// UseOneThread acquires a concurrency slot (stub).
func (c *Crawler) UseOneThread() {
}

// ReleaseOneThread releases a concurrency slot (stub).
func (c *Crawler) ReleaseOneThread() {
}

// Helper methods
--------------------------------------------------------------------------------
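Init, GetOneRequest, UseOneThread, and ReleaseOneThread are left as stubs in crawler.go above. Below is a minimal sketch of how they could be wired to pieces that already exist in this repo: spider.SpiderMatrixInit for the request queue, a pipeline.Collector for output, and the scheduler's buffered ThreadCount channel as a concurrency semaphore. The channel buffer sizes (64) and the reuse of GlobalScheduler are assumptions, not the author's design.

// Sketch only: possible bodies for the stubs above, assuming crawler.go
// additionally imports "github.com/liunian1004/go-crawler/scheduler".
func (c *Crawler) Init(sp *spider.Spider) CrawlerInterface {
    // Register a request matrix for this spider with the global scheduler.
    c.Spider = sp.SpiderMatrixInit()
    // A *Collector satisfies the Pipeliner interface.
    c.Pipeline = &pipeline.Collector{
        ItemChan: make(chan pipeline.Item, 64), // buffer size is an assumption
        FileChan: make(chan pipeline.File, 64),
    }
    return c
}

// Pull the next request from the spider's matrix; nil means the queue is empty.
func (c *Crawler) GetOneRequest() *request.Request {
    return c.Spider.PullRequest()
}

// Acquire one concurrency slot; blocks once the channel's capacity (10) is used up.
func (c *Crawler) UseOneThread() { scheduler.GlobalScheduler.ThreadCount <- true }

// Give the slot back so another worker can run.
func (c *Crawler) ReleaseOneThread() { <-scheduler.GlobalScheduler.ThreadCount }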
"github.com/liunian1004/go-crawler/pipeline" 7 | "github.com/liunian1004/go-crawler/request" 8 | ) 9 | 10 | type Downloader interface{ 11 | Download(spider *spider.Spider, request *request.Request) *page.Page // 根据抽象规则和请求对象下载页面 12 | } 13 | 14 | type EasyDownload struct { 15 | } 16 | 17 | func (EasyDownload) Download(spider *spider.Spider, request *request.Request) *page.Page { 18 | return &page.Page{ 19 | Spider: spider, 20 | Items: []pipeline.Item{ 21 | map[string]interface{}{ "1": 1}, 22 | }, 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "runtime" 5 | "github.com/liunian1004/go-crawler/crawler" 6 | "github.com/liunian1004/go-crawler/spider" 7 | ) 8 | 9 | func init() { 10 | runtime.GOMAXPROCS(runtime.NumCPU()) 11 | } 12 | 13 | func main() { 14 | c := crawler.Crawler{} 15 | c.Init(&spider.Spider{ 16 | Name: "Test", 17 | }).Run() 18 | } -------------------------------------------------------------------------------- /page/page.go: -------------------------------------------------------------------------------- 1 | package page 2 | 3 | import ( 4 | "github.com/liunian1004/go-crawler/pipeline" 5 | "net/http" 6 | "github.com/liunian1004/go-crawler/request" 7 | "github.com/liunian1004/go-crawler/spider" 8 | "sync" 9 | ) 10 | 11 | type Pager interface{ 12 | Parse(ruleName string) // 解析返回对象中的文件和数据 13 | GetItems() 14 | GetFiles() 15 | } 16 | 17 | type Page struct { 18 | Spider *spider.Spider // 规则 19 | 20 | Request *request.Request // 请求 21 | Response *http.Response // 响应流 22 | 23 | Items []pipeline.Item 24 | Files []pipeline.File 25 | } 26 | 27 | var pagePool = &sync.Pool{ 28 | New: func() interface{} { 29 | return &Page{ 30 | Items: []pipeline.Item{}, 31 | Files: []pipeline.File{}, 32 | } 33 | }, 34 | } 35 | 36 | func GetPage(s *spider.Spider, request *request.Request) *Page { 37 | page := pagePool.Get().(*Page) 38 | page.Spider = s 39 | page.Request = request 40 | return page 41 | } 42 | 43 | func ReleasePage(page *Page) { 44 | if page.Response != nil { 45 | page.Response.Body.Close() 46 | page.Response = nil 47 | } 48 | page.Request = nil 49 | page.Spider = nil 50 | page.Items = page.Items[:0] 51 | page.Files = page.Files[:0] 52 | // 后续还要增加释放的资源 53 | pagePool.Put(page) 54 | } 55 | 56 | func (p *Page) GetItems() []pipeline.Item { 57 | return p.Items 58 | } 59 | 60 | func (p *Page) GetFiles() []pipeline.File { 61 | return p.Files 62 | } 63 | 64 | func (p *Page) Parse(ruleName string) *Page { 65 | // 根据 rule 将数据解析到 Items 和 Files 66 | // get Rule 67 | rule := p.getRule(ruleName) 68 | 69 | if rule.ParseFunc == nil { 70 | panic("解析函数不存在") 71 | } 72 | 73 | // 解析数据 74 | rule.ParseFunc(p) 75 | 76 | return p 77 | } 78 | 79 | func (p *Page) getRule(ruleName string) (rule *spider.Rule) { 80 | rule, b := p.Spider.GetRule(ruleName) 81 | if b == false { 82 | panic("获取规则失败") 83 | } 84 | return 85 | } -------------------------------------------------------------------------------- /pipeline/collector.go: -------------------------------------------------------------------------------- 1 | package pipeline 2 | 3 | // 数据收集器,实现 Pipeline 接口 4 | type Collector struct { 5 | ItemChan chan Item 6 | FileChan chan File 7 | 8 | ItemCache []Item // 分配输出结果的缓存 9 | 10 | outputType string // 输出方式 11 | 12 | ItemCount int 13 | ItemOutputTimes int 14 | FileCount int 15 | 16 | Total int // 收集的数据总数 17 | } 18 | 19 | func (c *Collector) Start() { 20 | go 
/pipeline/collector.go:
--------------------------------------------------------------------------------
package pipeline

// Collector gathers crawled data; it implements the Pipeliner interface.
type Collector struct {
    ItemChan chan Item
    FileChan chan File

    ItemCache []Item // buffer for batching output

    outputType string // output method

    ItemCount       int
    ItemOutputTimes int
    FileCount       int

    Total int // total number of items collected
}

func (c *Collector) Start() {
    go func() {
        // Drain the queue until the channel is closed.
        for data := range c.ItemChan {
            c.ItemCache = append(c.ItemCache, data)

            // Count the item.
            c.ItemCount++

            // Flush when a batch is ready, or on a timeout
            // (not implemented yet).
            if len(c.ItemCache) < 1 {
                continue
            }

            c.ItemOutputTimes++
            // Output; this call blocks.
            c.outputData()
        }
        // Reached once Stop closes ItemChan: flush whatever is left.
        c.ItemOutputTimes++
        c.outputData()
    }()
}

func (c *Collector) Stop() {
    panic("implement me")
}

func (c *Collector) CollectFiles(files []File) error {
    for _, file := range files {
        // Write to the channel.
        c.FileChan <- file
    }
    return nil
}

func (c *Collector) CollectItems(items []Item) error {
    for _, item := range items {
        c.ItemChan <- item
    }
    return nil
}

func (c *Collector) outputData() {
    // TODO: write the data out.
}
--------------------------------------------------------------------------------
/pipeline/data.go:
--------------------------------------------------------------------------------
package pipeline

type Item map[string]interface{}

// The full stored file name is: file/"Dir"/"RuleName"/"time"/"Name"
type File map[string]interface{}
--------------------------------------------------------------------------------
/pipeline/pipeline.go:
--------------------------------------------------------------------------------
package pipeline

type Pipeliner interface {
    Start() // start collecting
    Stop()  // stop collecting
    CollectFiles(files []File) error
    CollectItems(items []Item) error
}
--------------------------------------------------------------------------------
/request/request.go:
--------------------------------------------------------------------------------
package request

type Request struct {
    Rule string // name of the rule used to parse the response
}

func (r *Request) GetRuleName() string {
    return r.Rule
}
--------------------------------------------------------------------------------
/scheduler/matrix.go:
--------------------------------------------------------------------------------
package scheduler

import (
    "github.com/liunian1004/go-crawler/request"
)

// Matrix is one spider's request queue.
type Matrix struct {
    SpiderName string
    //MaxPage int64 // maximum number of pages to crawl
    Requests []*request.Request // request queue
}

func NewMatrix(spiderName string) *Matrix {
    return &Matrix{
        SpiderName: spiderName,
        //MaxPage: int64(maxPage),
        Requests: make([]*request.Request, 0),
    }
}

func (m *Matrix) Push(req *request.Request) {
    // Request cap at which the spider stops:
    //if m.MaxPage >= 0 {
    //	return
    //}

    // TODO: limit the queue length based on the Scheduler's status

    // TODO: deduplicate requests

    m.Requests = append(m.Requests, req)

    //atomic.AddInt64(&m.MaxPage, 1)
}

// Pull pops the oldest request, or returns nil when the queue is empty.
func (m *Matrix) Pull() (req *request.Request) {
    if len(m.Requests) == 0 {
        return nil
    }
    req = m.Requests[0]

    // TODO: if a proxy is in use, set it on the request here.

    m.Requests = m.Requests[1:]
    return
}
--------------------------------------------------------------------------------
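Matrix.Push and Matrix.Pull mutate the Requests slice without any locking, yet the crawler's run loop and its worker goroutines may touch the queue concurrently. Below is a self-contained sketch of a lock-guarded variant; the safeMatrix type and its embedded mutex are assumptions about one way the design could go, not the repo's current code.

package scheduler

import (
    "sync"

    "github.com/liunian1004/go-crawler/request"
)

// safeMatrix is a hypothetical lock-guarded request queue.
type safeMatrix struct {
    mu       sync.Mutex
    requests []*request.Request
}

func (m *safeMatrix) Push(req *request.Request) {
    m.mu.Lock()
    defer m.mu.Unlock()
    m.requests = append(m.requests, req)
}

// Pull pops the oldest request, or returns nil when the queue is empty.
func (m *safeMatrix) Pull() *request.Request {
    m.mu.Lock()
    defer m.mu.Unlock()
    if len(m.requests) == 0 {
        return nil
    }
    req := m.requests[0]
    m.requests = m.requests[1:]
    return req
}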
/scheduler/shceduler.go:
--------------------------------------------------------------------------------
package scheduler

import (
    "sync"
)

type Scheduler struct {
    Status      Status    // run state
    ThreadCount chan bool // concurrency limit (buffered channel used as a semaphore)
    UseProxy    bool      // whether to use a proxy
    //Proxy *Proxy
    Matrices []*Matrix
    sync.RWMutex
}

var GlobalScheduler = &Scheduler{
    Status:      RUN,
    ThreadCount: make(chan bool, 10),
    //Proxy
    Matrices: []*Matrix{},
}

func InsertGlobalSchedulerMatrix(spiderName string) *Matrix {
    matrix := NewMatrix(spiderName)
    // A write lock is required here: the append below mutates Matrices.
    GlobalScheduler.Lock()
    defer GlobalScheduler.Unlock()
    GlobalScheduler.Matrices = append(GlobalScheduler.Matrices, matrix)
    return matrix
}
--------------------------------------------------------------------------------
/scheduler/status.go:
--------------------------------------------------------------------------------
package scheduler

type Status int

// Run states. STOPPED starts at -1 so that STOP is the zero value.
const (
    STOPPED Status = iota - 1
    STOP
    RUN
    PAUSE
)
--------------------------------------------------------------------------------
/spider/rule.go:
--------------------------------------------------------------------------------
package spider

// NOTE: spider imports page while page imports spider, an import cycle the
// Go compiler rejects; one side will need to depend on an interface instead.
import "github.com/liunian1004/go-crawler/page"

// Rule is one node of the crawl rules.
type Rule struct {
    ParseFunc func(page *page.Page)
}

type RuleTree struct {
    // Root is the entry node; the Page passed to it is empty.
    Root  func(page *page.Page)
    Trunk map[string]*Rule
}
--------------------------------------------------------------------------------
/spider/spider.go:
--------------------------------------------------------------------------------
package spider

import (
    "sync"

    "github.com/liunian1004/go-crawler/page"
    "github.com/liunian1004/go-crawler/request"
    "github.com/liunian1004/go-crawler/scheduler"
)

type Spider struct {
    ID              int
    Name            string
    SubName         string
    Description     string
    UseCookie       bool
    NotDefaultField bool // suppress the default output fields Url/ParentUrl/DownloadTime

    RuleTree RuleTree

    Matrix *scheduler.Matrix // stores this spider's requests

    Status scheduler.Status // run state
    lock   sync.RWMutex
    once   sync.Once
}

var GlobalSpiders = map[string]*Spider{}

func (s *Spider) Register() {
    GlobalSpiders[s.Name] = s
}

// GetRule looks up the rule with the given name.
func (s *Spider) GetRule(ruleName string) (rule *Rule, r bool) {
    rule, r = s.RuleTree.Trunk[ruleName]
    return
}

func (s *Spider) Start() {
    s.RuleTree.Root(page.GetPage(s, nil))
    s.lock.Lock()
    s.Status = scheduler.RUN
    s.lock.Unlock()
}

func (s *Spider) Stop() {
    s.lock.Lock()
    s.Status = scheduler.STOP
    s.lock.Unlock()
}

// SpiderMatrixInit initializes this spider's request queue.
func (s *Spider) SpiderMatrixInit() *Spider {
    s.Matrix = scheduler.InsertGlobalSchedulerMatrix(s.Name)
    return s
}

// PushRequest adds a request to the queue.
func (s *Spider) PushRequest(request *request.Request) {
    s.Matrix.Push(request)
}

// PullRequest takes a request off the queue.
func (s *Spider) PullRequest() (request *request.Request) {
    return s.Matrix.Pull()
}

// SaveSuccess records a successful request (stub).
func (s *Spider) SaveSuccess() {

}

// SaveFailure records a failed request (stub).
func (s *Spider) SaveFailure() {

}
--------------------------------------------------------------------------------
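Nothing in the dump shows a spider with real rules attached. Below is a hedged sketch of what main.go could grow into once the page/spider import cycle noted in rule.go is resolved; the rule name "list", the Root callback seeding a single request, and the empty ParseFunc body are all assumptions, not the author's code.

package main

import (
    "github.com/liunian1004/go-crawler/crawler"
    "github.com/liunian1004/go-crawler/page"
    "github.com/liunian1004/go-crawler/request"
    "github.com/liunian1004/go-crawler/spider"
)

func main() {
    sp := &spider.Spider{
        Name: "Test",
        RuleTree: spider.RuleTree{
            // Root runs once at Start with an empty Page and seeds the queue.
            Root: func(p *page.Page) {
                p.Spider.PushRequest(&request.Request{Rule: "list"})
            },
            Trunk: map[string]*spider.Rule{
                "list": {
                    // ParseFunc would fill p.Items / p.Files from p.Response.
                    ParseFunc: func(p *page.Page) {},
                },
            },
        },
    }
    crawler.New(0).Init(sp).Run()
}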