├── .gitignore
├── README.md
├── config.yaml
├── config
│   └── fetcher.go
├── engine
│   ├── concurrent.go
│   └── simple.go
├── fetcher
│   └── fetcher.go
├── go.mod
├── main.go
├── model
│   ├── item.go
│   └── request.go
├── parser
│   ├── article.go
│   ├── forum.go
│   ├── section.go
│   └── utils.go
└── persist
    ├── file.go
    └── persist.go

/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Go template
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test binary, build with `go test -c`
*.test

# Output of the go coverage tool, specifically when used with LiteIDE
*.out
### Example user template template
### Example user template

# IntelliJ project files
.idea
*.iml
out
gen
go.sum
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# discuz-crawler

### Overview

An easily configurable, extensible crawler for discuz forum systems.

- [x] Parsing, data persistence, and scheduling are decoupled for easy extension (see the Saver sketch below)
- [x] Page content is extracted with configurable goquery (jQuery-like) selectors
- [x] Configurable request headers
- [ ] Keyword filtering
- [ ] Crawling with cookies
- [ ] Concurrent crawling
- [ ] Retry on failure

### Usage

1. Place the compiled binary and the configuration file **config.yaml** in the same directory.
2. Edit **config.yaml**:
   - **seed**
     - **url**: the initial (seed) page to crawl
     - **parser**: the parser for the seed page
       - One of **forum**, **section**, or **article**, corresponding to the home page, a board page, or an article page, so you can crawl every board, a single board, or a single article
   - **selector**: selectors whose syntax is almost identical to **jQuery**, used to locate the matching DOM elements on the crawled HTML pages and adapt the crawler to different **discuz** themes; **article** targets a td tag, the others target a tags
     - **section**: the "board" a tags on the home page
     - **sub_section**: the "sub-board" a tags on the home page
     - **next_page**: the "next page" a tag on a board page
     - **title**: the "article title" a tags on a board page
     - **article**: the "article content" td tag on an article page
   - **header**: the request headers
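### Extending

Because persistence sits behind the `persist.Saver` interface (`Init`, `Save`, `Close`), a new storage backend only needs to implement those three methods. A minimal sketch, assuming a hypothetical `ConsoleSaver` that is not part of this repo and simply prints items as JSON:

```go
package persist

import (
	"dicuz-crawler/model"
	"encoding/json"
	"log"
)

// ConsoleSaver is a hypothetical Saver that prints each item as JSON
// instead of writing it to a file.
type ConsoleSaver struct{}

func (s *ConsoleSaver) Init() {}

func (s *ConsoleSaver) Save(item model.Item) error {
	data, err := json.Marshal(item)
	if err != nil {
		return err
	}
	log.Println(string(data))
	return nil
}

func (s *ConsoleSaver) Close() {}
```

Passing `&ConsoleSaver{}` as the `Saver` field of `engine.Simple` (or `engine.Concurrent`) in main.go would be enough to switch the storage backend.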
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
# Seed page
seed:
  url: 'http://www.discuz.net/forum.php'
  # Parser (forum, section, article)
  parser: 'forum'
# Selectors
selector:
  # Board
  section: '.fl_g dt a'
  # Sub-board
  sub_section: '.fl_tb .fl_g dl dt:nth-child(1) a'
  # "Next page" button
  next_page: '.nxt'
  # Title
  title: '.s.xst'
  # Article
  article: 'div[id^=post_]:nth-child(3) td[id^=postmessage]'
# Request headers
header:
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'
  Accept-Language: 'zh-CN,zh;q=0.9,en;q=0.8'
  DNT: '1'
  Host: 'www.discuz.net'
  Referer: 'https://www.discuz.net/'
  User-Agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
--------------------------------------------------------------------------------
/config/fetcher.go:
--------------------------------------------------------------------------------
package config

import (
	"gopkg.in/yaml.v2"
	"io/ioutil"
	"log"
)

type CrawlerConfig struct {
	Seed struct {
		Url    string `yaml:"url"`
		Parser string `yaml:"parser"`
	}
	Selector struct {
		Section    string `yaml:"section"`
		SubSection string `yaml:"sub_section"`
		NextPage   string `yaml:"next_page"`
		Title      string `yaml:"title"`
		Article    string `yaml:"article"`
	}
	Header map[string]string `yaml:"header"`
}

var Crawler = CrawlerConfig{}

func init() {
	yamlFile, err := ioutil.ReadFile("config.yaml")
	if err != nil {
		log.Fatalf("failed to read config file config.yaml: %s", err)
	}
	err = yaml.Unmarshal(yamlFile, &Crawler)
	if err != nil {
		log.Fatalf("malformed yaml in config.yaml: %s", err)
	}
}
--------------------------------------------------------------------------------
/engine/concurrent.go:
--------------------------------------------------------------------------------
package engine

import (
	"dicuz-crawler/fetcher"
	"dicuz-crawler/model"
	"dicuz-crawler/persist"
	"log"
)

type Concurrent struct {
	Saver       persist.Saver
	WorkerCount int
}

func (e *Concurrent) Run(seeds ...model.Request) {
	in := make(chan model.Request)
	out := make(chan model.ParseResult)
	e.Saver.Init()

	for i := 0; i < e.WorkerCount; i++ {
		createWorker(in, out)
	}

	for _, request := range seeds {
		// Pass the request as an argument so each goroutine gets its own copy.
		go func(r model.Request) { in <- r }(request)
	}

	count := 0
	for {
		result := <-out
		//log.Printf("result:%v", result)
		for _, request := range result.Requests {
			log.Printf("request:%v", request)
			go func(r model.Request) { in <- r }(request)
		}
		for _, item := range result.Items {
			log.Printf("#%d-item: %+v", count, item)
			item, ok := item.(model.Item)
			if ok {
				err := e.Saver.Save(item)
				if err != nil {
					log.Printf("failed to save item %v: %s", item, err)
				}
			}
			count++
		}
	}

	// Note: never reached; the concurrent engine runs until the process exits.
	e.Saver.Close()
}

func Worker(request model.Request) (model.ParseResult, error) {
	doc, err := fetcher.Fetch(request.Url)
	if err != nil {
		return model.ParseResult{}, err
	}
	return request.ParseFunc(doc, request.Deliver), nil
}

func createWorker(in chan model.Request, out chan model.ParseResult) {
	go func() {
		for {
			request := <-in
			result, err := Worker(request)
			if err != nil {
				continue
			}
			out <- result
		}
	}()
}
--------------------------------------------------------------------------------
/engine/simple.go:
--------------------------------------------------------------------------------
package engine

import (
	"dicuz-crawler/fetcher"
	"dicuz-crawler/model"
	"dicuz-crawler/persist"
	"log"
)

type Simple struct {
	Saver persist.Saver
}

func (e Simple) Run(seeds ...model.Request) {
	var requests []model.Request
	requests = append(requests, seeds...)

	e.Saver.Init()

	count := 0
	for len(requests) > 0 {
		request := requests[0]
		requests = requests[1:]

		parseResult, err := e.worker(request)
		if err != nil {
			continue
		}
		requests = append(requests, parseResult.Requests...)

		for _, item := range parseResult.Items {
			log.Printf("#%d-item: %+v", count, item)
			item, ok := item.(model.Item)
			if ok {
				err := e.Saver.Save(item)
				if err != nil {
					log.Printf("failed to save item %v: %s", item, err)
				}
			}
			count++
		}
	}

	e.Saver.Close()
}

func (e Simple) worker(request model.Request) (model.ParseResult, error) {
	doc, err := fetcher.Fetch(request.Url)
	if err != nil {
		return model.ParseResult{}, err
	}
	return request.ParseFunc(doc, request.Deliver), nil
}
--------------------------------------------------------------------------------
/fetcher/fetcher.go:
--------------------------------------------------------------------------------
package fetcher

import (
	"bufio"
	"dicuz-crawler/config"
	"dicuz-crawler/parser"
	"errors"
	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html/charset"
	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
	"log"
	"net/http"
	"strconv"
)

func Fetch(url string) (*goquery.Document, error) {
	client := &http.Client{CheckRedirect: redirect}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("failed to build request: %s", err.Error())
		return nil, err
	}
	for k, v := range config.Crawler.Header {
		req.Header.Add(k, v)
	}
	//log.Printf("requesting url: %s", url)
	res, err := client.Do(req)
	if err != nil {
		log.Printf("request error: %s", err.Error())
		return nil, err
	}
	defer res.Body.Close()
	if res.StatusCode != 200 {
		log.Printf("status code: %d %s", res.StatusCode, res.Status)
		return nil, errors.New("status code: " + strconv.Itoa(res.StatusCode) + " " + res.Status)
	}
	// Sniff the charset and decode through the same buffered reader so the
	// bytes consumed by the peek are not lost.
	bodyReader := bufio.NewReader(res.Body)
	utf8Reader := transform.NewReader(bodyReader, DetermineEncoding(bodyReader).NewDecoder())
	doc, err := goquery.NewDocumentFromReader(utf8Reader)
	if err != nil {
		log.Printf("failed to parse DOM: %s", err.Error())
		return nil, err
	}
	return doc, nil
}

func redirect(req *http.Request, via []*http.Request) (e error) {
	// Only follow redirects that stay on the seed host.
	if req.URL.Host != parser.SeedUrlParse.Host {
		return errors.New("redirect to a different host: " + req.URL.String())
	}
	return nil
}

func DetermineEncoding(r *bufio.Reader) encoding.Encoding {
	bytes, err := r.Peek(1024)
	if err != nil {
		log.Printf("DetermineEncoding error: %s", err.Error())
		return unicode.UTF8
	}
	e, _, _ := charset.DetermineEncoding(bytes, "")
	return e
}
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
module dicuz-crawler

go 1.12

require (
	github.com/PuerkitoBio/goquery v1.5.0
	golang.org/x/net v0.0.0-20190424112056-4829fb13d2c6
	golang.org/x/text v0.3.2
	gopkg.in/yaml.v2 v2.2.2
)
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
package main

import (
	"dicuz-crawler/config"
	"dicuz-crawler/engine"
	"dicuz-crawler/model"
	"dicuz-crawler/parser"
	"dicuz-crawler/persist"
)

func main() {
	e := engine.Simple{
		Saver: &persist.FileSaver{},
	}
	//e := engine.Concurrent{
	//	Saver:       &persist.FileSaver{},
	//	WorkerCount: 1,
	//}
	e.Run(model.Request{
		Url:       config.Crawler.Seed.Url,
		ParseFunc: parser.StrToFuncOfParser(config.Crawler.Seed.Parser),
	})
}
--------------------------------------------------------------------------------
/model/item.go:
--------------------------------------------------------------------------------
package model

type Item struct {
	Id      string `json:"id"`
	Url     string `json:"url"`
	Section string `json:"section"`
	Title   string `json:"title"`
	Content string `json:"content"`
}
--------------------------------------------------------------------------------
/model/request.go:
--------------------------------------------------------------------------------
package model

import "github.com/PuerkitoBio/goquery"

type Request struct {
	Url       string
	ParseFunc func(*goquery.Document, Item) ParseResult
	Deliver   Item
}

type ParseResult struct {
	Requests []Request
	Items    []interface{}
}
--------------------------------------------------------------------------------
/parser/article.go:
--------------------------------------------------------------------------------
package parser

import (
	"dicuz-crawler/config"
	"dicuz-crawler/model"
	"github.com/PuerkitoBio/goquery"
	"strings"
)

func ParseArticle(doc *goquery.Document, item model.Item) model.ParseResult {
	parseResult := model.ParseResult{}
	doc.Find(config.Crawler.Selector.Article).Each(func(i int, selection *goquery.Selection) {
		content, _ := selection.Html()
		content = strings.Replace(content, "\n", "", -1)
		//log.Printf("content: %s", content)
		item.Content = content
		parseResult.Items = append(parseResult.Items, item)
	})
	return parseResult
}
--------------------------------------------------------------------------------
/parser/forum.go:
--------------------------------------------------------------------------------
package parser

import (
	"dicuz-crawler/config"
	"dicuz-crawler/model"
	"github.com/PuerkitoBio/goquery"
)

func ParseForum(doc *goquery.Document, _ model.Item) model.ParseResult {
	parseResult := model.ParseResult{}
	doc.Find(config.Crawler.Selector.Section).Each(func(i int, selection *goquery.Selection) {
		url, _ := selection.Attr("href")
		url, _ = RelativeToAbsoluteOfUrl(url)
		content := selection.Text()
		//log.Printf("url: %s, title: %s", url, content)
		parseResult.Items = append(parseResult.Items, content)
		parseResult.Requests = append(parseResult.Requests, model.Request{
			Url:       url,
			ParseFunc: ParseSection,
			Deliver: model.Item{
				Section: content,
			},
		})
	})
	doc.Find(config.Crawler.Selector.SubSection).Each(func(i int, selection *goquery.Selection) {
		url, _ := selection.Attr("href")
		url, _ = RelativeToAbsoluteOfUrl(url)
		content := selection.Text()
		//log.Printf("url: %s, title: %s", url, content)
		parseResult.Items = append(parseResult.Items, content)
		parseResult.Requests = append(parseResult.Requests, model.Request{
			Url:       url,
			ParseFunc: ParseSection,
			Deliver: model.Item{
				Section: content,
			},
		})
	})
	return parseResult
}
--------------------------------------------------------------------------------
/parser/section.go:
--------------------------------------------------------------------------------
package parser

import (
	"dicuz-crawler/config"
	"dicuz-crawler/model"
	"github.com/PuerkitoBio/goquery"
	"regexp"
)

var IdRe0 = regexp.MustCompile(`/thread-([\d]+)-[\d]+-[\d]+\.html`)
var IdRe1 = regexp.MustCompile(`tid=([\d]+)&`)

func ParseSection(doc *goquery.Document, item model.Item) model.ParseResult {
	parseResult := model.ParseResult{}
	doc.Find(config.Crawler.Selector.Title).Each(func(i int, selection *goquery.Selection) {
		content, _ := selection.Html()
		url, _ := selection.Attr("href")
		//log.Printf("url: %s, content: %s", url, content)
		url, _ = RelativeToAbsoluteOfUrl(url)
		item.Title = content
		item.Url = url
		// Extract the thread id, first from static-style urls, then from query-style urls.
		match := IdRe0.FindSubmatch([]byte(url))
		var matchResult string
		if len(match) >= 2 {
			matchResult = string(match[1])
			item.Id = matchResult
		}
		if len(matchResult) == 0 {
			match = IdRe1.FindSubmatch([]byte(url))
			if len(match) >= 2 {
				matchResult = string(match[1])
				item.Id = matchResult
			} else {
				item.Id = "-"
			}
		}
		parseResult.Items = append(parseResult.Items, content)
		parseResult.Requests = append(parseResult.Requests, model.Request{
			Url:       url,
			ParseFunc: ParseArticle,
			Deliver:   item,
		})
	})
	// Follow only the first "next page" link on the page.
	count := 0
	doc.Find(config.Crawler.Selector.NextPage).Each(func(i int, selection *goquery.Selection) {
		if count > 0 {
			return
		}
		url, _ := selection.Attr("href")
		content := selection.Text()
		url, _ = RelativeToAbsoluteOfUrl(url)
		//log.Printf("url: %s, title: %s", url, content)
		parseResult.Items = append(parseResult.Items, content)
		parseResult.Requests = append(parseResult.Requests, model.Request{
			Url:       url,
			ParseFunc: ParseSection,
			Deliver:   item,
		})
		count++
	})
	return parseResult
}
--------------------------------------------------------------------------------
/parser/utils.go:
--------------------------------------------------------------------------------
package parser

import (
	"dicuz-crawler/config"
	"dicuz-crawler/model"
	"errors"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"net/url"
)

var seedUrl string
var SeedUrlParse *url.URL

func init() {
	seedUrl = config.Crawler.Seed.Url
	SeedUrlParse, _ = url.Parse(seedUrl)
}

// RelativeToAbsoluteOfUrl resolves a relative url against the seed url's scheme and host.
func RelativeToAbsoluteOfUrl(relativeUrl string) (absoluteUrl string, err error) {
	u, err := url.Parse(relativeUrl)
	if err != nil {
		return relativeUrl, errors.New("relativeUrl: " + err.Error())
	}
	if u.Scheme == "http" || u.Scheme == "https" {
		// Already an absolute url, return it unchanged.
		return relativeUrl, nil
	}
	return fmt.Sprintf("%s://%s/%s", SeedUrlParse.Scheme, SeedUrlParse.Host, relativeUrl), nil
}

// StrToFuncOfParser maps a parser name from the config to its parse function.
func StrToFuncOfParser(parserStr string) func(*goquery.Document, model.Item) model.ParseResult {
	switch parserStr {
	case "forum":
		return ParseForum
	case "section":
		return ParseSection
	case "article":
		return ParseArticle
	default:
		return ParseForum
	}
}
--------------------------------------------------------------------------------
/persist/file.go:
--------------------------------------------------------------------------------
package persist

import (
	"bytes"
	"dicuz-crawler/model"
	"encoding/json"
	"log"
	"os"
	"strconv"
	"time"
)

type FileSaver struct {
	File *os.File
}

func (f *FileSaver) Init() {
	fileName := "save" + strconv.FormatInt(time.Now().Unix(), 10) + ".txt"
	var err error
	f.File, err = os.OpenFile(fileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		log.Fatalf("failed to create (open) file %s: %s", fileName, err)
	}
}

func (f *FileSaver) Save(item model.Item) error {
	buffer := &bytes.Buffer{}
	encoder := json.NewEncoder(buffer)
	encoder.SetEscapeHTML(false) // do not escape HTML in the saved content
	if err := encoder.Encode(item); err != nil {
		return err
	}
	_, err := f.File.Write(buffer.Bytes())
	return err
}

func (f *FileSaver) Close() {
	f.File.Close()
}
--------------------------------------------------------------------------------
/persist/persist.go:
--------------------------------------------------------------------------------
package persist

import "dicuz-crawler/model"

type Saver interface {
	Init()
	Save(item model.Item) error
	Close()
}
--------------------------------------------------------------------------------