├── .gitignore
├── README.md
├── config.yaml
├── config
│   └── fetcher.go
├── engine
│   ├── concurrent.go
│   └── simple.go
├── fetcher
│   └── fetcher.go
├── go.mod
├── main.go
├── model
│   ├── item.go
│   └── request.go
├── parser
│   ├── article.go
│   ├── forum.go
│   ├── section.go
│   └── utils.go
└── persist
    ├── file.go
    └── persist.go

/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Go template
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test binary, build with `go test -c`
*.test

# Output of the go coverage tool, specifically when used with LiteIDE
*.out
### Example user template template
### Example user template

# IntelliJ project files
.idea
*.iml
out
gen
go.sum
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# discuz-crawler

### Overview

An easily configurable, extensible crawler for discuz forum systems.

- [x] Parsing, data persistence, and scheduling are decoupled for easy extension (see the Saver sketch below)
- [x] Page content is extracted with configurable goquery (jQuery-like) selectors
- [x] Configurable request headers
- [ ] Keyword filtering
- [ ] Crawling with cookies
- [ ] Concurrent crawling
- [ ] Retry on failure

### Usage

1. Place the compiled binary and the configuration file **config.yaml** in the same directory.
2. Edit **config.yaml**:
   - **seed**
     - **url**: the initial (seed) page to crawl
     - **parser**: the parser for the seed page
       - One of **forum**, **section**, or **article**, corresponding to the home page, a board page, or an article page, so you can crawl every board, a single board, or a single article
   - **selector**: selectors whose syntax is almost identical to **jQuery**, used to locate the matching DOM elements on the crawled HTML pages and adapt the crawler to different **discuz** themes; **article** targets a td tag, the others target a tags
     - **section**: the "board" a tags on the home page
     - **sub_section**: the "sub-board" a tags on the home page
     - **next_page**: the "next page" a tag on a board page
     - **title**: the "article title" a tags on a board page
     - **article**: the "article content" td tag on an article page
   - **header**: the request headers
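### Extending

Because persistence sits behind the `persist.Saver` interface (`Init`, `Save`, `Close`), a new storage backend only needs to implement those three methods. A minimal sketch, assuming a hypothetical `ConsoleSaver` that is not part of this repo and simply prints items as JSON:

```go
package persist

import (
	"dicuz-crawler/model"
	"encoding/json"
	"log"
)

// ConsoleSaver is a hypothetical Saver that prints each item as JSON
// instead of writing it to a file.
type ConsoleSaver struct{}

func (s *ConsoleSaver) Init() {}

func (s *ConsoleSaver) Save(item model.Item) error {
	data, err := json.Marshal(item)
	if err != nil {
		return err
	}
	log.Println(string(data))
	return nil
}

func (s *ConsoleSaver) Close() {}
```

Passing `&ConsoleSaver{}` as the `Saver` field of `engine.Simple` (or `engine.Concurrent`) in main.go would be enough to switch the storage backend.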
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
# Seed page
seed:
  url: 'http://www.discuz.net/forum.php'
  # Parser (forum, section, article)
  parser: 'forum'
# Selectors
selector:
  # Board
  section: '.fl_g dt a'
  # Sub-board
  sub_section: '.fl_tb .fl_g dl dt:nth-child(1) a'
  # "Next page" button
  next_page: '.nxt'
  # Title
  title: '.s.xst'
  # Article
  article: 'div[id^=post_]:nth-child(3) td[id^=postmessage]'
# Request headers
header:
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'
  Accept-Language: 'zh-CN,zh;q=0.9,en;q=0.8'
  DNT: '1'
  Host: 'www.discuz.net'
  Referer: 'https://www.discuz.net/'
  User-Agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
--------------------------------------------------------------------------------
/config/fetcher.go:
--------------------------------------------------------------------------------
package config

import (
	"gopkg.in/yaml.v2"
	"io/ioutil"
	"log"
)

type CrawlerConfig struct {
	Seed struct {
		Url    string `yaml:"url"`
		Parser string `yaml:"parser"`
	}
	Selector struct {
		Section    string `yaml:"section"`
		SubSection string `yaml:"sub_section"`
		NextPage   string `yaml:"next_page"`
		Title      string `yaml:"title"`
		Article    string `yaml:"article"`
	}
	Header map[string]string `yaml:"header"`
}

var Crawler = CrawlerConfig{}

func init() {
	yamlFile, err := ioutil.ReadFile("config.yaml")
	if err != nil {
		log.Fatalf("failed to read config file config.yaml: %s", err)
	}
	err = yaml.Unmarshal(yamlFile, &Crawler)
	if err != nil {
		log.Fatalf("malformed yaml in config.yaml: %s", err)
	}
}
--------------------------------------------------------------------------------
/engine/concurrent.go:
--------------------------------------------------------------------------------
package engine

import (
	"dicuz-crawler/fetcher"
	"dicuz-crawler/model"
	"dicuz-crawler/persist"
	"log"
)

type Concurrent struct {
	Saver       persist.Saver
	WorkerCount int
}

func (e *Concurrent) Run(seeds ...model.Request) {
	in := make(chan model.Request)
	out := make(chan model.ParseResult)
	e.Saver.Init()

	for i := 0; i < e.WorkerCount; i++ {
		createWorker(in, out)
	}

	for _, request := range seeds {
		// Pass the request as an argument so each goroutine gets its own copy.
		go func(r model.Request) { in <- r }(request)
	}

	count := 0
	for {
		result := <-out
		//log.Printf("result:%v", result)
		for _, request := range result.Requests {
			log.Printf("request:%v", request)
			go func(r model.Request) { in <- r }(request)
		}
		for _, item := range result.Items {
			log.Printf("#%d-item: %+v", count, item)
			item, ok := item.(model.Item)
			if ok {
				err := e.Saver.Save(item)
				if err != nil {
					log.Printf("failed to save item %v: %s", item, err)
				}
			}
			count++
		}
	}

	// Note: never reached; the concurrent engine runs until the process exits.
	e.Saver.Close()
}

func Worker(request model.Request) (model.ParseResult, error) {
	doc, err := fetcher.Fetch(request.Url)
	if err != nil {
		return model.ParseResult{}, err
	}
	return request.ParseFunc(doc, request.Deliver), nil
}

func createWorker(in chan model.Request, out chan model.ParseResult) {
	go func() {
		for {
			request := <-in
			result, err := Worker(request)
			if err != nil {
				continue
			}
			out <- result
		}
	}()
}
--------------------------------------------------------------------------------
/engine/simple.go:
--------------------------------------------------------------------------------
package engine

import (
	"dicuz-crawler/fetcher"
	"dicuz-crawler/model"
	"dicuz-crawler/persist"
	"log"
)

type Simple struct {
	Saver persist.Saver
}

func (e Simple) Run(seeds ...model.Request) {
	var requests []model.Request
	requests = append(requests, seeds...)

	e.Saver.Init()

	count := 0
	for len(requests) > 0 {
		request := requests[0]
		requests = requests[1:]

		parseResult, err := e.worker(request)
		if err != nil {
			continue
		}
		requests = append(requests, parseResult.Requests...)

		for _, item := range parseResult.Items {
			log.Printf("#%d-item: %+v", count, item)
			item, ok := item.(model.Item)
			if ok {
				err := e.Saver.Save(item)
				if err != nil {
					log.Printf("failed to save item %v: %s", item, err)
				}
			}
			count++
		}
	}

	e.Saver.Close()
}

func (e Simple) worker(request model.Request) (model.ParseResult, error) {
	doc, err := fetcher.Fetch(request.Url)
	if err != nil {
		return model.ParseResult{}, err
	}
	return request.ParseFunc(doc, request.Deliver), nil
}
--------------------------------------------------------------------------------
/fetcher/fetcher.go:
--------------------------------------------------------------------------------
package fetcher

import (
	"bufio"
	"dicuz-crawler/config"
	"dicuz-crawler/parser"
	"errors"
	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html/charset"
	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
	"log"
	"net/http"
	"strconv"
)

func Fetch(url string) (*goquery.Document, error) {
	client := &http.Client{CheckRedirect: redirect}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("failed to build request: %s", err.Error())
		return nil, err
	}
	for k, v := range config.Crawler.Header {
		req.Header.Add(k, v)
	}
	//log.Printf("requesting url: %s", url)
	res, err := client.Do(req)
	if err != nil {
		log.Printf("request error: %s", err.Error())
		return nil, err
	}
	defer res.Body.Close()
	if res.StatusCode != 200 {
		log.Printf("status code: %d %s", res.StatusCode, res.Status)
		return nil, errors.New("status code: " + strconv.Itoa(res.StatusCode) + " " + res.Status)
	}
	// Sniff the charset and decode through the same buffered reader so the
	// bytes consumed by the peek are not lost.
	bodyReader := bufio.NewReader(res.Body)
	utf8Reader := transform.NewReader(bodyReader, DetermineEncoding(bodyReader).NewDecoder())
	doc, err := goquery.NewDocumentFromReader(utf8Reader)
	if err != nil {
		log.Printf("failed to parse DOM: %s", err.Error())
		return nil, err
	}
	return doc, nil
}

func redirect(req *http.Request, via []*http.Request) (e error) {
	// Only follow redirects that stay on the seed host.
	if req.URL.Host != parser.SeedUrlParse.Host {
		return errors.New("redirect to a different host: " + req.URL.String())
	}
	return nil
}

func DetermineEncoding(r *bufio.Reader) encoding.Encoding {
	bytes, err := r.Peek(1024)
	if err != nil {
		log.Printf("DetermineEncoding error: %s", err.Error())
		return unicode.UTF8
	}
	e, _, _ := charset.DetermineEncoding(bytes, "")
	return e
}
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
module dicuz-crawler

go 1.12

require (
	github.com/PuerkitoBio/goquery v1.5.0
	golang.org/x/net v0.0.0-20190424112056-4829fb13d2c6
	golang.org/x/text v0.3.2
	gopkg.in/yaml.v2 v2.2.2
)
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
package main

import (
	"dicuz-crawler/config"
	"dicuz-crawler/engine"
	"dicuz-crawler/model"
	"dicuz-crawler/parser"
	"dicuz-crawler/persist"
)

func main() {
	e := engine.Simple{
		Saver: &persist.FileSaver{},
	}
	//e := engine.Concurrent{
	//	Saver:       &persist.FileSaver{},
	//	WorkerCount: 1,
	//}
	e.Run(model.Request{
		Url:       config.Crawler.Seed.Url,
		ParseFunc: parser.StrToFuncOfParser(config.Crawler.Seed.Parser),
	})
}
--------------------------------------------------------------------------------
/model/item.go:
--------------------------------------------------------------------------------
package model

type Item struct {
	Id      string `json:"id"`
	Url     string `json:"url"`
	Section string `json:"section"`
	Title   string `json:"title"`
	Content string `json:"content"`
}
--------------------------------------------------------------------------------
/model/request.go:
--------------------------------------------------------------------------------
package model

import "github.com/PuerkitoBio/goquery"

type Request struct {
	Url       string
	ParseFunc func(*goquery.Document, Item) ParseResult
	Deliver   Item
}

type ParseResult struct {
	Requests []Request
	Items    []interface{}
}
--------------------------------------------------------------------------------
/parser/article.go:
--------------------------------------------------------------------------------
package parser

import (
	"dicuz-crawler/config"
	"dicuz-crawler/model"
	"github.com/PuerkitoBio/goquery"
	"strings"
)

func ParseArticle(doc *goquery.Document, item model.Item) model.ParseResult {
	parseResult := model.ParseResult{}
	doc.Find(config.Crawler.Selector.Article).Each(func(i int, selection *goquery.Selection) {
		content, _ := selection.Html()
		content = strings.Replace(content, "\n", "", -1)
		//log.Printf("content: %s", content)
		item.Content = content
		parseResult.Items = append(parseResult.Items, item)
	})
	return parseResult
}
--------------------------------------------------------------------------------
/parser/forum.go:
--------------------------------------------------------------------------------
package parser

import (
	"dicuz-crawler/config"
	"dicuz-crawler/model"
	"github.com/PuerkitoBio/goquery"
)

func ParseForum(doc *goquery.Document, _ model.Item) model.ParseResult {
	parseResult := model.ParseResult{}
	doc.Find(config.Crawler.Selector.Section).Each(func(i int, selection *goquery.Selection) {
		url, _ := selection.Attr("href")
		url, _ = RelativeToAbsoluteOfUrl(url)
		content := selection.Text()
		//log.Printf("url: %s, title: %s", url, content)
		parseResult.Items = append(parseResult.Items, content)
		parseResult.Requests = append(parseResult.Requests, model.Request{
			Url:       url,
			ParseFunc: ParseSection,
			Deliver: model.Item{
				Section: content,
			},
		})
	})
	doc.Find(config.Crawler.Selector.SubSection).Each(func(i int, selection *goquery.Selection) {
		url, _ := selection.Attr("href")
		url, _ = RelativeToAbsoluteOfUrl(url)
		content := selection.Text()
		//log.Printf("url: %s, title: %s", url, content)
		parseResult.Items = append(parseResult.Items, content)
		parseResult.Requests = append(parseResult.Requests, model.Request{
			Url:       url,
			ParseFunc: ParseSection,
			Deliver: model.Item{
				Section: content,
			},
		})
	})
	return parseResult
}
--------------------------------------------------------------------------------
/parser/section.go:
--------------------------------------------------------------------------------
package parser

import (
	"dicuz-crawler/config"
	"dicuz-crawler/model"
	"github.com/PuerkitoBio/goquery"
	"regexp"
)

var IdRe0 = regexp.MustCompile(`/thread-([\d]+)-[\d]+-[\d]+\.html`)
var IdRe1 = regexp.MustCompile(`tid=([\d]+)&`)

func ParseSection(doc *goquery.Document, item model.Item) model.ParseResult {
	parseResult := model.ParseResult{}
	doc.Find(config.Crawler.Selector.Title).Each(func(i int, selection *goquery.Selection) {
		content, _ := selection.Html()
		url, _ := selection.Attr("href")
		//log.Printf("url: %s, content: %s", url, content)
		url, _ = RelativeToAbsoluteOfUrl(url)
		item.Title = content
		item.Url = url
		// Extract the thread id, first from static-style urls, then from query-style urls.
		match := IdRe0.FindSubmatch([]byte(url))
		var matchResult string
		if len(match) >= 2 {
			matchResult = string(match[1])
			item.Id = matchResult
		}
		if len(matchResult) == 0 {
			match = IdRe1.FindSubmatch([]byte(url))
			if len(match) >= 2 {
				matchResult = string(match[1])
				item.Id = matchResult
			} else {
				item.Id = "-"
			}
		}
		parseResult.Items = append(parseResult.Items, content)
		parseResult.Requests = append(parseResult.Requests, model.Request{
			Url:       url,
			ParseFunc: ParseArticle,
			Deliver:   item,
		})
	})
	// Follow only the first "next page" link on the page.
	count := 0
	doc.Find(config.Crawler.Selector.NextPage).Each(func(i int, selection *goquery.Selection) {
		if count > 0 {
			return
		}
		url, _ := selection.Attr("href")
		content := selection.Text()
		url, _ = RelativeToAbsoluteOfUrl(url)
		//log.Printf("url: %s, title: %s", url, content)
		parseResult.Items = append(parseResult.Items, content)
		parseResult.Requests = append(parseResult.Requests, model.Request{
			Url:       url,
			ParseFunc: ParseSection,
			Deliver:   item,
		})
		count++
	})
	return parseResult
}
--------------------------------------------------------------------------------
/parser/utils.go:
--------------------------------------------------------------------------------
package parser

import (
	"dicuz-crawler/config"
	"dicuz-crawler/model"
	"errors"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"net/url"
)

var seedUrl string
var SeedUrlParse *url.URL

func init() {
	seedUrl = config.Crawler.Seed.Url
	SeedUrlParse, _ = url.Parse(seedUrl)
}

// RelativeToAbsoluteOfUrl resolves a relative url against the seed url's scheme and host.
func RelativeToAbsoluteOfUrl(relativeUrl string) (absoluteUrl string, err error) {
	u, err := url.Parse(relativeUrl)
	if err != nil {
		return relativeUrl, errors.New("relativeUrl: " + err.Error())
	}
	if u.Scheme == "http" || u.Scheme == "https" {
		// Already an absolute url, return it unchanged.
		return relativeUrl, nil
	}
	return fmt.Sprintf("%s://%s/%s", SeedUrlParse.Scheme, SeedUrlParse.Host, relativeUrl), nil
}

// StrToFuncOfParser maps a parser name from the config to its parse function.
func StrToFuncOfParser(parserStr string) func(*goquery.Document, model.Item) model.ParseResult {
	switch parserStr {
	case "forum":
		return ParseForum
	case "section":
		return ParseSection
	case "article":
		return ParseArticle
	default:
		return ParseForum
	}
}
--------------------------------------------------------------------------------
/persist/file.go:
--------------------------------------------------------------------------------
package persist

import (
	"bytes"
	"dicuz-crawler/model"
	"encoding/json"
	"log"
	"os"
	"strconv"
	"time"
)

type FileSaver struct {
	File *os.File
}

func (f *FileSaver) Init() {
	fileName := "save" + strconv.FormatInt(time.Now().Unix(), 10) + ".txt"
	var err error
	f.File, err = os.OpenFile(fileName, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		log.Fatalf("failed to create (open) file %s: %s", fileName, err)
	}
}

func (f *FileSaver) Save(item model.Item) error {
	buffer := &bytes.Buffer{}
	encoder := json.NewEncoder(buffer)
	encoder.SetEscapeHTML(false) // do not escape HTML in the saved content
	if err := encoder.Encode(item); err != nil {
		return err
	}
	_, err := f.File.Write(buffer.Bytes())
	return err
}

func (f *FileSaver) Close() {
	f.File.Close()
}
--------------------------------------------------------------------------------
/persist/persist.go:
--------------------------------------------------------------------------------
package persist

import "dicuz-crawler/model"

type Saver interface {
	Init()
	Save(item model.Item) error
	Close()
}
--------------------------------------------------------------------------------