├── .gitignore
├── README.md
├── examples
    ├── custom_output_server.go
    └── simple_server.go
├── handler.go
└── processor.go


/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/15921483570/wechat_spider/5d3ea189acb44bc7f5f1d0e26e6b22db40b0ce00/.gitignore


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # wechat_spider
 2 | 微信公众号爬虫  (只需设置代理, 一键可以爬取指定公众号的所有历史文章)
 3 | 
 4 | - 一个简单的Demo  [simple_server.go][1]
 5 | 
 6 | 
  7 | ```go
 8 | package main
 9 | 
10 | import (
11 | 	"log"
12 | 	"net/http"
13 | 
14 | 	"github.com/sundy-li/wechat_spider"
15 | 
16 | 	"github.com/elazarl/goproxy"
17 | )
18 | 
19 | func main() {
20 | 	var port = "8899"
21 | 	proxy := goproxy.NewProxyHttpServer()
 22 | 	// Uncomment the next line to see detailed logs.
 23 | 	// wechat_spider.Verbose = true
24 | 	proxy.OnResponse().DoFunc(
25 | 		wechat_spider.ProxyHandle(wechat_spider.NewBaseProcessor()),
26 | 	)
27 | 	log.Println("server will at port:" + port)
28 | 	log.Fatal(http.ListenAndServe(":"+port, proxy))
29 | 
30 | }
31 | ```
32 | 
33 | 
34 | 
35 | - 使用方法:
36 | 运行后, 设置手机的代理为 本机ip 8899端口,  打开微信客户端, 点击任一公众号查看历史文章按钮, 即可爬取该公众号的所有历史文章(已经支持自动翻页爬取)
37 | 
38 | 
39 | - 自定义输出源,实现Processor接口的Output方法即可, [custom_output_server.go][2]
40 | 
41 | 
42 |   [1]: https://github.com/sundy-li/wechat_spider/blob/master/examples/simple_server.go
43 |   [2]: https://github.com/sundy-li/wechat_spider/blob/master/examples/custom_output_server.go
44 | 
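 45 | - 微信会屏蔽频繁的请求, 所以历史文章的翻页请求之间会调用 Sleep() 方法, 默认每个请求休眠 50ms。注意: 翻页逻辑在 BaseProcessor 内部直接调用它自身的 Sleep(), 因此仅在内嵌 BaseProcessor 的自定义类型上重写 Sleep() 并不会生效; 如需调整休眠时间, 需要自定义 Processor 并自行实现 Process 方法 (包含翻页逻辑)。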


--------------------------------------------------------------------------------
/examples/custom_output_server.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"net/http"
 6 | 
 7 | 	"github.com/sundy-li/wechat_spider"
 8 | 
 9 | 	"github.com/elazarl/goproxy"
10 | )
11 | 
12 | func main() {
13 | 	var port = "8899"
14 | 	proxy := goproxy.NewProxyHttpServer()
15 | 	//open it see detail logs
16 | 	// wechat.Verbose = true
17 | 	proxy.OnResponse().DoFunc(
18 | 		wechat_spider.ProxyHandle(&CustomProcessor{}),
19 | 	)
20 | 	log.Println("server will at port:" + port)
21 | 	log.Fatal(http.ListenAndServe(":"+port, proxy))
22 | 
23 | }
24 | 
25 | //Just to implement Output Method of interface{} Processor
26 | type CustomProcessor struct {
27 | 	wechat_spider.BaseProcessor
28 | }
29 | 
30 | func (c *CustomProcessor) Output() {
31 | 	//Just print the length of result urls
32 | 	println("result urls size =>", len(c.Urls()))
33 | 	c.Urls()
34 | }
35 | 


--------------------------------------------------------------------------------
/examples/simple_server.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"net/http"
 6 | 
 7 | 	"github.com/sundy-li/wechat_spider"
 8 | 
 9 | 	"github.com/elazarl/goproxy"
10 | )
11 | 
12 | func main() {
13 | 	var port = "8899"
14 | 	proxy := goproxy.NewProxyHttpServer()
15 | 	//open it see detail logs
16 | 	// wechat.Verbose = true
17 | 	proxy.OnResponse().DoFunc(
18 | 		wechat_spider.ProxyHandle(wechat_spider.NewBaseProcessor()),
19 | 	)
20 | 	log.Println("server will at port:" + port)
21 | 	log.Fatal(http.ListenAndServe(":"+port, proxy))
22 | 
23 | }
24 | 


--------------------------------------------------------------------------------
/handler.go:
--------------------------------------------------------------------------------
 1 | package wechat_spider
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"net/http"
 6 | 	"os"
 7 | 	"reflect"
 8 | 	"spiderx/utils"
 9 | 	"strings"
10 | 
11 | 	"github.com/elazarl/goproxy"
12 | )
13 | 
// Verbose turns on the package's progress logging when set to true.
// It is off by default.
var Verbose bool

// Logger receives the package's log output. It writes to standard error with
// the standard date/time prefix and may be replaced by callers.
var Logger = log.New(os.Stderr, "", log.LstdFlags)
18 | 
19 | func ProxyHandle(proc Processor) func(resp *http.Response, ctx *goproxy.ProxyCtx) *http.Response {
20 | 	return func(resp *http.Response, ctx *goproxy.ProxyCtx) *http.Response {
21 | 		if ctx.Req.URL.Path == `/mp/getmasssendmsg` && !strings.Contains(ctx.Req.URL.RawQuery, `f=json`) {
22 | 			var data []byte
23 | 			var err error
24 | 			data, resp.Body, err = utils.CopyReader(resp.Body)
25 | 			if err != nil {
26 | 				return resp
27 | 			}
28 | 			t := reflect.TypeOf(proc)
29 | 			v := reflect.New(t.Elem())
30 | 			p := v.Interface().(Processor)
31 | 			go func() {
32 | 				err = p.Process(ctx.Req, data)
33 | 				if err != nil {
34 | 					Logger.Println(err.Error())
35 | 				}
36 | 				p.Output()
37 | 			}()
38 | 		}
39 | 		return resp
40 | 	}
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/processor.go:
--------------------------------------------------------------------------------
  1 | package wechat_spider
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"encoding/json"
  6 | 	"errors"
  7 | 	"fmt"
  8 | 	"io/ioutil"
  9 | 	"regexp"
 10 | 	"strings"
 11 | 	"time"
 12 | 
 13 | 	"net/http"
 14 | 
 15 | 	"github.com/palantir/stacktrace"
 16 | )
 17 | 
 18 | type Processor interface {
 19 | 	//Core method
 20 | 	Process(req *http.Request, data []byte) error
 21 | 	//Result urls
 22 | 	Urls() []string
 23 | 	//Output
 24 | 	Output()
 25 | 	//Sleep method to avoid the req control of wechat
 26 | 	Sleep()
 27 | }
 28 | 
 29 | type BaseProcessor struct {
 30 | 	req    *http.Request
 31 | 	lastId string
 32 | 	data   []byte
 33 | 	result []string
 34 | }
 35 | 
 36 | var (
 37 | 	replacer = strings.NewReplacer(
 38 | 		"\t", "", " ", "",
 39 | 		"&quot;", `"`, "&nbsp;", "",
 40 | 		`\\`, "", "&amp;amp;", "&",
 41 | 		"&amp;", "&", `\`, "",
 42 | 	)
 43 | 
 44 | 	urlRegex    = regexp.MustCompile("http://mp.weixin.qq.com/s?[^#]*")
 45 | 	idRegex     = regexp.MustCompile(`"id":(\d+)`)
 46 | 	MsgNotFound = errors.New("MsgLists not found")
 47 | )
 48 | 
 49 | func NewBaseProcessor() *BaseProcessor {
 50 | 	return &BaseProcessor{}
 51 | }
 52 | 
 53 | func (p *BaseProcessor) init(req *http.Request, data []byte) (err error) {
 54 | 	p.req = req
 55 | 	p.data = data
 56 | 	fmt.Println("Running a new wechat processor, please wait...")
 57 | 	return nil
 58 | }
 59 | func (p *BaseProcessor) Process(req *http.Request, data []byte) error {
 60 | 	if err := p.init(req, data); err != nil {
 61 | 		return err
 62 | 	}
 63 | 
 64 | 	if err := p.processMain(); err != nil {
 65 | 		return err
 66 | 	}
 67 | 	if err := p.processPages(); err != nil {
 68 | 		return err
 69 | 	}
 70 | 	return nil
 71 | }
 72 | 
 73 | func (p *BaseProcessor) Sleep() {
 74 | 	time.Sleep(50 * time.Millisecond)
 75 | }
 76 | 
 77 | func (p *BaseProcessor) Urls() []string {
 78 | 	return p.result
 79 | }
 80 | 
 81 | func (p *BaseProcessor) Output() {
 82 | 	bs, _ := json.Marshal(p.Urls())
 83 | 	fmt.Println("result => ", string(bs))
 84 | }
 85 | 
 86 | //Parse the html
 87 | func (p *BaseProcessor) processMain() error {
 88 | 	p.result = make([]string, 0, 100)
 89 | 	buffer := bytes.NewBuffer(p.data)
 90 | 	var msgs string
 91 | 	str, err := buffer.ReadString('\n')
 92 | 	for err == nil {
 93 | 		if strings.Contains(str, "msgList = ") {
 94 | 			msgs = str
 95 | 			break
 96 | 		}
 97 | 		str, err = buffer.ReadString('\n')
 98 | 	}
 99 | 	if msgs == "" {
100 | 		return stacktrace.Propagate(MsgNotFound, "Failed parse main")
101 | 	}
102 | 	msgs = replacer.Replace(msgs)
103 | 	p.result = urlRegex.FindAllString(msgs, -1)
104 | 	if len(p.result) < 1 {
105 | 		return stacktrace.Propagate(MsgNotFound, "Failed find url in  main")
106 | 	}
107 | 	idMatcher := idRegex.FindAllStringSubmatch(msgs, -1)
108 | 	if len(idMatcher) < 1 {
109 | 		return stacktrace.Propagate(MsgNotFound, "Failed find id in  main")
110 | 	}
111 | 	p.lastId = idMatcher[len(idMatcher)-1][1]
112 | 	return nil
113 | }
114 | 
115 | func (p *BaseProcessor) processPages() (err error) {
116 | 	var pageUrl = p.genUrl()
117 | 	p.logf("process pages....")
118 | 	req, err := http.NewRequest("GET", pageUrl, nil)
119 | 	if err != nil {
120 | 		return stacktrace.Propagate(err, "Failed new page request")
121 | 	}
122 | 	for k, _ := range p.req.Header {
123 | 		req.Header.Set(k, p.req.Header.Get(k))
124 | 	}
125 | 	req.Header.Set("Content-Type", "application/json; charset=UTF-8")
126 | 	resp, err := http.DefaultClient.Do(req)
127 | 	if err != nil {
128 | 		return stacktrace.Propagate(err, "Failed get page response")
129 | 	}
130 | 	bs, _ := ioutil.ReadAll(resp.Body)
131 | 	str := replacer.Replace(string(bs))
132 | 	result := urlRegex.FindAllString(str, -1)
133 | 	if len(result) < 1 {
134 | 		return stacktrace.Propagate(err, "Failed get page url")
135 | 	}
136 | 	idMatcher := idRegex.FindAllStringSubmatch(str, -1)
137 | 	if len(idMatcher) < 1 {
138 | 		return stacktrace.Propagate(err, "Failed get page id")
139 | 	}
140 | 	p.lastId = idMatcher[len(idMatcher)-1][1]
141 | 	p.logf("Page Get => %d,lastid: %s", len(result), p.lastId)
142 | 	p.result = append(p.result, result...)
143 | 	if p.lastId != "" {
144 | 		p.Sleep()
145 | 		return p.processPages()
146 | 	}
147 | 	return nil
148 | }
149 | 
150 | func (P *BaseProcessor) Save() {
151 | 
152 | }
153 | 
154 | func (p *BaseProcessor) genUrl() string {
155 | 	url := "http://mp.weixin.qq.com/mp/getmasssendmsg?" + p.req.URL.RawQuery
156 | 	url += "&frommsgid=" + p.lastId + "&f=json&count=100"
157 | 	return url
158 | }
159 | 
160 | func (P *BaseProcessor) logf(format string, msg ...interface{}) {
161 | 	if Verbose {
162 | 		Logger.Printf(format, msg...)
163 | 	}
164 | }
165 | 


--------------------------------------------------------------------------------