├── .gitignore ├── README.md ├── examples ├── custom_output_server.go └── simple_server.go ├── handler.go └── processor.go /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15921483570/wechat_spider/5d3ea189acb44bc7f5f1d0e26e6b22db40b0ce00/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # wechat_spider 2 | 微信公众号爬虫 (只需设置代理, 一键可以爬取指定公众号的所有历史文章) 3 | 4 | - 一个简单的Demo [simple_server.go][1] 5 | 6 | 7 | ``` 8 | package main 9 | 10 | import ( 11 | "log" 12 | "net/http" 13 | 14 | "github.com/sundy-li/wechat_spider" 15 | 16 | "github.com/elazarl/goproxy" 17 | ) 18 | 19 | func main() { 20 | var port = "8899" 21 | proxy := goproxy.NewProxyHttpServer() 22 | //open it see detail logs 23 | // wechat.Verbose = true 24 | proxy.OnResponse().DoFunc( 25 | wechat_spider.ProxyHandle(wechat_spider.NewBaseProcessor()), 26 | ) 27 | log.Println("server will at port:" + port) 28 | log.Fatal(http.ListenAndServe(":"+port, proxy)) 29 | 30 | } 31 | ``` 32 | 33 | 34 | 35 | - 使用方法: 36 | 运行后, 设置手机的代理为 本机ip 8899端口, 打开微信客户端, 点击任一公众号查看历史文章按钮, 即可爬取该公众号的所有历史文章(已经支持自动翻页爬取) 37 | 38 | 39 | - 自定义输出源,实现Processor接口的Output方法即可, [custom_output_server.go][2] 40 | 41 | 42 | [1]: https://github.com/sundy-li/wechat_spider/blob/master/examples/simple_server.go 43 | [2]: https://github.com/sundy-li/wechat_spider/blob/master/examples/custom_output_server.go 44 | 45 | - 微信会屏蔽频繁的请求,所以历史文章的翻页请求调用了Sleep()方法, 默认每个请求休眠50ms,可以根据实际情况自定义Processor覆盖此方法 -------------------------------------------------------------------------------- /examples/custom_output_server.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "net/http" 6 | 7 | "github.com/sundy-li/wechat_spider" 8 | 9 | "github.com/elazarl/goproxy" 10 | ) 11 | 12 | func main() { 13 | var port = "8899" 14 | proxy := goproxy.NewProxyHttpServer() 15 | //open it see detail logs 16 | // wechat.Verbose = true 17 | proxy.OnResponse().DoFunc( 18 | wechat_spider.ProxyHandle(&CustomProcessor{}), 19 | ) 20 | log.Println("server will at port:" + port) 21 | log.Fatal(http.ListenAndServe(":"+port, proxy)) 22 | 23 | } 24 | 25 | //Just to implement Output Method of interface{} Processor 26 | type CustomProcessor struct { 27 | wechat_spider.BaseProcessor 28 | } 29 | 30 | func (c *CustomProcessor) Output() { 31 | //Just print the length of result urls 32 | println("result urls size =>", len(c.Urls())) 33 | c.Urls() 34 | } 35 | -------------------------------------------------------------------------------- /examples/simple_server.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "net/http" 6 | 7 | "github.com/sundy-li/wechat_spider" 8 | 9 | "github.com/elazarl/goproxy" 10 | ) 11 | 12 | func main() { 13 | var port = "8899" 14 | proxy := goproxy.NewProxyHttpServer() 15 | //open it see detail logs 16 | // wechat.Verbose = true 17 | proxy.OnResponse().DoFunc( 18 | wechat_spider.ProxyHandle(wechat_spider.NewBaseProcessor()), 19 | ) 20 | log.Println("server will at port:" + port) 21 | log.Fatal(http.ListenAndServe(":"+port, proxy)) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /handler.go: -------------------------------------------------------------------------------- 1 | package wechat_spider 2 | 3 | import ( 4 | "log" 5 | "net/http" 6 | "os" 7 | "reflect" 8 | "spiderx/utils" 9 | "strings" 10 | 11 | "github.com/elazarl/goproxy" 12 | ) 13 | 14 | var ( 15 | Verbose = false 16 | Logger = log.New(os.Stderr, "", log.LstdFlags) 17 | ) 18 | 19 | func ProxyHandle(proc Processor) func(resp *http.Response, ctx *goproxy.ProxyCtx) *http.Response { 20 | return func(resp *http.Response, ctx *goproxy.ProxyCtx) *http.Response { 21 | if ctx.Req.URL.Path == `/mp/getmasssendmsg` && !strings.Contains(ctx.Req.URL.RawQuery, `f=json`) { 22 | var data []byte 23 | var err error 24 | data, resp.Body, err = utils.CopyReader(resp.Body) 25 | if err != nil { 26 | return resp 27 | } 28 | t := reflect.TypeOf(proc) 29 | v := reflect.New(t.Elem()) 30 | p := v.Interface().(Processor) 31 | go func() { 32 | err = p.Process(ctx.Req, data) 33 | if err != nil { 34 | Logger.Println(err.Error()) 35 | } 36 | p.Output() 37 | }() 38 | } 39 | return resp 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /processor.go: -------------------------------------------------------------------------------- 1 | package wechat_spider 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "errors" 7 | "fmt" 8 | "io/ioutil" 9 | "regexp" 10 | "strings" 11 | "time" 12 | 13 | "net/http" 14 | 15 | "github.com/palantir/stacktrace" 16 | ) 17 | 18 | type Processor interface { 19 | //Core method 20 | Process(req *http.Request, data []byte) error 21 | //Result urls 22 | Urls() []string 23 | //Output 24 | Output() 25 | //Sleep method to avoid the req control of wechat 26 | Sleep() 27 | } 28 | 29 | type BaseProcessor struct { 30 | req *http.Request 31 | lastId string 32 | data []byte 33 | result []string 34 | } 35 | 36 | var ( 37 | replacer = strings.NewReplacer( 38 | "\t", "", " ", "", 39 | """, `"`, " ", "", 40 | `\\`, "", "&amp;", "&", 41 | "&", "&", `\`, "", 42 | ) 43 | 44 | urlRegex = regexp.MustCompile("http://mp.weixin.qq.com/s?[^#]*") 45 | idRegex = regexp.MustCompile(`"id":(\d+)`) 46 | MsgNotFound = errors.New("MsgLists not found") 47 | ) 48 | 49 | func NewBaseProcessor() *BaseProcessor { 50 | return &BaseProcessor{} 51 | } 52 | 53 | func (p *BaseProcessor) init(req *http.Request, data []byte) (err error) { 54 | p.req = req 55 | p.data = data 56 | fmt.Println("Running a new wechat processor, please wait...") 57 | return nil 58 | } 59 | func (p *BaseProcessor) Process(req *http.Request, data []byte) error { 60 | if err := p.init(req, data); err != nil { 61 | return err 62 | } 63 | 64 | if err := p.processMain(); err != nil { 65 | return err 66 | } 67 | if err := p.processPages(); err != nil { 68 | return err 69 | } 70 | return nil 71 | } 72 | 73 | func (p *BaseProcessor) Sleep() { 74 | time.Sleep(50 * time.Millisecond) 75 | } 76 | 77 | func (p *BaseProcessor) Urls() []string { 78 | return p.result 79 | } 80 | 81 | func (p *BaseProcessor) Output() { 82 | bs, _ := json.Marshal(p.Urls()) 83 | fmt.Println("result => ", string(bs)) 84 | } 85 | 86 | //Parse the html 87 | func (p *BaseProcessor) processMain() error { 88 | p.result = make([]string, 0, 100) 89 | buffer := bytes.NewBuffer(p.data) 90 | var msgs string 91 | str, err := buffer.ReadString('\n') 92 | for err == nil { 93 | if strings.Contains(str, "msgList = ") { 94 | msgs = str 95 | break 96 | } 97 | str, err = buffer.ReadString('\n') 98 | } 99 | if msgs == "" { 100 | return stacktrace.Propagate(MsgNotFound, "Failed parse main") 101 | } 102 | msgs = replacer.Replace(msgs) 103 | p.result = urlRegex.FindAllString(msgs, -1) 104 | if len(p.result) < 1 { 105 | return stacktrace.Propagate(MsgNotFound, "Failed find url in main") 106 | } 107 | idMatcher := idRegex.FindAllStringSubmatch(msgs, -1) 108 | if len(idMatcher) < 1 { 109 | return stacktrace.Propagate(MsgNotFound, "Failed find id in main") 110 | } 111 | p.lastId = idMatcher[len(idMatcher)-1][1] 112 | return nil 113 | } 114 | 115 | func (p *BaseProcessor) processPages() (err error) { 116 | var pageUrl = p.genUrl() 117 | p.logf("process pages....") 118 | req, err := http.NewRequest("GET", pageUrl, nil) 119 | if err != nil { 120 | return stacktrace.Propagate(err, "Failed new page request") 121 | } 122 | for k, _ := range p.req.Header { 123 | req.Header.Set(k, p.req.Header.Get(k)) 124 | } 125 | req.Header.Set("Content-Type", "application/json; charset=UTF-8") 126 | resp, err := http.DefaultClient.Do(req) 127 | if err != nil { 128 | return stacktrace.Propagate(err, "Failed get page response") 129 | } 130 | bs, _ := ioutil.ReadAll(resp.Body) 131 | str := replacer.Replace(string(bs)) 132 | result := urlRegex.FindAllString(str, -1) 133 | if len(result) < 1 { 134 | return stacktrace.Propagate(err, "Failed get page url") 135 | } 136 | idMatcher := idRegex.FindAllStringSubmatch(str, -1) 137 | if len(idMatcher) < 1 { 138 | return stacktrace.Propagate(err, "Failed get page id") 139 | } 140 | p.lastId = idMatcher[len(idMatcher)-1][1] 141 | p.logf("Page Get => %d,lastid: %s", len(result), p.lastId) 142 | p.result = append(p.result, result...) 143 | if p.lastId != "" { 144 | p.Sleep() 145 | return p.processPages() 146 | } 147 | return nil 148 | } 149 | 150 | func (P *BaseProcessor) Save() { 151 | 152 | } 153 | 154 | func (p *BaseProcessor) genUrl() string { 155 | url := "http://mp.weixin.qq.com/mp/getmasssendmsg?" + p.req.URL.RawQuery 156 | url += "&frommsgid=" + p.lastId + "&f=json&count=100" 157 | return url 158 | } 159 | 160 | func (P *BaseProcessor) logf(format string, msg ...interface{}) { 161 | if Verbose { 162 | Logger.Printf(format, msg...) 163 | } 164 | } 165 | --------------------------------------------------------------------------------