├── README.md ├── chromedriver.log ├── lib └── chromedriver ├── spider.go └── utils ├── config.go ├── download.go ├── page.go ├── pipeline.go └── result.go /README.md: -------------------------------------------------------------------------------- 1 | # wechat_spider 2 | `使用selenium模拟浏览器采集http://weixin.sogou.com/` 3 | `内的公众号、文章列表。文章内容暂未采集,稍作改动便可试用。` 4 | -------------------------------------------------------------------------------- /chromedriver.log: -------------------------------------------------------------------------------- 1 | [0.002][SEVERE]: Could not bind socket to 0.0.0.0:9515 2 | [7418.809][INFO]: RESPONSE InitSession { 3 | "acceptSslCerts": true, 4 | "applicationCacheEnabled": false, 5 | "browserConnectionEnabled": false, 6 | "browserName": "chrome", 7 | "chrome": { 8 | "userDataDir": "/var/folders/0s/jpqx0n111ggdysbck6965ytm0000gn/T/.org.chromium.Chromium.WNVFd8" 9 | }, 10 | "cssSelectorsEnabled": true, 11 | "databaseEnabled": false, 12 | "handlesAlerts": true, 13 | "javascriptEnabled": true, 14 | "locationContextEnabled": true, 15 | "nativeEvents": true, 16 | "platform": "Mac OS X", 17 | "rotatable": false, 18 | "takesHeapSnapshot": true, 19 | "takesScreenshot": true, 20 | "version": "50.0.2661.102", 21 | "webStorageEnabled": true 22 | } 23 | [7418.810][INFO]: COMMAND Navigate { 24 | "url": "http://weixin.sogou.com/weixin?query=小企业E家&type=1&ie=utf8" 25 | } 26 | [7418.843][INFO]: Waiting for pending navigations... 27 | [7418.859][INFO]: Done waiting for pending navigations 28 | [7419.305][INFO]: Waiting for pending navigations... 29 | [7419.485][INFO]: Done waiting for pending navigations 30 | [7419.485][INFO]: RESPONSE Navigate 31 | [7419.486][INFO]: COMMAND FindElement { 32 | "using": "xpath", 33 | "value": "//*[@id='main']/div/div[2]/div" 34 | } 35 | [7419.486][INFO]: Waiting for pending navigations... 36 | [7419.486][INFO]: Done waiting for pending navigations 37 | [7419.499][INFO]: Waiting for pending navigations... 38 | [7419.499][INFO]: Done waiting for pending navigations 39 | [7419.499][INFO]: RESPONSE FindElement no such element 40 | (Session info: chrome=50.0.2661.102) 41 | -------------------------------------------------------------------------------- /lib/chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiangzhengqiao/wechat_spider/8345ada0ef4c89cd7aae49824490c2398633c7cc/lib/chromedriver -------------------------------------------------------------------------------- /spider.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "github.com/fedesog/webdriver" 6 | "log" 7 | "weixin.com/utils" 8 | ) 9 | 10 | func main() { 11 | platName := "小企业E家" 12 | chromeDriver := webdriver.NewChromeDriver(utils.CHROME_DRIVER_PATH) 13 | err := chromeDriver.Start() 14 | if err != nil { 15 | log.Println(err) 16 | } 17 | 18 | url := "http://weixin.sogou.com/weixin?query=%s&type=1&ie=utf8" 19 | 20 | session, err := chromeDriver.NewSession(webdriver.Capabilities{}, webdriver.Capabilities{}) 21 | if err != nil { 22 | log.Println(err) 23 | } 24 | 25 | url = fmt.Sprintf(url, platName) 26 | err = session.Url(url) 27 | 28 | if err != nil { 29 | log.Println(err) 30 | } 31 | 32 | // pageNum := utils.GetPageNum(url, platName, chromeDriver) 33 | 34 | // if pageNum > 1 { 35 | // for i := 1; i <= pageNum; i++ { 36 | results := utils.DownloadTest(url, session) 37 | utils.Pipeline(results) 38 | // } 39 | // } else { 40 | // results := utils.Download(url, platName, 1, chromeDriver) 41 | // utils.Pipeline(results) 42 | // } 43 | 44 | session.Delete() 45 | 46 | // fmt.Println(results) 47 | } 48 | -------------------------------------------------------------------------------- /utils/config.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | const ( 4 | CHROME_DRIVER_PATH string = "/Users/jiangzhengqiao/go/src/weixin.com/lib/chromedriver" 5 | ) 6 | -------------------------------------------------------------------------------- /utils/download.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "fmt" 5 | "github.com/fedesog/webdriver" 6 | "log" 7 | ) 8 | 9 | func Download(url, platName string, page int, chromeDriver *webdriver.ChromeDriver) []WeChatList { 10 | log.Printf("开始采集第%v页,URL【%s】.", page, fmt.Sprintf(url, platName, page)) 11 | 12 | results := make([]WeChatList, 0) 13 | session, err := chromeDriver.NewSession(webdriver.Capabilities{}, webdriver.Capabilities{}) 14 | if err != nil { 15 | log.Println(err) 16 | } 17 | 18 | err = session.Url(fmt.Sprintf(url, platName, page)) 19 | if err != nil { 20 | log.Println(err) 21 | } 22 | 23 | html, _ := session.FindElement(webdriver.XPath, "//*[@id='main']/div/div[2]/div") 24 | 25 | divs, _ := html.FindElements(webdriver.ClassName, "wx-rb") 26 | if len(divs) > 0 { 27 | for _, div := range divs { 28 | wechatUrl, _ := div.GetAttribute("href") 29 | 30 | val, _ := div.FindElement(webdriver.ClassName, "txt-box") 31 | 32 | h3, _ := val.FindElement(webdriver.TagName, "h3") 33 | 34 | wechatName, _ := h3.Text() 35 | 36 | h4, _ := val.FindElement(webdriver.TagName, "h4") 37 | h4, _ = h4.FindElement(webdriver.Name, "em_weixinhao") 38 | wechatSignal, _ := h4.Text() 39 | 40 | sps, _ := val.FindElements(webdriver.ClassName, "s-p3") 41 | var wechatIntroduced, wechatTit string 42 | if len(sps) > 1 { 43 | t, _ := sps[1].FindElement(webdriver.ClassName, "sp-tit") 44 | th, _ := t.Text() 45 | if th == "功能介绍:" { 46 | sptxt, _ := sps[0].FindElement(webdriver.ClassName, "sp-txt") 47 | wechatIntroduced, _ = sptxt.Text() 48 | } 49 | 50 | sptxt1, _ := sps[1].FindElement(webdriver.ClassName, "sp-txt") 51 | wechatTit, _ = sptxt1.Text() 52 | } 53 | 54 | ico, _ := div.FindElement(webdriver.ClassName, "pos-ico") 55 | box, _ := ico.FindElement(webdriver.ClassName, "pos-box") 56 | img, _ := box.FindElement(webdriver.TagName, "img") 57 | wechatCode, _ := img.GetAttribute("src") 58 | 59 | result := NewWeChatList() 60 | result.wechatUrl = wechatUrl 61 | result.wechatName = wechatName 62 | result.wechatSignal = wechatSignal 63 | result.wechatIntroduced = wechatIntroduced 64 | result.wechatTit = wechatTit 65 | result.wechatCode = wechatCode 66 | result.articleLists = downloadArticleList(wechatUrl, chromeDriver) 67 | results = append(results, *result) 68 | } 69 | } 70 | 71 | session.Delete() 72 | return results 73 | } 74 | 75 | func downloadArticleList(url string, chromeDriver *webdriver.ChromeDriver) []ArticleList { 76 | log.Printf("开始采集文章列表页,URL【%s】.", url) 77 | articleLists := make([]ArticleList, 0) 78 | session, err := chromeDriver.NewSession(webdriver.Capabilities{}, webdriver.Capabilities{}) 79 | if err != nil { 80 | log.Println(err) 81 | } 82 | 83 | err = session.Url(url) 84 | if err != nil { 85 | log.Println(err) 86 | } 87 | 88 | div, _ := session.FindElement(webdriver.ClassName, "weui_msg_card_list") 89 | infos, _ := div.FindElements(webdriver.ClassName, "weui_msg_card") 90 | if len(infos) > 0 { 91 | for _, info := range infos { 92 | // date, _ := info.FindElement(webdriver.ClassName, "weui_msg_card_hd") 93 | titles, _ := info.FindElements(webdriver.ClassName, "weui_media_title") 94 | descs, _ := info.FindElements(webdriver.ClassName, "weui_media_desc") 95 | infos, _ := info.FindElements(webdriver.ClassName, "weui_media_extra_info") 96 | urls, _ := info.FindElements(webdriver.TagName, "h4") 97 | if len(titles) > 0 { 98 | for i := 0; i < len(titles); i++ { 99 | articleList := NewArticleList() 100 | t, _ := titles[i].Text() 101 | d, _ := descs[i].Text() 102 | in, _ := infos[i].Text() 103 | u, _ := urls[i].GetAttribute("hrefs") 104 | url := "http://mp.weixin.qq.com" + u 105 | articleList.title = t 106 | articleList.desc = d 107 | articleList.date = in 108 | articleList.url = url 109 | articleList.articleContent = downloadArticleContent(url, chromeDriver) 110 | articleLists = append(articleLists, *articleList) 111 | } 112 | } 113 | } 114 | } 115 | session.Delete() 116 | return articleLists 117 | } 118 | 119 | func downloadArticleContent(url string, chromeDriver *webdriver.ChromeDriver) ArticleContent { 120 | session, err := chromeDriver.NewSession(webdriver.Capabilities{}, webdriver.Capabilities{}) 121 | if err != nil { 122 | log.Println(err) 123 | } 124 | 125 | err = session.Url(url) 126 | if err != nil { 127 | log.Println(err) 128 | } 129 | 130 | title, _ := session.FindElement(webdriver.ID, "activity-name") 131 | t, _ := title.Text() 132 | 133 | postDate, _ := session.FindElement(webdriver.ID, "post-date") 134 | date, _ := postDate.Text() 135 | 136 | imgContent, _ := session.FindElement(webdriver.ID, "img-content") 137 | content, _ := imgContent.Text() 138 | 139 | articleContent := NewArticleContent() 140 | articleContent.articleTitle = t 141 | articleContent.articleDate = date 142 | articleContent.articleContent = content 143 | articleContent.articleUrl = url 144 | 145 | session.Delete() 146 | return *articleContent 147 | } 148 | 149 | func DownloadTest(url string, session *webdriver.Session) []WeChatList { 150 | log.Printf("开始采集URL【%s】.", url) 151 | 152 | results := make([]WeChatList, 0) 153 | // session, err := chromeDriver.NewSession(webdriver.Capabilities{}, webdriver.Capabilities{}) 154 | // if err != nil { 155 | // log.Println(err) 156 | // } 157 | 158 | // err = session.Url(fmt.Sprintf(url, platName, page)) 159 | // if err != nil { 160 | // log.Println(err) 161 | // } 162 | 163 | html, _ := session.FindElement(webdriver.XPath, "//*[@id='main']/div/div[2]/div") 164 | 165 | divs, _ := html.FindElements(webdriver.ClassName, "wx-rb") 166 | if len(divs) > 0 { 167 | for _, div := range divs { 168 | wechatUrl, _ := div.GetAttribute("href") 169 | val, _ := div.FindElement(webdriver.ClassName, "txt-box") 170 | h3, _ := val.FindElement(webdriver.TagName, "h3") 171 | wechatName, _ := h3.Text() 172 | h4, _ := val.FindElement(webdriver.TagName, "h4") 173 | h4, _ = h4.FindElement(webdriver.Name, "em_weixinhao") 174 | wechatSignal, _ := h4.Text() 175 | 176 | sps, _ := val.FindElements(webdriver.ClassName, "s-p3") 177 | var wechatIntroduced, wechatTit string 178 | if len(sps) > 1 { 179 | t, _ := sps[1].FindElement(webdriver.ClassName, "sp-tit") 180 | th, _ := t.Text() 181 | if th == "功能介绍:" { 182 | sptxt, _ := sps[0].FindElement(webdriver.ClassName, "sp-txt") 183 | wechatIntroduced, _ = sptxt.Text() 184 | } 185 | 186 | sptxt1, _ := sps[1].FindElement(webdriver.ClassName, "sp-txt") 187 | wechatTit, _ = sptxt1.Text() 188 | } 189 | 190 | ico, _ := div.FindElement(webdriver.ClassName, "pos-ico") 191 | box, _ := ico.FindElement(webdriver.ClassName, "pos-box") 192 | img, _ := box.FindElement(webdriver.TagName, "img") 193 | wechatCode, _ := img.GetAttribute("src") 194 | div.Click() 195 | 196 | // 采集文章列表 197 | articleLists := make([]ArticleList, 0) 198 | // wh, _ := session.WindowHandle() 199 | // whs, _ := session.WindowHandles() 200 | // for _, v := range whs { 201 | // if wh.ID != v.ID { 202 | // session.FocusOnWindow(v.ID) 203 | // div, _ := session.FindElement(webdriver.ClassName, "weui_msg_card_list") 204 | // cards, _ := div.FindElements(webdriver.ClassName, "weui_msg_card") 205 | // if len(cards) > 0 { 206 | // for _, info := range cards { 207 | // // date, _ := info.FindElement(webdriver.ClassName, "weui_msg_card_hd") 208 | // titles, _ := info.FindElements(webdriver.ClassName, "weui_media_title") 209 | // descs, _ := info.FindElements(webdriver.ClassName, "weui_media_desc") 210 | // infos, _ := info.FindElements(webdriver.ClassName, "weui_media_extra_info") 211 | // urls, _ := info.FindElements(webdriver.TagName, "h4") 212 | // if len(titles) > 0 { 213 | // for i := 0; i < len(titles); i++ { 214 | // articleList := NewArticleList() 215 | // t, _ := titles[i].Text() 216 | // d, _ := descs[i].Text() 217 | // in, _ := infos[i].Text() 218 | // u, _ := urls[i].GetAttribute("hrefs") 219 | // url := "http://mp.weixin.qq.com" + u 220 | // titles[i].Click() 221 | // // 采集文章 222 | // wh, _ := session.WindowHandle() 223 | // whs, _ := session.WindowHandles() 224 | // articleContent := NewArticleContent() 225 | // // log.Println("窗口个数:\t", len(whs)) 226 | // for _, v := range whs { 227 | // if wh.ID != v.ID { 228 | // session.FocusOnWindow(v.ID) 229 | // title, err := session.FindElement(webdriver.ID, "activity-name") 230 | // if err == nil { 231 | // t, _ := title.Text() 232 | 233 | // postDate, _ := session.FindElement(webdriver.ID, "post-date") 234 | // date, _ := postDate.Text() 235 | 236 | // imgContent, _ := session.FindElement(webdriver.ID, "img-content") 237 | // content, _ := imgContent.Text() 238 | 239 | // articleContent.articleTitle = t 240 | // articleContent.articleDate = date 241 | // articleContent.articleContent = content 242 | // articleContent.articleUrl = url 243 | // } 244 | // // session.CloseCurrentWindow() 245 | // session.Back() 246 | // session.FocusOnWindow(wh.ID) 247 | // } 248 | // } 249 | 250 | // articleList.title = t 251 | // articleList.desc = d 252 | // articleList.date = in 253 | // articleList.url = url 254 | // articleList.articleContent = *articleContent 255 | // articleLists = append(articleLists, *articleList) 256 | // } 257 | // } 258 | // } 259 | // } 260 | // session.CloseCurrentWindow() 261 | // session.FocusOnWindow(wh.ID) 262 | // } 263 | // } 264 | 265 | result := NewWeChatList() 266 | result.wechatUrl = wechatUrl 267 | result.wechatName = wechatName 268 | result.wechatSignal = wechatSignal 269 | result.wechatIntroduced = wechatIntroduced 270 | result.wechatTit = wechatTit 271 | result.wechatCode = wechatCode 272 | result.articleLists = articleLists 273 | results = append(results, *result) 274 | } 275 | } 276 | return results 277 | } 278 | -------------------------------------------------------------------------------- /utils/page.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "fmt" 5 | "github.com/fedesog/webdriver" 6 | "log" 7 | "strconv" 8 | ) 9 | 10 | func GetPageNum(url, platName string, chromeDriver *webdriver.ChromeDriver) int { 11 | url = fmt.Sprintf(url, platName, 1) 12 | 13 | session, err := chromeDriver.NewSession(webdriver.Capabilities{}, webdriver.Capabilities{}) 14 | if err != nil { 15 | log.Println(err) 16 | } 17 | 18 | err = session.Url(url) 19 | 20 | if err != nil { 21 | log.Println(err) 22 | } 23 | 24 | var page int = 1 25 | p, err := session.FindElement(webdriver.ID, "pagebar_container") 26 | if err != nil { 27 | return page 28 | } 29 | 30 | mun, _ := p.FindElement(webdriver.ClassName, "mun") 31 | scdnum, _ := mun.FindElement(webdriver.ID, "scd_num") 32 | pageText, _ := scdnum.Text() 33 | pageCount, _ := strconv.Atoi(pageText) 34 | 35 | if pageCount > 10 { 36 | page = pageCount/10 + 1 37 | } 38 | session.Delete() 39 | return page 40 | } 41 | -------------------------------------------------------------------------------- /utils/pipeline.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "log" 5 | ) 6 | 7 | func Pipeline(results []WeChatList) { 8 | if len(results) > 0 { 9 | var i int = 1 10 | for _, result := range results { 11 | log.Println("1,-----------------------------------------") 12 | log.Println("微信地址:", result.wechatUrl) 13 | log.Println("微信名:", result.wechatName) 14 | log.Println("微信号:", result.wechatSignal) 15 | log.Println("微信认证:", result.wechatTit) 16 | log.Println("功能介绍:", result.wechatIntroduced) 17 | log.Println("二维码地址:", result.wechatCode) 18 | 19 | if len(result.articleLists) > 0 { 20 | for _, articleList := range result.articleLists { 21 | log.Println("2,-----------------------------------------") 22 | log.Println("标题:", articleList.title) 23 | log.Println("摘要:", articleList.desc) 24 | log.Println("时间:", articleList.date) 25 | log.Println("链接地址:", articleList.url) 26 | 27 | // if len(articleList.articleContents) > 0 { 28 | // for _, articleContent := range articleList.articleContents { 29 | log.Println("3,-----------------------------------------") 30 | log.Println("标题:", articleList.articleContent.articleTitle) 31 | log.Println("内容:", articleList.articleContent.articleContent) 32 | log.Println("时间:", articleList.articleContent.articleDate) 33 | log.Println("地址:", articleList.articleContent.articleUrl) 34 | i++ 35 | // } 36 | // } 37 | } 38 | } 39 | log.Printf("微信号%s,有文章%v篇。", result.wechatName, i) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /utils/result.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // 公众号列表 4 | type WeChatList struct { 5 | wechatUrl string // 微信地址 6 | wechatName string // 微信名 7 | wechatSignal string // 微信号 8 | wechatIntroduced string // 功能介绍 9 | wechatTit string // 微信认证 10 | wechatCode string // 二维码 11 | articleLists []ArticleList // 文章列表 12 | } 13 | 14 | // 文章列表 15 | type ArticleList struct { 16 | title string // 标题 17 | desc string // 摘要 18 | date string // 时间 19 | url string // 地址 20 | articleContent ArticleContent // 文章 21 | } 22 | 23 | // 文章 24 | type ArticleContent struct { 25 | articleTitle string // 标题 26 | articleDate string // 时间 27 | articleUrl string // 地址 28 | articleContent string // 内容 29 | articleHtml string // 原始内容 30 | } 31 | 32 | func NewWeChatList() *WeChatList { 33 | return &WeChatList{} 34 | } 35 | 36 | func NewArticleList() *ArticleList { 37 | return &ArticleList{} 38 | } 39 | 40 | func NewArticleContent() *ArticleContent { 41 | return &ArticleContent{} 42 | } 43 | 44 | // func (this *WeChatList) GetArticleList() []ArticleList { 45 | // // 根据URL采集文章列表 this.wechatUrl 46 | // } 47 | --------------------------------------------------------------------------------