├── .gitignore
├── IJGUC
│   └── IJGUC.go
├── README.md
├── alibaba
│   └── alibaba.go
├── area_codes
│   └── area_codes.go
├── baidunews
│   └── baidunews.go
├── baidusearch
│   └── baidusearch.go
├── car_home
│   └── car_home.go
├── chinanews
│   ├── chinanews.go
│   └── readme.md
├── fang_resell_list
│   ├── fang_resell_list.go
│   └── readme.md
├── filetest
│   └── filetest.go
├── ganji_gongsi
│   └── ganji_gongsi.go
├── googlesearch
│   └── googlesearch.go
├── hollandandbarrett
│   └── hollandandbarrett.go
├── jdsearch
│   └── jdsearch.go
├── jiban
│   └── jiban.go
├── jingdong
│   ├── README.md
│   └── jdSpider.go
├── kaola
│   └── kaola.go
├── lewa
│   └── lewa.go
├── miyabaobei
│   └── miyabaobei.go
├── people
│   └── people.go
├── pholcus_lib.go
├── qq_avatar
│   ├── README.md
│   └── avatar.go
├── shunfenghaitao
│   └── shunfenghaitao.go
├── taobao
│   └── taobao.go
├── taobaosearch
│   └── taobaosearch.go
├── wangyi
│   └── wangyi.go
├── weibo_fans
│   └── weibo_fans.go
├── wukongwenda
│   ├── README.md
│   └── wukongwenda.go
├── zhihu_bianji
│   ├── README.md
│   └── zhihu_bianji.go
├── zhihu_daily
│   ├── README.md
│   └── zhihu_daily.go
├── zolpc
│   └── zolpc.go
├── zolphone
│   └── zolphone.go
└── zolslab
    └── zolslab.go

/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.a
3 | *.so
4 | _obj
5 | _test
6 | *.[568vq]
7 | [568vq].out
8 | *.cgo1.go
9 | *.cgo2.c
10 | _cgo_defun.c
11 | _cgo_gotypes.go
12 | _cgo_export.*
13 | _testmain.go
14 | *.exe
15 | *.exe~
16 | *.test
17 | *.prof
18 | *.rar
19 | *.zip
20 | *.gz
21 | *.psd
22 | *.bmd
23 | *.cfg
24 | *.pptx
25 | *.log
26 | *nohup.out
27 | *.sublime-project
28 | *.sublime-workspace
--------------------------------------------------------------------------------
/IJGUC/IJGUC.go:
--------------------------------------------------------------------------------
1 | package pholcus_lib
2 | 
3 | // 基础包
4 | import (
5 | // "log"
6 | 
7 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
9 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
10 | // "github.com/henrylee2cn/pholcus/logs" //信息输出
11 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 12 | 13 | // net包 14 | // "net/http" //设置http.Header 15 | // "net/url" 16 | 17 | // 编码包 18 | // "encoding/xml" 19 | // "encoding/json" 20 | 21 | // 字符串处理包 22 | "regexp" 23 | "strconv" 24 | // "strings" 25 | // 其他包 26 | // "fmt" 27 | // "math" 28 | // "time" 29 | ) 30 | 31 | func init() { 32 | IJGUC.Register() 33 | } 34 | 35 | var IJGUC = &Spider{ 36 | Name: "IJGUC期刊", 37 | Description: "IJGUC期刊", 38 | // Pausetime: 300, 39 | // Keyin: KEYIN, 40 | // Limit: LIMIT, 41 | EnableCookie: false, 42 | RuleTree: &RuleTree{ 43 | Root: func(ctx *Context) { 44 | ctx.AddQueue(&request.Request{ 45 | Url: "http://www.inderscience.com/info/inarticletoc.php?jcode=ijguc&year=2016&vol=7&issue=1", 46 | Rule: "期刊列表", 47 | }) 48 | }, 49 | 50 | Trunk: map[string]*Rule{ 51 | "期刊列表": { 52 | ParseFunc: func(ctx *Context) { 53 | query := ctx.GetDom() 54 | for i := 1; i <= 7; i++ { 55 | id := "#eventbody" + strconv.Itoa(i) + " a" 56 | query.Find(id).Each(func(j int, s *goquery.Selection) { 57 | if url, ok := s.Attr("href"); ok { 58 | // log.Print(url) 59 | ctx.AddQueue(&request.Request{Url: url, Rule: "文章列表"}) 60 | } 61 | }) 62 | } 63 | }, 64 | }, 65 | "文章列表": { 66 | ParseFunc: func(ctx *Context) { 67 | query := ctx.GetDom() 68 | //#journalcol1 article table tbody tr td:eq(1) table:eq(1) a 69 | query.Find("#journalcol1 article table tbody tr td").Each(func(i int, td *goquery.Selection) { 70 | if i == 1 { 71 | td.Find("table").Each(func(j int, table *goquery.Selection) { 72 | if j == 1 { 73 | table.Find("a").Each(func(k int, a *goquery.Selection) { 74 | if k%2 == 0 { 75 | if url, ok := a.Attr("href"); ok { 76 | // log.Print(url) 77 | ctx.AddQueue(&request.Request{Url: url, Rule: "文章页"}) 78 | } 79 | } 80 | }) 81 | } 82 | }) 83 | } 84 | }) 85 | }, 86 | }, 87 | "文章页": { 88 | //注意:有无字段语义和是否输出数据必须保持一致 89 | ItemFields: []string{ 90 | "Title", 91 | "Author", 92 | "Addresses", 93 | "Journal", 94 | "Abstract", 95 | "Keywords", 96 | "DOI", 97 | }, 98 | ParseFunc: func(ctx *Context) { 99 | query := ctx.GetDom() 100 | // 获取内容 101 | content := query.Find("#col1").Text() 102 | 103 | // 过滤标签 104 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 105 | content = re.ReplaceAllString(content, "") 106 | 107 | // Title 108 | re, _ = regexp.Compile("Title:(.*?)Author:") 109 | title := re.FindStringSubmatch(content)[1] 110 | // Author 111 | re, _ = regexp.Compile("Author:(.*?)Addresses:") 112 | au := re.FindStringSubmatch(content) 113 | var author string 114 | if len(au) > 0 { 115 | author = au[1] 116 | } else { 117 | re, _ = regexp.Compile("Author:(.*?)Address:") 118 | author = re.FindStringSubmatch(content)[1] 119 | } 120 | // Addresses & Address 121 | re, _ = regexp.Compile("Addresses:(.*?)Journal:") 122 | address := re.FindStringSubmatch(content) 123 | var addresses string 124 | if len(address) > 0 { 125 | addresses = address[1] 126 | } else { 127 | re, _ = regexp.Compile("Address:(.*?)Journal:") 128 | addresses = re.FindStringSubmatch(content)[1] 129 | } 130 | // Journal 131 | re, _ = regexp.Compile("Journal:(.*?)Abstract:") 132 | journal := re.FindStringSubmatch(content)[1] 133 | // Abstract 134 | re, _ = regexp.Compile("Abstract:(.*?)Keywords:") 135 | abstract := re.FindStringSubmatch(content)[1] 136 | // Keywords 137 | re, _ = regexp.Compile("Keywords:(.*?)DOI:") 138 | keywords := re.FindStringSubmatch(content)[1] 139 | // DOI 140 | re, _ = regexp.Compile("DOI: ") 141 | doiIndex := re.FindStringSubmatchIndex(content) 142 | rs := []rune(content) 143 | left := doiIndex[1] - 8 144 | 
right := left + 43
145 | doi := string(rs[left:right])
146 | 
147 | // 结果存入Response中转
148 | ctx.Output(map[int]interface{}{
149 | 0: title,
150 | 1: author,
151 | 2: addresses,
152 | 3: journal,
153 | 4: abstract,
154 | 5: keywords,
155 | 6: doi,
156 | })
157 | },
158 | },
159 | },
160 | },
161 | }
162 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pholcus_lib
2 | 
3 | [Pholcus](https://github.com/henrylee2cn/pholcus) 用户公共维护的spider爬虫规则库。
4 | 
5 | ## 维护规范
6 | 
7 | - 欢迎每位用户都来分享自己的爬虫规则
8 | - 每个规则放在一个单独的子目录中
9 | - 新增规则最好提供README.md
10 | - 新增规则时,须在根目录 `pholcus_lib.go` 文件的import组中添加类似`_ "github.com/henrylee2cn/pholcus_lib/jingdong"`的包引用声明
11 | - 新增规则时,须在根目录README.md(本文档)的 `爬虫规则列表` 中按子目录名`a-z`的顺序插入一条相应的规则记录
12 | - 维护旧规则时,应在规则文件或相应README.md中增加修改说明:如修改原因、修改时间、签名、联系方式等
13 | - 凡爬虫规则的贡献者均可在其源码文件或相应README.md中留下自己的签名、联系方式
14 | 
15 | 
16 | ## 爬虫规则列表
17 | 
18 | |子目录|规则描述|
19 | |---|---|
20 | |alibaba|阿里巴巴产品搜索|
21 | |area_codes|国家统计局区划代码|
22 | |baidunews|百度RSS新闻|
23 | |baidusearch|百度搜索|
24 | |car_home|汽车之家|
25 | |chinanews|中国新闻网-滚动新闻|
26 | |fang_resell_list|搜房网二手房列表|
27 | |filetest|文件下载测试|
28 | |ganji_gongsi|经典示例-赶集网企业名录|
29 | |googlesearch|谷歌搜索|
30 | |hollandandbarrett|Holland & Barrett商品数据|
31 | |IJGUC|IJGUC期刊|
32 | |jdsearch|京东搜索|
33 | |jiban|羁绊动漫|
34 | |jingdong|京东搜索(修复版)|
35 | |kaola|考拉海淘|
36 | |lewa|乐蛙登录测试|
37 | |miyabaobei|蜜芽宝贝|
38 | |people|人民网新闻抓取|
39 | |qq_avatar|我要个性网-头像昵称搜索下载|
40 | |shunfenghaitao|顺丰海淘|
41 | |taobao|淘宝数据|
42 | |taobaosearch|淘宝天猫搜索|
43 | |wangyi|网易新闻|
44 | |weibo_fans|微博粉丝列表|
45 | |wukongwenda|悟空问答|
46 | |zhihu_bianji|知乎编辑推荐|
47 | |zhihu_daily|知乎每日推荐|
48 | |zolpc|中关村笔记本|
49 | |zolphone|中关村手机|
50 | |zolslab|中关村平板|
51 | 
--------------------------------------------------------------------------------
/alibaba/alibaba.go:
--------------------------------------------------------------------------------
1 | package pholcus_lib
2 | 
3 | // 基础包
4 | import (
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
7 | . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 8 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 9 | "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | 11 | // net包 12 | "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | // "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | AlibabaProduct.Register() 31 | } 32 | 33 | var AlibabaProduct = &Spider{ 34 | Name: "阿里巴巴产品搜索", 35 | Description: "阿里巴巴产品搜索 [s.1688.com/selloffer/offer_search.htm]", 36 | // Pausetime: 300, 37 | Keyin: KEYIN, 38 | Limit: LIMIT, 39 | EnableCookie: false, 40 | RuleTree: &RuleTree{ 41 | Root: func(ctx *Context) { 42 | ctx.Aid(map[string]interface{}{"loop": [2]int{0, 1}, "Rule": "生成请求"}, "生成请求") 43 | }, 44 | 45 | Trunk: map[string]*Rule{ 46 | 47 | "生成请求": { 48 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 49 | keyin := EncodeString(ctx.GetKeyin(), "gbk") 50 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 51 | ctx.AddQueue(&request.Request{ 52 | Url: "http://s.1688.com/selloffer/offer_search.htm?enableAsync=false&earseDirect=false&button_click=top&pageSize=60&n=y&offset=3&uniqfield=pic_tag_id&keyins=" + keyin + "&beginPage=" + strconv.Itoa(loop[0]+1), 53 | Rule: aid["Rule"].(string), 54 | Header: http.Header{"Content-Type": []string{"text/html; charset=gbk"}}, 55 | }) 56 | } 57 | return nil 58 | }, 59 | ParseFunc: func(ctx *Context) { 60 | query := ctx.GetDom() 61 | // logs.Log.Debug(ctx.GetText()) 62 | pageTag := query.Find("#sm-pagination div[data-total-page]") 63 | // 跳转 64 | if len(pageTag.Nodes) == 0 { 65 | logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 由于跳转AJAX问题,目前只能每个子类抓取 1 页……\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 66 | query.Find(".sm-floorhead-typemore a").Each(func(i int, s *goquery.Selection) { 67 | if href, ok := s.Attr("href"); ok { 68 | ctx.AddQueue(&request.Request{ 69 | Url: href, 70 | Header: http.Header{"Content-Type": []string{"text/html; charset=gbk"}}, 71 | Rule: "搜索结果", 72 | }) 73 | } 74 | }) 75 | return 76 | } 77 | total1, _ := pageTag.First().Attr("data-total-page") 78 | total1 = strings.Trim(total1, " \t\n") 79 | total, _ := strconv.Atoi(total1) 80 | if total > ctx.GetLimit() { 81 | total = ctx.GetLimit() 82 | } else if total == 0 { 83 | logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 84 | return 85 | } 86 | 87 | // 调用指定规则下辅助函数 88 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, total}, "Rule": "搜索结果"}) 89 | // 用指定规则解析响应流 90 | ctx.Parse("搜索结果") 91 | }, 92 | }, 93 | 94 | "搜索结果": { 95 | //注意:有无字段语义和是否输出数据必须保持一致 96 | ItemFields: []string{ 97 | "公司", 98 | "标题", 99 | "价格", 100 | "销量", 101 | "星级", 102 | "地址", 103 | "链接", 104 | }, 105 | ParseFunc: func(ctx *Context) { 106 | query := ctx.GetDom() 107 | 108 | query.Find("#sm-offer-list > li").Each(func(i int, s *goquery.Selection) { 109 | 110 | // 获取公司 111 | company, _ := s.Find("a.sm-offer-companyName").First().Attr("title") 112 | 113 | // 获取标题 114 | t := s.Find(".sm-offer-title > a:nth-child(1)") 115 | title, _ := t.Attr("title") 116 | 117 | // 获取URL 118 | url, _ := t.Attr("href") 119 | 120 | // 获取价格 121 | price := s.Find(".sm-offer-priceNum").First().Text() 122 | 123 | // 获取成交量 124 | sales := s.Find("span.sm-offer-trade > em").First().Text() 125 | 126 | // 获取地址 127 | address, _ := s.Find(".sm-offer-location").First().Attr("title") 128 | 
129 | // 获取信用年限 130 | level := s.Find("span.sm-offer-companyTag > a.sw-ui-flaticon-cxt16x16").First().Text() 131 | 132 | // 结果存入Response中转 133 | ctx.Output(map[int]interface{}{ 134 | 0: company, 135 | 1: title, 136 | 2: price, 137 | 3: sales, 138 | 4: level, 139 | 5: address, 140 | 6: url, 141 | }) 142 | }) 143 | }, 144 | }, 145 | }, 146 | }, 147 | } 148 | -------------------------------------------------------------------------------- /area_codes/area_codes.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | 8 | //. "github.com/henrylee2cn/pholcus/app/spider/common" //选用 9 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 10 | 11 | //信息输出 12 | // net包 13 | //设置http.Header 14 | // "net/url" 15 | 16 | // 编码包 17 | // "encoding/xml" 18 | // "encoding/json" 19 | 20 | // 字符串处理包 21 | // "regexp" 22 | 23 | "strings" 24 | // 其他包 25 | // "fmt" 26 | // "math" 27 | // "time" 28 | ) 29 | 30 | func init() { 31 | AreaCodes2018.Register() 32 | } 33 | 34 | /* 35 | -- 数据清洗 36 | 37 | SET SQL_SAFE_UPDATES = 0; 38 | -- 去重 39 | delete from 2018年统计用区划代码和城乡划分代码__0__市 where id not in (select temp.id from (select min(id) as id from 2018年统计用区划代码和城乡划分代码__0__市 group by 代码) as temp); 40 | 41 | -- 合并表 42 | CREATE TABLE area_codes 43 | select 名称 as name,RPAD(代码,12,'0') as area_code,级别 as level,RPAD(上级,12,'0') as parent from 2018年统计用区划代码和城乡划分代码__0__省 44 | UNION 45 | select 名称 as name,RPAD(代码,12,'0') as area_code,级别 as level,RPAD(上级,12,'0') as parent from 2018年统计用区划代码和城乡划分代码__0__市; 46 | */ 47 | 48 | // AreaCodes2018 2018年统计用区划代码和城乡划分代码 49 | // 50 | // creatTime:2019年09月06日 09:23:55 51 | // author:hailaz 52 | var AreaCodes2018 = &Spider{ 53 | Name: "2018年统计用区划代码和城乡划分代码", 54 | Description: "2018年统计用区划代码和城乡划分代码。间隔不要小于100ms,不然容易触发验证码导致失败。总数据大概71万(暂停时长100ms,耗时2小时),所以适当做数据分批输出,不然出现内存溢出。", 55 | // Pausetime: 50, 56 | // Keyin: KEYIN, 57 | // Limit: LIMIT, 58 | EnableCookie: false, 59 | RuleTree: &RuleTree{ 60 | Root: func(ctx *Context) { 61 | ctx.AddQueue(&request.Request{ 62 | Url: "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html", 63 | Rule: "省", 64 | }) 65 | }, 66 | 67 | Trunk: map[string]*Rule{ 68 | "省": { 69 | ItemFields: []string{ 70 | "名称", 71 | "代码", 72 | "级别", 73 | "上级", 74 | }, 75 | ParseFunc: func(ctx *Context) { 76 | baseUrl := ctx.GetRequest().Url 77 | baseUrl = baseUrl[:strings.LastIndex(baseUrl, "/")+1] 78 | query := ctx.GetDom() 79 | //cc := 0 80 | query.Find("tr.provincetr").Each(func(i int, tr *goquery.Selection) { 81 | //cc++ 82 | tr.Find("td a").Each(func(j int, a *goquery.Selection) { 83 | if url, ok := a.Attr("href"); ok { 84 | code := strings.Split(url, ".")[0] 85 | url = baseUrl + url 86 | //fmt.Println("0", a.Text()+":"+url) 87 | ctx.Output(map[int]interface{}{ 88 | 0: a.Text(), 89 | 1: code, 90 | 2: 0, 91 | 3: 0, 92 | }) 93 | ctx.AddQueue(&request.Request{Url: url, Rule: "市", Temp: request.Temp{"level": 0, "parent": code}}) 94 | } 95 | }) 96 | }) 97 | //fmt.Println(cc) // 等于零,证明请求太过频繁,需要输入验证码 98 | }, 99 | }, 100 | "市": { 101 | ItemFields: []string{ 102 | "名称", 103 | "代码", 104 | "级别", 105 | "上级", 106 | }, 107 | ParseFunc: func(ctx *Context) { 108 | baseUrl := ctx.GetRequest().Url 109 | baseUrl = baseUrl[:strings.LastIndex(baseUrl, "/")+1] 110 | level := ctx.GetRequest().Temp["level"].(int) + 1 111 | parent := ctx.GetRequest().Temp["parent"].(string) 112 | query := 
ctx.GetDom() 113 | if level == 4 { 114 | myCode := "" 115 | query.Find("tr.villagetr td").Each(func(i int, td *goquery.Selection) { 116 | if i%3 == 0 { 117 | myCode = td.Text() 118 | } 119 | if i%3 == 2 { 120 | ctx.Output(map[int]interface{}{ 121 | 0: td.Text(), 122 | 1: myCode, 123 | 2: level, 124 | 3: parent, 125 | }) 126 | //fmt.Println(level, td.Text(), myCode) 127 | } 128 | }) 129 | } else { 130 | myCode := "" 131 | query.Find("tr td a").Each(func(i int, a *goquery.Selection) { 132 | if i%2 == 0 { 133 | myCode = a.Text() 134 | } 135 | if i%2 == 1 { 136 | if url, ok := a.Attr("href"); ok { 137 | code := strings.Split(strings.Split(url, "/")[1], ".")[0] 138 | url = baseUrl + url 139 | ctx.Output(map[int]interface{}{ 140 | 0: a.Text(), 141 | 1: myCode, 142 | 2: level, 143 | 3: parent, 144 | }) 145 | //fmt.Println(level, a.Text(), myCode) 146 | ctx.AddQueue(&request.Request{Url: url, Rule: "市", Temp: request.Temp{"level": level, "parent": code}}) 147 | } 148 | } 149 | }) 150 | } 151 | }, 152 | }, 153 | }, 154 | }, 155 | } 156 | -------------------------------------------------------------------------------- /baidunews/baidunews.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 8 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 9 | "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | 11 | // net包 12 | "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/json" 17 | "encoding/xml" 18 | 19 | // 字符串处理包 20 | // "regexp" 21 | // "strconv" 22 | // "strings" 23 | 24 | // 其他包 25 | // "fmt" 26 | // "math" 27 | "time" 28 | ) 29 | 30 | func init() { 31 | BaiduNews.Register() 32 | } 33 | 34 | var rss_BaiduNews = map[string]string{ 35 | "国内最新": "http://news.baidu.com/n?cmd=4&class=civilnews&tn=rss", 36 | "国际最新": "http://news.baidu.com/n?cmd=4&class=internews&tn=rss", 37 | "军事最新": "http://news.baidu.com/n?cmd=4&class=mil&tn=rss", 38 | "财经最新": "http://news.baidu.com/n?cmd=4&class=finannews&tn=rss", 39 | "互联网最新": "http://news.baidu.com/n?cmd=4&class=internet&tn=rss", 40 | "房产最新": "http://news.baidu.com/n?cmd=4&class=housenews&tn=rss", 41 | "汽车最新": "http://news.baidu.com/n?cmd=4&class=autonews&tn=rss", 42 | "体育最新": "http://news.baidu.com/n?cmd=4&class=sportnews&tn=rss", 43 | "娱乐最新": "http://news.baidu.com/n?cmd=4&class=enternews&tn=rss", 44 | "游戏最新": "http://news.baidu.com/n?cmd=4&class=gamenews&tn=rss", 45 | "教育最新": "http://news.baidu.com/n?cmd=4&class=edunews&tn=rss", 46 | "女人最新": "http://news.baidu.com/n?cmd=4&class=healthnews&tn=rss", 47 | "科技最新": "http://news.baidu.com/n?cmd=4&class=technnews&tn=rss", 48 | "社会最新": "http://news.baidu.com/n?cmd=4&class=socianews&tn=rss", 49 | } 50 | 51 | type ( 52 | BaiduNewsRss struct { 53 | Channel BaiduNewsData `xml:"channel"` 54 | } 55 | BaiduNewsData struct { 56 | Item []BaiduNewsItem `xml:"item"` 57 | } 58 | BaiduNewsItem struct { 59 | Title string `xml:"title"` 60 | Link string `xml:"link"` 61 | Description string `xml:"description"` 62 | PubDate string `xml:"pubDate"` 63 | Author string `xml:"author"` 64 | } 65 | ) 66 | 67 | var BaiduNews = &Spider{ 68 | Name: "百度RSS新闻", 69 | Description: "百度RSS新闻,实现轮询更新 [Auto Page] [news.baidu.com]", 70 | // Pausetime: 300, 71 | // Keyin: KEYIN, 72 | EnableCookie: false, 73 | // Limit: LIMIT, 74 | // 命名空间相对于数据库名,不依赖具体数据内容,可选 75 | 
Namespace: nil, 76 | // 子命名空间相对于表名,可依赖具体数据内容,可选 77 | SubNamespace: func(self *Spider, dataCell map[string]interface{}) string { 78 | return dataCell["Data"].(map[string]interface{})["分类"].(string) 79 | }, 80 | RuleTree: &RuleTree{ 81 | Root: func(ctx *Context) { 82 | for k := range rss_BaiduNews { 83 | ctx.SetTimer(k, time.Minute*5, nil) 84 | ctx.Aid(map[string]interface{}{"loop": k}, "LOOP") 85 | } 86 | }, 87 | 88 | Trunk: map[string]*Rule{ 89 | "LOOP": { 90 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 91 | k := aid["loop"].(string) 92 | v := rss_BaiduNews[k] 93 | 94 | ctx.AddQueue(&request.Request{ 95 | Url: v, 96 | Rule: "XML列表页", 97 | Header: http.Header{"Content-Type": []string{"application/xml"}}, 98 | Temp: map[string]interface{}{"src": k}, 99 | // DialTimeout: -1, 100 | // ConnTimeout: -1, 101 | // TryTimes: -1, 102 | Reloadable: true, 103 | }) 104 | return nil 105 | }, 106 | }, 107 | "XML列表页": { 108 | ParseFunc: func(ctx *Context) { 109 | var src = ctx.GetTemp("src", "").(string) 110 | defer func() { 111 | // 循环请求 112 | ctx.RunTimer(src) 113 | ctx.Aid(map[string]interface{}{"loop": src}, "LOOP") 114 | }() 115 | 116 | page := ctx.GetText() 117 | rss := new(BaiduNewsRss) 118 | if err := xml.Unmarshal([]byte(page), rss); err != nil { 119 | logs.Log.Error("XML列表页: %v", err) 120 | return 121 | } 122 | content := rss.Channel 123 | for _, v := range content.Item { 124 | ctx.AddQueue(&request.Request{ 125 | Url: v.Link, 126 | Rule: "新闻详情", 127 | Temp: map[string]interface{}{ 128 | "title": CleanHtml(v.Title, 4), 129 | "description": CleanHtml(v.Description, 4), 130 | "src": src, 131 | "releaseTime": CleanHtml(v.PubDate, 4), 132 | "author": CleanHtml(v.Author, 4), 133 | }, 134 | }) 135 | } 136 | }, 137 | }, 138 | 139 | "新闻详情": { 140 | //注意:有无字段语义和是否输出数据必须保持一致 141 | ItemFields: []string{ 142 | "标题", 143 | "描述", 144 | "内容", 145 | "发布时间", 146 | "分类", 147 | "作者", 148 | }, 149 | ParseFunc: func(ctx *Context) { 150 | var title = ctx.GetTemp("title", "").(string) 151 | 152 | infoStr, isReload := baiduNewsFn.prase(ctx) 153 | if isReload { 154 | return 155 | } 156 | // 结果存入Response中转 157 | ctx.Output(map[int]interface{}{ 158 | 0: title, 159 | 1: ctx.GetTemp("description", ""), 160 | 2: infoStr, 161 | 3: ctx.GetTemp("releaseTime", ""), 162 | 4: ctx.GetTemp("src", ""), 163 | 5: ctx.GetTemp("author", ""), 164 | }) 165 | }, 166 | }, 167 | }, 168 | }, 169 | } 170 | 171 | type baiduNews map[string]func(ctx *Context) (infoStr string, isReload bool) 172 | 173 | // @url 必须为含有协议头的地址 174 | func (b baiduNews) prase(ctx *Context) (infoStr string, isReload bool) { 175 | url := ctx.GetHost() 176 | if _, ok := b[url]; ok { 177 | return b[url](ctx) 178 | } else { 179 | return b.commonPrase(ctx), false 180 | } 181 | } 182 | 183 | func (b baiduNews) commonPrase(ctx *Context) (infoStr string) { 184 | body := ctx.GetDom().Find("body") 185 | 186 | var info *goquery.Selection 187 | 188 | if h1s := body.Find("h1"); len(h1s.Nodes) != 0 { 189 | for i := 0; i < len(h1s.Nodes); i++ { 190 | info = b.findP(h1s.Eq(i)) 191 | } 192 | } else if h2s := body.Find("h2"); len(h2s.Nodes) != 0 { 193 | for i := 0; i < len(h2s.Nodes); i++ { 194 | info = b.findP(h2s.Eq(i)) 195 | } 196 | } else if h3s := body.Find("h3"); len(h3s.Nodes) != 0 { 197 | for i := 0; i < len(h3s.Nodes); i++ { 198 | info = b.findP(h3s.Eq(i)) 199 | } 200 | } else { 201 | info = body.Find("body") 202 | } 203 | infoStr, _ = info.Html() 204 | 205 | // 清洗HTML 206 | infoStr = CleanHtml(infoStr, 5) 207 | return 208 | } 209 | 210 | func (b 
baiduNews) findP(html *goquery.Selection) *goquery.Selection { 211 | if html.Is("body") { 212 | return html 213 | } else if result := html.Parent().Find("p"); len(result.Nodes) == 0 { 214 | return b.findP(html.Parent()) 215 | } else { 216 | return html.Parent() 217 | } 218 | } 219 | 220 | var baiduNewsFn = baiduNews{ 221 | "yule.sohu.com": func(ctx *Context) (infoStr string, isReload bool) { 222 | infoStr = ctx.GetDom().Find("#contentText").Text() 223 | return 224 | }, 225 | "news.qtv.com.cn": func(ctx *Context) (infoStr string, isReload bool) { 226 | infoStr = ctx.GetDom().Find(".zwConreally_z").Text() 227 | return 228 | }, 229 | } 230 | -------------------------------------------------------------------------------- /baidusearch/baidusearch.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 8 | "github.com/henrylee2cn/pholcus/logs" //信息输出 9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | 24 | // 其他包 25 | // "fmt" 26 | "math" 27 | // "time" 28 | ) 29 | 30 | func init() { 31 | BaiduSearch.Register() 32 | } 33 | 34 | var BaiduSearch = &Spider{ 35 | Name: "百度搜索", 36 | Description: "百度搜索结果 [www.baidu.com]", 37 | // Pausetime: 300, 38 | Keyin: KEYIN, 39 | Limit: LIMIT, 40 | EnableCookie: false, 41 | // 禁止输出默认字段 Url/ParentUrl/DownloadTime 42 | NotDefaultField: true, 43 | // 命名空间相对于数据库名,不依赖具体数据内容,可选 44 | Namespace: nil, 45 | // 子命名空间相对于表名,可依赖具体数据内容,可选 46 | SubNamespace: nil, 47 | RuleTree: &RuleTree{ 48 | Root: func(ctx *Context) { 49 | ctx.Aid(map[string]interface{}{"loop": [2]int{0, 1}, "Rule": "生成请求"}, "生成请求") 50 | }, 51 | 52 | Trunk: map[string]*Rule{ 53 | 54 | "生成请求": { 55 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 56 | var duplicatable bool 57 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 58 | if loop[0] == 0 { 59 | duplicatable = true 60 | } else { 61 | duplicatable = false 62 | } 63 | ctx.AddQueue(&request.Request{ 64 | Url: "http://www.baidu.com/s?ie=utf-8&nojc=1&wd=" + ctx.GetKeyin() + "&rn=50&pn=" + strconv.Itoa(50*loop[0]), 65 | Rule: aid["Rule"].(string), 66 | Reloadable: duplicatable, 67 | }) 68 | } 69 | return nil 70 | }, 71 | ParseFunc: func(ctx *Context) { 72 | query := ctx.GetDom() 73 | total1 := query.Find(".nums").Text() 74 | re, _ := regexp.Compile(`[\D]*`) 75 | total1 = re.ReplaceAllString(total1, "") 76 | total2, _ := strconv.Atoi(total1) 77 | total := int(math.Ceil(float64(total2) / 50)) 78 | if total > ctx.GetLimit() { 79 | total = ctx.GetLimit() 80 | } else if total == 0 { 81 | logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 82 | return 83 | } 84 | // 调用指定规则下辅助函数 85 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, total}, "Rule": "搜索结果"}) 86 | // 用指定规则解析响应流 87 | ctx.Parse("搜索结果") 88 | }, 89 | }, 90 | 91 | "搜索结果": { 92 | //注意:有无字段语义和是否输出数据必须保持一致 93 | ItemFields: []string{ 94 | "标题", 95 | "内容", 96 | "不完整URL", 97 | "百度跳转", 98 | }, 99 | ParseFunc: func(ctx *Context) { 100 | query := ctx.GetDom() 101 | query.Find("#content_left .c-container").Each(func(i int, s 
*goquery.Selection) { 102 | 103 | title := s.Find(".t").Text() 104 | content := s.Find(".c-abstract").Text() 105 | href, _ := s.Find(".t >a").Attr("href") 106 | tar := s.Find(".g").Text() 107 | 108 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 109 | // title = re.ReplaceAllStringFunc(title, strings.ToLower) 110 | // content = re.ReplaceAllStringFunc(content, strings.ToLower) 111 | 112 | title = re.ReplaceAllString(title, "") 113 | content = re.ReplaceAllString(content, "") 114 | 115 | // 结果存入Response中转 116 | ctx.Output(map[int]interface{}{ 117 | 0: strings.Trim(title, " \t\n"), 118 | 1: strings.Trim(content, " \t\n"), 119 | 2: tar, 120 | 3: href, 121 | }) 122 | }) 123 | }, 124 | }, 125 | }, 126 | }, 127 | } 128 | -------------------------------------------------------------------------------- /car_home/car_home.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | // "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | CarHome.Register() 31 | } 32 | 33 | var CarHome = &Spider{ 34 | Name: "汽车之家", 35 | Description: "汽车之家帖子 [http://club.autohome.com.cn/bbs/]", 36 | // Pausetime: 300, 37 | // Keyin: KEYIN, 38 | // Limit: LIMIT, 39 | EnableCookie: false, 40 | RuleTree: &RuleTree{ 41 | Root: func(ctx *Context) { 42 | ctx.AddQueue(&request.Request{ 43 | Url: "http://club.autohome.com.cn/bbs/forum-o-200042-1.html?qaType=-1#pvareaid=101061", 44 | Rule: "请求列表", 45 | Temp: map[string]interface{}{"p": 1}, 46 | }) 47 | }, 48 | 49 | Trunk: map[string]*Rule{ 50 | 51 | "请求列表": { 52 | ParseFunc: func(ctx *Context) { 53 | var curr = ctx.GetTemp("p", 0).(int) 54 | if c := ctx.GetDom().Find(".pages .cur").Text(); c != strconv.Itoa(curr) { 55 | // Log.Printf("当前列表页不存在 %v", c) 56 | return 57 | } 58 | ctx.AddQueue(&request.Request{ 59 | Url: "http://club.autohome.com.cn/bbs/forum-o-200042-" + strconv.Itoa(curr+1) + ".html?qaType=-1#pvareaid=101061", 60 | Rule: "请求列表", 61 | Temp: map[string]interface{}{"p": curr + 1}, 62 | }) 63 | 64 | // 用指定规则解析响应流 65 | ctx.Parse("获取列表") 66 | }, 67 | }, 68 | 69 | "获取列表": { 70 | ParseFunc: func(ctx *Context) { 71 | ctx.GetDom(). 72 | Find(".list_dl"). 
73 | Each(func(i int, s *goquery.Selection) { 74 | url, _ := s.Find("dt a").Attr("href") 75 | ctx.AddQueue(&request.Request{ 76 | Url: "http://club.autohome.com.cn" + url, 77 | Rule: "输出结果", 78 | Priority: 1, 79 | }) 80 | }) 81 | }, 82 | }, 83 | 84 | "输出结果": { 85 | //注意:有无字段语义和是否输出数据必须保持一致 86 | ItemFields: []string{ 87 | 88 | "当前积分", 89 | "帖子数", 90 | "关注的车", 91 | "注册时间", 92 | "作者", 93 | }, 94 | ParseFunc: func(ctx *Context) { 95 | query := ctx.GetDom() 96 | 97 | var 当前积分, 帖子数, 关注的车, 注册时间, 作者 string 98 | 99 | 积分 := strings.Split(query.Find(".lv-curr").First().Text(), "当前积分:") 100 | if len(积分) > 1 { 101 | 当前积分 = 积分[1] 102 | } 103 | 104 | info := query.Find(".conleft").Eq(0).Find(".leftlist li") 105 | 106 | if len(info.Eq(3).Nodes) > 0 { 107 | 帖子数 = strings.Split(info.Eq(3).Find("a").Text(), "帖")[0] 108 | } 109 | 110 | for i := 6; !info.Eq(i).HasClass("leftimgs") && 111 | len(info.Eq(i).Nodes) > 0 && 112 | len(info.Eq(i).Find("a").Nodes) > 0; i++ { 113 | if strings.Contains(info.Eq(i).Text(), "所属:") { 114 | continue 115 | } 116 | 117 | fs := info.Eq(i).Find("a") 118 | var f string 119 | if len(fs.Nodes) > 1 { 120 | f, _ = info.Eq(i).Find("a").Eq(1).Attr("title") 121 | } else { 122 | f, _ = info.Eq(i).Find("a").First().Attr("title") 123 | } 124 | if f == "" { 125 | continue 126 | } 127 | 关注的车 += f + "|" 128 | } 129 | 130 | 关注的车 = strings.Trim(关注的车, "|") 131 | 132 | if len(info.Eq(4).Nodes) > 0 { 133 | 注册 := strings.Split(info.Eq(4).Text(), "注册:") 134 | if len(注册) > 1 { 135 | 注册时间 = 注册[1] 136 | } 137 | } 138 | 作者 = query.Find(".conleft").Eq(0).Find("a").Text() 139 | // 结果存入Response中转 140 | ctx.Output(map[int]interface{}{ 141 | 0: 当前积分, 142 | 1: 帖子数, 143 | 2: 关注的车, 144 | 3: 注册时间, 145 | 4: 作者, 146 | }) 147 | }, 148 | }, 149 | 150 | // "联系方式": { 151 | // ParseFunc: func(ctx *Context) { 152 | // ctx.AddFile(ctx.GetTemp("n").(string)) 153 | // }, 154 | // }, 155 | }, 156 | }, 157 | } 158 | -------------------------------------------------------------------------------- /chinanews/chinanews.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | // "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 7 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 8 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 9 | // "github.com/henrylee2cn/pholcus/logs" 10 | // net包 11 | // "net/http" //设置http.Header 12 | // "net/url" 13 | // 编码包 14 | // "encoding/xml" 15 | //"encoding/json" 16 | // 字符串处理包 17 | //"regexp" 18 | // "strconv" 19 | // "fmt" 20 | // "math" 21 | // "time" 22 | "github.com/henrylee2cn/pholcus/common/goquery" 23 | "strings" 24 | ) 25 | 26 | func init() { 27 | FileTest.Register() 28 | } 29 | 30 | var FileTest = &Spider{ 31 | Name: "中国新闻网", 32 | Description: "测试 [http://www.chinanews.com/scroll-news/news1.html]", 33 | // Pausetime: 300, 34 | // Keyin: KEYIN, 35 | // Limit: LIMIT, 36 | EnableCookie: false, 37 | RuleTree: &RuleTree{ 38 | Root: func(ctx *Context) { 39 | ctx.AddQueue(&request.Request{ 40 | Url: "http://www.chinanews.com/scroll-news/news1.html", 41 | Rule: "滚动新闻", 42 | }) 43 | }, 44 | 45 | Trunk: map[string]*Rule{ 46 | 47 | "滚动新闻": { 48 | ParseFunc: func(ctx *Context) { 49 | query := ctx.GetDom() 50 | //获取分页导航 51 | navBox := query.Find(".pagebox a") 52 | navBox.Each(func(i int, s *goquery.Selection) { 53 | if url, ok := s.Attr("href"); ok { 54 | ctx.AddQueue(&request.Request{ 55 | Url: "http://www.chinanews.com" + url, 56 | Rule: "新闻列表", 57 | 58 | }) 59 | } 60 | 61 | }) 62 | 63 | }, 64 | }, 65 | 66 | "新闻列表": { 67 | ParseFunc: func(ctx *Context) { 68 | query := ctx.GetDom() 69 | //获取新闻列表 70 | newList := query.Find(".content_list li") 71 | newList.Each(func(i int, s *goquery.Selection) { 72 | //新闻类型 73 | newsType := s.Find(".dd_lm a").Text() 74 | //标题 75 | newsTitle := s.Find(".dd_bt a").Text() 76 | //时间 77 | newsTime := s.Find(".dd_time").Text() 78 | if url, ok := s.Find(".dd_bt a").Attr("href"); ok { 79 | ctx.AddQueue(&request.Request{ 80 | Url: "http://" + url[2:len(url)], 81 | Rule: "新闻内容", 82 | Temp: map[string]interface{}{ 83 | "newsType": newsType, 84 | "newsTitle": newsTitle, 85 | "newsTime": newsTime, 86 | }, 87 | }) 88 | } 89 | 90 | }) 91 | 92 | }, 93 | }, 94 | 95 | "新闻内容": { 96 | ItemFields: []string{ 97 | "类别", 98 | "来源", 99 | "标题", 100 | "内容", 101 | "时间", 102 | }, 103 | 104 | ParseFunc: func(ctx *Context) { 105 | query := ctx.GetDom() 106 | //正文 107 | content := query.Find(".left_zw").Text() 108 | //来源 109 | from := query.Find(".left-t").Text() 110 | i := strings.LastIndex(from,"来源") 111 | //来源字符串特殊处理 112 | if i == -1{ 113 | from = "未知" 114 | }else{ 115 | from = from[i+9:len(from)] 116 | from = strings.Replace(from,"参与互动","",1) 117 | if from=="" { 118 | from = query.Find(".left-t").Eq(2).Text() 119 | from = strings.Replace(from,"参与互动","",1) 120 | } 121 | } 122 | 123 | //输出格式 124 | ctx.Output(map[int]interface{}{ 125 | 0: ctx.GetTemp("newsType",""), 126 | 1: from, 127 | 2: ctx.GetTemp("newsTitle",""), 128 | 3: content, 129 | 4: ctx.GetTemp("newsTime", ""), 130 | }) 131 | }, 132 | }, 133 | 134 | }, 135 | }, 136 | } 137 | -------------------------------------------------------------------------------- /chinanews/readme.md: -------------------------------------------------------------------------------- 1 | ## 中国新闻网-滚动新闻栏目 2 | 3 | ### 说明 4 | 5 | 只是爬取滚动新闻栏目(共10页) 6 | 7 | ### 代码说明 8 | 9 | 1.直接访问滚动新闻栏目地址(http://www.chinanews.com/scroll-news/news1.html) 10 | 2.获取分页导航 11 | 3.获取分页链接 12 | 13 | 刚开始学习,写的不好,多多指教 weChat:gaoyawei616 -------------------------------------------------------------------------------- /fang_resell_list/fang_resell_list.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | 
"github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | //"github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | //"github.com/henrylee2cn/pholcus/logs/logs" 11 | // 字符串处理包 12 | // "regexp" 13 | //"strconv" 14 | //"strings" 15 | // 其他包 16 | // "fmt" 17 | // "math" 18 | // "time" 19 | //"strings" 20 | //"strings" 21 | "strings" 22 | "github.com/henrylee2cn/pholcus/logs" 23 | "strconv" 24 | ) 25 | 26 | func init() { 27 | fangList.Register() 28 | } 29 | 30 | var fangList = &Spider{ 31 | Name: "resell house of fang.com", 32 | Description: "fang.com http://esf.zz.fang.com/house/i31/", 33 | EnableCookie: true, 34 | RuleTree: &RuleTree{ 35 | Root: func(ctx *Context) { 36 | var i = 1; 37 | //for i = 1; i < 101; i++ { 38 | ctx.AddQueue(&request.Request{ 39 | Url: "http://esf.zz.fang.com/house/i3" + strconv.Itoa(i) + "/", 40 | Rule: "fang_collection", 41 | Temp: map[string]interface{}{"p": 1}, 42 | }) 43 | //} 44 | }, 45 | 46 | Trunk: map[string]*Rule{ 47 | "fang_collection": { 48 | ItemFields: []string{ 49 | "communityName", 50 | "totalFloor", 51 | "rooms", 52 | "halls", 53 | "buildTime", 54 | "address", 55 | "direction", 56 | "area", 57 | "price", 58 | "unitPrice", 59 | "locationType", //楼层所在高低 60 | "remoteId", //搜房id 61 | "business", 62 | }, 63 | ParseFunc: func(ctx *Context) { 64 | //获取当页搜房的所有数据 65 | ctx.GetDom().Find(".houseList dl").Each( 66 | func(i int, s *goquery.Selection) { 67 | var communityName, totalFloor, rooms, halls, locationType, remoteId, buildTime, address, direction, area, price, unitPrice, business string; 68 | communityName = s.Find(".info p.mt10 a span").Text(); 69 | 70 | address = s.Find(".info p.mt10 span.iconAdress").Text(); 71 | business = ""; 72 | 73 | sp := strings.Split(address,"-"); 74 | if(len(sp) == 2){ 75 | address = sp[1]; 76 | business = sp[0]; 77 | } 78 | //获取年代中的一吨 79 | roomLineTmp := s.Find("dd.info p.mt12").Text(); 80 | roomLine := strings.Fields(roomLineTmp); 81 | 82 | if (len(roomLine) == 4 ) { 83 | //替换掉厅 84 | roomsTmp := roomLine[0]; 85 | roomsTmp = strings.Replace(roomsTmp, "厅", "", 1); 86 | roomsS := strings.Split(roomsTmp, "室"); 87 | if (len(roomsS) == 2) { 88 | rooms = roomsS[0]; 89 | halls = roomsS[1]; 90 | } 91 | //楼类型和层高获取 92 | buildingTmp := roomLine[1]; 93 | buildingTmpSec := strings.Split(buildingTmp, "(共"); 94 | if (len(buildingTmpSec) == 2) { 95 | locationType = strings.Replace(buildingTmpSec[0], "|", "", 1); 96 | totalFloor = strings.Replace(buildingTmpSec[1], "层)", "", 1); 97 | } 98 | 99 | buildTime = strings.Replace(roomLine[3], "|建筑年代:", "", 1); 100 | direction = strings.Replace(roomLine[2], "|", "", 1); 101 | direction = strings.Replace(direction, "向", "", 1); 102 | } 103 | 104 | area = s.Find("dd.info div.area").Children().Eq(0).Text(); 105 | price = s.Find("dd.info div.moreInfo").Children().Eq(0).Text(); 106 | unitPrice = s.Find("dd.info div.moreInfo").Children().Eq(1).Text(); 107 | remoteTmp, exists := s.Find("dd.info p.title a").Attr("href"); 108 | if (exists) { 109 | remoteAttr := strings.Split(remoteTmp,"_"); 110 | remoteId = strings.Replace(remoteAttr[1],".htm","",1); 111 | } 112 | 113 | logs.Log.Critical("当前房源id: %v", remoteId) 114 | //解析传入的片段 115 | // 结果存入Response中转 116 | ctx.Output(map[int]interface{}{ 117 | 0: strings.Trim(communityName, " "), 118 | 1: strings.Trim(totalFloor, " "), 119 | 2: strings.Trim(rooms, " "), 120 | 3: strings.Trim(halls, " 
"), 121 | 4: strings.Trim(buildTime, " "), 122 | 5: strings.Trim(address, " "), 123 | 6: strings.Trim(direction, " "), 124 | 7: strings.Trim(strings.Replace(area,"㎡","",1), " "), 125 | 8: strings.Trim(strings.Replace(price,"万","",1), " "), 126 | 9: strings.Trim(strings.Replace(unitPrice,"元/㎡","",1), " "), 127 | 10: strings.Trim(locationType, " "), 128 | 11: strings.Trim(remoteId, " "), 129 | 12: strings.Trim(business, " "), 130 | }) 131 | }) 132 | ctx.Parse("getContent") 133 | }, 134 | }, 135 | }, 136 | }, 137 | } 138 | -------------------------------------------------------------------------------- /fang_resell_list/readme.md: -------------------------------------------------------------------------------- 1 | ## 搜房爬取二手房列表 2 | 3 | ### 说明 4 | 5 | 仅爬取列表页, 字段: 6 | "communityName":小区名, 7 | "totalFloor":总层数, 8 | "rooms":房间数, 9 | "halls":厅数量, 10 | "buildTime":建筑年代, 11 | "address":地址, 12 | "direction":朝向, 13 | "area":面积, 14 | "price":价格, 15 | "unitPrice"单价, 16 | "locationType"所在层数高低, 17 | 18 | ### 代码说明 19 | 20 | 1.目前仅仅爬取了搜房二手房的列表页, 一次爬取一页 21 | 2.如果有需要就修改37行打开多页爬取 22 | 3.在使用中发现,如果爬取的页面数太多会导致蜘蛛崩溃, 原因未知, 待查 -------------------------------------------------------------------------------- /filetest/filetest.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | // "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 7 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 8 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 9 | // "github.com/henrylee2cn/pholcus/logs" 10 | // net包 11 | // "net/http" //设置http.Header 12 | // "net/url" 13 | // 编码包 14 | // "encoding/xml" 15 | //"encoding/json" 16 | // 字符串处理包 17 | //"regexp" 18 | // "strconv" 19 | // "strings" 20 | // 其他包 21 | // "fmt" 22 | // "math" 23 | // "time" 24 | ) 25 | 26 | func init() { 27 | FileTest.Register() 28 | } 29 | 30 | var FileTest = &Spider{ 31 | Name: "文件下载测试", 32 | Description: "文件下载测试", 33 | // Pausetime: 300, 34 | // Keyin: KEYIN, 35 | // Limit: LIMIT, 36 | EnableCookie: false, 37 | RuleTree: &RuleTree{ 38 | Root: func(ctx *Context) { 39 | ctx.AddQueue(&request.Request{ 40 | Url: "https://www.baidu.com/img/bd_logo1.png", 41 | Rule: "百度图片", 42 | ConnTimeout: -1, 43 | DownloaderID: 0, //图片等多媒体文件必须使用0(surfer surf go原生下载器) 44 | }) 45 | ctx.AddQueue(&request.Request{ 46 | Url: "https://github.com/henrylee2cn/pholcus", 47 | Rule: "Pholcus页面", 48 | ConnTimeout: -1, 49 | DownloaderID: 0, //文本文件可使用0或者1(0:surfer surf go原生下载器;1:surfer plantomjs内核) 50 | }) 51 | }, 52 | 53 | Trunk: map[string]*Rule{ 54 | 55 | "百度图片": { 56 | ParseFunc: func(ctx *Context) { 57 | ctx.FileOutput("baidu") // 等价于ctx.AddFile("baidu") 58 | }, 59 | }, 60 | "Pholcus页面": { 61 | ParseFunc: func(ctx *Context) { 62 | ctx.FileOutput() // 等价于ctx.AddFile() 63 | }, 64 | }, 65 | }, 66 | }, 67 | } 68 | -------------------------------------------------------------------------------- /ganji_gongsi/ganji_gongsi.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | // "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | GanjiGongsi.Register() 31 | } 32 | 33 | var GanjiGongsi = &Spider{ 34 | Name: "经典示例-赶集网企业名录", 35 | Description: "**典型规则示例,具有文本与文件两种输出行为**", 36 | // Pausetime: 300, 37 | // Keyin: KEYIN, 38 | // Limit: LIMIT, 39 | EnableCookie: false, 40 | RuleTree: &RuleTree{ 41 | Root: func(ctx *Context) { 42 | ctx.AddQueue(&request.Request{ 43 | Url: "http://sz.ganji.com/gongsi/o1", 44 | Rule: "请求列表", 45 | Temp: map[string]interface{}{"p": 1}, 46 | }) 47 | }, 48 | 49 | Trunk: map[string]*Rule{ 50 | 51 | "请求列表": { 52 | ParseFunc: func(ctx *Context) { 53 | var curr = ctx.GetTemp("p", int(0)).(int) 54 | if ctx.GetDom().Find(".linkOn span").Text() != strconv.Itoa(curr) { 55 | return 56 | } 57 | ctx.AddQueue(&request.Request{ 58 | Url: "http://sz.ganji.com/gongsi/o" + strconv.Itoa(curr+1), 59 | Rule: "请求列表", 60 | Temp: map[string]interface{}{"p": curr + 1}, 61 | ConnTimeout: -1, 62 | }) 63 | 64 | // 用指定规则解析响应流 65 | ctx.Parse("获取列表") 66 | }, 67 | }, 68 | 69 | "获取列表": { 70 | ParseFunc: func(ctx *Context) { 71 | ctx.GetDom(). 72 | Find(".com-list-2 table a"). 73 | Each(func(i int, s *goquery.Selection) { 74 | url, _ := s.Attr("href") 75 | ctx.AddQueue(&request.Request{ 76 | Url: url, 77 | Rule: "输出结果", 78 | ConnTimeout: -1, 79 | }) 80 | }) 81 | }, 82 | }, 83 | 84 | "输出结果": { 85 | //注意:有无字段语义和是否输出数据必须保持一致 86 | ItemFields: []string{ 87 | "公司", 88 | "联系人", 89 | "地址", 90 | "简介", 91 | "行业", 92 | "类型", 93 | "规模", 94 | }, 95 | ParseFunc: func(ctx *Context) { 96 | query := ctx.GetDom() 97 | 98 | var 公司, 规模, 行业, 类型, 联系人, 地址 string 99 | 100 | query.Find(".c-introduce li").Each(func(i int, s *goquery.Selection) { 101 | em := s.Find("em").Text() 102 | t := strings.Split(s.Text(), ` `)[0] 103 | t = strings.Replace(t, em, "", -1) 104 | t = strings.Trim(t, " ") 105 | 106 | switch em { 107 | case "公司名称:": 108 | 公司 = t 109 | 110 | case "公司规模:": 111 | 规模 = t 112 | 113 | case "公司行业:": 114 | 行业 = t 115 | 116 | case "公司类型:": 117 | 类型 = t 118 | 119 | case "联 系 人:": 120 | 联系人 = t 121 | 122 | case "联系电话:": 123 | if img, ok := s.Find("img").Attr("src"); ok { 124 | ctx.AddQueue(&request.Request{ 125 | Url: "http://www.ganji.com" + img, 126 | Rule: "联系方式", 127 | Temp: map[string]interface{}{"n": 公司 + "(" + 联系人 + ").png"}, 128 | Priority: 1, 129 | ConnTimeout: -1, 130 | }) 131 | } 132 | 133 | case "公司地址:": 134 | 地址 = t 135 | } 136 | }) 137 | 138 | 简介 := query.Find("#company_description").Text() 139 | 140 | // 结果输出方式一(推荐) 141 | ctx.Output(map[int]interface{}{ 142 | 0: 公司, 143 | 1: 联系人, 144 | 2: 地址, 145 | 3: 简介, 146 | 4: 行业, 147 | 5: 类型, 148 | 6: 规模, 149 | }) 150 | 151 | // 结果输出方式二 152 | // var item map[string]interface{} = ctx.CreatItem(map[int]interface{}{ 153 | // 0: 公司, 154 | // 1: 联系人, 155 | // 2: 地址, 156 | // 3: 简介, 157 | // 4: 行业, 158 | // 5: 类型, 159 | // 6: 规模, 160 | // }) 161 | // ctx.Output(item) 162 | 163 | // 结果输出方式三(不推荐) 164 | // ctx.Output(map[string]interface{}{ 165 | // ctx.GetItemField(0): 公司, 166 | // ctx.GetItemField(1): 联系人, 167 | // ctx.GetItemField(2): 地址, 168 | // ctx.GetItemField(3): 简介, 169 | // ctx.GetItemField(4): 行业, 170 | // ctx.GetItemField(5): 类型, 171 | // ctx.GetItemField(6): 规模, 172 | // }) 173 | }, 174 | }, 175 | 176 | "联系方式": { 177 | ParseFunc: func(ctx *Context) 
{ 178 | // 文件输出方式一(推荐) 179 | ctx.FileOutput(ctx.GetTemp("n", "").(string)) 180 | 181 | // 文件输出方式二 182 | // ctx.AddFile(ctx.GetTemp("n").(string)) 183 | }, 184 | }, 185 | }, 186 | }, 187 | } 188 | -------------------------------------------------------------------------------- /googlesearch/googlesearch.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 8 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 9 | "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | 24 | // 其他包 25 | // "fmt" 26 | "math" 27 | // "time" 28 | ) 29 | 30 | func init() { 31 | GoogleSearch.Register() 32 | } 33 | 34 | var googleIp = []string{ 35 | "210.242.125.100", 36 | "210.242.125.96", 37 | "210.242.125.91", 38 | "210.242.125.95", 39 | "64.233.189.163", 40 | "58.123.102.5", 41 | "210.242.125.97", 42 | "210.242.125.115", 43 | "58.123.102.28", 44 | "210.242.125.70", 45 | "220.255.2.153", 46 | } 47 | 48 | var GoogleSearch = &Spider{ 49 | Name: "Google search", 50 | Description: "Crawls pages from [www.google.com]", 51 | // Pausetime: 300, 52 | Keyin: KEYIN, 53 | Limit: LIMIT, 54 | EnableCookie: false, 55 | RuleTree: &RuleTree{ 56 | Root: func(ctx *Context) { 57 | var url string 58 | var success bool 59 | logs.Log.Informational("Running google spider,this may take some time...") 60 | 61 | for _, ip := range googleIp { 62 | // url = "http://" + ip + "/search?q=" + ctx.GetKeyin() + "&newwindow=1&biw=1600&bih=398&start=" 63 | // Beware of redirections, if it doesnt work use google domain: 64 | // url = "https://google.co.uk/search?q=" + ctx.GetKeyin() 65 | url = "http://" + ip + "/?gws_rd=ssl#q=" + ctx.GetKeyin() 66 | logs.Log.Informational("测试 " + ip) 67 | if _, err := goquery.NewDocument(url); err == nil { 68 | success = true 69 | break 70 | } 71 | } 72 | if !success { 73 | logs.Log.Critical("Could not reach any of the Google mirrors") 74 | return 75 | } 76 | logs.Log.Critical("Starting Google search ...") 77 | ctx.AddQueue(&request.Request{ 78 | Url: url, 79 | Rule: "total_pages", 80 | Temp: map[string]interface{}{ 81 | "baseUrl": url, 82 | }, 83 | }) 84 | }, 85 | 86 | Trunk: map[string]*Rule{ 87 | 88 | "total_pages": { 89 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 90 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 91 | ctx.AddQueue(&request.Request{ 92 | Url: aid["urlBase"].(string) +"&start="+ strconv.Itoa(10 * loop[0]), 93 | Rule: aid["Rule"].(string), 94 | }) 95 | } 96 | return nil 97 | }, 98 | ParseFunc: func(ctx *Context) { 99 | query := ctx.GetDom() 100 | txt := query.Find("#resultStats").Text() 101 | re, _ := regexp.Compile(`,+`) 102 | txt = re.ReplaceAllString(txt, "") 103 | re, _ = regexp.Compile(`[\d]+`) 104 | txt = re.FindString(txt) 105 | num, _ := strconv.Atoi(txt) 106 | total := int(math.Ceil(float64(num) / 10)) 107 | if total > ctx.GetLimit() { 108 | total = ctx.GetLimit() 109 | } else if total == 0 { 110 | logs.Log.Critical("[ERROR:| Spider:%v | KEYIN:%v | Rule:%v] Did not fetch any data!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 111 | return 112 | } 113 | // 调用指定规则下辅助函数 114 | 
ctx.Aid(map[string]interface{}{
115 | "loop": [2]int{1, total},
116 | "urlBase": ctx.GetTemp("baseUrl", ""),
117 | "Rule": "search_results",
118 | })
119 | // 用指定规则解析响应流
120 | ctx.Parse("search_results")
121 | },
122 | },
123 | 
124 | "search_results": {
125 | //注意:有无字段语义和是否输出数据必须保持一致
126 | ItemFields: []string{
127 | "title",
128 | "content",
129 | "href",
130 | },
131 | ParseFunc: func(ctx *Context) {
132 | query := ctx.GetDom()
133 | query.Find("#ires .g").Each(func(i int, s *goquery.Selection) {
134 | t := s.Find(".r > a")
135 | href, _ := t.Attr("href")
136 | href = strings.TrimPrefix(href, "/url?q=")
137 | logs.Log.Informational(href)
138 | title := t.Text()
139 | content := s.Find(".st").Text()
140 | ctx.Output(map[int]interface{}{
141 | 0: title,
142 | 1: content,
143 | 2: href,
144 | })
145 | })
146 | },
147 | },
148 | },
149 | },
150 | }
151 | 
--------------------------------------------------------------------------------
/hollandandbarrett/hollandandbarrett.go:
--------------------------------------------------------------------------------
1 | package pholcus_lib
2 | 
3 | // 基础包
4 | import (
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
8 | "github.com/henrylee2cn/pholcus/logs" //信息输出
9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
10 | 
11 | // net包
12 | // "net/http" //设置http.Header
13 | // "net/url"
14 | 
15 | // 编码包
16 | // "encoding/xml"
17 | "encoding/json"
18 | 
19 | // 字符串处理包
20 | "regexp"
21 | // "strconv"
22 | "strings"
23 | 
24 | // 其他包
25 | "fmt"
26 | // "math"
27 | // "time"
28 | )
29 | 
30 | func init() {
31 | Hollandandbarrett.Register()
32 | }
33 | 
34 | var Hollandandbarrett = &Spider{
35 | Name: "Hollandandbarrett",
36 | Description: "Holland & Barrett商品数据 [Auto Page] [www.Hollandandbarrett.com]",
37 | // Pausetime: 300,
38 | // Keyin: KEYIN,
39 | // Limit: LIMIT,
40 | EnableCookie: false,
41 | RuleTree: &RuleTree{
42 | Root: func(ctx *Context) {
43 | ctx.AddQueue(&request.Request{
44 | Url: "http://www.hollandandbarrett.com/",
45 | Rule: "获取版块URL",
46 | },
47 | )
48 | },
49 | 
50 | Trunk: map[string]*Rule{
51 | 
52 | "获取版块URL": {
53 | ParseFunc: func(ctx *Context) {
54 | query := ctx.GetDom()
55 | lis := query.Find(".footer-links nav.l-one-half a")
56 | 
57 | lis.Each(func(i int, s *goquery.Selection) {
58 | if url, ok := s.Attr("href"); ok {
59 | tit, _ := s.Attr("title")
60 | ctx.AddQueue(&request.Request{
61 | Url: "http://www.hollandandbarrett.com" + url + "?showAll=1&pageHa=1&es=true&vm=grid&imd=true&format=json&single=true",
62 | Rule: "获取总数",
63 | Temp: map[string]interface{}{
64 | "type": tit,
65 | "baseUrl": url,
66 | },
67 | },
68 | )
69 | }
70 | })
71 | },
72 | },
73 | 
74 | "获取总数": {
75 | ParseFunc: func(ctx *Context) {
76 | 
77 | query := ctx.GetDom()
78 | 
79 | re, _ := regexp.Compile(`(?U)"totalNumRecs":[\d]+,`)
80 | total := re.FindString(query.Text())
81 | re, _ = regexp.Compile(`[\d]+`)
82 | total = re.FindString(total)
83 | total = strings.Trim(total, " \t\n")
84 | 
85 | if total == "0" {
86 | logs.Log.Critical("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName())
87 | } else {
88 | 
89 | ctx.AddQueue(&request.Request{
90 | Url: "http://www.hollandandbarrett.com" + ctx.GetTemp("baseUrl", "").(string) + "?showAll=" + total + "&pageHa=1&es=true&vm=grid&imd=true&format=json&single=true",
91 | Rule: "商品详情",
92 | Temp: map[string]interface{}{
93 | "type": 
ctx.GetTemp("type", "").(string), 94 | }, 95 | }, 96 | ) 97 | 98 | } 99 | }, 100 | }, 101 | 102 | "商品详情": { 103 | //注意:有无字段语义和是否输出数据必须保持一致 104 | ItemFields: []string{ 105 | "标题", 106 | "原价", 107 | "折后价", 108 | "打折", 109 | "星级", 110 | "分类", 111 | }, 112 | ParseFunc: func(ctx *Context) { 113 | query := ctx.GetDom() 114 | 115 | src := query.Text() 116 | 117 | infos := map[string]interface{}{} 118 | 119 | err := json.Unmarshal([]byte(src), &infos) 120 | 121 | if err != nil { 122 | logs.Log.Error("error is %v\n", err) 123 | return 124 | } else { 125 | for _, info1 := range infos["contents"].([]interface{})[0].(map[string]interface{})["mainContent"].([]interface{})[0].(map[string]interface{})["records"].([]interface{}) { 126 | 127 | info2 := info1.(map[string]interface{})["records"].([]interface{})[0].(map[string]interface{})["attributes"].(map[string]interface{}) 128 | 129 | var n, price1, price2, prm, level string 130 | 131 | if info2["Name"] == nil { 132 | n = "" 133 | } else { 134 | n = fmt.Sprint(info2["Name"]) 135 | n = strings.TrimRight(n, "]") 136 | n = strings.TrimLeft(n, "[") 137 | } 138 | 139 | if info2["lp"] == nil { 140 | price1 = "" 141 | } else { 142 | price1 = fmt.Sprint(info2["lp"]) 143 | price1 = strings.TrimRight(price1, "]") 144 | price1 = strings.TrimLeft(price1, "[") 145 | } 146 | 147 | if info2["sp"] == nil { 148 | price2 = "" 149 | } else { 150 | price2 = fmt.Sprint(info2["sp"]) 151 | price2 = strings.TrimRight(price2, "]") 152 | price2 = strings.TrimLeft(price2, "[") 153 | } 154 | 155 | if info2["prm"] == nil { 156 | prm = "" 157 | } else { 158 | prm = fmt.Sprint(info2["prm"]) 159 | prm = strings.TrimRight(prm, "]") 160 | prm = strings.TrimLeft(prm, "[") 161 | } 162 | 163 | if info2["ratingCount"] == nil { 164 | level = "0" 165 | } else { 166 | level = fmt.Sprint(info2["ratingCount"]) 167 | level = strings.TrimRight(level, "]") 168 | level = strings.TrimLeft(level, "[") 169 | } 170 | 171 | // 结果存入Response中转 172 | ctx.Output(map[int]interface{}{ 173 | 0: n, 174 | 1: price1, 175 | 2: price2, 176 | 3: prm, 177 | 4: level, 178 | 5: ctx.GetTemp("type", ""), 179 | }) 180 | } 181 | } 182 | }, 183 | }, 184 | }, 185 | }, 186 | } 187 | -------------------------------------------------------------------------------- /jdsearch/jdsearch.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 8 | "github.com/henrylee2cn/pholcus/logs" //信息输出 9 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | JDSearch.Register() 31 | } 32 | 33 | var JDSearch = &Spider{ 34 | Name: "京东搜索", 35 | Description: "京东搜索结果 [search.jd.com]", 36 | // Pausetime: 300, 37 | Keyin: KEYIN, 38 | Limit: LIMIT, 39 | EnableCookie: false, 40 | RuleTree: &RuleTree{ 41 | Root: func(ctx *Context) { 42 | ctx.Aid(map[string]interface{}{"loop": [2]int{0, 1}, "Rule": "生成请求"}, "生成请求") 43 | }, 44 | 45 | Trunk: map[string]*Rule{ 46 | 47 | "生成请求": { 48 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 49 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 50 | ctx.AddQueue( 51 | &request.Request{ 52 | Url: "http://search.jd.com/Search?keyin=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&click=&psort=&page=" + strconv.Itoa(2*loop[0]+1), 53 | Rule: aid["Rule"].(string), 54 | }, 55 | ) 56 | ctx.AddQueue( 57 | &request.Request{ 58 | Url: "http://search.jd.com/Search?keyin=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&click=&psort=&page=" + strconv.Itoa(2*loop[0]+2), 59 | Rule: aid["Rule"].(string), 60 | }, 61 | ) 62 | } 63 | return nil 64 | }, 65 | ParseFunc: func(ctx *Context) { 66 | query := ctx.GetDom() 67 | 68 | total1 := query.Find("#top_pagi span.text").Text() 69 | 70 | re, _ := regexp.Compile(`[\d]+$`) 71 | total1 = re.FindString(total1) 72 | total, _ := strconv.Atoi(total1) 73 | 74 | if total > ctx.GetLimit() { 75 | total = ctx.GetLimit() 76 | } else if total == 0 { 77 | logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 78 | return 79 | } 80 | // 调用指定规则下辅助函数 81 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, total}, "Rule": "搜索结果"}) 82 | // 用指定规则解析响应流 83 | ctx.Parse("搜索结果") 84 | }, 85 | }, 86 | 87 | "搜索结果": { 88 | //注意:有无字段语义和是否输出数据必须保持一致 89 | ItemFields: []string{ 90 | "标题", 91 | "价格", 92 | "评论数", 93 | "星级", 94 | "链接", 95 | }, 96 | ParseFunc: func(ctx *Context) { 97 | query := ctx.GetDom() 98 | 99 | query.Find("#plist .list-h:nth-child(1) > li").Each(func(i int, s *goquery.Selection) { 100 | // 获取标题 101 | a := s.Find(".p-name a") 102 | title := a.Text() 103 | 104 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 105 | // title = re.ReplaceAllStringFunc(title, strings.ToLower) 106 | title = re.ReplaceAllString(title, " ") 107 | title = strings.Trim(title, " \t\n") 108 | 109 | // 获取价格 110 | price, _ := s.Find("strong[data-price]").First().Attr("data-price") 111 | 112 | // 获取评论数 113 | e := s.Find(".extra").First() 114 | discuss := e.Find("a").First().Text() 115 | re, _ = regexp.Compile(`[\d]+`) 116 | discuss = re.FindString(discuss) 117 | 118 | // 获取星级 119 | level, _ := e.Find(".star span[id]").First().Attr("class") 120 | level = re.FindString(level) 121 | 122 | // 获取URL 123 | url, _ := a.Attr("href") 124 | 125 | // 结果存入Response中转 126 | ctx.Output(map[int]interface{}{ 127 | 0: title, 128 | 1: price, 129 | 2: discuss, 130 | 3: level, 131 | 4: url, 132 | }) 133 | }) 134 | }, 135 | }, 136 | }, 137 | }, 138 | } 139 | -------------------------------------------------------------------------------- /jiban/jiban.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | import ( 4 | 
"github.com/henrylee2cn/pholcus/app/downloader/request" 5 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" 7 | // net包 8 | // "net/http" //设置http.Header 9 | // "net/url" 10 | 11 | // 编码包 12 | // "encoding/xml" 13 | // "encoding/json" 14 | 15 | // 字符串处理包 16 | "strconv" 17 | "strings" 18 | // "regexp" 19 | // 其他包 20 | // "fmt" 21 | // "math" 22 | // "time" 23 | ) 24 | 25 | func init() { 26 | Jiban.Register() 27 | } 28 | 29 | var Jiban = &Spider{ 30 | Name: "羁绊动漫", 31 | Description: "羁绊二次元资讯 [http://www.005.tv/zx/]", 32 | EnableCookie: true, 33 | RuleTree: &RuleTree{ 34 | Root: func(ctx *Context) { 35 | ctx.AddQueue(&request.Request{ 36 | Url: "http://www.005.tv/zx/list_526_1.html", 37 | Rule: "请求", 38 | Temp: map[string]interface{}{"p": 1}, 39 | ConnTimeout: -1, 40 | Reloadable: true, 41 | }) 42 | 43 | }, 44 | Trunk: map[string]*Rule{ 45 | "请求": { 46 | ParseFunc: func(ctx *Context) { 47 | var curr = ctx.GetTemp("p", int(0)).(int) 48 | ctx.GetDom().Find(".pages .dede_pages .pagelist .thisclass a").Each(func(ii int, iio *goquery.Selection) { 49 | url2, _ := iio.Attr("href") 50 | if url2 != "javascript:void(0);" { 51 | if curr > 100 { 52 | return 53 | } 54 | } 55 | }) 56 | ctx.AddQueue(&request.Request{ 57 | Url: "http://www.005.tv/zx/list_526_" + strconv.Itoa(curr+1) + ".html", 58 | Rule: "请求", 59 | Temp: map[string]interface{}{"p": curr + 1}, 60 | ConnTimeout: -1, 61 | Reloadable: true, 62 | }) 63 | ctx.Parse("获取列表") 64 | }, 65 | }, 66 | 67 | "获取列表": { 68 | ParseFunc: func(ctx *Context) { 69 | ctx.GetDom(). 70 | Find(".article-list ul li .xs-100 div h3 a"). 71 | Each(func(i int, s *goquery.Selection) { 72 | url, _ := s.Attr("href") 73 | ctx.AddQueue(&request.Request{ 74 | Url: url, 75 | Rule: "news", 76 | ConnTimeout: -1, 77 | }) 78 | }) 79 | }, 80 | }, 81 | 82 | "news": { 83 | ItemFields: []string{ 84 | "title", 85 | "time", 86 | "img_url", 87 | "content", 88 | }, 89 | ParseFunc: func(ctx *Context) { 90 | query := ctx.GetDom() 91 | var title, time, img_url, content string 92 | query.Find(".article-list-wrap"). 93 | Each(func(j int, jo *goquery.Selection) { 94 | title = jo.Find(".articleTitle-name").Text() 95 | time = jo.Find("span.time").Text() 96 | jo.Find(".articleContent img").Each(func(x int, xo *goquery.Selection) { 97 | if img, ok := xo.Attr("src"); ok { 98 | img_url = img_url + img + "," 99 | } 100 | }) 101 | jo.Find(".articleContent img").ReplaceWithHtml("#image#") 102 | jo.Find(".articleContent img").Remove() 103 | content, _ = jo.Find(".articleContent").Html() 104 | content = strings.Replace(content, `"`, `'`, -1) 105 | }) 106 | ctx.Output(map[int]interface{}{ 107 | 0: title, 108 | 1: time, 109 | 2: img_url, 110 | 3: content, 111 | }) 112 | }, 113 | }, 114 | }, 115 | }, 116 | } 117 | -------------------------------------------------------------------------------- /jingdong/README.md: -------------------------------------------------------------------------------- 1 | 根据京东新的页面规则进行了修改 2 | 3 | 1.以前是修改url中的page参数就可以得到每页的值。但是现在京东做了修改。 4 |  5 | 现在点击第二页的时候,url中的page参数会是3,修改page现在不能得到所有的商品信息的。page=2的时候的内容,会在你的页面滚动到中间的时候通过异步的方式来加载。 6 | 7 | 2.我们输入的关键字总共有多少页商品的显示方式也修改了。这个参数现在改到了一段javasript代码中,通过js来生成页面代码。 8 |  9 | 10 | 3.在存入结果的时候,我判断了一下title为空的情况。这个是因为,京东会在一些商品里面加入广告的,但是这个广告的html结构是和商品是一样的,这样我们的规则在解析的时候会得到这个无效的信息,需要去掉。 11 | 如下图: 12 |  13 | 14 | 这个爬虫整体的过程就是。 15 | 16 | 1. 先访问参数page=1的url,使用正则表达式得到这个关键字一共有多少页商品 17 | 2. 根据两种加载方式(url的直接返回和异步加载),生成所有的url。 18 | 3. 
分析页面结构,得到相关的值 19 | 20 | 第一次写,写的不好的或错的地方希望大家多多包涵。^_^ -------------------------------------------------------------------------------- /jingdong/jdSpider.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 8 | //"github.com/henrylee2cn/pholcus/logs" //信息输出 9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | //"fmt" 28 | ) 29 | 30 | func init() { 31 | JDSpider.Register() 32 | } 33 | 34 | var JDSpider = &Spider{ 35 | Name: "京东搜索new", 36 | Description: "京东搜索结果 [search.jd.com]", 37 | // Pausetime: 300, 38 | Keyin: KEYIN, 39 | Limit: LIMIT, 40 | EnableCookie: false, 41 | RuleTree: &RuleTree{ 42 | Root: func(ctx *Context) { 43 | //Aid调用Rule中的AidFunc 44 | ctx.Aid(map[string]interface{}{"Rule": "判断页数"}, "判断页数") 45 | }, 46 | 47 | Trunk: map[string]*Rule{ 48 | //只判断关键字商品一共有多少页 49 | "判断页数": { 50 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 51 | ctx.AddQueue( 52 | &request.Request{ 53 | Url: "http://search.jd.com/Search?keyword=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&s=1&click=0&page=1", 54 | Rule: aid["Rule"].(string), 55 | }, 56 | ) 57 | return nil 58 | }, 59 | ParseFunc: func(ctx *Context) { 60 | query := ctx.GetDom() 61 | pageCount := 0 62 | query.Find("script").Each(func(i int, s *goquery.Selection) { 63 | if strings.Contains(s.Text(), "page_count") { 64 | re, _ := regexp.Compile(`page_count:"\d{1,}"`) 65 | temp := re.FindString(s.Text()) 66 | re, _ = regexp.Compile(`\d{1,}`) 67 | temp2 := re.FindString(temp) 68 | pageCount, _ = strconv.Atoi(temp2) 69 | } 70 | }) 71 | ctx.Aid(map[string]interface{}{"PageCount": pageCount}, "生成请求") 72 | }, 73 | }, 74 | 75 | "生成请求": { 76 | //单数页是url直接返回,双数页是异步加载,两个url在下面有写 77 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 78 | //Url: "http://search.jd.com/Search?keyword=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&s=1&click=0&page=" + strconv.Itoa(pageNum), 79 | //Url: "http://search.jd.com/s_new.php?keyword=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&s=31&scrolling=y&pos=30&page=" + strconv.Itoa(pageNum), 80 | pageCount := aid["PageCount"].(int) 81 | 82 | for i := 1; i < pageCount; i++ { 83 | ctx.AddQueue( 84 | &request.Request{ 85 | Url: "http://search.jd.com/Search?keyword=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&s=1&click=0&page=" + strconv.Itoa(i*2-1), 86 | Rule: "搜索结果", 87 | }, 88 | ) 89 | ctx.AddQueue( 90 | &request.Request{ 91 | Url: "http://search.jd.com/s_new.php?keyword=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&s=31&scrolling=y&pos=30&page=" + strconv.Itoa(i*2), 92 | Rule: "搜索结果", 93 | }, 94 | ) 95 | } 96 | return nil 97 | }, 98 | }, 99 | 100 | "搜索结果": { 101 | //从返回中解析出数据。注:异步返回的结果页面结构是和单数页的一样的,所以就一套解析就可以了。 102 | ItemFields: []string{ 103 | "标题", 104 | "价格", 105 | "评论数", 106 | "链接", 107 | }, 108 | ParseFunc: func(ctx *Context) { 109 | query := ctx.GetDom() 110 | 111 | query.Find(".gl-item").Each(func(i int, s *goquery.Selection) { 112 | // 获取标题 113 | a := 
s.Find(".p-name.p-name-type-2 > a") 114 | title := a.Text() 115 | 116 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 117 | // title = re.ReplaceAllStringFunc(title, strings.ToLower) 118 | title = re.ReplaceAllString(title, " ") 119 | title = strings.Trim(title, " \t\n") 120 | 121 | // 获取价格 122 | price := s.Find(".p-price > strong > i").Text() 123 | 124 | // 获取评论数 125 | //#J_goodsList > ul > li:nth-child(1) > div > div.p-commit 126 | discuss := s.Find(".p-commit > strong > a").Text() 127 | 128 | // 获取URL 129 | url, _ := a.Attr("href") 130 | url = "http:" + url 131 | 132 | // 结果存入Response中转 133 | if title != "" { 134 | ctx.Output(map[int]interface{}{ 135 | 0: title, 136 | 1: price, 137 | 2: discuss, 138 | 3: url, 139 | }) 140 | } 141 | }) 142 | }, 143 | }, 144 | }, 145 | }, 146 | } 147 | -------------------------------------------------------------------------------- /kaola/kaola.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | // net包 11 | // "net/http" //设置http.Header 12 | // "net/url" 13 | // 编码包 14 | // "encoding/xml" 15 | // "encoding/json" 16 | // 字符串处理包 17 | // "regexp" 18 | // "strconv" 19 | // "strings" 20 | // 其他包 21 | // "fmt" 22 | // "math" 23 | // "time" 24 | ) 25 | 26 | func init() { 27 | Kaola.Register() 28 | } 29 | 30 | // 考拉海淘,海外直采,7天无理由退货,售后无忧!考拉网放心的海淘网站! 31 | var Kaola = &Spider{ 32 | Name: "考拉海淘", 33 | Description: "考拉海淘商品数据 [Auto Page] [www.kaola.com]", 34 | // Pausetime: 300, 35 | // Keyin: KEYIN, 36 | // Limit: LIMIT, 37 | EnableCookie: false, 38 | RuleTree: &RuleTree{ 39 | Root: func(ctx *Context) { 40 | ctx.AddQueue(&request.Request{Url: "http://www.kaola.com", Rule: "获取版块URL"}) 41 | }, 42 | 43 | Trunk: map[string]*Rule{ 44 | 45 | "获取版块URL": { 46 | ParseFunc: func(ctx *Context) { 47 | query := ctx.GetDom() 48 | lis := query.Find("#funcTab li a") 49 | lis.Each(func(i int, s *goquery.Selection) { 50 | if i == 0 { 51 | return 52 | } 53 | if url, ok := s.Attr("href"); ok { 54 | ctx.AddQueue(&request.Request{Url: url, Rule: "商品列表", Temp: map[string]interface{}{"goodsType": s.Text()}}) 55 | } 56 | }) 57 | }, 58 | }, 59 | 60 | "商品列表": { 61 | ParseFunc: func(ctx *Context) { 62 | query := ctx.GetDom() 63 | query.Find(".proinfo").Each(func(i int, s *goquery.Selection) { 64 | if url, ok := s.Find("a").Attr("href"); ok { 65 | ctx.AddQueue(&request.Request{ 66 | Url: "http://www.kaola.com" + url, 67 | Rule: "商品详情", 68 | Temp: map[string]interface{}{"goodsType": ctx.GetTemp("goodsType", "").(string)}, 69 | }) 70 | } 71 | }) 72 | }, 73 | }, 74 | 75 | "商品详情": { 76 | //注意:有无字段语义和是否输出数据必须保持一致 77 | ItemFields: []string{ 78 | "标题", 79 | "价格", 80 | "品牌", 81 | "采购地", 82 | "评论数", 83 | "类别", 84 | }, 85 | ParseFunc: func(ctx *Context) { 86 | query := ctx.GetDom() 87 | // 获取标题 88 | title := query.Find(".product-title").Text() 89 | 90 | // 获取价格 91 | price := query.Find("#js_currentPrice span").Text() 92 | 93 | // 获取品牌 94 | brand := query.Find(".goods_parameter li").Eq(0).Text() 95 | 96 | // 获取采购地 97 | from := query.Find(".goods_parameter li").Eq(1).Text() 98 | 99 | // 获取评论数 100 | discussNum := query.Find("#commentCounts").Text() 101 | 102 | // 结果存入Response中转 103 | ctx.Output(map[int]interface{}{ 104 | 0: title, 
105 | 1: price, 106 | 2: brand, 107 | 3: from, 108 | 4: discussNum, 109 | 5: ctx.GetTemp("goodsType", ""), 110 | }) 111 | }, 112 | }, 113 | }, 114 | }, 115 | } 116 | -------------------------------------------------------------------------------- /lewa/lewa.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | // "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | "net/http" //设置http.Header 13 | // "net/url" 14 | // 编码包 15 | // "encoding/xml" 16 | // "encoding/json" 17 | // 字符串处理包 18 | // "regexp" 19 | // "strconv" 20 | // "strings" 21 | // 其他包 22 | // "fmt" 23 | // "math" 24 | // "time" 25 | ) 26 | 27 | func init() { 28 | Lewa.Register() 29 | } 30 | 31 | var Lewa = &Spider{ 32 | Name: "乐蛙登录测试", 33 | Description: "乐蛙登录测试 [Auto Page] [http://accounts.lewaos.com]", 34 | // Pausetime: 300, 35 | // Keyin: KEYIN, 36 | // Limit: LIMIT, 37 | EnableCookie: true, 38 | RuleTree: &RuleTree{ 39 | Root: func(ctx *Context) { 40 | ctx.AddQueue(&request.Request{Url: "http://accounts.lewaos.com/", Rule: "登录页"}) 41 | }, 42 | 43 | Trunk: map[string]*Rule{ 44 | 45 | "登录页": { 46 | ParseFunc: func(ctx *Context) { 47 | // ctx.AddQueue(&request.Request{ 48 | // Url: "http://accounts.lewaos.com", 49 | // Rule: "登录后", 50 | // Method: "POST", 51 | // PostData: "username=123456@qq.com&password=123456&login_btn=login_btn&submit=login_btn", 52 | // }) 53 | NewForm( 54 | ctx, 55 | "登录后", 56 | "http://accounts.lewaos.com", 57 | ctx.GetDom().Find(".userlogin.lw-pl40"), 58 | ).Inputs(map[string]string{ 59 | "username": "", 60 | "password": "", 61 | }).Submit() 62 | }, 63 | }, 64 | "登录后": { 65 | ParseFunc: func(ctx *Context) { 66 | // 结果存入Response中转 67 | ctx.Output(map[string]interface{}{ 68 | "Body": ctx.GetText(), 69 | "Cookie": ctx.GetCookie(), 70 | }) 71 | ctx.AddQueue(&request.Request{ 72 | Url: "http://accounts.lewaos.com/member", 73 | Rule: "个人中心", 74 | Header: http.Header{"Referer": []string{ctx.GetUrl()}}, 75 | }) 76 | }, 77 | }, 78 | "个人中心": { 79 | ParseFunc: func(ctx *Context) { 80 | // 结果存入Response中转 81 | ctx.Output(map[string]interface{}{ 82 | "Body": ctx.GetText(), 83 | "Cookie": ctx.GetCookie(), 84 | }) 85 | }, 86 | }, 87 | }, 88 | }, 89 | } 90 | -------------------------------------------------------------------------------- /miyabaobei/miyabaobei.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | Miyabaobei.Register() 31 | } 32 | 33 | var Miyabaobei = &Spider{ 34 | Name: "蜜芽宝贝", 35 | Description: "蜜芽宝贝商品数据 [Auto Page] [www.miyabaobei.com]", 36 | // Pausetime: 300, 37 | // Keyin: KEYIN, 38 | // Limit: LIMIT, 39 | EnableCookie: false, 40 | RuleTree: &RuleTree{ 41 | Root: func(ctx *Context) { 42 | ctx.AddQueue(&request.Request{Url: "http://www.miyabaobei.com/", Rule: "获取版块URL"}) 43 | }, 44 | 45 | Trunk: map[string]*Rule{ 46 | 47 | "获取版块URL": { 48 | ParseFunc: func(ctx *Context) { 49 | query := ctx.GetDom() 50 | lis := query.Find(".ccon") 51 | lis.Each(func(i int, s *goquery.Selection) { 52 | s.Find("a").Each(func(n int, ss *goquery.Selection) { 53 | if url, ok := ss.Attr("href"); ok { 54 | if !strings.Contains(url, "http://www.miyabaobei.com") { 55 | url = "http://www.miyabaobei.com" + url 56 | } 57 | ctx.Aid(map[string]interface{}{ 58 | "loop": [2]int{0, 1}, 59 | "urlBase": url, 60 | "req": map[string]interface{}{ 61 | "Rule": "生成请求", 62 | "Temp": map[string]interface{}{"baseUrl": url}, 63 | }, 64 | }, "生成请求") 65 | } 66 | }) 67 | }) 68 | }, 69 | }, 70 | 71 | "生成请求": { 72 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 73 | req := aid["req"].(*request.Request) 74 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 75 | req.Url = aid["urlBase"].(string) + "&per_page=" + strconv.Itoa(loop[0]*40) 76 | ctx.AddQueue(req) 77 | } 78 | return nil 79 | }, 80 | ParseFunc: func(ctx *Context) { 81 | query := ctx.GetDom() 82 | totalPage := "1" 83 | 84 | urls := query.Find(".Lpage.page p a") 85 | 86 | if urls.Length() != 0 { 87 | if urls.Last().Text() == ">" { 88 | totalPage = urls.Eq(urls.Length() - 2).Text() 89 | } else { 90 | totalPage = urls.Last().Text() 91 | } 92 | } 93 | total, _ := strconv.Atoi(totalPage) 94 | 95 | // 调用指定规则下辅助函数 96 | ctx.Aid(map[string]interface{}{ 97 | "loop": [2]int{1, total}, 98 | "ruleBase": ctx.GetTemp("baseUrl", "").(string), 99 | "rep": map[string]interface{}{ 100 | "Rule": "商品列表", 101 | }, 102 | }) 103 | // 用指定规则解析响应流 104 | ctx.Parse("商品列表") 105 | }, 106 | }, 107 | 108 | "商品列表": { 109 | //注意:有无字段语义和是否输出数据必须保持一致 110 | ItemFields: []string{ 111 | "标题", 112 | "价格", 113 | "类别", 114 | }, 115 | ParseFunc: func(ctx *Context) { 116 | query := ctx.GetDom() 117 | //获取品类 118 | goodsType := query.Find(".crumbs").Text() 119 | re, _ := regexp.Compile("\\s") 120 | goodsType = re.ReplaceAllString(goodsType, "") 121 | re, _ = regexp.Compile("蜜芽宝贝>") 122 | goodsType = re.ReplaceAllString(goodsType, "") 123 | query.Find(".bmfo").Each(func(i int, s *goquery.Selection) { 124 | // 获取标题 125 | title, _ := s.Find("p a").First().Attr("title") 126 | 127 | // 获取价格 128 | price := s.Find(".f20").Text() 129 | 130 | // 结果存入Response中转 131 | ctx.Output(map[int]interface{}{ 132 | 0: title, 133 | 1: price, 134 | 2: goodsType, 135 | }) 136 | }) 137 | }, 138 | }, 139 | }, 140 | }, 141 | } 142 | -------------------------------------------------------------------------------- /people/people.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "log" 6 | 7 | // "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 8 | 
"github.com/henrylee2cn/pholcus/app/downloader/request" //必需 9 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 11 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 12 | 13 | // net包 14 | // "net/http" //设置http.Header 15 | // "net/url" 16 | 17 | // 编码包 18 | 19 | // "encoding/xml" 20 | "encoding/json" 21 | // 字符串处理包 22 | // "regexp" 23 | // "strconv" 24 | // "strings" 25 | // 其他包 26 | // "fmt" 27 | // "math" 28 | // "time" 29 | ) 30 | 31 | func init() { 32 | People.Register() 33 | } 34 | 35 | type Item struct { 36 | Id string `json:"id"` 37 | Title string `json:"title"` 38 | Url string `json:"url"` 39 | Date string `json:"date"` 40 | NodeId string `json:"nodeId"` 41 | ImgCount string `json:"imgCount"` 42 | } 43 | type News struct { 44 | Items []Item `json:"items"` 45 | } 46 | 47 | var news News 48 | 49 | var People = &Spider{ 50 | Name: "人民网新闻抓取", 51 | Description: "人民网最新分类新闻", 52 | // Pausetime: 300, 53 | // Keyin: KEYIN, 54 | // Limit: LIMIT, 55 | EnableCookie: false, 56 | RuleTree: &RuleTree{ 57 | Root: func(ctx *Context) { 58 | ctx.AddQueue(&request.Request{ 59 | Method: "GET", 60 | Url: "http://news.people.com.cn/210801/211150/index.js?cache=false", 61 | Rule: "新闻列表", 62 | }) 63 | }, 64 | 65 | Trunk: map[string]*Rule{ 66 | "新闻列表": { 67 | ParseFunc: func(ctx *Context) { 68 | 69 | //query := ctx.GetDom() 70 | //str := query.Find("body").Text() 71 | 72 | //str := `{"items":[{"id":"282","title":"人社 转型升级"战术"手册","url":"ht","date":"201","nodeId":"1001","imgCount":"4"}]}` 73 | 74 | str := ctx.GetText() 75 | 76 | err := json.Unmarshal([]byte(str), &news) 77 | if err != nil { 78 | log.Printf("解析错误: %v\n", err) 79 | return 80 | } 81 | ///////////////// 82 | newsLength := len(news.Items) 83 | for i := 0; i < newsLength; i++ { 84 | ctx.AddQueue(&request.Request{ 85 | Url: news.Items[i].Url, 86 | Rule: "热点新闻", 87 | Temp: map[string]interface{}{ 88 | "id": news.Items[i].Id, 89 | "title": news.Items[i].Title, 90 | "date": news.Items[i].Date, 91 | "newsType": news.Items[i].NodeId, 92 | }, 93 | }) 94 | } 95 | ///////////////// 96 | }, 97 | }, 98 | 99 | "热点新闻": { 100 | //注意:有无字段语义和是否输出数据必须保持一致 101 | ItemFields: []string{ 102 | "ID", 103 | "标题", 104 | "内容", 105 | "类别", 106 | "ReleaseTime", 107 | }, 108 | ParseFunc: func(ctx *Context) { 109 | query := ctx.GetDom() 110 | 111 | // 获取内容 112 | content := query.Find("#p_content").Text() 113 | // re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 114 | // content = re.ReplaceAllStringFunc(content, strings.ToLower) 115 | // content = re.ReplaceAllString(content, "") 116 | 117 | // 结果存入Response中转 118 | ctx.Output(map[int]interface{}{ 119 | 0: ctx.GetTemp("id", ""), 120 | 1: ctx.GetTemp("title", ""), 121 | 2: content, 122 | 3: ctx.GetTemp("newsType", ""), 123 | 4: ctx.GetTemp("date", ""), 124 | }) 125 | }, 126 | }, 127 | }, 128 | }, 129 | } 130 | -------------------------------------------------------------------------------- /pholcus_lib.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | import ( 4 | _ "github.com/henrylee2cn/pholcus_lib/IJGUC" 5 | _ "github.com/henrylee2cn/pholcus_lib/alibaba" 6 | _ "github.com/henrylee2cn/pholcus_lib/area_codes" 7 | _ "github.com/henrylee2cn/pholcus_lib/baidunews" 8 | _ "github.com/henrylee2cn/pholcus_lib/baidusearch" 9 | _ "github.com/henrylee2cn/pholcus_lib/car_home" 10 | _ "github.com/henrylee2cn/pholcus_lib/chinanews" 11 | _ "github.com/henrylee2cn/pholcus_lib/fang_resell_list" 12 | _ 
"github.com/henrylee2cn/pholcus_lib/filetest" 13 | _ "github.com/henrylee2cn/pholcus_lib/ganji_gongsi" 14 | _ "github.com/henrylee2cn/pholcus_lib/googlesearch" 15 | _ "github.com/henrylee2cn/pholcus_lib/hollandandbarrett" 16 | _ "github.com/henrylee2cn/pholcus_lib/jdsearch" 17 | _ "github.com/henrylee2cn/pholcus_lib/jiban" 18 | _ "github.com/henrylee2cn/pholcus_lib/jingdong" 19 | _ "github.com/henrylee2cn/pholcus_lib/kaola" 20 | _ "github.com/henrylee2cn/pholcus_lib/lewa" 21 | _ "github.com/henrylee2cn/pholcus_lib/miyabaobei" 22 | _ "github.com/henrylee2cn/pholcus_lib/people" 23 | _ "github.com/henrylee2cn/pholcus_lib/qq_avatar" 24 | _ "github.com/henrylee2cn/pholcus_lib/shunfenghaitao" 25 | _ "github.com/henrylee2cn/pholcus_lib/taobao" 26 | _ "github.com/henrylee2cn/pholcus_lib/taobaosearch" 27 | _ "github.com/henrylee2cn/pholcus_lib/wangyi" 28 | _ "github.com/henrylee2cn/pholcus_lib/weibo_fans" 29 | _ "github.com/henrylee2cn/pholcus_lib/wukongwenda" 30 | _ "github.com/henrylee2cn/pholcus_lib/zhihu_bianji" 31 | _ "github.com/henrylee2cn/pholcus_lib/zhihu_daily" 32 | _ "github.com/henrylee2cn/pholcus_lib/zolpc" 33 | _ "github.com/henrylee2cn/pholcus_lib/zolphone" 34 | _ "github.com/henrylee2cn/pholcus_lib/zolslab" 35 | ) 36 | -------------------------------------------------------------------------------- /qq_avatar/README.md: -------------------------------------------------------------------------------- 1 | ## QQ头像和昵称抓取和下载头像 2 | 3 | > 默认抓取1页 4 | -------------------------------------------------------------------------------- /qq_avatar/avatar.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . 
"github.com/henrylee2cn/pholcus/app/spider" //必需 7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 8 | "github.com/henrylee2cn/pholcus/logs" //信息输出 9 | // net包 10 | "net/http" //设置http.Header 11 | // "net/url" 12 | 13 | // 编码包 14 | // "encoding/xml" 15 | // "encoding/json" 16 | 17 | // 字符串处理包 18 | // "regexp" 19 | "strconv" 20 | "fmt" 21 | "strings" 22 | ) 23 | 24 | func init() { 25 | Avatar.Register() 26 | } 27 | 28 | var Avatar = &Spider{ 29 | 30 | Name: "QQ头像和昵称抓去和下载", 31 | Description: "QQ头像和昵称抓去和下载", 32 | // Pausetime: 300, 33 | Keyin: KEYIN, 34 | Limit: LIMIT, 35 | EnableCookie: false, 36 | NotDefaultField: true, 37 | RuleTree: &RuleTree{ 38 | Root: func(ctx *Context) { 39 | ctx.Aid(map[string]interface{}{"loop": [2]int{0, ctx.GetLimit()}, "Rule": "生成请求"}, "生成请求") 40 | }, 41 | 42 | Trunk: map[string]*Rule{ 43 | "生成请求": { 44 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 45 | var url string 46 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 47 | if loop[0] == 0 { 48 | url = "http://www.woyaogexing.com/touxiang/index.html" 49 | loop[0]++ 50 | } else { 51 | url = "http://www.woyaogexing.com/touxiang/index_" + strconv.Itoa(loop[0]+1) + ".html" 52 | } 53 | ctx.AddQueue(&request.Request{ 54 | Url: url, 55 | Rule: aid["Rule"].(string), 56 | Header: http.Header{"Content-Type": []string{"text/html; charset=utf-8"}}, 57 | }) 58 | } 59 | return nil 60 | }, 61 | ParseFunc: func(ctx *Context) { 62 | query := ctx.GetDom() 63 | // logs.Log.Debug(ctx.GetText()) 64 | pageTag := query.Find("div.pageNum.wp div.page a:last-child") 65 | // 跳转 66 | if len(pageTag.Nodes) == 0 { 67 | logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] \n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 68 | query.Find(".sm-floorhead-typemore a").Each(func(i int, s *goquery.Selection) { 69 | if href, ok := s.Attr("href"); ok { 70 | ctx.AddQueue(&request.Request{ 71 | Url: href, 72 | Header: http.Header{"Content-Type": []string{"text/html; charset=utf-8"}}, 73 | Rule: "搜索结果", 74 | }) 75 | } 76 | }) 77 | return 78 | } 79 | // 用指定规则解析响应流 80 | ctx.Parse("搜索结果") 81 | }, 82 | }, 83 | "搜索结果": { 84 | ItemFields: []string{ 85 | "avatar", 86 | "nickname", 87 | }, 88 | ParseFunc: func(ctx *Context) { 89 | query := ctx.GetDom() 90 | query.Find(".txList").Each(func(i int, selection *goquery.Selection) { 91 | src, _ := selection.Find("a.img>img").First().Attr("src") 92 | name := selection.Find("p>a").Text() 93 | fmt.Printf("nickname:%s \t url: %s\n", name, src) 94 | ctx.AddQueue(&request.Request{ 95 | Url: src, 96 | Rule: "下载文件", 97 | ConnTimeout: -1, 98 | DownloaderID: 0, 99 | }) 100 | str := strings.Split(src, "/") 101 | ctx.Output(map[int]interface{}{ 102 | 0: str[len(str)-1], 103 | 1: name, 104 | }) 105 | }) 106 | }, 107 | }, 108 | "下载文件": { 109 | ParseFunc: func(ctx *Context) { 110 | ctx.FileOutput() 111 | }, 112 | }, 113 | }, 114 | }, 115 | } 116 | 117 | -------------------------------------------------------------------------------- /shunfenghaitao/shunfenghaitao.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | // "strconv" 22 | // "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | Shunfenghaitao.Register() 31 | } 32 | 33 | // 进口母婴专区,买进口奶粉、尿裤尿布、辅食、营养、洗护、日用、母婴用品 - 顺丰海淘 34 | var Shunfenghaitao = &Spider{ 35 | Name: "顺丰海淘", 36 | Description: "顺丰海淘商品数据 [Auto Page] [www.sfht.com]", 37 | // Pausetime: 300, 38 | // Keyin: KEYIN, 39 | // Limit: LIMIT, 40 | EnableCookie: false, 41 | RuleTree: &RuleTree{ 42 | Root: func(ctx *Context) { 43 | ctx.AddQueue(&request.Request{Url: "http://www.sfht.com", Rule: "获取版块URL"}) 44 | }, 45 | 46 | Trunk: map[string]*Rule{ 47 | 48 | "获取版块URL": { 49 | ParseFunc: func(ctx *Context) { 50 | query := ctx.GetDom() 51 | 52 | lis := query.Find(".nav-c1").First().Find("li a") 53 | 54 | lis.Each(func(i int, s *goquery.Selection) { 55 | if i == 0 { 56 | return 57 | } 58 | if url, ok := s.Attr("href"); ok { 59 | ctx.AddQueue(&request.Request{Url: url, Rule: "商品列表", Temp: map[string]interface{}{"goodsType": s.Text()}}) 60 | } 61 | }) 62 | }, 63 | }, 64 | 65 | "商品列表": { 66 | ParseFunc: func(ctx *Context) { 67 | query := ctx.GetDom() 68 | 69 | query.Find(".cms-src-item").Each(func(i int, s *goquery.Selection) { 70 | if url, ok := s.Find("a").Attr("href"); ok { 71 | ctx.AddQueue(&request.Request{ 72 | Url: url, 73 | Rule: "商品详情", 74 | Temp: map[string]interface{}{"goodsType": ctx.GetTemp("goodsType", "").(string)}, 75 | }) 76 | } 77 | }) 78 | }, 79 | }, 80 | 81 | "商品详情": { 82 | //注意:有无字段语义和是否输出数据必须保持一致 83 | ItemFields: []string{ 84 | "标题", 85 | "品牌", 86 | "原产地", 87 | "货源地", 88 | "类别", 89 | }, 90 | ParseFunc: func(ctx *Context) { 91 | query := ctx.GetDom() 92 | 93 | // 获取标题 94 | title := query.Find("#titleInfo h1").Text() 95 | 96 | // 获取品牌 97 | brand := query.Find(".goods-c2 ul").Eq(0).Find("li").Eq(2).Text() 98 | re, _ := regexp.Compile(`品 牌`) 99 | brand = re.ReplaceAllString(brand, "") 100 | 101 | // 获取原产地 102 | from1 := query.Find("#detailattributes li").Eq(0).Text() 103 | 104 | // 获取货源地 105 | from2 := query.Find("#detailattributes li").Eq(1).Text() 106 | 107 | // 结果存入Response中转 108 | ctx.Output(map[int]interface{}{ 109 | 0: title, 110 | 1: brand, 111 | 2: from1, 112 | 3: from2, 113 | 4: ctx.GetTemp("goodsType", ""), 114 | }) 115 | }, 116 | }, 117 | }, 118 | }, 119 | } 120 | -------------------------------------------------------------------------------- /taobao/taobao.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 8 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 9 | "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | 11 | // net包 12 | "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | Taobao.Register() 31 | } 32 | 33 | var cookies_Taobao = "mt=ci%3D-1_0; swfstore=35673; thw=cn; cna=fcr5DRDmwnQCAT2QxZSu3Db6; sloc=%E8%BE%BD%E5%AE%81; _tb_token_=XLlMHhT9BI8IzeA; ck1=; v=0; uc3=nk2=symxAo6NBazVq7cY2z0%3D&id2=UU23CgHxOwgwgA%3D%3D&vt3=F8dAT%2BCFEEyTLicOBEc%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D; existShop=MTQzNDM1NDcyNg%3D%3D; lgc=%5Cu5C0F%5Cu7C73%5Cu7C92%5Cu559C%5Cu6B22%5Cu5927%5Cu6D77; tracknick=%5Cu5C0F%5Cu7C73%5Cu7C92%5Cu559C%5Cu6B22%5Cu5927%5Cu6D77; sg=%E6%B5%B721; cookie2=1433b814776e3b3c61f4ba3b8631a81a; cookie1=Bqbn0lh%2FkPm9D0NtnTdFiqggRYia%2FBrNeQpwLWlbyJk%3D; unb=2559173312; t=1a9b12bb535040723808836b32e53507; _cc_=WqG3DMC9EA%3D%3D; tg=5; _l_g_=Ug%3D%3D; _nk_=%5Cu5C0F%5Cu7C73%5Cu7C92%5Cu559C%5Cu6B22%5Cu5927%5Cu6D77; cookie17=UU23CgHxOwgwgA%3D%3D; mt=ci=0_1; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; whl=-1%260%260%260; uc1=lltime=1434353890&cookie14=UoW0FrfFYp27FQ%3D%3D&existShop=false&cookie16=V32FPkk%2FxXMk5UvIbNtImtMfJQ%3D%3D&cookie21=U%2BGCWk%2F7p4mBoUyTltGF&tag=7&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&pas=0; isg=C08C1D752BC08A3DCDF1FE6611FA3EE1; l=Ajk53TTUeK0ZKkG8yx7w7svcyasSxC34" 34 | 35 | var Taobao = &Spider{ 36 | Name: "淘宝数据", 37 | Description: "淘宝天猫商品数据 [Auto Page] [http://list.taobao.com/]", 38 | // Pausetime: 300, 39 | // Keyin: KEYIN, 40 | // Limit: LIMIT, 41 | EnableCookie: false, 42 | RuleTree: &RuleTree{ 43 | Root: func(ctx *Context) { 44 | ctx.AddQueue(&request.Request{ 45 | Url: "http://list.taobao.com/browse/cat-0.htm", 46 | Rule: "生成请求", 47 | Header: http.Header{ 48 | "Cookie": []string{cookies_Taobao}, 49 | }, 50 | }) 51 | }, 52 | 53 | Trunk: map[string]*Rule{ 54 | 55 | "生成请求": { 56 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 57 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 58 | for _, loc := range loc_Taobao { 59 | ctx.AddQueue(&request.Request{ 60 | Url: "http:" + aid["urlBase"].(string) + "&_input_charset=utf-8&json=on&viewIndex=1&as=0&atype=b&style=grid&same_info=1&tid=0&isnew=2&data-action&module=page&s=0&loc=" + loc + "&pSize=96&data-key=s&data-value=" + strconv.Itoa(loop[0]*96), 61 | Rule: aid["Rule"].(string), 62 | Header: http.Header{ 63 | "Cookie": []string{cookies_Taobao}, 64 | }, 65 | Temp: aid["Temp"].(map[string]interface{}), 66 | }) 67 | } 68 | } 69 | return nil 70 | }, 71 | ParseFunc: func(ctx *Context) { 72 | query := ctx.GetDom() 73 | query.Find(".J_TBMarketCat").Each(func(i int, a *goquery.Selection) { 74 | type1 := a.Find("h4").Text() 75 | a.Find(".section").Each(func(i int, b *goquery.Selection) { 76 | type2 := b.Find(".subtitle a").Text() 77 | b.Find(".sublist a").Each(func(i int, c *goquery.Selection) { 78 | type3 := c.Text() 79 | href3, _ := c.Attr("href") 80 | 81 | ctx.Aid(map[string]interface{}{ 82 | "loop": [2]int{0, 1}, 83 | "urlBase": href3, 84 | "Rule": "列表页数", 85 | "Temp": map[string]interface{}{ 86 | "type1": type1, 87 | "type2": type2, 88 | "type3": type3, 89 | }, 90 | }) 91 | }) 92 | }) 93 | }) 94 | }, 95 | }, 96 | 97 | "列表页数": { 98 | ParseFunc: func(ctx *Context) { 99 | json := ctx.GetText() 100 
| re, _ := regexp.Compile(`(?U)"totalPage":"[\d]+",`) 101 | total := re.FindString(json) 102 | re, _ = regexp.Compile(`[\d]+`) 103 | total = re.FindString(total) 104 | total = strings.Trim(total, " \t\n") 105 | totalPage, _ := strconv.Atoi(total) 106 | if total == "0" { 107 | logs.Log.Critical("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 108 | } else { 109 | ctx.Aid(map[string]interface{}{ 110 | "loop": [2]int{1, totalPage}, 111 | "urlBase": ctx.GetUrl(), 112 | "Rule": "商品列表", 113 | "Temp": ctx.CopyTemps(), 114 | }, "生成请求") 115 | ctx.Parse("商品列表") 116 | } 117 | }, 118 | }, 119 | 120 | "商品列表": { 121 | ParseFunc: func(ctx *Context) { 122 | j := ctx.GetText() 123 | // re, _ := regexp.Compile(`null`) 124 | // j = re.ReplaceAllString(j, " ") 125 | 126 | infos := map[string]interface{}{} 127 | err := json.Unmarshal([]byte(j), &infos) 128 | if err != nil { 129 | logs.Log.Error("商品列表解析错误: %v\n", err) 130 | return 131 | } 132 | if infos["mallItemList"] == nil { 133 | logs.Log.Error("商品列表解析错误: 内容不存在!") 134 | return 135 | } 136 | for _, item := range infos["mallItemList"].([]interface{}) { 137 | item2 := item.(map[string]interface{}) 138 | temp := ctx.CreatItem(map[int]interface{}{ 139 | 0: item2["title"], 140 | 1: item2["price"], 141 | 2: item2["currentPrice"], 142 | 3: item2["vipPrice"], 143 | 4: item2["unitPrice"], 144 | 5: item2["unit"], 145 | 6: item2["isVirtual"], 146 | 7: item2["ship"], 147 | 8: item2["tradeNum"], 148 | 9: item2["formatedNum"], 149 | 10: item2["nick"], 150 | 11: item2["sellerId"], 151 | 12: item2["guarantee"], 152 | 13: item2["itemId"], 153 | 14: item2["isLimitPromotion"], 154 | 15: item2["loc"], 155 | 16: "http:" + item2["storeLink"].(string), 156 | 17: "http:" + item2["href"].(string), 157 | 18: item2["commend"], 158 | 19: item2["source"], 159 | 20: item2["ratesum"], 160 | 21: item2["goodRate"], 161 | 22: item2["dsrScore"], 162 | 23: item2["spSource"], 163 | }, "结果") 164 | ctx.AddQueue(&request.Request{ 165 | Url: "http:" + item2["href"].(string), 166 | Rule: "商品详情", 167 | Temp: temp, 168 | Priority: 1, 169 | }) 170 | } 171 | }, 172 | }, 173 | 174 | "商品详情": { 175 | 176 | ParseFunc: func(ctx *Context) { 177 | query := ctx.GetDom() 178 | 179 | // 商品规格参数 180 | detail := make(map[string]string) 181 | 182 | if li := query.Find(".attributes-list ul li"); len(li.Nodes) != 0 { 183 | // 天猫店宝贝详情 184 | li.Each(func(i int, s *goquery.Selection) { 185 | native := s.Text() 186 | slice := strings.Split(native, ": ") 187 | //空格替换为分隔号“|” 188 | slice[1] = strings.Replace(slice[1], " ", "|", -1) 189 | detail[slice[0]] = UnicodeToUTF8(slice[1]) 190 | }) 191 | 192 | } else { 193 | // 淘宝店宝贝详情 194 | query.Find(".attributes-list li").Each(func(i int, s *goquery.Selection) { 195 | native := s.Text() 196 | slice := strings.Split(native, ": ") 197 | detail[slice[0]] = slice[1] 198 | }) 199 | } 200 | 201 | temp := ctx.CopyTemps() 202 | temp[ctx.GetItemField(24, "结果")] = detail 203 | temp[ctx.GetItemField(25, "结果")] = []interface{}{} 204 | 205 | ctx.AddQueue(&request.Request{ 206 | Rule: "商品评论", 207 | Url: "http://rate.taobao.com/feedRateList.htm?siteID=4&rateType=&orderType=sort_weight&showContent=1&userNumId=" + 208 | ctx.GetTemp("sellerId", "").(string) + 209 | "&auctionNumId=" + 210 | ctx.GetTemp("itemId", "").(string) + 211 | "¤tPageNum=1", 212 | Temp: temp, 213 | Priority: 2, 214 | }) 215 | }, 216 | }, 217 | 218 | "商品评论": { 219 | ParseFunc: func(ctx *Context) { 220 | j := ctx.GetText() 221 | j = strings.TrimLeft(j, "(") 222 | j = 
strings.TrimRight(j, ")") 223 | 224 | infos := map[string]interface{}{} 225 | if err := json.Unmarshal([]byte(j), &infos); err != nil { 226 | logs.Log.Error("商品评论解析错误: %v\n", err) 227 | return 228 | } 229 | if infos["comments"] == nil || infos["maxPage"] == nil || infos["currentPageNum"] == nil { 230 | logs.Log.Error("商品评论解析错误: 内容不存在!") 231 | return 232 | } 233 | discussSlice := infos["comments"].([]interface{}) 234 | var discussAll = ctx.GetTemp(ctx.GetItemField(25, "结果"), []interface{}{}).([]interface{}) 235 | discussAll = append(discussAll, discussSlice...) 236 | temp := ctx.CopyTemps() 237 | temp[ctx.GetItemField(25, "结果")] = discussAll 238 | 239 | currentPageNum := infos["currentPageNum"].(int) 240 | maxPage := infos["maxPage"].(int) 241 | if currentPageNum < maxPage { 242 | // 请求下一页 243 | ctx.AddQueue(&request.Request{ 244 | Rule: "商品评论", 245 | Url: "http://rate.taobao.com/feedRateList.htm?siteID=4&rateType=&orderType=sort_weight&showContent=1&userNumId=" + 246 | ctx.GetTemp("sellerId", "").(string) + 247 | "&auctionNumId=" + 248 | ctx.GetTemp("itemId", "").(string) + 249 | "¤tPageNum=" + 250 | strconv.Itoa(currentPageNum+1), 251 | Temp: temp, 252 | }) 253 | } else { 254 | // 输出结果 255 | ctx.Parse("结果") 256 | } 257 | }, 258 | }, 259 | 260 | "结果": { 261 | //注意:有无字段语义和是否输出数据必须保持一致 262 | ItemFields: []string{ 263 | "标题", //title 264 | "原价", //price 265 | "现价", //currentPrice 266 | "会员价", //vipPrice 267 | "单价", //unitPrice 268 | "单位", //unit 269 | "是否虚拟物品", //isVirtual 270 | "ship", //ship 271 | "tradeNum", //tradeNum 272 | "formatedNum", //formatedNum 273 | "店铺", //nick 274 | "店铺ID", //sellerId 275 | "guarantee", //guarantee 276 | "货号", //itemId 277 | "isLimitPromotion", //isLimitPromotion 278 | "发货地", //loc 279 | "店铺链接", //storeLink 280 | "商品链接", //href 281 | "评价", //commend 282 | "source", //source 283 | "店铺信誉", //ratesum 284 | "店铺好评率", //goodRate 285 | "dsrScore", //dsrScore 286 | "spSource", //spSource 287 | "规格参数", 288 | "评论内容", 289 | }, 290 | ParseFunc: func(ctx *Context) { 291 | // 结果存入Response中转 292 | ctx.Output(ctx.CopyTemps()) 293 | }, 294 | }, 295 | }, 296 | }, 297 | } 298 | 299 | var ( 300 | loc_Taobao = map[string]string{ 301 | // "北京": "%E5%8C%97%E4%BA%AC", 302 | // "上海": "%E4%B8%8A%E6%B5%B7", 303 | // "广州": "%E5%B9%BF%E5%B7%9E", 304 | // "深圳": "%E6%B7%B1%E5%9C%B3", 305 | // "杭州": "%E6%9D%AD%E5%B7%9E", 306 | // "海外": "%E7%BE%8E%E5%9B%BD%2C%E8%8B%B1%E5%9B%BD%2C%E6%B3%95%E5%9B%BD%2C%E7%91%9E%E5%A3%AB%2C%E6%BE%B3%E6%B4%B2%2C%E6%96%B0%E8%A5%BF%E5%85%B0%2C%E5%8A%A0%E6%8B%BF%E5%A4%A7%2C%E5%A5%A5%E5%9C%B0%E5%88%A9%2C%E9%9F%A9%E5%9B%BD%2C%E6%97%A5%E6%9C%AC%2C%E5%BE%B7%E5%9B%BD%2C%E6%84%8F%E5%A4%A7%E5%88%A9%2C%E8%A5%BF%E7%8F%AD%E7%89%99%2C%E4%BF%84%E7%BD%97%E6%96%AF%2C%E6%B3%B0%E5%9B%BD%2C%E5%8D%B0%E5%BA%A6%2C%E8%8D%B7%E5%85%B0%2C%E6%96%B0%E5%8A%A0%E5%9D%A1%2C%E5%85%B6%E5%AE%83%E5%9B%BD%E5%AE%B6", 307 | // "江浙沪": "%E6%B1%9F%E8%8B%8F%2C%E6%B5%99%E6%B1%9F%2C%E4%B8%8A%E6%B5%B7", 308 | // "珠三角": "%E5%B9%BF%E5%B7%9E%2C%E6%B7%B1%E5%9C%B3%2C%E4%B8%AD%E5%B1%B1%2C%E7%8F%A0%E6%B5%B7%2C%E4%BD%9B%E5%B1%B1%2C%E4%B8%9C%E8%8E%9E%2C%E6%83%A0%E5%B7%9E", 309 | // "京津冀": "%E5%8C%97%E4%BA%AC%2C%E5%A4%A9%E6%B4%A5%2C%E6%B2%B3%E5%8C%97", 310 | // "东三省": "%E9%BB%91%E9%BE%99%E6%B1%9F%2C%E5%90%89%E6%9E%97%2C%E8%BE%BD%E5%AE%81", 311 | // "港澳台": "%E9%A6%99%E6%B8%AF%2C%E6%BE%B3%E9%97%A8%2C%E5%8F%B0%E6%B9%BE", 312 | // "江浙沪皖": "%E6%B1%9F%E8%8B%8F%2C%E6%B5%99%E6%B1%9F%2C%E4%B8%8A%E6%B5%B7%2C%E5%AE%89%E5%BE%BD", 313 | // "长沙": "%E9%95%BF%E6%B2%99", 314 | // "长春": "%E9%95%BF%E6%98%A5", 315 | // "成都": 
"%E6%88%90%E9%83%BD", 316 | // "重庆": "%E9%87%8D%E5%BA%86", 317 | // "大连": "%E5%A4%A7%E8%BF%9E", 318 | // "东莞": "%E4%B8%9C%E8%8E%9E", 319 | // "福州": "%E7%A6%8F%E5%B7%9E", 320 | // "合肥": "%E5%90%88%E8%82%A5", 321 | // "济南": "%E6%B5%8E%E5%8D%97", 322 | // "嘉兴": "%E5%98%89%E5%85%B4", 323 | // "昆明": "51108009&loc=%E6%98%86%E6%98%8E", 324 | // "宁波": "%E5%AE%81%E6%B3%A2", 325 | // "南京": "%E5%8D%97%E4%BA%AC", 326 | // "南昌": "%E5%8D%97%E6%98%8C", 327 | // "青岛": "%E9%9D%92%E5%B2%9B", 328 | // "苏州": "%E8%8B%8F%E5%B7%9E", 329 | // "沈阳": "%E6%B2%88%E9%98%B3", 330 | // "天津": "%E5%A4%A9%E6%B4%A5", 331 | // "温州": "%E6%B8%A9%E5%B7%9E", 332 | // "无锡": "%E6%97%A0%E9%94%A1", 333 | // "武汉": "%E6%AD%A6%E6%B1%89", 334 | // "西安": "%E8%A5%BF%E5%AE%89", 335 | // "厦门": "%E5%8E%A6%E9%97%A8", 336 | // "郑州": "%E9%83%91%E5%B7%9E", 337 | // "中山": "%E4%B8%AD%E5%B1%B1", 338 | // "石家庄": "%E7%9F%B3%E5%AE%B6%E5%BA%84", 339 | // "哈尔滨": "%E5%93%88%E5%B0%94%E6%BB%A8", 340 | // 省级 341 | // "安徽": "%E5%AE%89%E5%BE%BD", 342 | // "福建": "%E7%A6%8F%E5%BB%BA", 343 | // "甘肃": "%E7%94%98%E8%82%83", 344 | // "广东": "%E5%B9%BF%E4%B8%9C", 345 | // "广西": "%E5%B9%BF%E8%A5%BF", 346 | // "贵州": "%E8%B4%B5%E5%B7%9E", 347 | // "河北": "%E6%B2%B3%E5%8C%97", 348 | // "河南": "%E6%B2%B3%E5%8D%97", 349 | // "湖北": "%E6%B9%96%E5%8C%97", 350 | // "湖南": "%E6%B9%96%E5%8D%97", 351 | // "海南": "%E6%B5%B7%E5%8D%97", 352 | // "江苏": "%E6%B1%9F%E8%8B%8F", 353 | // "江西": "%E6%B1%9F%E8%A5%BF", 354 | // "吉林": "%E5%90%89%E6%9E%97", 355 | // "辽宁": "%E8%BE%BD%E5%AE%81", 356 | // "宁夏": "%E5%AE%81%E5%A4%8F", 357 | // "青海": "%E9%9D%92%E6%B5%B7", 358 | // "山东": "%E5%B1%B1%E4%B8%9C", 359 | // "山西": "%E5%B1%B1%E8%A5%BF", 360 | // "陕西": "%E9%99%95%E8%A5%BF", 361 | // "四川": "%E5%9B%9B%E5%B7%9D", 362 | // "西藏": "%E8%A5%BF%E8%97%8F", 363 | // "新疆": "%E6%96%B0%E7%96%86", 364 | // "云南": "%E4%BA%91%E5%8D%97", 365 | // "浙江": "%E6%B5%99%E6%B1%9F", 366 | // "澳门": "%E6%BE%B3%E9%97%A8", 367 | // "香港": "%E9%A6%99%E6%B8%AF", 368 | // "台湾": "%E5%8F%B0%E6%B9%BE", 369 | // "内蒙古": "%E5%86%85%E8%92%99%E5%8F%A4", 370 | // "黑龙江": "%E9%BB%91%E9%BE%99%E6%B1%9F", 371 | "": "", 372 | } 373 | ) 374 | -------------------------------------------------------------------------------- /taobaosearch/taobaosearch.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | // "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 7 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 8 | . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 9 | "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | TaobaoSearch.Register() 31 | } 32 | 33 | var TaobaoSearch = &Spider{ 34 | Name: "淘宝天猫搜索", 35 | Description: "淘宝天猫搜索结果 [s.taobao.com]", 36 | // Pausetime: 300, 37 | Keyin: KEYIN, 38 | Limit: LIMIT, 39 | EnableCookie: false, 40 | RuleTree: &RuleTree{ 41 | Root: func(ctx *Context) { 42 | ctx.Aid(map[string]interface{}{"loop": [2]int{0, 1}, "Rule": "生成请求"}, "生成请求") 43 | }, 44 | 45 | Trunk: map[string]*Rule{ 46 | 47 | "生成请求": { 48 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 49 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 50 | ctx.AddQueue(&request.Request{ 51 | Url: "http://s.taobao.com/search?q=" + ctx.GetKeyin() + "&ie=utf8&cps=yes&app=vproduct&cd=false&v=auction&tab=all&vlist=1&bcoffset=1&s=" + strconv.Itoa(loop[0]*44), 52 | Rule: aid["Rule"].(string), 53 | }) 54 | } 55 | return nil 56 | }, 57 | ParseFunc: func(ctx *Context) { 58 | query := ctx.GetDom() 59 | src := query.Find("script").Text() 60 | if strings.Contains(src, "抱歉!没有找到与") { 61 | logs.Log.Critical(" ********************** 淘宝关键词 [%v] 的搜索结果不存在! ********************** ", ctx.GetKeyin()) 62 | return 63 | } 64 | 65 | re, _ := regexp.Compile(`(?U)"totalCount":[\d]+}`) 66 | total := re.FindString(src) 67 | re, _ = regexp.Compile(`[\d]+`) 68 | total = re.FindString(total) 69 | totalCount, _ := strconv.Atoi(total) 70 | 71 | maxPage := (totalCount - 4) / 44 72 | if (totalCount-4)%44 > 0 { 73 | maxPage++ 74 | } 75 | 76 | if ctx.GetLimit() > maxPage || ctx.GetLimit() == 0 { 77 | ctx.SetLimit(maxPage) 78 | } else if ctx.GetLimit() == 0 { 79 | logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 80 | return 81 | } 82 | 83 | logs.Log.Critical(" ********************** 淘宝关键词 [%v] 的搜索结果共有 %v 页,计划抓取 %v 页 **********************", ctx.GetKeyin(), maxPage, ctx.GetLimit()) 84 | // 调用指定规则下辅助函数 85 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, ctx.GetLimit()}, "Rule": "搜索结果"}) 86 | // 用指定规则解析响应流 87 | ctx.Parse("搜索结果") 88 | }, 89 | }, 90 | 91 | "搜索结果": { 92 | ParseFunc: func(ctx *Context) { 93 | query := ctx.GetDom() 94 | src := query.Find("script").Text() 95 | 96 | re, _ := regexp.Compile(`"auctions".*,"recommendAuctions"`) 97 | src = re.FindString(src) 98 | 99 | re, _ = regexp.Compile(`"auctions":`) 100 | src = re.ReplaceAllString(src, "") 101 | 102 | re, _ = regexp.Compile(`,"recommendAuctions"`) 103 | src = re.ReplaceAllString(src, "") 104 | 105 | re, _ = regexp.Compile("\\<[\\S\\s]+?\\>") 106 | // src = re.ReplaceAllStringFunc(src, strings.ToLower) 107 | src = re.ReplaceAllString(src, " ") 108 | 109 | src = strings.Trim(src, " \t\n") 110 | 111 | infos := []map[string]interface{}{} 112 | 113 | err := json.Unmarshal([]byte(src), &infos) 114 | 115 | if err != nil { 116 | logs.Log.Error("error is %v\n", err) 117 | return 118 | } else { 119 | for _, info := range infos { 120 | ctx.AddQueue(&request.Request{ 121 | Url: "http:" + info["detail_url"].(string), 122 | Rule: "商品详情", 123 | Temp: ctx.CreatItem(map[int]interface{}{ 124 | 0: info["raw_title"], 125 | 1: info["view_price"], 126 | 2: info["view_sales"], 127 | 3: info["nick"], 128 | 4: 
info["item_loc"], 129 | }, "商品详情"), 130 | Priority: 1, 131 | }) 132 | } 133 | } 134 | }, 135 | }, 136 | "商品详情": { 137 | //注意:有无字段语义和是否输出数据必须保持一致 138 | ItemFields: []string{ 139 | "标题", 140 | "价格", 141 | "销量", 142 | "店铺", 143 | "发货地", 144 | }, 145 | ParseFunc: func(ctx *Context) { 146 | r := ctx.CopyTemps() 147 | 148 | re := regexp.MustCompile(`"newProGroup":.*,"progressiveSupport"`) 149 | d := re.FindString(ctx.GetText()) 150 | 151 | if d == "" { 152 | h, _ := ctx.GetDom().Find(".attributes-list").Html() 153 | d = UnicodeToUTF8(h) 154 | d = strings.Replace(d, " ", " ", -1) 155 | d = CleanHtml(d, 5) 156 | d = strings.Replace(d, "产品参数:\n", "", -1) 157 | 158 | for _, v := range strings.Split(d, "\n") { 159 | if v == "" { 160 | continue 161 | } 162 | feild := strings.Split(v, ":") 163 | // 去除英文空格 164 | // feild[0] = strings.Trim(feild[0], " ") 165 | // feild[1] = strings.Trim(feild[1], " ") 166 | // 去除中文空格 167 | feild[0] = strings.Trim(feild[0], " ") 168 | feild[1] = strings.Trim(feild[1], " ") 169 | 170 | if feild[0] == "" || feild[1] == "" { 171 | continue 172 | } 173 | 174 | ctx.UpsertItemField(feild[0]) 175 | r[feild[0]] = feild[1] 176 | } 177 | 178 | } else { 179 | d = strings.Replace(d, `"newProGroup":`, "", -1) 180 | d = strings.Replace(d, `,"progressiveSupport"`, "", -1) 181 | 182 | infos := []map[string]interface{}{} 183 | 184 | err := json.Unmarshal([]byte(d), &infos) 185 | 186 | if err != nil { 187 | logs.Log.Error("error is %v\n", err) 188 | return 189 | } else { 190 | for _, info := range infos { 191 | for _, attr := range info["attrs"].([]interface{}) { 192 | a := attr.(map[string]interface{}) 193 | ctx.UpsertItemField(a["name"].(string)) 194 | r[a["name"].(string)] = a["value"] 195 | } 196 | } 197 | } 198 | } 199 | 200 | ctx.Output(r) 201 | }, 202 | }, 203 | }, 204 | }, 205 | } 206 | -------------------------------------------------------------------------------- /wangyi/wangyi.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | 17 | // "encoding/xml" 18 | // "encoding/json" 19 | 20 | // 字符串处理包 21 | "regexp" 22 | // "strconv" 23 | "strings" 24 | // 其他包 25 | // "fmt" 26 | // "math" 27 | // "time" 28 | ) 29 | 30 | func init() { 31 | Wangyi.Register() 32 | } 33 | 34 | var Wangyi = &Spider{ 35 | Name: "网易新闻", 36 | Description: "网易排行榜新闻,含点击/跟帖排名 [Auto Page] [news.163.com/rank]", 37 | // Pausetime: 300, 38 | // Keyin: KEYIN, 39 | // Limit: LIMIT, 40 | EnableCookie: false, 41 | RuleTree: &RuleTree{ 42 | Root: func(ctx *Context) { 43 | ctx.AddQueue(&request.Request{Url: "http://news.163.com/rank/", Rule: "排行榜主页"}) 44 | }, 45 | 46 | Trunk: map[string]*Rule{ 47 | 48 | "排行榜主页": { 49 | ParseFunc: func(ctx *Context) { 50 | query := ctx.GetDom() 51 | query.Find(".subNav a").Each(func(i int, s *goquery.Selection) { 52 | if url, ok := s.Attr("href"); ok { 53 | ctx.AddQueue(&request.Request{Url: url, Rule: "新闻排行榜"}) 54 | } 55 | }) 56 | }, 57 | }, 58 | 59 | "新闻排行榜": { 60 | ParseFunc: func(ctx *Context) { 61 | topTit := []string{ 62 | "1小时前点击排行", 63 | "24小时点击排行", 64 | "本周点击排行", 65 | "今日跟帖排行", 66 | "本周跟帖排行", 67 | "本月跟贴排行", 68 | } 69 | query := ctx.GetDom() 70 | // 获取新闻分类 71 | newsType := query.Find(".titleBar h2").Text() 72 | 73 | urls_top := map[string]string{} 74 | 75 | query.Find(".tabContents").Each(func(n int, t *goquery.Selection) { 76 | t.Find("tr").Each(func(i int, s *goquery.Selection) { 77 | // 跳过标题栏 78 | if i == 0 { 79 | return 80 | } 81 | // 内容链接 82 | url, ok := s.Find("a").Attr("href") 83 | 84 | // 排名 85 | top := s.Find(".cBlue").Text() 86 | 87 | if ok { 88 | urls_top[url] += topTit[n] + ":" + top + "," 89 | } 90 | }) 91 | }) 92 | for k, v := range urls_top { 93 | ctx.AddQueue(&request.Request{ 94 | Url: k, 95 | Rule: "热点新闻", 96 | Temp: map[string]interface{}{ 97 | "newsType": newsType, 98 | "top": v, 99 | }, 100 | }) 101 | } 102 | }, 103 | }, 104 | 105 | "热点新闻": { 106 | //注意:有无字段语义和是否输出数据必须保持一致 107 | ItemFields: []string{ 108 | "标题", 109 | "内容", 110 | "排名", 111 | "类别", 112 | "ReleaseTime", 113 | }, 114 | ParseFunc: func(ctx *Context) { 115 | query := ctx.GetDom() 116 | 117 | // 若有多页内容,则获取阅读全文的链接并获取内容 118 | if pageAll := query.Find(".ep-pages-all"); len(pageAll.Nodes) != 0 { 119 | if pageAllUrl, ok := pageAll.Attr("href"); ok { 120 | ctx.AddQueue(&request.Request{ 121 | Url: pageAllUrl, 122 | Rule: "热点新闻", 123 | Temp: ctx.CopyTemps(), 124 | }) 125 | } 126 | return 127 | } 128 | 129 | // 获取标题 130 | title := query.Find("#h1title").Text() 131 | 132 | // 获取内容 133 | content := query.Find("#endText").Text() 134 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 135 | // content = re.ReplaceAllStringFunc(content, strings.ToLower) 136 | content = re.ReplaceAllString(content, "") 137 | 138 | // 获取发布日期 139 | release := query.Find(".ep-time-soure").Text() 140 | release = strings.Split(release, "来源:")[0] 141 | release = strings.Trim(release, " \t\n") 142 | 143 | // 结果存入Response中转 144 | ctx.Output(map[int]interface{}{ 145 | 0: title, 146 | 1: content, 147 | 2: ctx.GetTemp("top", ""), 148 | 3: ctx.GetTemp("newsType", ""), 149 | 4: release, 150 | }) 151 | }, 152 | }, 153 | }, 154 | }, 155 | } 156 | -------------------------------------------------------------------------------- /weibo_fans/weibo_fans.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" 
//必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 8 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 9 | "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | 11 | // net包 12 | "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | // "regexp" 21 | "strconv" 22 | "strings" 23 | 24 | // 其他包 25 | "fmt" 26 | // "math" 27 | // "time" 28 | // "io/ioutil" 29 | ) 30 | 31 | func init() { 32 | WeiboFans.Register() 33 | } 34 | 35 | var WeiboFans = &Spider{ 36 | Name: "微博粉丝列表", 37 | Description: `新浪微博粉丝 [自定义输入格式 "ID"::"Cookie"][最多支持250页,内设定时1~2s]`, 38 | Pausetime: 2000, 39 | Keyin: KEYIN, 40 | Limit: LIMIT, 41 | EnableCookie: true, 42 | RuleTree: &RuleTree{ 43 | Root: func(ctx *Context) { 44 | param := strings.Split(ctx.GetKeyin(), "::") 45 | if len(param) != 2 { 46 | logs.Log.Error("自定义输入的参数不正确!") 47 | return 48 | } 49 | id := strings.Trim(param[0], " ") 50 | cookie := strings.Trim(param[1], " ") 51 | 52 | var count1 = 250 53 | var count2 = 50 54 | if ctx.GetLimit() < count1 { 55 | count1 = ctx.GetLimit() 56 | } 57 | if ctx.GetLimit() < count2 { 58 | count2 = ctx.GetLimit() 59 | } 60 | for i := count1; i > 0; i-- { 61 | ctx.AddQueue(&request.Request{ 62 | Url: "http://weibo.com/" + id + "/fans?cfs=600&relate=fans&t=1&f=1&type=&Pl_Official_RelationFans__68_page=" + strconv.Itoa(i) + "#Pl_Official_RelationFans__68", 63 | Rule: "好友列表", 64 | Header: http.Header{"Cookie": []string{cookie}}, 65 | DownloaderID: 0, 66 | }) 67 | } 68 | for i := 1; i <= count2; i++ { 69 | ctx.AddQueue(&request.Request{ 70 | Url: "http://www.weibo.com/" + id + "/fans?cfs=&relate=fans&t=5&f=1&type=&Pl_Official_RelationFans__68_page=" + strconv.Itoa(i) + "#Pl_Official_RelationFans__68", 71 | Rule: "好友列表", 72 | Header: http.Header{"Cookie": []string{cookie}}, 73 | DownloaderID: 0, 74 | }) 75 | } 76 | }, 77 | 78 | Trunk: map[string]*Rule{ 79 | "好友列表": { 80 | ParseFunc: func(ctx *Context) { 81 | query := ctx.GetDom() 82 | fmt.Println(query.Find(".follow_list").Text()) 83 | query.Find(".follow_list .mod_info").Each(func(i int, s *goquery.Selection) { 84 | fmt.Println("222") 85 | name, _ := s.Find(".info_name a").Attr("title") 86 | fmt.Println(name) 87 | url, _ := s.Find(".info_name a").Attr("href") 88 | uid := strings.Replace(url, "/u", "", -1) 89 | uid = strings.Replace(uid, "/", "", -1) 90 | url = "http://weibo.com/p/100505" + uid + "/info?mod=pedit_more" 91 | var 认证 string = "" 92 | if _, isExist := s.Find(".info_name i").Attr("title"); isExist { 93 | 认证 = "认证" 94 | } 95 | 关注 := s.Find(".info_connect em a").Eq(0).Text() 96 | 粉丝 := s.Find(".info_connect em a").Eq(1).Text() 97 | 微博 := s.Find(".info_connect em a").Eq(2).Text() 98 | fmt.Println(关注, 粉丝, 微博) 99 | x := &request.Request{ 100 | Url: url, 101 | Rule: "好友资料", 102 | DownloaderID: 0, 103 | Temp: map[string]interface{}{ 104 | "好友名": name, 105 | "好友ID": uid, 106 | "认证": 认证, 107 | "关注": 关注, 108 | "粉丝": 粉丝, 109 | "微博": 微博, 110 | }, 111 | } 112 | ctx.AddQueue(x) 113 | }) 114 | }, 115 | }, 116 | "好友资料": { 117 | ItemFields: []string{ 118 | "好友名", 119 | "好友ID", 120 | "认证", 121 | "关注", 122 | "粉丝", 123 | "微博", 124 | }, 125 | ParseFunc: func(ctx *Context) { 126 | query := ctx.GetDom() 127 | var 属性 map[string]string 128 | var title string 129 | var detail string 130 | query.Find(".li_1").Each(func(i int, s *goquery.Selection) { 131 | if 属性 == nil { 132 | 属性 = map[string]string{} 133 | } 134 | title = s.Find(".pt_title").Text() 135 | 
title = Deprive2(title) 136 | detail = s.Find(".pt_detail").Text() 137 | detail = Deprive2(detail) 138 | 属性[title] = detail 139 | }) 140 | 结果 := map[int]interface{}{ 141 | 0: ctx.GetTemp("好友名", ""), 142 | 1: ctx.GetTemp("好友ID", ""), 143 | 2: ctx.GetTemp("认证", ""), 144 | 3: ctx.GetTemp("关注", ""), 145 | 4: ctx.GetTemp("粉丝", ""), 146 | 5: ctx.GetTemp("微博", ""), 147 | } 148 | for k, v := range 属性 { 149 | idx := ctx.UpsertItemField(k) 150 | 结果[idx] = v 151 | } 152 | 153 | // 结果输出 154 | ctx.Output(结果) 155 | }, 156 | }, 157 | }, 158 | }, 159 | } 160 | -------------------------------------------------------------------------------- /wukongwenda/README.md: -------------------------------------------------------------------------------- 1 | ## 悟空问答每个专栏 2 | 3 | > 抓取悟空问答每个专栏的内容,只要不停止,就会不停的抓取 4 | -------------------------------------------------------------------------------- /wukongwenda/wukongwenda.go: -------------------------------------------------------------------------------- 1 | package wukongwenda 2 | 3 | import ( 4 | // 基础包 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | //"github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | // "regexp" 21 | "strconv" 22 | "strings" 23 | 24 | // 其他包 25 | // "math" 26 | "time" 27 | "github.com/tidwall/gjson" //引用的json处理的包 28 | 29 | ) 30 | 31 | func init() { 32 | WukongWenda.Register() 33 | } 34 | 35 | var domains = []string{ 36 | "6300775428692904450",//热门 37 | "6215497896830175745",//娱乐 38 | "6215497726554016258",//体育 39 | "6215497898671475202",//汽车 40 | "6215497899594222081",//科技 41 | "6215497900164647426",//育儿 42 | "6215497899774577154",//美食 43 | "6215497897518041601",//数码 44 | "6215497898084272641",//时尚 45 | "6215847700051528193",//宠物 46 | "6215847700907166210",//收藏 47 | "6215497901804620289",//家居 48 | "6281512530493835777",//心理 49 | "6215497897710979586",//更多 文化 50 | "6215847700454181377",//更多 三农 51 | "6215497895248923137",//更多 健康 52 | "6215848044378720770",//更多 科学 53 | "6215497899027991042",//更多 游戏 54 | "6215497895852902913",//更多 动漫 55 | "6215497897312520705",//更多 教育 56 | "6215497899963320834",//更多 职场 57 | "6215497897899723265",//更多 旅游 58 | "6215497900554717698",//更多 电影 59 | } 60 | 61 | const ( 62 | WUKONG_NORMAL_URL = "https://www.wukong.com/wenda/web/nativefeed/brow/?concern_id=" //不同栏目访问地址 63 | UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36" 64 | ) 65 | 66 | 67 | var WukongWenda = &Spider{ 68 | Name: "悟空问答", 69 | Description: "悟空问答 各个频道专栏问题", 70 | // Pausetime: 300, 71 | // Keyin: KEYIN, 72 | // Limit: LIMIT, 73 | EnableCookie: false, 74 | RuleTree: &RuleTree{ 75 | Root: func(ctx *Context) { 76 | //处理解析结构相同的领域 77 | for _, domain := range domains{ 78 | url := WUKONG_NORMAL_URL + domain + "&t=" + 79 | strconv.FormatInt(time.Now().UnixNano()/1e6, 10) 80 | header := http.Header{} 81 | header.Add("User-Agent", UA) 82 | 83 | ctx.AddQueue(&request.Request{ 84 | Url: url, 85 | Header: header, 86 | Rule: "获取结果", 87 | }) 88 | 89 | } 90 | }, 91 | 92 | 93 | Trunk: map[string]*Rule{ 94 | "获取结果": { 95 | //注意:有无字段语义和是否输出数据必须保持一致 96 | ItemFields: []string{ 97 | "问题标题", 98 | "问题描述", 99 | "问题回答", 100 | "问题url地址", 101 | }, 102 | ParseFunc: 
103 |
104 | type question struct {
105 | title string
106 | content string
107 | answer string
108 | url string
109 | offset string
110 | }
111 |
112 | var questionlist []question
113 | data := gjson.Get(ctx.GetText(), "data")
114 | more := gjson.Get(ctx.GetText(), "has_more").String()
115 |
116 | data.ForEach(func(key, value gjson.Result) bool {
117 | questionlist = append(questionlist,
118 | question{
119 | title: gjson.Get(value.String(), "question.title").String(),
120 | content: gjson.Get(value.String(), "question.content.text").String(),
121 | answer: gjson.Get(value.String(), "answer.content").String(),
122 | url: "https://www.wukong.com/question/" + gjson.Get(value.String(), "question.qid").String() + "/",
123 | offset: gjson.Get(value.String(), "behot_time").String(),
124 | })
125 | return true
126 | })
127 |
128 | if more == "true" && len(questionlist) > 0 { //防止列表为空时取末尾元素越界
129 | newOffset := questionlist[len(questionlist)-1].offset
130 | header := http.Header{}
131 | header.Add("User-Agent", UA)
132 |
133 | visit_url := ctx.GetUrl()
134 | if strings.Contains(visit_url, "&max_behot_time=") {
135 | visit_url = strings.Split(visit_url, "&max_behot_time=")[0]
136 | }
137 |
138 | ctx.AddQueue(&request.Request{
139 | Url: visit_url + "&max_behot_time=" + newOffset,
140 | Header: header,
141 | Rule: "获取结果",
142 | })
143 |
144 | }
145 |
146 | for _, v := range questionlist {
147 | ctx.Output(map[int]interface{}{
148 | 0: v.title,
149 | 1: v.content,
150 | 2: v.answer,
151 | 3: v.url,
152 | })
153 | }
154 |
155 | },
156 | },
157 | },
158 | },
159 | }
160 |
161 |
--------------------------------------------------------------------------------
/zhihu_bianji/README.md:
--------------------------------------------------------------------------------
1 | ## 知乎编辑推荐
2 |
3 | > 目前抓取推荐专栏的问题和回答。
4 | > 能够翻页抓取,
5 | > 抓取的内容中的段落标签(`<p>`)、图片标签(`<img>`)等均原封不动地抓取过来,没做转义替换处理
6 | > 编辑中有两类文本,一类是知乎作家写的文章,一类是知乎用户回答的问题。这两类均抓取了
7 | > 支持设定最少采集 url 数:手动输入的"采集上限"即作为最少采集数
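
The two kinds of text mentioned above are told apart purely by URL shape: question/answer links are routed to the 解析知乎问答落地页 rule, everything else to 解析知乎文章落地页. A minimal standalone sketch of that check; the regex mirrors `filterZhihuAnswerURL` in `zhihu_bianji.go` below, while the sample URLs are hypothetical:

```go
package main

import (
	"fmt"
	"regexp"
)

// Answer pages look like https://www.zhihu.com/question/<qid>[/answer/<aid>];
// links that don't match are treated as column articles.
var answerURL = regexp.MustCompile(`^https://www\.zhihu\.com/question/\d+(/answer/\d+)?$`)

func main() {
	for _, u := range []string{
		"https://www.zhihu.com/question/19550225",                 // question page  -> Q&A rule
		"https://www.zhihu.com/question/19550225/answer/12345678", // answer page    -> Q&A rule
		"https://zhuanlan.zhihu.com/p/12345678",                   // column article -> article rule
	} {
		fmt.Println(answerURL.MatchString(u), u)
	}
}
```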
--------------------------------------------------------------------------------
/zhihu_bianji/zhihu_bianji.go:
--------------------------------------------------------------------------------
1 | package zhihu_bianji
2 |
3 | // 基础包
4 | import (
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
8 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
9 | //"github.com/henrylee2cn/pholcus/logs" //信息输出
10 |
11 | // net包
12 | "net/http" //设置http.Header
13 | "net/url"
14 |
15 | // 编码包
16 | // "encoding/xml"
17 | "encoding/json"
18 |
19 | // 字符串处理包
20 | //"strconv"
21 |
22 | // 其他包
23 | // "fmt"
24 | // "time"
25 | //"strconv"
26 | "io/ioutil"
27 | "strings"
28 | "strconv"
29 | "regexp"
30 | "math"
31 | )
32 |
33 | func init() {
34 | ZhihuBianji.Register()
35 | }
36 |
37 | // var urlList []string //未使用
38 |
39 | var ZhihuBianji = &Spider{
40 | Name: "知乎编辑推荐",
41 | Description: "知乎编辑推荐",
42 | Pausetime: 300,
43 | //Keyin: KEYIN,
44 | Limit: LIMIT,
45 | EnableCookie: false,
46 | RuleTree: &RuleTree{
47 | Root: func(ctx *Context) {
48 | ctx.AddQueue(&request.Request{
49 | Url: "https://www.zhihu.com/explore/recommendations",
50 | Rule: "知乎编辑推荐",
51 | })
52 |
53 |
54 | },
55 |
56 | Trunk: map[string]*Rule{
57 | "知乎编辑推荐": {
58 | ParseFunc: func(ctx *Context) {
59 | query := ctx.GetDom()
60 | regular := "#zh-recommend-list-full .zh-general-list .zm-item h2 a"
61 | query.Find(regular).
62 | Each(func(i int, s *goquery.Selection) {
63 | if url, ok := s.Attr("href"); ok {
64 | url = changeToAbspath(url)
65 | ctx.AddQueue(&request.Request{Url: url, Rule: "解析落地页"})
66 | }})
67 |
68 | limit := ctx.GetLimit()
69 |
70 | if len(query.Find(regular).Nodes) < limit {
71 | total := int(math.Ceil(float64(limit) / float64(20)))
72 | ctx.Aid(map[string]interface{}{
73 | "loop": [2]int{1, total},
74 | "Rule": "知乎编辑推荐翻页",
75 | }, "知乎编辑推荐翻页")
76 | }
77 | },
78 | },
79 |
80 | "知乎编辑推荐翻页": {
81 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
82 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
83 | offset := loop[0] * 20
84 | header := make(http.Header)
85 | header.Set("Content-Type", "application/x-www-form-urlencoded")
86 | ctx.AddQueue(&request.Request{
87 | Url: "https://www.zhihu.com/node/ExploreRecommendListV2",
88 | Rule: aid["Rule"].(string),
89 | Method: "POST",
90 | Header: header,
91 | PostData: url.Values{"method":{"next"}, "params":{`{"limit":20,"offset":` + strconv.Itoa(offset) + `}`}}.Encode(),
92 | Reloadable: true,
93 | })
94 | }
95 |
96 | return nil
97 | },
98 | ParseFunc: func(ctx *Context) {
99 | type Items struct {
100 | R int `json:"r"`
101 | Msg []interface{} `json:"msg"`
102 | }
103 |
104 | content, err := ioutil.ReadAll(ctx.GetResponse().Body)
105 |
106 | ctx.GetResponse().Body.Close()
107 |
108 | if err != nil {
109 | ctx.Log().Error(err.Error()); return // 读取失败则提前返回
110 | }
111 |
112 | e := new(Items)
113 |
114 | if err = json.Unmarshal(content, e); err != nil { ctx.Log().Error(err.Error()); return } // 反序列化失败则提前返回
115 |
116 | html := ""
117 |
118 | for _, v := range e.Msg {
119 | msg, ok := v.(string)
120 | if ok {
121 | html = html + "\n" + msg
122 | }
123 | }
124 |
125 |
126 | ctx = ctx.ResetText(html)
127 |
128 | query := ctx.GetDom()
129 |
130 | query.Find(".zm-item h2 a").Each(func(i int, selection *goquery.Selection){
131 | if url, ok := selection.Attr("href"); ok {
132 | url = changeToAbspath(url)
133 | if filterZhihuAnswerURL(url){
134 | ctx.AddQueue(&request.Request{Url: url, Rule: "解析知乎问答落地页"})
135 | }else{
136 | ctx.AddQueue(&request.Request{Url: url, Rule: "解析知乎文章落地页"})
137 | }
138 | }
139 | })
140 |
141 | },
142 | },
143 |
144 | "解析知乎问答落地页": {
145 | ItemFields: []string{
146 | "标题",
147 | "提问内容",
148 | "回答内容",
149 | },
150 | ParseFunc: func(ctx *Context) {
151 | query := ctx.GetDom()
152 |
153 | questionHeader := query.Find(".QuestionPage .QuestionHeader .QuestionHeader-content")
154 | //headerSide := questionHeader.Find(".QuestionHeader-side")
155 | headerMain := questionHeader.Find(".QuestionHeader-main")
156 |
157 | // 获取问题标题
158 | title := headerMain.Find(".QuestionHeader-title").Text()
159 |
160 | // 获取问题描述
161 | content := headerMain.Find(".QuestionHeader-detail span").Text()
162 |
163 | answerMain := query.Find(".QuestionPage .Question-main")
164 |
165 | answer, _ := answerMain.Find(".AnswerCard .QuestionAnswer-content .ContentItem .RichContent .RichContent-inner").First().Html()
166 |
167 | // 结果存入Response中转
168 | ctx.Output(map[int]interface{}{
169 | 0: title,
170 | 1: content,
171 | 2: answer,
172 | })
173 |
174 | },
175 | },
176 |
177 | "解析知乎文章落地页": {
178 | ItemFields: []string{
179 | "标题",
180 | "内容",
181 | },
182 | ParseFunc: func(ctx *Context) {
183 | query := ctx.GetDom()
184 |
185 | // 获取问题标题
186 | title, _ := query.Find(".PostIndex-title.av-paddingSide.av-titleFont").Html()
187 |
188 | // 获取问题描述
189 | content, _ := query.Find(".RichText.PostIndex-content.av-paddingSide.av-card").Html()
190 |
191 | // 结果存入Response中转
192 | ctx.Output(map[int]interface{}{
193 | 0: title,
194 | 1: content,
195 | })
196 |
197 | },
198 | },
199 | },
200 | },
201 | }
202 |
203 | //将相对路径替换为绝对路径
204 | func changeToAbspath(url string) string {
205 | if strings.HasPrefix(url, "https://") {
206 | return url
207 | }
208 | return "https://www.zhihu.com" + url
209 | }
210 |
211 | //判断是用户回答的问题,还是知乎专栏作家书写的文章
212 | func filterZhihuAnswerURL(url string) bool {
213 | return regexp.MustCompile(`^https:\/\/www\.zhihu\.com\/question\/\d{1,}(\/answer\/\d{1,})?$`).MatchString(url)
214 | }
--------------------------------------------------------------------------------
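The paging POST that `知乎编辑推荐翻页` assembles above is easy to misread in the dump, so here is the same request body in isolation. A minimal sketch; the endpoint and parameter names are taken from the spider above, not re-verified against the live site:

```go
package main

import (
	"fmt"
	"net/url"
	"strconv"
)

// buildPageBody reproduces the form body POSTed to
// /node/ExploreRecommendListV2: method=next plus a JSON "params"
// string carrying the offset, 20 recommendations per page.
func buildPageBody(page int) string {
	return url.Values{
		"method": {"next"},
		"params": {`{"limit":20,"offset":` + strconv.Itoa(page*20) + `}`},
	}.Encode()
}

func main() {
	// page 1 -> offset 20, matching loop[0] = 1 in the AidFunc above
	fmt.Println(buildPageBody(1))
}
```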
/zhihu_daily/README.md:
--------------------------------------------------------------------------------
1 | ## 知乎每日推荐
2 |
3 | > 目前抓取知乎每日推荐的问题和回答。
4 | > 能够翻页抓取,
5 | > 抓取的内容中的段落标签(`<p>`)、图片标签(`<img>`)等均原封不动地抓取过来,没做转义替换处理
6 | > 支持设定最少采集 url 数:手动输入的"采集上限"即作为最少采集数
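
For clarity, the paging arithmetic used below: the spider treats the explore landing page as covering the first 15 entries (per its `limit > 15` guard), and past that queries `ExploreAnswerListV2` in steps of 5. A standalone sketch that mirrors the Root function in `zhihu_daily.go`:

```go
package main

import (
	"fmt"
	"math"
)

// pageOffsets mirrors zhihu_daily.go's Root: for Limit > 15 it schedules
// extra requests with offset = 5, 10, ..., stopping before ceil(limit/5)*5.
func pageOffsets(limit int) []int {
	if limit <= 15 {
		return nil // the landing page alone satisfies the limit
	}
	total := int(math.Ceil(float64(limit) / float64(5)))
	var offsets []int
	for i := 1; i < total; i++ {
		offsets = append(offsets, i*5)
	}
	return offsets
}

func main() {
	fmt.Println(pageOffsets(30)) // [5 10 15 20 25]
}
```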
--------------------------------------------------------------------------------
/zhihu_daily/zhihu_daily.go:
--------------------------------------------------------------------------------
1 | package zhihu_daily
2 |
3 | import (
4 | // 基础包
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出
8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
10 |
11 | // net包
12 | // "net/http" //设置http.Header
13 | // "net/url"
14 |
15 | // 编码包
16 | // "encoding/xml"
17 | // "encoding/json"
18 |
19 | // 字符串处理包
20 | // "regexp"
21 | "strings"
22 | // 其他包
23 | // "fmt"
24 | "math"
25 | "strconv"
26 | )
27 |
28 | func init() {
29 | ZhihuDaily.Register()
30 | }
31 |
32 | var ZhihuDaily = &Spider{
33 | Name: "知乎每日推荐",
34 | Description: "知乎每日推荐",
35 | Pausetime: 300,
36 | // Keyin: KEYIN,
37 | Limit: LIMIT,
38 | EnableCookie: false,
39 | RuleTree: &RuleTree{
40 | Root: func(ctx *Context) {
41 | ctx.AddQueue(&request.Request{
42 | Url:"https://www.zhihu_bianji.com/explore#daily-hot",
43 | Rule: "获取首页结果",
44 | Temp: map[string]interface{}{
45 | "target":"first",
46 | },
47 | })
48 |
49 | limit := ctx.GetLimit()
50 | if limit > 15{
51 | totalTimes := int(math.Ceil(float64(limit) / float64(5)))
52 | for i := 1; i < totalTimes; i++{
53 | offset := strconv.Itoa(i*5)
54 | ctx.AddQueue(&request.Request{
55 | Url: `https://www.zhihu.com/node/ExploreAnswerListV2?params={"offset":` + offset + `,"type":"day"}`,
56 | Rule: "获取首页结果",
57 | Temp: map[string]interface{}{
58 | "target": "next_page",
59 | },
60 | })
61 | }
62 | }
63 | },
64 |
65 | Trunk: map[string]*Rule{
66 | "获取首页结果": {
67 | ParseFunc: func(ctx *Context) {
68 | query := ctx.GetDom()
69 | target := ctx.GetTemps()["target"].(string)
70 | regular := "[data-type='daily'] .explore-feed.feed-item h2 a"
71 | if target == "next_page" {
72 | regular = ".explore-feed.feed-item h2 a"
73 | }
74 |
75 | query.Find(regular).
76 | Each(func(i int, selection *goquery.Selection) {
77 | url, isExist := selection.Attr("href")
78 | if isExist {
79 | url = changeToAbspath(url)
80 | ctx.AddQueue(&request.Request{Url: url, Rule: "解析落地页"})
81 | }
82 | })
83 | },
84 | },
85 |
86 | "解析落地页": {
87 | ItemFields: []string{
88 | "标题",
89 | "提问内容",
90 | "回答内容",
91 | },
92 | ParseFunc: func(ctx *Context) {
93 | query := ctx.GetDom()
94 |
95 | questionHeader := query.Find(".QuestionPage .QuestionHeader .QuestionHeader-content")
96 | //headerSide := questionHeader.Find(".QuestionHeader-side")
97 | headerMain := questionHeader.Find(".QuestionHeader-main")
98 |
99 | // 获取问题标题
100 | title := headerMain.Find(".QuestionHeader-title").Text()
101 |
102 | // 获取问题描述
103 | content := headerMain.Find(".QuestionHeader-detail span").Text()
104 |
105 | answerMain := query.Find(".QuestionPage .Question-main")
106 |
107 | answer, _ := answerMain.Find(".AnswerCard .QuestionAnswer-content .ContentItem .RichContent .RichContent-inner").First().Html()
108 |
109 | // 结果存入Response中转
110 | ctx.Output(map[int]interface{}{
111 | 0: title,
112 | 1: content,
113 | 2: answer,
114 | })
115 |
116 | },
117 | },
118 | },
119 | },
120 | }
121 |
122 | //将相对路径替换为绝对路径
123 | func changeToAbspath(url string) string {
124 | if strings.HasPrefix(url, "https://") {
125 | return url
126 | }
127 | return "https://www.zhihu.com" + url
128 | }
129 |
130 |
--------------------------------------------------------------------------------
/zolpc/zolpc.go:
--------------------------------------------------------------------------------
1 | package pholcus_lib
2 |
3 | import (
4 | // 基础包
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出
8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
10 |
11 | // net包
12 | // "net/http" //设置http.Header
13 | // "net/url"
14 |
15 | // 编码包
16 | // "encoding/xml"
17 | // "encoding/json"
18 |
19 | // 字符串处理包
20 | // "regexp"
21 | "strconv"
22 | // "strings"
23 | // 其他包
24 | // "fmt"
25 | // "math"
26 | // "time"
27 | )
28 |
29 | func init() {
30 | Zolpc.Register()
31 | }
32 |
33 | var Zolpc = &Spider{
34 | Name: "中关村笔记本",
35 | Description: "中关村笔记本数据 [Auto Page] [bbs.zol.com.cn/sjbbs/d544_p]",
36 | // Pausetime: 300,
37 | // Keyin: KEYIN,
38 | // Limit: LIMIT,
39 | EnableCookie: false,
40 | RuleTree: &RuleTree{
41 | Root: func(ctx *Context) {
42 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, 720}, "Rule": "生成请求"}, "生成请求")
43 | },
44 |
45 | Trunk: map[string]*Rule{
46 |
47 | "生成请求": {
48 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
49 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
50 | ctx.AddQueue(&request.Request{
51 | Url: "http://bbs.zol.com.cn/nbbbs/p" + strconv.Itoa(loop[0]) + ".html#c",
52 | Rule: aid["Rule"].(string),
53 | })
54 | }
55 | return nil
56 | },
57 | ParseFunc: func(ctx *Context) {
58 | query := ctx.GetDom()
59 | ss := query.Find("tbody").Find("tr[id]")
60 | ss.Each(func(i int, goq *goquery.Selection) {
61 | ctx.SetTemp("html", goq)
62 | ctx.Parse("获取结果")
63 | })
64 | },
65 | },
66 |
67 | "获取结果": {
68 | //注意:有无字段语义和是否输出数据必须保持一致
69 | ItemFields: []string{
70 | "机型",
71 | "链接",
72 | "主题",
73 | "发表者",
74 | "发表时间",
75 | "总回复",
76 | "总查看",
77 | "最后回复者",
78 | "最后回复时间",
79 | },
80 | ParseFunc: func(ctx *Context) {
81 | var selectObj = ctx.GetTemp("html", &goquery.Selection{}).(*goquery.Selection)
82 |
83 | //url
84 | outUrls := selectObj.Find("td").Eq(1)
85 | outUrl, _ := outUrls.Attr("data-url")
86 | outUrl = "http://bbs.zol.com.cn/" + outUrl
87 | //title type
88 | outTitles := selectObj.Find("td").Eq(1)
89 | outType := outTitles.Find(".iclass a").Text()
90 | outTitle := outTitles.Find("div a").Text()
91 |
92 | //author stime
93 | authors := selectObj.Find("td").Eq(2)
94 | author := authors.Find("a").Text()
95 | stime := authors.Find("span").Text()
96 |
97 | //reply read
98 | replys := selectObj.Find("td").Eq(3)
99 | reply := replys.Find("span").Text()
100 | read := replys.Find("i").Text()
101 |
102 | //ereply etime
103 | etimes := selectObj.Find("td").Eq(4)
104 | ereply := etimes.Find("a").Eq(0).Text()
105 | etime := etimes.Find("a").Eq(1).Text()
106 |
107 | // 结果存入Response中转
108 | ctx.Output(map[int]interface{}{
109 | 0: outType,
110 | 1: outUrl,
111 | 2: outTitle,
112 | 3: author,
113 | 4: stime,
114 | 5: reply,
115 | 6: read,
116 | 7: ereply,
117 | 8: etime,
118 | })
119 | },
120 | },
121 | },
122 | },
123 | }
124 |
--------------------------------------------------------------------------------
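zolpc above and zolphone/zolslab below differ only in board URL and page count; the technique they share is worth calling out: the listing rule stashes each `<tr id=…>` row in the request-scoped Temp map (`ctx.SetTemp("html", goq)`) and synchronously re-parses it under 获取结果 (`ctx.Parse("获取结果")`), so every row is handled by a rule with its own field definitions without any extra download. A standalone sketch of the same row splitting, using the upstream goquery package (which the vendored `common/goquery` mirrors); the sample HTML is made up:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	// Hypothetical two-row listing in the shape the zol boards use.
	const listing = `<table><tbody>
	<tr id="t1"><td></td><td data-url="x_1.html"><div><a>主题一</a></div></td></tr>
	<tr id="t2"><td></td><td data-url="x_2.html"><div><a>主题二</a></div></td></tr>
	</tbody></table>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(listing))
	if err != nil {
		panic(err)
	}
	// The spiders hand each row fragment to "获取结果" via SetTemp/Parse;
	// here we read the same cells directly from the fragment.
	doc.Find("tbody").Find("tr[id]").Each(func(i int, row *goquery.Selection) {
		u, _ := row.Find("td").Eq(1).Attr("data-url")
		title := row.Find("td").Eq(1).Find("div a").Text()
		fmt.Println("http://bbs.zol.com.cn/"+u, title)
	})
}
```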
/zolphone/zolphone.go:
--------------------------------------------------------------------------------
1 | package pholcus_lib
2 |
3 | import (
4 | // 基础包
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出
8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
10 |
11 | // net包
12 | // "net/http" //设置http.Header
13 | // "net/url"
14 |
15 | // 编码包
16 | // "encoding/xml"
17 | // "encoding/json"
18 |
19 | // 字符串处理包
20 | // "regexp"
21 | "strconv"
22 | // "strings"
23 | // 其他包
24 | // "fmt"
25 | // "math"
26 | // "time"
27 | )
28 |
29 | func init() {
30 | Zolphone.Register()
31 | }
32 |
33 | var Zolphone = &Spider{
34 | Name: "中关村手机",
35 | Description: "中关村苹果手机数据 [Auto Page] [bbs.zol.com.cn/sjbbs/d544_p]",
36 | // Pausetime: 300,
37 | // Keyin: KEYIN,
38 | // Limit: LIMIT,
39 | EnableCookie: false,
40 | RuleTree: &RuleTree{
41 | Root: func(ctx *Context) {
42 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, 950}, "Rule": "生成请求"}, "生成请求")
43 | },
44 |
45 | Trunk: map[string]*Rule{
46 |
47 | "生成请求": {
48 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
49 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
50 | ctx.AddQueue(&request.Request{
51 | Url: "http://bbs.zol.com.cn/sjbbs/d544_p" + strconv.Itoa(loop[0]) + ".html#c",
52 | Rule: aid["Rule"].(string),
53 | })
54 | }
55 | return nil
56 | },
57 | ParseFunc: func(ctx *Context) {
58 | query := ctx.GetDom()
59 | ss := query.Find("tbody").Find("tr[id]")
60 | ss.Each(func(i int, goq *goquery.Selection) {
61 | ctx.SetTemp("html", goq)
62 | ctx.Parse("获取结果")
63 |
64 | })
65 | },
66 | },
67 |
68 | "获取结果": {
69 | //注意:有无字段语义和是否输出数据必须保持一致
70 | ItemFields: []string{
71 | "机型",
72 | "链接",
73 | "主题",
74 | "发表者",
75 | "发表时间",
76 | "总回复",
77 | "总查看",
78 | "最后回复者",
79 | "最后回复时间",
80 | },
81 | ParseFunc: func(ctx *Context) {
82 | var selectObj = ctx.GetTemp("html", &goquery.Selection{}).(*goquery.Selection)
83 | //url
84 | outUrls := selectObj.Find("td").Eq(1)
85 | outUrl, _ := outUrls.Attr("data-url")
86 | outUrl = "http://bbs.zol.com.cn/" + outUrl
87 |
88 | //title type
89 | outTitles := selectObj.Find("td").Eq(1)
90 | outType := outTitles.Find(".iclass a").Text()
91 | outTitle := outTitles.Find("div a").Text()
92 |
93 | //author stime
94 | authors := selectObj.Find("td").Eq(2)
95 | author := authors.Find("a").Text()
96 | stime := authors.Find("span").Text()
97 |
98 | //reply read
99 | replys := selectObj.Find("td").Eq(3)
100 | reply := replys.Find("span").Text()
101 | read := replys.Find("i").Text()
102 |
103 | //ereply etime
104 | etimes := selectObj.Find("td").Eq(4)
105 | ereply := etimes.Find("a").Eq(0).Text()
106 | etime := etimes.Find("a").Eq(1).Text()
107 |
108 | // 结果存入Response中转
109 | ctx.Output(map[int]interface{}{
110 | 0: outType,
111 | 1: outUrl,
112 | 2: outTitle,
113 | 3: author,
114 | 4: stime,
115 | 5: reply,
116 | 6: read,
117 | 7: ereply,
118 | 8: etime,
119 | })
120 | },
121 | },
122 | },
123 | },
124 | }
125 |
--------------------------------------------------------------------------------
/zolslab/zolslab.go:
--------------------------------------------------------------------------------
1 | package pholcus_lib
2 |
3 | import (
4 | // 基础包
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出
8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
10 |
11 | // net包
12 | // "net/http" //设置http.Header
13 | // "net/url"
14 |
15 | // 编码包
16 | // "encoding/xml"
17 | // "encoding/json"
18 |
19 | // 字符串处理包
20 | // "regexp"
21 | "strconv"
22 | // "strings"
23 | // 其他包
24 | // "fmt"
25 | // "math"
26 | // "time"
27 | )
28 |
29 | func init() {
30 | Zolslab.Register()
31 | }
32 |
33 | var Zolslab = &Spider{
34 | Name: "中关村平板",
35 | Description: "中关村平板数据 [Auto Page] [bbs.zol.com.cn/sjbbs/d544_p]",
36 | // Pausetime: 300,
37 | // Keyin: KEYIN,
38 | // Limit: LIMIT,
39 | EnableCookie: false,
40 | RuleTree: &RuleTree{
41 | Root: func(ctx *Context) {
42 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, 640}, "Rule": "生成请求"}, "生成请求")
43 | },
44 |
45 | Trunk: map[string]*Rule{
46 |
47 | "生成请求": {
48 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
49 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
50 | ctx.AddQueue(&request.Request{
51 | Url: "http://bbs.zol.com.cn/padbbs/p" + strconv.Itoa(loop[0]) + ".html#c",
52 | Rule: aid["Rule"].(string),
53 | })
54 | }
55 | return nil
56 | },
57 | ParseFunc: func(ctx *Context) {
58 | query := ctx.GetDom()
59 | ss := query.Find("tbody").Find("tr[id]")
60 | ss.Each(func(i int, goq *goquery.Selection) {
61 | ctx.SetTemp("html", goq)
62 | ctx.Parse("获取结果")
63 |
64 | })
65 | },
66 | },
67 |
68 | "获取结果": {
69 | //注意:有无字段语义和是否输出数据必须保持一致
70 | ItemFields: []string{
71 | "机型",
72 | "链接",
73 | "主题",
74 | "发表者",
75 | "发表时间",
76 | "总回复",
77 | "总查看",
78 | "最后回复者",
79 | "最后回复时间",
80 | },
81 | ParseFunc: func(ctx *Context) {
82 | var selectObj = ctx.GetTemp("html", &goquery.Selection{}).(*goquery.Selection)
83 | //url
84 | outUrls := selectObj.Find("td").Eq(1)
85 | outUrl, _ := outUrls.Attr("data-url")
86 | outUrl = "http://bbs.zol.com.cn/" + outUrl
87 |
88 | //title type
89 | outTitles := selectObj.Find("td").Eq(1)
90 | outType := outTitles.Find(".iclass a").Text()
91 | outTitle := outTitles.Find("div a").Text()
92 |
93 | //author stime
94 | authors := selectObj.Find("td").Eq(2)
95 | author := authors.Find("a").Text()
96 | stime := authors.Find("span").Text()
97 |
98 | //reply read
99 | replys := selectObj.Find("td").Eq(3)
100 | reply := replys.Find("span").Text()
101 | read := replys.Find("i").Text()
102 |
103 | //ereply etime
104 | etimes := selectObj.Find("td").Eq(4)
105 | ereply := etimes.Find("a").Eq(0).Text()
106 | etime := etimes.Find("a").Eq(1).Text()
107 |
108 | // 结果存入Response中转
109 | ctx.Output(map[int]interface{}{
110 | 0: outType,
111 | 1: outUrl,
112 | 2: outTitle,
113 | 3: author,
114 | 4: stime,
115 | 5: reply,
116 | 6: read,
117 | 7: ereply,
118 | 8: etime,
119 | })
120 | },
121 | },
122 | },
123 | },
124 | }
125 |
--------------------------------------------------------------------------------
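All spiders in this collection follow one registration contract: a package-level `*Spider` literal whose `Register()` is called from `init()`, so importing the package is all it takes to expose the rule set to pholcus. The skeleton below is a placeholder distilled from the files above, not an additional spider in the repository:

```go
package pholcus_lib

import (
	. "github.com/henrylee2cn/pholcus/app/spider" //必需
)

func init() {
	Skeleton.Register() // registration happens on import, as in every spider above
}

// Skeleton shows the minimal shape shared by all rule sets here.
var Skeleton = &Spider{
	Name:         "示例骨架",
	Description:  "占位示例,非真实采集规则",
	EnableCookie: false,
	RuleTree: &RuleTree{
		// Root seeds the queue, e.g. ctx.AddQueue(&request.Request{Url: ..., Rule: ...})
		Root: func(ctx *Context) {},
		// Trunk maps rule names to ParseFunc/AidFunc/ItemFields, as above
		Trunk: map[string]*Rule{},
	},
}
```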