├── .gitignore
├── IJGUC
│   └── IJGUC.go
├── README.md
├── alibaba
│   └── alibaba.go
├── area_codes
│   └── area_codes.go
├── baidunews
│   └── baidunews.go
├── baidusearch
│   └── baidusearch.go
├── car_home
│   └── car_home.go
├── chinanews
│   ├── chinanews.go
│   └── readme.md
├── fang_resell_list
│   ├── fang_resell_list.go
│   └── readme.md
├── filetest
│   └── filetest.go
├── ganji_gongsi
│   └── ganji_gongsi.go
├── googlesearch
│   └── googlesearch.go
├── hollandandbarrett
│   └── hollandandbarrett.go
├── jdsearch
│   └── jdsearch.go
├── jiban
│   └── jiban.go
├── jingdong
│   ├── README.md
│   └── jdSpider.go
├── kaola
│   └── kaola.go
├── lewa
│   └── lewa.go
├── miyabaobei
│   └── miyabaobei.go
├── people
│   └── people.go
├── pholcus_lib.go
├── qq_avatar
│   ├── README.md
│   └── avatar.go
├── shunfenghaitao
│   └── shunfenghaitao.go
├── taobao
│   └── taobao.go
├── taobaosearch
│   └── taobaosearch.go
├── wangyi
│   └── wangyi.go
├── weibo_fans
│   └── weibo_fans.go
├── wukongwenda
│   ├── README.md
│   └── wukongwenda.go
├── zhihu_bianji
│   ├── README.md
│   └── zhihu_bianji.go
├── zhihu_daily
│   ├── README.md
│   └── zhihu_daily.go
├── zolpc
│   └── zolpc.go
├── zolphone
│   └── zolphone.go
└── zolslab
    └── zolslab.go

/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.a
3 | *.so
4 | _obj
5 | _test
6 | *.[568vq]
7 | [568vq].out
8 | *.cgo1.go
9 | *.cgo2.c
10 | _cgo_defun.c
11 | _cgo_gotypes.go
12 | _cgo_export.*
13 | _testmain.go
14 | *.exe
15 | *.exe~
16 | *.test
17 | *.prof
18 | *.rar
19 | *.zip
20 | *.gz
21 | *.psd
22 | *.bmd
23 | *.cfg
24 | *.pptx
25 | *.log
26 | *nohup.out
27 | *.sublime-project
28 | *.sublime-workspace
--------------------------------------------------------------------------------
/IJGUC/IJGUC.go:
--------------------------------------------------------------------------------
1 | package pholcus_lib
2 | 
3 | // 基础包
4 | import (
5 | // "log"
6 | 
7 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
9 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
10 | // "github.com/henrylee2cn/pholcus/logs" //信息输出
11 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 12 | 13 | // net包 14 | // "net/http" //设置http.Header 15 | // "net/url" 16 | 17 | // 编码包 18 | // "encoding/xml" 19 | // "encoding/json" 20 | 21 | // 字符串处理包 22 | "regexp" 23 | "strconv" 24 | // "strings" 25 | // 其他包 26 | // "fmt" 27 | // "math" 28 | // "time" 29 | ) 30 | 31 | func init() { 32 | IJGUC.Register() 33 | } 34 | 35 | var IJGUC = &Spider{ 36 | Name: "IJGUC期刊", 37 | Description: "IJGUC期刊", 38 | // Pausetime: 300, 39 | // Keyin: KEYIN, 40 | // Limit: LIMIT, 41 | EnableCookie: false, 42 | RuleTree: &RuleTree{ 43 | Root: func(ctx *Context) { 44 | ctx.AddQueue(&request.Request{ 45 | Url: "http://www.inderscience.com/info/inarticletoc.php?jcode=ijguc&year=2016&vol=7&issue=1", 46 | Rule: "期刊列表", 47 | }) 48 | }, 49 | 50 | Trunk: map[string]*Rule{ 51 | "期刊列表": { 52 | ParseFunc: func(ctx *Context) { 53 | query := ctx.GetDom() 54 | for i := 1; i <= 7; i++ { 55 | id := "#eventbody" + strconv.Itoa(i) + " a" 56 | query.Find(id).Each(func(j int, s *goquery.Selection) { 57 | if url, ok := s.Attr("href"); ok { 58 | // log.Print(url) 59 | ctx.AddQueue(&request.Request{Url: url, Rule: "文章列表"}) 60 | } 61 | }) 62 | } 63 | }, 64 | }, 65 | "文章列表": { 66 | ParseFunc: func(ctx *Context) { 67 | query := ctx.GetDom() 68 | //#journalcol1 article table tbody tr td:eq(1) table:eq(1) a 69 | query.Find("#journalcol1 article table tbody tr td").Each(func(i int, td *goquery.Selection) { 70 | if i == 1 { 71 | td.Find("table").Each(func(j int, table *goquery.Selection) { 72 | if j == 1 { 73 | table.Find("a").Each(func(k int, a *goquery.Selection) { 74 | if k%2 == 0 { 75 | if url, ok := a.Attr("href"); ok { 76 | // log.Print(url) 77 | ctx.AddQueue(&request.Request{Url: url, Rule: "文章页"}) 78 | } 79 | } 80 | }) 81 | } 82 | }) 83 | } 84 | }) 85 | }, 86 | }, 87 | "文章页": { 88 | //注意:有无字段语义和是否输出数据必须保持一致 89 | ItemFields: []string{ 90 | "Title", 91 | "Author", 92 | "Addresses", 93 | "Journal", 94 | "Abstract", 95 | "Keywords", 96 | "DOI", 97 | }, 98 | ParseFunc: func(ctx *Context) { 99 | query := ctx.GetDom() 100 | // 获取内容 101 | content := query.Find("#col1").Text() 102 | 103 | // 过滤标签 104 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 105 | content = re.ReplaceAllString(content, "") 106 | 107 | // Title 108 | re, _ = regexp.Compile("Title:(.*?)Author:") 109 | title := re.FindStringSubmatch(content)[1] 110 | // Author 111 | re, _ = regexp.Compile("Author:(.*?)Addresses:") 112 | au := re.FindStringSubmatch(content) 113 | var author string 114 | if len(au) > 0 { 115 | author = au[1] 116 | } else { 117 | re, _ = regexp.Compile("Author:(.*?)Address:") 118 | author = re.FindStringSubmatch(content)[1] 119 | } 120 | // Addresses & Address 121 | re, _ = regexp.Compile("Addresses:(.*?)Journal:") 122 | address := re.FindStringSubmatch(content) 123 | var addresses string 124 | if len(address) > 0 { 125 | addresses = address[1] 126 | } else { 127 | re, _ = regexp.Compile("Address:(.*?)Journal:") 128 | addresses = re.FindStringSubmatch(content)[1] 129 | } 130 | // Journal 131 | re, _ = regexp.Compile("Journal:(.*?)Abstract:") 132 | journal := re.FindStringSubmatch(content)[1] 133 | // Abstract 134 | re, _ = regexp.Compile("Abstract:(.*?)Keywords:") 135 | abstract := re.FindStringSubmatch(content)[1] 136 | // Keywords 137 | re, _ = regexp.Compile("Keywords:(.*?)DOI:") 138 | keywords := re.FindStringSubmatch(content)[1] 139 | // DOI 140 | re, _ = regexp.Compile("DOI: ") 141 | doiIndex := re.FindStringSubmatchIndex(content) 142 | rs := []rune(content) 143 | left := doiIndex[1] - 8 144 | 
right := left + 43
145 | doi := string(rs[left:right])
146 | 
147 | // 结果存入Response中转
148 | ctx.Output(map[int]interface{}{
149 | 0: title,
150 | 1: author,
151 | 2: addresses,
152 | 3: journal,
153 | 4: abstract,
154 | 5: keywords,
155 | 6: doi,
156 | })
157 | },
158 | },
159 | },
160 | },
161 | }
162 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pholcus_lib
2 | 
3 | [Pholcus](https://github.com/henrylee2cn/pholcus) 用户公共维护的spider爬虫规则库。
4 | 
5 | ## 维护规范
6 | 
7 | - 欢迎每位用户都来分享自己的爬虫规则
8 | - 每个规则放在一个单独的子目录中
9 | - 新增规则最好提供README.md
10 | - 新增规则时,须在根目录 `pholcus_lib.go` 文件的import组中添加类似`_ "github.com/henrylee2cn/pholcus_lib/jingdong"`的包引用声明
11 | - 新增规则时,须在根目录README.md(本文档)的 `爬虫规则列表` 中按子目录名`a-z`的顺序插入一条相应的规则记录
12 | - 维护旧规则时,应在规则文件或相应README.md中增加修改说明:如修改原因、修改时间、签名、联系方式等
13 | - 凡爬虫规则的贡献者均可在其源码文件或相应README.md中留下自己的签名、联系方式
14 | 
15 | 
16 | ## 爬虫规则列表
17 | 
18 | |子目录|规则描述|
19 | |---|---|
20 | |alibaba|阿里巴巴产品搜索|
21 | |area_codes|国家统计局区划代码|
22 | |baidunews|百度RSS新闻|
23 | |baidusearch|百度搜索|
24 | |car_home|汽车之家|
25 | |chinanews|中国新闻网-滚动新闻|
26 | |fang_resell_list|搜房网二手房列表|
27 | |filetest|文件下载测试|
28 | |ganji_gongsi|经典示例-赶集网企业名录|
29 | |googlesearch|谷歌搜索|
30 | |hollandandbarrett|Holland & Barrett商品数据|
31 | |IJGUC|IJGUC期刊|
32 | |jdsearch|京东搜索|
33 | |jiban|羁绊动漫|
34 | |jingdong|京东搜索(修复版)|
35 | |kaola|考拉海淘|
36 | |lewa|乐蛙登录测试|
37 | |miyabaobei|蜜芽宝贝|
38 | |people|人民网新闻抓取|
39 | |qq_avatar|我要个性网-头像昵称搜索下载|
40 | |shunfenghaitao|顺丰海淘|
41 | |taobao|淘宝数据|
42 | |taobaosearch|淘宝天猫搜索|
43 | |wangyi|网易新闻|
44 | |weibo_fans|微博粉丝列表|
45 | |wukongwenda|悟空问答|
46 | |zhihu_bianji|知乎编辑推荐|
47 | |zhihu_daily|知乎每日推荐|
48 | |zolpc|中关村笔记本|
49 | |zolphone|中关村手机|
50 | |zolslab|中关村平板|
51 | 
--------------------------------------------------------------------------------
/alibaba/alibaba.go:
--------------------------------------------------------------------------------
1 | package pholcus_lib
2 | 
3 | // 基础包
4 | import (
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
7 | . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 8 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 9 | "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | 11 | // net包 12 | "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | // "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | AlibabaProduct.Register() 31 | } 32 | 33 | var AlibabaProduct = &Spider{ 34 | Name: "阿里巴巴产品搜索", 35 | Description: "阿里巴巴产品搜索 [s.1688.com/selloffer/offer_search.htm]", 36 | // Pausetime: 300, 37 | Keyin: KEYIN, 38 | Limit: LIMIT, 39 | EnableCookie: false, 40 | RuleTree: &RuleTree{ 41 | Root: func(ctx *Context) { 42 | ctx.Aid(map[string]interface{}{"loop": [2]int{0, 1}, "Rule": "生成请求"}, "生成请求") 43 | }, 44 | 45 | Trunk: map[string]*Rule{ 46 | 47 | "生成请求": { 48 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 49 | keyin := EncodeString(ctx.GetKeyin(), "gbk") 50 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 51 | ctx.AddQueue(&request.Request{ 52 | Url: "http://s.1688.com/selloffer/offer_search.htm?enableAsync=false&earseDirect=false&button_click=top&pageSize=60&n=y&offset=3&uniqfield=pic_tag_id&keyins=" + keyin + "&beginPage=" + strconv.Itoa(loop[0]+1), 53 | Rule: aid["Rule"].(string), 54 | Header: http.Header{"Content-Type": []string{"text/html; charset=gbk"}}, 55 | }) 56 | } 57 | return nil 58 | }, 59 | ParseFunc: func(ctx *Context) { 60 | query := ctx.GetDom() 61 | // logs.Log.Debug(ctx.GetText()) 62 | pageTag := query.Find("#sm-pagination div[data-total-page]") 63 | // 跳转 64 | if len(pageTag.Nodes) == 0 { 65 | logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 由于跳转AJAX问题,目前只能每个子类抓取 1 页……\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 66 | query.Find(".sm-floorhead-typemore a").Each(func(i int, s *goquery.Selection) { 67 | if href, ok := s.Attr("href"); ok { 68 | ctx.AddQueue(&request.Request{ 69 | Url: href, 70 | Header: http.Header{"Content-Type": []string{"text/html; charset=gbk"}}, 71 | Rule: "搜索结果", 72 | }) 73 | } 74 | }) 75 | return 76 | } 77 | total1, _ := pageTag.First().Attr("data-total-page") 78 | total1 = strings.Trim(total1, " \t\n") 79 | total, _ := strconv.Atoi(total1) 80 | if total > ctx.GetLimit() { 81 | total = ctx.GetLimit() 82 | } else if total == 0 { 83 | logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 84 | return 85 | } 86 | 87 | // 调用指定规则下辅助函数 88 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, total}, "Rule": "搜索结果"}) 89 | // 用指定规则解析响应流 90 | ctx.Parse("搜索结果") 91 | }, 92 | }, 93 | 94 | "搜索结果": { 95 | //注意:有无字段语义和是否输出数据必须保持一致 96 | ItemFields: []string{ 97 | "公司", 98 | "标题", 99 | "价格", 100 | "销量", 101 | "星级", 102 | "地址", 103 | "链接", 104 | }, 105 | ParseFunc: func(ctx *Context) { 106 | query := ctx.GetDom() 107 | 108 | query.Find("#sm-offer-list > li").Each(func(i int, s *goquery.Selection) { 109 | 110 | // 获取公司 111 | company, _ := s.Find("a.sm-offer-companyName").First().Attr("title") 112 | 113 | // 获取标题 114 | t := s.Find(".sm-offer-title > a:nth-child(1)") 115 | title, _ := t.Attr("title") 116 | 117 | // 获取URL 118 | url, _ := t.Attr("href") 119 | 120 | // 获取价格 121 | price := s.Find(".sm-offer-priceNum").First().Text() 122 | 123 | // 获取成交量 124 | sales := s.Find("span.sm-offer-trade > em").First().Text() 125 | 126 | // 获取地址 127 | address, _ := s.Find(".sm-offer-location").First().Attr("title") 128 | 
129 | // 获取信用年限 130 | level := s.Find("span.sm-offer-companyTag > a.sw-ui-flaticon-cxt16x16").First().Text() 131 | 132 | // 结果存入Response中转 133 | ctx.Output(map[int]interface{}{ 134 | 0: company, 135 | 1: title, 136 | 2: price, 137 | 3: sales, 138 | 4: level, 139 | 5: address, 140 | 6: url, 141 | }) 142 | }) 143 | }, 144 | }, 145 | }, 146 | }, 147 | } 148 | -------------------------------------------------------------------------------- /area_codes/area_codes.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | 8 | //. "github.com/henrylee2cn/pholcus/app/spider/common" //选用 9 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 10 | 11 | //信息输出 12 | // net包 13 | //设置http.Header 14 | // "net/url" 15 | 16 | // 编码包 17 | // "encoding/xml" 18 | // "encoding/json" 19 | 20 | // 字符串处理包 21 | // "regexp" 22 | 23 | "strings" 24 | // 其他包 25 | // "fmt" 26 | // "math" 27 | // "time" 28 | ) 29 | 30 | func init() { 31 | AreaCodes2018.Register() 32 | } 33 | 34 | /* 35 | -- 数据清洗 36 | 37 | SET SQL_SAFE_UPDATES = 0; 38 | -- 去重 39 | delete from 2018年统计用区划代码和城乡划分代码__0__市 where id not in (select temp.id from (select min(id) as id from 2018年统计用区划代码和城乡划分代码__0__市 group by 代码) as temp); 40 | 41 | -- 合并表 42 | CREATE TABLE area_codes 43 | select 名称 as name,RPAD(代码,12,'0') as area_code,级别 as level,RPAD(上级,12,'0') as parent from 2018年统计用区划代码和城乡划分代码__0__省 44 | UNION 45 | select 名称 as name,RPAD(代码,12,'0') as area_code,级别 as level,RPAD(上级,12,'0') as parent from 2018年统计用区划代码和城乡划分代码__0__市; 46 | */ 47 | 48 | // AreaCodes2018 2018年统计用区划代码和城乡划分代码 49 | // 50 | // creatTime:2019年09月06日 09:23:55 51 | // author:hailaz 52 | var AreaCodes2018 = &Spider{ 53 | Name: "2018年统计用区划代码和城乡划分代码", 54 | Description: "2018年统计用区划代码和城乡划分代码。间隔不要小于100ms,不然容易触发验证码导致失败。总数据大概71万(暂停时长100ms,耗时2小时),所以适当做数据分批输出,不然出现内存溢出。", 55 | // Pausetime: 50, 56 | // Keyin: KEYIN, 57 | // Limit: LIMIT, 58 | EnableCookie: false, 59 | RuleTree: &RuleTree{ 60 | Root: func(ctx *Context) { 61 | ctx.AddQueue(&request.Request{ 62 | Url: "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html", 63 | Rule: "省", 64 | }) 65 | }, 66 | 67 | Trunk: map[string]*Rule{ 68 | "省": { 69 | ItemFields: []string{ 70 | "名称", 71 | "代码", 72 | "级别", 73 | "上级", 74 | }, 75 | ParseFunc: func(ctx *Context) { 76 | baseUrl := ctx.GetRequest().Url 77 | baseUrl = baseUrl[:strings.LastIndex(baseUrl, "/")+1] 78 | query := ctx.GetDom() 79 | //cc := 0 80 | query.Find("tr.provincetr").Each(func(i int, tr *goquery.Selection) { 81 | //cc++ 82 | tr.Find("td a").Each(func(j int, a *goquery.Selection) { 83 | if url, ok := a.Attr("href"); ok { 84 | code := strings.Split(url, ".")[0] 85 | url = baseUrl + url 86 | //fmt.Println("0", a.Text()+":"+url) 87 | ctx.Output(map[int]interface{}{ 88 | 0: a.Text(), 89 | 1: code, 90 | 2: 0, 91 | 3: 0, 92 | }) 93 | ctx.AddQueue(&request.Request{Url: url, Rule: "市", Temp: request.Temp{"level": 0, "parent": code}}) 94 | } 95 | }) 96 | }) 97 | //fmt.Println(cc) // 等于零,证明请求太过频繁,需要输入验证码 98 | }, 99 | }, 100 | "市": { 101 | ItemFields: []string{ 102 | "名称", 103 | "代码", 104 | "级别", 105 | "上级", 106 | }, 107 | ParseFunc: func(ctx *Context) { 108 | baseUrl := ctx.GetRequest().Url 109 | baseUrl = baseUrl[:strings.LastIndex(baseUrl, "/")+1] 110 | level := ctx.GetRequest().Temp["level"].(int) + 1 111 | parent := ctx.GetRequest().Temp["parent"].(string) 112 | query := 
ctx.GetDom() 113 | if level == 4 { 114 | myCode := "" 115 | query.Find("tr.villagetr td").Each(func(i int, td *goquery.Selection) { 116 | if i%3 == 0 { 117 | myCode = td.Text() 118 | } 119 | if i%3 == 2 { 120 | ctx.Output(map[int]interface{}{ 121 | 0: td.Text(), 122 | 1: myCode, 123 | 2: level, 124 | 3: parent, 125 | }) 126 | //fmt.Println(level, td.Text(), myCode) 127 | } 128 | }) 129 | } else { 130 | myCode := "" 131 | query.Find("tr td a").Each(func(i int, a *goquery.Selection) { 132 | if i%2 == 0 { 133 | myCode = a.Text() 134 | } 135 | if i%2 == 1 { 136 | if url, ok := a.Attr("href"); ok { 137 | code := strings.Split(strings.Split(url, "/")[1], ".")[0] 138 | url = baseUrl + url 139 | ctx.Output(map[int]interface{}{ 140 | 0: a.Text(), 141 | 1: myCode, 142 | 2: level, 143 | 3: parent, 144 | }) 145 | //fmt.Println(level, a.Text(), myCode) 146 | ctx.AddQueue(&request.Request{Url: url, Rule: "市", Temp: request.Temp{"level": level, "parent": code}}) 147 | } 148 | } 149 | }) 150 | } 151 | }, 152 | }, 153 | }, 154 | }, 155 | } 156 | -------------------------------------------------------------------------------- /baidunews/baidunews.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 8 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 9 | "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | 11 | // net包 12 | "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/json" 17 | "encoding/xml" 18 | 19 | // 字符串处理包 20 | // "regexp" 21 | // "strconv" 22 | // "strings" 23 | 24 | // 其他包 25 | // "fmt" 26 | // "math" 27 | "time" 28 | ) 29 | 30 | func init() { 31 | BaiduNews.Register() 32 | } 33 | 34 | var rss_BaiduNews = map[string]string{ 35 | "国内最新": "http://news.baidu.com/n?cmd=4&class=civilnews&tn=rss", 36 | "国际最新": "http://news.baidu.com/n?cmd=4&class=internews&tn=rss", 37 | "军事最新": "http://news.baidu.com/n?cmd=4&class=mil&tn=rss", 38 | "财经最新": "http://news.baidu.com/n?cmd=4&class=finannews&tn=rss", 39 | "互联网最新": "http://news.baidu.com/n?cmd=4&class=internet&tn=rss", 40 | "房产最新": "http://news.baidu.com/n?cmd=4&class=housenews&tn=rss", 41 | "汽车最新": "http://news.baidu.com/n?cmd=4&class=autonews&tn=rss", 42 | "体育最新": "http://news.baidu.com/n?cmd=4&class=sportnews&tn=rss", 43 | "娱乐最新": "http://news.baidu.com/n?cmd=4&class=enternews&tn=rss", 44 | "游戏最新": "http://news.baidu.com/n?cmd=4&class=gamenews&tn=rss", 45 | "教育最新": "http://news.baidu.com/n?cmd=4&class=edunews&tn=rss", 46 | "女人最新": "http://news.baidu.com/n?cmd=4&class=healthnews&tn=rss", 47 | "科技最新": "http://news.baidu.com/n?cmd=4&class=technnews&tn=rss", 48 | "社会最新": "http://news.baidu.com/n?cmd=4&class=socianews&tn=rss", 49 | } 50 | 51 | type ( 52 | BaiduNewsRss struct { 53 | Channel BaiduNewsData `xml:"channel"` 54 | } 55 | BaiduNewsData struct { 56 | Item []BaiduNewsItem `xml:"item"` 57 | } 58 | BaiduNewsItem struct { 59 | Title string `xml:"title"` 60 | Link string `xml:"link"` 61 | Description string `xml:"description"` 62 | PubDate string `xml:"pubDate"` 63 | Author string `xml:"author"` 64 | } 65 | ) 66 | 67 | var BaiduNews = &Spider{ 68 | Name: "百度RSS新闻", 69 | Description: "百度RSS新闻,实现轮询更新 [Auto Page] [news.baidu.com]", 70 | // Pausetime: 300, 71 | // Keyin: KEYIN, 72 | EnableCookie: false, 73 | // Limit: LIMIT, 74 | // 命名空间相对于数据库名,不依赖具体数据内容,可选 75 | 
Namespace: nil, 76 | // 子命名空间相对于表名,可依赖具体数据内容,可选 77 | SubNamespace: func(self *Spider, dataCell map[string]interface{}) string { 78 | return dataCell["Data"].(map[string]interface{})["分类"].(string) 79 | }, 80 | RuleTree: &RuleTree{ 81 | Root: func(ctx *Context) { 82 | for k := range rss_BaiduNews { 83 | ctx.SetTimer(k, time.Minute*5, nil) 84 | ctx.Aid(map[string]interface{}{"loop": k}, "LOOP") 85 | } 86 | }, 87 | 88 | Trunk: map[string]*Rule{ 89 | "LOOP": { 90 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 91 | k := aid["loop"].(string) 92 | v := rss_BaiduNews[k] 93 | 94 | ctx.AddQueue(&request.Request{ 95 | Url: v, 96 | Rule: "XML列表页", 97 | Header: http.Header{"Content-Type": []string{"application/xml"}}, 98 | Temp: map[string]interface{}{"src": k}, 99 | // DialTimeout: -1, 100 | // ConnTimeout: -1, 101 | // TryTimes: -1, 102 | Reloadable: true, 103 | }) 104 | return nil 105 | }, 106 | }, 107 | "XML列表页": { 108 | ParseFunc: func(ctx *Context) { 109 | var src = ctx.GetTemp("src", "").(string) 110 | defer func() { 111 | // 循环请求 112 | ctx.RunTimer(src) 113 | ctx.Aid(map[string]interface{}{"loop": src}, "LOOP") 114 | }() 115 | 116 | page := ctx.GetText() 117 | rss := new(BaiduNewsRss) 118 | if err := xml.Unmarshal([]byte(page), rss); err != nil { 119 | logs.Log.Error("XML列表页: %v", err) 120 | return 121 | } 122 | content := rss.Channel 123 | for _, v := range content.Item { 124 | ctx.AddQueue(&request.Request{ 125 | Url: v.Link, 126 | Rule: "新闻详情", 127 | Temp: map[string]interface{}{ 128 | "title": CleanHtml(v.Title, 4), 129 | "description": CleanHtml(v.Description, 4), 130 | "src": src, 131 | "releaseTime": CleanHtml(v.PubDate, 4), 132 | "author": CleanHtml(v.Author, 4), 133 | }, 134 | }) 135 | } 136 | }, 137 | }, 138 | 139 | "新闻详情": { 140 | //注意:有无字段语义和是否输出数据必须保持一致 141 | ItemFields: []string{ 142 | "标题", 143 | "描述", 144 | "内容", 145 | "发布时间", 146 | "分类", 147 | "作者", 148 | }, 149 | ParseFunc: func(ctx *Context) { 150 | var title = ctx.GetTemp("title", "").(string) 151 | 152 | infoStr, isReload := baiduNewsFn.prase(ctx) 153 | if isReload { 154 | return 155 | } 156 | // 结果存入Response中转 157 | ctx.Output(map[int]interface{}{ 158 | 0: title, 159 | 1: ctx.GetTemp("description", ""), 160 | 2: infoStr, 161 | 3: ctx.GetTemp("releaseTime", ""), 162 | 4: ctx.GetTemp("src", ""), 163 | 5: ctx.GetTemp("author", ""), 164 | }) 165 | }, 166 | }, 167 | }, 168 | }, 169 | } 170 | 171 | type baiduNews map[string]func(ctx *Context) (infoStr string, isReload bool) 172 | 173 | // @url 必须为含有协议头的地址 174 | func (b baiduNews) prase(ctx *Context) (infoStr string, isReload bool) { 175 | url := ctx.GetHost() 176 | if _, ok := b[url]; ok { 177 | return b[url](ctx) 178 | } else { 179 | return b.commonPrase(ctx), false 180 | } 181 | } 182 | 183 | func (b baiduNews) commonPrase(ctx *Context) (infoStr string) { 184 | body := ctx.GetDom().Find("body") 185 | 186 | var info *goquery.Selection 187 | 188 | if h1s := body.Find("h1"); len(h1s.Nodes) != 0 { 189 | for i := 0; i < len(h1s.Nodes); i++ { 190 | info = b.findP(h1s.Eq(i)) 191 | } 192 | } else if h2s := body.Find("h2"); len(h2s.Nodes) != 0 { 193 | for i := 0; i < len(h2s.Nodes); i++ { 194 | info = b.findP(h2s.Eq(i)) 195 | } 196 | } else if h3s := body.Find("h3"); len(h3s.Nodes) != 0 { 197 | for i := 0; i < len(h3s.Nodes); i++ { 198 | info = b.findP(h3s.Eq(i)) 199 | } 200 | } else { 201 | info = body.Find("body") 202 | } 203 | infoStr, _ = info.Html() 204 | 205 | // 清洗HTML 206 | infoStr = CleanHtml(infoStr, 5) 207 | return 208 | } 209 | 210 | func (b 
baiduNews) findP(html *goquery.Selection) *goquery.Selection { 211 | if html.Is("body") { 212 | return html 213 | } else if result := html.Parent().Find("p"); len(result.Nodes) == 0 { 214 | return b.findP(html.Parent()) 215 | } else { 216 | return html.Parent() 217 | } 218 | } 219 | 220 | var baiduNewsFn = baiduNews{ 221 | "yule.sohu.com": func(ctx *Context) (infoStr string, isReload bool) { 222 | infoStr = ctx.GetDom().Find("#contentText").Text() 223 | return 224 | }, 225 | "news.qtv.com.cn": func(ctx *Context) (infoStr string, isReload bool) { 226 | infoStr = ctx.GetDom().Find(".zwConreally_z").Text() 227 | return 228 | }, 229 | } 230 | -------------------------------------------------------------------------------- /baidusearch/baidusearch.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 8 | "github.com/henrylee2cn/pholcus/logs" //信息输出 9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | 24 | // 其他包 25 | // "fmt" 26 | "math" 27 | // "time" 28 | ) 29 | 30 | func init() { 31 | BaiduSearch.Register() 32 | } 33 | 34 | var BaiduSearch = &Spider{ 35 | Name: "百度搜索", 36 | Description: "百度搜索结果 [www.baidu.com]", 37 | // Pausetime: 300, 38 | Keyin: KEYIN, 39 | Limit: LIMIT, 40 | EnableCookie: false, 41 | // 禁止输出默认字段 Url/ParentUrl/DownloadTime 42 | NotDefaultField: true, 43 | // 命名空间相对于数据库名,不依赖具体数据内容,可选 44 | Namespace: nil, 45 | // 子命名空间相对于表名,可依赖具体数据内容,可选 46 | SubNamespace: nil, 47 | RuleTree: &RuleTree{ 48 | Root: func(ctx *Context) { 49 | ctx.Aid(map[string]interface{}{"loop": [2]int{0, 1}, "Rule": "生成请求"}, "生成请求") 50 | }, 51 | 52 | Trunk: map[string]*Rule{ 53 | 54 | "生成请求": { 55 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 56 | var duplicatable bool 57 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 58 | if loop[0] == 0 { 59 | duplicatable = true 60 | } else { 61 | duplicatable = false 62 | } 63 | ctx.AddQueue(&request.Request{ 64 | Url: "http://www.baidu.com/s?ie=utf-8&nojc=1&wd=" + ctx.GetKeyin() + "&rn=50&pn=" + strconv.Itoa(50*loop[0]), 65 | Rule: aid["Rule"].(string), 66 | Reloadable: duplicatable, 67 | }) 68 | } 69 | return nil 70 | }, 71 | ParseFunc: func(ctx *Context) { 72 | query := ctx.GetDom() 73 | total1 := query.Find(".nums").Text() 74 | re, _ := regexp.Compile(`[\D]*`) 75 | total1 = re.ReplaceAllString(total1, "") 76 | total2, _ := strconv.Atoi(total1) 77 | total := int(math.Ceil(float64(total2) / 50)) 78 | if total > ctx.GetLimit() { 79 | total = ctx.GetLimit() 80 | } else if total == 0 { 81 | logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 82 | return 83 | } 84 | // 调用指定规则下辅助函数 85 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, total}, "Rule": "搜索结果"}) 86 | // 用指定规则解析响应流 87 | ctx.Parse("搜索结果") 88 | }, 89 | }, 90 | 91 | "搜索结果": { 92 | //注意:有无字段语义和是否输出数据必须保持一致 93 | ItemFields: []string{ 94 | "标题", 95 | "内容", 96 | "不完整URL", 97 | "百度跳转", 98 | }, 99 | ParseFunc: func(ctx *Context) { 100 | query := ctx.GetDom() 101 | query.Find("#content_left .c-container").Each(func(i int, s 
*goquery.Selection) { 102 | 103 | title := s.Find(".t").Text() 104 | content := s.Find(".c-abstract").Text() 105 | href, _ := s.Find(".t >a").Attr("href") 106 | tar := s.Find(".g").Text() 107 | 108 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 109 | // title = re.ReplaceAllStringFunc(title, strings.ToLower) 110 | // content = re.ReplaceAllStringFunc(content, strings.ToLower) 111 | 112 | title = re.ReplaceAllString(title, "") 113 | content = re.ReplaceAllString(content, "") 114 | 115 | // 结果存入Response中转 116 | ctx.Output(map[int]interface{}{ 117 | 0: strings.Trim(title, " \t\n"), 118 | 1: strings.Trim(content, " \t\n"), 119 | 2: tar, 120 | 3: href, 121 | }) 122 | }) 123 | }, 124 | }, 125 | }, 126 | }, 127 | } 128 | -------------------------------------------------------------------------------- /car_home/car_home.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | // "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | CarHome.Register() 31 | } 32 | 33 | var CarHome = &Spider{ 34 | Name: "汽车之家", 35 | Description: "汽车之家帖子 [http://club.autohome.com.cn/bbs/]", 36 | // Pausetime: 300, 37 | // Keyin: KEYIN, 38 | // Limit: LIMIT, 39 | EnableCookie: false, 40 | RuleTree: &RuleTree{ 41 | Root: func(ctx *Context) { 42 | ctx.AddQueue(&request.Request{ 43 | Url: "http://club.autohome.com.cn/bbs/forum-o-200042-1.html?qaType=-1#pvareaid=101061", 44 | Rule: "请求列表", 45 | Temp: map[string]interface{}{"p": 1}, 46 | }) 47 | }, 48 | 49 | Trunk: map[string]*Rule{ 50 | 51 | "请求列表": { 52 | ParseFunc: func(ctx *Context) { 53 | var curr = ctx.GetTemp("p", 0).(int) 54 | if c := ctx.GetDom().Find(".pages .cur").Text(); c != strconv.Itoa(curr) { 55 | // Log.Printf("当前列表页不存在 %v", c) 56 | return 57 | } 58 | ctx.AddQueue(&request.Request{ 59 | Url: "http://club.autohome.com.cn/bbs/forum-o-200042-" + strconv.Itoa(curr+1) + ".html?qaType=-1#pvareaid=101061", 60 | Rule: "请求列表", 61 | Temp: map[string]interface{}{"p": curr + 1}, 62 | }) 63 | 64 | // 用指定规则解析响应流 65 | ctx.Parse("获取列表") 66 | }, 67 | }, 68 | 69 | "获取列表": { 70 | ParseFunc: func(ctx *Context) { 71 | ctx.GetDom(). 72 | Find(".list_dl"). 
73 | Each(func(i int, s *goquery.Selection) { 74 | url, _ := s.Find("dt a").Attr("href") 75 | ctx.AddQueue(&request.Request{ 76 | Url: "http://club.autohome.com.cn" + url, 77 | Rule: "输出结果", 78 | Priority: 1, 79 | }) 80 | }) 81 | }, 82 | }, 83 | 84 | "输出结果": { 85 | //注意:有无字段语义和是否输出数据必须保持一致 86 | ItemFields: []string{ 87 | 88 | "当前积分", 89 | "帖子数", 90 | "关注的车", 91 | "注册时间", 92 | "作者", 93 | }, 94 | ParseFunc: func(ctx *Context) { 95 | query := ctx.GetDom() 96 | 97 | var 当前积分, 帖子数, 关注的车, 注册时间, 作者 string 98 | 99 | 积分 := strings.Split(query.Find(".lv-curr").First().Text(), "当前积分:") 100 | if len(积分) > 1 { 101 | 当前积分 = 积分[1] 102 | } 103 | 104 | info := query.Find(".conleft").Eq(0).Find(".leftlist li") 105 | 106 | if len(info.Eq(3).Nodes) > 0 { 107 | 帖子数 = strings.Split(info.Eq(3).Find("a").Text(), "帖")[0] 108 | } 109 | 110 | for i := 6; !info.Eq(i).HasClass("leftimgs") && 111 | len(info.Eq(i).Nodes) > 0 && 112 | len(info.Eq(i).Find("a").Nodes) > 0; i++ { 113 | if strings.Contains(info.Eq(i).Text(), "所属:") { 114 | continue 115 | } 116 | 117 | fs := info.Eq(i).Find("a") 118 | var f string 119 | if len(fs.Nodes) > 1 { 120 | f, _ = info.Eq(i).Find("a").Eq(1).Attr("title") 121 | } else { 122 | f, _ = info.Eq(i).Find("a").First().Attr("title") 123 | } 124 | if f == "" { 125 | continue 126 | } 127 | 关注的车 += f + "|" 128 | } 129 | 130 | 关注的车 = strings.Trim(关注的车, "|") 131 | 132 | if len(info.Eq(4).Nodes) > 0 { 133 | 注册 := strings.Split(info.Eq(4).Text(), "注册:") 134 | if len(注册) > 1 { 135 | 注册时间 = 注册[1] 136 | } 137 | } 138 | 作者 = query.Find(".conleft").Eq(0).Find("a").Text() 139 | // 结果存入Response中转 140 | ctx.Output(map[int]interface{}{ 141 | 0: 当前积分, 142 | 1: 帖子数, 143 | 2: 关注的车, 144 | 3: 注册时间, 145 | 4: 作者, 146 | }) 147 | }, 148 | }, 149 | 150 | // "联系方式": { 151 | // ParseFunc: func(ctx *Context) { 152 | // ctx.AddFile(ctx.GetTemp("n").(string)) 153 | // }, 154 | // }, 155 | }, 156 | }, 157 | } 158 | -------------------------------------------------------------------------------- /chinanews/chinanews.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | // "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 7 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 8 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 9 | // "github.com/henrylee2cn/pholcus/logs" 10 | // net包 11 | // "net/http" //设置http.Header 12 | // "net/url" 13 | // 编码包 14 | // "encoding/xml" 15 | //"encoding/json" 16 | // 字符串处理包 17 | //"regexp" 18 | // "strconv" 19 | // "fmt" 20 | // "math" 21 | // "time" 22 | "github.com/henrylee2cn/pholcus/common/goquery" 23 | "strings" 24 | ) 25 | 26 | func init() { 27 | FileTest.Register() 28 | } 29 | 30 | var FileTest = &Spider{ 31 | Name: "中国新闻网", 32 | Description: "测试 [http://www.chinanews.com/scroll-news/news1.html]", 33 | // Pausetime: 300, 34 | // Keyin: KEYIN, 35 | // Limit: LIMIT, 36 | EnableCookie: false, 37 | RuleTree: &RuleTree{ 38 | Root: func(ctx *Context) { 39 | ctx.AddQueue(&request.Request{ 40 | Url: "http://www.chinanews.com/scroll-news/news1.html", 41 | Rule: "滚动新闻", 42 | }) 43 | }, 44 | 45 | Trunk: map[string]*Rule{ 46 | 47 | "滚动新闻": { 48 | ParseFunc: func(ctx *Context) { 49 | query := ctx.GetDom() 50 | //获取分页导航 51 | navBox := query.Find(".pagebox a") 52 | navBox.Each(func(i int, s *goquery.Selection) { 53 | if url, ok := s.Attr("href"); ok { 54 | ctx.AddQueue(&request.Request{ 55 | Url: "http://www.chinanews.com" + url, 56 | Rule: "新闻列表", 57 | 58 | }) 59 | } 60 | 61 | }) 62 | 63 | }, 64 | }, 65 | 66 | "新闻列表": { 67 | ParseFunc: func(ctx *Context) { 68 | query := ctx.GetDom() 69 | //获取新闻列表 70 | newList := query.Find(".content_list li") 71 | newList.Each(func(i int, s *goquery.Selection) { 72 | //新闻类型 73 | newsType := s.Find(".dd_lm a").Text() 74 | //标题 75 | newsTitle := s.Find(".dd_bt a").Text() 76 | //时间 77 | newsTime := s.Find(".dd_time").Text() 78 | if url, ok := s.Find(".dd_bt a").Attr("href"); ok { 79 | ctx.AddQueue(&request.Request{ 80 | Url: "http://" + url[2:len(url)], 81 | Rule: "新闻内容", 82 | Temp: map[string]interface{}{ 83 | "newsType": newsType, 84 | "newsTitle": newsTitle, 85 | "newsTime": newsTime, 86 | }, 87 | }) 88 | } 89 | 90 | }) 91 | 92 | }, 93 | }, 94 | 95 | "新闻内容": { 96 | ItemFields: []string{ 97 | "类别", 98 | "来源", 99 | "标题", 100 | "内容", 101 | "时间", 102 | }, 103 | 104 | ParseFunc: func(ctx *Context) { 105 | query := ctx.GetDom() 106 | //正文 107 | content := query.Find(".left_zw").Text() 108 | //来源 109 | from := query.Find(".left-t").Text() 110 | i := strings.LastIndex(from,"来源") 111 | //来源字符串特殊处理 112 | if i == -1{ 113 | from = "未知" 114 | }else{ 115 | from = from[i+9:len(from)] 116 | from = strings.Replace(from,"参与互动","",1) 117 | if from=="" { 118 | from = query.Find(".left-t").Eq(2).Text() 119 | from = strings.Replace(from,"参与互动","",1) 120 | } 121 | } 122 | 123 | //输出格式 124 | ctx.Output(map[int]interface{}{ 125 | 0: ctx.GetTemp("newsType",""), 126 | 1: from, 127 | 2: ctx.GetTemp("newsTitle",""), 128 | 3: content, 129 | 4: ctx.GetTemp("newsTime", ""), 130 | }) 131 | }, 132 | }, 133 | 134 | }, 135 | }, 136 | } 137 | -------------------------------------------------------------------------------- /chinanews/readme.md: -------------------------------------------------------------------------------- 1 | ## 中国新闻网-滚动新闻栏目 2 | 3 | ### 说明 4 | 5 | 只是爬取滚动新闻栏目(共10页) 6 | 7 | ### 代码说明 8 | 9 | 1.直接访问滚动新闻栏目地址(http://www.chinanews.com/scroll-news/news1.html) 10 | 2.获取分页导航 11 | 3.获取分页链接 12 | 13 | 刚开始学习,写的不好,多多指教 weChat:gaoyawei616 -------------------------------------------------------------------------------- /fang_resell_list/fang_resell_list.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | 
"github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | //"github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | //"github.com/henrylee2cn/pholcus/logs/logs" 11 | // 字符串处理包 12 | // "regexp" 13 | //"strconv" 14 | //"strings" 15 | // 其他包 16 | // "fmt" 17 | // "math" 18 | // "time" 19 | //"strings" 20 | //"strings" 21 | "strings" 22 | "github.com/henrylee2cn/pholcus/logs" 23 | "strconv" 24 | ) 25 | 26 | func init() { 27 | fangList.Register() 28 | } 29 | 30 | var fangList = &Spider{ 31 | Name: "resell house of fang.com", 32 | Description: "fang.com http://esf.zz.fang.com/house/i31/", 33 | EnableCookie: true, 34 | RuleTree: &RuleTree{ 35 | Root: func(ctx *Context) { 36 | var i = 1; 37 | //for i = 1; i < 101; i++ { 38 | ctx.AddQueue(&request.Request{ 39 | Url: "http://esf.zz.fang.com/house/i3" + strconv.Itoa(i) + "/", 40 | Rule: "fang_collection", 41 | Temp: map[string]interface{}{"p": 1}, 42 | }) 43 | //} 44 | }, 45 | 46 | Trunk: map[string]*Rule{ 47 | "fang_collection": { 48 | ItemFields: []string{ 49 | "communityName", 50 | "totalFloor", 51 | "rooms", 52 | "halls", 53 | "buildTime", 54 | "address", 55 | "direction", 56 | "area", 57 | "price", 58 | "unitPrice", 59 | "locationType", //楼层所在高低 60 | "remoteId", //搜房id 61 | "business", 62 | }, 63 | ParseFunc: func(ctx *Context) { 64 | //获取当页搜房的所有数据 65 | ctx.GetDom().Find(".houseList dl").Each( 66 | func(i int, s *goquery.Selection) { 67 | var communityName, totalFloor, rooms, halls, locationType, remoteId, buildTime, address, direction, area, price, unitPrice, business string; 68 | communityName = s.Find(".info p.mt10 a span").Text(); 69 | 70 | address = s.Find(".info p.mt10 span.iconAdress").Text(); 71 | business = ""; 72 | 73 | sp := strings.Split(address,"-"); 74 | if(len(sp) == 2){ 75 | address = sp[1]; 76 | business = sp[0]; 77 | } 78 | //获取年代中的一吨 79 | roomLineTmp := s.Find("dd.info p.mt12").Text(); 80 | roomLine := strings.Fields(roomLineTmp); 81 | 82 | if (len(roomLine) == 4 ) { 83 | //替换掉厅 84 | roomsTmp := roomLine[0]; 85 | roomsTmp = strings.Replace(roomsTmp, "厅", "", 1); 86 | roomsS := strings.Split(roomsTmp, "室"); 87 | if (len(roomsS) == 2) { 88 | rooms = roomsS[0]; 89 | halls = roomsS[1]; 90 | } 91 | //楼类型和层高获取 92 | buildingTmp := roomLine[1]; 93 | buildingTmpSec := strings.Split(buildingTmp, "(共"); 94 | if (len(buildingTmpSec) == 2) { 95 | locationType = strings.Replace(buildingTmpSec[0], "|", "", 1); 96 | totalFloor = strings.Replace(buildingTmpSec[1], "层)", "", 1); 97 | } 98 | 99 | buildTime = strings.Replace(roomLine[3], "|建筑年代:", "", 1); 100 | direction = strings.Replace(roomLine[2], "|", "", 1); 101 | direction = strings.Replace(direction, "向", "", 1); 102 | } 103 | 104 | area = s.Find("dd.info div.area").Children().Eq(0).Text(); 105 | price = s.Find("dd.info div.moreInfo").Children().Eq(0).Text(); 106 | unitPrice = s.Find("dd.info div.moreInfo").Children().Eq(1).Text(); 107 | remoteTmp, exists := s.Find("dd.info p.title a").Attr("href"); 108 | if (exists) { 109 | remoteAttr := strings.Split(remoteTmp,"_"); 110 | remoteId = strings.Replace(remoteAttr[1],".htm","",1); 111 | } 112 | 113 | logs.Log.Critical("当前房源id: %v", remoteId) 114 | //解析传入的片段 115 | // 结果存入Response中转 116 | ctx.Output(map[int]interface{}{ 117 | 0: strings.Trim(communityName, " "), 118 | 1: strings.Trim(totalFloor, " "), 119 | 2: strings.Trim(rooms, " "), 120 | 3: strings.Trim(halls, " 
"), 121 | 4: strings.Trim(buildTime, " "), 122 | 5: strings.Trim(address, " "), 123 | 6: strings.Trim(direction, " "), 124 | 7: strings.Trim(strings.Replace(area,"㎡","",1), " "), 125 | 8: strings.Trim(strings.Replace(price,"万","",1), " "), 126 | 9: strings.Trim(strings.Replace(unitPrice,"元/㎡","",1), " "), 127 | 10: strings.Trim(locationType, " "), 128 | 11: strings.Trim(remoteId, " "), 129 | 12: strings.Trim(business, " "), 130 | }) 131 | }) 132 | ctx.Parse("getContent") 133 | }, 134 | }, 135 | }, 136 | }, 137 | } 138 | -------------------------------------------------------------------------------- /fang_resell_list/readme.md: -------------------------------------------------------------------------------- 1 | ## 搜房爬取二手房列表 2 | 3 | ### 说明 4 | 5 | 仅爬取列表页, 字段: 6 | "communityName":小区名, 7 | "totalFloor":总层数, 8 | "rooms":房间数, 9 | "halls":厅数量, 10 | "buildTime":建筑年代, 11 | "address":地址, 12 | "direction":朝向, 13 | "area":面积, 14 | "price":价格, 15 | "unitPrice"单价, 16 | "locationType"所在层数高低, 17 | 18 | ### 代码说明 19 | 20 | 1.目前仅仅爬取了搜房二手房的列表页, 一次爬取一页 21 | 2.如果有需要就修改37行打开多页爬取 22 | 3.在使用中发现,如果爬取的页面数太多会导致蜘蛛崩溃, 原因未知, 待查 -------------------------------------------------------------------------------- /filetest/filetest.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | // "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 7 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 8 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 9 | // "github.com/henrylee2cn/pholcus/logs" 10 | // net包 11 | // "net/http" //设置http.Header 12 | // "net/url" 13 | // 编码包 14 | // "encoding/xml" 15 | //"encoding/json" 16 | // 字符串处理包 17 | //"regexp" 18 | // "strconv" 19 | // "strings" 20 | // 其他包 21 | // "fmt" 22 | // "math" 23 | // "time" 24 | ) 25 | 26 | func init() { 27 | FileTest.Register() 28 | } 29 | 30 | var FileTest = &Spider{ 31 | Name: "文件下载测试", 32 | Description: "文件下载测试", 33 | // Pausetime: 300, 34 | // Keyin: KEYIN, 35 | // Limit: LIMIT, 36 | EnableCookie: false, 37 | RuleTree: &RuleTree{ 38 | Root: func(ctx *Context) { 39 | ctx.AddQueue(&request.Request{ 40 | Url: "https://www.baidu.com/img/bd_logo1.png", 41 | Rule: "百度图片", 42 | ConnTimeout: -1, 43 | DownloaderID: 0, //图片等多媒体文件必须使用0(surfer surf go原生下载器) 44 | }) 45 | ctx.AddQueue(&request.Request{ 46 | Url: "https://github.com/henrylee2cn/pholcus", 47 | Rule: "Pholcus页面", 48 | ConnTimeout: -1, 49 | DownloaderID: 0, //文本文件可使用0或者1(0:surfer surf go原生下载器;1:surfer plantomjs内核) 50 | }) 51 | }, 52 | 53 | Trunk: map[string]*Rule{ 54 | 55 | "百度图片": { 56 | ParseFunc: func(ctx *Context) { 57 | ctx.FileOutput("baidu") // 等价于ctx.AddFile("baidu") 58 | }, 59 | }, 60 | "Pholcus页面": { 61 | ParseFunc: func(ctx *Context) { 62 | ctx.FileOutput() // 等价于ctx.AddFile() 63 | }, 64 | }, 65 | }, 66 | }, 67 | } 68 | -------------------------------------------------------------------------------- /ganji_gongsi/ganji_gongsi.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | // "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | GanjiGongsi.Register() 31 | } 32 | 33 | var GanjiGongsi = &Spider{ 34 | Name: "经典示例-赶集网企业名录", 35 | Description: "**典型规则示例,具有文本与文件两种输出行为**", 36 | // Pausetime: 300, 37 | // Keyin: KEYIN, 38 | // Limit: LIMIT, 39 | EnableCookie: false, 40 | RuleTree: &RuleTree{ 41 | Root: func(ctx *Context) { 42 | ctx.AddQueue(&request.Request{ 43 | Url: "http://sz.ganji.com/gongsi/o1", 44 | Rule: "请求列表", 45 | Temp: map[string]interface{}{"p": 1}, 46 | }) 47 | }, 48 | 49 | Trunk: map[string]*Rule{ 50 | 51 | "请求列表": { 52 | ParseFunc: func(ctx *Context) { 53 | var curr = ctx.GetTemp("p", int(0)).(int) 54 | if ctx.GetDom().Find(".linkOn span").Text() != strconv.Itoa(curr) { 55 | return 56 | } 57 | ctx.AddQueue(&request.Request{ 58 | Url: "http://sz.ganji.com/gongsi/o" + strconv.Itoa(curr+1), 59 | Rule: "请求列表", 60 | Temp: map[string]interface{}{"p": curr + 1}, 61 | ConnTimeout: -1, 62 | }) 63 | 64 | // 用指定规则解析响应流 65 | ctx.Parse("获取列表") 66 | }, 67 | }, 68 | 69 | "获取列表": { 70 | ParseFunc: func(ctx *Context) { 71 | ctx.GetDom(). 72 | Find(".com-list-2 table a"). 73 | Each(func(i int, s *goquery.Selection) { 74 | url, _ := s.Attr("href") 75 | ctx.AddQueue(&request.Request{ 76 | Url: url, 77 | Rule: "输出结果", 78 | ConnTimeout: -1, 79 | }) 80 | }) 81 | }, 82 | }, 83 | 84 | "输出结果": { 85 | //注意:有无字段语义和是否输出数据必须保持一致 86 | ItemFields: []string{ 87 | "公司", 88 | "联系人", 89 | "地址", 90 | "简介", 91 | "行业", 92 | "类型", 93 | "规模", 94 | }, 95 | ParseFunc: func(ctx *Context) { 96 | query := ctx.GetDom() 97 | 98 | var 公司, 规模, 行业, 类型, 联系人, 地址 string 99 | 100 | query.Find(".c-introduce li").Each(func(i int, s *goquery.Selection) { 101 | em := s.Find("em").Text() 102 | t := strings.Split(s.Text(), ` `)[0] 103 | t = strings.Replace(t, em, "", -1) 104 | t = strings.Trim(t, " ") 105 | 106 | switch em { 107 | case "公司名称:": 108 | 公司 = t 109 | 110 | case "公司规模:": 111 | 规模 = t 112 | 113 | case "公司行业:": 114 | 行业 = t 115 | 116 | case "公司类型:": 117 | 类型 = t 118 | 119 | case "联 系 人:": 120 | 联系人 = t 121 | 122 | case "联系电话:": 123 | if img, ok := s.Find("img").Attr("src"); ok { 124 | ctx.AddQueue(&request.Request{ 125 | Url: "http://www.ganji.com" + img, 126 | Rule: "联系方式", 127 | Temp: map[string]interface{}{"n": 公司 + "(" + 联系人 + ").png"}, 128 | Priority: 1, 129 | ConnTimeout: -1, 130 | }) 131 | } 132 | 133 | case "公司地址:": 134 | 地址 = t 135 | } 136 | }) 137 | 138 | 简介 := query.Find("#company_description").Text() 139 | 140 | // 结果输出方式一(推荐) 141 | ctx.Output(map[int]interface{}{ 142 | 0: 公司, 143 | 1: 联系人, 144 | 2: 地址, 145 | 3: 简介, 146 | 4: 行业, 147 | 5: 类型, 148 | 6: 规模, 149 | }) 150 | 151 | // 结果输出方式二 152 | // var item map[string]interface{} = ctx.CreatItem(map[int]interface{}{ 153 | // 0: 公司, 154 | // 1: 联系人, 155 | // 2: 地址, 156 | // 3: 简介, 157 | // 4: 行业, 158 | // 5: 类型, 159 | // 6: 规模, 160 | // }) 161 | // ctx.Output(item) 162 | 163 | // 结果输出方式三(不推荐) 164 | // ctx.Output(map[string]interface{}{ 165 | // ctx.GetItemField(0): 公司, 166 | // ctx.GetItemField(1): 联系人, 167 | // ctx.GetItemField(2): 地址, 168 | // ctx.GetItemField(3): 简介, 169 | // ctx.GetItemField(4): 行业, 170 | // ctx.GetItemField(5): 类型, 171 | // ctx.GetItemField(6): 规模, 172 | // }) 173 | }, 174 | }, 175 | 176 | "联系方式": { 177 | ParseFunc: func(ctx *Context) 
{ 178 | // 文件输出方式一(推荐) 179 | ctx.FileOutput(ctx.GetTemp("n", "").(string)) 180 | 181 | // 文件输出方式二 182 | // ctx.AddFile(ctx.GetTemp("n").(string)) 183 | }, 184 | }, 185 | }, 186 | }, 187 | } 188 | -------------------------------------------------------------------------------- /googlesearch/googlesearch.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 8 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 9 | "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | 24 | // 其他包 25 | // "fmt" 26 | "math" 27 | // "time" 28 | ) 29 | 30 | func init() { 31 | GoogleSearch.Register() 32 | } 33 | 34 | var googleIp = []string{ 35 | "210.242.125.100", 36 | "210.242.125.96", 37 | "210.242.125.91", 38 | "210.242.125.95", 39 | "64.233.189.163", 40 | "58.123.102.5", 41 | "210.242.125.97", 42 | "210.242.125.115", 43 | "58.123.102.28", 44 | "210.242.125.70", 45 | "220.255.2.153", 46 | } 47 | 48 | var GoogleSearch = &Spider{ 49 | Name: "Google search", 50 | Description: "Crawls pages from [www.google.com]", 51 | // Pausetime: 300, 52 | Keyin: KEYIN, 53 | Limit: LIMIT, 54 | EnableCookie: false, 55 | RuleTree: &RuleTree{ 56 | Root: func(ctx *Context) { 57 | var url string 58 | var success bool 59 | logs.Log.Informational("Running google spider,this may take some time...") 60 | 61 | for _, ip := range googleIp { 62 | // url = "http://" + ip + "/search?q=" + ctx.GetKeyin() + "&newwindow=1&biw=1600&bih=398&start=" 63 | // Beware of redirections, if it doesnt work use google domain: 64 | // url = "https://google.co.uk/search?q=" + ctx.GetKeyin() 65 | url = "http://" + ip + "/?gws_rd=ssl#q=" + ctx.GetKeyin() 66 | logs.Log.Informational("测试 " + ip) 67 | if _, err := goquery.NewDocument(url); err == nil { 68 | success = true 69 | break 70 | } 71 | } 72 | if !success { 73 | logs.Log.Critical("Could not reach any of the Google mirrors") 74 | return 75 | } 76 | logs.Log.Critical("Starting Google search ...") 77 | ctx.AddQueue(&request.Request{ 78 | Url: url, 79 | Rule: "total_pages", 80 | Temp: map[string]interface{}{ 81 | "baseUrl": url, 82 | }, 83 | }) 84 | }, 85 | 86 | Trunk: map[string]*Rule{ 87 | 88 | "total_pages": { 89 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 90 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 91 | ctx.AddQueue(&request.Request{ 92 | Url: aid["urlBase"].(string) +"&start="+ strconv.Itoa(10 * loop[0]), 93 | Rule: aid["Rule"].(string), 94 | }) 95 | } 96 | return nil 97 | }, 98 | ParseFunc: func(ctx *Context) { 99 | query := ctx.GetDom() 100 | txt := query.Find("#resultStats").Text() 101 | re, _ := regexp.Compile(`,+`) 102 | txt = re.ReplaceAllString(txt, "") 103 | re, _ = regexp.Compile(`[\d]+`) 104 | txt = re.FindString(txt) 105 | num, _ := strconv.Atoi(txt) 106 | total := int(math.Ceil(float64(num) / 10)) 107 | if total > ctx.GetLimit() { 108 | total = ctx.GetLimit() 109 | } else if total == 0 { 110 | logs.Log.Critical("[ERROR:| Spider:%v | KEYIN:%v | Rule:%v] Did not fetch any data!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 111 | return 112 | } 113 | // 调用指定规则下辅助函数 114 | 
ctx.Aid(map[string]interface{}{
115 | "loop": [2]int{1, total},
116 | "urlBase": ctx.GetTemp("baseUrl", ""),
117 | "Rule": "search_results",
118 | })
119 | // 用指定规则解析响应流
120 | ctx.Parse("search_results")
121 | },
122 | },
123 | 
124 | "search_results": {
125 | //注意:有无字段语义和是否输出数据必须保持一致
126 | ItemFields: []string{
127 | "title",
128 | "content",
129 | "href",
130 | },
131 | ParseFunc: func(ctx *Context) {
132 | query := ctx.GetDom()
133 | query.Find("#ires .g").Each(func(i int, s *goquery.Selection) {
134 | t := s.Find(".r > a")
135 | href, _ := t.Attr("href")
136 | href = strings.TrimPrefix(href, "/url?q=")
137 | logs.Log.Informational(href)
138 | title := t.Text()
139 | content := s.Find(".st").Text()
140 | ctx.Output(map[int]interface{}{
141 | 0: title,
142 | 1: content,
143 | 2: href,
144 | })
145 | })
146 | },
147 | },
148 | },
149 | },
150 | }
151 | 
--------------------------------------------------------------------------------
/hollandandbarrett/hollandandbarrett.go:
--------------------------------------------------------------------------------
1 | package pholcus_lib
2 | 
3 | // 基础包
4 | import (
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
8 | "github.com/henrylee2cn/pholcus/logs" //信息输出
9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
10 | 
11 | // net包
12 | // "net/http" //设置http.Header
13 | // "net/url"
14 | 
15 | // 编码包
16 | // "encoding/xml"
17 | "encoding/json"
18 | 
19 | // 字符串处理包
20 | "regexp"
21 | // "strconv"
22 | "strings"
23 | 
24 | // 其他包
25 | "fmt"
26 | // "math"
27 | // "time"
28 | )
29 | 
30 | func init() {
31 | Hollandandbarrett.Register()
32 | }
33 | 
34 | var Hollandandbarrett = &Spider{
35 | Name: "Hollandandbarrett",
36 | Description: "Holland & Barrett商品数据 [Auto Page] [www.Hollandandbarrett.com]",
37 | // Pausetime: 300,
38 | // Keyin: KEYIN,
39 | // Limit: LIMIT,
40 | EnableCookie: false,
41 | RuleTree: &RuleTree{
42 | Root: func(ctx *Context) {
43 | ctx.AddQueue(&request.Request{
44 | Url: "http://www.hollandandbarrett.com/",
45 | Rule: "获取版块URL",
46 | },
47 | )
48 | },
49 | 
50 | Trunk: map[string]*Rule{
51 | 
52 | "获取版块URL": {
53 | ParseFunc: func(ctx *Context) {
54 | query := ctx.GetDom()
55 | lis := query.Find(".footer-links nav.l-one-half a")
56 | 
57 | lis.Each(func(i int, s *goquery.Selection) {
58 | if url, ok := s.Attr("href"); ok {
59 | tit, _ := s.Attr("title")
60 | ctx.AddQueue(&request.Request{
61 | Url: "http://www.hollandandbarrett.com" + url + "?showAll=1&pageHa=1&es=true&vm=grid&imd=true&format=json&single=true",
62 | Rule: "获取总数",
63 | Temp: map[string]interface{}{
64 | "type": tit,
65 | "baseUrl": url,
66 | },
67 | },
68 | )
69 | }
70 | })
71 | },
72 | },
73 | 
74 | "获取总数": {
75 | ParseFunc: func(ctx *Context) {
76 | 
77 | query := ctx.GetDom()
78 | 
79 | re, _ := regexp.Compile(`(?U)"totalNumRecs":[\d]+,`)
80 | total := re.FindString(query.Text())
81 | re, _ = regexp.Compile(`[\d]+`)
82 | total = re.FindString(total)
83 | total = strings.Trim(total, " \t\n")
84 | 
85 | if total == "0" {
86 | logs.Log.Critical("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName())
87 | } else {
88 | 
89 | ctx.AddQueue(&request.Request{
90 | Url: "http://www.hollandandbarrett.com" + ctx.GetTemp("baseUrl", "").(string) + "?showAll=" + total + "&pageHa=1&es=true&vm=grid&imd=true&format=json&single=true",
91 | Rule: "商品详情",
92 | Temp: map[string]interface{}{
93 | "type": 
ctx.GetTemp("type", "").(string), 94 | }, 95 | }, 96 | ) 97 | 98 | } 99 | }, 100 | }, 101 | 102 | "商品详情": { 103 | //注意:有无字段语义和是否输出数据必须保持一致 104 | ItemFields: []string{ 105 | "标题", 106 | "原价", 107 | "折后价", 108 | "打折", 109 | "星级", 110 | "分类", 111 | }, 112 | ParseFunc: func(ctx *Context) { 113 | query := ctx.GetDom() 114 | 115 | src := query.Text() 116 | 117 | infos := map[string]interface{}{} 118 | 119 | err := json.Unmarshal([]byte(src), &infos) 120 | 121 | if err != nil { 122 | logs.Log.Error("error is %v\n", err) 123 | return 124 | } else { 125 | for _, info1 := range infos["contents"].([]interface{})[0].(map[string]interface{})["mainContent"].([]interface{})[0].(map[string]interface{})["records"].([]interface{}) { 126 | 127 | info2 := info1.(map[string]interface{})["records"].([]interface{})[0].(map[string]interface{})["attributes"].(map[string]interface{}) 128 | 129 | var n, price1, price2, prm, level string 130 | 131 | if info2["Name"] == nil { 132 | n = "" 133 | } else { 134 | n = fmt.Sprint(info2["Name"]) 135 | n = strings.TrimRight(n, "]") 136 | n = strings.TrimLeft(n, "[") 137 | } 138 | 139 | if info2["lp"] == nil { 140 | price1 = "" 141 | } else { 142 | price1 = fmt.Sprint(info2["lp"]) 143 | price1 = strings.TrimRight(price1, "]") 144 | price1 = strings.TrimLeft(price1, "[") 145 | } 146 | 147 | if info2["sp"] == nil { 148 | price2 = "" 149 | } else { 150 | price2 = fmt.Sprint(info2["sp"]) 151 | price2 = strings.TrimRight(price2, "]") 152 | price2 = strings.TrimLeft(price2, "[") 153 | } 154 | 155 | if info2["prm"] == nil { 156 | prm = "" 157 | } else { 158 | prm = fmt.Sprint(info2["prm"]) 159 | prm = strings.TrimRight(prm, "]") 160 | prm = strings.TrimLeft(prm, "[") 161 | } 162 | 163 | if info2["ratingCount"] == nil { 164 | level = "0" 165 | } else { 166 | level = fmt.Sprint(info2["ratingCount"]) 167 | level = strings.TrimRight(level, "]") 168 | level = strings.TrimLeft(level, "[") 169 | } 170 | 171 | // 结果存入Response中转 172 | ctx.Output(map[int]interface{}{ 173 | 0: n, 174 | 1: price1, 175 | 2: price2, 176 | 3: prm, 177 | 4: level, 178 | 5: ctx.GetTemp("type", ""), 179 | }) 180 | } 181 | } 182 | }, 183 | }, 184 | }, 185 | }, 186 | } 187 | -------------------------------------------------------------------------------- /jdsearch/jdsearch.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 8 | "github.com/henrylee2cn/pholcus/logs" //信息输出 9 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | JDSearch.Register() 31 | } 32 | 33 | var JDSearch = &Spider{ 34 | Name: "京东搜索", 35 | Description: "京东搜索结果 [search.jd.com]", 36 | // Pausetime: 300, 37 | Keyin: KEYIN, 38 | Limit: LIMIT, 39 | EnableCookie: false, 40 | RuleTree: &RuleTree{ 41 | Root: func(ctx *Context) { 42 | ctx.Aid(map[string]interface{}{"loop": [2]int{0, 1}, "Rule": "生成请求"}, "生成请求") 43 | }, 44 | 45 | Trunk: map[string]*Rule{ 46 | 47 | "生成请求": { 48 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 49 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 50 | ctx.AddQueue( 51 | &request.Request{ 52 | Url: "http://search.jd.com/Search?keyin=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&click=&psort=&page=" + strconv.Itoa(2*loop[0]+1), 53 | Rule: aid["Rule"].(string), 54 | }, 55 | ) 56 | ctx.AddQueue( 57 | &request.Request{ 58 | Url: "http://search.jd.com/Search?keyin=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&click=&psort=&page=" + strconv.Itoa(2*loop[0]+2), 59 | Rule: aid["Rule"].(string), 60 | }, 61 | ) 62 | } 63 | return nil 64 | }, 65 | ParseFunc: func(ctx *Context) { 66 | query := ctx.GetDom() 67 | 68 | total1 := query.Find("#top_pagi span.text").Text() 69 | 70 | re, _ := regexp.Compile(`[\d]+$`) 71 | total1 = re.FindString(total1) 72 | total, _ := strconv.Atoi(total1) 73 | 74 | if total > ctx.GetLimit() { 75 | total = ctx.GetLimit() 76 | } else if total == 0 { 77 | logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 78 | return 79 | } 80 | // 调用指定规则下辅助函数 81 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, total}, "Rule": "搜索结果"}) 82 | // 用指定规则解析响应流 83 | ctx.Parse("搜索结果") 84 | }, 85 | }, 86 | 87 | "搜索结果": { 88 | //注意:有无字段语义和是否输出数据必须保持一致 89 | ItemFields: []string{ 90 | "标题", 91 | "价格", 92 | "评论数", 93 | "星级", 94 | "链接", 95 | }, 96 | ParseFunc: func(ctx *Context) { 97 | query := ctx.GetDom() 98 | 99 | query.Find("#plist .list-h:nth-child(1) > li").Each(func(i int, s *goquery.Selection) { 100 | // 获取标题 101 | a := s.Find(".p-name a") 102 | title := a.Text() 103 | 104 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 105 | // title = re.ReplaceAllStringFunc(title, strings.ToLower) 106 | title = re.ReplaceAllString(title, " ") 107 | title = strings.Trim(title, " \t\n") 108 | 109 | // 获取价格 110 | price, _ := s.Find("strong[data-price]").First().Attr("data-price") 111 | 112 | // 获取评论数 113 | e := s.Find(".extra").First() 114 | discuss := e.Find("a").First().Text() 115 | re, _ = regexp.Compile(`[\d]+`) 116 | discuss = re.FindString(discuss) 117 | 118 | // 获取星级 119 | level, _ := e.Find(".star span[id]").First().Attr("class") 120 | level = re.FindString(level) 121 | 122 | // 获取URL 123 | url, _ := a.Attr("href") 124 | 125 | // 结果存入Response中转 126 | ctx.Output(map[int]interface{}{ 127 | 0: title, 128 | 1: price, 129 | 2: discuss, 130 | 3: level, 131 | 4: url, 132 | }) 133 | }) 134 | }, 135 | }, 136 | }, 137 | }, 138 | } 139 | -------------------------------------------------------------------------------- /jiban/jiban.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | import ( 4 | 
"github.com/henrylee2cn/pholcus/app/downloader/request" 5 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" 7 | // net包 8 | // "net/http" //设置http.Header 9 | // "net/url" 10 | 11 | // 编码包 12 | // "encoding/xml" 13 | // "encoding/json" 14 | 15 | // 字符串处理包 16 | "strconv" 17 | "strings" 18 | // "regexp" 19 | // 其他包 20 | // "fmt" 21 | // "math" 22 | // "time" 23 | ) 24 | 25 | func init() { 26 | Jiban.Register() 27 | } 28 | 29 | var Jiban = &Spider{ 30 | Name: "羁绊动漫", 31 | Description: "羁绊二次元资讯 [http://www.005.tv/zx/]", 32 | EnableCookie: true, 33 | RuleTree: &RuleTree{ 34 | Root: func(ctx *Context) { 35 | ctx.AddQueue(&request.Request{ 36 | Url: "http://www.005.tv/zx/list_526_1.html", 37 | Rule: "请求", 38 | Temp: map[string]interface{}{"p": 1}, 39 | ConnTimeout: -1, 40 | Reloadable: true, 41 | }) 42 | 43 | }, 44 | Trunk: map[string]*Rule{ 45 | "请求": { 46 | ParseFunc: func(ctx *Context) { 47 | var curr = ctx.GetTemp("p", int(0)).(int) 48 | ctx.GetDom().Find(".pages .dede_pages .pagelist .thisclass a").Each(func(ii int, iio *goquery.Selection) { 49 | url2, _ := iio.Attr("href") 50 | if url2 != "javascript:void(0);" { 51 | if curr > 100 { 52 | return 53 | } 54 | } 55 | }) 56 | ctx.AddQueue(&request.Request{ 57 | Url: "http://www.005.tv/zx/list_526_" + strconv.Itoa(curr+1) + ".html", 58 | Rule: "请求", 59 | Temp: map[string]interface{}{"p": curr + 1}, 60 | ConnTimeout: -1, 61 | Reloadable: true, 62 | }) 63 | ctx.Parse("获取列表") 64 | }, 65 | }, 66 | 67 | "获取列表": { 68 | ParseFunc: func(ctx *Context) { 69 | ctx.GetDom(). 70 | Find(".article-list ul li .xs-100 div h3 a"). 71 | Each(func(i int, s *goquery.Selection) { 72 | url, _ := s.Attr("href") 73 | ctx.AddQueue(&request.Request{ 74 | Url: url, 75 | Rule: "news", 76 | ConnTimeout: -1, 77 | }) 78 | }) 79 | }, 80 | }, 81 | 82 | "news": { 83 | ItemFields: []string{ 84 | "title", 85 | "time", 86 | "img_url", 87 | "content", 88 | }, 89 | ParseFunc: func(ctx *Context) { 90 | query := ctx.GetDom() 91 | var title, time, img_url, content string 92 | query.Find(".article-list-wrap"). 93 | Each(func(j int, jo *goquery.Selection) { 94 | title = jo.Find(".articleTitle-name").Text() 95 | time = jo.Find("span.time").Text() 96 | jo.Find(".articleContent img").Each(func(x int, xo *goquery.Selection) { 97 | if img, ok := xo.Attr("src"); ok { 98 | img_url = img_url + img + "," 99 | } 100 | }) 101 | jo.Find(".articleContent img").ReplaceWithHtml("#image#") 102 | jo.Find(".articleContent img").Remove() 103 | content, _ = jo.Find(".articleContent").Html() 104 | content = strings.Replace(content, `"`, `'`, -1) 105 | }) 106 | ctx.Output(map[int]interface{}{ 107 | 0: title, 108 | 1: time, 109 | 2: img_url, 110 | 3: content, 111 | }) 112 | }, 113 | }, 114 | }, 115 | }, 116 | } 117 | -------------------------------------------------------------------------------- /jingdong/README.md: -------------------------------------------------------------------------------- 1 | 根据京东新的页面规则进行了修改 2 | 3 | 1.以前是修改url中的page参数就可以得到每页的值。但是现在京东做了修改。 4 |  5 | 现在点击第二页的时候,url中的page参数会是3,修改page现在不能得到所有的商品信息的。page=2的时候的内容,会在你的页面滚动到中间的时候通过异步的方式来加载。 6 | 7 | 2.我们输入的关键字总共有多少页商品的显示方式也修改了。这个参数现在改到了一段javasript代码中,通过js来生成页面代码。 8 |  9 | 10 | 3.在存入结果的时候,我判断了一下title为空的情况。这个是因为,京东会在一些商品里面加入广告的,但是这个广告的html结构是和商品是一样的,这样我们的规则在解析的时候会得到这个无效的信息,需要去掉。 11 | 如下图: 12 |  13 | 14 | 这个爬虫整体的过程就是。 15 | 16 | 1. 先访问参数page=1的url,使用正则表达式得到这个关键字一共有多少页商品 17 | 2. 根据两种加载方式(url的直接返回和异步加载),生成所有的url。 18 | 3. 
分析页面结构,得到相关的值 19 | 20 | 第一次写,写的不好的或错的地方希望大家多多包涵。^_^ -------------------------------------------------------------------------------- /jingdong/jdSpider.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 8 | //"github.com/henrylee2cn/pholcus/logs" //信息输出 9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | //"fmt" 28 | ) 29 | 30 | func init() { 31 | JDSpider.Register() 32 | } 33 | 34 | var JDSpider = &Spider{ 35 | Name: "京东搜索new", 36 | Description: "京东搜索结果 [search.jd.com]", 37 | // Pausetime: 300, 38 | Keyin: KEYIN, 39 | Limit: LIMIT, 40 | EnableCookie: false, 41 | RuleTree: &RuleTree{ 42 | Root: func(ctx *Context) { 43 | //Aid调用Rule中的AidFunc 44 | ctx.Aid(map[string]interface{}{"Rule": "判断页数"}, "判断页数") 45 | }, 46 | 47 | Trunk: map[string]*Rule{ 48 | //只判断关键字商品一共有多少页 49 | "判断页数": { 50 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 51 | ctx.AddQueue( 52 | &request.Request{ 53 | Url: "http://search.jd.com/Search?keyword=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&s=1&click=0&page=1", 54 | Rule: aid["Rule"].(string), 55 | }, 56 | ) 57 | return nil 58 | }, 59 | ParseFunc: func(ctx *Context) { 60 | query := ctx.GetDom() 61 | pageCount := 0 62 | query.Find("script").Each(func(i int, s *goquery.Selection) { 63 | if strings.Contains(s.Text(), "page_count") { 64 | re, _ := regexp.Compile(`page_count:"\d{1,}"`) 65 | temp := re.FindString(s.Text()) 66 | re, _ = regexp.Compile(`\d{1,}`) 67 | temp2 := re.FindString(temp) 68 | pageCount, _ = strconv.Atoi(temp2) 69 | } 70 | }) 71 | ctx.Aid(map[string]interface{}{"PageCount": pageCount}, "生成请求") 72 | }, 73 | }, 74 | 75 | "生成请求": { 76 | //单数页是url直接返回,双数页是异步加载,两个url在下面有写 77 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 78 | //Url: "http://search.jd.com/Search?keyword=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&s=1&click=0&page=" + strconv.Itoa(pageNum), 79 | //Url: "http://search.jd.com/s_new.php?keyword=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&s=31&scrolling=y&pos=30&page=" + strconv.Itoa(pageNum), 80 | pageCount := aid["PageCount"].(int) 81 | 82 | for i := 1; i < pageCount; i++ { 83 | ctx.AddQueue( 84 | &request.Request{ 85 | Url: "http://search.jd.com/Search?keyword=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&s=1&click=0&page=" + strconv.Itoa(i*2-1), 86 | Rule: "搜索结果", 87 | }, 88 | ) 89 | ctx.AddQueue( 90 | &request.Request{ 91 | Url: "http://search.jd.com/s_new.php?keyword=" + ctx.GetKeyin() + "&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&s=31&scrolling=y&pos=30&page=" + strconv.Itoa(i*2), 92 | Rule: "搜索结果", 93 | }, 94 | ) 95 | } 96 | return nil 97 | }, 98 | }, 99 | 100 | "搜索结果": { 101 | //从返回中解析出数据。注:异步返回的结果页面结构是和单数页的一样的,所以就一套解析就可以了。 102 | ItemFields: []string{ 103 | "标题", 104 | "价格", 105 | "评论数", 106 | "链接", 107 | }, 108 | ParseFunc: func(ctx *Context) { 109 | query := ctx.GetDom() 110 | 111 | query.Find(".gl-item").Each(func(i int, s *goquery.Selection) { 112 | // 获取标题 113 | a := 
s.Find(".p-name.p-name-type-2 > a") 114 | title := a.Text() 115 | 116 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 117 | // title = re.ReplaceAllStringFunc(title, strings.ToLower) 118 | title = re.ReplaceAllString(title, " ") 119 | title = strings.Trim(title, " \t\n") 120 | 121 | // 获取价格 122 | price := s.Find(".p-price > strong > i").Text() 123 | 124 | // 获取评论数 125 | //#J_goodsList > ul > li:nth-child(1) > div > div.p-commit 126 | discuss := s.Find(".p-commit > strong > a").Text() 127 | 128 | // 获取URL 129 | url, _ := a.Attr("href") 130 | url = "http:" + url 131 | 132 | // 结果存入Response中转 133 | if title != "" { 134 | ctx.Output(map[int]interface{}{ 135 | 0: title, 136 | 1: price, 137 | 2: discuss, 138 | 3: url, 139 | }) 140 | } 141 | }) 142 | }, 143 | }, 144 | }, 145 | }, 146 | } 147 | -------------------------------------------------------------------------------- /kaola/kaola.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | // net包 11 | // "net/http" //设置http.Header 12 | // "net/url" 13 | // 编码包 14 | // "encoding/xml" 15 | // "encoding/json" 16 | // 字符串处理包 17 | // "regexp" 18 | // "strconv" 19 | // "strings" 20 | // 其他包 21 | // "fmt" 22 | // "math" 23 | // "time" 24 | ) 25 | 26 | func init() { 27 | Kaola.Register() 28 | } 29 | 30 | // 考拉海淘,海外直采,7天无理由退货,售后无忧!考拉网放心的海淘网站! 31 | var Kaola = &Spider{ 32 | Name: "考拉海淘", 33 | Description: "考拉海淘商品数据 [Auto Page] [www.kaola.com]", 34 | // Pausetime: 300, 35 | // Keyin: KEYIN, 36 | // Limit: LIMIT, 37 | EnableCookie: false, 38 | RuleTree: &RuleTree{ 39 | Root: func(ctx *Context) { 40 | ctx.AddQueue(&request.Request{Url: "http://www.kaola.com", Rule: "获取版块URL"}) 41 | }, 42 | 43 | Trunk: map[string]*Rule{ 44 | 45 | "获取版块URL": { 46 | ParseFunc: func(ctx *Context) { 47 | query := ctx.GetDom() 48 | lis := query.Find("#funcTab li a") 49 | lis.Each(func(i int, s *goquery.Selection) { 50 | if i == 0 { 51 | return 52 | } 53 | if url, ok := s.Attr("href"); ok { 54 | ctx.AddQueue(&request.Request{Url: url, Rule: "商品列表", Temp: map[string]interface{}{"goodsType": s.Text()}}) 55 | } 56 | }) 57 | }, 58 | }, 59 | 60 | "商品列表": { 61 | ParseFunc: func(ctx *Context) { 62 | query := ctx.GetDom() 63 | query.Find(".proinfo").Each(func(i int, s *goquery.Selection) { 64 | if url, ok := s.Find("a").Attr("href"); ok { 65 | ctx.AddQueue(&request.Request{ 66 | Url: "http://www.kaola.com" + url, 67 | Rule: "商品详情", 68 | Temp: map[string]interface{}{"goodsType": ctx.GetTemp("goodsType", "").(string)}, 69 | }) 70 | } 71 | }) 72 | }, 73 | }, 74 | 75 | "商品详情": { 76 | //注意:有无字段语义和是否输出数据必须保持一致 77 | ItemFields: []string{ 78 | "标题", 79 | "价格", 80 | "品牌", 81 | "采购地", 82 | "评论数", 83 | "类别", 84 | }, 85 | ParseFunc: func(ctx *Context) { 86 | query := ctx.GetDom() 87 | // 获取标题 88 | title := query.Find(".product-title").Text() 89 | 90 | // 获取价格 91 | price := query.Find("#js_currentPrice span").Text() 92 | 93 | // 获取品牌 94 | brand := query.Find(".goods_parameter li").Eq(0).Text() 95 | 96 | // 获取采购地 97 | from := query.Find(".goods_parameter li").Eq(1).Text() 98 | 99 | // 获取评论数 100 | discussNum := query.Find("#commentCounts").Text() 101 | 102 | // 结果存入Response中转 103 | ctx.Output(map[int]interface{}{ 104 | 0: title, 
105 | 1: price, 106 | 2: brand, 107 | 3: from, 108 | 4: discussNum, 109 | 5: ctx.GetTemp("goodsType", ""), 110 | }) 111 | }, 112 | }, 113 | }, 114 | }, 115 | } 116 | -------------------------------------------------------------------------------- /lewa/lewa.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | // "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | "net/http" //设置http.Header 13 | // "net/url" 14 | // 编码包 15 | // "encoding/xml" 16 | // "encoding/json" 17 | // 字符串处理包 18 | // "regexp" 19 | // "strconv" 20 | // "strings" 21 | // 其他包 22 | // "fmt" 23 | // "math" 24 | // "time" 25 | ) 26 | 27 | func init() { 28 | Lewa.Register() 29 | } 30 | 31 | var Lewa = &Spider{ 32 | Name: "乐蛙登录测试", 33 | Description: "乐蛙登录测试 [Auto Page] [http://accounts.lewaos.com]", 34 | // Pausetime: 300, 35 | // Keyin: KEYIN, 36 | // Limit: LIMIT, 37 | EnableCookie: true, 38 | RuleTree: &RuleTree{ 39 | Root: func(ctx *Context) { 40 | ctx.AddQueue(&request.Request{Url: "http://accounts.lewaos.com/", Rule: "登录页"}) 41 | }, 42 | 43 | Trunk: map[string]*Rule{ 44 | 45 | "登录页": { 46 | ParseFunc: func(ctx *Context) { 47 | // ctx.AddQueue(&request.Request{ 48 | // Url: "http://accounts.lewaos.com", 49 | // Rule: "登录后", 50 | // Method: "POST", 51 | // PostData: "username=123456@qq.com&password=123456&login_btn=login_btn&submit=login_btn", 52 | // }) 53 | NewForm( 54 | ctx, 55 | "登录后", 56 | "http://accounts.lewaos.com", 57 | ctx.GetDom().Find(".userlogin.lw-pl40"), 58 | ).Inputs(map[string]string{ 59 | "username": "", 60 | "password": "", 61 | }).Submit() 62 | }, 63 | }, 64 | "登录后": { 65 | ParseFunc: func(ctx *Context) { 66 | // 结果存入Response中转 67 | ctx.Output(map[string]interface{}{ 68 | "Body": ctx.GetText(), 69 | "Cookie": ctx.GetCookie(), 70 | }) 71 | ctx.AddQueue(&request.Request{ 72 | Url: "http://accounts.lewaos.com/member", 73 | Rule: "个人中心", 74 | Header: http.Header{"Referer": []string{ctx.GetUrl()}}, 75 | }) 76 | }, 77 | }, 78 | "个人中心": { 79 | ParseFunc: func(ctx *Context) { 80 | // 结果存入Response中转 81 | ctx.Output(map[string]interface{}{ 82 | "Body": ctx.GetText(), 83 | "Cookie": ctx.GetCookie(), 84 | }) 85 | }, 86 | }, 87 | }, 88 | }, 89 | } 90 | -------------------------------------------------------------------------------- /miyabaobei/miyabaobei.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | Miyabaobei.Register() 31 | } 32 | 33 | var Miyabaobei = &Spider{ 34 | Name: "蜜芽宝贝", 35 | Description: "蜜芽宝贝商品数据 [Auto Page] [www.miyabaobei.com]", 36 | // Pausetime: 300, 37 | // Keyin: KEYIN, 38 | // Limit: LIMIT, 39 | EnableCookie: false, 40 | RuleTree: &RuleTree{ 41 | Root: func(ctx *Context) { 42 | ctx.AddQueue(&request.Request{Url: "http://www.miyabaobei.com/", Rule: "获取版块URL"}) 43 | }, 44 | 45 | Trunk: map[string]*Rule{ 46 | 47 | "获取版块URL": { 48 | ParseFunc: func(ctx *Context) { 49 | query := ctx.GetDom() 50 | lis := query.Find(".ccon") 51 | lis.Each(func(i int, s *goquery.Selection) { 52 | s.Find("a").Each(func(n int, ss *goquery.Selection) { 53 | if url, ok := ss.Attr("href"); ok { 54 | if !strings.Contains(url, "http://www.miyabaobei.com") { 55 | url = "http://www.miyabaobei.com" + url 56 | } 57 | ctx.Aid(map[string]interface{}{ 58 | "loop": [2]int{0, 1}, 59 | "urlBase": url, 60 | "req": map[string]interface{}{ 61 | "Rule": "生成请求", 62 | "Temp": map[string]interface{}{"baseUrl": url}, 63 | }, 64 | }, "生成请求") 65 | } 66 | }) 67 | }) 68 | }, 69 | }, 70 | 71 | "生成请求": { 72 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 73 | req := aid["req"].(*request.Request) 74 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 75 | req.Url = aid["urlBase"].(string) + "&per_page=" + strconv.Itoa(loop[0]*40) 76 | ctx.AddQueue(req) 77 | } 78 | return nil 79 | }, 80 | ParseFunc: func(ctx *Context) { 81 | query := ctx.GetDom() 82 | totalPage := "1" 83 | 84 | urls := query.Find(".Lpage.page p a") 85 | 86 | if urls.Length() != 0 { 87 | if urls.Last().Text() == ">" { 88 | totalPage = urls.Eq(urls.Length() - 2).Text() 89 | } else { 90 | totalPage = urls.Last().Text() 91 | } 92 | } 93 | total, _ := strconv.Atoi(totalPage) 94 | 95 | // 调用指定规则下辅助函数 96 | ctx.Aid(map[string]interface{}{ 97 | "loop": [2]int{1, total}, 98 | "ruleBase": ctx.GetTemp("baseUrl", "").(string), 99 | "rep": map[string]interface{}{ 100 | "Rule": "商品列表", 101 | }, 102 | }) 103 | // 用指定规则解析响应流 104 | ctx.Parse("商品列表") 105 | }, 106 | }, 107 | 108 | "商品列表": { 109 | //注意:有无字段语义和是否输出数据必须保持一致 110 | ItemFields: []string{ 111 | "标题", 112 | "价格", 113 | "类别", 114 | }, 115 | ParseFunc: func(ctx *Context) { 116 | query := ctx.GetDom() 117 | //获取品类 118 | goodsType := query.Find(".crumbs").Text() 119 | re, _ := regexp.Compile("\\s") 120 | goodsType = re.ReplaceAllString(goodsType, "") 121 | re, _ = regexp.Compile("蜜芽宝贝>") 122 | goodsType = re.ReplaceAllString(goodsType, "") 123 | query.Find(".bmfo").Each(func(i int, s *goquery.Selection) { 124 | // 获取标题 125 | title, _ := s.Find("p a").First().Attr("title") 126 | 127 | // 获取价格 128 | price := s.Find(".f20").Text() 129 | 130 | // 结果存入Response中转 131 | ctx.Output(map[int]interface{}{ 132 | 0: title, 133 | 1: price, 134 | 2: goodsType, 135 | }) 136 | }) 137 | }, 138 | }, 139 | }, 140 | }, 141 | } 142 | -------------------------------------------------------------------------------- /people/people.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "log" 6 | 7 | // "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 8 | 
"github.com/henrylee2cn/pholcus/app/downloader/request" //必需 9 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 11 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 12 | 13 | // net包 14 | // "net/http" //设置http.Header 15 | // "net/url" 16 | 17 | // 编码包 18 | 19 | // "encoding/xml" 20 | "encoding/json" 21 | // 字符串处理包 22 | // "regexp" 23 | // "strconv" 24 | // "strings" 25 | // 其他包 26 | // "fmt" 27 | // "math" 28 | // "time" 29 | ) 30 | 31 | func init() { 32 | People.Register() 33 | } 34 | 35 | type Item struct { 36 | Id string `json:"id"` 37 | Title string `json:"title"` 38 | Url string `json:"url"` 39 | Date string `json:"date"` 40 | NodeId string `json:"nodeId"` 41 | ImgCount string `json:"imgCount"` 42 | } 43 | type News struct { 44 | Items []Item `json:"items"` 45 | } 46 | 47 | var news News 48 | 49 | var People = &Spider{ 50 | Name: "人民网新闻抓取", 51 | Description: "人民网最新分类新闻", 52 | // Pausetime: 300, 53 | // Keyin: KEYIN, 54 | // Limit: LIMIT, 55 | EnableCookie: false, 56 | RuleTree: &RuleTree{ 57 | Root: func(ctx *Context) { 58 | ctx.AddQueue(&request.Request{ 59 | Method: "GET", 60 | Url: "http://news.people.com.cn/210801/211150/index.js?cache=false", 61 | Rule: "新闻列表", 62 | }) 63 | }, 64 | 65 | Trunk: map[string]*Rule{ 66 | "新闻列表": { 67 | ParseFunc: func(ctx *Context) { 68 | 69 | //query := ctx.GetDom() 70 | //str := query.Find("body").Text() 71 | 72 | //str := `{"items":[{"id":"282","title":"人社 转型升级"战术"手册","url":"ht","date":"201","nodeId":"1001","imgCount":"4"}]}` 73 | 74 | str := ctx.GetText() 75 | 76 | err := json.Unmarshal([]byte(str), &news) 77 | if err != nil { 78 | log.Printf("解析错误: %v\n", err) 79 | return 80 | } 81 | ///////////////// 82 | newsLength := len(news.Items) 83 | for i := 0; i < newsLength; i++ { 84 | ctx.AddQueue(&request.Request{ 85 | Url: news.Items[i].Url, 86 | Rule: "热点新闻", 87 | Temp: map[string]interface{}{ 88 | "id": news.Items[i].Id, 89 | "title": news.Items[i].Title, 90 | "date": news.Items[i].Date, 91 | "newsType": news.Items[i].NodeId, 92 | }, 93 | }) 94 | } 95 | ///////////////// 96 | }, 97 | }, 98 | 99 | "热点新闻": { 100 | //注意:有无字段语义和是否输出数据必须保持一致 101 | ItemFields: []string{ 102 | "ID", 103 | "标题", 104 | "内容", 105 | "类别", 106 | "ReleaseTime", 107 | }, 108 | ParseFunc: func(ctx *Context) { 109 | query := ctx.GetDom() 110 | 111 | // 获取内容 112 | content := query.Find("#p_content").Text() 113 | // re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 114 | // content = re.ReplaceAllStringFunc(content, strings.ToLower) 115 | // content = re.ReplaceAllString(content, "") 116 | 117 | // 结果存入Response中转 118 | ctx.Output(map[int]interface{}{ 119 | 0: ctx.GetTemp("id", ""), 120 | 1: ctx.GetTemp("title", ""), 121 | 2: content, 122 | 3: ctx.GetTemp("newsType", ""), 123 | 4: ctx.GetTemp("date", ""), 124 | }) 125 | }, 126 | }, 127 | }, 128 | }, 129 | } 130 | -------------------------------------------------------------------------------- /pholcus_lib.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | import ( 4 | _ "github.com/henrylee2cn/pholcus_lib/IJGUC" 5 | _ "github.com/henrylee2cn/pholcus_lib/alibaba" 6 | _ "github.com/henrylee2cn/pholcus_lib/area_codes" 7 | _ "github.com/henrylee2cn/pholcus_lib/baidunews" 8 | _ "github.com/henrylee2cn/pholcus_lib/baidusearch" 9 | _ "github.com/henrylee2cn/pholcus_lib/car_home" 10 | _ "github.com/henrylee2cn/pholcus_lib/chinanews" 11 | _ "github.com/henrylee2cn/pholcus_lib/fang_resell_list" 12 | _ 
"github.com/henrylee2cn/pholcus_lib/filetest" 13 | _ "github.com/henrylee2cn/pholcus_lib/ganji_gongsi" 14 | _ "github.com/henrylee2cn/pholcus_lib/googlesearch" 15 | _ "github.com/henrylee2cn/pholcus_lib/hollandandbarrett" 16 | _ "github.com/henrylee2cn/pholcus_lib/jdsearch" 17 | _ "github.com/henrylee2cn/pholcus_lib/jiban" 18 | _ "github.com/henrylee2cn/pholcus_lib/jingdong" 19 | _ "github.com/henrylee2cn/pholcus_lib/kaola" 20 | _ "github.com/henrylee2cn/pholcus_lib/lewa" 21 | _ "github.com/henrylee2cn/pholcus_lib/miyabaobei" 22 | _ "github.com/henrylee2cn/pholcus_lib/people" 23 | _ "github.com/henrylee2cn/pholcus_lib/qq_avatar" 24 | _ "github.com/henrylee2cn/pholcus_lib/shunfenghaitao" 25 | _ "github.com/henrylee2cn/pholcus_lib/taobao" 26 | _ "github.com/henrylee2cn/pholcus_lib/taobaosearch" 27 | _ "github.com/henrylee2cn/pholcus_lib/wangyi" 28 | _ "github.com/henrylee2cn/pholcus_lib/weibo_fans" 29 | _ "github.com/henrylee2cn/pholcus_lib/wukongwenda" 30 | _ "github.com/henrylee2cn/pholcus_lib/zhihu_bianji" 31 | _ "github.com/henrylee2cn/pholcus_lib/zhihu_daily" 32 | _ "github.com/henrylee2cn/pholcus_lib/zolpc" 33 | _ "github.com/henrylee2cn/pholcus_lib/zolphone" 34 | _ "github.com/henrylee2cn/pholcus_lib/zolslab" 35 | ) 36 | -------------------------------------------------------------------------------- /qq_avatar/README.md: -------------------------------------------------------------------------------- 1 | ## QQ头像和昵称抓取和下载头像 2 | 3 | > 默认抓取1页 4 | -------------------------------------------------------------------------------- /qq_avatar/avatar.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . 
"github.com/henrylee2cn/pholcus/app/spider" //必需 7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 8 | "github.com/henrylee2cn/pholcus/logs" //信息输出 9 | // net包 10 | "net/http" //设置http.Header 11 | // "net/url" 12 | 13 | // 编码包 14 | // "encoding/xml" 15 | // "encoding/json" 16 | 17 | // 字符串处理包 18 | // "regexp" 19 | "strconv" 20 | "fmt" 21 | "strings" 22 | ) 23 | 24 | func init() { 25 | Avatar.Register() 26 | } 27 | 28 | var Avatar = &Spider{ 29 | 30 | Name: "QQ头像和昵称抓去和下载", 31 | Description: "QQ头像和昵称抓去和下载", 32 | // Pausetime: 300, 33 | Keyin: KEYIN, 34 | Limit: LIMIT, 35 | EnableCookie: false, 36 | NotDefaultField: true, 37 | RuleTree: &RuleTree{ 38 | Root: func(ctx *Context) { 39 | ctx.Aid(map[string]interface{}{"loop": [2]int{0, ctx.GetLimit()}, "Rule": "生成请求"}, "生成请求") 40 | }, 41 | 42 | Trunk: map[string]*Rule{ 43 | "生成请求": { 44 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 45 | var url string 46 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 47 | if loop[0] == 0 { 48 | url = "http://www.woyaogexing.com/touxiang/index.html" 49 | loop[0]++ 50 | } else { 51 | url = "http://www.woyaogexing.com/touxiang/index_" + strconv.Itoa(loop[0]+1) + ".html" 52 | } 53 | ctx.AddQueue(&request.Request{ 54 | Url: url, 55 | Rule: aid["Rule"].(string), 56 | Header: http.Header{"Content-Type": []string{"text/html; charset=utf-8"}}, 57 | }) 58 | } 59 | return nil 60 | }, 61 | ParseFunc: func(ctx *Context) { 62 | query := ctx.GetDom() 63 | // logs.Log.Debug(ctx.GetText()) 64 | pageTag := query.Find("div.pageNum.wp div.page a:last-child") 65 | // 跳转 66 | if len(pageTag.Nodes) == 0 { 67 | logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] \n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 68 | query.Find(".sm-floorhead-typemore a").Each(func(i int, s *goquery.Selection) { 69 | if href, ok := s.Attr("href"); ok { 70 | ctx.AddQueue(&request.Request{ 71 | Url: href, 72 | Header: http.Header{"Content-Type": []string{"text/html; charset=utf-8"}}, 73 | Rule: "搜索结果", 74 | }) 75 | } 76 | }) 77 | return 78 | } 79 | // 用指定规则解析响应流 80 | ctx.Parse("搜索结果") 81 | }, 82 | }, 83 | "搜索结果": { 84 | ItemFields: []string{ 85 | "avatar", 86 | "nickname", 87 | }, 88 | ParseFunc: func(ctx *Context) { 89 | query := ctx.GetDom() 90 | query.Find(".txList").Each(func(i int, selection *goquery.Selection) { 91 | src, _ := selection.Find("a.img>img").First().Attr("src") 92 | name := selection.Find("p>a").Text() 93 | fmt.Printf("nickname:%s \t url: %s\n", name, src) 94 | ctx.AddQueue(&request.Request{ 95 | Url: src, 96 | Rule: "下载文件", 97 | ConnTimeout: -1, 98 | DownloaderID: 0, 99 | }) 100 | str := strings.Split(src, "/") 101 | ctx.Output(map[int]interface{}{ 102 | 0: str[len(str)-1], 103 | 1: name, 104 | }) 105 | }) 106 | }, 107 | }, 108 | "下载文件": { 109 | ParseFunc: func(ctx *Context) { 110 | ctx.FileOutput() 111 | }, 112 | }, 113 | }, 114 | }, 115 | } 116 | 117 | -------------------------------------------------------------------------------- /shunfenghaitao/shunfenghaitao.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | // "strconv" 22 | // "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | Shunfenghaitao.Register() 31 | } 32 | 33 | // 进口母婴专区,买进口奶粉、尿裤尿布、辅食、营养、洗护、日用、母婴用品 - 顺丰海淘 34 | var Shunfenghaitao = &Spider{ 35 | Name: "顺丰海淘", 36 | Description: "顺丰海淘商品数据 [Auto Page] [www.sfht.com]", 37 | // Pausetime: 300, 38 | // Keyin: KEYIN, 39 | // Limit: LIMIT, 40 | EnableCookie: false, 41 | RuleTree: &RuleTree{ 42 | Root: func(ctx *Context) { 43 | ctx.AddQueue(&request.Request{Url: "http://www.sfht.com", Rule: "获取版块URL"}) 44 | }, 45 | 46 | Trunk: map[string]*Rule{ 47 | 48 | "获取版块URL": { 49 | ParseFunc: func(ctx *Context) { 50 | query := ctx.GetDom() 51 | 52 | lis := query.Find(".nav-c1").First().Find("li a") 53 | 54 | lis.Each(func(i int, s *goquery.Selection) { 55 | if i == 0 { 56 | return 57 | } 58 | if url, ok := s.Attr("href"); ok { 59 | ctx.AddQueue(&request.Request{Url: url, Rule: "商品列表", Temp: map[string]interface{}{"goodsType": s.Text()}}) 60 | } 61 | }) 62 | }, 63 | }, 64 | 65 | "商品列表": { 66 | ParseFunc: func(ctx *Context) { 67 | query := ctx.GetDom() 68 | 69 | query.Find(".cms-src-item").Each(func(i int, s *goquery.Selection) { 70 | if url, ok := s.Find("a").Attr("href"); ok { 71 | ctx.AddQueue(&request.Request{ 72 | Url: url, 73 | Rule: "商品详情", 74 | Temp: map[string]interface{}{"goodsType": ctx.GetTemp("goodsType", "").(string)}, 75 | }) 76 | } 77 | }) 78 | }, 79 | }, 80 | 81 | "商品详情": { 82 | //注意:有无字段语义和是否输出数据必须保持一致 83 | ItemFields: []string{ 84 | "标题", 85 | "品牌", 86 | "原产地", 87 | "货源地", 88 | "类别", 89 | }, 90 | ParseFunc: func(ctx *Context) { 91 | query := ctx.GetDom() 92 | 93 | // 获取标题 94 | title := query.Find("#titleInfo h1").Text() 95 | 96 | // 获取品牌 97 | brand := query.Find(".goods-c2 ul").Eq(0).Find("li").Eq(2).Text() 98 | re, _ := regexp.Compile(`品 牌`) 99 | brand = re.ReplaceAllString(brand, "") 100 | 101 | // 获取原产地 102 | from1 := query.Find("#detailattributes li").Eq(0).Text() 103 | 104 | // 获取货源地 105 | from2 := query.Find("#detailattributes li").Eq(1).Text() 106 | 107 | // 结果存入Response中转 108 | ctx.Output(map[int]interface{}{ 109 | 0: title, 110 | 1: brand, 111 | 2: from1, 112 | 3: from2, 113 | 4: ctx.GetTemp("goodsType", ""), 114 | }) 115 | }, 116 | }, 117 | }, 118 | }, 119 | } 120 | -------------------------------------------------------------------------------- /taobao/taobao.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 8 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 9 | "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | 11 | // net包 12 | "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | Taobao.Register() 31 | } 32 | 33 | var cookies_Taobao = "mt=ci%3D-1_0; swfstore=35673; thw=cn; cna=fcr5DRDmwnQCAT2QxZSu3Db6; sloc=%E8%BE%BD%E5%AE%81; _tb_token_=XLlMHhT9BI8IzeA; ck1=; v=0; uc3=nk2=symxAo6NBazVq7cY2z0%3D&id2=UU23CgHxOwgwgA%3D%3D&vt3=F8dAT%2BCFEEyTLicOBEc%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D; existShop=MTQzNDM1NDcyNg%3D%3D; lgc=%5Cu5C0F%5Cu7C73%5Cu7C92%5Cu559C%5Cu6B22%5Cu5927%5Cu6D77; tracknick=%5Cu5C0F%5Cu7C73%5Cu7C92%5Cu559C%5Cu6B22%5Cu5927%5Cu6D77; sg=%E6%B5%B721; cookie2=1433b814776e3b3c61f4ba3b8631a81a; cookie1=Bqbn0lh%2FkPm9D0NtnTdFiqggRYia%2FBrNeQpwLWlbyJk%3D; unb=2559173312; t=1a9b12bb535040723808836b32e53507; _cc_=WqG3DMC9EA%3D%3D; tg=5; _l_g_=Ug%3D%3D; _nk_=%5Cu5C0F%5Cu7C73%5Cu7C92%5Cu559C%5Cu6B22%5Cu5927%5Cu6D77; cookie17=UU23CgHxOwgwgA%3D%3D; mt=ci=0_1; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; whl=-1%260%260%260; uc1=lltime=1434353890&cookie14=UoW0FrfFYp27FQ%3D%3D&existShop=false&cookie16=V32FPkk%2FxXMk5UvIbNtImtMfJQ%3D%3D&cookie21=U%2BGCWk%2F7p4mBoUyTltGF&tag=7&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&pas=0; isg=C08C1D752BC08A3DCDF1FE6611FA3EE1; l=Ajk53TTUeK0ZKkG8yx7w7svcyasSxC34" 34 | 35 | var Taobao = &Spider{ 36 | Name: "淘宝数据", 37 | Description: "淘宝天猫商品数据 [Auto Page] [http://list.taobao.com/]", 38 | // Pausetime: 300, 39 | // Keyin: KEYIN, 40 | // Limit: LIMIT, 41 | EnableCookie: false, 42 | RuleTree: &RuleTree{ 43 | Root: func(ctx *Context) { 44 | ctx.AddQueue(&request.Request{ 45 | Url: "http://list.taobao.com/browse/cat-0.htm", 46 | Rule: "生成请求", 47 | Header: http.Header{ 48 | "Cookie": []string{cookies_Taobao}, 49 | }, 50 | }) 51 | }, 52 | 53 | Trunk: map[string]*Rule{ 54 | 55 | "生成请求": { 56 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 57 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 58 | for _, loc := range loc_Taobao { 59 | ctx.AddQueue(&request.Request{ 60 | Url: "http:" + aid["urlBase"].(string) + "&_input_charset=utf-8&json=on&viewIndex=1&as=0&atype=b&style=grid&same_info=1&tid=0&isnew=2&data-action&module=page&s=0&loc=" + loc + "&pSize=96&data-key=s&data-value=" + strconv.Itoa(loop[0]*96), 61 | Rule: aid["Rule"].(string), 62 | Header: http.Header{ 63 | "Cookie": []string{cookies_Taobao}, 64 | }, 65 | Temp: aid["Temp"].(map[string]interface{}), 66 | }) 67 | } 68 | } 69 | return nil 70 | }, 71 | ParseFunc: func(ctx *Context) { 72 | query := ctx.GetDom() 73 | query.Find(".J_TBMarketCat").Each(func(i int, a *goquery.Selection) { 74 | type1 := a.Find("h4").Text() 75 | a.Find(".section").Each(func(i int, b *goquery.Selection) { 76 | type2 := b.Find(".subtitle a").Text() 77 | b.Find(".sublist a").Each(func(i int, c *goquery.Selection) { 78 | type3 := c.Text() 79 | href3, _ := c.Attr("href") 80 | 81 | ctx.Aid(map[string]interface{}{ 82 | "loop": [2]int{0, 1}, 83 | "urlBase": href3, 84 | "Rule": "列表页数", 85 | "Temp": map[string]interface{}{ 86 | "type1": type1, 87 | "type2": type2, 88 | "type3": type3, 89 | }, 90 | }) 91 | }) 92 | }) 93 | }) 94 | }, 95 | }, 96 | 97 | "列表页数": { 98 | ParseFunc: func(ctx *Context) { 99 | json := ctx.GetText() 100 
| re, _ := regexp.Compile(`(?U)"totalPage":"[\d]+",`) 101 | total := re.FindString(json) 102 | re, _ = regexp.Compile(`[\d]+`) 103 | total = re.FindString(total) 104 | total = strings.Trim(total, " \t\n") 105 | totalPage, _ := strconv.Atoi(total) 106 | if total == "0" { 107 | logs.Log.Critical("[消息提示:| 任务:%v | 关键词:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 108 | } else { 109 | ctx.Aid(map[string]interface{}{ 110 | "loop": [2]int{1, totalPage}, 111 | "urlBase": ctx.GetUrl(), 112 | "Rule": "商品列表", 113 | "Temp": ctx.CopyTemps(), 114 | }, "生成请求") 115 | ctx.Parse("商品列表") 116 | } 117 | }, 118 | }, 119 | 120 | "商品列表": { 121 | ParseFunc: func(ctx *Context) { 122 | j := ctx.GetText() 123 | // re, _ := regexp.Compile(`null`) 124 | // j = re.ReplaceAllString(j, " ") 125 | 126 | infos := map[string]interface{}{} 127 | err := json.Unmarshal([]byte(j), &infos) 128 | if err != nil { 129 | logs.Log.Error("商品列表解析错误: %v\n", err) 130 | return 131 | } 132 | if infos["mallItemList"] == nil { 133 | logs.Log.Error("商品列表解析错误: 内容不存在!") 134 | return 135 | } 136 | for _, item := range infos["mallItemList"].([]interface{}) { 137 | item2 := item.(map[string]interface{}) 138 | temp := ctx.CreatItem(map[int]interface{}{ 139 | 0: item2["title"], 140 | 1: item2["price"], 141 | 2: item2["currentPrice"], 142 | 3: item2["vipPrice"], 143 | 4: item2["unitPrice"], 144 | 5: item2["unit"], 145 | 6: item2["isVirtual"], 146 | 7: item2["ship"], 147 | 8: item2["tradeNum"], 148 | 9: item2["formatedNum"], 149 | 10: item2["nick"], 150 | 11: item2["sellerId"], 151 | 12: item2["guarantee"], 152 | 13: item2["itemId"], 153 | 14: item2["isLimitPromotion"], 154 | 15: item2["loc"], 155 | 16: "http:" + item2["storeLink"].(string), 156 | 17: "http:" + item2["href"].(string), 157 | 18: item2["commend"], 158 | 19: item2["source"], 159 | 20: item2["ratesum"], 160 | 21: item2["goodRate"], 161 | 22: item2["dsrScore"], 162 | 23: item2["spSource"], 163 | }, "结果") 164 | ctx.AddQueue(&request.Request{ 165 | Url: "http:" + item2["href"].(string), 166 | Rule: "商品详情", 167 | Temp: temp, 168 | Priority: 1, 169 | }) 170 | } 171 | }, 172 | }, 173 | 174 | "商品详情": { 175 | 176 | ParseFunc: func(ctx *Context) { 177 | query := ctx.GetDom() 178 | 179 | // 商品规格参数 180 | detail := make(map[string]string) 181 | 182 | if li := query.Find(".attributes-list ul li"); len(li.Nodes) != 0 { 183 | // 天猫店宝贝详情 184 | li.Each(func(i int, s *goquery.Selection) { 185 | native := s.Text() 186 | slice := strings.Split(native, ": ") 187 | //空格替换为分隔号“|” 188 | slice[1] = strings.Replace(slice[1], " ", "|", -1) 189 | detail[slice[0]] = UnicodeToUTF8(slice[1]) 190 | }) 191 | 192 | } else { 193 | // 淘宝店宝贝详情 194 | query.Find(".attributes-list li").Each(func(i int, s *goquery.Selection) { 195 | native := s.Text() 196 | slice := strings.Split(native, ": ") 197 | detail[slice[0]] = slice[1] 198 | }) 199 | } 200 | 201 | temp := ctx.CopyTemps() 202 | temp[ctx.GetItemField(24, "结果")] = detail 203 | temp[ctx.GetItemField(25, "结果")] = []interface{}{} 204 | 205 | ctx.AddQueue(&request.Request{ 206 | Rule: "商品评论", 207 | Url: "http://rate.taobao.com/feedRateList.htm?siteID=4&rateType=&orderType=sort_weight&showContent=1&userNumId=" + 208 | ctx.GetTemp("sellerId", "").(string) + 209 | "&auctionNumId=" + 210 | ctx.GetTemp("itemId", "").(string) + 211 | "¤tPageNum=1", 212 | Temp: temp, 213 | Priority: 2, 214 | }) 215 | }, 216 | }, 217 | 218 | "商品评论": { 219 | ParseFunc: func(ctx *Context) { 220 | j := ctx.GetText() 221 | j = strings.TrimLeft(j, "(") 222 | j = 
strings.TrimRight(j, ")") 223 | 224 | infos := map[string]interface{}{} 225 | if err := json.Unmarshal([]byte(j), &infos); err != nil { 226 | logs.Log.Error("商品评论解析错误: %v\n", err) 227 | return 228 | } 229 | if infos["comments"] == nil || infos["maxPage"] == nil || infos["currentPageNum"] == nil { 230 | logs.Log.Error("商品评论解析错误: 内容不存在!") 231 | return 232 | } 233 | discussSlice := infos["comments"].([]interface{}) 234 | var discussAll = ctx.GetTemp(ctx.GetItemField(25, "结果"), []interface{}{}).([]interface{}) 235 | discussAll = append(discussAll, discussSlice...) 236 | temp := ctx.CopyTemps() 237 | temp[ctx.GetItemField(25, "结果")] = discussAll 238 | 239 | currentPageNum := infos["currentPageNum"].(int) 240 | maxPage := infos["maxPage"].(int) 241 | if currentPageNum < maxPage { 242 | // 请求下一页 243 | ctx.AddQueue(&request.Request{ 244 | Rule: "商品评论", 245 | Url: "http://rate.taobao.com/feedRateList.htm?siteID=4&rateType=&orderType=sort_weight&showContent=1&userNumId=" + 246 | ctx.GetTemp("sellerId", "").(string) + 247 | "&auctionNumId=" + 248 | ctx.GetTemp("itemId", "").(string) + 249 | "¤tPageNum=" + 250 | strconv.Itoa(currentPageNum+1), 251 | Temp: temp, 252 | }) 253 | } else { 254 | // 输出结果 255 | ctx.Parse("结果") 256 | } 257 | }, 258 | }, 259 | 260 | "结果": { 261 | //注意:有无字段语义和是否输出数据必须保持一致 262 | ItemFields: []string{ 263 | "标题", //title 264 | "原价", //price 265 | "现价", //currentPrice 266 | "会员价", //vipPrice 267 | "单价", //unitPrice 268 | "单位", //unit 269 | "是否虚拟物品", //isVirtual 270 | "ship", //ship 271 | "tradeNum", //tradeNum 272 | "formatedNum", //formatedNum 273 | "店铺", //nick 274 | "店铺ID", //sellerId 275 | "guarantee", //guarantee 276 | "货号", //itemId 277 | "isLimitPromotion", //isLimitPromotion 278 | "发货地", //loc 279 | "店铺链接", //storeLink 280 | "商品链接", //href 281 | "评价", //commend 282 | "source", //source 283 | "店铺信誉", //ratesum 284 | "店铺好评率", //goodRate 285 | "dsrScore", //dsrScore 286 | "spSource", //spSource 287 | "规格参数", 288 | "评论内容", 289 | }, 290 | ParseFunc: func(ctx *Context) { 291 | // 结果存入Response中转 292 | ctx.Output(ctx.CopyTemps()) 293 | }, 294 | }, 295 | }, 296 | }, 297 | } 298 | 299 | var ( 300 | loc_Taobao = map[string]string{ 301 | // "北京": "%E5%8C%97%E4%BA%AC", 302 | // "上海": "%E4%B8%8A%E6%B5%B7", 303 | // "广州": "%E5%B9%BF%E5%B7%9E", 304 | // "深圳": "%E6%B7%B1%E5%9C%B3", 305 | // "杭州": "%E6%9D%AD%E5%B7%9E", 306 | // "海外": "%E7%BE%8E%E5%9B%BD%2C%E8%8B%B1%E5%9B%BD%2C%E6%B3%95%E5%9B%BD%2C%E7%91%9E%E5%A3%AB%2C%E6%BE%B3%E6%B4%B2%2C%E6%96%B0%E8%A5%BF%E5%85%B0%2C%E5%8A%A0%E6%8B%BF%E5%A4%A7%2C%E5%A5%A5%E5%9C%B0%E5%88%A9%2C%E9%9F%A9%E5%9B%BD%2C%E6%97%A5%E6%9C%AC%2C%E5%BE%B7%E5%9B%BD%2C%E6%84%8F%E5%A4%A7%E5%88%A9%2C%E8%A5%BF%E7%8F%AD%E7%89%99%2C%E4%BF%84%E7%BD%97%E6%96%AF%2C%E6%B3%B0%E5%9B%BD%2C%E5%8D%B0%E5%BA%A6%2C%E8%8D%B7%E5%85%B0%2C%E6%96%B0%E5%8A%A0%E5%9D%A1%2C%E5%85%B6%E5%AE%83%E5%9B%BD%E5%AE%B6", 307 | // "江浙沪": "%E6%B1%9F%E8%8B%8F%2C%E6%B5%99%E6%B1%9F%2C%E4%B8%8A%E6%B5%B7", 308 | // "珠三角": "%E5%B9%BF%E5%B7%9E%2C%E6%B7%B1%E5%9C%B3%2C%E4%B8%AD%E5%B1%B1%2C%E7%8F%A0%E6%B5%B7%2C%E4%BD%9B%E5%B1%B1%2C%E4%B8%9C%E8%8E%9E%2C%E6%83%A0%E5%B7%9E", 309 | // "京津冀": "%E5%8C%97%E4%BA%AC%2C%E5%A4%A9%E6%B4%A5%2C%E6%B2%B3%E5%8C%97", 310 | // "东三省": "%E9%BB%91%E9%BE%99%E6%B1%9F%2C%E5%90%89%E6%9E%97%2C%E8%BE%BD%E5%AE%81", 311 | // "港澳台": "%E9%A6%99%E6%B8%AF%2C%E6%BE%B3%E9%97%A8%2C%E5%8F%B0%E6%B9%BE", 312 | // "江浙沪皖": "%E6%B1%9F%E8%8B%8F%2C%E6%B5%99%E6%B1%9F%2C%E4%B8%8A%E6%B5%B7%2C%E5%AE%89%E5%BE%BD", 313 | // "长沙": "%E9%95%BF%E6%B2%99", 314 | // "长春": "%E9%95%BF%E6%98%A5", 315 | // "成都": 
"%E6%88%90%E9%83%BD", 316 | // "重庆": "%E9%87%8D%E5%BA%86", 317 | // "大连": "%E5%A4%A7%E8%BF%9E", 318 | // "东莞": "%E4%B8%9C%E8%8E%9E", 319 | // "福州": "%E7%A6%8F%E5%B7%9E", 320 | // "合肥": "%E5%90%88%E8%82%A5", 321 | // "济南": "%E6%B5%8E%E5%8D%97", 322 | // "嘉兴": "%E5%98%89%E5%85%B4", 323 | // "昆明": "51108009&loc=%E6%98%86%E6%98%8E", 324 | // "宁波": "%E5%AE%81%E6%B3%A2", 325 | // "南京": "%E5%8D%97%E4%BA%AC", 326 | // "南昌": "%E5%8D%97%E6%98%8C", 327 | // "青岛": "%E9%9D%92%E5%B2%9B", 328 | // "苏州": "%E8%8B%8F%E5%B7%9E", 329 | // "沈阳": "%E6%B2%88%E9%98%B3", 330 | // "天津": "%E5%A4%A9%E6%B4%A5", 331 | // "温州": "%E6%B8%A9%E5%B7%9E", 332 | // "无锡": "%E6%97%A0%E9%94%A1", 333 | // "武汉": "%E6%AD%A6%E6%B1%89", 334 | // "西安": "%E8%A5%BF%E5%AE%89", 335 | // "厦门": "%E5%8E%A6%E9%97%A8", 336 | // "郑州": "%E9%83%91%E5%B7%9E", 337 | // "中山": "%E4%B8%AD%E5%B1%B1", 338 | // "石家庄": "%E7%9F%B3%E5%AE%B6%E5%BA%84", 339 | // "哈尔滨": "%E5%93%88%E5%B0%94%E6%BB%A8", 340 | // 省级 341 | // "安徽": "%E5%AE%89%E5%BE%BD", 342 | // "福建": "%E7%A6%8F%E5%BB%BA", 343 | // "甘肃": "%E7%94%98%E8%82%83", 344 | // "广东": "%E5%B9%BF%E4%B8%9C", 345 | // "广西": "%E5%B9%BF%E8%A5%BF", 346 | // "贵州": "%E8%B4%B5%E5%B7%9E", 347 | // "河北": "%E6%B2%B3%E5%8C%97", 348 | // "河南": "%E6%B2%B3%E5%8D%97", 349 | // "湖北": "%E6%B9%96%E5%8C%97", 350 | // "湖南": "%E6%B9%96%E5%8D%97", 351 | // "海南": "%E6%B5%B7%E5%8D%97", 352 | // "江苏": "%E6%B1%9F%E8%8B%8F", 353 | // "江西": "%E6%B1%9F%E8%A5%BF", 354 | // "吉林": "%E5%90%89%E6%9E%97", 355 | // "辽宁": "%E8%BE%BD%E5%AE%81", 356 | // "宁夏": "%E5%AE%81%E5%A4%8F", 357 | // "青海": "%E9%9D%92%E6%B5%B7", 358 | // "山东": "%E5%B1%B1%E4%B8%9C", 359 | // "山西": "%E5%B1%B1%E8%A5%BF", 360 | // "陕西": "%E9%99%95%E8%A5%BF", 361 | // "四川": "%E5%9B%9B%E5%B7%9D", 362 | // "西藏": "%E8%A5%BF%E8%97%8F", 363 | // "新疆": "%E6%96%B0%E7%96%86", 364 | // "云南": "%E4%BA%91%E5%8D%97", 365 | // "浙江": "%E6%B5%99%E6%B1%9F", 366 | // "澳门": "%E6%BE%B3%E9%97%A8", 367 | // "香港": "%E9%A6%99%E6%B8%AF", 368 | // "台湾": "%E5%8F%B0%E6%B9%BE", 369 | // "内蒙古": "%E5%86%85%E8%92%99%E5%8F%A4", 370 | // "黑龙江": "%E9%BB%91%E9%BE%99%E6%B1%9F", 371 | "": "", 372 | } 373 | ) 374 | -------------------------------------------------------------------------------- /taobaosearch/taobaosearch.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | // "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 6 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 7 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 8 | . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 9 | "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | "encoding/json" 18 | 19 | // 字符串处理包 20 | "regexp" 21 | "strconv" 22 | "strings" 23 | // 其他包 24 | // "fmt" 25 | // "math" 26 | // "time" 27 | ) 28 | 29 | func init() { 30 | TaobaoSearch.Register() 31 | } 32 | 33 | var TaobaoSearch = &Spider{ 34 | Name: "淘宝天猫搜索", 35 | Description: "淘宝天猫搜索结果 [s.taobao.com]", 36 | // Pausetime: 300, 37 | Keyin: KEYIN, 38 | Limit: LIMIT, 39 | EnableCookie: false, 40 | RuleTree: &RuleTree{ 41 | Root: func(ctx *Context) { 42 | ctx.Aid(map[string]interface{}{"loop": [2]int{0, 1}, "Rule": "生成请求"}, "生成请求") 43 | }, 44 | 45 | Trunk: map[string]*Rule{ 46 | 47 | "生成请求": { 48 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { 49 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { 50 | ctx.AddQueue(&request.Request{ 51 | Url: "http://s.taobao.com/search?q=" + ctx.GetKeyin() + "&ie=utf8&cps=yes&app=vproduct&cd=false&v=auction&tab=all&vlist=1&bcoffset=1&s=" + strconv.Itoa(loop[0]*44), 52 | Rule: aid["Rule"].(string), 53 | }) 54 | } 55 | return nil 56 | }, 57 | ParseFunc: func(ctx *Context) { 58 | query := ctx.GetDom() 59 | src := query.Find("script").Text() 60 | if strings.Contains(src, "抱歉!没有找到与") { 61 | logs.Log.Critical(" ********************** 淘宝关键词 [%v] 的搜索结果不存在! ********************** ", ctx.GetKeyin()) 62 | return 63 | } 64 | 65 | re, _ := regexp.Compile(`(?U)"totalCount":[\d]+}`) 66 | total := re.FindString(src) 67 | re, _ = regexp.Compile(`[\d]+`) 68 | total = re.FindString(total) 69 | totalCount, _ := strconv.Atoi(total) 70 | 71 | maxPage := (totalCount - 4) / 44 72 | if (totalCount-4)%44 > 0 { 73 | maxPage++ 74 | } 75 | 76 | if ctx.GetLimit() > maxPage || ctx.GetLimit() == 0 { 77 | ctx.SetLimit(maxPage) 78 | } else if ctx.GetLimit() == 0 { 79 | logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) 80 | return 81 | } 82 | 83 | logs.Log.Critical(" ********************** 淘宝关键词 [%v] 的搜索结果共有 %v 页,计划抓取 %v 页 **********************", ctx.GetKeyin(), maxPage, ctx.GetLimit()) 84 | // 调用指定规则下辅助函数 85 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, ctx.GetLimit()}, "Rule": "搜索结果"}) 86 | // 用指定规则解析响应流 87 | ctx.Parse("搜索结果") 88 | }, 89 | }, 90 | 91 | "搜索结果": { 92 | ParseFunc: func(ctx *Context) { 93 | query := ctx.GetDom() 94 | src := query.Find("script").Text() 95 | 96 | re, _ := regexp.Compile(`"auctions".*,"recommendAuctions"`) 97 | src = re.FindString(src) 98 | 99 | re, _ = regexp.Compile(`"auctions":`) 100 | src = re.ReplaceAllString(src, "") 101 | 102 | re, _ = regexp.Compile(`,"recommendAuctions"`) 103 | src = re.ReplaceAllString(src, "") 104 | 105 | re, _ = regexp.Compile("\\<[\\S\\s]+?\\>") 106 | // src = re.ReplaceAllStringFunc(src, strings.ToLower) 107 | src = re.ReplaceAllString(src, " ") 108 | 109 | src = strings.Trim(src, " \t\n") 110 | 111 | infos := []map[string]interface{}{} 112 | 113 | err := json.Unmarshal([]byte(src), &infos) 114 | 115 | if err != nil { 116 | logs.Log.Error("error is %v\n", err) 117 | return 118 | } else { 119 | for _, info := range infos { 120 | ctx.AddQueue(&request.Request{ 121 | Url: "http:" + info["detail_url"].(string), 122 | Rule: "商品详情", 123 | Temp: ctx.CreatItem(map[int]interface{}{ 124 | 0: info["raw_title"], 125 | 1: info["view_price"], 126 | 2: info["view_sales"], 127 | 3: info["nick"], 128 | 4: 
info["item_loc"], 129 | }, "商品详情"), 130 | Priority: 1, 131 | }) 132 | } 133 | } 134 | }, 135 | }, 136 | "商品详情": { 137 | //注意:有无字段语义和是否输出数据必须保持一致 138 | ItemFields: []string{ 139 | "标题", 140 | "价格", 141 | "销量", 142 | "店铺", 143 | "发货地", 144 | }, 145 | ParseFunc: func(ctx *Context) { 146 | r := ctx.CopyTemps() 147 | 148 | re := regexp.MustCompile(`"newProGroup":.*,"progressiveSupport"`) 149 | d := re.FindString(ctx.GetText()) 150 | 151 | if d == "" { 152 | h, _ := ctx.GetDom().Find(".attributes-list").Html() 153 | d = UnicodeToUTF8(h) 154 | d = strings.Replace(d, " ", " ", -1) 155 | d = CleanHtml(d, 5) 156 | d = strings.Replace(d, "产品参数:\n", "", -1) 157 | 158 | for _, v := range strings.Split(d, "\n") { 159 | if v == "" { 160 | continue 161 | } 162 | feild := strings.Split(v, ":") 163 | // 去除英文空格 164 | // feild[0] = strings.Trim(feild[0], " ") 165 | // feild[1] = strings.Trim(feild[1], " ") 166 | // 去除中文空格 167 | feild[0] = strings.Trim(feild[0], " ") 168 | feild[1] = strings.Trim(feild[1], " ") 169 | 170 | if feild[0] == "" || feild[1] == "" { 171 | continue 172 | } 173 | 174 | ctx.UpsertItemField(feild[0]) 175 | r[feild[0]] = feild[1] 176 | } 177 | 178 | } else { 179 | d = strings.Replace(d, `"newProGroup":`, "", -1) 180 | d = strings.Replace(d, `,"progressiveSupport"`, "", -1) 181 | 182 | infos := []map[string]interface{}{} 183 | 184 | err := json.Unmarshal([]byte(d), &infos) 185 | 186 | if err != nil { 187 | logs.Log.Error("error is %v\n", err) 188 | return 189 | } else { 190 | for _, info := range infos { 191 | for _, attr := range info["attrs"].([]interface{}) { 192 | a := attr.(map[string]interface{}) 193 | ctx.UpsertItemField(a["name"].(string)) 194 | r[a["name"].(string)] = a["value"] 195 | } 196 | } 197 | } 198 | } 199 | 200 | ctx.Output(r) 201 | }, 202 | }, 203 | }, 204 | }, 205 | } 206 | -------------------------------------------------------------------------------- /wangyi/wangyi.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . 
"github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | // "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | 17 | // "encoding/xml" 18 | // "encoding/json" 19 | 20 | // 字符串处理包 21 | "regexp" 22 | // "strconv" 23 | "strings" 24 | // 其他包 25 | // "fmt" 26 | // "math" 27 | // "time" 28 | ) 29 | 30 | func init() { 31 | Wangyi.Register() 32 | } 33 | 34 | var Wangyi = &Spider{ 35 | Name: "网易新闻", 36 | Description: "网易排行榜新闻,含点击/跟帖排名 [Auto Page] [news.163.com/rank]", 37 | // Pausetime: 300, 38 | // Keyin: KEYIN, 39 | // Limit: LIMIT, 40 | EnableCookie: false, 41 | RuleTree: &RuleTree{ 42 | Root: func(ctx *Context) { 43 | ctx.AddQueue(&request.Request{Url: "http://news.163.com/rank/", Rule: "排行榜主页"}) 44 | }, 45 | 46 | Trunk: map[string]*Rule{ 47 | 48 | "排行榜主页": { 49 | ParseFunc: func(ctx *Context) { 50 | query := ctx.GetDom() 51 | query.Find(".subNav a").Each(func(i int, s *goquery.Selection) { 52 | if url, ok := s.Attr("href"); ok { 53 | ctx.AddQueue(&request.Request{Url: url, Rule: "新闻排行榜"}) 54 | } 55 | }) 56 | }, 57 | }, 58 | 59 | "新闻排行榜": { 60 | ParseFunc: func(ctx *Context) { 61 | topTit := []string{ 62 | "1小时前点击排行", 63 | "24小时点击排行", 64 | "本周点击排行", 65 | "今日跟帖排行", 66 | "本周跟帖排行", 67 | "本月跟贴排行", 68 | } 69 | query := ctx.GetDom() 70 | // 获取新闻分类 71 | newsType := query.Find(".titleBar h2").Text() 72 | 73 | urls_top := map[string]string{} 74 | 75 | query.Find(".tabContents").Each(func(n int, t *goquery.Selection) { 76 | t.Find("tr").Each(func(i int, s *goquery.Selection) { 77 | // 跳过标题栏 78 | if i == 0 { 79 | return 80 | } 81 | // 内容链接 82 | url, ok := s.Find("a").Attr("href") 83 | 84 | // 排名 85 | top := s.Find(".cBlue").Text() 86 | 87 | if ok { 88 | urls_top[url] += topTit[n] + ":" + top + "," 89 | } 90 | }) 91 | }) 92 | for k, v := range urls_top { 93 | ctx.AddQueue(&request.Request{ 94 | Url: k, 95 | Rule: "热点新闻", 96 | Temp: map[string]interface{}{ 97 | "newsType": newsType, 98 | "top": v, 99 | }, 100 | }) 101 | } 102 | }, 103 | }, 104 | 105 | "热点新闻": { 106 | //注意:有无字段语义和是否输出数据必须保持一致 107 | ItemFields: []string{ 108 | "标题", 109 | "内容", 110 | "排名", 111 | "类别", 112 | "ReleaseTime", 113 | }, 114 | ParseFunc: func(ctx *Context) { 115 | query := ctx.GetDom() 116 | 117 | // 若有多页内容,则获取阅读全文的链接并获取内容 118 | if pageAll := query.Find(".ep-pages-all"); len(pageAll.Nodes) != 0 { 119 | if pageAllUrl, ok := pageAll.Attr("href"); ok { 120 | ctx.AddQueue(&request.Request{ 121 | Url: pageAllUrl, 122 | Rule: "热点新闻", 123 | Temp: ctx.CopyTemps(), 124 | }) 125 | } 126 | return 127 | } 128 | 129 | // 获取标题 130 | title := query.Find("#h1title").Text() 131 | 132 | // 获取内容 133 | content := query.Find("#endText").Text() 134 | re, _ := regexp.Compile("\\<[\\S\\s]+?\\>") 135 | // content = re.ReplaceAllStringFunc(content, strings.ToLower) 136 | content = re.ReplaceAllString(content, "") 137 | 138 | // 获取发布日期 139 | release := query.Find(".ep-time-soure").Text() 140 | release = strings.Split(release, "来源:")[0] 141 | release = strings.Trim(release, " \t\n") 142 | 143 | // 结果存入Response中转 144 | ctx.Output(map[int]interface{}{ 145 | 0: title, 146 | 1: content, 147 | 2: ctx.GetTemp("top", ""), 148 | 3: ctx.GetTemp("newsType", ""), 149 | 4: release, 150 | }) 151 | }, 152 | }, 153 | }, 154 | }, 155 | } 156 | -------------------------------------------------------------------------------- /weibo_fans/weibo_fans.go: -------------------------------------------------------------------------------- 1 | package pholcus_lib 2 | 3 | // 基础包 4 | import ( 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" 
//必需 6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 7 | . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 8 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 9 | "github.com/henrylee2cn/pholcus/logs" //信息输出 10 | 11 | // net包 12 | "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | // "regexp" 21 | "strconv" 22 | "strings" 23 | 24 | // 其他包 25 | "fmt" 26 | // "math" 27 | // "time" 28 | // "io/ioutil" 29 | ) 30 | 31 | func init() { 32 | WeiboFans.Register() 33 | } 34 | 35 | var WeiboFans = &Spider{ 36 | Name: "微博粉丝列表", 37 | Description: `新浪微博粉丝 [自定义输入格式 "ID"::"Cookie"][最多支持250页,内设定时1~2s]`, 38 | Pausetime: 2000, 39 | Keyin: KEYIN, 40 | Limit: LIMIT, 41 | EnableCookie: true, 42 | RuleTree: &RuleTree{ 43 | Root: func(ctx *Context) { 44 | param := strings.Split(ctx.GetKeyin(), "::") 45 | if len(param) != 2 { 46 | logs.Log.Error("自定义输入的参数不正确!") 47 | return 48 | } 49 | id := strings.Trim(param[0], " ") 50 | cookie := strings.Trim(param[1], " ") 51 | 52 | var count1 = 250 53 | var count2 = 50 54 | if ctx.GetLimit() < count1 { 55 | count1 = ctx.GetLimit() 56 | } 57 | if ctx.GetLimit() < count2 { 58 | count2 = ctx.GetLimit() 59 | } 60 | for i := count1; i > 0; i-- { 61 | ctx.AddQueue(&request.Request{ 62 | Url: "http://weibo.com/" + id + "/fans?cfs=600&relate=fans&t=1&f=1&type=&Pl_Official_RelationFans__68_page=" + strconv.Itoa(i) + "#Pl_Official_RelationFans__68", 63 | Rule: "好友列表", 64 | Header: http.Header{"Cookie": []string{cookie}}, 65 | DownloaderID: 0, 66 | }) 67 | } 68 | for i := 1; i <= count2; i++ { 69 | ctx.AddQueue(&request.Request{ 70 | Url: "http://www.weibo.com/" + id + "/fans?cfs=&relate=fans&t=5&f=1&type=&Pl_Official_RelationFans__68_page=" + strconv.Itoa(i) + "#Pl_Official_RelationFans__68", 71 | Rule: "好友列表", 72 | Header: http.Header{"Cookie": []string{cookie}}, 73 | DownloaderID: 0, 74 | }) 75 | } 76 | }, 77 | 78 | Trunk: map[string]*Rule{ 79 | "好友列表": { 80 | ParseFunc: func(ctx *Context) { 81 | query := ctx.GetDom() 82 | fmt.Println(query.Find(".follow_list").Text()) 83 | query.Find(".follow_list .mod_info").Each(func(i int, s *goquery.Selection) { 84 | fmt.Println("222") 85 | name, _ := s.Find(".info_name a").Attr("title") 86 | fmt.Println(name) 87 | url, _ := s.Find(".info_name a").Attr("href") 88 | uid := strings.Replace(url, "/u", "", -1) 89 | uid = strings.Replace(uid, "/", "", -1) 90 | url = "http://weibo.com/p/100505" + uid + "/info?mod=pedit_more" 91 | var 认证 string = "" 92 | if _, isExist := s.Find(".info_name i").Attr("title"); isExist { 93 | 认证 = "认证" 94 | } 95 | 关注 := s.Find(".info_connect em a").Eq(0).Text() 96 | 粉丝 := s.Find(".info_connect em a").Eq(1).Text() 97 | 微博 := s.Find(".info_connect em a").Eq(2).Text() 98 | fmt.Println(关注, 粉丝, 微博) 99 | x := &request.Request{ 100 | Url: url, 101 | Rule: "好友资料", 102 | DownloaderID: 0, 103 | Temp: map[string]interface{}{ 104 | "好友名": name, 105 | "好友ID": uid, 106 | "认证": 认证, 107 | "关注": 关注, 108 | "粉丝": 粉丝, 109 | "微博": 微博, 110 | }, 111 | } 112 | ctx.AddQueue(x) 113 | }) 114 | }, 115 | }, 116 | "好友资料": { 117 | ItemFields: []string{ 118 | "好友名", 119 | "好友ID", 120 | "认证", 121 | "关注", 122 | "粉丝", 123 | "微博", 124 | }, 125 | ParseFunc: func(ctx *Context) { 126 | query := ctx.GetDom() 127 | var 属性 map[string]string 128 | var title string 129 | var detail string 130 | query.Find(".li_1").Each(func(i int, s *goquery.Selection) { 131 | if 属性 == nil { 132 | 属性 = map[string]string{} 133 | } 134 | title = s.Find(".pt_title").Text() 135 | 
title = Deprive2(title) 136 | detail = s.Find(".pt_detail").Text() 137 | detail = Deprive2(detail) 138 | 属性[title] = detail 139 | }) 140 | 结果 := map[int]interface{}{ 141 | 0: ctx.GetTemp("好友名", ""), 142 | 1: ctx.GetTemp("好友ID", ""), 143 | 2: ctx.GetTemp("认证", ""), 144 | 3: ctx.GetTemp("关注", ""), 145 | 4: ctx.GetTemp("粉丝", ""), 146 | 5: ctx.GetTemp("微博", ""), 147 | } 148 | for k, v := range 属性 { 149 | idx := ctx.UpsertItemField(k) 150 | 结果[idx] = v 151 | } 152 | 153 | // 结果输出 154 | ctx.Output(结果) 155 | }, 156 | }, 157 | }, 158 | }, 159 | } 160 | -------------------------------------------------------------------------------- /wukongwenda/README.md: -------------------------------------------------------------------------------- 1 | ## 悟空问答每个专栏 2 | 3 | > 抓取悟空问答每个专栏的内容,只要不停止,就会不停的抓取 4 | -------------------------------------------------------------------------------- /wukongwenda/wukongwenda.go: -------------------------------------------------------------------------------- 1 | package wukongwenda 2 | 3 | import ( 4 | // 基础包 5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需 6 | //"github.com/henrylee2cn/pholcus/common/goquery" //DOM解析 7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出 8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需 9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用 10 | 11 | // net包 12 | "net/http" //设置http.Header 13 | // "net/url" 14 | 15 | // 编码包 16 | // "encoding/xml" 17 | // "encoding/json" 18 | 19 | // 字符串处理包 20 | // "regexp" 21 | "strconv" 22 | "strings" 23 | 24 | // 其他包 25 | // "math" 26 | "time" 27 | "github.com/tidwall/gjson" //引用的json处理的包 28 | 29 | ) 30 | 31 | func init() { 32 | WukongWenda.Register() 33 | } 34 | 35 | var domains = []string{ 36 | "6300775428692904450",//热门 37 | "6215497896830175745",//娱乐 38 | "6215497726554016258",//体育 39 | "6215497898671475202",//汽车 40 | "6215497899594222081",//科技 41 | "6215497900164647426",//育儿 42 | "6215497899774577154",//美食 43 | "6215497897518041601",//数码 44 | "6215497898084272641",//时尚 45 | "6215847700051528193",//宠物 46 | "6215847700907166210",//收藏 47 | "6215497901804620289",//家居 48 | "6281512530493835777",//心理 49 | "6215497897710979586",//更多 文化 50 | "6215847700454181377",//更多 三农 51 | "6215497895248923137",//更多 健康 52 | "6215848044378720770",//更多 科学 53 | "6215497899027991042",//更多 游戏 54 | "6215497895852902913",//更多 动漫 55 | "6215497897312520705",//更多 教育 56 | "6215497899963320834",//更多 职场 57 | "6215497897899723265",//更多 旅游 58 | "6215497900554717698",//更多 电影 59 | } 60 | 61 | const ( 62 | WUKONG_NORMAL_URL = "https://www.wukong.com/wenda/web/nativefeed/brow/?concern_id=" //不同栏目访问地址 63 | UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36" 64 | ) 65 | 66 | 67 | var WukongWenda = &Spider{ 68 | Name: "悟空问答", 69 | Description: "悟空问答 各个频道专栏问题", 70 | // Pausetime: 300, 71 | // Keyin: KEYIN, 72 | // Limit: LIMIT, 73 | EnableCookie: false, 74 | RuleTree: &RuleTree{ 75 | Root: func(ctx *Context) { 76 | //处理解析结构相同的领域 77 | for _, domain := range domains{ 78 | url := WUKONG_NORMAL_URL + domain + "&t=" + 79 | strconv.FormatInt(time.Now().UnixNano()/1e6, 10) 80 | header := http.Header{} 81 | header.Add("User-Agent", UA) 82 | 83 | ctx.AddQueue(&request.Request{ 84 | Url: url, 85 | Header: header, 86 | Rule: "获取结果", 87 | }) 88 | 89 | } 90 | }, 91 | 92 | 93 | Trunk: map[string]*Rule{ 94 | "获取结果": { 95 | //注意:有无字段语义和是否输出数据必须保持一致 96 | ItemFields: []string{ 97 | "问题标题", 98 | "问题描述", 99 | "问题回答", 100 | "问题url地址", 101 | }, 102 | ParseFunc: 
103 |
104 | type question struct {
105 | title string
106 | content string
107 | answer string
108 | url string
109 | offset string
110 | }
111 |
112 | var questionlist []question
113 | data := gjson.Get(ctx.GetText(), "data")
114 | more := gjson.Get(ctx.GetText(), "has_more").String()
115 |
116 | data.ForEach(func(key, value gjson.Result) bool {
117 | questionlist = append(questionlist,
118 | question{
119 | title: gjson.Get(value.String(), "question.title").String(),
120 | content: gjson.Get(value.String(), "question.content.text").String(),
121 | answer: gjson.Get(value.String(), "answer.content").String(),
122 | url: "https://www.wukong.com/question/" + gjson.Get(value.String(), "question.qid").String() + "/",
123 | offset: gjson.Get(value.String(), "behot_time").String(),
124 | })
125 | return true
126 | })
127 |
128 | if more == "true" && len(questionlist) > 0 { //防止列表为空时取末尾元素越界
129 | newOffset := questionlist[len(questionlist)-1].offset
130 | header := http.Header{}
131 | header.Add("User-Agent", UA)
132 |
133 | visit_url := ctx.GetUrl()
134 | if strings.Contains(visit_url, "&max_behot_time=") {
135 | visit_url = strings.Split(visit_url, "&max_behot_time=")[0]
136 | }
137 |
138 | ctx.AddQueue(&request.Request{
139 | Url: visit_url + "&max_behot_time=" + newOffset,
140 | Header: header,
141 | Rule: "获取结果",
142 | })
143 |
144 | }
145 |
146 | for _, v := range questionlist {
147 | ctx.Output(map[int]interface{}{
148 | 0: v.title,
149 | 1: v.content,
150 | 2: v.answer,
151 | 3: v.url,
152 | })
153 | }
154 |
155 | },
156 | },
157 | },
158 | },
159 | }
160 |
161 |
--------------------------------------------------------------------------------
/zhihu_bianji/README.md:
--------------------------------------------------------------------------------
1 | ## 知乎编辑推荐
2 |
3 | > 目前抓取推荐专栏的问题和回答。
4 | > 能够翻页抓取,
5 | > 抓取的内容中的段落标签(`<p>`)、图片标签(`<img>`)等均原封不动地抓取过来,没做转义替换处理
6 | > 编辑中有两类文本,一类是知乎作家写的文章,一类是知乎用户回答的问题。这两类均抓取了
7 | > 支持设定最少采集 url 数:手动输入的"采集上限"即作为最少采集数
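
The two kinds of text mentioned above are told apart purely by URL shape: question/answer links are routed to the 解析知乎问答落地页 rule, everything else to 解析知乎文章落地页. A minimal standalone sketch of that check; the regex mirrors `filterZhihuAnswerURL` in `zhihu_bianji.go` below, while the sample URLs are hypothetical:

```go
package main

import (
	"fmt"
	"regexp"
)

// Answer pages look like https://www.zhihu.com/question/<qid>[/answer/<aid>];
// links that don't match are treated as column articles.
var answerURL = regexp.MustCompile(`^https://www\.zhihu\.com/question/\d+(/answer/\d+)?$`)

func main() {
	for _, u := range []string{
		"https://www.zhihu.com/question/19550225",                 // question page  -> Q&A rule
		"https://www.zhihu.com/question/19550225/answer/12345678", // answer page    -> Q&A rule
		"https://zhuanlan.zhihu.com/p/12345678",                   // column article -> article rule
	} {
		fmt.Println(answerURL.MatchString(u), u)
	}
}
```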
--------------------------------------------------------------------------------
/zhihu_bianji/zhihu_bianji.go:
--------------------------------------------------------------------------------
1 | package zhihu_bianji
2 |
3 | // 基础包
4 | import (
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
7 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
8 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
9 | //"github.com/henrylee2cn/pholcus/logs" //信息输出
10 |
11 | // net包
12 | "net/http" //设置http.Header
13 | "net/url"
14 |
15 | // 编码包
16 | // "encoding/xml"
17 | "encoding/json"
18 |
19 | // 字符串处理包
20 | //"strconv"
21 |
22 | // 其他包
23 | // "fmt"
24 | // "time"
25 | //"strconv"
26 | "io/ioutil"
27 | "strings"
28 | "strconv"
29 | "regexp"
30 | "math"
31 | )
32 |
33 | func init() {
34 | ZhihuBianji.Register()
35 | }
36 |
37 | // var urlList []string //未使用
38 |
39 | var ZhihuBianji = &Spider{
40 | Name: "知乎编辑推荐",
41 | Description: "知乎编辑推荐",
42 | Pausetime: 300,
43 | //Keyin: KEYIN,
44 | Limit: LIMIT,
45 | EnableCookie: false,
46 | RuleTree: &RuleTree{
47 | Root: func(ctx *Context) {
48 | ctx.AddQueue(&request.Request{
49 | Url: "https://www.zhihu.com/explore/recommendations",
50 | Rule: "知乎编辑推荐",
51 | })
52 |
53 |
54 | },
55 |
56 | Trunk: map[string]*Rule{
57 | "知乎编辑推荐": {
58 | ParseFunc: func(ctx *Context) {
59 | query := ctx.GetDom()
60 | regular := "#zh-recommend-list-full .zh-general-list .zm-item h2 a"
61 | query.Find(regular).
62 | Each(func(i int, s *goquery.Selection) {
63 | if url, ok := s.Attr("href"); ok {
64 | url = changeToAbspath(url)
65 | ctx.AddQueue(&request.Request{Url: url, Rule: "解析落地页"})
66 | }})
67 |
68 | limit := ctx.GetLimit()
69 |
70 | if len(query.Find(regular).Nodes) < limit {
71 | total := int(math.Ceil(float64(limit) / float64(20)))
72 | ctx.Aid(map[string]interface{}{
73 | "loop": [2]int{1, total},
74 | "Rule": "知乎编辑推荐翻页",
75 | }, "知乎编辑推荐翻页")
76 | }
77 | },
78 | },
79 |
80 | "知乎编辑推荐翻页": {
81 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
82 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
83 | offset := loop[0] * 20
84 | header := make(http.Header)
85 | header.Set("Content-Type", "application/x-www-form-urlencoded")
86 | ctx.AddQueue(&request.Request{
87 | Url: "https://www.zhihu.com/node/ExploreRecommendListV2",
88 | Rule: aid["Rule"].(string),
89 | Method: "POST",
90 | Header: header,
91 | PostData: url.Values{"method":{"next"}, "params":{`{"limit":20,"offset":` + strconv.Itoa(offset) + `}`}}.Encode(),
92 | Reloadable: true,
93 | })
94 | }
95 |
96 | return nil
97 | },
98 | ParseFunc: func(ctx *Context) {
99 | type Items struct {
100 | R int `json:"r"`
101 | Msg []interface{} `json:"msg"`
102 | }
103 |
104 | content, err := ioutil.ReadAll(ctx.GetResponse().Body)
105 |
106 | ctx.GetResponse().Body.Close()
107 |
108 | if err != nil {
109 | ctx.Log().Error(err.Error()); return // 读取失败则提前返回
110 | }
111 |
112 | e := new(Items)
113 |
114 | if err = json.Unmarshal(content, e); err != nil { ctx.Log().Error(err.Error()); return } // 反序列化失败则提前返回
115 |
116 | html := ""
117 |
118 | for _, v := range e.Msg {
119 | msg, ok := v.(string)
120 | if ok {
121 | html = html + "\n" + msg
122 | }
123 | }
124 |
125 |
126 | ctx = ctx.ResetText(html)
127 |
128 | query := ctx.GetDom()
129 |
130 | query.Find(".zm-item h2 a").Each(func(i int, selection *goquery.Selection){
131 | if url, ok := selection.Attr("href"); ok {
132 | url = changeToAbspath(url)
133 | if filterZhihuAnswerURL(url){
134 | ctx.AddQueue(&request.Request{Url: url, Rule: "解析知乎问答落地页"})
135 | }else{
136 | ctx.AddQueue(&request.Request{Url: url, Rule: "解析知乎文章落地页"})
137 | }
138 | }
139 | })
140 |
141 | },
142 | },
143 |
144 | "解析知乎问答落地页": {
145 | ItemFields: []string{
146 | "标题",
147 | "提问内容",
148 | "回答内容",
149 | },
150 | ParseFunc: func(ctx *Context) {
151 | query := ctx.GetDom()
152 |
153 | questionHeader := query.Find(".QuestionPage .QuestionHeader .QuestionHeader-content")
154 | //headerSide := questionHeader.Find(".QuestionHeader-side")
155 | headerMain := questionHeader.Find(".QuestionHeader-main")
156 |
157 | // 获取问题标题
158 | title := headerMain.Find(".QuestionHeader-title").Text()
159 |
160 | // 获取问题描述
161 | content := headerMain.Find(".QuestionHeader-detail span").Text()
162 |
163 | answerMain := query.Find(".QuestionPage .Question-main")
164 |
165 | answer, _ := answerMain.Find(".AnswerCard .QuestionAnswer-content .ContentItem .RichContent .RichContent-inner").First().Html()
166 |
167 | // 结果存入Response中转
168 | ctx.Output(map[int]interface{}{
169 | 0: title,
170 | 1: content,
171 | 2: answer,
172 | })
173 |
174 | },
175 | },
176 |
177 | "解析知乎文章落地页": {
178 | ItemFields: []string{
179 | "标题",
180 | "内容",
181 | },
182 | ParseFunc: func(ctx *Context) {
183 | query := ctx.GetDom()
184 |
185 | // 获取问题标题
186 | title, _ := query.Find(".PostIndex-title.av-paddingSide.av-titleFont").Html()
187 |
188 | // 获取问题描述
189 | content, _ := query.Find(".RichText.PostIndex-content.av-paddingSide.av-card").Html()
190 |
191 | // 结果存入Response中转
192 | ctx.Output(map[int]interface{}{
193 | 0: title,
194 | 1: content,
195 | })
196 |
197 | },
198 | },
199 | },
200 | },
201 | }
202 |
203 | //将相对路径替换为绝对路径
204 | func changeToAbspath(url string) string {
205 | if strings.HasPrefix(url, "https://") {
206 | return url
207 | }
208 | return "https://www.zhihu.com" + url
209 | }
210 |
211 | //判断是用户回答的问题,还是知乎专栏作家书写的文章
212 | func filterZhihuAnswerURL(url string) bool {
213 | return regexp.MustCompile(`^https:\/\/www\.zhihu\.com\/question\/\d{1,}(\/answer\/\d{1,})?$`).MatchString(url)
214 | }
--------------------------------------------------------------------------------
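The paging POST that `知乎编辑推荐翻页` assembles above is easy to misread in the dump, so here is the same request body in isolation. A minimal sketch; the endpoint and parameter names are taken from the spider above, not re-verified against the live site:

```go
package main

import (
	"fmt"
	"net/url"
	"strconv"
)

// buildPageBody reproduces the form body POSTed to
// /node/ExploreRecommendListV2: method=next plus a JSON "params"
// string carrying the offset, 20 recommendations per page.
func buildPageBody(page int) string {
	return url.Values{
		"method": {"next"},
		"params": {`{"limit":20,"offset":` + strconv.Itoa(page*20) + `}`},
	}.Encode()
}

func main() {
	// page 1 -> offset 20, matching loop[0] = 1 in the AidFunc above
	fmt.Println(buildPageBody(1))
}
```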
/zhihu_daily/README.md:
--------------------------------------------------------------------------------
1 | ## 知乎每日推荐
2 |
3 | > 目前抓取知乎每日推荐的问题和回答。
4 | > 能够翻页抓取,
5 | > 抓取的内容中的段落标签(`<p>`)、图片标签(`<img>`)等均原封不动地抓取过来,没做转义替换处理
6 | > 支持设定最少采集 url 数:手动输入的"采集上限"即作为最少采集数
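
For clarity, the paging arithmetic used below: the spider treats the explore landing page as covering the first 15 entries (per its `limit > 15` guard), and past that queries `ExploreAnswerListV2` in steps of 5. A standalone sketch that mirrors the Root function in `zhihu_daily.go`:

```go
package main

import (
	"fmt"
	"math"
)

// pageOffsets mirrors zhihu_daily.go's Root: for Limit > 15 it schedules
// extra requests with offset = 5, 10, ..., stopping before ceil(limit/5)*5.
func pageOffsets(limit int) []int {
	if limit <= 15 {
		return nil // the landing page alone satisfies the limit
	}
	total := int(math.Ceil(float64(limit) / float64(5)))
	var offsets []int
	for i := 1; i < total; i++ {
		offsets = append(offsets, i*5)
	}
	return offsets
}

func main() {
	fmt.Println(pageOffsets(30)) // [5 10 15 20 25]
}
```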
--------------------------------------------------------------------------------
/zhihu_daily/zhihu_daily.go:
--------------------------------------------------------------------------------
1 | package zhihu_daily
2 |
3 | import (
4 | // 基础包
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出
8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
10 |
11 | // net包
12 | // "net/http" //设置http.Header
13 | // "net/url"
14 |
15 | // 编码包
16 | // "encoding/xml"
17 | // "encoding/json"
18 |
19 | // 字符串处理包
20 | // "regexp"
21 | "strings"
22 | // 其他包
23 | // "fmt"
24 | "math"
25 | "strconv"
26 | )
27 |
28 | func init() {
29 | ZhihuDaily.Register()
30 | }
31 |
32 | var ZhihuDaily = &Spider{
33 | Name: "知乎每日推荐",
34 | Description: "知乎每日推荐",
35 | Pausetime: 300,
36 | // Keyin: KEYIN,
37 | Limit: LIMIT,
38 | EnableCookie: false,
39 | RuleTree: &RuleTree{
40 | Root: func(ctx *Context) {
41 | ctx.AddQueue(&request.Request{
42 | Url:"https://www.zhihu_bianji.com/explore#daily-hot",
43 | Rule: "获取首页结果",
44 | Temp: map[string]interface{}{
45 | "target":"first",
46 | },
47 | })
48 |
49 | limit := ctx.GetLimit()
50 | if limit > 15{
51 | totalTimes := int(math.Ceil(float64(limit) / float64(5)))
52 | for i := 1; i < totalTimes; i++{
53 | offset := strconv.Itoa(i*5)
54 | ctx.AddQueue(&request.Request{
55 | Url: `https://www.zhihu.com/node/ExploreAnswerListV2?params={"offset":` + offset + `,"type":"day"}`,
56 | Rule: "获取首页结果",
57 | Temp: map[string]interface{}{
58 | "target": "next_page",
59 | },
60 | })
61 | }
62 | }
63 | },
64 |
65 | Trunk: map[string]*Rule{
66 | "获取首页结果": {
67 | ParseFunc: func(ctx *Context) {
68 | query := ctx.GetDom()
69 | target := ctx.GetTemps()["target"].(string)
70 | regular := "[data-type='daily'] .explore-feed.feed-item h2 a"
71 | if target == "next_page" {
72 | regular = ".explore-feed.feed-item h2 a"
73 | }
74 |
75 | query.Find(regular).
76 | Each(func(i int, selection *goquery.Selection) {
77 | url, isExist := selection.Attr("href")
78 | if isExist {
79 | url = changeToAbspath(url)
80 | ctx.AddQueue(&request.Request{Url: url, Rule: "解析落地页"})
81 | }
82 | })
83 | },
84 | },
85 |
86 | "解析落地页": {
87 | ItemFields: []string{
88 | "标题",
89 | "提问内容",
90 | "回答内容",
91 | },
92 | ParseFunc: func(ctx *Context) {
93 | query := ctx.GetDom()
94 |
95 | questionHeader := query.Find(".QuestionPage .QuestionHeader .QuestionHeader-content")
96 | //headerSide := questionHeader.Find(".QuestionHeader-side")
97 | headerMain := questionHeader.Find(".QuestionHeader-main")
98 |
99 | // 获取问题标题
100 | title := headerMain.Find(".QuestionHeader-title").Text()
101 |
102 | // 获取问题描述
103 | content := headerMain.Find(".QuestionHeader-detail span").Text()
104 |
105 | answerMain := query.Find(".QuestionPage .Question-main")
106 |
107 | answer, _ := answerMain.Find(".AnswerCard .QuestionAnswer-content .ContentItem .RichContent .RichContent-inner").First().Html()
108 |
109 | // 结果存入Response中转
110 | ctx.Output(map[int]interface{}{
111 | 0: title,
112 | 1: content,
113 | 2: answer,
114 | })
115 |
116 | },
117 | },
118 | },
119 | },
120 | }
121 |
122 | //将相对路径替换为绝对路径
123 | func changeToAbspath(url string) string {
124 | if strings.HasPrefix(url, "https://") {
125 | return url
126 | }
127 | return "https://www.zhihu.com" + url
128 | }
129 |
130 |
--------------------------------------------------------------------------------
/zolpc/zolpc.go:
--------------------------------------------------------------------------------
1 | package pholcus_lib
2 |
3 | import (
4 | // 基础包
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出
8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
10 |
11 | // net包
12 | // "net/http" //设置http.Header
13 | // "net/url"
14 |
15 | // 编码包
16 | // "encoding/xml"
17 | // "encoding/json"
18 |
19 | // 字符串处理包
20 | // "regexp"
21 | "strconv"
22 | // "strings"
23 | // 其他包
24 | // "fmt"
25 | // "math"
26 | // "time"
27 | )
28 |
29 | func init() {
30 | Zolpc.Register()
31 | }
32 |
33 | var Zolpc = &Spider{
34 | Name: "中关村笔记本",
35 | Description: "中关村笔记本数据 [Auto Page] [bbs.zol.com.cn/sjbbs/d544_p]",
36 | // Pausetime: 300,
37 | // Keyin: KEYIN,
38 | // Limit: LIMIT,
39 | EnableCookie: false,
40 | RuleTree: &RuleTree{
41 | Root: func(ctx *Context) {
42 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, 720}, "Rule": "生成请求"}, "生成请求")
43 | },
44 |
45 | Trunk: map[string]*Rule{
46 |
47 | "生成请求": {
48 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
49 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
50 | ctx.AddQueue(&request.Request{
51 | Url: "http://bbs.zol.com.cn/nbbbs/p" + strconv.Itoa(loop[0]) + ".html#c",
52 | Rule: aid["Rule"].(string),
53 | })
54 | }
55 | return nil
56 | },
57 | ParseFunc: func(ctx *Context) {
58 | query := ctx.GetDom()
59 | ss := query.Find("tbody").Find("tr[id]")
60 | ss.Each(func(i int, goq *goquery.Selection) {
61 | ctx.SetTemp("html", goq)
62 | ctx.Parse("获取结果")
63 | })
64 | },
65 | },
66 |
67 | "获取结果": {
68 | //注意:有无字段语义和是否输出数据必须保持一致
69 | ItemFields: []string{
70 | "机型",
71 | "链接",
72 | "主题",
73 | "发表者",
74 | "发表时间",
75 | "总回复",
76 | "总查看",
77 | "最后回复者",
78 | "最后回复时间",
79 | },
80 | ParseFunc: func(ctx *Context) {
81 | var selectObj = ctx.GetTemp("html", &goquery.Selection{}).(*goquery.Selection)
82 |
83 | //url
84 | outUrls := selectObj.Find("td").Eq(1)
85 | outUrl, _ := outUrls.Attr("data-url")
86 | outUrl = "http://bbs.zol.com.cn/" + outUrl
87 | //title type
88 | outTitles := selectObj.Find("td").Eq(1)
89 | outType := outTitles.Find(".iclass a").Text()
90 | outTitle := outTitles.Find("div a").Text()
91 |
92 | //author stime
93 | authors := selectObj.Find("td").Eq(2)
94 | author := authors.Find("a").Text()
95 | stime := authors.Find("span").Text()
96 |
97 | //reply read
98 | replys := selectObj.Find("td").Eq(3)
99 | reply := replys.Find("span").Text()
100 | read := replys.Find("i").Text()
101 |
102 | //ereply etime
103 | etimes := selectObj.Find("td").Eq(4)
104 | ereply := etimes.Find("a").Eq(0).Text()
105 | etime := etimes.Find("a").Eq(1).Text()
106 |
107 | // 结果存入Response中转
108 | ctx.Output(map[int]interface{}{
109 | 0: outType,
110 | 1: outUrl,
111 | 2: outTitle,
112 | 3: author,
113 | 4: stime,
114 | 5: reply,
115 | 6: read,
116 | 7: ereply,
117 | 8: etime,
118 | })
119 | },
120 | },
121 | },
122 | },
123 | }
124 |
--------------------------------------------------------------------------------
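zolpc above and zolphone/zolslab below differ only in board URL and page count; the technique they share is worth calling out: the listing rule stashes each `<tr id=…>` row in the request-scoped Temp map (`ctx.SetTemp("html", goq)`) and synchronously re-parses it under 获取结果 (`ctx.Parse("获取结果")`), so every row is handled by a rule with its own field definitions without any extra download. A standalone sketch of the same row splitting, using the upstream goquery package (which the vendored `common/goquery` mirrors); the sample HTML is made up:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	// Hypothetical two-row listing in the shape the zol boards use.
	const listing = `<table><tbody>
	<tr id="t1"><td></td><td data-url="x_1.html"><div><a>主题一</a></div></td></tr>
	<tr id="t2"><td></td><td data-url="x_2.html"><div><a>主题二</a></div></td></tr>
	</tbody></table>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(listing))
	if err != nil {
		panic(err)
	}
	// The spiders hand each row fragment to "获取结果" via SetTemp/Parse;
	// here we read the same cells directly from the fragment.
	doc.Find("tbody").Find("tr[id]").Each(func(i int, row *goquery.Selection) {
		u, _ := row.Find("td").Eq(1).Attr("data-url")
		title := row.Find("td").Eq(1).Find("div a").Text()
		fmt.Println("http://bbs.zol.com.cn/"+u, title)
	})
}
```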
/zolphone/zolphone.go:
--------------------------------------------------------------------------------
1 | package pholcus_lib
2 |
3 | import (
4 | // 基础包
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出
8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
10 |
11 | // net包
12 | // "net/http" //设置http.Header
13 | // "net/url"
14 |
15 | // 编码包
16 | // "encoding/xml"
17 | // "encoding/json"
18 |
19 | // 字符串处理包
20 | // "regexp"
21 | "strconv"
22 | // "strings"
23 | // 其他包
24 | // "fmt"
25 | // "math"
26 | // "time"
27 | )
28 |
29 | func init() {
30 | Zolphone.Register()
31 | }
32 |
33 | var Zolphone = &Spider{
34 | Name: "中关村手机",
35 | Description: "中关村苹果手机数据 [Auto Page] [bbs.zol.com.cn/sjbbs/d544_p]",
36 | // Pausetime: 300,
37 | // Keyin: KEYIN,
38 | // Limit: LIMIT,
39 | EnableCookie: false,
40 | RuleTree: &RuleTree{
41 | Root: func(ctx *Context) {
42 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, 950}, "Rule": "生成请求"}, "生成请求")
43 | },
44 |
45 | Trunk: map[string]*Rule{
46 |
47 | "生成请求": {
48 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
49 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
50 | ctx.AddQueue(&request.Request{
51 | Url: "http://bbs.zol.com.cn/sjbbs/d544_p" + strconv.Itoa(loop[0]) + ".html#c",
52 | Rule: aid["Rule"].(string),
53 | })
54 | }
55 | return nil
56 | },
57 | ParseFunc: func(ctx *Context) {
58 | query := ctx.GetDom()
59 | ss := query.Find("tbody").Find("tr[id]")
60 | ss.Each(func(i int, goq *goquery.Selection) {
61 | ctx.SetTemp("html", goq)
62 | ctx.Parse("获取结果")
63 |
64 | })
65 | },
66 | },
67 |
68 | "获取结果": {
69 | //注意:有无字段语义和是否输出数据必须保持一致
70 | ItemFields: []string{
71 | "机型",
72 | "链接",
73 | "主题",
74 | "发表者",
75 | "发表时间",
76 | "总回复",
77 | "总查看",
78 | "最后回复者",
79 | "最后回复时间",
80 | },
81 | ParseFunc: func(ctx *Context) {
82 | var selectObj = ctx.GetTemp("html", &goquery.Selection{}).(*goquery.Selection)
83 | //url
84 | outUrls := selectObj.Find("td").Eq(1)
85 | outUrl, _ := outUrls.Attr("data-url")
86 | outUrl = "http://bbs.zol.com.cn/" + outUrl
87 |
88 | //title type
89 | outTitles := selectObj.Find("td").Eq(1)
90 | outType := outTitles.Find(".iclass a").Text()
91 | outTitle := outTitles.Find("div a").Text()
92 |
93 | //author stime
94 | authors := selectObj.Find("td").Eq(2)
95 | author := authors.Find("a").Text()
96 | stime := authors.Find("span").Text()
97 |
98 | //reply read
99 | replys := selectObj.Find("td").Eq(3)
100 | reply := replys.Find("span").Text()
101 | read := replys.Find("i").Text()
102 |
103 | //ereply etime
104 | etimes := selectObj.Find("td").Eq(4)
105 | ereply := etimes.Find("a").Eq(0).Text()
106 | etime := etimes.Find("a").Eq(1).Text()
107 |
108 | // 结果存入Response中转
109 | ctx.Output(map[int]interface{}{
110 | 0: outType,
111 | 1: outUrl,
112 | 2: outTitle,
113 | 3: author,
114 | 4: stime,
115 | 5: reply,
116 | 6: read,
117 | 7: ereply,
118 | 8: etime,
119 | })
120 | },
121 | },
122 | },
123 | },
124 | }
125 |
--------------------------------------------------------------------------------
/zolslab/zolslab.go:
--------------------------------------------------------------------------------
1 | package pholcus_lib
2 |
3 | import (
4 | // 基础包
5 | "github.com/henrylee2cn/pholcus/app/downloader/request" //必需
6 | "github.com/henrylee2cn/pholcus/common/goquery" //DOM解析
7 | // "github.com/henrylee2cn/pholcus/logs" //信息输出
8 | . "github.com/henrylee2cn/pholcus/app/spider" //必需
9 | // . "github.com/henrylee2cn/pholcus/app/spider/common" //选用
10 |
11 | // net包
12 | // "net/http" //设置http.Header
13 | // "net/url"
14 |
15 | // 编码包
16 | // "encoding/xml"
17 | // "encoding/json"
18 |
19 | // 字符串处理包
20 | // "regexp"
21 | "strconv"
22 | // "strings"
23 | // 其他包
24 | // "fmt"
25 | // "math"
26 | // "time"
27 | )
28 |
29 | func init() {
30 | Zolslab.Register()
31 | }
32 |
33 | var Zolslab = &Spider{
34 | Name: "中关村平板",
35 | Description: "中关村平板数据 [Auto Page] [bbs.zol.com.cn/sjbbs/d544_p]",
36 | // Pausetime: 300,
37 | // Keyin: KEYIN,
38 | // Limit: LIMIT,
39 | EnableCookie: false,
40 | RuleTree: &RuleTree{
41 | Root: func(ctx *Context) {
42 | ctx.Aid(map[string]interface{}{"loop": [2]int{1, 640}, "Rule": "生成请求"}, "生成请求")
43 | },
44 |
45 | Trunk: map[string]*Rule{
46 |
47 | "生成请求": {
48 | AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
49 | for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ {
50 | ctx.AddQueue(&request.Request{
51 | Url: "http://bbs.zol.com.cn/padbbs/p" + strconv.Itoa(loop[0]) + ".html#c",
52 | Rule: aid["Rule"].(string),
53 | })
54 | }
55 | return nil
56 | },
57 | ParseFunc: func(ctx *Context) {
58 | query := ctx.GetDom()
59 | ss := query.Find("tbody").Find("tr[id]")
60 | ss.Each(func(i int, goq *goquery.Selection) {
61 | ctx.SetTemp("html", goq)
62 | ctx.Parse("获取结果")
63 |
64 | })
65 | },
66 | },
67 |
68 | "获取结果": {
69 | //注意:有无字段语义和是否输出数据必须保持一致
70 | ItemFields: []string{
71 | "机型",
72 | "链接",
73 | "主题",
74 | "发表者",
75 | "发表时间",
76 | "总回复",
77 | "总查看",
78 | "最后回复者",
79 | "最后回复时间",
80 | },
81 | ParseFunc: func(ctx *Context) {
82 | var selectObj = ctx.GetTemp("html", &goquery.Selection{}).(*goquery.Selection)
83 | //url
84 | outUrls := selectObj.Find("td").Eq(1)
85 | outUrl, _ := outUrls.Attr("data-url")
86 | outUrl = "http://bbs.zol.com.cn/" + outUrl
87 |
88 | //title type
89 | outTitles := selectObj.Find("td").Eq(1)
90 | outType := outTitles.Find(".iclass a").Text()
91 | outTitle := outTitles.Find("div a").Text()
92 |
93 | //author stime
94 | authors := selectObj.Find("td").Eq(2)
95 | author := authors.Find("a").Text()
96 | stime := authors.Find("span").Text()
97 |
98 | //reply read
99 | replys := selectObj.Find("td").Eq(3)
100 | reply := replys.Find("span").Text()
101 | read := replys.Find("i").Text()
102 |
103 | //ereply etime
104 | etimes := selectObj.Find("td").Eq(4)
105 | ereply := etimes.Find("a").Eq(0).Text()
106 | etime := etimes.Find("a").Eq(1).Text()
107 |
108 | // 结果存入Response中转
109 | ctx.Output(map[int]interface{}{
110 | 0: outType,
111 | 1: outUrl,
112 | 2: outTitle,
113 | 3: author,
114 | 4: stime,
115 | 5: reply,
116 | 6: read,
117 | 7: ereply,
118 | 8: etime,
119 | })
120 | },
121 | },
122 | },
123 | },
124 | }
125 |
--------------------------------------------------------------------------------
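All spiders in this collection follow one registration contract: a package-level `*Spider` literal whose `Register()` is called from `init()`, so importing the package is all it takes to expose the rule set to pholcus. The skeleton below is a placeholder distilled from the files above, not an additional spider in the repository:

```go
package pholcus_lib

import (
	. "github.com/henrylee2cn/pholcus/app/spider" //必需
)

func init() {
	Skeleton.Register() // registration happens on import, as in every spider above
}

// Skeleton shows the minimal shape shared by all rule sets here.
var Skeleton = &Spider{
	Name:         "示例骨架",
	Description:  "占位示例,非真实采集规则",
	EnableCookie: false,
	RuleTree: &RuleTree{
		// Root seeds the queue, e.g. ctx.AddQueue(&request.Request{Url: ..., Rule: ...})
		Root: func(ctx *Context) {},
		// Trunk maps rule names to ParseFunc/AidFunc/ItemFields, as above
		Trunk: map[string]*Rule{},
	},
}
```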