├── DoubanMovie
│   ├── movie_crawler.py
│   └── write_to_mysql.py
├── Huaban
│   ├── explain.md
│   ├── huaban_crawler.py
│   └── huaban_travel_places_result.txt
├── IpProxy
│   ├── Ip181FreeProxy
│   │   └── get_ip181.py
│   ├── KuaiFreeProxy
│   │   └── get_kuaifreeproxy.py
│   └── XunFreeProxy
│       └── get_xunfreeproxy.py
├── README.md
├── SinaWeibo
│   ├── chromedriver
│   ├── image_result.md
│   ├── weibo_crawler.py
│   └── weibo_hot_topic_crawler.py
└── WechatOfficialAccounts
    └── spider_wechat_official_accounts.py
/DoubanMovie/movie_crawler.py:
--------------------------------------------------------------------------------
1 | #encoding:utf-8
2 |
3 | import requests
4 | import json
5 | import os,sys,time
6 | from lxml import etree
7 | from scrapy.selector import Selector
8 | from scrapy.http import HtmlResponse
9 | from bs4 import BeautifulSoup
10 | import re
11 | reload(sys)
12 | sys.setdefaultencoding("utf-8")
13 |
14 | LANGUAGES_RE = re.compile(ur"语言: (.+?)<br/>")
15 | COUNTRIES_RE = re.compile(ur"制片国家/地区: (.+?)<br/>")
16 | ALTERNATE_NAME_RE = re.compile(ur"又名: (.+?)<br/>")
17 | RELEASE_TIME_RE = re.compile(ur"上映日期: (.+?)<br/>")
18 | NUM_RE = re.compile(r"(\d+)")
19 |
20 | data_save_file = "douban_donghua_results.txt"
21 | headers = {
22 | 'Accept':'*/*',
23 | 'Accept-Encoding':'gzip, deflate, br',
24 | 'Accept-Language':'zh-CN,zh;q=0.8,en;q=0.6',
25 | 'Connection':'keep-alive',
26 | 'Host':'movie.douban.com',
27 | 'Referer':'https://movie.douban.com/explore',
28 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
29 | 'X-Requested-With':'XMLHttpRequest'
30 | }
31 |
32 | def get_item_list(d_url,d_type,d_tag,d_sort,d_page_limit,d_page_start):
33 | params = {}
34 | params["type"] = d_type
35 | params["tag"] = d_tag
36 | if d_sort != "":
37 | params["sort"] = d_sort
38 | params["page_limit"] = d_page_limit
39 | params["page_start"] = d_page_start
40 | response = requests.get(d_url,headers = headers,params = params,timeout =10)
41 | json_obj = response.json()
42 | json_array = json_obj["subjects"]
43 | return json_array
44 |
45 | def get_item_list_from_newsearch(d_url,d_sort,d_range,d_tag,d_page_start):
46 | params = {}
47 | params["sort"] = d_sort
48 | params["tags"] = d_tag
49 | params["range"] = d_range
50 | params["start"] = d_page_start
51 | response = requests.get(d_url,headers = headers,params = params,timeout =10)
52 | json_obj = response.json()
53 | json_array = json_obj["data"]
54 | return json_array
55 | def get_item_detail(item_detail_url):
56 | result_obj = {}
57 | result_obj["subject_id"] = int(item_detail_url.split("/")[-2])
58 | celebrities_url = "https://movie.douban.com/subject/"+str(result_obj["subject_id"])+"/celebrities"
59 | (directors_cn_names,directors_en_names,actors_cn_names,actors_en_names)=get_directors_and_actors(celebrities_url)
60 | result_obj["directors_cn_names"] = directors_cn_names
61 | result_obj["directors_en_names"] = directors_en_names
62 | result_obj["actors_cn_names"] = actors_cn_names
63 | result_obj["actors_en_names"] = actors_en_names
64 | response = requests.get(item_detail_url,headers = headers,timeout = 10)
65 | selector = etree.HTML(response.text)
66 | s_response = HtmlResponse(url=item_detail_url,body = response.text,encoding='utf-8')
67 |
68 | name = s_response.selector.xpath("//title/text()").extract()
69 | if name: result_obj["movie_name"] = name[0].replace(u" (豆瓣)", "").strip()
70 |
71 | genres = s_response.selector.xpath("//span[@property='v:genre']/text()").extract()
72 | if genres: result_obj["genres"] = genres
73 |
74 | S = "".join(s_response.selector.xpath("//div[@id='info']").extract())
75 |
76 | M = COUNTRIES_RE.search(S)
77 | if M is not None:
78 | result_obj["countries"] = [country.strip() for country in M.group(1).split("/")]
79 |
80 | L = LANGUAGES_RE.search(S)
81 | if L is not None:
82 | result_obj["languages"] = [ lang.strip() for lang in L.group(1).split("/") ]
83 |
84 | A = ALTERNATE_NAME_RE.search(S)
85 | if A is not None:
86 | result_obj["alternate_name"] =[ alternate.strip() for alternate in A.group(1).split("/")]
87 |
88 | T = []
89 | tags = s_response.selector.xpath("//div[@class='tags-body']/a")
90 | for tag in tags:
91 | t = tag.xpath("text()").extract()
92 | if t: T.append(t[0])
93 | if T: result_obj["tags"] = T
94 |
95 | average = s_response.selector.xpath("//strong[@property='v:average']/text()").extract()
96 | if average and average[0] != "": result_obj["average"] = float( average[0] ) + 0.0
97 |
98 | json_value = json.dumps(result_obj,ensure_ascii = False)
99 | print(json_value)
100 | return json_value
101 |
102 | def get_directors_and_actors(celebrities_url):
103 | try:
104 | p = requests.get(celebrities_url,headers = headers)
105 | html = p.text
106 | soup = BeautifulSoup(html,"html.parser")
107 | div_list = soup.find_all("div","list-wrapper")
108 | directors_html = div_list[0]
109 | directors = directors_html.find_all("a")
110 | directors_cn_names = []
111 | directors_en_names = []
112 | actors_cn_names = []
113 | actors_en_names = []
114 | for x in xrange(len(directors)):
115 | if directors[x].get("target") != "_blank":
116 | director = directors[x].text
117 | first_tag = director.find(" ")
118 | directors_cn_name = director[:first_tag].strip()
119 | directors_en_name = director[first_tag+1:].strip()
120 | if directors_cn_name != "":
121 | directors_cn_names.append(directors_cn_name)
122 | if directors_en_name != "":
123 | directors_en_names.append(directors_en_name)
124 | print directors_cn_name
125 | print directors_en_name
126 |
127 | actors_html = div_list[1]
128 | actors = actors_html.find_all("a")
129 | for x in xrange(len(actors)):
130 | if actors[x].get("target") != "_blank":
131 | actor = actors[x].text
132 | first_tag = actor.find(" ")
133 | actors_cn_name = actor[:first_tag].strip()
134 | actors_en_name = actor[first_tag+1:].strip()
135 | if actors_cn_name != "":
136 | actors_cn_names.append(actors_cn_name)
137 | print "cn_name: "+actors_cn_name
138 | if actors_en_name != "":
139 | actors_en_names.append(actors_en_name)
140 | print "en_name: "+actors_en_name
141 | except Exception, e:
142 | print e
143 | directors_cn_names = []
144 | directors_en_names = []
145 | actors_cn_names = []
146 | actors_en_names = []
147 | finally:
148 | return (directors_cn_names,directors_en_names,actors_cn_names,actors_en_names)
149 |
150 |
151 |
152 | def write_json_obj(json_value):
153 | os.system("touch "+data_save_file)
154 | f= open(data_save_file,'a+')
155 | f.write(str(json_value)+",")
156 | f.close()
157 |
158 | search_url = "https://movie.douban.com/j/search_subjects?"
159 | tag_search_url = "https://movie.douban.com/j/new_search_subjects?"
160 |
161 | # Crawler for the Douban Movie "Explore" (选电影) listing; this example fetches the first 50 pages of the 经典 (classics) tag
162 | for x in xrange(0,50):
163 | print x
164 | page_start = 20*x
165 | print page_start
166 | time.sleep(1)
167 | json_array = get_item_list(search_url,"movie","经典","time",20,page_start)
168 | for x in xrange(len(json_array)):
169 | time.sleep(1)
170 | json_value = get_item_detail(json_array[x]["url"])
171 | write_json_obj(json_value)
172 |
173 | # Crawler for the Douban Movie tag search (分类); this example fetches the first 50 pages of the 动画 (animation) tag
174 | for x in xrange(0,50):
175 | print x
176 | page_start = 20*x
177 | print page_start
178 | time.sleep(1)
179 | json_array = get_item_list_from_newsearch(tag_search_url,"T","0,10","动画",page_start)
180 | for x in xrange(len(json_array)):
181 | time.sleep(1)
182 | json_value = get_item_detail(json_array[x]["url"])
183 | write_json_obj(json_value)
184 |
185 |
186 |
--------------------------------------------------------------------------------
/DoubanMovie/write_to_mysql.py:
--------------------------------------------------------------------------------
1 | #encoding:utf-8
2 | import MySQLdb as mdb
3 | import sys
4 | import json
5 | con = None
6 |
7 | json_path = "douban_donghua_results.txt"
8 | def get_jsonarray_from_txt(path):
9 | try:
10 | f = open(path,"r")
11 | text = f.read()
12 | array = json.loads(text)
13 | print len(array)
14 | f.close()
15 | return array
16 | except Exception, e:
17 | print e
18 | finally:
19 | pass
20 |
21 | try:
22 | # How to connect to MySQL: connect('host', 'user', 'password', 'dbname')
23 | con = mdb.connect('localhost', 'root','root', 'test',charset="utf8");
24 |
25 | # Every query runs on a cursor obtained from the connection con
26 | cur = con.cursor()
27 |
28 | data = []
29 | json_array = get_jsonarray_from_txt(json_path)
30 | for x in xrange(len(json_array)):
31 | item = json_array[x]
32 | print item["directors_cn_names"]
33 | print ','.join(item["directors_cn_names"])
34 | # values = [item["subject_id"],item["movie_name"],item["directors_cn_names"],item["directors_en_names"],item["actors_cn_names"],item["actors_en_names"],item["genres"],item["tags"],item["languages"],item["average"],item["alternate_name"],item["countries"]]
35 | try:
36 | values = [item["subject_id"],item["movie_name"],','.join(item["directors_cn_names"]),','.join(item["directors_en_names"]),','.join(item["actors_cn_names"]),','.join(item["actors_en_names"]),','.join(item["genres"]),','.join(item["tags"]),','.join(item["languages"]),item["average"],','.join(item["alternate_name"]),','.join(item["countries"])]
37 | cur.execute('insert into douban_movie(subject_id,movie_name,directors_cn_names,directors_en_names,actors_cn_names,actors_en_names,genres,tags,languages,average,alternate_name,countries) value(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',values)
38 | except KeyError, e:
39 | print e
40 | finally:
41 | pass
42 | cur.close()
43 | except Exception, e:
44 | print e
45 | finally:
46 | if con:
47 | # Always commit and close the connection, whatever happened above
48 | con.commit()
49 | con.close()
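50 |
51 | # Note: douban_donghua_results.txt is written by movie_crawler.py as JSON objects
52 | # separated by trailing commas, so the file has to be wrapped in [ ] (and the last
53 | # comma dropped) before json.loads() above can parse it.
54 | #
55 | # The douban_movie table is assumed to exist already. A rough sketch of a schema that
56 | # matches the INSERT above (column names come from the query; the column types are
57 | # guesses, not taken from the original project):
58 | #
59 | #   CREATE TABLE douban_movie (
60 | #       subject_id BIGINT,
61 | #       movie_name VARCHAR(255),
62 | #       directors_cn_names TEXT,
63 | #       directors_en_names TEXT,
64 | #       actors_cn_names TEXT,
65 | #       actors_en_names TEXT,
66 | #       genres VARCHAR(255),
67 | #       tags VARCHAR(255),
68 | #       languages VARCHAR(255),
69 | #       average FLOAT,
70 | #       alternate_name TEXT,
71 | #       countries VARCHAR(255)
72 | #   ) DEFAULT CHARSET=utf8;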
--------------------------------------------------------------------------------
/Huaban/explain.md:
--------------------------------------------------------------------------------
1 | #### A quick note on the Huaban crawler
2 |
3 | First, open the page you want to crawl.
4 |
5 | I use http://huaban.com/favorite/travel_places/ as the example.
6 |
7 | Right-click and choose "Inspect", open the Network tab, then scroll the page down; network requests will start firing, and you can find the one we need,
8 |
9 | e.g. http://huaban.com/favorite/travel_places/?j0xapa21&max=1081611043&limit=20&wfl=1
10 |
11 | 
12 |
13 | Then copy the response body, paste it into http://json.cn/ to view the formatted result, and locate the data we need.
14 |
15 | ```json
16 | {
17 | "filter":"pin:category:travel_places",
18 | "pins":[
19 | {
20 | "pin_id":1081388818,
21 | "user_id":141402,
22 | "board_id":409091,
23 | "file_id":131759569,
24 | "file":{
25 | "id":131759569,
26 | "farm":"farm1",
27 | "bucket":"hbimg",
28 | "key":"cad3b3be27c98e222065f6a20bb2285d9c1d872d9e124-R3LxxT",
29 | "type":"image/jpeg",
30 | "width":"1024",
31 | "height":"683",
32 | "frames":"1",
33 | "colors":[
34 | {
35 | "color":14342874,
36 | "ratio":0.1
37 | }
38 | ],
39 | "audit":{
40 | "porn":{
41 | "rate":0.9999141809676075,
42 | "label":0,
43 | "review":false
44 | }
45 | },
46 | "theme":"dadada"
47 | },
48 | "media_type":0,
49 | "source":"nipic.com",
50 | "link":"http://www.nipic.com/show/16746237.html?v=2",
51 | "raw_text":"新疆喀纳斯湖 喀纳斯景区 旅游观光胜地 峡谷中的湖 内陆淡水湖 山峦起伏 植物树木 阿勒泰地区 人间仙境 高山湖泊 清澈湖水 变换颜色湖水 自然风光",
52 | "text_meta":{
53 |
54 | },
55 | "via":1043457819,
56 | "via_user_id":19710125,
57 | "original":1043457819,
58 | "created_at":1490868088,
59 | "like_count":0,
60 | "comment_count":0,
61 | "repin_count":1,
62 | "is_private":0,
63 | "orig_source":null,
64 | "user":{
65 | "user_id":141402,
66 | "username":"休纱",
67 | "urlname":"wangheady",
68 | "created_at":1332149742,
69 | "avatar":{
70 | "id":74814335,
71 | "farm":"farm1",
72 | "bucket":"hbimg",
73 | "key":"dee8c814cd883df97eadaf34cc416847ef42b7403fbf-viFpjv",
74 | "type":"image/jpeg",
75 | "width":408,
76 | "height":408,
77 | "frames":1
78 | },
79 | "extra":null
80 | },
81 | "board":{
82 | "board_id":409091,
83 | "user_id":141402,
84 | "title":"旅行",
85 | "description":"",
86 | "category_id":"travel_places",
87 | "seq":1,
88 | "pin_count":972,
89 | "follow_count":42,
90 | "like_count":0,
91 | "created_at":1332149777,
92 | "updated_at":1490868097,
93 | "deleting":0,
94 | "is_private":0,
95 | "extra":null
96 | },
97 | "via_user":{
98 | "user_id":19710125,
99 | "username":"六王爷",
100 | "urlname":"znl21",
101 | "created_at":1479094868,
102 | "avatar":{
103 | "bucket":"hbimg",
104 | "farm":"farm1",
105 | "frames":1,
106 | "height":300,
107 | "id":102890808,
108 | "key":"654953460733026a7ef6e101404055627ad51784a95c-B6OFs4",
109 | "type":"image/jpeg",
110 | "width":300
111 | },
112 | "extra":null
113 | }
114 | }
115 | ],
116 | "explore":null,
117 | "promotions":null,
118 | "suggests":{
119 |
120 | },
121 | "banner_box_promotion":null,
122 | "query":null
123 | }
124 | ```
125 |
126 | The snippet above is one of the 20 results returned. Take the last element of the pins JSON array and use its pin_id as the max value of the next request. Each image URL is "http://img.hb.aicdn.com/" plus the "key" field inside that pin's "file" object.
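127 |
128 | A minimal Python 2 sketch of that paging loop (an illustrative sketch, not part of the original crawler; in practice the fuller headers from huaban_crawler.py, such as Accept: application/json, may be needed for the server to return JSON):
129 |
130 | ```python
131 | import requests
132 |
133 | def crawl_page(max_pin_id):
134 |     # same query-string shape as the example request above
135 |     url = "http://huaban.com/favorite/travel_places/?j0xapa21&max=%s&limit=20&wfl=1" % max_pin_id
136 |     pins = requests.get(url, headers={"X-Requested-With": "XMLHttpRequest"}).json()["pins"]
137 |     for pin in pins:
138 |         print "http://img.hb.aicdn.com/" + pin["file"]["key"]  # full image URL
139 |     return pins[-1]["pin_id"]  # becomes the max of the next request
140 | ```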
--------------------------------------------------------------------------------
/Huaban/huaban_crawler.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | #encoding:utf-8
3 | '''
4 | Huaban crawler, using the travel board http://huaban.com/favorite/travel_places/ as an example.
5 | The max parameter in main_page is the starting pin ID; click any image to open its detail page and you will see it.
6 | For the max value used in this sample code, the corresponding detail page is http://huaban.com/pins/1082254826/
7 | Each request fetches 20 pins; once a page is done, the pin_id of the last item in the returned JSON becomes the max of the next request.
8 | huaban_travel_places_result.txt contains the output of a run.
9 | '''
10 |
11 |
12 | import json
13 | import os
14 | import requests
15 |
16 | main_page = "http://huaban.com/favorite/travel_places/?j0x9q48g&max=1082254826&limit=20&wfl=1"
17 | save_result_path = "huaban_travel_places_result.txt"
18 |
19 | headers = {
20 | 'Accept':'application/json',
21 | 'Accept-Encoding':'gzip, deflate, sdch',
22 | 'Accept-Language':'zh-CN,zh;q=0.8,en;q=0.6',
23 | 'Cache-Control':'no-cache',
24 | 'Connection':'keep-alive',
25 | 'Cookie':'BDTUJIAID=f6b17872d06259f8a38509c1baf402e9; UM_distinctid=15adb12a1f22b5-0b1ea1d4ebe05e-1d396850-1fa400-15adb12a1f3672; _f=iVBORw0KGgoAAAANSUhEUgAAADIAAAAUCAYAAADPym6aAAAFjElEQVRYR7WWeWxUVRTGf2fa6UIrNUIRi0VBqMiq4mtBAp2JBhODAQE1aoJhp%2BDGvgkIQgAx0PlDQIIlxhiwEjBCMEikU9EiMwVFG0ooFGQxgWKRWKTQzhxz%2B96UKXul3GQyL%2B%2Bdd%2B%2F5zvd95zxBVXHWzMWLaXv8OONXrkRF6u6qSB%2FgG%2BABgZpI7O3%2BK8wEkgVm3%2B4714vLrcytz%2FN6zyUayLjVq0k%2FcYLZixbVx6pIV2AHkC4QamwyCuOAlgILG%2FtudHyjgWQcOsSk5cuJra0lFBND2OV6AvgS6Ay8BKQAa4BXgI7ACoEqhRbA20Br4C%2FgI4FKhZHAamAV8BwwRqCwjm0YgP27BPwkkK%2BQ6hQuD3gXWAss8VXm3rSIDRgZkZfHxxMmsGTGDOIvXaI6IYH58%2BZ95oDoDTQHfgMetvNgH9Af%2BAf4FXgGOAscAHoJnHEYyRCYpPA48AvwCPA00A6bKVOgTIEpCu8B8wAL%2BAMoM%2Bf7KnNP34zRBkCMN1qePcuCuXOjpZWBXcF0oBmwxUk47HjHVLwtMFBgjoIx13rDjgNkPNDGeMR5FgSmA3OAFwXOqc32GwLT1T5jW9QZm4CxjQIyfN06Ohw%2BfLVHugCbgceAJKAEaA9EgLzmVHi%2BwEAn2d%2BBvk6SBojxyIIoIJOABUCOQKlCtpGYwNTrADGNZpivMrfythhpX17OqpwcatxuDKCK1NQ6n9S43e8ASwEvYKq5H8gH3NgyMNJYBpjkTSUNQJP8t46PRgBvAi8APYHRwLOONwxz7wOjgA7AEOBf4Csgy7kuAkb4KnPNfjdc9dLqWlLC5bg4YkIhLsXHU96%2BPQnV1VxMTOzhbGgoN2y0AvoBPwP3OZ44BcQDplUfA847ctqvtiQTnUQvGpmKDdaYrJPTHHYD3YFyIA0wcWY%2FE1cLxPoqc43vbg3kxhHOQLmT3tkE7zaq%2FV73PLkWiBbuaYdKPioXENwopeK1Rqk%2FsAXEsJSAsFKyrU8je%2BqPxW2p1fWopoAsFK%2B14erz1B%2FYhJKBS3ZItjXx6jkiCkOPdGNvq1OUN29omQZd63aB1MmiIDgUqZsv28VjPW%2FfK4hFkk4hki%2FZ1lv1IMx9kg%2BDbiQ2Zj2h0E5EPJJtmVZct9QfNP5KQauykeRyVNaI96n6Ifr5n2t0cHkXmtXGse2hg5SlmFF1Zf1vIPbhAaPtLmhSmni7VOmuvT0JhXeQGO4oWVn1J6k%2FYMzvI1Fbm%2FvqDxSB%2FF1fgOLiFKq0AnUNFm%2FPreovng86SjxWm0iq3x%2FeoK6wkHUmHX9aOYfuNeOqqYDYrJgOM0081jL1B74AaSkey0zw%2BqWFwRUoQ8RjmXkTKcD94sk0bRz1F3shvA0NdxRvr5PqDy5FdTJx8WnSp8cZE2M8YqQ19kAWO9scaVogtpwCJxEuiCfzUftaRovHatAq1R%2F80LRf8VhmFhlZfg10Eq9lupYDRLfjjnvQJG4zKCtwx6VHA4kJuxhdat0lIIWBD1CZhcgsNDzCALrWxAaIjkWT2tRJ0B8w86h5JNZh5DvQfuLJ2m0D14kRKUYYubtACo4mIGcrzKc6yiLxWmZINlhasHcAEtoc5ZEgSKl4rGFaWNyNcIsypOIcKsNNN9OC4BJgkGFMd%2B3rIX2f3B8tre3pZRxtqq7VwAMFwY2gg6I1fS0rgd2ohBD5AQ2PIS6%2BMzWXpwCTQXuhvI7wKuKaRlh9xDAScR0jFC4GXenrXjQ%2B63Q65nfinvNsbXuQmpgrH8R31LUiyZqqEQ6Nk2wr52oADQD7g5%2BApuGOH2l7YU9vxPUy4aqp4vXWqr94EhruD7JWvNZGu50nLyVW83ydi0oyT6dT4wqRGHKzL%2FUU1TFm6NurSYDcLPmmenZXJntTJdeYfW4F5D%2BFfIk3uiHuDgAAAABJRU5ErkJggg%3D%3D%2CMacIntel.1920.1080.24; wft=1; crtg_rta=criteo_250x250crtnative3criteo_200x200_Pins%3Bcriteo_200x200_Search%3B; _cnzz_CV1256903590=is-logon%7Clogged-out%7C1490170334892; _ga=GA1.2.659722060.1489734116; __asc=c44c6e9615af4faa683f624a6bd; __auc=4426e80715adb12a228b727e1b4; sid=LpKUX2dYDSQobYImEL6VqeAkafp.Ye41ttTXInOv21reRwzXRyhebcGzTKkgn%2FlSWI2yYEw; CNZZDATA1256903590=449876967-1489731133-null%7C1490168535',
26 | 'Host':'huaban.com',
27 | 'Pragma':'no-cache',
28 | 'Referer':'http://huaban.com/favorite/pets/',
29 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
30 | 'X-Request':'JSON',
31 | 'X-Requested-With':'XMLHttpRequest'
32 | }
33 |
34 | def get_image_urls(url):
35 | response = requests.get(url,headers=headers)
36 | response.encoding= "utf-8"
37 | jsonObj = response.json()
38 | pins = jsonObj["pins"]
39 | for x in xrange(len(pins)):
40 | url = "http://img.hb.aicdn.com/"+pins[x]["file"]["key"]
41 | write_txt(url)
42 | print url
43 | next_url_id=pins[-1]['pin_id']
44 | print next_url_id
45 | next_page_url = make_next_request_url(next_url_id)
46 | print next_page_url
47 | get_image_urls(next_page_url)
48 |
49 | def make_next_request_url(id_num):
50 | return "http://huaban.com/favorite/travel_places/?j0x9q48g&max=" + str(id_num) + "&limit=20&wfl=1"
51 |
52 | def write_txt(url):
53 | try:
54 | if os.path.isfile(save_result_path)==False:
55 | os.system("touch "+save_result_path)
56 | f= open(save_result_path,'a+')
57 | f.write(url.encode("utf-8"))
58 | f.write('\n')
59 | except Exception, e:
60 | print e
61 | finally:
62 | pass
63 |
64 |
65 |
66 | get_image_urls(main_page)
67 |
68 |
69 |
--------------------------------------------------------------------------------
/IpProxy/Ip181FreeProxy/get_ip181.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # encoding:utf-8
3 |
4 | import requests
5 | import json
6 | from bs4 import BeautifulSoup
7 | import sys
8 | reload(sys)
9 | sys.setdefaultencoding('utf-8')
10 |
11 | base_url = 'http://www.ip181.com/'
12 | proxy_list = []
13 |
14 |
15 | def get_181_free_proxies():
16 | try:
17 | print "--------------------------get_181_freeproxy---------------------------"
18 | global proxy_list
19 | p = requests.get(base_url)
20 | p.encoding = "gb2312"  # the page is GB2312-encoded; set it on the response before reading .text
21 | html = p.text
22 | soup = BeautifulSoup(html,"html.parser")
23 | content = soup.find("tbody")
24 | tr_list = content.find_all("tr")
25 | for x in xrange(1,len(tr_list)):
26 | one_tr = tr_list[x]
27 | ip = one_tr.find_all("td")[0].text
28 | port = one_tr.find_all("td")[1].text
29 | kuai_proxy = ip+":"+port
30 | print kuai_proxy
31 | proxy_list.append(kuai_proxy)
32 | return proxy_list
33 | except Exception, e:
34 | print e
35 | finally:
36 | pass
37 |
38 | def get_one_from_list():
39 | try:
40 | print "------------------requests timeout, change a new proxy------------------"
41 | global proxy_list
42 | del proxy_list[0]
43 | if len(proxy_list)<=5:
44 | get_181_free_proxies()
45 | return proxy_list[0]
46 | except Exception, e:
47 | print e
48 | finally:
49 | pass
--------------------------------------------------------------------------------
/IpProxy/KuaiFreeProxy/get_kuaifreeproxy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # encoding:utf-8
3 |
4 | import requests
5 | import json
6 | from bs4 import BeautifulSoup
7 | base_url = 'http://www.kuaidaili.com/free/'
8 | kuai_proxy_list = []
9 |
10 | def get_kuai_free_proxies(url):
11 | try:
12 | print "--------------------------get_kuai_freeproxy---------------------------"
13 | global kuai_proxy_list
14 | p = requests.get(url)
15 | html = p.text
16 | soup = BeautifulSoup(html,"html.parser")
17 | content = soup.find("div",id="list")
18 | tr_list = content.find_all("tr")
19 | for x in xrange(1,len(tr_list)):
20 | one_tr = tr_list[x]
21 | ip = one_tr.find_all("td")[0].text
22 | port = one_tr.find_all("td")[1].text
23 | kuai_proxy = ip+":"+port
24 | print kuai_proxy
25 | kuai_proxy_list.append(kuai_proxy)
26 | return kuai_proxy_list
27 | except Exception, e:
28 | print e
29 | finally:
30 | pass
31 |
32 | def get_one_from_list():
33 | try:
34 | print "------------------requests timeout, change a new proxy------------------"
35 | global kuai_proxy_list
36 | del kuai_proxy_list[0]
37 | if len(kuai_proxy_list)<=5:
38 | get_kuai_free_proxies(base_url)
39 | return kuai_proxy_list[0]
40 | except Exception, e:
41 | print e
42 | finally:
43 | pass
44 | get_kuai_free_proxies(base_url)
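45 |
46 | # A hypothetical usage sketch (not part of the original script): switch to a fresh
47 | # proxy via get_one_from_list() whenever a request through the current one times out.
48 | #
49 | #   proxy = kuai_proxy_list[0]
50 | #   while True:
51 | #       try:
52 | #           requests.get("http://example.com", proxies={"http": "http://" + proxy}, timeout=5)
53 | #           break
54 | #       except requests.exceptions.Timeout:
55 | #           proxy = get_one_from_list()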
--------------------------------------------------------------------------------
/IpProxy/XunFreeProxy/get_xunfreeproxy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #encoding:utf-8
3 |
4 | '''
5 | Fetch free proxies from Xundaili (讯代理): http://www.xdaili.cn/freeproxy.html
6 | '''
7 |
8 | import requests
9 | import json
10 |
11 | xun_free_url = "http://www.xdaili.cn/ipagent//freeip/getFreeIps?page=1&rows=10"
12 |
13 | xun_proxy_list = []
14 | proxies = {}
15 |
16 | def get_xun_free_proxy():
17 | try:
18 | print "--------------------------get_xun_freeproxy---------------------------"
19 | global xun_proxy_list
20 | response = requests.get(xun_free_url)
21 | print json.dumps(response.json(),ensure_ascii = False)
22 | xun_proxy_list_result = []
23 | for x in xrange(len(response.json()["rows"])):
24 | xun_proxy = response.json()["rows"][x]["ip"]+":"+response.json()["rows"][x]["port"]
25 | print xun_proxy
26 | xun_proxy_list_result.append(xun_proxy)
27 | xun_proxy_list = xun_proxy_list + xun_proxy_list_result
28 | return xun_proxy_list
29 | except Exception, e:
30 | print e
31 | finally:
32 | pass
33 |
34 | def get_one_from_list():
35 | try:
36 | print "------------------requests timeout, change a new proxy------------------"
37 | global xun_proxy_list
38 | del xun_proxy_list[0]
39 | if len(xun_proxy_list)<=5:
40 | get_xun_free_proxy()
41 | return xun_proxy_list[0]
42 | except Exception, e:
43 | print e
44 | finally:
45 | pass
46 |
47 |
48 | get_xun_free_proxy()
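49 |
50 | # A hypothetical example (not part of the original script) of plugging a proxy string
51 | # returned above into requests via the (currently unused) proxies dict:
52 | #
53 | #   proxy = xun_proxy_list[0]  # e.g. "1.2.3.4:8080"
54 | #   proxies = {"http": "http://" + proxy, "https": "https://" + proxy}
55 | #   requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10)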
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### 1. Image crawlers
2 |
3 | * ##### WeChat Official Accounts crawler. Official accounts have no web presence of their own, so the crawler has to go through the third-party Sogou WeChat search.
4 |
5 | * ##### Sina Weibo crawler. Logs in with webdriver to obtain cookies, then calls the web APIs with those cookies to fetch weibo data. See "Python实现微博爬虫" for a detailed analysis.
6 |
7 | * ##### Huaban crawler, using the travel board as an example: http://huaban.com/favorite/travel_places/
8 |
9 | ### 2. IP proxies for crawling
10 |
11 | * ##### Free IP proxies; see the IpProxy folder for details.
12 |
13 | * ##### Covers ip181, Kuaidaili (快代理) and Xundaili (讯代理).
14 |
15 | ### 3. Douban Movie crawler
16 |
17 | * ##### Too lazy to create another repository, so it lives here as well.
18 |
19 |
--------------------------------------------------------------------------------
/SinaWeibo/chromedriver:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/darrenfantasy/image_crawler/b5c38afd80dae0bb89c948ff2bad614c25c2d8d7/SinaWeibo/chromedriver
--------------------------------------------------------------------------------
/SinaWeibo/image_result.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
9 | 
10 | 
11 | 
12 | 
13 | 
14 | 
15 | 
16 | 
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 
24 | 
25 | 
26 | 
27 | 
28 | 
29 | 
30 | 
31 | 
32 | 
33 | 
34 | 
35 | 
36 | 
37 | 
38 | 
39 | 
40 | 
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 
60 | 
61 | 
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
84 | 
85 | 
86 | 
87 | 
88 | 
89 | 
90 | 
91 | 
92 | 
93 | 
94 | 
95 | 
96 | 
97 | 
98 | 
99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 
139 | 
140 | 
141 | 
142 | 
143 | 
144 | 
145 | 
146 | 
147 | 
148 | 
149 | 
150 | 
151 | 
152 | 
153 | 
154 | 
155 | 
156 | 
157 | 
158 | 
159 | 
160 | 
161 | 
162 | 
163 | 
164 | 
165 | 
166 | 
167 | 
168 | 
169 | 
170 | 
171 | 
172 | 
173 | 
174 | 
175 | 
176 | 
177 | 
178 | 
179 | 
180 | 
181 | 
182 | 
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | 
196 | 
197 | 
198 | 
199 | 
200 | 
201 | 
202 | 
203 | 
204 | 
205 | 
206 | 
207 | 
208 | 
209 | 
210 | 
211 | 
212 | 
213 | 
214 | 
215 | 
216 | 
217 | 
218 | 
219 | 
220 | 
221 | 
222 | 
223 | 
224 | 
225 | 
226 | 
227 | 
228 | 
229 | 
230 | 
231 | 
232 | 
233 | 
234 | 
235 | 
236 | 
237 | 
238 | 
239 | 
240 | 
241 | 
242 | 
243 | 
244 | 
245 | 
246 | 
247 | 
248 | 
249 | 
250 | 
251 | 
252 | 
253 | 
254 | 
255 | 
256 | 
257 | 
258 | 
259 | 
260 | 
261 | 
262 | 
263 | 
264 | 
265 | 
266 | 
267 | 
268 | 
269 | 
270 | 
271 | 
272 | 
273 | 
274 | 
275 | 
276 | 
277 | 
278 | 
279 | 
280 | 
281 | 
282 | 
283 | 
284 | 
285 | 
286 | 
287 | 
288 | 
289 | 
290 | 
291 | 
292 | 
293 | 
294 | 
295 | 
296 | 
297 | 
298 | 
299 | 
300 | 
301 | 
302 | 
303 | 
304 | 
305 | 
306 | 
307 | 
308 | 
309 | 
310 | 
311 | 
312 | 
313 | 
314 | 
315 | 
316 | 
317 | 
318 | 
319 | 
320 | 
321 | 
322 | 
323 | 
324 | 
325 | 
326 | 
327 | 
328 | 
329 | 
330 | 
331 | 
332 | 
333 | 
334 | 
335 | 
336 | 
337 | 
338 | 
339 | 
340 | 
341 | 
342 | 
343 | 
344 | 
345 | 
346 | 
347 | 
348 | 
349 | 
350 | 
351 | 
352 | 
353 | 
354 | 
355 | 
356 | 
357 | 
358 | 
359 | 
360 | 
361 | 
362 | 
363 | 
364 | 
365 | 
366 | 
367 | 
368 | 
369 | 
370 | 
371 | 
372 | 
373 | 
374 | 
375 | 
376 | 
377 | 
378 | 
379 | 
380 | 
381 | 
382 | 
383 | 
384 | 
385 | 
386 | 
387 | 
388 | 
389 | 
390 | 
391 | 
392 | 
393 | 
394 | 
395 | 
396 | 
397 | 
398 | 
399 | 
400 | 
401 | 
402 | 
403 | 
404 | 
405 | 
406 | 
407 | 
408 | 
409 | 
410 | 
411 | 
412 | 
413 | 
414 | 
415 | 
416 | 
417 | 
418 |
--------------------------------------------------------------------------------
/SinaWeibo/weibo_crawler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # encoding:utf-8
3 | '''
4 | Crawling flow: the Weibo APIs need a cookie, so we log in with webdriver to obtain one. Weibo cookies seem to stay valid for quite a while; I use a 6-hour expiry: if the cached cookie has not expired it is read from disk, otherwise we log in again to refresh it.
5 | Once we have the cookie, inspect the requests made by the Weibo web client, find the matching endpoint and parameters, and request the data we want.
6 | This example downloads the images posted by a Weibo account; the account crawled here is Jay Chou's official account, MRJ台灣官方.
7 | The script takes 5 arguments: 1. Weibo username 2. Weibo password 3. the target account's custom domain (if it has none, use u/ + its Weibo id) 4. the target account's id 5. the number of pages to crawl
8 | e.g. python weibo_crawler.py username password mrj168 1837498771 5
9 | '''
10 | from selenium import webdriver
11 | import time
12 | import requests
13 | import json
14 | from bs4 import BeautifulSoup
15 | import os
16 | import sys
17 |
18 | request_params = {"ajwvr":"6","domain":"100505","domain_op":"100505","feed_type":"0","is_all":"1","is_tag":"0","is_search":"0"}
19 | profile_request_params = {"profile_ftype":"1","is_all":"1"}
20 |
21 | weibo_url = "http://weibo.com/"
22 | requset_url = "http://weibo.com/p/aj/v6/mblog/mbloglist?"
23 |
24 |
25 | cookie_save_file = "cookie.txt"  # file that caches the cookie
26 | cookie_update_time_file = "cookie_timestamp.txt"  # file that stores the cookie refresh timestamp
27 | image_result_file = "image_result.md"  # file that stores the crawled image results
28 |
29 |
30 | # username = 'your weibo accounts'  ## your Weibo username
31 | # password = 'your weibo password'  ## your Weibo password
32 |
33 | person_site_name = "mrj168"  # custom domain of the account to crawl; if it has none, use u/ + its Weibo id, e.g. u/12345678
34 | weibo_id = "1837498771"  # the Weibo id: open the account on web Weibo, view the page source and look for $CONFIG['oid']='1837498771';
35 | page_size = 5  # number of pages to crawl
36 |
37 |
38 |
39 |
40 |
41 |
42 | headers = {  # adjust User-Agent to match your own machine/browser
43 | 'Accept': '*/*',
44 | 'Accept-Encoding': 'gzip, deflate, sdch',
45 | 'Accept-Language':'zh-CN,zh;q=0.8,en;q=0.6',
46 | 'Cache-Control':'no-cache',
47 | 'Connection':'keep-alive',
48 | 'Content-Type':'application/x-www-form-urlencoded',
49 | 'Host':'weibo.com',
50 | 'Pragma':'no-cache',
51 | 'Referer':'http://weibo.com/u/3278620272?profile_ftype=1&is_all=1',
52 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
53 | 'X-Requested-With':'XMLHttpRequest'
54 | }
55 |
56 | def get_timestamp():  # current system timestamp in milliseconds
57 | try:
58 | tamp = time.time()
59 | timestamp = str(int(tamp))+"000"
60 | print timestamp
61 | return timestamp
62 | except Exception, e:
63 | print e
64 | finally:
65 | pass
66 |
67 | def login_weibo_get_cookies():  # log in and collect the cookies
68 | time.sleep(2)
69 | driver.find_element_by_name("username").send_keys(username)##输入用户名
70 | driver.find_element_by_name("password").send_keys(password)##输入密码
71 | driver.find_element_by_xpath("//a[@node-type='submitBtn']").click()##点击登录按钮
72 | cookies = driver.get_cookies()##获取cookies
73 | print cookies
74 | cookie = ""
75 | ## convert the returned cookie list into the cookie header string Weibo expects
76 | for x in xrange(len(cookies)):
77 | value = cookies[x]['name']+"="+cookies[x]['value']+";"
78 | cookie = cookie+value
79 | print cookie
80 | return cookie
81 |
82 | def save_cookie(cookie):  # write the cookie to a local file
83 | try:
84 | f= open(cookie_save_file,'w')
85 | f.write(cookie)
86 | f.close()
87 | except Exception, e:
88 | print e
89 | finally:
90 | pass
91 |
92 | def get_cookie_from_txt():  # read the cookie from the local file
93 | f = open(cookie_save_file)
94 | cookie = f.read()
95 | print cookie
96 | return cookie
97 |
98 | def save_cookie_update_timestamp(timestamp):  # write the cookie refresh timestamp to a local file
99 | try:
100 | f= open(cookie_update_time_file,'w')
101 | f.write(timestamp)
102 | f.write('\n')
103 | f.close()
104 | except Exception, e:
105 | print e
106 | finally:
107 | pass
108 |
109 | def get_cookie_update_time_from_txt():  # read the time the cookie was last refreshed
110 | try:
111 | f = open(cookie_update_time_file)
112 | lines = f.readlines()
113 | cookie_update_time = lines[0]
114 | print cookie_update_time
115 | return cookie_update_time
116 | except Exception, e:
117 | print e
118 | finally:
119 | pass
120 |
121 | def write_image_urls(image_list):
122 | try:
123 | f= open(image_result_file,'a+')
124 | for x in xrange(len(image_list)):
125 | image = image_list[x]
126 | show_image = "![](" + image + ")"  # markdown image embed written to image_result.md
127 | f.write(show_image.encode("utf-8"))
128 | f.write('\n')
129 | f.close()
130 | except Exception, e:
131 | print e
132 | finally:
133 | pass
134 |
135 |
136 | def is_valid_cookie():  # check whether the cached cookie is still valid (refreshed within the last 6 hours)
137 | if os.path.isfile(cookie_update_time_file)==False:
138 | return False
139 | else :
140 | f = open(cookie_update_time_file)
141 | lines = f.readlines()
142 | if len(lines) == 0:
143 | return False
144 | else :
145 | last_time_stamp = get_cookie_update_time_from_txt()
146 | if long(get_timestamp()) - long(last_time_stamp) > 6*60*60*1000:
147 | return False
148 | else :
149 | return True
150 |
151 | def get_object_weibo_by_weibo_id_and_cookie(weibo_id,person_site_name,cookie,pagebar,page):  # call the mblog feed API with the Weibo id and cookie
152 | try:
153 | headers["Cookie"] = cookie
154 | headers['Referer'] = weibo_url+person_site_name+"?profile_ftype=1&is_all=1"
155 | request_params["__rnd"] = get_timestamp()
156 | request_params["page"] = page
157 | request_params["pre_page"] = page
158 | request_params["pagebar"] = pagebar
159 | request_params["id"] = "100505"+weibo_id
160 | request_params["script_uri"] = "/"+person_site_name
161 | request_params["pl_name"] = "Pl_Official_MyProfileFeed__22"
162 | request_params["profile_ftype"] = 1
163 | response = requests.get(requset_url,headers=headers,params=request_params)
164 | print response.url
165 | html = response.json()["data"]
166 | return html
167 | except Exception, e:
168 | print e
169 | finally:
170 | pass
171 |
172 |
173 | def get_object_top_weibo_by_person_site_name_and_cookie(person_site_name,cookie,page):  # fetch the weibos at the top of each profile page
174 | try:
175 | profile_url = weibo_url+person_site_name+"?"
176 | headers["Cookie"] = cookie
177 | profile_request_params["page"] = page
178 | response = requests.get(profile_url,headers=headers,params=profile_request_params)
179 | print response.url
180 | html = response.text
181 | soup = BeautifulSoup(html,"html.parser")
182 | script_list = soup.find_all("script")
183 | script_size = len(script_list)
184 | print "script_size:"+str(script_size)
185 | tag = 0
186 | for x in xrange(script_size):
187 | if "WB_feed WB_feed_v3 WB_feed_v4" in str(script_list[x]):
188 | tag = x
189 | print "tag:"+str(tag)
190 | # print script_list[script_size-1]
191 | html_start = str(script_list[tag]).find("