(.*?)', unquote_res)[0],
80 | # 视频点赞数
81 | other_data[0],
82 | # 视频评论数
83 | other_data[1],
84 | # 视频收藏数
85 | other_data[2],
86 | # 视频的发布日期
87 | re.findall('(.*?)', unquote_res)[0].split('>')[-1]
88 | ]
89 | return item
90 |
91 | def save_video(self, item, video_url, author_name):
92 | with open(f'{author_name}/{item[1]}.mp4', 'wb') as w:
93 | res = requests.get(video_url).content
94 | w.write(res)
95 | print(item[1], " --- 保存完成!")
96 | with open(f'{author_name}/{author_name}_所有视频信息.csv', 'a+') as a:
97 |             if a.tell() == 0:  # 'a+' 模式打开后指针位于文件末尾,位置为 0 说明文件为空,需先写入表头
98 |                 a.write('视频链接,视频标题,点赞数,评论数,收藏数,发布日期\n')
99 | a.write(','.join(item) + '\n')
100 |
101 |
102 | if __name__ == '__main__':
103 | dv = DownloadVideo('https://www.douyin.com/user/MS4wLjABAAAAkvysSgdqmkgtgucxkirpMWFHbTeZgVOW7zcdUjU3jM4')
104 | dv.run()
105 |
--------------------------------------------------------------------------------
/fangtianxia_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨爬取房天下全部的楼盘数据✨
2 | - 郑州房天下官网:[https://zz.newhouse.fang.com/house/s/](https://zz.newhouse.fang.com/house/s/)
3 |
4 | - 爬取房天下全部的楼盘数据,包含:
5 | - 楼盘标签
6 | - 楼盘面积
7 | - 楼盘价格(平方米)
8 | - 楼盘的网页链接
9 | - 楼盘所在地址
10 | - 楼盘评论数
11 | - 爬取的数据存储方式:
12 |     - 通过`a`追加模式,将爬取的数据存储到`data/`文件夹下的json文件(写入方式见本页末尾的示意代码)
13 | - 该爬虫使用到的模块:
14 | - requests
15 | - time
16 | - json
17 | - lxml
18 | - re
19 |
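
- 追加写入的最小示意(仅为草图:函数名 `append_items`、参数 `items` 与 `path` 均为演示用的假设,思路与下文 `run_spider.py` 中的 `save_data` 一致):

```python
# 以抓取时间为键,把每一批楼盘数据追加写入 json 文件
import json
import time

def append_items(items, path="./data/郑州楼盘_数据.json"):
    now_date = time.strftime("%Y-%m-%d %H:%M", time.localtime())
    data = json.dumps(items, indent=1, ensure_ascii=False)
    with open(path, "a", encoding="utf-8") as w:
        w.write(f'"{now_date}": ' + data + ",\n")
```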
--------------------------------------------------------------------------------
/fangtianxia_spider/data/郑州楼盘_数据.json:
--------------------------------------------------------------------------------
1 | "2022-04-26 22:15": [
2 | {
3 | "title": "万科·民安 理想星光",
4 | "area": "—76~100平米",
5 | "price": "14000元/㎡",
6 | "link": "http://zz.newhouse.fang.com/loupan/2510152991.htm",
7 | "address": "惠济北三环南阳路交会处向西约300米",
8 | "comment": "44条评论"
9 | },
10 | {
11 | "title": "大溪地",
12 | "area": "—79~700平米",
13 | "price": "8200元/㎡",
14 | "link": "http://zz.newhouse.fang.com/loupan/2510665175.htm",
15 | "address": "[四至五环]荥阳中原西路与商隐路交汇处",
16 | "comment": "647条评论"
17 | },
18 | {
19 | "title": "华瑞紫韵城",
20 | "area": "—101~143平米",
21 | "price": "13500元/㎡",
22 | "link": "http://zz.newhouse.fang.com/loupan/2510819101.htm",
23 | "address": "[三至四环]中原建设西路与长椿路交叉口南北两侧",
24 | "comment": "303条评论"
25 | },
26 | {
27 | "title": "旭辉·一江雲著",
28 | "area": "—98~143平米",
29 | "price": "16000元/㎡",
30 | "link": "http://zz.newhouse.fang.com/loupan/2510152771.htm",
31 | "address": "惠济滨河路与清华园路交叉口向东200米",
32 | "comment": "45条评论"
33 | },
34 | {
35 | "title": "中建·澜溪苑",
36 | "area": "—79~118平米",
37 | "price": "10000元/㎡",
38 | "link": "http://zz.newhouse.fang.com/loupan/2510149437.htm",
39 | "address": "经开前程大道与浔江东路交会处向东200米",
40 | "comment": "109条评论"
41 | },
42 | {
43 | "title": "万科·新田 湖与城",
44 | "area": "—78~144平米",
45 | "price": "7200元/㎡",
46 | "link": "http://zz.newhouse.fang.com/loupan/2510148935.htm",
47 | "address": "[四至五环]荥阳五龙路与博学路交会处·未来生活体验场",
48 | "comment": "133条评论"
49 | },
50 | {
51 | "title": "保利·天汇",
52 | "area": "—98~141平米",
53 | "price": "19500元/㎡",
54 | "link": "http://zz.newhouse.fang.com/loupan/2510149275.htm",
55 | "address": "[二至三环]经开中州大道·航海路·中原福塔北300米",
56 | "comment": "165条评论"
57 | },
58 | {
59 | "title": "美盛教育港湾",
60 | "area": "—89~144平米",
61 | "price": "19500元/㎡",
62 | "link": "http://zz.newhouse.fang.com/loupan/2510148595.htm",
63 | "address": "[三至四环]金水文化路与国基路交汇处",
64 | "comment": "167条评论"
65 | },
66 | {
67 | "title": "富田城·九鼎公馆",
68 | "area": "—76~142平米",
69 | "price": "14300元/㎡起",
70 | "link": "http://zz.newhouse.fang.com/loupan/2510148083.htm",
71 | "address": "[三至四环]管城南三环金岱路(郑尉路)交会处",
72 | "comment": "194条评论"
73 | },
74 | {
75 | "title": "金沙湖高尔夫观邸",
76 | "area": "—70~398平米",
77 | "price": "15500元/㎡起",
78 | "link": "http://zz.newhouse.fang.com/loupan/2510726519.htm",
79 | "address": "[三至四环]经开南三环与第五大街下桥口南500米",
80 | "comment": "533条评论"
81 | },
82 | {
83 | "title": "美盛·金水印",
84 | "area": "—108~165平米",
85 | "price": "23500元/㎡",
86 | "link": "http://zz.newhouse.fang.com/loupan/2510152929.htm",
87 | "address": "[三至四环]金水渠东路与北三环交汇处",
88 | "comment": "37条评论"
89 | },
90 | {
91 | "title": "保利和光屿湖",
92 | "area": "—90~140平米",
93 | "price": "12500元/㎡",
94 | "link": "http://zz.newhouse.fang.com/loupan/2510149583.htm",
95 | "address": "高新双湖科技城创新大道与青梅街交叉口向东100米路北",
96 | "comment": "70条评论"
97 | },
98 | {
99 | "title": "新城时光印象",
100 | "area": "—98~143平米",
101 | "price": "15800元/㎡",
102 | "link": "http://zz.newhouse.fang.com/loupan/2510152741.htm",
103 | "address": "[三至四环]管城南三环文治路南500米",
104 | "comment": "89条评论"
105 | },
106 | {
107 | "title": "远洋沁园",
108 | "area": "—82~138平米",
109 | "price": "7200元/㎡",
110 | "link": "http://zz.newhouse.fang.com/loupan/2510149009.htm",
111 | "address": "[五环以外]荥阳洞林湖与五龙路交汇处",
112 | "comment": "90条评论"
113 | },
114 | {
115 | "title": "碧桂园天玺湾",
116 | "area": "—76~143平米",
117 | "price": "17000元/㎡",
118 | "link": "http://zz.newhouse.fang.com/loupan/2510152717.htm",
119 | "address": "金水杨金路与博学路东南角",
120 | "comment": "82条评论"
121 | },
122 | {
123 | "title": "绿都·东澜岸",
124 | "area": "—89~140平米",
125 | "price": "9800元/㎡",
126 | "link": "http://zz.newhouse.fang.com/loupan/2510148633.htm",
127 | "address": "[四至五环]经开南三环与龙飞北街交会处向北500米",
128 | "comment": "261条评论"
129 | },
130 | {
131 | "title": "锦艺四季城",
132 | "area": "—92~96平米",
133 | "price": "5300元/㎡",
134 | "link": "http://zz.newhouse.fang.com/loupan/2510815251.htm",
135 | "address": "[三至四环]惠济京广快速路与天河路交接处北500米路东",
136 | "comment": "251条评论"
137 | },
138 | {
139 | "title": "兴港永威南樾",
140 | "area": "—87~141平米",
141 | "price": "8500元/㎡起",
142 | "link": "http://zz.newhouse.fang.com/loupan/2510785327.htm",
143 | "address": "[五环以外]航空港区桥航路与凌空街交会处东南角",
144 | "comment": "844条评论"
145 | },
146 | {
147 | "title": "康桥山海云图",
148 | "area": "—89~121平米",
149 | "price": "11000元/㎡起",
150 | "link": "http://zz.newhouse.fang.com/loupan/2510152725.htm",
151 | "address": "高新长椿路开元路·河工大北侧",
152 | "comment": "139条评论"
153 | }
154 | ],
155 | "2022-04-26 22:15": [
156 | {
157 | "title": "雅宝龙湖·天钜",
158 | "area": "—89~149平米",
159 | "price": "18000元/㎡起",
160 | "link": "http://zz.newhouse.fang.com/loupan/2510149563.htm",
161 | "address": "[一至二环]管城航海路与城东南路交汇处向北800米",
162 | "comment": "108条评论"
163 | },
164 | {
165 | "title": "腾威城",
166 | "area": "—86~154平米",
167 | "price": "15500元/㎡起",
168 | "link": "http://zz.newhouse.fang.com/loupan/2510148421.htm",
169 | "address": "[一至二环]金水郑汴路与英协路交叉口向南300米",
170 | "comment": "98条评论"
171 | },
172 | {
173 | "title": "郑地·美景 紫华城",
174 | "area": "—98~128平米",
175 | "price": "12000元/㎡",
176 | "link": "http://zz.newhouse.fang.com/loupan/2510149621.htm",
177 | "address": "郑东新区中原大道与凤栖街交汇处南/地铁八号线龙王庙站南800米",
178 | "comment": "47条评论"
179 | },
180 | {
181 | "title": "华润置地新时代广场",
182 | "area": "—29~49平米",
183 | "price": "12500元/㎡",
184 | "link": "http://zz.newhouse.fang.com/loupan/2510148343.htm",
185 | "address": "[三至四环]郑东新区商鼎路与博学路交汇处",
186 | "comment": "100条评论"
187 | },
188 | {
189 | "title": "保利云上",
190 | "area": "—96~129平米",
191 | "price": "11000元/㎡起",
192 | "link": "http://zz.newhouse.fang.com/loupan/2510152767.htm",
193 | "address": "二七郑密路双铁路交会处向西1300米路南",
194 | "comment": "108条评论"
195 | },
196 | {
197 | "title": "金地西湖春晓",
198 | "area": "—94~122平米",
199 | "price": "11800元/㎡",
200 | "link": "http://zz.newhouse.fang.com/loupan/2510149497.htm",
201 | "address": "[四至五环]中原中原路与杭州路交会处西南",
202 | "comment": "150条评论"
203 | },
204 | {
205 | "title": "中建·滨水苑",
206 | "area": "—94~141平米",
207 | "price": "11000元/㎡",
208 | "link": "http://zz.newhouse.fang.com/loupan/2510148653.htm",
209 | "address": "[五环以外]航空港区桥航路与凌空街交汇处向北100米",
210 | "comment": "105条评论"
211 | },
212 | {
213 | "title": "美盛象湖100",
214 | "area": "—27~33平米",
215 | "price": "8700元/㎡起",
216 | "link": "http://zz.newhouse.fang.com/loupan/2510148389.htm",
217 | "address": "郑东新区金水东路与凤栖街交汇处",
218 | "comment": "94条评论"
219 | },
220 | {
221 | "title": "蓝城·凤起梧桐",
222 | "area": "—137~155平米",
223 | "price": "19000元/㎡",
224 | "link": "http://zz.newhouse.fang.com/loupan/2510148187.htm",
225 | "address": "[四至五环]金水中州大道与杨金路交汇处东360米",
226 | "comment": "109条评论"
227 | },
228 | {
229 | "title": "融创空港宸院",
230 | "area": "—89~138平米",
231 | "price": "9500元/㎡起",
232 | "link": "http://zz.newhouse.fang.com/loupan/2510148561.htm",
233 | "address": "[五环以外]航空港区长安路鄱阳湖路交汇处",
234 | "comment": "164条评论"
235 | },
236 | {
237 | "title": "兴港和昌·凌云筑",
238 | "area": "—95~115平米",
239 | "price": "10500元/㎡",
240 | "link": "http://zz.newhouse.fang.com/loupan/2510149227.htm",
241 | "address": "[四至五环]经开朗星路以南、龙善街以西、美辰路以北、龙真街以东",
242 | "comment": "186条评论"
243 | },
244 | {
245 | "title": "远洋臻园",
246 | "area": "—89~119平米",
247 | "price": "11800元/㎡起",
248 | "link": "http://zz.newhouse.fang.com/loupan/2510152775.htm",
249 | "address": "[四至五环]二七大学南路与芦庄路交汇处向西600米",
250 | "comment": "54条评论"
251 | },
252 | {
253 | "title": "新城郡望府",
254 | "area": "—89~125平米",
255 | "price": "6200元/㎡",
256 | "link": "http://zz.newhouse.fang.com/loupan/2510148183.htm",
257 | "address": "[四至五环]荥阳郑上路与飞龙路交汇处东南侧",
258 | "comment": "128条评论"
259 | },
260 | {
261 | "title": "正商美誉铭筑",
262 | "area": "—36~132平米",
263 | "price": "7600元/㎡",
264 | "link": "http://zz.newhouse.fang.com/loupan/2510780543.htm",
265 | "address": "[三至四环]管城南三环郑新快速路往南300米路东",
266 | "comment": "368条评论"
267 | },
268 | {
269 | "title": "融侨悦城",
270 | "area": "—89~138平米",
271 | "price": "10400元/㎡",
272 | "link": "http://zz.newhouse.fang.com/loupan/2510148379.htm",
273 | "address": "[三至四环]中原航海西路(郑少高速连接线)与西四环交会处",
274 | "comment": "86条评论"
275 | },
276 | {
277 | "title": "世茂振兴璀璨熙湖",
278 | "area": "—106~142平米",
279 | "price": "14000元/㎡起",
280 | "link": "http://zz.newhouse.fang.com/loupan/2510148381.htm",
281 | "address": "[三至四环]中原陇海西路与汇智路交叉口往南800米",
282 | "comment": "105条评论"
283 | },
284 | {
285 | "title": "保利金茂时光悦",
286 | "area": "—71~130平米",
287 | "price": "10500元/㎡",
288 | "link": "http://zz.newhouse.fang.com/loupan/2510149119.htm",
289 | "address": "[四至五环]经开浔江东路与蓝湖街交叉口向北300米",
290 | "comment": "189条评论"
291 | },
292 | {
293 | "title": "坤达江山筑",
294 | "area": "—89~147平米",
295 | "price": "7300元/㎡",
296 | "link": "http://zz.newhouse.fang.com/loupan/2510148583.htm",
297 | "address": "[四至五环]新郑紫荆山南路和新老107连接线交汇处",
298 | "comment": "149条评论"
299 | },
300 | {
301 | "title": "融创森屿墅",
302 | "area": "—143~165平米",
303 | "price": "8000元/㎡起",
304 | "link": "http://zz.newhouse.fang.com/loupan/2510148961.htm",
305 | "address": "[五环以外]荥阳滨河路与陇海快速路交汇处",
306 | "comment": "71条评论"
307 | },
308 | {
309 | "title": "奥园·悦城",
310 | "area": "—89~112平米",
311 | "price": "5800元/㎡",
312 | "link": "http://zz.newhouse.fang.com/loupan/2510149491.htm",
313 | "address": "新郑大学南路与g107连接线东北角",
314 | "comment": "51条评论"
315 | }
316 | ],
317 |
--------------------------------------------------------------------------------
/fangtianxia_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/26 18:38
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | # https://zz.newhouse.fang.com/house/s/
7 | import requests
8 | from lxml import etree
9 | import re
10 | import json
11 | import time
12 | from ua_pool import get_user_agent
13 |
14 |
15 | def run():
16 | # 获取总页数
17 | page_number = get_page_number(first_url)
18 | for i in range(2, page_number+1):
19 | time.sleep(1)
20 | url = f"{first_url}b9{str(i)}/"
21 | parse_page(url)
22 | print(f"第<{i}>页数据保存完毕!")
23 |
24 |
25 | def get_page_number(url):
26 | global city_name
27 | res = requests.get(url, headers=headers)
28 | html = etree.HTML(res.text)
29 | # 城市名称
30 | city_name = html.xpath('//ul[@class="tf f12"]/li[2]/a/text()')[0]
31 | # 提取页码
32 | page_number = html.xpath('//div[@class="otherpage"]/span[2]/text()')[0]
33 | return int(re.findall(r"(\d+)", page_number)[0])
34 |
35 |
36 | def parse_page(url):
37 | res = requests.get(url, headers=headers)
38 | html = etree.HTML(res.text)
39 | div_list = html.xpath('//*[@id="newhouse_loupan_list"]/ul/li/div[1]/div[2]')
40 | # print(len(div_list))
41 | all_data_list = []
42 | for div in div_list:
43 | try:
44 | item = dict()
45 | item["title"] = div.xpath('./div[1]/div[1]/a/text()')[0].strip(" \t\n")
46 | item["area"] = div.xpath('./div[2]//text()')[-1].strip(" \t\n")
47 | item["price"] = div.xpath('./div[5]/span/text()')[0].strip(" \t\n")
48 | try:
49 | item["price"] = item["price"] + div.xpath('./div[5]/em/text()')[0].strip(" \t\n")
50 | except IndexError:
51 | pass
52 | item["link"] = div.xpath('./div[1]/div[1]/a/@href')[0].strip(" \t\n")
53 | item["address"] = div.xpath('./div[3]/div/a/@title')[0].strip(" \t\n")
54 | item["comment"] = div.xpath('./div[1]/div[2]/a/span/text()')[0].strip(" ()\t\n")
55 | # 展示数据
56 | print(item)
57 | all_data_list.append(item)
58 | except IndexError:
59 | pass
60 | save_data(all_data_list)
61 |
62 |
63 | def save_data(item):
64 | if len(item) != 0:
65 | date = time.localtime()
66 | now_date = time.strftime("%Y-%m-%d %H:%M", date)
67 | data = json.dumps(item, indent=1, ensure_ascii=False)
68 | with open(f"./data/{city_name}_数据.json", "a", encoding="utf-8") as w:
69 | w.write(f'"{now_date}": ' + data + ",\n")
70 |
71 |
72 | if __name__ == '__main__':
73 | # 房价首页链接
74 | first_url = "https://zz.newhouse.fang.com/house/s/"
75 | # 城市名称
76 | city_name = ""
77 | headers = {
78 |         "user-agent": get_user_agent(),
79 | }
80 | run()
81 |
--------------------------------------------------------------------------------
/fangtianxia_spider/ua_pool.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/16 21:13
3 | # @Author : Torres-圣君
4 | # @File : ua_pool.py
5 | # @Software : PyCharm
6 | import random
7 |
8 |
9 | def get_user_agent():
10 | # UA池
11 | user_agent_list = [
12 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
13 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
14 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
15 | "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
16 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
17 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
18 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
19 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
20 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
21 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
22 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
23 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
24 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
25 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
26 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
27 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
28 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
29 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
30 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
31 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
32 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
33 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
34 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
35 | "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
36 | "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
37 | "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
38 | "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
39 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
40 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
41 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
42 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
43 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
44 | "UCWEB7.0.2.37/28/999",
45 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39",
46 | "NOKIA5700/ UCWEB7.0.2.37/28/999",
47 | "Openwave/ UCWEB7.0.2.37/28/999",
48 | "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
49 | "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
50 | ]
51 | # 设置UA伪装
52 | return random.choice(user_agent_list)
--------------------------------------------------------------------------------
/gupiao_rank_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨获取东方财富个股人气榜✨
2 | - 东方财富个股人气榜官网:[http://guba.eastmoney.com/rank/](http://guba.eastmoney.com/rank/)
3 |
4 | - 获取东方财富个股人气榜的实时数据,包含:
5 | - 当前排名
6 | - 排名较昨日变动
7 | - 股票代码
8 | - 股票名称
9 | - 最新价
10 | - 涨跌额
11 | - 涨跌幅
12 | - 最高价
13 | - 最低价
14 |
15 | ```
16 | 通过抓包获取到接口后,发现接口返回的是 AES 加密数据
17 | 这里通过拿到关键参数,利用 js 还原加密的密钥和偏移量
18 | 
19 | 使用 Python 的第三方模块'Crypto',对 AES 的 CBC 模式进行解密
20 | 通过解密后的数据,获取每只股票对应的代码
21 | 再对不同市场的代码进行分析和改写,最终构建完整的 params
22 | 最后携带 params 参数对接口发送请求,再提取关键数据,将其存储到 data 目录下(解密调用的最小示意见本页末尾)
23 | ```
24 |
25 | - 该爬虫使用到的模块:
26 | - requests
27 | - time
28 | - json
29 | - openpyxl
30 | - Crypto
31 | - base64
32 |
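
- 解密部分的最小示意(仅为草图:函数名与入参均为演示用的占位,密钥、偏移量需按上文方式从 js 中还原,依赖 `Crypto` 模块):

```python
# AES(CBC 模式)+ base64 的解密草图,思路与 decryption_AES.py 一致
import base64
from Crypto.Cipher import AES

def aes_cbc_decrypt(key: str, iv: str, b64_data: str) -> str:
    raw = base64.b64decode(b64_data)                                   # 先做 base64 解码
    cipher = AES.new(key.encode("utf8"), AES.MODE_CBC, iv.encode("utf8"))
    plain = cipher.decrypt(raw)
    plain = plain[:-plain[-1]]                                         # 去除末尾的补位字节
    return plain.decode("utf8")
```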
--------------------------------------------------------------------------------
/gupiao_rank_spider/data/A股市场_人气榜.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/gupiao_rank_spider/data/A股市场_人气榜.xlsx
--------------------------------------------------------------------------------
/gupiao_rank_spider/data/港股市场_人气榜.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/gupiao_rank_spider/data/港股市场_人气榜.xlsx
--------------------------------------------------------------------------------
/gupiao_rank_spider/data/美股市场_人气榜.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/gupiao_rank_spider/data/美股市场_人气榜.xlsx
--------------------------------------------------------------------------------
/gupiao_rank_spider/decryption_AES.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/6/23 9:58
3 | # @Author : Torres-圣君
4 | # @File : decryption_AES.py
5 | # @Software : PyCharm
6 | from Crypto.Cipher import AES
7 | import base64
8 |
9 | BLOCK_SIZE = 16 # Bytes
10 | pad = lambda s: s + (BLOCK_SIZE - len(s) % BLOCK_SIZE) * chr(BLOCK_SIZE - len(s) % BLOCK_SIZE)
11 | unpad = lambda s: s[:-ord(s[len(s) - 1:])]
12 |
13 |
14 | # 密钥(key), 偏移量(iv) CBC模式加密
15 |
16 | def AES_Decrypt(key, vi, data):
17 | data = data.encode('utf8')
18 | encodebytes = base64.decodebytes(data)
19 |     # 将加密数据转换为bytes类型数据
20 | cipher = AES.new(key.encode('utf8'), AES.MODE_CBC, vi.encode('utf8'))
21 | text_decrypted = cipher.decrypt(encodebytes)
22 | # 去补位
23 | text_decrypted = unpad(text_decrypted)
24 | text_decrypted = text_decrypted.decode('utf8')
25 | return text_decrypted
26 |
--------------------------------------------------------------------------------
/gupiao_rank_spider/get_message.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/6/23 15:19
3 | # @Author : Torres-圣君
4 | # @File : get_message.py
5 | # @Software : PyCharm
6 | import requests
7 |
8 |
9 | class OtherData:
10 | def __init__(self, headers):
11 | self.url = "https://push2.eastmoney.com/api/qt/ulist.np/get"
12 | self.headers = headers
13 |
14 | def join_type1_params(self, page_data):
15 | secids_list = []
16 | for i in page_data:
17 | # HK_开头的为港股市场,代码前加116.
18 | if i[2].startswith('HK_'):
19 | i[2] = i[2].replace("HK_", "")
20 | secids_list.append(f'116.{i[2]}')
21 | # NASDAQ_开头的为美股市场,代码前加105.
22 | elif i[2].startswith('NASDAQ_'):
23 | i[2] = i[2].replace("NASDAQ_", "")
24 | secids_list.append(f'105.{i[2]}')
25 | # NYSE_开头的为美股市场,代码前加106.
26 | elif i[2].startswith('NYSE_'):
27 | i[2] = i[2].replace("NYSE_", "")
28 | secids_list.append(f'106.{i[2]}')
29 | # AMEX_开头的为美股市场,代码前加107.
30 | elif i[2].startswith('AMEX_'):
31 | i[2] = i[2].replace("AMEX_", "")
32 | secids_list.append(f'107.{i[2]}')
33 | # 数字6开头的为A股市场,代码前加1.
34 | elif i[2].startswith('6'):
35 | secids_list.append(f'1.{i[2]}')
36 | else:
37 | secids_list.append(f'0.{i[2]}')
38 | params = {
39 | "fltt": 2,
40 | "np": 3,
41 | "ut": "a79f54e3d4c8d44e494efb8f748db291",
42 | "invt": 2,
43 | "secids": ",".join(secids_list),
44 | "fields": "f1,f2,f3,f4,f12,f13,f14,f152,f15,f16",
45 | }
46 | print(params)
47 | return params
48 |
49 | def get_response(self, page_data):
50 | params = self.join_type1_params(page_data)
51 | res = requests.get(self.url, headers=self.headers, params=params).json()
52 | page_other_data = []
53 | for data in res['data']['diff']:
54 | item = [
55 | data['f14'],
56 | data['f2'],
57 | data['f4'],
58 | str(data['f3'])+'%',
59 | data['f15'],
60 | data['f16']
61 | ]
62 | page_other_data.append(item)
63 | print(item)
64 | # 拼接完整的股票数据并返回
65 | page_all_data = [page_data[i]+page_other_data[i] for i in range(len(page_data))]
66 | return page_all_data
67 |
--------------------------------------------------------------------------------
/gupiao_rank_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/6/23 9:53
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import time
7 | import requests
8 | import json
9 | from decryption_AES import AES_Decrypt
10 | from get_message import OtherData
11 | from openpyxl import Workbook
12 | from openpyxl import load_workbook
13 |
14 |
15 | class GetAESData:
16 | def __init__(self):
17 | self.url = 'http://gbcdn.dfcfw.com/rank/popularityList.js'
18 | self.headers = {
19 | "Referer": "http://guba.eastmoney.com/",
20 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44"
21 | }
22 | self.gt = OtherData(self.headers)
23 | self.count = 1
24 |
25 | def run(self):
26 | types_list = ['A股市场', '港股市场', '美股市场']
27 | for types in range(0, 3):
28 | # 循环获取前五页top100
29 | for page in range(1, 6):
30 | time.sleep(1)
31 | print(f"正在获取第{page}页数据!")
32 | # 构建请求参数
33 | params = self.build_params(types, page)
34 | # 解密数据
35 | decrypt_data = self.get_response(params)
36 | # 获取页面数据
37 | page_all_data = self.format_data(decrypt_data)
38 | # 保存页面数据
39 | self.save_data(types_list[types], page_all_data)
40 | print(f"第{page}页数据保存完成!")
41 | # 计数器归1
42 | self.count = 1
43 |
44 | def build_params(self, types, page):
45 | """
46 | type: 0
47 | sort: 0
48 | page: 1
49 | v: 2022_6_23_9_56
50 | """
51 | t = time.localtime()
52 | time_list = time.strftime("%Y_%m_%d_%H_%M", t).split('_')
53 | now = '_'.join([i[-1] if i.startswith('0') else i for i in time_list])
54 | params = {
55 | "type": types,
56 | "sort": 0,
57 | "page": page,
58 | "v": now
59 | }
60 | print(params)
61 | return params
62 |
63 | def get_response(self, params):
64 | res = requests.get(self.url, headers=self.headers, params=params).text
65 | # 加密数据
66 | aes_data = res.split("'")[1]
67 | # 密钥
68 | key = 'ae13e0ad97cdd6e12408ac5063d88721'
69 | # 偏移量
70 | vi = 'getClassFromFile'
71 | # 使用AES解密
72 | decrypt_data = AES_Decrypt(key, vi, aes_data)
73 | return decrypt_data
74 |
75 | def format_data(self, decrypt_data):
76 | json_data = json.loads(decrypt_data)
77 | page_data = []
78 | for everyone in json_data:
79 | item = [
80 | everyone['rankNumber'],
81 | everyone['changeNumber'],
82 | everyone['code']
83 | ]
84 | page_data.append(item)
85 | print(item)
86 | page_all_data = self.gt.get_response(page_data)
87 | return page_all_data
88 |
89 | def save_data(self, title, page_all_data):
90 | # 首次保存需创建表格,并写入表头信息
91 | if self.count == 1:
92 | wb = Workbook()
93 |             # 创建新的工作簿
94 | sheet = wb.create_sheet('sheet1', -1)
95 | # 添加表头信息
96 | data_header = ['当前排名', '排名较昨日变动', '股票代码', '股票名称', '最新价', '涨跌额', '涨跌幅', '最高价', '最低价']
97 | page_all_data.insert(0, data_header)
98 | else:
99 |             # 读取已有的工作簿
100 | wb = load_workbook(f'./data/{title}_人气榜.xlsx')
101 | sheet = wb["sheet1"]
102 | for x in range(len(page_all_data)):
103 | for y in range(len(page_all_data[x])):
104 | sheet.cell(x + self.count, y + 1).value = page_all_data[x][y]
105 | # 保存表格并追加计数
106 | wb.save(f'./data/{title}_人气榜.xlsx')
107 | self.count += len(page_all_data)
108 |
109 |
110 | if __name__ == '__main__':
111 | aes = GetAESData()
112 | aes.run()
113 |
--------------------------------------------------------------------------------
/gupiao_rank_spider/获取密钥和偏移量.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Title
6 |
7 |
8 |
9 |
10 |
16 |
49 |
--------------------------------------------------------------------------------
/huya_all_types_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨获取虎牙直播平台在播用户✨
2 | - 虎牙直播全分类网站:[https://www.huya.com/g](https://www.huya.com/g)
3 |
4 | - 抓取虎牙全部分类下的在播主播信息,包含:
5 | - 主播头像链接
6 | - 主播昵称
7 | - 房间ID号
8 | - 房间标题
9 | - 房间标签
10 | - 直播链接
11 | - 爬取的数据存储方式:
12 | - 文件`all_types_msg.json`存放了虎牙平台实时分类的信息,包含分类的名称、分类的链接、分类的ID
13 |     - 通过使用`openpyxl`模块,将爬取的数据存储到`data`文件夹下的`xxx_直播用户信息.xlsx`表格(写入方式见本页末尾的示意代码)
14 | - 关于主播头像,本打算直接在Excel中插入图片,但考虑速度问题,暂以链接形式填充
15 | - 该爬虫使用到的模块:
16 | - threading
17 | - requests
18 | - json
19 | - time
20 | - lxml
21 | - openpyxl
22 | - random
23 |
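
- `openpyxl` 写入表格的最小示意(仅为草图:`rows` 为编造的示例数据,文件名仅作演示,思路与 `get_types_user_msg.py` 中的 `save_data` 一致):

```python
# 按行列索引把二维列表写入 xlsx 表格
from openpyxl import Workbook

rows = [
    ["主播头像", "主播昵称", "房间ID号", "房间标题", "房间标签", "直播链接"],
    ["(头像链接)", "示例主播", "123456", "示例标题", "示例标签", "https://www.huya.com/123456"],
]
wb = Workbook()
sheet = wb.create_sheet("示例分类", -1)
for x, row in enumerate(rows, start=1):
    for y, value in enumerate(row, start=1):
        sheet.cell(x, y).value = value
wb.save("./data/示例分类_直播用户信息.xlsx")
```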
--------------------------------------------------------------------------------
/huya_all_types_spider/data/Apex英雄_直播用户信息.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/huya_all_types_spider/data/Apex英雄_直播用户信息.xlsx
--------------------------------------------------------------------------------
/huya_all_types_spider/data/all_types_msg.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/huya_all_types_spider/data/all_types_msg.json
--------------------------------------------------------------------------------
/huya_all_types_spider/get_proxyz.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/1 17:00
3 | # @Author : Torres-圣君
4 | # @File : get_proxyz.py
5 | # @Software : PyCharm
6 | import random
7 |
8 |
9 | def get_proxies():
10 | proxies_list = [
11 | {
12 | "https": "https://58.220.95.42:10174"
13 | }, {
14 | "https": "https://118.163.13.200:8080"
15 | }, {
16 | "http": "http://223.96.90.216:8085"
17 | }, {
18 | "http": "http://165.225.202.95:10605"
19 | }, {
20 | "https": "https://139.198.157.59:7890"
21 | }, {
22 | "http": "http://120.220.220.95:8085"
23 | }, {
24 | "http": "http://182.61.201.201:80"
25 | }, {
26 | "http": "http://165.225.206.106:10605"
27 | }, {
28 | "https": "https://117.26.40.251:3712"
29 | }, {
30 | "http": "http://39.130.150.43:80"
31 | }, {
32 | "https": "https://103.38.80.138:3128"
33 | }, {
34 | "http": "http://39.130.150.42:80"
35 | }, {
36 | "http": "http://113.96.62.246:8081"
37 | }, {
38 | "http": "http://39.130.150.44:80"
39 | }, {
40 | "http": "http://112.6.117.135:8085"
41 | }, {
42 | "http": "http://39.130.150.44:80"
43 | }, {
44 | "http": "http://165.225.76.175:10605"
45 | }, {
46 | "https": "https://223.112.99.150:80"
47 | }, {
48 | "http": "http://39.130.150.44:80"
49 | }, {
50 | "https": "https://40.83.102.86:80"
51 | }, {
52 | "https": "https://113.21.237.83:443"
53 | }, {
54 | "http": "http://112.6.117.178:8085"
55 | }, {
56 | "http": "http://218.59.139.238:80"
57 | }, {
58 | "https": "https://210.5.10.87:53281"
59 | }, {
60 | "http": "http://183.247.199.153:30001"
61 | }, {
62 | "http": "http://112.6.117.178:8085"
63 | }, {
64 | "http": "http://47.113.90.161:83"
65 | }, {
66 | "https": "https://222.69.240.130:8001"
67 | }, {
68 | "https": "https://14.20.235.19:45770"
69 | }, {
70 | "http": "http://165.225.204.12:10605"
71 | }, {
72 | "http": "http://103.148.72.192:80"
73 | }, {
74 | "http": "http://165.225.76.165:10605"
75 | }, {
76 | "http": "http://120.220.220.95:8085"
77 | }, {
78 | "http": "http://103.37.141.69:80"
79 | }, {
80 | "https": "https://103.133.177.141:443"
81 | }, {
82 | "http": "http://223.96.90.216:8085"
83 | }, {
84 | "http": "http://120.220.220.95:8085"
85 | }, {
86 | "http": "http://221.122.91.60:80"
87 | }, {
88 | "https": "https://47.93.48.155:8888"
89 | }, {
90 | "http": "http://103.148.72.192:80"
91 | }, {
92 | "http": "http://120.220.220.95:8085"
93 | }, {
94 | "https": "https://42.193.253.152:8089"
95 | },
96 | ]
97 | return random.choice(proxies_list)
98 |
--------------------------------------------------------------------------------
/huya_all_types_spider/get_types_user_msg.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/5/18 18:13
3 | # @Author : Torres-圣君
4 | # @File : get_types_user_msg.py
5 | # @Software : PyCharm
6 | # https://www.huya.com/cache.php?m=LiveList&do=getLiveListByPage&gameId=1&tagAll=0&callback=getLiveListJsonpCallback&page=1
7 | import requests
8 | import json
9 | import re
10 | import threading
11 | from openpyxl import Workbook
12 | from get_proxyz import get_proxies
13 | from get_ua import get_ua
14 |
15 |
16 | class NowLiveUsers:
17 | def __init__(self, key, url, gameId):
18 | self.lock = threading.Lock()
19 | self.key = key
20 | self.url = "https://www.huya.com/cache.php"
21 | self.headers = {
22 | "Host": "www.huya.com",
23 | "Referer": url,
24 | "User-Agent": get_ua()
25 | }
26 | self.params = {
27 | "m": "LiveList",
28 | "do": "getLiveListByPage",
29 | "gameId": gameId,
30 | "tagAll": 0,
31 | "callback": "getLiveListJsonpCallback",
32 | "page": 1
33 | }
34 |
35 | def get_page_msg(self):
36 | # 创建一个用于汇总页面数据的列表
37 | all_users_data = [
38 | ["主播头像", "主播昵称", "房间ID号", "房间标题", "房间标签", "直播链接"]
39 | ]
40 | count = 0
41 | # 循环请求不同的页面
42 | while True:
43 |             # 启用线程锁,防止多线程下页码与数据错乱
44 | with self.lock:
45 | count += 1
46 | # 设置请求参数的页码值
47 | self.params['page'] = count
48 | # 对页面发送请求
49 | res = requests.get(self.url, headers=self.headers, params=self.params, proxies=get_proxies()).text
50 | # 使用re提取数据
51 | dict_data = re.findall(r'getLiveListJsonpCallback\((.*)\)', res)[0]
52 | json_data = json.loads(dict_data)
53 | data_list = json_data["data"]["datas"]
54 |                 # 页面返回不为空时继续提取数据;为空则保存并跳出循环
55 | if len(data_list) != 0:
56 | for data in data_list:
57 | user_data = [
58 | data['avatar180'],
59 | data['nick'],
60 | data['profileRoom'],
61 | data['roomName'],
62 | data['recommendTagName'],
63 | "https://www.huya.com/" + data['profileRoom']
64 | ]
65 | # 将数据添加进页面汇总列表
66 | all_users_data.append(user_data)
67 | # 展示数据
68 | print(all_users_data)
69 | else:
70 | # 保存数据到Excel表格
71 | self.save_data(all_users_data)
72 | break
73 |
74 | def save_data(self, all_users_data_list):
75 | # 创建新的excel表格
76 | wb = Workbook()
77 |         # 创建新的工作簿
78 | sheet = wb.create_sheet(self.key, -1)
79 | # 遍历表格索引,写入数据
80 | for x in range(len(all_users_data_list)):
81 | for y in range(len(all_users_data_list[x])):
82 | sheet.cell(x+1, y+1).value = all_users_data_list[x][y]
83 | # 保存该文件
84 | wb.save(f"./data/{self.key}_直播用户信息.xlsx")
85 |
--------------------------------------------------------------------------------
/huya_all_types_spider/get_ua.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/1 17:02
3 | # @Author : Torres-圣君
4 | # @File : get_ua.py
5 | # @Software : PyCharm
6 | import random
7 |
8 |
9 | def get_ua():
10 | user_agent_list = [
11 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
12 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
13 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
14 | "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
15 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
16 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
17 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
18 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
20 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
21 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
22 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
23 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
24 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
25 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
26 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
27 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
28 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
29 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
30 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
31 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
32 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
33 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
34 | "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
35 | "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
36 | "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
37 | "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
38 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
39 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
40 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
41 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
42 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
43 | "UCWEB7.0.2.37/28/999",
44 | "NOKIA5700/ UCWEB7.0.2.37/28/999",
45 | "Openwave/ UCWEB7.0.2.37/28/999",
46 | "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
47 | "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
48 | ]
49 | # 设置UA伪装
50 | return random.choice(user_agent_list)
51 |
--------------------------------------------------------------------------------
/huya_all_types_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/5/18 17:36
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | # https://www.huya.com/g 全部分类链接
7 | import threading
8 | import requests
9 | import json
10 | import time
11 | from lxml import etree
12 | from get_types_user_msg import NowLiveUsers
13 |
14 |
15 | class HuyaAllTypes:
16 | def __init__(self):
17 | self.url = "https://www.huya.com/g"
18 | self.headers = {
19 | "Host": "www.huya.com",
20 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47"
21 | }
22 |
23 | def get_types_url(self):
24 |         # 请求目标网站,并将页面中的不间断空格'\xa0'替换为普通空格
25 | res = requests.get(self.url, headers=self.headers).text.replace(u'\xa0', u' ')
26 | html = etree.HTML(res)
27 | all_links_list = html.xpath('//*[@id="js-game-list"]/li')
28 | print("共发现%d种分类" % len(all_links_list))
29 | # 创建字典,用于存放所有数据
30 | all_types_msg = dict()
31 | # 循环获取所有分类信息
32 | for all_links in all_links_list:
33 | # 字典的存放格式 --> {分类的名称 :[分类的链接, 分类的gameId]}
34 | all_types_msg[all_links.xpath('./a/p/text()')[0]] = [
35 | all_links.xpath('./a/@href')[0],
36 | all_links.xpath('./a/img/@src')[0].split('/')[-1].split('-')[0]
37 | ]
38 | # 将分类信息保存到本地
39 | self.save_all_types(all_types_msg)
40 | return all_types_msg
41 |
42 | def save_all_types(self, all_types_msg):
43 | json_data = json.dumps(all_types_msg, indent=1, ensure_ascii=False)
44 | # 将分类信息写入JSON文件
45 | with open('./data/all_types_msg.json', 'w') as w:
46 | w.write(json_data)
47 | print("\n全部分类信息保存完毕!")
48 |
49 |
50 | if __name__ == '__main__':
51 | # 获取所有分类的链接
52 | huya = HuyaAllTypes()
53 | all_types_dict_msg = huya.get_types_url()
54 | # 获取每个分类下的所有直播用户
55 | tasks = []
56 | for key, val in all_types_dict_msg.items():
57 | users_msg = NowLiveUsers(key, val[0], val[1])
58 | tasks.append(
59 | threading.Thread(target=users_msg.get_page_msg)
60 | )
61 | # users_msg = NowLiveUsers(key, val[0], val[1])
62 | # users_msg.get_page_msg()
63 | for task in tasks:
64 |         # 每个线程间隔1秒启动,避免请求过于集中
65 | time.sleep(1)
66 | task.start()
67 | for task in tasks:
68 | task.join()
69 |
--------------------------------------------------------------------------------
/lagou_jobs_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨获取拉勾网岗位的招聘信息✨
2 | - 拉勾招聘网:[https://www.lagou.com/](https://www.lagou.com/)
3 |
4 | - 输入指定岗位,抓取该岗位的所有招聘信息,包含:
5 | - 工作标题
6 | - 工作链接
7 | - 公司名称
8 | - 薪资范围
9 | - 投递要求
10 | - 公司地址
11 | - 爬取的数据存储方式:
12 | - 通过a追加模式,将爬取的数据存储到`data`文件夹下的`xxx.csv`文件
13 | - 该爬虫使用到的模块:
14 | - os
15 | - csv
16 | - playwright
17 |     - `playwright`:新一代自动化工具,相比selenium速度更快,写法也更简洁(下方附一个最小用法示意)
18 | - playwright使用教程:[点击链接进入](https://blog.csdn.net/qq_44091819/article/details/124656846)
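
- `playwright` 同步用法的最小示意(仅为草图,打开的链接与打印内容仅作演示,非本项目源码):

```python
# 启动无头 chromium,打开页面并读取标题
from playwright.sync_api import sync_playwright

with sync_playwright() as pw:
    browser = pw.chromium.launch()
    page = browser.new_page()
    page.goto("https://www.lagou.com/")
    print(page.title())
    browser.close()
```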
--------------------------------------------------------------------------------
/lagou_jobs_spider/data/Python爬虫.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/lagou_jobs_spider/data/Python爬虫.csv
--------------------------------------------------------------------------------
/lagou_jobs_spider/data/lagou.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/lagou_jobs_spider/data/lagou.png
--------------------------------------------------------------------------------
/lagou_jobs_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/5/10 19:18
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import os
7 | import csv
8 | from playwright.sync_api import sync_playwright
9 |
10 |
11 | class LagouJobs:
12 | def __init__(self, job_name):
13 | self.job_name = job_name
14 | self.url = "https://www.lagou.com/jobs/list_" + job_name
15 | self.flag = True
16 |
17 | def get_page_data(self):
18 | with sync_playwright() as pw:
19 | browser = pw.chromium.launch()
20 | page = browser.new_page()
21 | page.goto(self.url)
22 | # 获取总页数
23 | page_max = page.locator('xpath=//*[@id="order"]/li/div[4]/div[3]/span[2]').text_content()
24 | print(f"共找到<{page_max}>页相关数据")
25 | self.is_file()
26 | for i in range(0, int(page_max)):
27 | print(f"正在获取第<{i+1}>页")
28 | self.get_jobs_data(page)
29 | print(f"第<{i+1}>页数据写入完毕,正在进入下一页...")
30 | page.click('xpath=//*[@id="order"]/li/div[4]/div[2]')
31 | page.screenshot(path=f"./data/lagou.png")
32 | browser.close()
33 |
34 | def get_jobs_data(self, page):
35 | try:
36 | jobs_data_list = page.query_selector_all('//*[@id="s_position_list"]/ul/li')
37 | # print(len(jobs_data_list))
38 | for jobs_data in jobs_data_list:
39 | item = dict()
40 | # 工作标题
41 | item['job_title'] = jobs_data.query_selector('xpath=./div[1]/div[1]/div[1]/a/h3').text_content()
42 | # 工作链接
43 | item['job_link'] = jobs_data.query_selector('xpath=./div[1]/div[1]/div[1]/a').get_attribute('href')
44 | # 公司名称
45 | item['job_company'] = jobs_data.query_selector('xpath=./div[1]/div[2]/div[1]/a').text_content().strip('\n')
46 | # 薪资范围
47 | item['job_price'] = jobs_data.query_selector('xpath=./div[1]/div[1]/div[2]/div/span').text_content()
48 | # 投递要求
49 | item['job_demand'] = jobs_data.query_selector('xpath=./div[1]/div[1]/div[2]/div').text_content().strip(' \n').split('\n')[-1]
50 | # 公司地址
51 | item['job_address'] = jobs_data.query_selector('xpath=./div[1]/div[1]/div[1]/a/span/em').text_content()
52 | # 将数据保存为csv格式
53 | self.save_data(item)
54 | except:
55 | pass
56 |
57 | def save_data(self, item):
58 | # 写入的数据为字典类型
59 | with open(f'./data/{self.job_name}.csv', 'a', newline='') as w:
60 | # 创建一个csv的DictWriter对象
61 | w_csv = csv.DictWriter(w, ['job_title', 'job_link', 'job_company', 'job_price', 'job_demand', 'job_address'])
62 | if self.flag:
63 | # 写入一行当表头,即字典键名
64 | w_csv.writeheader()
65 | self.flag = False
66 |             # 写入一行数据,即字典的所有值
67 | w_csv.writerow(item)
68 |
69 | def is_file(self):
70 | try:
71 | # 检测文件是否存在,用于相同工作二次执行
72 | os.remove(f'./data/{self.job_name}.csv')
73 | except:
74 | pass
75 |
76 |
77 | if __name__ == '__main__':
78 | job_name = input("请输入职位名称:")
79 |     lagou = LagouJobs(job_name)
80 | lagou.get_page_data()
81 |
--------------------------------------------------------------------------------
/lol_hero_message_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨获取LOL全英雄的资料信息✨
2 | - LOL全英雄资料信息网站:[https://yz.lol.qq.com/zh_CN/champions/](https://yz.lol.qq.com/zh_CN/champions/)
3 |
4 | - 获取LOL全英雄的资料信息,包含:
5 | - 上线日期
6 | - 英文名称
7 | - 英雄名称
8 | - 英雄定位
9 | - 英雄台词
10 | - 英雄链接
11 | - 原画链接
12 | - 故事简述
13 | - 背景故事
14 | - 爬取的数据存储方式:
15 | - 通过w写入模式,将爬取的数据存储到`heroes_data.json`文件
16 | - 该爬虫使用到的模块:
17 | - requests
18 | - json
19 | - time
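- 英雄索引接口的最小示意(仅为草图:请求头为简化的示例 UA,字段含义以接口实际返回为准):

```python
# 请求英雄索引 json,打印前 3 个英雄的英文名称与上线日期
import requests

url = "https://yz.lol.qq.com/v1/zh_cn/champion-browse/index.json"
headers = {"User-Agent": "Mozilla/5.0"}
res = requests.get(url, headers=headers).json()
for hero in res["champions"][:3]:
    print(hero["slug"], hero["release-date"][:10])
```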
--------------------------------------------------------------------------------
/lol_hero_message_spider/heroes_data.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "上线日期": "2013-06-13",
4 | "英文名称": "aatrox",
5 | "英雄名称": "暗裔剑魔·亚托克斯",
6 | "英雄定位": "战士, 坦克",
7 | "英雄台词": "我必须连同希望一起毁坏……",
8 | "英雄链接": "https://yz.lol.qq.com/zh_CN/champion/aatrox",
9 | "原画链接": "https://game.gtimg.cn/images/lol/universe/v1/assets/images/champion/splash/Aatrox_0.jpg",
10 | "故事简述": "亚托克斯曾是恕瑞玛抗击虚空时的伟大战士。但是,他和他的同胞却有可能变成符文之地更大的威胁。最终,他们败给了凡人的狡诈巫术,自身的精魂被锁在了武器之内。数百年的监禁之后,亚托克斯头一个挣脱出来,腐蚀并转化那些胆敢染指的蠢人。现在,他将夺来的血肉模仿着自己曾经的形象粗暴地重塑,渴望着迟来许久的末世复仇。",
11 | "背景故事": "许多传说都曾提到过暗裔魔剑,有的描述他是天神,也有的说他是恶魔。但很少有人知道他的真名,以及他是如何败落的。上古时代,远在黄沙吞噬帝国之前,一位伟大的恕瑞玛英雄被带到了太阳圆盘面前,成为一个如今无人记得的星间信念的化身。当他被重塑为飞升者之后,他的双翼彷如黎明时分的金光,盔甲闪亮,如同深空巨帷背后引人遥望的星座。亚托克斯就是他的真名。他在每一场高贵的战斗中都冲锋在前。他真诚待人、领兵有方,其他天神战士总是聚在他的麾下,身后则跟随着一万名恕瑞玛的凡人士兵。当飞升武后瑟塔卡因为艾卡西亚的叛乱而寻求他的帮助时,亚托克斯毫不犹豫地应允了。但是,没人能预料到当地的叛军后来竟然释放出了如此恐怖的力量。虚空转瞬间反客为主,吞噬了艾卡西亚,之后便开始毁灭一切所遭遇的生灵。经过多年苦战,亚托克斯和他的同胞终于遏制住了虚空狂乱的扩张,并将最大的裂口烧熔封铸了起来。但是,活下来的飞升者——他们自称为太阳血脉,却被他们的敌人永远地改变了。虽然恕瑞玛得胜了,但他们全都失去了一些东西……高贵的亚托克斯也不例外。时光流逝,恕瑞玛也陨落了。正如所有帝国的命运。没有了誓死守卫的王权,虚空的威胁也不再迫切,亚托克斯和太阳血脉开始互相争斗,最终演变成了一场战争,毁灭了他们的世界。侥幸逃脱的凡人给了他们一个新的名字,也是一个蔑称:暗裔。正如虚空的侵袭一样,因为担忧堕落的飞升者们也会危及符文之地的生存,巨神族便出手干涉了。据说,暮光星灵传授给了凡人禁锢暗裔的手段,而新近重生的战争星灵联合起了大军对抗他们。亚托克斯和他的军队何曾畏惧,早已蓄势待发。但是,等到他发觉自己中计的时候已经太晚了。一股比上千颗死去的恒星更强大的引力将他拖入了他手中随他出征无数次的巨剑,把他不朽的精魂永远地锁闭在内。这把武器是一座监狱,将他的意识封禁在密不透风的永恒黑暗里,甚至剥夺了他自我了断的能力。他与这地狱般的桎梏拉扯了数百年,直到某个愚蠢透顶的无名氏再次抓起这把巨剑。亚托克斯把握住机会,强行将意志注入到宿主体内,并模仿自己原本的形象重塑了宿主的躯体,同时也夺去了宿主的生命。此后数年间,亚托克斯侵占了许多宿主——无论男女,只要是生机勃勃,或是刚毅非凡。虽然他所掌握的魔法不算精深,但他却能在转瞬间便夺取凡人的身体。而且在战斗中,他发觉死去的人也能为他所用,把自己变得更加健硕强壮。亚托克斯在大地上巡游,不停地竭力寻找能够让他重回飞升之身的办法,但这把剑身上的谜团最终也无法解开,并且他也意识到自己永远也不能获得自由。强夺而来并残忍重塑的血肉愈发地像是一种嘲弄,嘲弄着他曾经的荣光——而那也不过是另一个比巨剑稍微大一些的牢笼罢了。绝望与羞愧在他心中滋长。他曾经所代表的神力,和他所有的记忆统统都被抹去了。不公的命运令他出离地愤怒了。而他最终想到的办法,完全是一个囚犯刻骨的绝望。如果他不能摧毁这把剑,也不能解脱自己,那他就拥抱湮灭好了。现在,亚托克斯怀抱着这无情的决心,沿途散布战争和死亡。他心中只剩下一个盲目的期望:如果他可以把一切造物都拖进一场最终的末日之战——一切都会因此毁灭——那么也许他和这把剑也会永远地不复存在。"
12 | },
13 | {
14 | "上线日期": "2011-12-14",
15 | "英文名称": "ahri",
16 | "英雄名称": "九尾妖狐·阿狸",
17 | "英雄定位": "法师, 刺客",
18 | "英雄台词": "人心善变,甚于最深奥的魔法。",
19 | "英雄链接": "https://yz.lol.qq.com/zh_CN/champion/ahri",
20 | "原画链接": "https://game.gtimg.cn/images/lol/universe/v1/assets/images/champion/splash/Ahri_0.jpg",
21 | "故事简述": "天生就与精神领域的魔法存在连接的阿狸,是一名狐狸模样的瓦斯塔亚,在世界上寻找着自己所属的位置。进入凡人社会以后,她成为了一名充满悔意和同情心的掠食者,她喜欢操纵猎物的情绪,然后再吸食他们的生命精魄——每吞噬一个灵魂,都伴着他们生前的记忆片段与领悟洞见。",
22 | "背景故事": "阿狸的身世是个迷,甚至她自己也不清楚。\n\n她找不到自己瓦斯塔亚部族的历史,也不知道自己这一族在其他瓦斯塔亚中的地位。留给她的线索只有她此生一直佩戴的双生宝石。事实上,她最早的记忆,是在尚赞北部与一群冰原狐共同奔跑。虽然她知道自己不是它们的一员,但它们却将她视为相同的灵魂,将她接纳为狐群的一员。
\n\n在那段狂野狩猎的日子里,阿狸始终感受到自己与周围的森林存在着更深层次的连接。不多久,她便领悟到,这就是流淌在她体内的瓦斯塔亚魔法,与彼端的精神领域产生着共鸣。虽然没有人教她,但她却以自己的方式学会使用这股力量——最常用的方式是强化自己的反应速度,便于追逐猎物。而如果她小心靠近,甚至还能安抚一只受惊的小鹿,即使被她的利齿刺入血肉,也一直保持安宁冷静。
\n\n凡人的世界对于阿狸和冰原狐都很遥远、嘈杂,但她却因为某种说不清的理由感到一种吸引力。人类是一种尤其粗鲁、生硬的生物……一天,有一群猎人在附近扎营,阿狸从远处看着他们进行可怕的工作。
\n\n当其中一人被弓箭误伤的时候,阿狸感受到他渐渐流失的生命。生而为掠食者的她,在仅有的本趋势下,品尝了潺潺流出的精魄,与此同时她获得了这位猎人零散的记忆——在战斗中殒命的爱人、留在南方家中的孩子们。阿狸轻轻把他的情绪从恐惧推到悲伤再推到快乐,用温馨的景象安慰他,让他的濒死记忆停留在一片暖融融的草地上。
\n\n后来,她发现自己能轻易理解人类的词汇,他们的语言就像一场模糊的梦,于是阿狸知道,是时候该离开自己的狐群了。
\n\n她游离在人类社会的边缘,从未感到如此充满活力。她依然保留着掠食者本性,但却陷入了许许多多新体验、情绪和艾欧尼亚的传统之中。看起来,凡人也同样会被她迷得神魂颠倒——她经常利用这一点,吸取他们的精魄,同时让他们陷入优美的回忆、渴望的幻象和痛彻心扉的忧伤梦境之中。
\n\n那些不属于她的记忆令她沉醉,而结束他人性命则让她感到精神焕发,只不过她也能感受到自己给受害者带去的悲伤和痛苦。记忆的闪回让她体验到短暂而又美妙的心碎与欣喜,让她欲罢不能。她在记忆的画面中看到一群来自铁与石之地的残忍入侵者,并为猎物落泪。这种感觉让她无所适从,但每当她试图远离人类,就会感到自己的力量开始消散,于是忍不住一次次进食……也一次次因此而痛苦。
\n\n通过无数个偷来的记忆,阿狸开始瞥见更多关于瓦斯塔亚的信息。看来她并不是孤身一人,现在有许多部族都与凡人存在某种紧张关系。最后,她得知了一场反叛运动,目标是要让瓦斯塔亚诸族恢复往日的荣光。
\n\n或许,这将带她找到那段缺失回忆的过去。
\n\n她手中紧握着那对双生宝石,出发寻找自己的同类。她将不再依赖那些借来的回忆和陌生的梦境——如果她的部族依然在符文之地上留存着痕迹,那她就一定要找到它。"
23 | },
24 | {
25 | "上线日期": "2010-05-11",
26 | "英文名称": "akali",
27 | "英雄名称": "离群之刺·阿卡丽",
28 | "英雄定位": "刺客",
29 | "英雄台词": "如果你看上去凶神恶煞,你最好真的是凶神恶煞。",
30 | "英雄链接": "https://yz.lol.qq.com/zh_CN/champion/akali",
31 | "原画链接": "https://game.gtimg.cn/images/lol/universe/v1/assets/images/champion/splash/Akali_0.jpg",
32 | "故事简述": "无论是均衡教派还是暗影之拳的称号,都已被阿卡丽抛弃,如今的阿卡丽独来独往,随时可以成为她的人民所需要的夺命武器。虽然她牢牢铭记着她从宗师慎身上学来的一切,但她效忠保护艾欧尼亚并铲除敌人,每次一条命。或许阿卡丽的出击悄然无声,但她传达的信息将响亮无比:不听命于任何人的刺客最为可怕。",
33 | "背景故事": "艾欧尼亚一直以来都是个充满狂野魔法的地方,形形色色的人类和强大的灵体力求和谐共处……但有的时候这宁静的平衡并不能轻易得来。有的时候平衡也需要维护。均衡教派就是一个以维护艾欧尼亚神圣平衡为己任的团体。教派的信徒们行走于精神与物质两个世界之中,协调两界之间的冲突,而且有必要的时候,也会使用强制力介入。阿卡丽天生就是其中一员,她的母亲是梅目•约曼•特曦,身居暗影之拳的位置,她和她的伴侣塔诺决定让女儿从小就在均衡教派内成长,接受教派的宗师——暮光之眼苦说大师的精悉引导。每当阿卡丽的父母受派外出,教派的其他成员就会担当阿卡丽的代理家长,狂暴之心凯南就曾和这个小姑娘共同度过了许多时光,传授她手里剑的技法,教她利用速度和敏捷而非蛮力。阿卡丽拥有超越同龄人的心智,像吸水的海棉一样吸收学识。所有人都看得出,她将沿着自己父母的道路成长——她将伴着宗师的儿子和既定的继任者慎一起,引领新一代信徒,致力于保护艾欧尼亚的平衡。但平衡往往转瞬即逝,教派内部发生了分裂。一个名叫劫的信徒修行归来,刚愎自用的他与苦说大师激烈交锋,通过一场血腥的哗变让教派内部的权力动荡。阿卡丽逃到了东边的山林之中,一起逃走的还有梅目、慎、凯南和其他几名信徒。令人惋惜的是,塔诺不在其中。劫已经几乎把均衡教派变成了绝情的影流。但作为新一任暮光之眼的慎想要重建那些逝去的东西。他们要回归均衡的三大基本理念:以观星寻找纯粹的公正,用逐日执行正义的审判,并通过修枝根除一切不平衡。虽然他们势单力薄,但他们会训练新的信徒,光复并再次振兴他们的教派。当阿卡丽长大到十四岁的时候,她正式开始了均衡教派的训练,决心要继承母亲的位置,成为新的暗影之拳。阿卡丽是个练武奇才,精通钩镰和苦无——也就是单手镰和飞刀。虽然她不像其他信徒那样掌握魔法能力,但她依然用实力证明自己配得上这个称号,不久后,她的母亲就得以退居后方,担任年轻一代信徒的导师。但阿卡丽的灵魂坐立难安,她始终睁眼看着世界。在诺克萨斯入侵艾欧尼亚的战争余波中,虽然均衡和影流之间达成了脆弱的一致,但她却看到自己的家乡继续遭受着痛苦。她质疑他们的教派是否真正履行着职责。所谓修枝,就应该根除一切威胁到神圣平衡的人……然而慎却总是敦促克制。他总是在限制她。所有那些颂文和冥想都能让她的灵魂获得安宁,但这些陈词滥调却不能击败他们的敌人。她的热血冲动和超前成熟变成了不加掩饰的叛逆。她与慎大声争论,她公然反抗他,她用自己的方式解决艾欧尼亚的敌人。当着整个教派的面,她公然宣称均衡教派都是无能之辈,所有关于精神平衡和耐心的说教都毫无成效。艾欧尼亚人正在物质领域垂死挣扎,所以这也是阿卡丽要保护的领域。她接受了刺客的训练,所以她要做刺客该做的事。她已经不再需要这个教派了。慎没有挽留也没有阻止,他知道这条路必须由阿卡丽自己走下去。或许有朝一日她会沿着那条路走回来,但那一切都将由她自己决定。"
34 | }
35 | ]
--------------------------------------------------------------------------------
/lol_hero_message_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @Time : 2021/11/15 14:06
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import time
7 | import json
8 | import requests
9 |
10 |
11 | class HeroMessage:
12 | def __init__(self):
13 | self.headers = {
14 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53",
15 | }
16 | self.heroes_message_list = []
17 | self.error_list = []
18 |
19 | # 获取英雄对应的链接
20 | def get_heroes_link(self):
21 | url = "https://yz.lol.qq.com/v1/zh_cn/champion-browse/index.json"
22 | res = requests.get(url, headers=self.headers).json()
23 | heroes_list = res["champions"]
24 | for heroes in heroes_list:
25 | item = dict()
26 | # 英雄上线日期
27 | item['上线日期'] = heroes["release-date"][:10]
28 | # 英雄英文名称
29 | item['英文名称'] = heroes["slug"]
30 | # 英雄信息的链接
31 | heroes_slug_link = f"https://yz.lol.qq.com/v1/zh_cn/champions/{item['英文名称']}/index.json"
32 | # 获取英雄详细信息
33 | try:
34 | self.heroes_message_list.append(
35 | self.get_heroes_msg(heroes_slug_link, item)
36 | )
37 | except:
38 | print(heroes_slug_link, "获取信息失败!")
39 | self.error_list.append(heroes_slug_link)
40 | time.sleep(0.5)
41 | # 保存英雄全部数据
42 | self.save_data(self.heroes_message_list)
43 | # 采集失败的链接
44 | print("采集失败的链接", self.error_list)
45 |
46 | # 获取英雄别名->英雄名全称
47 | def get_heroes_msg(self, heroes_slug_link, item):
48 | # 显示正在请求的链接
49 | print("正在获取:", heroes_slug_link)
50 | # 对链接发送请求
51 |         res = requests.get(heroes_slug_link, headers=self.headers).json()
52 | # 英雄中文名称
53 | item['英雄名称'] = res["champion"]["title"] + "·" + res["champion"]["name"]
54 | # 英雄定位
55 | item['英雄定位'] = ", ".join([roles["name"] for roles in res["champion"]["roles"]])
56 | # 英雄台词
57 | item['英雄台词'] = res["champion"]["biography"]["quote"].strip("“”").replace("", "")
58 | # 英雄链接
59 | item['英雄链接'] = "https://yz.lol.qq.com/zh_CN/champion/" + item['英文名称']
60 | # 英雄原画
61 | item['原画链接'] = res["champion"]["image"]["uri"]
62 | # 英雄精简故事
63 |         item['故事简述'] = res["champion"]["biography"]["short"].strip("\n")
64 | # 英雄完整故事
65 | item['背景故事'] = res["champion"]["biography"]["full"].strip("").replace("", "").replace("", "").replace(r"\n", "")
66 | # print(item)
67 | return item
68 |
69 | # 保存英雄数据
70 | def save_data(self, dict_data):
71 | data = json.dumps(dict_data, indent=1, ensure_ascii=False)
72 | with open("heroes_data.json", "w", encoding='utf-8') as w:
73 | w.write(data)
74 | print("英雄信息写入完成...")
75 |
76 |
77 | if __name__ == '__main__':
78 | hero = HeroMessage()
79 | hero.get_heroes_link()
80 |
--------------------------------------------------------------------------------
/lol_skins_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨获取LOL道聚城所有皮肤信息✨
2 | - LOL道聚城所有皮肤:[https://daoju.qq.com/lol/list/17-0-0-0-0-0-0-0-0-0-0-00-0-0-1-1.shtml](https://daoju.qq.com/lol/list/17-0-0-0-0-0-0-0-0-0-0-00-0-0-1-1.shtml)
3 |
4 | - 爬取LOL道聚城所有皮肤,包含:
5 | - 皮肤名称
6 | - 皮肤价格(点券)
7 | - 皮肤上架日期
8 | - 爬取的数据存储方式:
9 | - 通过w写入模式,将爬取的数据存储到`lol_skins_data.json`文件
10 | - 该爬虫使用到的模块:
11 | - requests
12 | - re
13 | - time
14 | - json
--------------------------------------------------------------------------------
/lol_skins_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/5/6 12:03
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import requests
7 | import time
8 | import re
9 | import json
10 |
11 |
12 | class LolSkins:
13 | def __init__(self):
14 | self.url = "https://apps.game.qq.com/daoju/v3/api/hx/goods/app/v71/GoodsListApp.php?"
15 | self.headers = {
16 | "referer": "https://daoju.qq.com/",
17 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32"
18 | }
19 | self.params = {
20 | "view": "biz_cate",
21 |             "page": 1,  # 页码占位,发送请求前会在 get_data 中被覆盖
22 | "pageSize": 16,
23 | "orderby": "dtShowBegin",
24 | "ordertype": "desc",
25 | "cate": 17,
26 | "appSource": "pc",
27 | "plat": 1,
28 | "output_format": "jsonp",
29 | "biz": "lol",
30 | "_": int(time.time() * 1000)
31 | }
32 |
33 | def get_data(self):
34 | # 初始化字典
35 | all_skins_data = dict()
36 | # 循环请求页面
37 | for i in range(1, 51):
38 | # 请求间隔
39 | time.sleep(1)
40 | # 参数页码
41 | self.params['page'] = i
42 | # 发送请求
43 | res = requests.get(self.url, headers=self.headers, params=self.params)
44 | # 提取数据
45 | skins_list = self.data_format(res.text)
46 | # 添加进字典
47 | all_skins_data[f"lol道具城第<{i}>页"] = skins_list
48 | # 保存数据
49 | self.save_data(all_skins_data)
50 |
51 | def data_format(self, data):
52 | # 皮肤名称
53 | skin_name_list = re.findall(r'"propName":"(.*?)"', data)
54 | # 皮肤价格
55 | skin_price_list = re.findall(r'"iDqPrice":"(\d+)"', data)
56 | # 上架日期
57 | skin_date_list = re.findall(r'"dtBegin":"(.*?)"', data)
58 |
59 | skins_list = []
60 |
61 | for i in range(0, len(skin_name_list)):
62 | item = dict()
63 | item["skin_name"] = str(skin_name_list[i]).encode('utf8').decode('unicode_escape').replace("\\", "")
64 | item["skin_price"] = skin_price_list[i]
65 | item["skin_date"] = skin_date_list[i]
66 | skins_list.append(item)
67 | # 展示数据
68 | print(item)
69 |
70 | return skins_list
71 |
72 | def save_data(self, all_skins_data):
73 | # JSON序列化
74 | json_data = json.dumps(all_skins_data, indent=1, ensure_ascii=False)
75 | with open("lol_skins_data.json", "w", encoding="utf-8") as w:
76 | w.write(json_data)
77 |
78 |
79 | if __name__ == '__main__':
80 | lol = LolSkins()
81 | lol.get_data()
--------------------------------------------------------------------------------
/maoyan_data_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨获取猫眼电影实时数据✨
2 | - 猫眼专业版实时数据:[https://piaofang.maoyan.com/](https://piaofang.maoyan.com/)
3 | - 一键爬取猫眼实时数据,包含:
4 | - 电影实时票房
5 | - 电影当日排片
6 | - 网络影视热度榜
7 | - 电视收视节目排行
8 | - 爬取的数据存储方式:
9 | - 通过使用`openpyxl`模块,将爬取的数据存储到`data`文件夹下的`猫眼实时数据.xlsx`表格
10 | - 该爬虫使用到的模块:
11 | - requests
12 | - lxml
13 | - openpyxl
14 |
--------------------------------------------------------------------------------
/maoyan_data_spider/data/猫眼实时数据.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/maoyan_data_spider/data/猫眼实时数据.xlsx
--------------------------------------------------------------------------------
/maoyan_data_spider/get_url_data.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/5/11 18:33
3 | # @Author : Torres-圣君
4 | # @File : get_url_data.py
5 | # @Software : PyCharm
6 | import requests
7 | from lxml import etree
8 | from save_data import *
9 |
10 |
11 | class ExtractData:
12 | def __init__(self, url):
13 | self.url = url
14 | # 需要携带的请求头
15 | self.headers = {
16 | "Referer": "https://piaofang.maoyan.com",
17 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39"
18 | }
19 | self.html = etree.HTML(requests.get(self.url, headers=self.headers).text)
20 |
21 | def who_owns(self):
22 | # 判断进来的链接,根据不同的链接选用不同的解析方式
23 | if "box-office" in self.url:
24 | self.box_office()
25 | elif "session" in self.url:
26 | self.session()
27 | elif "web-heat" in self.url:
28 | self.web_heat()
29 | elif "getTVList" in self.url:
30 | self.getTVList()
31 |
32 | def box_office(self):
33 | data_list = []
34 | header_list = ["影片", "票房(万)", "票房占比", "排片占比", "排座占比"]
35 | data_list.append(header_list)
36 | for i in range(1, 11):
37 | body_list = self.html.xpath(f'//*[@class="table-body"]/tr[{i}]')
38 | for body in body_list:
39 | item = [
40 | body.xpath('./td[1]/div/p[1]/text()')[0],
41 | body.xpath('./td[2]/div/text()')[0],
42 | body.xpath('./td[3]/div/text()')[0],
43 | body.xpath('./td[4]/div/text()')[0],
44 | body.xpath('./td[5]/div/text()')[0]
45 | ]
46 | data_list.append(item)
47 | save_data(data_list, "实时票房")
48 |
49 | def session(self):
50 | data_list = []
51 | header_list = ["片名", "场次占比", "场次"]
52 | data_list.append(header_list)
53 | for i in range(1, 12):
54 | body_list = self.html.xpath(f'//table//tr[{i}]')
55 | for body in body_list:
56 | item = [
57 | body.xpath('./td[1]/div/div/span/text()')[0],
58 | body.xpath('./td[2]/div/text()')[0],
59 | body.xpath('./td[3]/div/text()')[0]
60 | ]
61 | data_list.append(item)
62 | save_data(data_list, "电影排片")
63 |
64 | def web_heat(self):
65 | data_list = []
66 | header_list = ["节目", "平台", "上线时长", "实时热度"]
67 | data_list.append(header_list)
68 | for i in range(1, 11):
69 | body_list = self.html.xpath(f'//*[@class="table-body"]/tr[{i}]')
70 | for body in body_list:
71 | item = [
72 | body.xpath('./td[1]/div/div[2]/p[1]/text()')[0],
73 | body.xpath('./td[1]/div/div[2]/p[2]/text()')[0],
74 | body.xpath('./td[1]/div/div[2]/p[2]/span/text()')[0],
75 | body.xpath('./td[2]/div/div[1]/div[1]/text()')[0]
76 | ]
77 | data_list.append(item)
78 |         save_data(data_list, "影视热度榜")
79 |
80 | def getTVList(self):
81 | data_list = []
82 |         title = "央视频道" if "type=0" in self.url else "卫视频道"
83 | header_list = ["节目", "频道", "实时关注度", "市占率"]
84 | data_list.append(header_list)
85 | # 获取返回的JSON数据
86 | json_data = requests.get(self.url, headers=self.headers).json()
87 | body_list = json_data["tvList"]["data"]["list"]
88 | for i in range(0, len(body_list)):
89 | item = [
90 | body_list[i]["programmeName"],
91 | body_list[i]["channelName"],
92 | body_list[i]["attentionRateDesc"],
93 | body_list[i]["marketRateDesc"]
94 | ]
95 | data_list.append(item)
96 | save_data(data_list, title)
97 |
98 |
--------------------------------------------------------------------------------
/maoyan_data_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/5/11 10:52
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import time
7 | from get_url_data import *
8 |
9 |
10 | class MaoyanData:
11 | def __init__(self, son_url_list: list):
12 | self.son_url_list = son_url_list
13 |
14 | def get_data(self):
15 | for i in self.son_url_list:
16 | for j in i:
17 | url = "https://piaofang.maoyan.com/" + j
18 | print(f"正在获取<{url}>")
19 | ExtractData(url).who_owns()
20 | time.sleep(1)
21 |
22 |
23 | if __name__ == '__main__':
24 | maoyan = MaoyanData(
25 | [
26 | ["box-office?ver=normal"],
27 | ["session"],
28 | ["web-heat"],
29 | ["getTVList?showDate=2&type=" + str(i) for i in range(2)]
30 | ]
31 | )
32 | maoyan.get_data()
33 |
--------------------------------------------------------------------------------
/maoyan_data_spider/save_data.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/5/11 19:36
3 | # @Author : Torres-圣君
4 | # @File : save_data.py
5 | # @Software : PyCharm
6 | from openpyxl import load_workbook
7 |
8 |
9 | def save_data(data_list, title):
10 |     # 打开已有的Excel表格(需提前创建好该文件)
11 | wb = load_workbook("./data/猫眼实时数据.xlsx")
12 | # 创建新的sheet
13 | sheet = wb.create_sheet(title, -1)
14 | for i in range(0, len(data_list)):
15 | for j in range(0, len(data_list[i])):
16 | # 写入数据到单元格
17 | sheet.cell(row=i+1, column=j+1).value = data_list[i][j]
18 | # 保存并关闭文件
19 | wb.save("./data/猫眼实时数据.xlsx")
20 | print(f"{title}_写入成功...")
21 | wb.close()
22 |
--------------------------------------------------------------------------------
/meituan_foods_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨获取美团美食的店铺信息✨
2 | - 美团北京美食所有店铺:[https://bj.meituan.com/meishi/](https://bj.meituan.com/meishi/)
3 |
4 | ```python
5 | 修改 'self.start_url' :修改为想要抓取的城市url
6 | 修改 'self.headers' 下的 'Cookie'和'User-Agent':修改为自己账号登录后的值
7 | 修改 'self.mongo_address' :修改为自己的MongoDB数据库地址
8 | 更换 'ip_pool.py':修改该文件中的IP代理地址
9 | ```
10 |
11 | - 爬取美团北京美食店铺的信息,包含:
12 | - 店铺链接
13 | - 店铺名称
14 | - 店铺地址
15 | - 店铺评论数
16 | - 店铺评分
17 | - 爬取的数据存储方式:
18 |   - 通过连接MongoDB数据库,将其存入数据库(入库后的查询示例见下方)
19 | - 该爬虫使用到的模块:
20 | - requests
21 | - re
22 | - time
23 | - json
24 | - pymongo
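
- 入库后的查询示例(仅作演示,库名、集合名与 `run_spider.py` 中保持一致,字段类型以实际入库数据为准):

```python
# 查询已入库店铺数据的最小示例
from pymongo import MongoClient

client = MongoClient("127.0.0.1", 27017)      # 与 run_spider.py 中的 mongo_address 保持一致
col = client["meituan"]["bj_foods"]

# 按评论数倒序取前 10 条(字段名为入库时使用的中文键)
for shop in col.find().sort("评论数", -1).limit(10):
    print(shop["店名"], shop["评分"], shop["评论数"])

client.close()
```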
--------------------------------------------------------------------------------
/meituan_foods_spider/ip_pool.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/22 16:18
3 | # @Author : Torres-圣君
4 | # @File : ip_pool.py
5 | # @Software : PyCharm
6 | import random
7 |
8 |
9 | def get_ip():
10 | proxys = [
11 | {
12 | "http": "http://211.103.138.117:8000"
13 | }, {
14 | "http": "http://183.247.215.218:30001"
15 | }, {
16 | "http": "http://221.7.197.248:8000"
17 | }, {
18 | "http": "http://39.175.85.225:30001"
19 | }, {
20 | "http": "http://39.175.85.225:30001"
21 | }, {
22 | "http": "http://123.57.246.163:8118"
23 | }, {
24 | "http": "http://120.76.244.188:8080"
25 | }, {
26 | "http": "http://58.20.232.245:9091"
27 | }, {
28 | "http": "http://203.222.25.57:80"
29 | }, {
30 | "http": "http://223.96.90.216:8085"
31 | }, {
32 | "http": "http://221.7.197.248:8000"
33 | }, {
34 | "http": "http://218.64.84.117:8060"
35 | }, {
36 | "http": "http://120.220.220.95:8085"
37 | },
38 | ]
39 | return random.choice(proxys)
40 |
--------------------------------------------------------------------------------
/meituan_foods_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/22 14:59
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import json
7 | import re
8 | import time
9 | from pymongo import MongoClient
10 | import requests
11 | from ip_pool import get_ip
12 |
13 |
14 | class MeituanSpider:
15 | def __init__(self):
16 | # 目标网址,修改为想要抓取的城市url
17 | self.start_url = 'https://bj.meituan.com/meishi/'
18 | # 首先需要登录自己的账号上 获取登录后的Cookie信息和User-Agent来构造响应头
19 | self.headers = {
20 | # 修改成自己的cookie
21 | "Cookie": "uuid=fc13f93e2548beaced.1650610445.1.0.0; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=180500c9c6694-0455eeafa4c762-6b3e555b-144000-180500c9c67c8; ci=1; rvct=1; __mta=147677064.1650610453704.1650610453704.1650610453704.1; client-id=19be5210-8e89-4598-a6e2-0decd5081934; mtcdn=K; userTicket=FZodTvVmqRNBtcSIIGENmCRtUCXxFFlvvDUNbDQC; _yoda_verify_resp=lw%2FRe7KjJCSrXlzZjUUHoMC6cv33iCr7LluQL36vp7W%2FSWLD%2FcLgW2NnaEO1MT8u%2Fy0OGm3szpTRomNQj%2BLkD7AlVDDto75c16MkwWz2LQd39H2TWG5%2Fl6%2Bm5UU7W6F23%2BKoK3jYjHETueVKU67hIe%2Boztzp5vFoGPn3Ygs27T9M9Zf6Pd4zsLPyeFy9452ATZNT%2FFQkbqNOM1BLiHC4CdOT4QhO0DAhJU%2BIGJvnXZrRtPnlhlUulQoUsSJBtGPYwAQFJHOyRRM8CD0GXrMddMsXQiS%2FB8kx6aQFCxZPfFy04QHF26N2ztzmTL30e9Uy4Pqk3hS9w2oMRBsdH0wtTV8Mw1p9eqMAIpjTbuIcedfEt6fr2iQusiMwjUCCWTtt; _yoda_verify_rid=150ee0f22540000c; u=2988513400; n=Torres%E5%9C%A3%E5%90%9B; lt=Oj3P9g2z0stfWgMheCCf9Mw0CLUAAAAAfhEAALOaHf0lOvfBhE0OvVWmFtRqPsSY-1C5Fe7PsvPzZYt-ZYb_cDgiVVNJOFOhMF1fZQ; mt_c_token=Oj3P9g2z0stfWgMheCCf9Mw0CLUAAAAAfhEAALOaHf0lOvfBhE0OvVWmFtRqPsSY-1C5Fe7PsvPzZYt-ZYb_cDgiVVNJOFOhMF1fZQ; token=Oj3P9g2z0stfWgMheCCf9Mw0CLUAAAAAfhEAALOaHf0lOvfBhE0OvVWmFtRqPsSY-1C5Fe7PsvPzZYt-ZYb_cDgiVVNJOFOhMF1fZQ; token2=Oj3P9g2z0stfWgMheCCf9Mw0CLUAAAAAfhEAALOaHf0lOvfBhE0OvVWmFtRqPsSY-1C5Fe7PsvPzZYt-ZYb_cDgiVVNJOFOhMF1fZQ; unc=Torres%E5%9C%A3%E5%90%9B; _lxsdk=180500c9c6694-0455eeafa4c762-6b3e555b-144000-180500c9c67c8; _hc.v=89028ea2-f5ad-36f8-2732-5d938ae5b422.1650611594; lat=39.983375; lng=116.410765; firstTime=1650612012131; _lxsdk_s=180500c9c67-e43-f4b-d35%7C%7C77",
22 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
23 | }
24 | # 初始化MongoDB数据库并创建数据库连接
25 | self.mongo_address = '127.0.0.1'
26 | self.client = MongoClient(self.mongo_address, 27017)
27 | self.db = self.client['meituan']
28 | self.col = self.db['bj_foods']
29 |
30 | # 获取需要爬取的url列表
31 | def get_url_list(self, url, total_nums):
32 | url_temp = url + 'pn{}/'
33 |         # 每一页显示15个美食 通过获取到每个分类下的总美食数来求出总页数
34 | pages = total_nums // 15 + 1 if total_nums % 15 != 0 else total_nums // 15
35 | url_list = [url_temp.format(i) for i in range(1, pages + 1)]
36 | return url_list
37 |
38 | # 对url进行请求并返回处理后的响应信息
39 | def parse_url(self, url):
40 | # self.headers['Cookie'] = random.choice(self.cookies)
41 | time.sleep(1)
42 | rest = requests.get(url, headers=self.headers, proxies=get_ip())
43 | html_str = re.findall(r'window._appState = (.*?);', rest.content.decode())[0]
44 | return html_str
45 |
46 | # 访问店家详细页面,获取地址和电话
47 | def get_son_msg(self, url):
48 | time.sleep(1)
49 | res = requests.get(url, headers=self.headers, proxies=get_ip())
50 | # 地址
51 | address = re.findall(r'"address":"(.*?)",', res.text)[0]
52 | # 电话
53 | phone_number = re.findall(r'"phone":"(.*?)",', res.text)[0]
54 | return address, phone_number
55 |
56 | # 创建item并进行存储
57 | def get_content_list(self, html_str):
58 | json_html = json.loads(html_str)
59 | foods = json_html['poiLists']['poiInfos']
60 | for i in foods:
61 | item = {}
62 | # 获取子链接
63 | food_id = i['poiId']
64 | item['链接'] = "https://www.meituan.com/meishi/{}/".format(food_id)
65 | item['店名'] = i['title']
66 | item['地址'], item["电话"] = self.get_son_msg(item['链接'])
67 | item['评论数'] = i['allCommentNum']
68 | item['评分'] = i['avgScore']
69 | # item['价格'] = i['avgPrice']
70 | self.save(item)
71 |
72 | # 保存数据到mongodb数据库中
73 | def save(self, item):
74 | # 转换为字典
75 | data = dict(item)
76 | # 展示数据
77 | print(data)
78 | # 写入数据
79 | self.col.insert_one(data)
80 |
81 | # 主方法
82 | def run(self):
83 | # 首先请求入口url来获取每一个美食分类的url地址
84 | html_str = requests.get(self.start_url, headers=self.headers, proxies=get_ip())
85 | str_html = re.findall(r'window._appState = (.*?);', html_str.content.decode())[0]
86 | json_html = json.loads(str_html)
87 | # 获取分类链接列表
88 | cate_list = json_html['filters']['cates'][1:]
89 | print(cate_list)
90 | item_list = []
91 |
92 | # 对每一个分类进行分组分别获取美食的分类名和美食的分类的url
93 | for i in cate_list:
94 | item = {}
95 | # 分类的url进行反爬处理,将http替换成https
96 | # cate_url= i.xpath('./a/@href')[0]
97 | cate_url = i['url']
98 | item['cate_url'] = cate_url.replace('http', 'https')
99 | # item['cate_name'] = i.xpath('./a/text()')[0]
100 | item['name'] = i['name']
101 | item_list.append(item)
102 |
103 | # 对每一个美食分类的分类名和分类url地址进行遍历并分别进行处理
104 | for i in item_list:
105 | time.sleep(3)
106 | # https请求
107 | rest = requests.get(i['cate_url'], headers=self.headers, proxies=get_ip())
108 | # http替换成https后的全部分类链接
109 | str_html = re.findall(r'window._appState = (.*?);', rest.content.decode())[0]
110 | json_html = json.loads(str_html)
111 | total_nums = json_html['poiLists']['totalCounts']
112 | # 获取每一页的链接
113 | url_list = self.get_url_list(i['cate_url'], total_nums)
114 | for url in url_list:
115 | time.sleep(2)
116 | list_html = self.parse_url(url)
117 | self.get_content_list(list_html)
118 |
119 |
120 | if __name__ == '__main__':
121 | meituan = MeituanSpider()
122 | meituan.run()
123 |
124 |
--------------------------------------------------------------------------------
/simple_ip_proxy_pool/README.md:
--------------------------------------------------------------------------------
1 | ## ✨搭建一个简易的免费IP代理池✨
2 | - 爬取的免费IP代理的网站包含:
3 | - [http://www.66ip.cn/index.html](http://www.66ip.cn/index.html)
4 | - [https://www.89ip.cn/index_1.html](https://www.89ip.cn/index_1.html)
5 | - [https://ip.ihuan.me/address/5Lit5Zu9.html](https://ip.ihuan.me/address/5Lit5Zu9.html)
6 | - [https://proxy.ip3366.net/free/?action=china&page=1](https://proxy.ip3366.net/free/?action=china&page=1)
7 | - [https://ip.jiangxianli.com/blog.html?page=1](https://ip.jiangxianli.com/blog.html?page=1)
8 | - [https://www.kuaidaili.com/free/inha/1/](https://www.kuaidaili.com/free/inha/1/)
9 |
10 | ```python
11 | # 运行主方法:ip_pool_run.py 即可启动爬虫
12 |
13 | # 该爬虫使用到了 多线程和协程 (没有做到极致,可自行后续优化),同时对这些网站进行ip代理抓取
14 | # 将所有网站抓取到的ip添加到test_ip方法进行测试,如果代理可用则将其保存至ip_pool.json
15 |
16 | # 尚未实现ip代理池去重功能
17 | ```
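
`ip_pool.json` 中保存的是一段段以逗号分隔的字典,并不是完整的 JSON 数组,下面是按 `test_save.py` 的写入格式读取并简单去重的参考示例(仅作示意):

```python
# 读取 ip_pool.json 并做简单去重的最小示例
import json
import random

with open("ip_pool.json", "r") as r:
    content = r.read().rstrip().rstrip(",")      # 去掉末尾多余的逗号

# 写入时是若干个 {...} 以逗号分隔,这里补上中括号拼成 JSON 数组
proxies_list = json.loads("[" + content + "]")

# 简单去重:每个字典只有一个键值对,用其键值对作为唯一标识
unique = list({tuple(p.items())[0]: p for p in proxies_list}.values())
print(f"共 {len(proxies_list)} 条,去重后剩 {len(unique)} 条")

# 随机取一个代理,可直接传给 requests 的 proxies 参数
print(random.choice(unique))
```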
18 |
19 | - 该程序使用到的模块包含:
20 | - lxml
21 |   - requests
22 | - json
23 | - random
24 | - threading
25 | - asyncio
26 | - aiohttp
27 |
--------------------------------------------------------------------------------
/simple_ip_proxy_pool/all_ip_agent/get_66ip.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/23 10:11
3 | # @Author : Torres-圣君
4 | # @File : get_66ip.py
5 | # @Software : PyCharm
6 | # http://www.66ip.cn/index.html
7 | import aiohttp
8 | import asyncio
9 | from user_agent import get_ua
10 | from test_save import test_ip
11 | from lxml import etree
12 |
13 |
14 | def get_data(num):
15 | loop_ = asyncio.new_event_loop()
16 | asyncio.set_event_loop(loop_)
17 | loop = asyncio.get_event_loop()
18 | urls = [f"http://www.66ip.cn/{str(i)}.html" for i in range(1, num+1)]
19 | tasks = [loop.create_task(parse(url)) for url in urls]
20 | loop.run_until_complete(asyncio.wait(tasks))
21 |
22 |
23 | async def parse(url):
24 | try:
25 | headers = {
26 | "User-Agent": get_ua()
27 | }
28 | timeout = aiohttp.ClientTimeout(total=1000)
29 | async with aiohttp.ClientSession(timeout=timeout) as session:
30 | async with session.get(url, headers=headers) as res:
31 | page = etree.HTML(await res.text())
32 | ip_list = page.xpath('//*[@id="main"]/div[1]/div[2]/div[1]/table//tr')
33 | del ip_list[0]
34 | # print(len(ip_list))
35 |                     for i in range(0, len(ip_list)):  # 表头行已在上面删除,从第一条数据开始遍历
36 | # 提取ip地址
37 | ip_address = ip_list[i].xpath(f'./td[1]/text()')[0]
38 | # 提取ip端口
39 | ip_port = ip_list[i].xpath(f'./td[2]/text()')[0]
40 | # 去除无用字符,并拼接为ip可用格式
41 | ip_msg = "http://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n")
42 | poxyz = {
43 | "http": ip_msg
44 | }
45 | test_ip(poxyz)
46 | except IndexError:
47 | pass
48 |
--------------------------------------------------------------------------------
/simple_ip_proxy_pool/all_ip_agent/get_89ip.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/23 10:14
3 | # @Author : Torres-圣君
4 | # @File : get_89ip.py
5 | # @Software : PyCharm
6 | # https://www.89ip.cn/index_1.html
7 | import aiohttp
8 | import asyncio
9 | from user_agent import get_ua
10 | from test_save import test_ip
11 | from lxml import etree
12 |
13 |
14 | def get_data(num):
15 | loop_ = asyncio.new_event_loop()
16 | asyncio.set_event_loop(loop_)
17 | loop = asyncio.get_event_loop()
18 | urls = [f"https://www.89ip.cn/index_{str(i)}.html" for i in range(1, num+1)]
19 | tasks = [loop.create_task(parse(url)) for url in urls]
20 | loop.run_until_complete(asyncio.wait(tasks))
21 |
22 |
23 | async def parse(url):
24 | try:
25 | headers = {
26 | "User-Agent": get_ua()
27 | }
28 | timeout = aiohttp.ClientTimeout(total=1000)
29 | async with aiohttp.ClientSession(timeout=timeout) as session:
30 | async with session.get(url, headers=headers) as res:
31 | page = etree.HTML(await res.text())
32 | ip_list = page.xpath('//table//tr')
33 | del ip_list[0]
34 | # print(len(ip_list))
35 |                     for i in range(0, len(ip_list)):  # 表头行已在上面删除,从第一条数据开始遍历
36 | # 提取ip地址
37 | ip_address = ip_list[i].xpath(f'./td[1]/text()')[0]
38 | # 提取ip端口
39 | ip_port = ip_list[i].xpath(f'./td[2]/text()')[0]
40 | # 去除无用字符,并拼接为ip可用格式
41 | ip_msg = "http://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n")
42 | poxyz = {
43 | "http": ip_msg
44 | }
45 | test_ip(poxyz)
46 | except IndexError:
47 | pass
48 |
--------------------------------------------------------------------------------
/simple_ip_proxy_pool/all_ip_agent/get_ihuan.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/23 10:14
3 | # @Author : Torres-圣君
4 | # @File : get_ihuan.py
5 | # @Software : PyCharm
6 | # https://ip.ihuan.me/address/5Lit5Zu9.html
7 | from user_agent import get_ua
8 | from test_save import test_ip
9 | import time
10 | import requests
11 | from lxml import etree
12 |
13 | main_url = "https://ip.ihuan.me/address/5Lit5Zu9.html/"
14 | next_url = ""
15 | headers = {
16 | "User-Agent": get_ua()
17 | }
18 |
19 |
20 | def get_data(num):
21 | global next_url
22 | next_url = parse(main_url)
23 | for i in range(1, num+1):
24 | time.sleep(1)
25 | parse(next_url)
26 |
27 |
28 | def parse(url):
29 | try:
30 | global next_url
31 | res = requests.get(url, headers=headers)
32 | page = etree.HTML(res.text)
33 | ip_list = page.xpath('//table//tr')
34 | # print(len(ip_list))
35 | for i in range(1, len(ip_list)):
36 | # 提取ip地址
37 | ip_address = ip_list[i].xpath(f'./td[1]/a/text()')[0]
38 | # 提取ip端口
39 | ip_port = ip_list[i].xpath(f'./td[2]/text()')[0]
40 | # 提取ip类型
41 | ip_type = ip_list[i].xpath(f'./td[5]/text()')[0]
42 | if ip_type == "支持":
43 | # 去除无用字符,并拼接为ip可用格式
44 | ip_msg = "https://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n")
45 | poxyz = {
46 | "https": ip_msg
47 | }
48 | else:
49 | # 去除无用字符,并拼接为ip可用格式
50 | ip_msg = "http://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n")
51 | poxyz = {
52 | "http": ip_msg
53 | }
54 | test_ip(poxyz)
55 | next_url = main_url + page.xpath('//ul[@class="pagination"]/li[3]/a/@href')[0]
56 | return next_url
57 | except IndexError:
58 | pass
59 |
--------------------------------------------------------------------------------
/simple_ip_proxy_pool/all_ip_agent/get_ip3366.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/23 10:12
3 | # @Author : Torres-圣君
4 | # @File : get_ip3366.py
5 | # @Software : PyCharm
6 | # https://proxy.ip3366.net/free/?action=china&page=1
7 | import asyncio
8 | import aiohttp
9 | from user_agent import get_ua
10 | from test_save import test_ip
11 | from lxml import etree
12 |
13 |
14 | def get_data(num):
15 | loop_ = asyncio.new_event_loop()
16 | asyncio.set_event_loop(loop_)
17 | loop = asyncio.get_event_loop()
18 | urls = [f"https://proxy.ip3366.net/free/?action=china&page={str(i)}" for i in range(1, num+1)]
19 | tasks = [loop.create_task(parse(url)) for url in urls]
20 | loop.run_until_complete(asyncio.wait(tasks))
21 |
22 |
23 | async def parse(url):
24 | try:
25 | headers = {
26 | "User-Agent": get_ua()
27 | }
28 | timeout = aiohttp.ClientTimeout(total=1000)
29 | async with aiohttp.ClientSession(timeout=timeout) as session:
30 | async with session.get(url, headers=headers) as res:
31 | page = etree.HTML(await res.text())
32 | ip_list = page.xpath('//*[@id="content"]/section/div[2]/table//tr')
33 | del ip_list[0]
34 | # print(len(ip_list))
35 |                     for i in range(0, len(ip_list)):  # 表头行已在上面删除,从第一条数据开始遍历
36 | # 提取ip地址
37 | ip_address = ip_list[i].xpath(f'./td[1]/text()')[0]
38 | # 提取ip端口
39 | ip_port = ip_list[i].xpath(f'./td[2]/text()')[0]
40 | # 提取ip类型
41 | ip_type = ip_list[i].xpath(f'./td[4]/text()')[0]
42 | if ip_type == "HTTPS":
43 | # 去除无用字符,并拼接为ip可用格式
44 | ip_msg = "https://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n")
45 | poxyz = {
46 | "https": ip_msg
47 | }
48 | else:
49 | # 去除无用字符,并拼接为ip可用格式
50 | ip_msg = "http://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n")
51 | poxyz = {
52 | "http": ip_msg
53 | }
54 | test_ip(poxyz)
55 | except IndexError:
56 | pass
57 |
--------------------------------------------------------------------------------
/simple_ip_proxy_pool/all_ip_agent/get_jiangxianli.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/23 16:33
3 | # @Author : Torres-圣君
4 | # @File : get_jiangxianli.py
5 | # @Software : PyCharm
6 | # https://ip.jiangxianli.com/blog.html?page=1
7 | import asyncio
8 | import aiohttp
9 | from user_agent import get_ua
10 | from test_save import test_ip
11 | from lxml import etree
12 |
13 |
14 | headers = {
15 | "User-Agent": get_ua()
16 | }
17 |
18 |
19 | def get_data(num):
20 | loop_ = asyncio.new_event_loop()
21 | asyncio.set_event_loop(loop_)
22 | loop = asyncio.get_event_loop()
23 | urls = [f"https://ip.jiangxianli.com/blog.html?page={str(int(i/5)+1)}" for i in range(1, num+1)]
24 |     tasks = [loop.create_task(get_page(url)) for url in urls]  # 先请求博客列表页,再由get_page进入详情页解析
25 | loop.run_until_complete(asyncio.wait(tasks))
26 |
27 |
28 | async def get_page(url):
29 | try:
30 | timeout = aiohttp.ClientTimeout(total=1000)
31 | async with aiohttp.ClientSession(timeout=timeout) as session:
32 | async with session.get(url, headers=headers, timeout=2) as res:
33 | page = etree.HTML(await res.text())
34 | div_list = page.xpath('//div[@class="contar-wrap"]/div')
35 | for div in div_list:
36 | son_url = div.xpath('./div/h3/a/@href')[0]
37 | await parse(son_url)
38 | except IndexError:
39 | pass
40 |
41 |
42 | async def parse(son_url):
43 | try:
44 | timeout = aiohttp.ClientTimeout(total=1000)
45 | async with aiohttp.ClientSession(timeout=timeout) as session:
46 | async with session.get(son_url, headers=headers) as res:
47 | page = etree.HTML(await res.text())
48 | ip_list = page.xpath('//div[@class="item"]/div/p/text()')
49 | for i in range(0, len(ip_list)):
50 | # 去除无用字符,并拼接为ip可用格式
51 | ip_msg = "http://" + ip_list[i].split("@")[0].strip(" \t\n")
52 | poxyz = {
53 | "http": ip_msg
54 | }
55 | test_ip(poxyz)
56 | except IndexError:
57 | pass
58 |
--------------------------------------------------------------------------------
/simple_ip_proxy_pool/all_ip_agent/get_kuaidaili.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/23 16:45
3 | # @Author : Torres-圣君
4 | # @File : get_kuaidaili.py
5 | # @Software : PyCharm
6 | # https://www.kuaidaili.com/free/inha/1/
7 | import asyncio
8 | import aiohttp
9 | from user_agent import get_ua
10 | from test_save import test_ip
11 | from lxml import etree
12 |
13 |
14 | def get_data(num):
15 | loop_ = asyncio.new_event_loop()
16 | asyncio.set_event_loop(loop_)
17 | loop = asyncio.get_event_loop()
18 | urls = [f"https://www.kuaidaili.com/free/inha/{str(i)}/" for i in range(1, num+1)]
19 | tasks = [loop.create_task(parse(url)) for url in urls]
20 | loop.run_until_complete(asyncio.wait(tasks))
21 |
22 |
23 | async def parse(url):
24 | try:
25 | headers = {
26 | "User-Agent": get_ua()
27 | }
28 | timeout = aiohttp.ClientTimeout(total=1000)
29 | async with aiohttp.ClientSession(timeout=timeout) as session:
30 | async with session.get(url, headers=headers) as res:
31 | page = etree.HTML(await res.text())
32 | ip_list = page.xpath('//table//tr')
33 | del ip_list[0]
34 | # print(len(ip_list))
35 |                     for i in range(0, len(ip_list)):  # 表头行已在上面删除,从第一条数据开始遍历
36 | # 提取ip地址
37 | ip_address = ip_list[i].xpath(f'./td[1]/text()')[0]
38 | # 提取ip端口
39 | ip_port = ip_list[i].xpath(f'./td[2]/text()')[0]
40 | # 去除无用字符,并拼接为ip可用格式
41 | ip_msg = "http://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n")
42 | poxyz = {
43 | "http": ip_msg
44 | }
45 | test_ip(poxyz)
46 | except IndexError:
47 | pass
48 |
--------------------------------------------------------------------------------
/simple_ip_proxy_pool/all_ip_agent/test_save.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/23 12:09
3 | # @Author : Torres-圣君
4 | # @File : test_save.py
5 | # @Software : PyCharm
6 | import requests
7 | import json
8 | from user_agent import get_ua
9 |
10 |
11 | # 测试ip代理是否可用
12 | def test_ip(poxyz):
13 | url = "http://www.baidu.com"
14 | headers = {
15 | "User-Agent": get_ua()
16 | }
17 | try:
18 | res = requests.get(url=url, headers=headers, proxies=poxyz, timeout=1)
19 | if res.status_code == 200:
20 | save_ip(poxyz)
21 | except Exception:
22 | pass
23 |
24 |
25 | # 将可用的代理进行保存
26 | def save_ip(poxyz):
27 | data = json.dumps(poxyz, indent=1)
28 | with open("./ip_pool.json", "a") as w:
29 | w.write(data+",")
30 | print(f"<{poxyz}>已写入IP代理池...")
31 |
32 |
--------------------------------------------------------------------------------
/simple_ip_proxy_pool/all_ip_agent/user_agent.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/23 11:59
3 | # @Author : Torres-圣君
4 | # @File : user_agent.py
5 | # @Software : PyCharm
6 | import random
7 |
8 |
9 | def get_ua():
10 | user_agent_list = [
11 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
12 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
13 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
14 | "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
15 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
16 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
17 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
18 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
20 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
21 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
22 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
23 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
24 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
25 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
26 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
27 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
28 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
29 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
30 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
31 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
32 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
33 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
34 | "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
35 | "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
36 | "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
37 | "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
38 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
39 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
40 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
41 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
42 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
43 | "UCWEB7.0.2.37/28/999",
44 | "NOKIA5700/ UCWEB7.0.2.37/28/999",
45 | "Openwave/ UCWEB7.0.2.37/28/999",
46 | "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
47 | "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
48 | ]
49 | # 设置UA伪装
50 | return random.choice(user_agent_list)
51 |
--------------------------------------------------------------------------------
/simple_ip_proxy_pool/ip_pool.json:
--------------------------------------------------------------------------------
1 | {
2 | "https": "https://58.220.95.42:10174"
3 | },{
4 | "https": "https://118.163.13.200:8080"
5 | },{
6 | "http": "http://223.96.90.216:8085"
7 | },{
8 | "http": "http://165.225.202.95:10605"
9 | },{
10 | "https": "https://139.198.157.59:7890"
11 | },{
12 | "http": "http://120.220.220.95:8085"
13 | },{
14 | "http": "http://182.61.201.201:80"
15 | },{
16 | "http": "http://165.225.206.106:10605"
17 | },{
18 | "https": "https://117.26.40.251:3712"
19 | },{
20 | "http": "http://39.130.150.43:80"
21 | },{
22 | "https": "https://103.38.80.138:3128"
23 | },{
24 | "http": "http://39.130.150.42:80"
25 | },{
26 | "http": "http://113.96.62.246:8081"
27 | },{
28 | "http": "http://39.130.150.44:80"
29 | },{
30 | "http": "http://112.6.117.135:8085"
31 | },{
32 | "http": "http://39.130.150.44:80"
33 | },{
34 | "http": "http://165.225.76.175:10605"
35 | },{
36 | "https": "https://223.112.99.150:80"
37 | },{
38 | "http": "http://39.130.150.44:80"
39 | },{
40 | "https": "https://40.83.102.86:80"
41 | },{
42 | "https": "https://113.21.237.83:443"
43 | },{
44 | "http": "http://112.6.117.178:8085"
45 | },{
46 | "http": "http://218.59.139.238:80"
47 | },{
48 | "https": "https://210.5.10.87:53281"
49 | },{
50 | "http": "http://183.247.199.153:30001"
51 | },{
52 | "http": "http://112.6.117.178:8085"
53 | },{
54 | "http": "http://47.113.90.161:83"
55 | },{
56 | "https": "https://222.69.240.130:8001"
57 | },{
58 | "https": "https://14.20.235.19:45770"
59 | },{
60 | "http": "http://165.225.204.12:10605"
61 | },{
62 | "http": "http://103.148.72.192:80"
63 | },{
64 | "http": "http://165.225.76.165:10605"
65 | },{
66 | "http": "http://120.220.220.95:8085"
67 | },{
68 | "http": "http://103.37.141.69:80"
69 | },{
70 | "https": "https://103.133.177.141:443"
71 | },{
72 | "http": "http://223.96.90.216:8085"
73 | },{
74 | "http": "http://120.220.220.95:8085"
75 | },{
76 | "http": "http://221.122.91.60:80"
77 | },{
78 | "https": "https://47.93.48.155:8888"
79 | },{
80 | "http": "http://103.148.72.192:80"
81 | },{
82 | "http": "http://120.220.220.95:8085"
83 | },{
84 | "https": "https://42.193.253.152:8089"
85 | },
--------------------------------------------------------------------------------
/simple_ip_proxy_pool/ip_pool_run.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/23 10:05
3 | # @Author : Torres-圣君
4 | # @File : ip_pool_run.py
5 | # @Software : PyCharm
6 | import asyncio
7 | from all_ip_agent import get_66ip, get_89ip, get_ip3366, get_ihuan, get_kuaidaili, get_jiangxianli
8 | import threading
9 | import os
10 |
11 |
12 | def thread_run(num):
13 | threads = [
14 | threading.Thread(target=get_66ip.get_data, args=(num,)),
15 | threading.Thread(target=get_89ip.get_data, args=(num,)),
16 | threading.Thread(target=get_ip3366.get_data, args=(num,)),
17 | threading.Thread(target=get_ihuan.get_data, args=(num,)),
18 | threading.Thread(target=get_kuaidaili.get_data, args=(num,)),
19 | threading.Thread(target=get_jiangxianli.get_data, args=(num,)),
20 | ]
21 | for thread in threads:
22 | thread.start()
23 | for thread in threads:
24 | thread.join()
25 |
26 |
27 | if __name__ == '__main__':
28 | try:
29 | os.remove("ip_pool.json")
30 | except:
31 | pass
32 | finally:
33 |         # 爬取所有网站前5页可用的IP代理
34 | thread_run(5)
35 | print("爬取完毕!")
36 |
--------------------------------------------------------------------------------
/taobao_commodity_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨获取淘宝商品相关信息✨
2 | - 淘宝官网:[https://www.taobao.com/](https://www.taobao.com/)
3 |
4 | - 输入需要搜索的商品名,自动获取搜索结果的商品信息,包含:
5 | - 商品名称
6 | - 商品链接
7 | - 商品价格
8 | - 商品销量
9 | - 店铺名称
10 |
11 | - 新增功能:
12 |
13 | ```text
14 | 1. 采集前先获取商品总页数后,再进入循环采集每一页的数据
15 | 2. 滑块验证,使用鼠标动作链实现自动拖拉滑块,当反复尝试无果后,会提示需人工手动滑动,待人工滑动完成后程序将继续采集数据
16 | 3. 使用openpyxl将采集的数据保存至Excel表格中,通过采用一页保存一次的方法,防止因某页数据获取失败影响前者采取到的数据
17 | 4. 设置Excel表格的样式,比如:居中、行高、列宽等,更人性化的展现数据信息
18 | ```
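
- 采集完成后,可以用 `openpyxl` 把表格读回来做简单统计,下面是一个参考示例(以仓库中的 `data/光遇_商品信息.xlsx` 为例,列顺序与 `run_spider.py` 写入时一致):

```python
# 读取采集结果并统计平均价格的最小示例
from openpyxl import load_workbook

wb = load_workbook("./data/光遇_商品信息.xlsx", read_only=True)
sheet = wb["sheet1"]

prices = []
# 第一行为表头,从第二行开始取 B 列(商品价格)
for row in sheet.iter_rows(min_row=2, values_only=True):
    if isinstance(row[1], (int, float)):
        prices.append(row[1])

if prices:
    print(f"共 {len(prices)} 条商品,平均价格约 {sum(prices) / len(prices):.2f} 元")
wb.close()
```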
19 |
20 | - 该爬虫使用到的模块:
21 | - re
22 | - time
23 | - random
24 | - selenium
25 | - openpyxl
--------------------------------------------------------------------------------
/taobao_commodity_spider/data/光遇_商品信息.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/taobao_commodity_spider/data/光遇_商品信息.xlsx
--------------------------------------------------------------------------------
/taobao_commodity_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/5/24 14:57
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import random
7 | import time
8 | import re
9 | from selenium import webdriver
10 | from selenium.webdriver.common.keys import Keys
11 | from selenium.webdriver import ActionChains as ac
12 | from openpyxl import Workbook
13 | from openpyxl import load_workbook
14 | from openpyxl.styles import Alignment
15 |
16 |
17 | class SaveTaobaoData:
18 | def __init__(self, search_content):
19 | # 搜索内容
20 | self.search_content = search_content
21 | # 数据计数器
22 | self.count = 1
23 | # 表格内容居中
24 | self.align = Alignment(horizontal='center', vertical='center', wrap_text=True)
25 | self.options = webdriver.ChromeOptions()
26 | self.options.add_experimental_option('excludeSwitches', ['enable-automation'])
27 | self.options.add_experimental_option('useAutomationExtension', False)
28 | self.driver = webdriver.Chrome(options=self.options)
29 | self.driver.execute_cdp_cmd(
30 | 'Page.addScriptToEvaluateOnNewDocument',
31 | {
32 | 'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
33 | }
34 | )
35 |
36 | def get_page(self):
37 | # 访问淘宝网址
38 | self.driver.get('https://www.taobao.com/')
39 | time.sleep(3) # 停一会防止出意外
40 | # 向搜索框中添加内容,并按下回车进行搜索
41 | self.driver.find_element_by_xpath("//input[@aria-label='请输入搜索文字']").send_keys(self.search_content, Keys.ENTER)
42 | # 扫码登陆
43 | self.driver.find_element_by_xpath('//*[@id="login"]/div[1]/i').click()
44 | # 给20秒时间登陆自己的账号,根据自己的速度来
45 | time.sleep(20)
46 | # 进入循环获取每页数据信息
47 | self.get_next_page()
48 |
49 | def get_page_data(self):
50 | # 判断是否出现验证码
51 | self.driver = self.validation()
52 | # 模拟真人操作,拖动滚动条
53 | for x in range(1, 11, 2):
54 | time.sleep(0.5)
55 | j = x / 10
56 | js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % j
57 | self.driver.execute_script(js)
58 | # 页面存放的所有商品
59 | div_list = self.driver.find_elements_by_xpath('//*[@id="mainsrp-itemlist"]/div/div/div[1]/div')
60 | print("当前页面的总商品数:", len(div_list))
61 | # 首次数据添加表头
62 | if self.count == 1:
63 | data_list = [
64 | ['商品标题', '商品价格', '商品销量', '店铺名称', '商品链接']
65 | ]
66 | else:
67 | data_list = []
68 | for div in div_list:
69 | try:
70 | item = [
71 | # 商品标题
72 | div.find_element_by_xpath('./div[2]/div[2]/a').text.strip(" \t\n"),
73 | # 商品价格
74 | float(div.find_element_by_xpath('./div[2]/div[1]/div[1]/strong').text),
75 | # 商品销量
76 | div.find_element_by_xpath('./div[2]/div[1]/div[2]').text,
77 | # 店铺名称
78 | div.find_element_by_xpath('./div[2]/div[3]/div[1]/a/span[2]').text,
79 | # 商品链接
80 | div.find_element_by_xpath('./div/div/div[1]/a').get_attribute('href').strip(" \t\n")
81 | ]
82 | # 展示爬取到的数据
83 | print(item)
84 | # 追加进列表
85 | data_list.append(item)
86 | except:
87 | pass
88 | # 保存数据
89 | self.save_data(data_list)
90 |
91 | def get_next_page(self):
92 | # 判断是否出现验证码
93 | self.driver = self.validation()
94 | # 获取关键字商品的总页数
95 | get_page_number = self.driver.find_element_by_xpath('//*[@id="mainsrp-pager"]/div/div/div/div[1]').text
96 | page_number = int(re.findall(r'(\d+)', get_page_number)[0])
97 | print(f"共获取到数据:{page_number}页")
98 | # 循环访问所有页面
99 | for i in range(0, page_number*44, 44):
100 | # 构造每页的链接
101 | self.driver.get(f"https://s.taobao.com/search?q={self.search_content}&s={i}")
102 | # 隐式等待
103 | self.driver.implicitly_wait(10)
104 | # 解析页面数据
105 | self.get_page_data()
106 | print(f"第{int(i/44+1)}页数据写入完成!")
107 |
108 | def validation(self):
109 | content = self.driver.page_source
110 | if "亲,请拖动下方滑块完成验证" in content:
111 | con = self.hua_kuai()
112 | count = 1
113 | while "亲,请拖动下方滑块完成验证" in con and count <= 3:
114 | con = self.hua_kuai()
115 | count += 1
116 |             if "亲,请拖动下方滑块完成验证" in con:
117 | print("已尽力尝试自动滑动验证码,但抱歉没能通过,请手动滑一下吧~\n")
118 | input("手动滑动后,请等待页面“加载完成”,扣1并按回车键继续采集:")
119 | con = self.driver.page_source
120 | return self.driver
121 |
122 | def hua_kuai(self):
123 | ele = self.driver.find_element_by_xpath('//*[@id="nc_1_n1z"]')
124 | # 按住滑块元素不放
125 | ac(self.driver).click_and_hold(ele).perform()
126 |         # 拖动滑块,300为需要水平滑动的距离
127 | ac(self.driver).move_by_offset(300, random.randint(-5, 5)).perform()
128 | # 松开鼠标
129 | ac(self.driver).release().perform()
130 | # 加载页面
131 | time.sleep(2)
132 | try:
133 | # 点击重新滑动按钮
134 |             self.driver.find_element_by_xpath('//*[@id="nc_1_refresh1"]').click()
135 | except:
136 | pass
137 | return self.driver.page_source
138 |
139 | def save_data(self, data_list):
140 | # 第一次写入需创建表格,后者追加内容
141 | if self.count == 1:
142 | # 创建新的excel表格
143 | wb = Workbook()
144 | sheet = wb.create_sheet("sheet1", -1)
145 | # 设置列宽
146 | sheet.column_dimensions['A'].width = 70
147 | sheet.column_dimensions['B'].width = 10
148 | sheet.column_dimensions['C'].width = 15
149 | sheet.column_dimensions['D'].width = 25
150 | sheet.column_dimensions['E'].width = 80
151 | else:
152 | wb = load_workbook(f"./data/{self.search_content}_商品信息.xlsx")
153 | sheet = wb["sheet1"]
154 | # 遍历表格索引,写入商品数据
155 | for x in range(len(data_list)):
156 | # 设置行高
157 |             sheet.row_dimensions[x + self.count].height = 15
158 | for y in range(len(data_list[x])):
159 | sheet.cell(x + self.count, y + 1).value = data_list[x][y]
160 | # 居中显示
161 | sheet.cell(x + self.count, y + 1).alignment = self.align
162 | # 保存该Excel表格
163 | wb.save(f"./data/{self.search_content}_商品信息.xlsx")
164 | # 累加计数器,用于追加表格内容
165 | self.count += len(data_list)
166 |
167 |
168 | if __name__ == '__main__':
169 | text = input("请输入需要搜索的关键字:")
170 | run_spider = SaveTaobaoData(text)
171 | run_spider.get_page()
172 |
--------------------------------------------------------------------------------
/umeitu_dongman_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨异步爬取优美图库动漫图片✨
2 | - 优美图库官网:[https://www.umeitu.com/katongdongman/dongmantupian/](https://www.umeitu.com/katongdongman/dongmantupian/)
3 |
4 | - 输入指定页数后,异步下载页面上的所有图片
5 | - 下载的图片都保存在:`all_images/`
6 | - 该爬虫使用到的模块:
7 | - requests
8 | - aiohttp
9 | - asyncio
10 | - lxml
--------------------------------------------------------------------------------
/umeitu_dongman_spider/all_images/AIR神尾观铃双马尾高清卡通图片.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/umeitu_dongman_spider/all_images/AIR神尾观铃双马尾高清卡通图片.jpg
--------------------------------------------------------------------------------
/umeitu_dongman_spider/all_images/樱花庄的宠物女孩椎名真白高清卡通图片.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/umeitu_dongman_spider/all_images/樱花庄的宠物女孩椎名真白高清卡通图片.jpg
--------------------------------------------------------------------------------
/umeitu_dongman_spider/all_images/软萌系列动漫头像高清卡通图片.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/umeitu_dongman_spider/all_images/软萌系列动漫头像高清卡通图片.jpg
--------------------------------------------------------------------------------
/umeitu_dongman_spider/all_images/黄昏之大地的炼金术士高清卡通图片.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/umeitu_dongman_spider/all_images/黄昏之大地的炼金术士高清卡通图片.jpg
--------------------------------------------------------------------------------
/umeitu_dongman_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2021/12/5 15:37
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import requests
7 | import aiohttp
8 | import asyncio
9 | from lxml import etree
10 |
11 |
12 | class uMeitu:
13 | def __init__(self):
14 | self.url = "https://www.umeitu.com/e/action/get_img_a.php"
15 | self.headers = {
16 | 'referer': 'https://www.umeitu.com/katongdongman/dongmantupian/',
17 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.44',
18 | }
19 |
20 | def get_img_data(self, i: int):
21 | data = {
22 | "next": i,
23 | "table": "news",
24 | "action": "getmorenews",
25 | "limit": 10,
26 | "small_length": 120,
27 | "classid": 48
28 | }
29 | res = requests.post(self.url, headers=self.headers, data=data)
30 | page_data = etree.HTML(res.text)
31 | imgs_list = page_data.xpath('//ul/li/a')
32 | # 存放图片名称的列表
33 | task_name = []
34 | # 存放图片链接的列表
35 | task_link = []
36 | for img in imgs_list:
37 | # 图片名称
38 | img_name = img.xpath('./span/text()')[0]
39 | # 图片链接
40 | img_link = img.xpath('./img/@src')[0].replace("small", "")
41 | task_name.append(img_name)
42 | task_link.append(img_link)
43 | self.async_spider(task_name, task_link)
44 |
45 | async def download_imgs(self, img_name, img_link):
46 | try:
47 | async with aiohttp.ClientSession() as session:
48 | async with session.get(img_link, headers=self.headers) as res:
49 | with open(f'all_images/{img_name}.jpg', "wb") as w:
50 | w.write(await res.content.read())
51 | print(f"<{img_name}>下载完成")
52 | except Exception:
53 | pass
54 |
55 | def async_spider(self, task_name, task_link):
56 | # 获取事件循环
57 | loop = asyncio.get_event_loop()
58 | # 创建task列表
59 | tasks = [
60 | loop.create_task(self.download_imgs(task_name[i], task_link[i])) for i in range(0, len(task_name))
61 | ]
62 | # 执行爬虫事件列表
63 | loop.run_until_complete(asyncio.wait(tasks))
64 |
65 | def run(self):
66 | num = int(input("请输入要下载的图片页数:"))
67 | for i in range(1, num+1):
68 | self.get_img_data(i)
69 |
70 |
71 | if __name__ == '__main__':
72 | u = uMeitu()
73 | u.run()
74 |
--------------------------------------------------------------------------------
/ximalaya_audio_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨保存喜马拉雅免费音频✨
2 | - 喜马拉雅官网:[https://www.ximalaya.com](https://www.ximalaya.com)
3 |
4 | - 输入专辑ID后,下载该专辑下所有免费的音频
5 |
6 | - 该爬虫使用到的模块:
7 | - requests
8 |
--------------------------------------------------------------------------------
/ximalaya_audio_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/6/29 10:20
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import requests
7 |
8 | headers = {
9 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37"
10 | }
11 |
12 |
13 | def run(author_id):
14 | count = 0
15 | while True:
16 | count += 1
17 | url = f"https://www.ximalaya.com/revision/album/v1/getTracksList?albumId={author_id}&pageNum={count}&sort=0"
18 | res = requests.get(url, headers=headers)
19 |         audio_link_list = res.json()['data']['tracks']
20 |         if len(audio_link_list) == 0:
21 | print("所有音频爬取完毕!")
22 | break
23 | else:
24 |             for audio_link in audio_link_list:
25 | audio_title = audio_link['title']
26 | audio_id = audio_link['trackId']
27 | audio_url = f"https://www.ximalaya.com/revision/play/v1/audio?id={audio_id}&ptype=1"
28 | print("正在保存:", audio_title)
29 | save_audio(audio_title, audio_url)
30 |
31 |
32 | def save_audio(audio_title, audio_url):
33 | audio_res = requests.get(audio_url, headers=headers).json()['data']['src']
34 | audio_data = requests.get(audio_res, headers=headers).content
35 | with open(f'{audio_title}.mp3', 'wb') as w:
36 | w.write(audio_data)
37 | print(audio_title, "保存完成!")
38 |
39 |
40 | if __name__ == '__main__':
42 |     # 专辑ID(接口参数albumId)
42 | author_id = 10092072
43 | run(author_id)
44 |
--------------------------------------------------------------------------------
/yibu_book_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨获取异步社区所有图书信息✨
2 | - 异步社区图书官网:[https://www.epubit.com/books](https://www.epubit.com/books)
3 |
4 | - 爬取异步社区所有图书信息,包含:
5 | - 书名
6 | - 书的作者
7 | - 书的价格
8 | - 书的标签
9 | - 书的链接
10 | - 爬取的数据存储方式:
11 | - 通过连接MongoDB数据库,将其存入数据库
12 | - 该爬虫使用到的模块:
13 | - requests
14 | - pymongo
--------------------------------------------------------------------------------
/yibu_book_spider/get_proxyz.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/1 17:00
3 | # @Author : Torres-圣君
4 | # @File : get_proxyz.py
5 | # @Software : PyCharm
6 | import random
7 |
8 |
9 | def get_proxies():
10 |     proxies_list = [  # requests的proxies参数要求以协议名(http/https)作为键
11 |         {
12 |             "http": "http://39.175.67.28:30001"
13 |         }, {
14 |             "http": "http://101.133.138.238:8118"
15 |         }, {
16 |             "http": "http://58.246.58.150:9002"
17 |         }, {
18 |             "http": "http://112.6.117.178:8085"
19 |         }, {
20 |             "http": "http://221.122.91.74:9401"
21 |         }, {
22 |             "http": "http://58.220.95.116:10122"
23 |         }, {
24 |             "http": "http://58.220.95.32:10174"
25 |         }, {
26 |             "http": "http://220.168.132.43:9015"
27 |         }, {
28 |             "http": "http://112.6.117.135:8085"
29 |         }, {
30 |             "http": "http://183.131.85.16:7302"
31 |         }, {
32 |             "http": "http://223.96.90.216:8085"
33 |         }, {
34 |             "http": "http://120.133.231.92:8000"
35 |         }, {
36 |             "http": "http://58.220.95.35:10174"
37 |         }, {
38 |             "http": "http://47.97.191.179:8018"
39 |         }, {
40 |             "http": "http://58.220.95.116:10122"
41 |         }, {
42 |             "http": "http://221.122.91.64:9401"
43 |         }, {
44 |             "http": "http://123.57.246.163:8118"
45 |         },
46 | ]
47 | return random.choice(proxies_list)
48 |
--------------------------------------------------------------------------------
/yibu_book_spider/get_ua.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/4/1 17:02
3 | # @Author : Torres-圣君
4 | # @File : get_ua.py
5 | # @Software : PyCharm
6 | import random
7 |
8 |
9 | def get_ua():
10 | user_agent_list = [
11 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
12 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
13 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
14 | "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
15 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
16 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
17 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
18 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
20 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
21 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
22 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
23 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
24 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
25 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
26 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
27 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
28 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
29 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
30 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
31 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
32 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
33 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
34 | "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
35 | "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
36 | "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
37 | "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
38 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
39 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
40 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
41 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
42 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
43 | "UCWEB7.0.2.37/28/999",
44 | "NOKIA5700/ UCWEB7.0.2.37/28/999",
45 | "Openwave/ UCWEB7.0.2.37/28/999",
46 | "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
47 | "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
48 | ]
49 | # 设置UA伪装
50 | return random.choice(user_agent_list)
51 |
--------------------------------------------------------------------------------
/yibu_book_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/3/30 21:08
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import requests
7 | from get_proxyz import get_proxies
8 | from get_ua import get_ua
9 | from pymongo import MongoClient
10 |
11 |
12 | class CatchYibuBook:
13 | def __init__(self):
14 | self.url = f'https://www.epubit.com/pubcloud/content/front/portal/getUbookList'
15 | # 初始化MongoDB数据库并创建数据库连接
16 | self.mongo_address = '127.0.0.1'
17 | self.client = MongoClient(self.mongo_address, 27017)
18 | self.db = self.client['book']
19 | self.col = self.db['yibutushu']
20 |
21 | def get_data(self, i):
22 | headers = {
23 | 'Origin-Domain': 'www.epubit.com',
24 | 'User-Agent': get_ua()
25 | }
26 | params = {
27 | 'page': i,
28 | 'row': 20,
29 | 'startPrice': None,
30 | 'endPrice': None,
31 | 'tagId': None,
32 | }
33 |
34 | res = requests.get(self.url, headers=headers, params=params, proxies=get_proxies())
35 | data = res.json()
36 | for i in range(0, 20):
37 | item = {}
38 | item['book_name'] = data['data']['records'][i]['name']
39 | item['book_author'] = data['data']['records'][i]['authors']
40 | item['book_price'] = data['data']['records'][i]['price']
41 | item['book_tagNames'] = data['data']['records'][i]['tagNames']
42 |             item['book_link'] = "https://www.epubit.com/bookDetails?id=" + data['data']['records'][i]['code']
43 | self.col.insert_one(item)
44 | print(item)
45 |
46 | def run(self, page):
47 | for i in range(1, page+1):
48 | # 设置抓取数据的页数
49 |             self.get_data(i)
50 | # 断开连接mongo
51 | self.client.close()
52 |
53 |
54 | if __name__ == '__main__':
55 | num = int(input("请输入需要爬取的页数:"))
56 | # 实例化对象
57 | catch_msg = CatchYibuBook()
58 | catch_msg.run(num)
59 |
--------------------------------------------------------------------------------
/yiqing_data_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨获取城市实时疫情数据信息✨
2 | - 腾讯疫情数据API:[https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=diseaseh5Shelf](https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=diseaseh5Shelf)
3 |
4 | - 输入城市名称获取疫情数据(接口请求示例见下方),包含:
5 | - 最近更新日期
6 | - 新增确诊人数
7 | - 目前确诊人数
8 | - 累计确诊人数
9 | - 累计治愈人数
10 | - 累计死亡人数
11 | - 该爬虫使用到的模块:
12 | - json
13 | - requests
14 |
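- 下面是请求该接口的最小示例(仅作演示):先查看返回的 JSON 结构,再决定提取哪些字段:

```python
# 请求腾讯疫情接口并查看返回结构的最小示例
import json
import requests

url = "https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=diseaseh5Shelf"
headers = {"user-agent": "Mozilla/5.0"}

res = requests.get(url, headers=headers, timeout=10).json()
# 先打印顶层字段,再逐层定位到省市数据
print(list(res.keys()))
print(json.dumps(res, ensure_ascii=False)[:500])     # 预览前 500 个字符
```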
--------------------------------------------------------------------------------
/yiqing_data_spider/city_list.json:
--------------------------------------------------------------------------------
1 | {
2 | "省": [
3 | "台湾",
4 | "香港",
5 | "澳门",
6 | "天津",
7 | "安徽",
8 | "吉林",
9 | "广东",
10 | "上海",
11 | "福建",
12 | "内蒙古",
13 | "山东",
14 | "江苏",
15 | "北京",
16 | "重庆",
17 | "四川",
18 | "陕西",
19 | "云南",
20 | "浙江",
21 | "江西",
22 | "湖北",
23 | "辽宁",
24 | "湖南",
25 | "河北",
26 | "河南",
27 | "甘肃",
28 | "黑龙江",
29 | "新疆",
30 | "宁夏",
31 | "西藏",
32 | "海南",
33 | "广西",
34 | "山西",
35 | "贵州",
36 | "青海"
37 | ],
38 | "市": [
39 | "地区待确认",
40 | "地区待确认",
41 | "地区待确认",
42 | "待确认",
43 | "境外输入",
44 | "河北区",
45 | "北辰区",
46 | "和平区",
47 | "河西区",
48 | "南开区",
49 | "东丽区",
50 | "西青区",
51 | "津南区",
52 | "滨海新区",
53 | "红桥区",
54 | "河东区",
55 | "蓟州区",
56 | "宁河区",
57 | "武清区",
58 | "宝坻区",
59 | "静海区",
60 | "外地来津",
61 | "宿州",
62 | "境外输入",
63 | "六安",
64 | "宣城",
65 | "滁州",
66 | "安庆",
67 | "淮北",
68 | "蚌埠",
69 | "黄山",
70 | "合肥",
71 | "淮南",
72 | "池州",
73 | "马鞍山",
74 | "阜阳",
75 | "亳州",
76 | "芜湖",
77 | "铜陵",
78 | "长春",
79 | "境外输入",
80 | "吉林市",
81 | "四平",
82 | "通化",
83 | "延边",
84 | "白城",
85 | "梅河口市",
86 | "长白山管委会",
87 | "松原",
88 | "辽源",
89 | "白山",
90 | "待确认",
91 | "深圳",
92 | "广州",
93 | "湛江",
94 | "境外输入",
95 | "珠海",
96 | "中山",
97 | "惠州",
98 | "肇庆",
99 | "茂名",
100 | "云浮",
101 | "江门",
102 | "佛山",
103 | "河源",
104 | "汕尾",
105 | "韶关",
106 | "阳江",
107 | "梅州",
108 | "汕头",
109 | "潮州",
110 | "揭阳",
111 | "清远",
112 | "东莞",
113 | "地区待确认",
114 | "黄浦",
115 | "浦东",
116 | "杨浦",
117 | "徐汇",
118 | "虹口",
119 | "静安",
120 | "闵行",
121 | "宝山",
122 | "长宁",
123 | "普陀",
124 | "嘉定",
125 | "崇明",
126 | "奉贤",
127 | "松江",
128 | "青浦",
129 | "境外输入",
130 | "金山",
131 | "外地来沪",
132 | "境外来沪",
133 | "地区待确认",
134 | "境外输入",
135 | "宁德",
136 | "莆田",
137 | "厦门",
138 | "漳州",
139 | "泉州",
140 | "南平",
141 | "福州",
142 | "三明",
143 | "龙岩",
144 | "地区待确认",
145 | "锡林郭勒",
146 | "境外输入",
147 | "赤峰",
148 | "呼和浩特",
149 | "鄂尔多斯",
150 | "巴彦淖尔",
151 | "乌海",
152 | "乌兰察布",
153 | "兴安盟",
154 | "通辽",
155 | "阿拉善盟",
156 | "包头",
157 | "呼伦贝尔",
158 | "境外输入",
159 | "青岛",
160 | "临沂",
161 | "淄博",
162 | "德州",
163 | "日照",
164 | "滨州",
165 | "枣庄",
166 | "威海",
167 | "泰安",
168 | "聊城",
169 | "济宁",
170 | "东营",
171 | "潍坊",
172 | "菏泽",
173 | "烟台",
174 | "济南",
175 | "地区待确认",
176 | "徐州",
177 | "南京",
178 | "盐城",
179 | "常州",
180 | "苏州",
181 | "无锡",
182 | "宿迁",
183 | "镇江",
184 | "泰州",
185 | "境外输入",
186 | "淮安",
187 | "连云港",
188 | "扬州",
189 | "南通",
190 | "地区待确认",
191 | "朝阳",
192 | "丰台",
193 | "海淀",
194 | "房山",
195 | "境外输入",
196 | "西城",
197 | "通州",
198 | "东城",
199 | "昌平",
200 | "大兴",
201 | "顺义",
202 | "石景山",
203 | "外地来京",
204 | "门头沟",
205 | "经济开发区",
206 | "涉奥闭环人员",
207 | "密云",
208 | "延庆",
209 | "怀柔",
210 | "平谷区",
211 | "地区待确认",
212 | "境外输入",
213 | "南岸区",
214 | "沙坪坝区",
215 | "綦江区",
216 | "荣昌区",
217 | "潼南区",
218 | "涪陵区",
219 | "长寿区",
220 | "奉节县",
221 | "大渡口区",
222 | "合川区",
223 | "万州区",
224 | "渝中区",
225 | "丰都县",
226 | "垫江县",
227 | "城口县",
228 | "石柱县",
229 | "铜梁区",
230 | "酉阳县",
231 | "秀山县",
232 | "璧山区",
233 | "巫溪县",
234 | "两江新区",
235 | "高新区",
236 | "大足区",
237 | "梁平区",
238 | "黔江区",
239 | "南川区",
240 | "开州区",
241 | "北碚区",
242 | "万盛经开区",
243 | "江北区",
244 | "江津区",
245 | "巫山县",
246 | "云阳县",
247 | "渝北区",
248 | "永川区",
249 | "武隆区",
250 | "巴南区",
251 | "忠县",
252 | "九龙坡区",
253 | "彭水县",
254 | "广安",
255 | "境外输入",
256 | "成都",
257 | "巴中",
258 | "乐山",
259 | "达州",
260 | "德阳",
261 | "广元",
262 | "遂宁",
263 | "资阳",
264 | "宜宾",
265 | "泸州",
266 | "雅安",
267 | "阿坝",
268 | "自贡",
269 | "南充",
270 | "凉山",
271 | "攀枝花",
272 | "绵阳",
273 | "眉山",
274 | "甘孜",
275 | "内江",
276 | "地区待确认",
277 | "境外输入",
278 | "西安",
279 | "咸阳",
280 | "延安",
281 | "汉中",
282 | "榆林",
283 | "铜川",
284 | "渭南",
285 | "杨凌",
286 | "宝鸡",
287 | "商洛",
288 | "安康",
289 | "地区待确认",
290 | "红河",
291 | "境外输入",
292 | "临沧",
293 | "普洱",
294 | "文山州",
295 | "昆明",
296 | "西双版纳州",
297 | "曲靖",
298 | "保山市",
299 | "昭通市",
300 | "怒江州",
301 | "德宏州",
302 | "大理",
303 | "楚雄州",
304 | "丽江市",
305 | "迪庆州",
306 | "玉溪",
307 | "地区待确认",
308 | "境外输入",
309 | "杭州",
310 | "嘉兴",
311 | "衢州",
312 | "金华",
313 | "宁波",
314 | "湖州",
315 | "绍兴",
316 | "舟山",
317 | "温州",
318 | "丽水",
319 | "台州",
320 | "省十里丰监狱",
321 | "地区待确认",
322 | "境外输入",
323 | "上饶",
324 | "抚州",
325 | "新余",
326 | "吉安",
327 | "宜春",
328 | "赣江新区",
329 | "景德镇",
330 | "鹰潭",
331 | "萍乡",
332 | "赣州",
333 | "南昌",
334 | "九江",
335 | "地区待确认",
336 | "境外输入",
337 | "鄂州",
338 | "恩施州",
339 | "神农架",
340 | "宜昌",
341 | "荆门",
342 | "天门",
343 | "黄石",
344 | "孝感",
345 | "十堰",
346 | "襄阳",
347 | "仙桃",
348 | "咸宁",
349 | "潜江",
350 | "黄冈",
351 | "随州",
352 | "武汉",
353 | "荆州",
354 | "沈阳",
355 | "营口",
356 | "丹东",
357 | "葫芦岛",
358 | "大连",
359 | "鞍山",
360 | "铁岭",
361 | "阜新",
362 | "境外输入",
363 | "本溪",
364 | "锦州",
365 | "抚顺",
366 | "朝阳市",
367 | "盘锦",
368 | "辽阳",
369 | "地区待确认",
370 | "邵阳",
371 | "境外输入",
372 | "长沙",
373 | "湘西自治州",
374 | "湘潭",
375 | "永州",
376 | "郴州",
377 | "岳阳",
378 | "怀化",
379 | "常德",
380 | "衡阳",
381 | "益阳",
382 | "张家界",
383 | "株洲",
384 | "娄底",
385 | "地区待确认",
386 | "廊坊",
387 | "沧州",
388 | "邯郸",
389 | "唐山",
390 | "保定",
391 | "秦皇岛",
392 | "定州",
393 | "雄安新区",
394 | "承德",
395 | "衡水",
396 | "石家庄",
397 | "张家口",
398 | "邢台",
399 | "境外输入",
400 | "辛集市",
401 | "地区待确认",
402 | "许昌",
403 | "郑州",
404 | "周口",
405 | "安阳",
406 | "平顶山",
407 | "信阳",
408 | "濮阳",
409 | "漯河",
410 | "开封",
411 | "洛阳",
412 | "商丘",
413 | "境外输入",
414 | "南阳",
415 | "三门峡",
416 | "济源示范区",
417 | "驻马店",
418 | "新乡",
419 | "鹤壁",
420 | "焦作",
421 | "地区待确认",
422 | "境外输入",
423 | "金昌",
424 | "地区待确认",
425 | "临夏",
426 | "平凉",
427 | "庆阳",
428 | "甘南州",
429 | "定西",
430 | "嘉峪关",
431 | "张掖",
432 | "天水",
433 | "酒泉",
434 | "兰州",
435 | "陇南",
436 | "白银",
437 | "武威",
438 | "境外输入",
439 | "哈尔滨",
440 | "牡丹江",
441 | "大庆",
442 | "鸡西",
443 | "地区待确认",
444 | "齐齐哈尔",
445 | "佳木斯",
446 | "双鸭山",
447 | "伊春",
448 | "绥化",
449 | "大兴安岭",
450 | "鹤岗",
451 | "黑河",
452 | "七台河",
453 | "兵团第十一师",
454 | "兵团第九师",
455 | "喀什",
456 | "地区待确认",
457 | "兵团第十二师",
458 | "第七师",
459 | "第八师石河子",
460 | "兵团第四师",
461 | "伊犁哈萨克自治州",
462 | "六师五家渠",
463 | "克孜州",
464 | "哈密",
465 | "阿克苏",
466 | "昌吉州",
467 | "博尔塔拉州",
468 | "吐鲁番",
469 | "阿勒泰",
470 | "和田",
471 | "巴音郭楞州",
472 | "塔城",
473 | "克拉玛依",
474 | "乌鲁木齐",
475 | "境外输入",
476 | "中卫",
477 | "地区待确认",
478 | "石嘴山",
479 | "固原",
480 | "银川",
481 | "吴忠",
482 | "宁东管委会",
483 | "那曲",
484 | "山南",
485 | "林芝",
486 | "阿里地区",
487 | "拉萨",
488 | "日喀则",
489 | "昌都",
490 | "三亚",
491 | "海口",
492 | "陵水县",
493 | "琼海",
494 | "儋州",
495 | "万宁",
496 | "昌江县",
497 | "定安县",
498 | "临高县",
499 | "保亭",
500 | "澄迈县",
501 | "琼中县",
502 | "三沙",
503 | "境外输入",
504 | "文昌",
505 | "东方",
506 | "乐东",
507 | "地区待确认",
508 | "防城港",
509 | "河池",
510 | "玉林",
511 | "钦州",
512 | "桂林",
513 | "贵港",
514 | "贺州",
515 | "梧州",
516 | "柳州",
517 | "来宾",
518 | "南宁",
519 | "百色",
520 | "北海",
521 | "崇左",
522 | "地区待确认",
523 | "境外输入",
524 | "境外输入",
525 | "阳泉",
526 | "长治",
527 | "晋中",
528 | "忻州",
529 | "吕梁",
530 | "大同",
531 | "临汾",
532 | "晋城",
533 | "太原",
534 | "运城",
535 | "朔州",
536 | "遵义",
537 | "铜仁",
538 | "毕节",
539 | "贵阳",
540 | "黔南州",
541 | "六盘水",
542 | "黔西南州",
543 | "境外输入",
544 | "安顺",
545 | "黔东南州",
546 | "地区待确认",
547 | "西宁",
548 | "海东",
549 | "玉树州",
550 | "海西州",
551 | "果洛州",
552 | "海南州",
553 | "海北州",
554 | "黄南州",
555 | "地区待确认"
556 | ]
557 | }
--------------------------------------------------------------------------------
/yiqing_data_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/5/3 23:23
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import json
7 | import requests
8 |
9 | url = "https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=diseaseh5Shelf"
10 | headers = {
11 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.66 Safari/537.36 Edg/103.0.1264.44"
12 | }
13 |
14 |
15 | def run(city):
16 | res = requests.get(url, headers=headers).json()
17 | all_data = res['data']['diseaseh5Shelf']
18 | last_update_time = all_data["lastUpdateTime"]
19 |     # save the latest province / city name lists to disk
20 |     save_city_list(all_data)
21 |     # read the saved name lists back in
22 |     city_list = json.loads(open("city_list.json", encoding='utf-8').read())
23 |     city = city.strip("省市")  # strip a 省/市 suffix so inputs like "河南省" or "郑州市" still match
24 | if city == "中国":
25 | data_ = all_data["areaTree"][0]
26 | else:
27 | try:
28 | if city in city_list["省"]:
29 |                 # pull all data for the matching province
30 | data_ = [x for x in all_data["areaTree"][0]["children"] if x["name"] == city][0]
31 | elif city in city_list["市"]:
32 |                 # pull all data for the matching city
33 | data_ = [y for x in all_data["areaTree"][0]["children"] for y in x["children"] if y["name"] == city][0]
34 | else:
35 | return f"没有查询到{city}的疫情数据~"
36 | except IndexError:
37 | return "疫情接口出现异常,请稍后重试~"
38 |     confirm = data_["total"]["confirm"]  # cumulative confirmed cases
39 |     heal = data_["total"]["heal"]  # cumulative recoveries
40 |     dead = data_["total"]["dead"]  # cumulative deaths
41 |     now_confirm = data_["total"]["nowConfirm"]  # currently confirmed cases
42 |     add_confirm = data_["today"]["confirm"]  # newly confirmed cases today
43 | return f"{city}疫情更新日期:\n" \
44 | f"{last_update_time}\n" \
45 | f"————————————————————————\n" \
46 | f"该地区疫情数据如下:\n" \
47 | f"新增确诊:{add_confirm}\n" \
48 | f"目前确诊:{now_confirm}\n" \
49 | f"累计确诊:{confirm}\n" \
50 | f"累计治愈:{heal}\n" \
51 | f"累计死亡:{dead}"
52 |
53 |
54 | def save_city_list(all_data):
55 | with open("city_list.json", 'w', encoding='utf-8') as w:
56 |         # collect every province and city name (with and without the 省/市 suffix)
57 | sheng_list = []
58 | shi_list = []
59 | for i in all_data["areaTree"][0]["children"]:
60 | sheng_list.append(i["name"])
61 | sheng_list.append(i["name"]+"省")
62 | for j in i["children"]:
63 | shi_list.append(j["name"])
64 | shi_list.append(j["name"]+"市")
65 | dict_city = {
66 | "省": sheng_list,
67 | "市": shi_list
68 | }
69 | w.write(json.dumps(dict_city, indent=1, ensure_ascii=False))
70 | print("城市列表保存完成!")
71 |
72 |
73 | if __name__ == '__main__':
74 |     city_name = input("请输入要查的城市名:")
75 |     print(run(city_name))
76 |
--------------------------------------------------------------------------------
/youdao_fanyi_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨Youdao online translation API✨
2 | - Youdao online translation: [https://fanyi.youdao.com/](https://fanyi.youdao.com/)
3 |
4 | ```python
5 | After capturing the request with a packet sniffer, inspect the parameters it carries.
6 | Analysing those parameters shows:
7 |     'i': the text to be translated
8 |     'salt': a 14-digit timestamp
9 |     'sign': an MD5 "password + salt" digest of the text to translate, the salt and a fixed secret
10 |     'lts': a 13-digit timestamp
11 | All the remaining parameters are fixed values.
12 | Once the parameters are worked out, send the request to the interface with them attached (see the signature sketch below).
13 | ```
14 |
15 | - Modules used by this spider:
16 |   - requests
17 |   - hashlib
18 |   - time
19 |
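20 | - A minimal sketch of the `salt` / `lts` / `sign` calculation, mirroring the formula in `run_spider.py`; the secret suffix is lifted from the captured frontend JS and may change whenever Youdao updates the site:
21 |
22 | ```python
23 | # Sketch of the salt / lts / sign parameters as computed in run_spider.py.
24 | # The secret string is an assumption taken from the existing code, not an
25 | # officially documented value.
26 | import hashlib
27 | import time
28 |
29 | def make_sign(word: str):
30 |     lts = str(int(time.time() * 1000))    # 13-digit millisecond timestamp
31 |     salt = str(int(time.time() * 10000))  # 14-digit value used as the salt
32 |     raw = "fanyideskweb" + word + salt + "Ygy_4c=r#e#4EX^NUGUc5"
33 |     sign = hashlib.md5(raw.encode()).hexdigest()
34 |     return salt, lts, sign
35 |
36 | print(make_sign("hello"))
37 | ```
38 |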
--------------------------------------------------------------------------------
/youdao_fanyi_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/6/14 21:00
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import requests
7 | import hashlib
8 | import time
9 |
10 |
11 | class YouDao(object):
12 | def __init__(self, word):
13 | self.word = word
14 | self.headers = {
15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
16 | '99.0.4844.51 Safari/537.36 Edg/99.0.1150.39',
17 | 'Cookie': 'OUTFOX_SEARCH_USER_ID=1277855906@10.108.160.101; OUTFOX_SEARCH_USER_ID_NCOO=1759159210.6581216; ___rl__test__cookies=1656644180767; fanyi-ad-id=307488; fanyi-ad-closed=0',
18 | 'Referer': 'https://fanyi.youdao.com/'
19 | }
20 |
21 | def run(self):
22 | url = 'https://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
23 | res = requests.post(url, headers=self.headers, data=self.get_fromdata())
24 | data = res.json()
25 | print(f"{'-'*100}\n", data['translateResult'][0][0]['tgt'])
26 |
27 | def get_fromdata(self):
28 | """
29 | ts: "" + (new Date).getTime(),
30 | salt: ts + parseInt(10 * Math.random(), 10);,
31 | sign: n.md5("fanyideskweb" + e + i + "Ygy_4c=r#e#4EX^NUGUc5")
32 | """
33 |         salt = str(int(time.time()*10000))  # 14-digit value used as the salt
34 |         lts = str(int(time.time() * 1000))  # 13-digit millisecond timestamp
35 |
36 |         # build the MD5 sign from the client id + text + salt + fixed secret
37 | data = "fanyideskweb" + self.word + salt + "Ygy_4c=r#e#4EX^NUGUc5"
38 | md5 = hashlib.md5()
39 | md5.update(data.encode())
40 | sign = md5.hexdigest()
41 |
42 | fromdata = {
43 | "i": self.word,
44 | "from": "AUTO",
45 | "to": "AUTO",
46 | "smartresult": "dict",
47 | "client": "fanyideskweb",
48 | "salt": salt,
49 | "sign": sign,
50 | "lts": lts,
51 | "bv": "8c5b4ecb9f7fdfe6b2997ab984775a98",
52 | "doctype": "json",
53 | "version": "2.1",
54 | "keyfrom": "fanyi.web",
55 | "action": "FY_BY_REALTlME"
56 | }
57 | return fromdata
58 |
59 |
60 | if __name__ == '__main__':
61 | content = input("请输入需要翻译的内容:")
62 | youdao = YouDao(content)
63 | youdao.run()
64 |
--------------------------------------------------------------------------------
/ziroom_message_spider/README.md:
--------------------------------------------------------------------------------
1 | ## ✨Scrape rental listing data from Ziroom✨
2 | - Ziroom official site: [https://www.ziroom.com/z/](https://www.ziroom.com/z/)
3 |
4 | ```python
5 | Outline of the font anti-scraping workaround:
6 | 1. Extract the link to the background image holding the price digits from the Ziroom page source and save the image.
7 | 2. Use PIL's 'Image' to paste the digit image onto a solid black image (the saved image has a transparent background, which pytesseract cannot read on its own).
8 | 3. The merge produces a 'text.png' image, which is then run through 'pytesseract' to recognise and extract the digits.
9 | 4. Build a mapping from the (fixed) coordinate offsets to the extracted digits, then replace each digit's 'position' offset with the corresponding digit (see the mapping sketch below).
10 | ```
11 |
12 | - Modules used by this spider:
13 |   - requests
14 |   - re
15 |   - time
16 |   - lxml
17 |   - pytesseract
18 |   - PIL
19 |
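20 | - A minimal sketch of the offset-to-digit substitution performed by `decrypt_font` in `run_spider.py`; the digit order below is a made-up placeholder standing in for one page's OCR output:
21 |
22 | ```python
23 | # Map each fixed background-position offset to the digit OCR'd at that offset,
24 | # then decode a price from the per-span style attributes.
25 | position_list = ['-0px', '-21.4px', '-42.8px', '-64.2px', '-85.6px',
26 |                  '-107px', '-128.4px', '-149.8px', '-171.2px', '-192.6px']
27 | ocr_digits = list("4083172956")  # hypothetical OCR result for one page load
28 | fonts_dic = dict(zip(position_list, ocr_digits))
29 |
30 | # each <span class="num"> carries a style ending in its background-position
31 | styles = ["background-position: -0px", "background-position: -85.6px",
32 |           "background-position: -0px", "background-position: -0px"]
33 | price = ''.join(fonts_dic[s.split(" ")[-1]] for s in styles) + "元/月"
34 | print(price)  # "4144元/月" with the placeholder mapping above
35 | ```
36 |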
--------------------------------------------------------------------------------
/ziroom_message_spider/ocr_img/bg_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/ziroom_message_spider/ocr_img/bg_image.png
--------------------------------------------------------------------------------
/ziroom_message_spider/ocr_img/black_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/ziroom_message_spider/ocr_img/black_img.png
--------------------------------------------------------------------------------
/ziroom_message_spider/ocr_img/text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/ziroom_message_spider/ocr_img/text.png
--------------------------------------------------------------------------------
/ziroom_message_spider/run_spider.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | # @Time : 2022/6/30 14:55
3 | # @Author : Torres-圣君
4 | # @File : run_spider.py
5 | # @Software : PyCharm
6 | import requests
7 | import re
8 | import time
9 | from lxml import etree
10 | import pytesseract
11 | from PIL import Image
12 |
13 |
14 | class DetailedData:
15 | def __init__(self, page_num):
16 | self.urls = [f'https://www.ziroom.com/z/p{num + 1}/' for num in range(page_num)]
17 | self.headers = {
18 | "Cookie": "CURRENT_CITY_CODE=110000; CURRENT_CITY_NAME=%E5%8C%97%E4%BA%AC; _csrf=yjfN8G-kzNnGvj1iEvjH6O1x3TNy89d0; __jsluid_s=4174712fab682cd6df16575532ddfe6b; sajssdk_2015_cross_new_user=1; gr_user_id=383b4bb6-6a6b-4901-a057-9c620a3e2e26; __jsluid_h=1df213e6b4954e733185c9409be2a2e7; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22181b36097ea1b3-0f16d08755ce17-4f617f5b-1327104-181b36097eb316%22%2C%22%24device_id%22%3A%22181b36097ea1b3-0f16d08755ce17-4f617f5b-1327104-181b36097eb316%22%2C%22props%22%3A%7B%22%24latest_referrer%22%3A%22%22%2C%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D",
19 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37"
20 | }
21 |         # fixed background-position offsets, one per digit glyph
22 | self.position_list = ['-0px', '-21.4px', '-42.8px', '-64.2px', '-85.6px', '-107px', '-128.4px', '-149.8px', '-171.2px', '-192.6px']
23 |
24 | def run(self):
25 | for url in self.urls:
26 | self.page_data(url)
27 | time.sleep(5)
28 |
29 | def page_data(self, url):
30 | res = requests.get(url, headers=self.headers).text
31 |         # download and save the digit background image
32 |         self.download_img(res)
33 |         # OCR the image to map offsets to digits
34 | fonts_dic = self.ocr_fonts()
35 | print(fonts_dic)
36 | html = etree.HTML(res)
37 | div_list = html.xpath('//div[@class="Z_list-box"]/div')
38 | for div in div_list:
39 | room_link = "https:" + div.xpath('./div[2]/h5/a/@href')[0]
40 | title = div.xpath('./div[2]/h5/a/text()')[0]
41 | area = div.xpath('./div[2]/div[1]/div[1]/text()')[0]
42 | address = div.xpath('./div[2]/div[1]/div[2]/text()')[0].strip()
43 | bg_link = div.xpath('.//span[@class="num"]/@style')
44 | price = self.decrypt_font(bg_link, fonts_dic)
45 | item = [room_link, title, area, address, price]
46 | print(item)
47 | self.save_data(item)
48 |
49 | def download_img(self, res):
50 |         # extract the digit-image URL from the page source
51 | img = re.findall(r'//static8.ziroom.com/phoenix/pc/images/price/new-list/(.*?)\);', res)[0]
52 | img_url = "https://static8.ziroom.com/phoenix/pc/images/price/new-list/" + img
53 |         # download the image and write it to disk in binary mode
54 | img_data = requests.get(img_url, headers=self.headers).content
55 | with open('ocr_img/bg_image.png', 'wb') as w:
56 | w.write(img_data)
57 |
58 | def ocr_fonts(self):
59 |         # background canvas (black_img.png) to paste the transparent digits onto
60 |         white_img = Image.open('ocr_img/black_img.png')
61 |         # background image containing the price digits
62 |         bg_img = Image.open('ocr_img/bg_image.png')
63 |         # resize both images
64 |         img1 = white_img.resize((600, 100))
65 |         img2 = bg_img.resize((560, 60))
66 |         # paste bg_img onto the canvas at an offset (the two must not overlap exactly)
67 |         img1.paste(img2, (30, 20))
68 |         # save the merged image into ocr_img/ so the reopen below reads the fresh file
69 |         img1.save("ocr_img/text.png")
70 |         # reopen the merged image
71 |         image = Image.open('ocr_img/text.png')
72 |         # convert to grayscale to make the digits easier for pytesseract to read
73 |         Img = image.convert('L')
74 |         # OCR the image and extract its text
75 |         text = pytesseract.image_to_string(Img)
76 |         # keep only the digit characters from the OCR output
77 |         nums = [num for num in text if num.isdigit()]
78 | fonts_dic = {}
79 |         # map each fixed offset to the digit OCR'd at that position
80 | for k, v in zip(self.position_list, nums):
81 | fonts_dic[k] = v
82 | return fonts_dic
83 |
84 | def decrypt_font(self, bg_link, fonts_dic):
85 | price_list = []
86 |         # look up each digit of the price by its background-position offset
87 | for bg in bg_link:
88 | position = bg.split(" ")[-1]
89 | num = fonts_dic[position]
90 | price_list.append(num)
91 |         # join the digits into the full price
92 | price = ''.join(price_list) + "元/月"
93 | return price
94 |
95 | def save_data(self, item):
96 | with open('自如网租房房源信息.csv', 'a+') as w:
97 | w.seek(0)
98 | flag = w.read() == ""
99 | if flag:
100 | w.write("链接,标题,面积,地址,房价\n")
101 | w.write(','.join(item) + "\n")
102 |
103 |
104 | if __name__ == '__main__':
105 | page_num = int(input("请输入需要获取的页码:"))
106 | dd = DetailedData(page_num)
107 | dd.run()
108 |
--------------------------------------------------------------------------------
/ziroom_message_spider/自如网租房房源信息.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/ziroom_message_spider/自如网租房房源信息.csv
--------------------------------------------------------------------------------