├── Python生成验证码文字变图片.py
├── Python的Web和数据分析学习图谱
├── Django.png
├── flask.png
└── 数据分析算法合集.png
├── README.md
├── lagouSpider.py
├── scrapy爬虫简单项目
├── .idea
│ ├── misc.xml
│ ├── modules.xml
│ ├── python实现有道词典.iml
│ └── workspace.xml
├── 2.py
├── Qqnews
│ ├── Qqnews
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── items.cpython-36.pyc
│ │ │ ├── pipelines.cpython-36.pyc
│ │ │ └── settings.cpython-36.pyc
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ │ ├── Qqnews_spider.py
│ │ │ ├── __init__.py
│ │ │ └── __pycache__
│ │ │ ├── Qqnews_spider.cpython-36.pyc
│ │ │ └── __init__.cpython-36.pyc
│ └── scrapy.cfg
├── Yustneirong
│ ├── Yustneirong
│ │ ├── __init__.py
│ │ ├── __init__.pyc
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ ├── settings.pyc
│ │ └── spiders
│ │ │ ├── __init__.py
│ │ │ └── __init__.pyc
│ └── scrapy.cfg
├── dbtop250
│ ├── dbtop250
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── items.cpython-36.pyc
│ │ │ ├── pipelines.cpython-36.pyc
│ │ │ └── settings.cpython-36.pyc
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ └── dbtop250_spider.cpython-36.pyc
│ │ │ └── dbtop250_spider.py
│ └── scrapy.cfg
├── douban
│ ├── douban
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── items.cpython-36.pyc
│ │ │ └── settings.cpython-36.pyc
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ └── douban_spider.cpython-36.pyc
│ │ │ └── douban_spider.py
│ └── scrapy.cfg
├── ip地址查询工具.py
├── lagou-scrapy
│ ├── .idea
│ │ ├── dictionaries
│ │ │ └── .xml
│ │ ├── lagou.iml
│ │ ├── misc.xml
│ │ ├── modules.xml
│ │ └── workspace.xml
│ ├── dump.rdb
│ ├── geckodriver.log
│ ├── lagou
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── items.cpython-36.pyc
│ │ │ ├── pipelines.cpython-36.pyc
│ │ │ └── settings.cpython-36.pyc
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── middlewares
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ │ ├── __init__.cpython-36.pyc
│ │ │ │ └── useragent.cpython-36.pyc
│ │ │ └── useragent.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ └── lagouspider.cpython-36.pyc
│ │ │ └── lagouspider.py
│ └── scrapy.cfg
├── python爬虫实现有道词典.py
├── taobaoclass
│ ├── scrapy.cfg
│ └── taobaoclass
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ └── __init__.cpython-36.pyc
│ │ └── taobao_spider.py
├── tutorial
│ ├── scrapy.cfg
│ └── tutorial
│ │ ├── __init__.py
│ │ ├── __init__.pyc
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ ├── settings.pyc
│ │ └── spiders
│ │ ├── __init__.py
│ │ ├── __init__.pyc
│ │ ├── dmoz_spider.py
│ │ ├── dmoz_spider.pyc
│ │ ├── mydomain.py
│ │ └── mydomain.pyc
└── zaobao
│ ├── scrapy.cfg
│ └── zaobao
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ ├── items.cpython-36.pyc
│ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ └── zaobao_spider.cpython-36.pyc
│ └── zaobao_spider.py
├── zhihu.com
├── scrapy.cfg
└── zhihuuser
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ ├── items.cpython-36.pyc
│ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ └── zhihu.cpython-36.pyc
│ └── zhihu.py
├── zhihu
├── scrapy.cfg
└── zhihuuser
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ ├── items.cpython-36.pyc
│ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ └── zhihu.cpython-36.pyc
│ └── zhihu.py
├── 可视化文件显示程序.zip
├── 基于python的turtle画出叮当猫.py
├── 基于python的turtle的桌面弹球.py
├── 基于python的turtle移动的小球.py
├── 抓取财富网股票信息.py
├── 爬取12306车票信息.py
└── 爬取qq音乐歌曲
├── audio2.txt
└── 爬取扣扣音乐文件.py
/Python生成验证码文字变图片.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pygame
3 | from pygame.locals import *
4 |
5 |
6 |
7 | pygame.init()
8 | text = u"1234"
9 |
10 | font = pygame.font.SysFont("Microsoft YaHei",64)
11 | ftext = font.render(text,True,(65,83,130),(255,255,255))
12 | pygame.image.save(ftext,"D:/pythontab.jpg")
--------------------------------------------------------------------------------
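
Note: the script above always renders the fixed string "1234". A captcha generator would normally pick random characters on every call; a minimal sketch of that variation, assuming pygame and the "Microsoft YaHei" font are available (the output name captcha.png is just an example):

import random
import string

import pygame

pygame.init()
# pick four random upper-case letters / digits for the code
code = ''.join(random.choices(string.ascii_uppercase + string.digits, k=4))
font = pygame.font.SysFont("Microsoft YaHei", 64)
surface = font.render(code, True, (65, 83, 130), (255, 255, 255))
pygame.image.save(surface, "captcha.png")
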
/Python的Web和数据分析学习图谱/Django.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/Python的Web和数据分析学习图谱/Django.png
--------------------------------------------------------------------------------
/Python的Web和数据分析学习图谱/flask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/Python的Web和数据分析学习图谱/flask.png
--------------------------------------------------------------------------------
/Python的Web和数据分析学习图谱/数据分析算法合集.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/Python的Web和数据分析学习图谱/数据分析算法合集.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # python
2 | Python crawlers, web development and data analysis
3 |
--------------------------------------------------------------------------------
/lagouSpider.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import random
4 | import urllib
5 | import requests
6 | from bs4 import BeautifulSoup
7 | import time
8 | import csv
9 | import codecs
10 | from selenium import webdriver
11 |
12 |
13 | headers = {
14 | 'Accept':'application/json, text/javascript, */*; q=0.01',
15 | 'Accept-Encoding':'gzip, deflate, br',
16 | 'Accept-Language':'zh-CN,zh;q=0.8',
17 | 'Connection':'keep-alive',
18 | 'Content-Length':'25',
19 |     'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
20 |     'Cookie':'fill in your cookie value here',
21 |     'Host':'www.lagou.com',
22 |     'Origin':'https://www.lagou.com',
23 |     'Referer':'https://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC',
24 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
25 | 'X-Anit-Forge-Code':'0',
26 | 'X-Anit-Forge-Token':'None',
27 | 'X-Requested-With':'XMLHttpRequest'
28 | }
29 |
30 | # POST to the URL and return the raw response text (a blob of JSON)
31 | def post(url,para,headers=None,proxy=None,timeOut=5,timeOutRetry=5):
32 | if not url or not para:
33 | print("PostError url or para not exit")
34 | print("11111111111111")
35 | return None
36 | try:
37 |         if not headers:          # fall back to an empty header mapping
38 |             headers = {}
39 | response = requests.post(url,data=para,headers=headers)
40 | print(response.status_code)
41 |
42 | print(response.text)
43 | if response.status_code == 200 or response.status_code == 302:
44 | htmlCode = response.text
45 | # print('1111111111')
46 | else:
47 | print("2222222222222")
48 | htmlCode = None
49 |     except Exception as e:
50 |         if timeOutRetry > 0:
51 |             htmlCode = post(url=url,para=para,headers=headers,timeOutRetry=(timeOutRetry-1))
52 |         else:
53 |             htmlCode = None   # retries exhausted, give up on this request
54 | return htmlCode
55 |
56 | # url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0'
57 | # url = 'https://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC'
58 | # Parse the JSON response: pull out the company list and work out how many result pages there are
59 | def getinfo(url,para):
60 |
61 |     htmlCode = post(url,para=para,headers=headers) # raw response text: one big JSON payload
62 |     if htmlCode is None:
63 |         return 0, None   # keep the (pages, companies) shape so callers can unpack safely
64 | companies = json.loads(htmlCode).get('content').get('positionResult').get('result')
65 | totalCount = json.loads(htmlCode).get('content').get('positionResult').get('totalCount')
66 | pagesize = json.loads(htmlCode).get('content').get('pageSize')
67 | pages = 0
68 | if int(totalCount)%int(pagesize) == 0:
69 | pages = int(int(totalCount)/int(pagesize))
70 | else:
71 | pages = int(int(totalCount) // int(pagesize)) + 1
72 |
73 | return pages,companies
74 |
75 | # Append the rows to a CSV file; each job keyword gets its own file
76 | def writeCsv(filename,companies):
77 | info = {}
78 | csv_file = codecs.open(filename+'.csv', 'ab', 'utf-8', 'ignore')
79 | csv_writer = csv.writer(csv_file)
80 | for i in companies:
81 |         info['公司名字'] = i['companyFullName']      # company name
82 |         # print(info['公司名字'])
83 |         info['公司城市'] = i['city']                 # city of the position
84 |         info['招聘职位'] = i['positionName']         # job title
85 |         info['发布时间'] = i['formatCreateTime']     # publish time
86 |         info['薪资待遇'] = i['salary']               # salary
87 |         info['经验要求'] = i['workYear']             # required experience
88 |         info['公司大小'] = i['companySize']          # company size
89 |         info['公司福利'] = i['positionAdvantage']    # benefits
90 |         info['公司地址'] = i['district']             # district / address
91 | # print(info)
92 | csv_writer.writerow([i['companyFullName'],i['city'],i['positionName'],i['formatCreateTime'],i['salary'],
93 | i['workYear'],i['companySize'],i['positionAdvantage'],i['district']])
94 |     csv_file.close()
95 |
96 |
97 | # Scrape the full list of job categories from the Lagou home page
98 | def occupation():
99 | url = "https://www.lagou.com/"
100 | response = requests.get(url)
101 | soup = BeautifulSoup(response.text, 'html.parser')
102 |     ds = soup.find_all("div", attrs={"class": "menu_sub dn"})
103 |
104 | occupation_list = []
105 | for h in ds:
106 | for g in h.find_all('dd'):
107 | for l in g:
108 | if l.string != "\n":
109 | occupation_list.append(l.string)
110 |
111 | # print(occupation_list)
112 | # print(len(occupation_list))
113 | return occupation_list
114 |
115 | # Crawl every job category for each of the popular cities
116 | if __name__ == '__main__':
117 | occu_list = occupation()
118 | city_list = ['北京','上海','深圳','广州','杭州','成都','南京','武汉','西安','厦门','长沙','苏州','天津']
119 | for l in occu_list[:]:
120 | print(l)
121 | for j in city_list:
122 | url = 'https://www.lagou.com/jobs/positionAjax.json?'
123 | para = {'px': 'default','city':j,'needAddtionalResult': 'false', 'isSchoolJob': 0, 'first': 'true', 'pn': '1',
124 | 'kd':l}
125 | pages,companies = getinfo(url,para)
126 | for i in range(pages):
127 | para['pn'] = str(i+1)
128 | time.sleep(random.random()*5)
129 | print('开始爬取第%s页'%str(i+1))
130 | try:
131 | pages,companies = getinfo(url,para)
132 | except:
133 | continue
134 | # fina = writeCsv(companies)
135 | if companies == None:
136 | break
137 | writeCsv(l,companies)
138 | # csv_writer.writerow(fina)
139 |
--------------------------------------------------------------------------------
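
Note: the page-count arithmetic in getinfo() (lines 67-71 above) is simply a ceiling division of totalCount by pageSize; a small self-contained equivalent for reference:

import math

def page_count(total_count, page_size):
    # e.g. 450 results at 15 per page -> 30 pages, 451 -> 31 pages
    return math.ceil(total_count / page_size)

assert page_count(450, 15) == 30
assert page_count(451, 15) == 31
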
/scrapy爬虫简单项目/.idea/misc.xml:
--------------------------------------------------------------------------------
(IDE-generated project metadata; the XML content was not captured in this dump)
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/.idea/modules.xml:
--------------------------------------------------------------------------------
(IDE-generated project metadata; the XML content was not captured in this dump)
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/.idea/python实现有道词典.iml:
--------------------------------------------------------------------------------
(IDE-generated module file; the XML content was not captured in this dump)
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/.idea/workspace.xml:
--------------------------------------------------------------------------------
(IDE workspace file; the XML content was not captured in this dump)
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/2.py:
--------------------------------------------------------------------------------
1 | str1 = "/**/jQuery110205057557444126394_1484574357057("
2 | print(len(str1))
3 |
4 | "https://ssl.captcha.qq.com/cap_union_new_getcapbysig?aid=522005705&asig=&captype=&protocol=https&clientype=2&disturblevel=&apptype=2&curenv=inner&ua=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV09XNjQpIEFwcGxlV2ViS2l0LzUzNy4zNiAoS0hUTUwsIGxpa2UgR2Vja28pIENocm9tZS82MS4wLjMxNjMuMTAwIFNhZmFyaS81MzcuMzY=&sess=1iW5KCYL2DiqaiJy8K76Am6iwXvqJkGKpwLsItwLEpjoWwy0G0R3y_t1YKNzrr-Ts5j2Knkgh2qfBoWdWYmHDY_tiQXBpB2vT7ttfysXWlz-JltnuOA33JN14umsk_q0oYq3ITlJNR02RDPd_JRNP0iQeNZe8JMMv3x8BD_Sqi-38jNGuIVSD-EZkLDrjztCENIt15GWQCs*&theme=&noBorder=noborder&fb=1&showtype=embed&uid=123456&cap_cd=Kz3KLjvqeqsYRc0aLobTgXc2UjrnVE-vhPOEpygni5x_9E6HTuxT9Q**&lang=2052&rnd=150167&rand=0.482505701756349&vsig=gvzE39T_XEWYaq6gx4EBY250WYUlIgIL_2ypn6K_iE-O-d3Iwb_2XEr5XegIpAikh4qANjt3pf3yvnESAd95nV1qemP9M1hu9800zE1wEvXls0T5ulqE4Die4uYOfQM_J&ischartype=1"
--------------------------------------------------------------------------------
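
Note: 2.py only measures the length of a JSONP callback prefix, presumably so the wrapper can be stripped from the captcha response before the JSON is parsed. A minimal sketch of that idea (the payload below is made up for illustration):

import json

prefix = "/**/jQuery110205057557444126394_1484574357057("
raw = prefix + '{"ret": 0, "msg": "ok"}' + ")"   # stand-in for a real JSONP response body

# drop the callback prefix and the trailing ")" to recover plain JSON
payload = json.loads(raw[len(prefix):-1])
print(payload["ret"])
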
/scrapy爬虫简单项目/Qqnews/Qqnews/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__init__.py
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Qqnews/Qqnews/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class QqnewsItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | title = scrapy.Field()
15 | date = scrapy.Field()
16 | author = scrapy.Field()
17 | content = scrapy.Field()
18 |
19 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Qqnews/Qqnews/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class QqnewsSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 |     def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 |     def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 |     def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 |     def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Qqnews/Qqnews/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 | from scrapy.conf import settings
9 | class QqnewsPipeline(object):
10 |
11 | def __init__(self):
12 |         MONGODB_HOST = settings['MONGODB_HOST']
13 |         MONGODB_PORT = settings['MONGODB_PORT']
14 |         dbName = settings['MONGODB_DBNAME']
15 |         MONGODB_CNAME = settings['MONGODB_CNAME']
16 |         client = pymongo.MongoClient(host=MONGODB_HOST,port=MONGODB_PORT)
17 | tdb = client[dbName]
18 | self.post = tdb[MONGODB_CNAME]
19 | def process_item(self, item, spider):
20 | news = dict(item)
21 | self.post.insert(news)
22 | return item
23 |
--------------------------------------------------------------------------------
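
Note: scrapy.conf and pymongo's Collection.insert() are legacy APIs. On newer Scrapy/PyMongo versions the usual pattern is to read settings in from_crawler() and write documents with insert_one(); a hedged sketch of that variant, reusing the same MONGODB_* setting names (not the project's actual code):

import pymongo


class QqnewsMongoPipeline(object):
    def __init__(self, host, port, dbname, cname):
        self.host, self.port, self.dbname, self.cname = host, port, dbname, cname

    @classmethod
    def from_crawler(cls, crawler):
        # read the same MONGODB_* keys defined in settings.py
        s = crawler.settings
        return cls(s.get('MONGODB_HOST'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DBNAME'), s.get('MONGODB_CNAME'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.host, self.port)
        self.collection = self.client[self.dbname][self.cname]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item
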
/scrapy爬虫简单项目/Qqnews/Qqnews/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for Qqnews project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'Qqnews'
13 |
14 | SPIDER_MODULES = ['Qqnews.spiders']
15 | NEWSPIDER_MODULE = 'Qqnews.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
20 |
21 |
22 | MONGODB_HOST = '127.0.0.1'
23 | MONGODB_PORT = 27017
24 | MONGODB_DBNAME = 'QQnews'
25 | MONGODB_CNAME = 'military'
26 |
27 | # Obey robots.txt rules
28 | ROBOTSTXT_OBEY = True
29 |
30 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
31 | #CONCURRENT_REQUESTS = 32
32 |
33 | # Configure a delay for requests for the same website (default: 0)
34 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
35 | # See also autothrottle settings and docs
36 | #DOWNLOAD_DELAY = 3
37 | # The download delay setting will honor only one of:
38 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
39 | #CONCURRENT_REQUESTS_PER_IP = 16
40 |
41 | # Disable cookies (enabled by default)
42 | COOKIES_ENABLED = True
43 |
44 | # Disable Telnet Console (enabled by default)
45 | #TELNETCONSOLE_ENABLED = False
46 |
47 | # Override the default request headers:
48 | DEFAULT_REQUEST_HEADERS = {
49 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
50 | 'Accept-Language': 'en',
51 |   'Cookie':'RK=7SNngcUONh; pgv_pvi=5307348992; pgv_si=s9438232576; ptui_loginuin=123456; logout_page=; dm_login_weixin_rem=; qm_authimgs_id=0; qm_verifyimagesession=h0136aa52fede833d6b76fb7cfff4ac53f406c36fd8b73221c205f4c89e3aed6341ff952f57983a3247; ptisp=cnc; ptcz=369283ac413194e379e190071988967a9e0f56822f9e55c5fb3f7d01cf7f0fd9; pt2gguin=o0252943669; uin=o0252943669; skey=@zCZ8lcmdT; qqmusic_uin=; qqmusic_key=; qqmusic_fromtag=; pgv_info=ssid=s4702440319; pgv_pvid=4169365884; o_cookie=252943669; pac_uid=1_252943669'
52 | }
53 |
54 | # Enable or disable spider middlewares
55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
56 | #SPIDER_MIDDLEWARES = {
57 | # 'Qqnews.middlewares.QqnewsSpiderMiddleware': 543,
58 | #}
59 |
60 | # Enable or disable downloader middlewares
61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
62 | #DOWNLOADER_MIDDLEWARES = {
63 | # 'Qqnews.middlewares.MyCustomDownloaderMiddleware': 543,
64 | #}
65 |
66 | # Enable or disable extensions
67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
68 | #EXTENSIONS = {
69 | # 'scrapy.extensions.telnet.TelnetConsole': None,
70 | #}
71 |
72 | # Configure item pipelines
73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
74 | ITEM_PIPELINES = {
75 | 'Qqnews.pipelines.QqnewsPipeline': 300,
76 | }
77 |
78 | # Enable and configure the AutoThrottle extension (disabled by default)
79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
80 | #AUTOTHROTTLE_ENABLED = True
81 | # The initial download delay
82 | #AUTOTHROTTLE_START_DELAY = 5
83 | # The maximum download delay to be set in case of high latencies
84 | #AUTOTHROTTLE_MAX_DELAY = 60
85 | # The average number of requests Scrapy should be sending in parallel to
86 | # each remote server
87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
88 | # Enable showing throttling stats for every response received:
89 | #AUTOTHROTTLE_DEBUG = False
90 |
91 | # Enable and configure HTTP caching (disabled by default)
92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
93 | #HTTPCACHE_ENABLED = True
94 | #HTTPCACHE_EXPIRATION_SECS = 0
95 | #HTTPCACHE_DIR = 'httpcache'
96 | #HTTPCACHE_IGNORE_HTTP_CODES = []
97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
98 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Qqnews/Qqnews/spiders/Qqnews_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 | from Qqnews.items import QqnewsItem
5 |
6 |
7 | class QqnewsSpiderSpider(scrapy.Spider):
8 | name = "Qqnews_spider"
9 | allowed_domains = ["qq.com"]
10 | start_urls = ['http://mil.qq.com/mil_index.htm']
11 |
12 | def parse(self, response):
13 | for eveUrl in response.xpath('//a[@class="linkto"]/@href'):
14 | yield scrapy.Request(eveUrl.extract(),callback=self.parse_content)
15 |
16 |
17 | def parse_content(self,response):
18 |         item = QqnewsItem()
19 |         item['title'] = response.xpath('//div[@class="hd"]/h1/text()').extract()
20 |         date1 = response.xpath('//span[@class="a_time"]/text()').extract()
21 |         date2 = response.xpath('//div[@class="md"]/text()').extract()
22 |         date3 = response.xpath('//div[@class="time"]/text()').extract()
23 |         item['date'] = str(date1)+str(date2)+str(date3)
24 |         item['author'] = response.xpath('//div[@class="content-article"]/p[1]/text()').extract()
25 |         item['content'] = response.xpath('//div[@class="content-article"]/text()').extract()
26 |         print(item['title'],item['date'],item['author'],item['content'])
27 |         yield item
28 |
--------------------------------------------------------------------------------
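
Note: the hrefs on the list page are scheduled as-is; if any of them turn out to be relative, response.urljoin() will normalise them before the request is made. A hedged sketch of that variation (a hypothetical spider reusing the same XPath and item):

import scrapy

from Qqnews.items import QqnewsItem


class QqnewsLinkSpider(scrapy.Spider):
    # hypothetical variant of Qqnews_spider that normalises relative links
    name = "Qqnews_links"
    allowed_domains = ["qq.com"]
    start_urls = ['http://mil.qq.com/mil_index.htm']

    def parse(self, response):
        for href in response.xpath('//a[@class="linkto"]/@href').extract():
            # urljoin() copes with both absolute and relative hrefs
            yield scrapy.Request(response.urljoin(href), callback=self.parse_content)

    def parse_content(self, response):
        item = QqnewsItem()
        item['title'] = response.xpath('//div[@class="hd"]/h1/text()').extract()
        yield item
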
/scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__pycache__/Qqnews_spider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__pycache__/Qqnews_spider.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Qqnews/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = Qqnews.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = Qqnews
12 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Yustneirong/Yustneirong/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Yustneirong/Yustneirong/__init__.py
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Yustneirong/Yustneirong/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Yustneirong/Yustneirong/__init__.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Yustneirong/Yustneirong/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class YustneirongItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Yustneirong/Yustneirong/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class YustneirongSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Yustneirong/Yustneirong/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class YustneirongPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Yustneirong/Yustneirong/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for Yustneirong project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'Yustneirong'
13 |
14 | SPIDER_MODULES = ['Yustneirong.spiders']
15 | NEWSPIDER_MODULE = 'Yustneirong.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'Yustneirong (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'Yustneirong.middlewares.YustneirongSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'Yustneirong.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | # 'Yustneirong.pipelines.YustneirongPipeline': 300,
69 | #}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Yustneirong/Yustneirong/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Yustneirong/Yustneirong/settings.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Yustneirong/Yustneirong/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Yustneirong/Yustneirong/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Yustneirong/Yustneirong/spiders/__init__.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/Yustneirong/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = Yustneirong.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = Yustneirong
12 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__init__.py
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class Dbtop250Item(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | name = scrapy.Field()
15 | zuto = scrapy.Field()
16 | desc = scrapy.Field()
17 | ping = scrapy.Field()
18 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class Dbtop250SpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 |     def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 |     def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 |     def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 |     def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 |
9 | from scrapy.conf import settings
10 |
11 |
12 | class Dbtop250Pipeline(object):
13 |
14 | def __init__(self):
15 | host = settings['MONGODB_HOST']
16 | port = settings['MONGODB_PORT']
17 | dbName = settings['MONGODB_DBNAME']
18 | client = pymongo.MongoClient(host=host,port=port)
19 | tdb = client[dbName]
20 | self.post = tdb[settings['MONGODB_DOCNAME']]
21 | def process_item(self, item, spider):
22 | movie = dict(item)
23 | self.post.insert(movie)
24 | return item
25 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for dbtop250 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'dbtop250'
13 |
14 | SPIDER_MODULES = ['dbtop250.spiders']
15 | NEWSPIDER_MODULE = 'dbtop250.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
20 |
21 | MONGODB_HOST = 'localhost'
22 | MONGODB_PORT = 27017
23 | MONGODB_DBNAME = 'dbtop250'
24 | MONGODB_DOCNAME = 'top250'
25 |
26 | # Obey robots.txt rules
27 | ROBOTSTXT_OBEY = True
28 |
29 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
30 | #CONCURRENT_REQUESTS = 32
31 |
32 | # Configure a delay for requests for the same website (default: 0)
33 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
34 | # See also autothrottle settings and docs
35 | #DOWNLOAD_DELAY = 3
36 | # The download delay setting will honor only one of:
37 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
38 | #CONCURRENT_REQUESTS_PER_IP = 16
39 |
40 | # Disable cookies (enabled by default)
41 | COOKIES_ENABLED = True
42 |
43 | # Disable Telnet Console (enabled by default)
44 | #TELNETCONSOLE_ENABLED = False
45 |
46 | # Override the default request headers:
47 | DEFAULT_REQUEST_HEADERS = {
48 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
49 | 'Accept-Language': 'en',
50 | 'Cookie':'RK=7SNngcUONh; pgv_pvi=5307348992; pgv_si=s9438232576; ptui_loginuin=123456; logout_page=; dm_login_weixin_rem=; qm_authimgs_id=0; qm_verifyimagesession=h0136aa52fede833d6b76fb7cfff4ac53f406c36fd8b73221c205f4c89e3aed6341ff952f57983a3247; ptisp=cnc; ptcz=369283ac413194e379e190071988967a9e0f56822f9e55c5fb3f7d01cf7f0fd9; pt2gguin=o0252943669; qqmusic_uin=; qqmusic_key=; qqmusic_fromtag=; pac_uid=1_252943669; dsp_cookiemapping0=1508662302062; dsp_cookiemapping2=1508662302064; ad_play_index=66; thyls_ad=440; dsp_cookiemapping1=1508662308938; pgv_info=ssid=s4702440319; ts_last=mil.qq.com/mil_index.htm; pgv_pvid=4169365884; o_cookie=252943669; ts_uid=2412234112'
51 | }
52 |
53 | # Enable or disable spider middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
55 | #SPIDER_MIDDLEWARES = {
56 | # 'dbtop250.middlewares.Dbtop250SpiderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable downloader middlewares
60 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
61 | #DOWNLOADER_MIDDLEWARES = {
62 | # 'dbtop250.middlewares.MyCustomDownloaderMiddleware': 543,
63 | #}
64 |
65 | # Enable or disable extensions
66 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
67 | #EXTENSIONS = {
68 | # 'scrapy.extensions.telnet.TelnetConsole': None,
69 | #}
70 |
71 | # Configure item pipelines
72 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
73 | ITEM_PIPELINES = {
74 | 'dbtop250.pipelines.Dbtop250Pipeline': 300,
75 | }
76 |
77 | # Enable and configure the AutoThrottle extension (disabled by default)
78 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
79 | #AUTOTHROTTLE_ENABLED = True
80 | # The initial download delay
81 | #AUTOTHROTTLE_START_DELAY = 5
82 | # The maximum download delay to be set in case of high latencies
83 | #AUTOTHROTTLE_MAX_DELAY = 60
84 | # The average number of requests Scrapy should be sending in parallel to
85 | # each remote server
86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
87 | # Enable showing throttling stats for every response received:
88 | #AUTOTHROTTLE_DEBUG = False
89 |
90 | # Enable and configure HTTP caching (disabled by default)
91 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
92 | #HTTPCACHE_ENABLED = True
93 | #HTTPCACHE_EXPIRATION_SECS = 0
94 | #HTTPCACHE_DIR = 'httpcache'
95 | #HTTPCACHE_IGNORE_HTTP_CODES = []
96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
97 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__pycache__/dbtop250_spider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__pycache__/dbtop250_spider.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/dbtop250/spiders/dbtop250_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 |
5 | class Dbtop250SpiderSpider(scrapy.Spider):
6 | name = "dbtop250_spider"
7 | allowed_domains = ["douban.com"]
8 | start_urls = ['https://movie.douban.com/top250?start=0&filter=']
9 |     count = 0   # number of Top 250 list pages requested so far
10 |     def parse(self, response):
11 |         self.count += 1
12 |         for eve in response.xpath('//div[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/@href'):
13 |             full_url = eve.extract()
14 |             yield scrapy.Request(full_url,callback=self.parse_movie)
15 |
16 |         if self.count * 25 < 250:
17 |             full_url = 'https://movie.douban.com/top250?start={}&filter='.format(str(self.count*25))
18 |             yield scrapy.Request(full_url,callback=self.parse)
19 | def parse_movie(self,response):
20 | from dbtop250.items import Dbtop250Item
21 | item = Dbtop250Item()
22 | item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
23 | # item['auto'] = response.xpath('//*[@id="info"]/span/span[2]/a/text()').extract()
24 | item['ping'] = response.xpath('/html/body/div[3]/div[1]/div[3]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/strong/text()').extract()
25 | yield item
26 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/dbtop250/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = dbtop250.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dbtop250
12 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/douban/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/__init__.py
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/douban/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/douban/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/douban/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/douban/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DoubanItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | name = scrapy.Field()
15 | # auto = scrapy.Field()
16 | desc = scrapy.Field()
17 | ping = scrapy.Field()
18 |
19 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/douban/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class DoubanSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 |     def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 |     def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 |     def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 |     def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/douban/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class DoubanPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/douban/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for douban project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'douban'
13 |
14 | SPIDER_MODULES = ['douban.spiders']
15 | NEWSPIDER_MODULE = 'douban.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'douban (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | DEFAULT_REQUEST_HEADERS = {
43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | 'Accept-Language': 'en',
45 | 'Cookie':'bid=iJIjKbBsQZ4; gr_user_id=b3a58668-aa55-4aa3-a212-1d9ed21843e8; viewed="27116300_25862578"; ps=y; ll="108288"; push_noty_num=0; push_doumail_num=0; ap=1; _ga=GA1.2.462587836.1508291602; __yadk_uid=byGuKstnDBAymxz38q9BxYZnm6ibZZbe; _vwo_uuid_v2=F727776224927130F161043B6E8DCD6F|0d3a3a996a0fdc93c651e031901946bb; __utma=30149280.462587836.1508291602.1508501064.1508655486.6; __utmb=30149280.0.10.1508655486; __utmc=30149280; __utmz=30149280.1508501064.5.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.16325; __utma=223695111.462587836.1508291602.1508655486.1508655486.1; __utmb=223695111.0.10.1508655486; __utmc=223695111; __utmz=223695111.1508655486.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _pk_id.100001.4cf6=288cdd8ec5b2cdaf.1508655486.1.1508656155.1508655486.; _pk_ses.100001.4cf6=*',
46 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
47 | }
48 |
49 | # Enable or disable spider middlewares
50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
51 | #SPIDER_MIDDLEWARES = {
52 | # 'douban.middlewares.DoubanSpiderMiddleware': 543,
53 | #}
54 |
55 | # Enable or disable downloader middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
57 | #DOWNLOADER_MIDDLEWARES = {
58 | # 'douban.middlewares.MyCustomDownloaderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable extensions
62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
63 | #EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | #}
66 |
67 | # Configure item pipelines
68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
69 | #ITEM_PIPELINES = {
70 | # 'douban.pipelines.DoubanPipeline': 300,
71 | #}
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/douban/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/douban/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/douban/spiders/__pycache__/douban_spider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/spiders/__pycache__/douban_spider.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/douban/spiders/douban_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from douban.items import DoubanItem
4 |
5 | class DoubanSpiderSpider(scrapy.Spider):
6 | name = "douban_spider"
7 | allowed_domains = ["douban.com"]
8 | print("1111111111111111111111")
9 | start_urls = ['https://movie.douban.com/top250?start=0&filter=']
10 |
11 | def parse(self, response):
12 |
13 | for eve in response.xpath('//div[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/@href'):
14 | full_url = eve.extract()
15 |
16 | print(full_url)
17 | yield scrapy.Request(full_url,callback=self.parse_movie)
18 |
19 | def parse_movie(self,response):
20 |
21 | item = DoubanItem()
22 | item['name'] = response.xpath('//*[@id="content"]/h1/span[1]//text()').extract()
23 | item['auto'] = response.xpath('//*[@id="info"]/span/span[2]/a/text()').extract()
24 | item['ping'] = response.xpath('/html/body/div[3]/div[1]/div[3]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/strong/text()').extract()
25 | print(item)
26 | yield item
27 |
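The spider above yields one DoubanItem per Top 250 entry. A quick way to run it and keep the output (a minimal sketch; it assumes you launch it from the directory containing douban/scrapy.cfg, and top250.json is just an example file name):

# run_douban.py -- equivalent to `scrapy crawl douban_spider -o top250.json`
from scrapy.cmdline import execute

if __name__ == '__main__':
    # -o writes every yielded item to a JSON feed file
    execute(['scrapy', 'crawl', 'douban_spider', '-o', 'top250.json'])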
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/douban/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = douban.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = douban
12 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/ip地址查询工具.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import json
3 | import requests
4 |
5 |
6 | url = 'https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?query=10.0.144.241&co=&resource_id=6006&t=1484574592369&ie=utf8&oe=gbk&cb=op_aladdin_callback&format=json&tn=baidu&cb=jQuery110205057557444126394_1484574357057&_=1484574357071'
7 |
8 | headers = {
9 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
10 | 'Accept-Encoding':'gzip, deflate, br',
11 | 'Accept-Language':'zh-CN,zh;q=0.8',
12 | 'Connection':'keep-alive',
13 | 'Cookie':'BAIDUID=4812092AE366ED4A55C6D8EA6713A635:FG=1; PSTM=1508161904; BIDUPSID=18C54752D18DC057B004465161A28981; BDUSS=9XM0M3bnJBYUpRZVBFRDRRWXdpVXdIa0d2WDRJUlVFaVlJcFVSMnVFOE5MUkJhSVFBQUFBJCQAAAAAAAAAAAEAAAC6uzCj0KHKqNfTczAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA2g6FkNoOhZQ2; MCITY=-%3A; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598',
14 | 'Host':'sp0.baidu.com',
15 | 'Upgrade-Insecure-Requests':'1',
16 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
17 | }
18 | ip = input("请输入你的IP地址:")
19 | params = {
20 | 'query':ip,
21 | 'co':'',
22 | 'resource_id':'6006',
23 | 't':'1484574592369',
24 | 'ie':'utf8',
25 | 'oe':'gbk',
26 |     # 'cb':'op_aladdin_callback',  # duplicate dict key; only the jQuery callback below is actually sent
27 | 'format':'json',
28 | 'tn':'baidu',
29 | 'cb':'jQuery110205057557444126394_1484574357057',
30 | '_':'1484574357071'
31 | }
32 |
33 | response = requests.get(url,params=params).text
34 | # print(response)
35 | response = json.loads(response[46:][:-2])
36 | # print(response[46:][:-2])
37 | print("location:"+response.get('data')[0].get("location"))
38 | print("titlecont:"+response.get('data')[0].get("titlecont"))
39 | print("origip:"+response.get('data')[0].get("origip"))
40 | print("origipquery:"+response.get('data')[0].get("origipquery"))
41 | print("showlamp:"+response.get('data')[0].get("showlamp"))
42 | print("showLikeShare:"+str(response.get('data')[0].get("showLikeShare")))
43 | print("shareImage:"+str(response.get('data')[0].get("shareImage")))
44 | print("ExtendedLocation:"+response.get('data')[0].get("ExtendedLocation"))
45 | print("QriginQuery:"+str(response.get('data')[0].get("QriginQuery")))
46 | print("tplt:"+response.get('data')[0].get("tplt"))
47 | print("resourceid:"+str(response.get('data')[0].get("resourceid")))
48 | print("fetchkey:"+response.get('data')[0].get("fetchkey"))
49 | print("appinfo:"+response.get('data')[0].get("appinfo"))
50 | print("role_id:"+str(response.get('data')[0].get("role_id")))
51 | print("disp_type:"+str(response.get('data')[0].get("disp_type")))
52 |
53 |
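The script strips the JSONP wrapper with fixed slicing (response[46:][:-2]), which breaks as soon as the callback name changes length. A more robust sketch (the helper name here is made up) pulls the payload out with a regular expression:

import json
import re

def strip_jsonp(text):
    # turn 'callback({...});' into the parsed {...} payload
    match = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(match.group(1) if match else text)

# data = strip_jsonp(requests.get(url, params=params).text)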
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/.idea/dictionaries/.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/.idea/lagou.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 | AngularJS
14 |
15 |
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/dump.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/dump.rdb
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/geckodriver.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/geckodriver.log
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__init__.py
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 | from scrapy import Field
10 |
11 | class LagouItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 |     companyFullName = Field()    # company name
15 |     # print(info['公司名字'])
16 |     city = Field()               # city of the position
17 |     positionName = Field()       # position title
18 |     formatCreateTime = Field()   # publish time
19 |     salary = Field()             # salary
20 |     workYear = Field()           # required experience
21 |     Jobdescriptions = Field()    # job description
22 |     companySize = Field()        # company size
23 |     positionAdvantage = Field()  # perks / benefits
24 |     district = Field()           # company district
25 |     companyhref = Field()        # company page link
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class LagouSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 |     def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 |     def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 |     def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 |     def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__init__.py:
--------------------------------------------------------------------------------
1 | from .useragent import UserAgentMiddleware
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__pycache__/useragent.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__pycache__/useragent.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/useragent.py:
--------------------------------------------------------------------------------
1 | import faker
2 |
3 |
4 | class UserAgentMiddleware(object):
5 | def __init__(self,settings):
6 | self.faker = faker.Faker()
7 |
8 | @classmethod
9 | def from_crawler(cls,crawler):
10 | return cls(crawler.settings)
11 |
12 | def process_request(self,request,spider):
13 | request.headers['User-Agent'] = self.faker.user_agent()
14 |
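This middleware stamps a freshly faked User-Agent onto every outgoing request. A quick standalone check (a sketch; it assumes the faker package is installed and that the lagou package is importable, e.g. from the project root):

from scrapy.http import Request
from scrapy.settings import Settings
from lagou.middlewares.useragent import UserAgentMiddleware

mw = UserAgentMiddleware(Settings())   # __init__ only needs a settings object
req = Request('https://www.lagou.com')
mw.process_request(req, spider=None)   # sets a random User-Agent header
print(req.headers.get('User-Agent'))   # e.g. b'Mozilla/5.0 (...)'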
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 |
9 | # Store scraped items in MongoDB
10 | class LagouPipeline(object):
11 |
12 | def __init__(self, mongo_uri, mongo_db):
13 | self.mongo_uri = mongo_uri
14 | self.mongo_db = mongo_db
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | return cls(
19 | mongo_uri = crawler.settings.get('MONGO_URI'),
20 | mongo_db = crawler.settings.get('MONGO_DATABASE','items')
21 | )
22 |
23 | def open_spider(self,spider):
24 | self.client = pymongo.MongoClient()
25 | self.db = self.client['lagouzhiwei']
26 |
27 | def close_spider(self,spider):
28 | self.client.close()
29 |
30 |     def process_item(self, item, spider):
31 |         self.db['zhiweitest2'].insert_one(dict(item))
32 |         return item
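The pipeline simply dumps each item into the zhiweitest2 collection of the local lagouzhiwei database. The same round trip, runnable on its own (a sketch assuming a MongoDB server on localhost:27017 and pymongo 3.7+ for count_documents):

import pymongo

client = pymongo.MongoClient()        # same default connection as open_spider above
db = client['lagouzhiwei']
db['zhiweitest2'].insert_one({'companyFullName': 'demo', 'city': 'Beijing'})
print(db['zhiweitest2'].count_documents({'city': 'Beijing'}))
client.close()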
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for lagou project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'lagou'
13 |
14 | SPIDER_MODULES = ['lagou.spiders']
15 | NEWSPIDER_MODULE = 'lagou.spiders'
16 | LOG_LEVEL= 'INFO'
17 |
18 |
19 | # Database settings: the MongoDB connection is hardcoded in the pipeline, so no values are set here; fill in these two if you want to read them from settings instead.
20 | # MONGO_URI = ''
21 | # MONGO_DATABASE = ''
22 |
23 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
24 | # USER_AGENT = 'lagou (+http://www.yourdomain.com)'
25 |
26 | # Obey robots.txt rules
27 | ROBOTSTXT_OBEY = False
28 |
29 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
30 | CONCURRENT_REQUESTS = 32
31 |
32 | # Configure a delay for requests for the same website (default: 0)
33 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
34 | # See also autothrottle settings and docs
35 | DOWNLOAD_DELAY = 0
36 | # The download delay setting will honor only one of:
37 | CONCURRENT_REQUESTS_PER_DOMAIN = 1
38 | #CONCURRENT_REQUESTS_PER_IP = 16
39 |
40 | # Disable cookies (enabled by default)
41 | COOKIES_ENABLED = False
42 | # COOKIE = " user_trace_token=20171016205119-74af5f4b-4f58-445f-9516-f8c19f475243; LGUID=20171016205145-c44d7b22-b270-11e7-991d-525400f775ce; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=2; _gat=1; index_location_city=%E5%8C%97%E4%BA%AC; login=false; unick=""; _putrc=""; JSESSIONID=ABAAABAAADEAAFI9B5F7A7C0171C81B55991A6F507BB38C; TG-TRACK-CODE=index_navigation; _gid=GA1.2.1376878689.1512383958; _ga=GA1.2.358203920.1509241265; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511828514,1511828645,1512096311,1512383961; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1512391953; LGSID=20171204201356-99b53ec2-d8ec-11e7-82ca-525400f775ce; LGRID=20171204205308-13e42a93-d8f2-11e7-8301-525400f775ce; SEARCH_ID=b1c5303a69754a66bc97d63dc0fec865"
43 | # Cookie = {'user_trace_token':'20171016205119-74af5f4b-4f58-445f-9516-f8c19f475243',' LGUID':'20171016205145-c44d7b22-b270-11e7-991d-525400f775ce', 'showExpriedIndex':'1',
44 | # 'showExpriedCompanyHome':'1', 'showExpriedMyPublish':'1', 'hasDeliver':'2', '_gat':'1','index_location_city':'%E5%8C%97%E4%BA%AC','login':'false',
45 | # 'unick':"", '_putrc':"", 'JSESSIONID':'ABAAABAAADEAAFI9B5F7A7C0171C81B55991A6F507BB38C' ,'TG-TRACK-CODE':'index_navigation','_gid':'GA1.2.1376878689.1512383958','_ga':'GA1.2.358203920.1509241265','Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6':'1511828514,1511828645,1512096311,1512383961','Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6':'1512391953','LGSID':'20171204201356-99b53ec2-d8ec-11e7-82ca-525400f775ce',
46 | # 'LGRID':'20171204205308-13e42a93-d8f2-11e7-8301-525400f775ce','SEARCH_ID':'b1c5303a69754a66bc97d63dc0fec865'}
47 | # Disable Telnet Console (enabled by default)
48 | TELNETCONSOLE_ENABLED = True
49 |
50 | # Override the default request headers:
51 | # DEFAULT_REQUEST_HEADERS = {
52 | # 'Accept':'application/json, text/javascript, */*; q=0.01',
53 | # 'Accept-Encoding':'gzip, deflate, br',
54 | # 'Accept-Language':'zh-CN,zh;q=0.8',
55 | # 'Connection':'keep-alive',
56 | # 'Content-Length':'25',
57 | # 'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
58 | # 'Cookie':'user_trace_token=20171016205119-74af5f4b-4f58-445f-9516-f8c19f475243; LGUID=20171016205145-c44d7b22-b270-11e7-991d-525400f775ce; JSESSIONID=ABAAABAAAIAACBIB57C77C4ECB2518EE2D0B9135B108E91; _gat=1; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=http%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E9%2592%25A9%26rsv_spt%3D1%26rsv_iqid%3D0xaa9b6e5d00013b57%26issp%3D1%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3D%26tn%3Dbaiduhome_pg%26ch%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_navigation; _putrc=54D6D44AC87A2A52; login=true; unick=%E6%9D%A8%E9%B9%8F%E5%8D%87; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=2; _ga=GA1.2.358203920.1509241265; _gid=GA1.2.1325253718.1511828511; LGSID=20171128082152-21c7cb9f-d3d2-11e7-ae6a-525400f775ce; LGRID=20171128082206-2a422523-d3d2-11e7-ae6a-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510323969,1511253705,1511253729,1511828514; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511828527; SEARCH_ID=8267a0cd29464ff6b38f86b1d1e17b96; index_location_city=%E5%8C%97%E4%BA%AC',
59 | # 'Host':'www.lagou.com',
60 | # 'Origin':'https://www.lagou.com',
61 | # 'Cookie':'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1512613229,1512613260,1512625404,1512968182; index_location_city=%E5%8C%97%E4%BA%AC; _ga=GA1.2.2037062440.1512613233; user_trace_token=201712011102032-33c95bdc-daf5-11e7-8800-525400f775ce; LGUID=20171207102032-33c95ef6-daf5-11e7-8800-525400f775ce; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=4; JSESSIONID=ABAAABAAADEAAFI7D85FFAA76F7A088717F2BAF4B49DB5A; SEARCH_ID=e00f27cb11504a72a10b8ec58bd5f04f; _gat=1; LGSID=20171211125618-9f9c7c23-de2f-11e7-8e96-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_COCOS2D-X%3Fpx%3Ddefault%26city%3D%25E5%2585%25A8%25E5%259B%25BDstart.firefoxchina.cn; LGRID=20171211125650-b2cc7009-de2f-11e7-8e96-525400f775ce; _putrc=54D6D44AC87A2A52; _gid=GA1.2.834272328.1512968180; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1512968210',
62 | # 'Referer':"https://www.lagou.com",
63 | # 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
64 | # 'X-Anit-Forge-Code':'0',
65 | # 'X-Anit-Forge-Token':'None',
66 | # 'X-Requested-With':'XMLHttpRequest'
67 | # }
68 |
69 |
70 | # Enable or disable spider middlewares
71 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
72 | #SPIDER_MIDDLEWARES = {
73 | # 'lagou.middlewares.LagouSpiderMiddleware': 543,
74 | #}
75 |
76 | # Enable or disable downloader middlewares
77 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
78 | DOWNLOADER_MIDDLEWARES = {
79 | # 'lagou.middlewares.MyCustomDownloaderMiddleware': 543,
80 | 'lagou.middlewares.UserAgentMiddleware':500,
81 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware':None,
82 |
83 | }
84 |
85 | # Enable or disable extensions
86 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
87 | #EXTENSIONS = {
88 | # 'scrapy.extensions.telnet.TelnetConsole': None,
89 | #}
90 |
91 | # Configure item pipelines
92 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
93 | ITEM_PIPELINES = {
94 | 'lagou.pipelines.LagouPipeline': 300,
95 | }
96 |
97 | # Enable and configure the AutoThrottle extension (disabled by default)
98 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
99 | #AUTOTHROTTLE_ENABLED = True
100 | # The initial download delay
101 | #AUTOTHROTTLE_START_DELAY = 5
102 | # The maximum download delay to be set in case of high latencies
103 | #AUTOTHROTTLE_MAX_DELAY = 60
104 | # The average number of requests Scrapy should be sending in parallel to
105 | # each remote server
106 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
107 | # Enable showing throttling stats for every response received:
108 | #AUTOTHROTTLE_DEBUG = False
109 |
110 | # Enable and configure HTTP caching (disabled by default)
111 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
112 | #HTTPCACHE_ENABLED = True
113 | #HTTPCACHE_EXPIRATION_SECS = 0
114 | #HTTPCACHE_DIR = 'httpcache'
115 | #HTTPCACHE_IGNORE_HTTP_CODES = []
116 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
117 |
118 |
119 | # Enable scrapy-redis for distributed crawling
120 | # Swap in the scrapy-redis scheduler
121 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
122 | # Deduplicate requests through redis
123 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
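The two scrapy-redis settings above only swap in the shared scheduler and dupefilter; a typical scrapy-redis setup also tells them where Redis lives and whether to keep the queue between runs. A sketch of the usual companions (adjust the address to your Redis instance):

REDIS_URL = 'redis://127.0.0.1:6379'   # shared request queue and seen-request set
SCHEDULER_PERSIST = True               # do not flush the queue when the spider closes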
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__pycache__/lagouspider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__pycache__/lagouspider.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/lagouspider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from bs4 import BeautifulSoup
4 | import json
5 | from scrapy.conf import settings
6 | from ..items import LagouItem
7 | import requests
8 |
9 | occupation_list = []
10 |
11 | class LagouspiderSpider(scrapy.Spider):
12 | name = "lagouspider"
13 | allowed_domains = ["lagou.com"]
14 | start_urls = ['https://www.lagou.com']
15 | cookie = settings['COOKIE']
16 | headers = {
17 | 'Accept': 'application/json, text/javascript, */*; q=0.01',
18 | 'Accept-Encoding': 'gzip, deflate, br',
19 | 'Accept-Language': 'zh-CN,zh;q=0.8',
20 | 'Connection': 'keep-alive',
21 | 'Content-Length': '25',
22 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
23 | 'Cookie': 'user_trace_token=20171016205119-74af5f4b-4f58-445f-9516-f8c19f475243; LGUID=20171016205145-c44d7b22-b270-11e7-991d-525400f775ce; JSESSIONID=ABAAABAAAIAACBIB57C77C4ECB2518EE2D0B9135B108E91; _gat=1; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=http%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E9%2592%25A9%26rsv_spt%3D1%26rsv_iqid%3D0xaa9b6e5d00013b57%26issp%3D1%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3D%26tn%3Dbaiduhome_pg%26ch%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_navigation; _putrc=54D6D44AC87A2A52; login=true; unick=%E6%9D%A8%E9%B9%8F%E5%8D%87; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=2; _ga=GA1.2.358203920.1509241265; _gid=GA1.2.1325253718.1511828511; LGSID=20171128082152-21c7cb9f-d3d2-11e7-ae6a-525400f775ce; LGRID=20171128082206-2a422523-d3d2-11e7-ae6a-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510323969,1511253705,1511253729,1511828514; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511828527; SEARCH_ID=8267a0cd29464ff6b38f86b1d1e17b96; index_location_city=%E5%8C%97%E4%BA%AC',
24 | 'Host': 'www.lagou.com',
25 | 'Origin': 'https://www.lagou.com',
26 |         'Referer': "https://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC",
27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
28 | 'X-Anit-Forge-Code': '0',
29 | 'X-Anit-Forge-Token': 'None',
30 | 'X-Requested-With': 'XMLHttpRequest'
31 | }
32 |
33 |
34 | def parse(self, response,pn=1):
35 |         # grab every job category listed in the sidebar
36 | for i in range(1,8):
37 | occos = response.xpath('//*[@id="sidebar"]/div/div[{}]/div/dl/dd/a/text()'.format(i)).extract()
38 | for occo in occos:
39 | # url = "https://www.lagou.com/jobs/list_{}?px=default&city=%E5%85%A8%E5%9B%BD#filterBox".format('java')
40 | # yield scrapy.Request(url,callback=self.parse_page)
41 | occu_url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0'
42 | data = {
43 | 'first':'true',
44 | 'pn':pn,
45 | 'kd':'java'
46 | }
47 |                 # fetch the JSON returned by the positionAjax endpoint
48 | response = requests.post(occu_url, data=data, headers=self.headers)
49 | # positionIds = json.loads(response.text).get('content').get('positionResult').get('result')
50 | try:
51 | pageSize = json.loads(response.text).get('content').get('pageSize')
52 | totalCount = json.loads(response.text).get('content').get('positionResult').get('totalCount')
53 | except json.decoder.JSONDecodeError:
54 | continue
55 |                 # work out the total number of result pages
56 | if int(totalCount) % int(pageSize) == 0:
57 | pages = int(int(totalCount)/int(pageSize))
58 | else:
59 | pages = int(int(totalCount)/int(pageSize)) + 1
60 |
61 | for page in range(int(pages)):
62 | pn = page + 1
63 | occu_url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0'
64 | data = {
65 | 'first': 'true',
66 | 'pn': pn,
67 | 'kd': occo
68 | }
69 |
70 | response = requests.post(occu_url, data=data, headers=self.headers)
71 |
72 | try:
73 | if 'content' in json.loads(response.text).keys():
74 | positionIds = json.loads(response.text).get('content').get('positionResult').get('result')
75 |
76 |
77 | for positionId in positionIds:
78 | # try:
79 | position = positionId.get('positionId')
80 | # except:
81 | # continue
82 | # print(positionId)
83 | item = LagouItem()
84 | # self.item = info
85 | item['companyFullName'] = positionId['companyFullName'] # 公司名字
86 | # print(info['公司名字'])
87 | item['city'] = positionId['city'] # 职位城市
88 | item['positionName'] = positionId['positionName'] # 招聘职位
89 | item['formatCreateTime'] = positionId['formatCreateTime'] # 发布时间
90 | item['salary'] = positionId['salary'] # 薪资待遇
91 | item['workYear'] = positionId['workYear'] # 经验要求
92 | item['companySize'] = positionId['companySize'] # 公司大小
93 | item['positionAdvantage'] = positionId['positionAdvantage'] # 公司福利
94 | item['district'] = positionId['district'] # 公司地址
95 | info_url = "https://www.lagou.com/jobs/{}.html".format(position)
96 | # item = LagouItem()
97 | # item['companyhref'] = info_url
98 | print(item)
99 | yield item
100 | # yield scrapy.Request(url=info_url, callback=self.parse_fina)
101 | except json.decoder.JSONDecodeError:
102 | continue
103 | except TimeoutError:
104 | continue
105 | # print(info_url)
106 | # yield item
107 |
108 |     # Fetching each detail page turned out to be very slow, so only the page link was collected and the fields above come from the JSON response instead, which covers most of what is needed.
109 | # def parse_fina(self,response):
110 | # item = LagouItem()
111 | # response = response.text
112 | # print(response.status)
113 | # if response.status == 200:
114 | # try:
115 | # item['companyFullName'] = response.xpath('//*[@id="job_company"]/dt/a/img/@alt').extract() # 公司名字
116 |
117 | # print(info['公司名字'])
118 | # item['city'] = response.xpath('/html/body/div[2]/div/div[1]/dd/p[1]/span[2]/text()').extract() # 职位城市
119 | # item['positionName'] = response.xpath('/html/body/div[2]/div/div[1]/div/span/text()').extract() # 招聘职位
120 | # item['formatCreateTime'] = response.xpath('/html/body/div[2]/div/div[1]/dd/p[2]/text()').extract() # 发布时间
121 | # item['salary'] = response.xpath('/html/body/div[2]/div/div[1]/dd/p[1]/span[1]/text()').extract() # 薪资待遇
122 | # item['workYear'] = response.xpath('/html/body/div[2]/div/div[1]/dd/p[1]/span[3]/text()').extract()[0] # 经验要求
123 | # item['Jobdescriptions'] = response.xpath('//*[@id="job_detail"]/dd[2]/div/p/text()').extract() # 职位描述
124 | # item['companySize'] = response.xpath('//*[@id="job_company"]/dd/ul/li[3]/text()').extract() # 公司大小
125 | # item['positionAdvantage'] = response.xpath('//*[@id="job_detail"]/dd[1]/p/text()').extract() # 公司福利
126 | # item['district'] = response.xpath('//*[@id="job_detail"]/dd[3]/div[1]/a/text()').extract() # 公司地址
127 | # item['companyhref'] = response.xpath('//*[@id="job_company"]/dd/ul/li[4]/a/@href').extract() # 公司链接
128 | # except IndexError:
129 | # pass
130 | # print(item)
131 | # yield item
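The page-count branch in parse (totalCount divided by pageSize, rounded up when there is a remainder) is just a ceiling division; a compact equivalent for reference:

import math

def total_pages(total_count, page_size):
    # e.g. 301 results with 15 per page -> 21 pages
    return math.ceil(int(total_count) / int(page_size))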
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/lagou-scrapy/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = lagou.settings
8 |
9 | [deploy:demo]
10 | url = http://localhost:6800/
11 | project = lagou
12 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/python爬虫实现有道词典.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import urllib.request
3 | import json
4 | import urllib.parse
5 |
6 | url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null'
7 | headers = {
8 | 'Cookie':'OUTFOX_SEARCH_USER_ID=-763428860@10.168.8.61; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; JSESSIONID=abc31lbWsGNO67M3Fi-8v; OUTFOX_SEARCH_USER_ID_NCOO=1648534080.0892432; _ntes_nnid=bf4e54b134dc8a8b2f65cd59c8ba272e,1508592727589; ___rl__test__cookies=1508593353423',
9 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
10 | }
11 |
12 | dict1 = {
13 | '0':['zh-CHS','en'],
14 | '1':['en','zh-CHS'],
15 | '2':['zh-CHS','ja'],
16 | '3':['ja','zh-CHS'],
17 | '4':['zh-CHS','ko'],
18 | '5':['ko','zh-CHS'],
19 | '6':['zh-CHS','fr'],
20 | '7':['fr','zh-CHS'],
21 | '8':['zh-CHS','ru'],
22 | '9':['ru','zh-CHS'],
23 | '10':['zh-CHS','es'],
24 | '11':['es','zh-CHS'],
25 | '12':['zh-CHS','pt'],
26 | '13':['pt','zh-CHS'],
27 | }
28 | switch = input("请选择语言翻译:0:中文-》英语,1:英语-》中文,2:中文-》日语,3:日语-》中文,\n,4:中文-》韩语,5:韩语-》中文,"
29 | "6:中文-》法语,7:法语-》中文,8:中文-》俄语,\n,9:俄语-》中文,10:中文-》西班牙语,\n,11:西班牙语-》中文,12:中文-》葡萄牙语,"
30 | "13:葡萄牙语-》中文:")
31 |
32 | star = dict1[switch][0]
33 | end = dict1[switch][1]
34 | # print(star)
35 | # print(end)
36 | word = input("请输入你要翻译的语句:")
37 | data = {
38 | 'i':word,
39 | 'from':star,
40 | 'to':end,
41 | 'smartresult':'dict',
42 | 'client':'fanyideskweb',
43 | 'salt':'1508593351114',
44 | 'sign':'32cded672e5ba31d4f4929650a5ad22e',
45 | 'doctype':'json',
46 | 'version':'2.1',
47 | 'keyfrom':'fanyi.web',
48 | 'action':'FY_BY_CLICKBUTTION',
49 | 'typoResult':'true'
50 | }
51 |
52 | data = urllib.parse.urlencode(data).encode("utf-8")
53 | response = urllib.request.urlopen(url=url,data=data)
54 | datas = json.loads(response.read().decode("utf-8"))
55 | answer = datas.get('translateResult')[0][0]['tgt']
56 | print(answer)
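The script sends a single form-encoded POST and reads translateResult[0][0]['tgt'] from the JSON reply. Wrapped as a reusable function (a sketch: the salt/sign pair is copied verbatim from the request above, and the endpoint may eventually reject it as stale):

import json
import urllib.parse
import urllib.request

YOUDAO_URL = ('http://fanyi.youdao.com/translate?smartresult=dict'
              '&smartresult=rule&smartresult=ugc&sessionFrom=null')

def youdao_translate(word, src='en', dst='zh-CHS'):
    payload = {
        'i': word, 'from': src, 'to': dst,
        'smartresult': 'dict', 'client': 'fanyideskweb',
        'salt': '1508593351114', 'sign': '32cded672e5ba31d4f4929650a5ad22e',
        'doctype': 'json', 'version': '2.1', 'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CLICKBUTTION', 'typoResult': 'true',
    }
    body = urllib.parse.urlencode(payload).encode('utf-8')
    with urllib.request.urlopen(YOUDAO_URL, data=body) as resp:
        return json.loads(resp.read().decode('utf-8'))['translateResult'][0][0]['tgt']

# print(youdao_translate('hello'))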
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = taobaoclass.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = taobaoclass
12 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/taobaoclass/taobaoclass/__init__.py
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/taobaoclass/taobaoclass/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/taobaoclass/taobaoclass/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class TaobaoclassItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | image_urls = scrapy.Field()
15 | images = scrapy.Field()
16 | title = scrapy.Field()
17 | price = scrapy.Field()
18 | fukuan = scrapy.Field()
19 | dizhi = scrapy.Field()
20 | url = scrapy.Field()
21 | dianqu = scrapy.Field()
22 |
23 | class Iphone(scrapy.Item):
24 | image_urls = scrapy.Field()
25 | images = scrapy.Field()
26 | title = scrapy.Field()
27 | price = scrapy.Field()
28 | fukuan = scrapy.Field()
29 | dizhi = scrapy.Field()
30 | url = scrapy.Field()
31 | dianqu = scrapy.Field()
32 |
33 | class Samsung(scrapy.Item):
34 | image_urls = scrapy.Field()
35 | images = scrapy.Field()
36 | title = scrapy.Field()
37 | price = scrapy.Field()
38 | fukuan = scrapy.Field()
39 | dizhi = scrapy.Field()
40 | url = scrapy.Field()
41 | dianqu = scrapy.Field()
42 |
43 |
44 |
45 | class HuaWei(scrapy.Item):
46 | image_urls = scrapy.Field()
47 | images = scrapy.Field()
48 | title = scrapy.Field()
49 | price = scrapy.Field()
50 | fukuan = scrapy.Field()
51 | dizhi = scrapy.Field()
52 | url = scrapy.Field()
53 | dianqu = scrapy.Field()
54 |
55 |
56 | class Magic(scrapy.Item):
57 | image_urls = scrapy.Field()
58 | images = scrapy.Field()
59 | title = scrapy.Field()
60 | price = scrapy.Field()
61 | fukuan = scrapy.Field()
62 | dizhi = scrapy.Field()
63 | url = scrapy.Field()
64 | dianqu = scrapy.Field()
65 |
66 |
67 |
68 | class ShouJiKe(scrapy.Item):
69 | image_urls = scrapy.Field()
70 | images = scrapy.Field()
71 | title = scrapy.Field()
72 | price = scrapy.Field()
73 | fukuan = scrapy.Field()
74 | dizhi = scrapy.Field()
75 | url = scrapy.Field()
76 | dianqu = scrapy.Field()
77 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class TaobaoclassSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 |     def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 |     def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 |     def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 |     def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 | from scrapy.conf import settings
9 | from taobaoclass.items import Iphone, Samsung, Magic, HuaWei, ShouJiKe
10 |
11 |
12 | class TaobaoclassPipeline(object):
13 | def process_item(self, item, spider):
14 |         host = settings['MONGODB_HOST']
15 |         port = settings['MONGODB_PORT']
16 |         dbName = settings['MONGODB_DBNAME']
17 | client = pymongo.MongoClient(host=host,port=port)
18 | tdb = client[dbName]
19 |
20 | if isinstance(item,Iphone):
21 | self.post = tdb[settings['MONGODB_DOCNAME_IP']]
22 | elif isinstance(item, Samsung):
23 | self.post = tdb[settings['MONGODB_DOCNAME_SAM']]
24 | elif isinstance(item, HuaWei):
25 | self.post = tdb[settings['MONGODB_DOCNAME_HW']]
26 | elif isinstance(item, ShouJiKe):
27 | self.post = tdb[settings['MONGODB_DOCNAME_SJK']]
28 | elif isinstance(item, Magic):
29 | self.post = tdb[settings['MONGODB_DOCNAME_MAG']]
30 | taobao = dict(item)
31 |         self.post.insert_one(taobao)
32 |
33 | return item
34 |
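The isinstance chain above routes each item class to its own MongoDB collection. A more compact dispatch (a sketch that reuses the imports and the settings object from this file, plus the same MONGODB_DOCNAME_* keys from settings.py):

COLLECTION_SETTING = {
    Iphone:   'MONGODB_DOCNAME_IP',
    Samsung:  'MONGODB_DOCNAME_SAM',
    HuaWei:   'MONGODB_DOCNAME_HW',
    ShouJiKe: 'MONGODB_DOCNAME_SJK',
    Magic:    'MONGODB_DOCNAME_MAG',
}

def collection_for(item, tdb):
    # fall back to the generic shoujike collection for unknown item types
    key = COLLECTION_SETTING.get(type(item), 'MONGODB_DOCNAME_SJK')
    return tdb[settings[key]]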
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for taobaoclass project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'taobaoclass'
13 |
14 | SPIDER_MODULES = ['taobaoclass.spiders']
15 | NEWSPIDER_MODULE = 'taobaoclass.spiders'
16 |
17 | MONGODB_HOST = '127.0.0.1'
18 | MONGODB_PORT = 27017
19 | MONGODB_DBNAME = 'taobao'
20 | MONGODB_DOCNAME_IP = 'ipad'
21 | MONGODB_DOCNAME_SAM = 'samsung'
22 | MONGODB_DOCNAME_HW = 'huawei'
23 | MONGODB_DOCNAME_MAG = 'magic'
24 | MONGODB_DOCNAME_SJK = 'shoujike'
25 |
26 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
27 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
28 |
29 | # Obey robots.txt rules
30 | ROBOTSTXT_OBEY = False
31 |
32 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
33 | #CONCURRENT_REQUESTS = 32
34 |
35 | # Configure a delay for requests for the same website (default: 0)
36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
37 | # See also autothrottle settings and docs
38 | #DOWNLOAD_DELAY = 3
39 | # The download delay setting will honor only one of:
40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
41 | #CONCURRENT_REQUESTS_PER_IP = 16
42 |
43 | # Disable cookies (enabled by default)
44 | COOKIES_ENABLED = True
45 |
46 | # Disable Telnet Console (enabled by default)
47 | #TELNETCONSOLE_ENABLED = False
48 |
49 | # Override the default request headers:
50 | DEFAULT_REQUEST_HEADERS = {
51 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
52 | 'Accept-Language': 'en',
53 | }
54 |
55 | # Enable or disable spider middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
57 | #SPIDER_MIDDLEWARES = {
58 | # 'taobaoclass.middlewares.TaobaoclassSpiderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable downloader middlewares
62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
63 | #DOWNLOADER_MIDDLEWARES = {
64 | # 'taobaoclass.middlewares.MyCustomDownloaderMiddleware': 543,
65 | #}
66 |
67 | # Enable or disable extensions
68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
69 | #EXTENSIONS = {
70 | # 'scrapy.extensions.telnet.TelnetConsole': None,
71 | #}
72 |
73 | # Configure item pipelines
74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
75 | ITEM_PIPELINES = {
76 | 'taobaoclass.pipelines.TaobaoclassPipeline': 300,
77 |     'scrapy.pipelines.images.ImagesPipeline': 1,
78 | }
79 | IMAGES_STORE = 'pic/'
80 | IMAGES_URLS_FIELD = 'image_urls'
81 |
82 | # Enable and configure the AutoThrottle extension (disabled by default)
83 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
84 | #AUTOTHROTTLE_ENABLED = True
85 | # The initial download delay
86 | #AUTOTHROTTLE_START_DELAY = 5
87 | # The maximum download delay to be set in case of high latencies
88 | #AUTOTHROTTLE_MAX_DELAY = 60
89 | # The average number of requests Scrapy should be sending in parallel to
90 | # each remote server
91 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
92 | # Enable showing throttling stats for every response received:
93 | #AUTOTHROTTLE_DEBUG = False
94 |
95 | # Enable and configure HTTP caching (disabled by default)
96 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
97 | #HTTPCACHE_ENABLED = True
98 | #HTTPCACHE_EXPIRATION_SECS = 0
99 | #HTTPCACHE_DIR = 'httpcache'
100 | #HTTPCACHE_IGNORE_HTTP_CODES = []
101 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
102 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/taobaoclass/taobaoclass/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/spiders/taobao_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import json
4 | from taobaoclass import items
5 | from taobaoclass.items import Iphone,Samsung,Magic,HuaWei,ShouJiKe
6 | import urllib.parse
7 |
8 |
9 | class TaobaoSpiderSpider(scrapy.Spider):
10 | name = "taobao_spider"
11 | totalItem = ['magic','华为mate9']
12 | allowed_domains = ["taobao.com"]
13 | start_urls = []
14 | count = 0
15 | total = 0
16 | while(count < 500):
17 | for eveItem in totalItem:
18 | count = count + 13
19 | new_url = 'https://s.taobao.com/api?_ksTS=1488147288907_219&ajax=true&m=customized&q=' + urllib.parse.quote(eveItem) + '&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&s=' + str(count) + '&bcoffset=-3'
20 | start_urls.append(new_url)
21 | print(start_urls)
22 |
23 |
24 | def parse(self, response):
25 | try:
26 | html = json.loads(response.body.decode().replace('}}})','}}}').replace("jsonp220(",''))
27 | for eve in html['API.CustomizedApi']['itemlist']['auctions']:
28 | print("++++++++++++++++++++++++++++++++++++++++")
29 | if 'ipad' in str(response.url):
30 |                     item = Iphone()
31 | print("ipad")
32 | elif 'samsung' in str(response.url):
33 |                     item = Samsung()
34 | print("Samsung")
35 | elif 'mate9' in str(response.url):
36 | item = HuaWei()
37 | print('huawei')
38 | else:
39 | item = ShouJiKe()
40 | print('shoujike')
41 | img = []
42 | self.total = self.total + 1
43 | item['title'] = eve['raw_title']
44 | item['price'] = eve['view_price']
45 | item['fukuan'] = eve['view_sales']
46 | item['dizhi'] = eve['item_loc']
47 | item['url'] = 'https:' + eve['comment_url']
48 |                 item['dianqu'] = eve['nick']  # shop name; the item field is named dianqu in items.py
49 | img.append(str('http://' + eve['pic_url']))
50 | item['image_urls'] = img
51 | yield item
52 | except Exception as e:
53 | print(e)
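Building start_urls in a while loop at class-definition time works, but the idiomatic Scrapy pattern is to override start_requests(). A sketch that keeps the same keywords and 13-item page step (the class and spider names here are made up):

import urllib.parse
import scrapy

class TaobaoStartRequestsSketch(scrapy.Spider):
    name = 'taobao_spider_sketch'
    allowed_domains = ['taobao.com']
    keywords = ['magic', '华为mate9']

    def start_requests(self):
        for keyword in self.keywords:
            for offset in range(13, 500, 13):   # same paging step as the loop above
                url = ('https://s.taobao.com/api?_ksTS=1488147288907_219&ajax=true'
                       '&m=customized&q=' + urllib.parse.quote(keyword) +
                       '&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8'
                       '&s=' + str(offset) + '&bcoffset=-3')
                yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        pass  # reuse the JSON parsing from TaobaoSpiderSpider.parse above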
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = tutorial.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = tutorial
12 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/__init__.py
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/__init__.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class TutorialItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
16 | class DmozItem(scrapy.Item):
17 | title = scrapy.Field()
18 | link = scrapy.Field()
19 | desc = scrapy.Field()
20 | name = scrapy.Field()
21 | price = scrapy.Field()
22 | last_updated = scrapy.Field(serializer = str)
23 | class TestItem(scrapy.Item):
24 | id = scrapy.Field()
25 | name = scrapy.Field()
26 | description = scrapy.Field()
27 |
28 |
29 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class TutorialSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class TutorialPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for tutorial project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'tutorial'
13 |
14 | SPIDER_MODULES = ['tutorial.spiders']
15 | NEWSPIDER_MODULE = 'tutorial.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'tutorial.middlewares.TutorialSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'tutorial.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | # 'tutorial.pipelines.TutorialPipeline': 300,
69 | #}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/settings.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/spiders/__init__.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/spiders/dmoz_spider.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 |
3 | # from tutorial.tutorial.items import DmozItem
4 |
5 |
6 | class DmozSpider(scrapy.Spider):
7 |
8 | name = "dmoz"
9 | allowed_domains = ['dmoz.org']
10 | start_urls = [
11 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
12 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
13 | ]
14 | # def parse(self, response):
15 | # # filename = response.url.split("/")[-2]
16 | # # with open(filename,"wb") as f:
17 | # # f.write(response.body)
18 | # for sel in response.xpath('//ul/li'):
19 | # # title = sel.xpath('a/text()').extract()
20 | # # link = sel.xpath('a/@href').extract()
21 | # # desc = self.xpath('text()').extract()
22 | # # print(title,link,desc)
23 | # item = DmozItem()
24 | # item['title'] = sel.xpath('a/text()').extract()
25 | # item['scrlink'] = sel.xpath('a/@href').extract()
26 | # item['desc'] = sel.xpath('text()').extract()
27 | # yield item
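Note: with the whole parse() commented out, this spider raises NotImplementedError on its first response. A minimal sketch of what the commented-out draft appears to intend, adapted to the fields actually declared on DmozItem (`link` instead of the draft's `scrlink`):

    from tutorial.items import DmozItem   # note: tutorial.items, not tutorial.tutorial.items

    # inside DmozSpider
    def parse(self, response):
        for sel in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()
            yield item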
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/spiders/dmoz_spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/spiders/dmoz_spider.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/spiders/mydomain.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 |
5 | class MydomainSpider(scrapy.Spider):
6 | name = 'mydomain'
7 | allowed_domains = ['mydomain.com']
8 | start_urls = ['http://mydomain.com/']
9 |
10 | def parse(self, response):
11 | pass
12 | class MySpider(scrapy.Spider):
13 | name = "example.com"
14 | allowed_domains = ['example.com']
15 | start_urls = [
16 | 'http://www.example.com/1.html',
17 | 'http://www.example.com/2.html',
18 | 'http://www.example.com/3.html',
19 | ]
20 |
21 | def parse(self, response):
22 | self.log('A response from %s just arrived!' % response.url)
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/tutorial/spiders/mydomain.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/spiders/mydomain.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = zaobao.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = zaobao
12 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/zaobao/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/__init__.py
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/zaobao/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/zaobao/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/zaobao/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/zaobao/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class ZaobaoItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | name = scrapy.Field()
15 | url = scrapy.Field()
16 | data = scrapy.Field()
17 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/zaobao/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class ZaobaoSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/zaobao/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class ZaobaoPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
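Note: this is still the unmodified pipeline template. As a sketch only, a concrete version that persists each ZaobaoItem as one JSON line might look like this (the output filename is invented for illustration); it would also need the ITEM_PIPELINES entry in settings.py uncommented to take effect:

    import json

    class ZaobaoPipeline(object):
        def open_spider(self, spider):
            self.file = open('zaobao_items.jl', 'w', encoding='utf-8')

        def close_spider(self, spider):
            self.file.close()

        def process_item(self, item, spider):
            # one JSON object per line; keep Chinese text readable
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item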
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/zaobao/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for zaobao project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'zaobao'
13 |
14 | SPIDER_MODULES = ['zaobao.spiders']
15 | NEWSPIDER_MODULE = 'zaobao.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'zaobao (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'zaobao.middlewares.ZaobaoSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'zaobao.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | # 'zaobao.pipelines.ZaobaoPipeline': 300,
69 | #}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/zaobao/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/zaobao/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/zaobao/spiders/__pycache__/zaobao_spider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/spiders/__pycache__/zaobao_spider.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/zaobao/zaobao/spiders/zaobao_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 | from zaobao.items import ZaobaoItem
5 |
6 |
7 | class ZaobaoSpiderSpider(scrapy.Spider):
8 | name = "zaobao_spider"
9 | allowed_domains = ["zaobao.com"]
10 | start_urls = ['http://zaobao.com/']
11 |
12 | def parse(self, response):
13 | for eve in response.xpath('//*[@id="DressUp"]/div/div/div/div/a/@href'):
14 | full_url = response.urljoin(eve.extract())
15 | yield scrapy.Request(full_url,callback=self.parse_news)
16 |
17 | def parse_news(self,response):
18 | item = ZaobaoItem()
19 | item['name'] = response.xpath('//*[@id="MainCourse"]/div/h1/text()').extract()
20 | item['url'] = response.xpath('//*[@id="MainCourse"]/div/div[2]').extract()
21 | print(item)
22 | yield item
--------------------------------------------------------------------------------
/zhihu.com/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = zhihuuser.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = zhihuuser
12 |
--------------------------------------------------------------------------------
/zhihu.com/zhihuuser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/__init__.py
--------------------------------------------------------------------------------
/zhihu.com/zhihuuser/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/zhihu.com/zhihuuser/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/zhihu.com/zhihuuser/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/zhihu.com/zhihuuser/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 | from scrapy import Item,Field
10 |
11 | class ZhihuuserItem(Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | id = Field()
15 | name = Field()
16 | avatar_url = Field()
17 | user_type = Field()
18 | answer_count = Field()
19 | url = Field()
20 | url_token = Field()
21 | headline = Field()
22 |
23 |
--------------------------------------------------------------------------------
/zhihu.com/zhihuuser/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class ZhihuuserSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/zhihu.com/zhihuuser/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import pymongo
9 | class ZhihuuserPipeline(object):
10 | def __init__(self, mongo_uri, mongo_db):
11 | self.mongo_uri = mongo_uri
12 | self.mongo_db = mongo_db
13 |
14 | @classmethod
15 | def from_crawler(cls,crawler):
16 | return cls(
17 | mongo_uri = crawler.settings.get('MONGO_URI'),
18 | mongo_db = crawler.settings.get('MONGO_DATABASE','items')
19 | )
20 |
21 | def open_spider(self,spider):
22 | self.client = pymongo.MongoClient()
23 | self.db = self.client['zhihuuser']
24 |
25 | def close_spider(self,spider):
26 | self.client.close()
27 |
28 | def process_item(self, item, spider):
29 | # collection_name = item.__class__.__name__
30 | # self.db[collection_name].insert(dict(item))
31 | # de-duplicate: update the record if it already exists, otherwise insert it
32 | self.db['user'].update({'url_token':item['url_token']},{'$set':item},True)
33 | return item
34 |
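Note: Collection.update() used above is the legacy pymongo call; it does perform the intended upsert, but the same de-duplicating write with the newer API (a sketch, assuming pymongo 3+) would be:

    # equivalent upsert with pymongo's newer API
    self.db['user'].update_one(
        {'url_token': item['url_token']},   # match the user by url_token
        {'$set': dict(item)},               # refresh the stored fields
        upsert=True                         # insert if the user is not stored yet
    )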
--------------------------------------------------------------------------------
/zhihu.com/zhihuuser/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for zhihuuser project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'zhihuuser'
13 |
14 | SPIDER_MODULES = ['zhihuuser.spiders']
15 | NEWSPIDER_MODULE = 'zhihuuser.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | DEFAULT_REQUEST_HEADERS = {
43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | 'Accept-Language': 'en',
45 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
46 | 'authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'
47 | }
48 |
49 | # Enable or disable spider middlewares
50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
51 | #SPIDER_MIDDLEWARES = {
52 | # 'zhihuuser.middlewares.ZhihuuserSpiderMiddleware': 543,
53 | #}
54 |
55 | # Enable or disable downloader middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
57 | #DOWNLOADER_MIDDLEWARES = {
58 | # 'zhihuuser.middlewares.MyCustomDownloaderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable extensions
62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
63 | #EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | #}
66 |
67 | # Configure item pipelines
68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
69 | ITEM_PIPELINES = {
70 | 'zhihuuser.pipelines.ZhihuuserPipeline': 300,
71 | }
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
94 |
95 | # switch the scheduler to scrapy_redis (left commented out in this copy)
96 | # SCHEDULER = "scrapy_redis.scheduler.Scheduler"
97 | # DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
--------------------------------------------------------------------------------
/zhihu.com/zhihuuser/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/zhihu.com/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/zhihu.com/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc
--------------------------------------------------------------------------------
/zhihu.com/zhihuuser/spiders/zhihu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 |
4 | import scrapy
5 | from scrapy import Request
6 | from ..items import ZhihuuserItem
7 |
8 | class ZhihuSpider(scrapy.Spider):
9 | name = "zhihu"
10 | allowed_domains = ["zhihu.com"]
11 | start_urls = ['http://www.zhihu.com/']
12 |
13 | start_user = 'excited-vczh'
14 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
15 | user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
16 |
17 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
18 | follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
19 |
20 | # follower (fan) list
21 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
22 | followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
23 |
24 | def start_requests(self):
25 | # url = 'https://www.zhihu.com/api/v4/members/wang-qing-qing-70-81?include=allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
26 | yield Request(self.user_url.format(user = self.start_user, include = self.user_query), self.parse_user)
27 | yield Request(self.follows_url.format(user = self.start_user, include = self.follows_query,offset = 0, limit = 20), callback = self.parse_follows)
28 |
29 | # user profile info
30 | def parse_user(self, response):
31 | result = json.loads(response.text)
32 | item = ZhihuuserItem()
33 | for field in item.fields:
34 | # copy the value whenever the item declares a field that appears in the response
35 | if field in result.keys():
36 | item[field] = result.get(field)
37 | yield item
38 |
39 | yield Request(self.follows_url.format(user = result.get('url_token'), include = self.follows_query, limit = 20, offset = 0), callback = self.parse_follows)
40 | yield Request(self.followers_url.format(user = result.get('url_token'), include = self.followers_query, limit = 20, offset = 0), callback = self.parse_followers)
41 |
42 |
43 | # followees (users this account follows)
44 | def parse_follows(self,response):
45 |
46 | results = json.loads(response.text)
47 |
48 | # make sure the 'data' key exists first
49 | if 'data' in results.keys():
50 | for result in results.get('data'):
51 | yield Request(self.user_url.format(user = result.get('url_token'), include = self.user_query), callback = self.parse_user)
52 |
53 | # grab the next-page link and keep processing the following pages
54 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
55 | next_page = results.get('paging').get('next')
56 | yield Request(next_page,self.parse_follows)
57 |
58 | # follower info
59 | def parse_followers(self, response):
60 |
61 | results = json.loads(response.text)
62 |
63 | # make sure the 'data' key exists first
64 | if 'data' in results.keys():
65 | for result in results.get('data'):
66 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
67 | callback=self.parse_user)
68 |
69 | # grab the next-page link and keep processing the following pages
70 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
71 | next_page = results.get('paging').get('next')
72 | yield Request(next_page, self.parse_followers)
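Note: parse_follows/parse_followers above assume the member-listing endpoints return JSON with a `data` list and a `paging` block; the keys below are the ones the code reads, while the concrete values are invented for illustration:

    # illustrative shape of a follows_url / followers_url response (values made up)
    example = {
        "data": [
            {"url_token": "some-user", "name": "...", "answer_count": 0},
        ],
        "paging": {
            "is_end": False,
            "next": "https://www.zhihu.com/api/v4/members/excited-vczh/followees?offset=20&limit=20",
        },
    }
    # every entry in "data" is re-queued through user_url -> parse_user,
    # and "paging.next" is followed until "is_end" becomes True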
--------------------------------------------------------------------------------
/zhihu/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = zhihuuser.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | #project = zhihuuser
12 | # deploy the project to a host (for distributed crawling) via scrapyd
13 | url = http://localhost:6800/addversion.json
14 | project = zhihuuser
--------------------------------------------------------------------------------
/zhihu/zhihuuser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/__init__.py
--------------------------------------------------------------------------------
/zhihu/zhihuuser/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/zhihu/zhihuuser/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/zhihu/zhihuuser/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/zhihu/zhihuuser/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 | from scrapy import Item,Field
10 |
11 | class ZhihuuserItem(Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | id = Field()
15 | name = Field()
16 | avatar_url = Field()
17 | user_type = Field()
18 | answer_count = Field()
19 | url = Field()
20 | url_token = Field()
21 | headline = Field()
22 |
23 |
--------------------------------------------------------------------------------
/zhihu/zhihuuser/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class ZhihuuserSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/zhihu/zhihuuser/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import pymongo
9 | class ZhihuuserPipeline(object):
10 | def __init__(self, mongo_uri, mongo_db):
11 | self.mongo_uri = mongo_uri
12 | self.mongo_db = mongo_db
13 |
14 | @classmethod
15 | def from_crawler(cls,crawler):
16 | return cls(
17 | mongo_uri = crawler.settings.get('MONGO_URI'),
18 | mongo_db = crawler.settings.get('MONGO_DATABASE','items')
19 | )
20 |
21 | def open_spider(self,spider):
22 | self.client = pymongo.MongoClient()
23 | self.db = self.client['zhihuuser']
24 |
25 | def close_spider(self,spider):
26 | self.client.close()
27 |
28 | def process_item(self, item, spider):
29 | # collection_name = item.__class__.__name__
30 | # self.db[collection_name].insert(dict(item))
31 | # de-duplicate: update the record if it already exists, otherwise insert it
32 | self.db['user'].update({'url_token':item['url_token']},{'$set':item},True)
33 | return item
34 |
--------------------------------------------------------------------------------
/zhihu/zhihuuser/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for zhihuuser project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'zhihuuser'
13 |
14 | SPIDER_MODULES = ['zhihuuser.spiders']
15 | NEWSPIDER_MODULE = 'zhihuuser.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | DEFAULT_REQUEST_HEADERS = {
43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | 'Accept-Language': 'en',
45 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
46 | 'authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'
47 | }
48 |
49 | # Enable or disable spider middlewares
50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
51 | #SPIDER_MIDDLEWARES = {
52 | # 'zhihuuser.middlewares.ZhihuuserSpiderMiddleware': 543,
53 | #}
54 |
55 | # Enable or disable downloader middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
57 | #DOWNLOADER_MIDDLEWARES = {
58 | # 'zhihuuser.middlewares.MyCustomDownloaderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable extensions
62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
63 | #EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | #}
66 |
67 | # Configure item pipelines
68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
69 | ITEM_PIPELINES = {
70 | 'zhihuuser.pipelines.ZhihuuserPipeline': 300,
71 | # scrapy_redis item pipeline for distributed crawling; left enabled, each machine ships the items it crawls over the network, which adds transfer load
72 | # comment this line out so that items crawled on each machine are not written to that store
73 | # 'scrapy_redis.pipelines.RedisPipeline':301
74 | }
75 |
76 | # Enable and configure the AutoThrottle extension (disabled by default)
77 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
78 | #AUTOTHROTTLE_ENABLED = True
79 | # The initial download delay
80 | #AUTOTHROTTLE_START_DELAY = 5
81 | # The maximum download delay to be set in case of high latencies
82 | #AUTOTHROTTLE_MAX_DELAY = 60
83 | # The average number of requests Scrapy should be sending in parallel to
84 | # each remote server
85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
86 | # Enable showing throttling stats for every response received:
87 | #AUTOTHROTTLE_DEBUG = False
88 |
89 | # Enable and configure HTTP caching (disabled by default)
90 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
91 | #HTTPCACHE_ENABLED = True
92 | #HTTPCACHE_EXPIRATION_SECS = 0
93 | #HTTPCACHE_DIR = 'httpcache'
94 | #HTTPCACHE_IGNORE_HTTP_CODES = []
95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
96 |
97 |
98 | # distributed crawling
99 |
100 | # switch the scheduler to scrapy_redis
101 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
102 | # enable redis-based request de-duplication
103 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
104 |
105 | # redis connection info
106 | REDIS_URL = 'redis://127.0.0.1:6379'
107 |
108 | # keep the request queue and fingerprints after the crawl finishes; rarely useful, defaults to False
109 | # SCHEDULER_PERSIST = True
110 | # flush the fingerprints and queue at the start of every crawl, i.e. start the crawl from scratch
111 | #SCHEDULER_FLUSH_ON_START = True
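Note: with SCHEDULER and DUPEFILTER_CLASS pointed at scrapy_redis, the request queue and the seen-request fingerprints live in the Redis instance at REDIS_URL, so several machines running the same spider share them. A small sketch for peeking at that shared state (key names are the scrapy_redis defaults `<spider>:requests` and `<spider>:dupefilter`; adjust if they were overridden):

    import redis

    r = redis.StrictRedis.from_url('redis://127.0.0.1:6379')
    print('queued requests  :', r.zcard('zhihu:requests'))    # default priority queue is a sorted set
    print('seen fingerprints:', r.scard('zhihu:dupefilter'))  # dupefilter is a plain set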
--------------------------------------------------------------------------------
/zhihu/zhihuuser/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/zhihu/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/zhihu/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc
--------------------------------------------------------------------------------
/zhihu/zhihuuser/spiders/zhihu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 |
4 | import scrapy
5 | from scrapy import Request
6 | from ..items import ZhihuuserItem
7 |
8 | class ZhihuSpider(scrapy.Spider):
9 | name = "zhihu"
10 | allowed_domains = ["zhihu.com"]
11 | start_urls = ['http://www.zhihu.com/']
12 |
13 | start_user = 'excited-vczh'
14 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
15 | user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
16 |
17 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
18 | follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
19 |
20 | # follower (fan) list
21 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
22 | followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
23 |
24 | def start_requests(self):
25 | # url = 'https://www.zhihu.com/api/v4/members/wang-qing-qing-70-81?include=allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
26 | yield Request(self.user_url.format(user = self.start_user, include = self.user_query), self.parse_user)
27 | yield Request(self.follows_url.format(user = self.start_user, include = self.follows_query,offset = 0, limit = 20), callback = self.parse_follows)
28 |
29 | # user profile info
30 | def parse_user(self, response):
31 | result = json.loads(response.text)
32 | item = ZhihuuserItem()
33 | for field in item.fields:
34 | # copy the value whenever the item declares a field that appears in the response
35 | if field in result.keys():
36 | item[field] = result.get(field)
37 | yield item
38 |
39 | yield Request(self.follows_url.format(user = result.get('url_token'), include = self.follows_query, limit = 20, offset = 0), callback = self.parse_follows)
40 | yield Request(self.followers_url.format(user = result.get('url_token'), include = self.followers_query, limit = 20, offset = 0), callback = self.parse_followers)
41 |
42 |
43 | # followees (users this account follows)
44 | def parse_follows(self,response):
45 |
46 | results = json.loads(response.text)
47 |
48 | # make sure the 'data' key exists first
49 | if 'data' in results.keys():
50 | for result in results.get('data'):
51 | yield Request(self.user_url.format(user = result.get('url_token'), include = self.user_query), callback = self.parse_user)
52 |
53 | # grab the next-page link and keep processing the following pages
54 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
55 | next_page = results.get('paging').get('next')
56 | yield Request(next_page,self.parse_follows)
57 |
58 | # follower info
59 | def parse_followers(self, response):
60 |
61 | results = json.loads(response.text)
62 |
63 | # make sure the 'data' key exists first
64 | if 'data' in results.keys():
65 | for result in results.get('data'):
66 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
67 | callback=self.parse_user)
68 |
69 | # grab the next-page link and keep processing the following pages
70 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
71 | next_page = results.get('paging').get('next')
72 | yield Request(next_page, self.parse_followers)
--------------------------------------------------------------------------------
/可视化文件显示程序.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/可视化文件显示程序.zip
--------------------------------------------------------------------------------
/基于python的turtle画出叮当猫.py:
--------------------------------------------------------------------------------
1 | import turtle
2 |
3 | turtle.speed(5)
4 | turtle.circle(50)
5 | turtle.begin_fill() # draw the head
6 | turtle.circle(85)
7 | turtle.fillcolor("blue")
8 | turtle.end_fill()
9 |
10 | # turtle.penup()
11 | # turtle.goto(0,20)
12 | # turtle.pendown()
13 |
14 | # turtle.begin_fill()
15 | # turtle.circle(35)
16 | # turtle.fillcolor("white")
17 | # turtle.end_fill()
18 |
19 | turtle.begin_fill() # draw the face
20 | turtle.circle(60)
21 | turtle.fillcolor("white")
22 | turtle.end_fill()
23 |
24 |
25 |
26 | turtle.penup()
27 | turtle.goto(-20,95) # draw the left eye socket
28 | turtle.pendown()
29 | turtle.begin_fill()
30 | turtle.circle(19)
31 | turtle.fillcolor("white")
32 | turtle.end_fill()
33 |
34 |
35 |
36 | turtle.penup() # draw the right eye socket
37 | turtle.goto(20,95)
38 | turtle.pendown()
39 | turtle.begin_fill()
40 | turtle.circle(19)
41 | turtle.fillcolor("white")
42 | turtle.end_fill()
43 |
44 | turtle.penup() # draw the left pupil
45 | turtle.goto(-8,111)
46 | turtle.pendown()
47 | turtle.begin_fill()
48 | turtle.fillcolor("black")
49 | turtle.circle(3)
50 | turtle.end_fill()
51 |
52 |
53 | turtle.penup() # draw the right pupil
54 | turtle.goto(8,111)
55 | turtle.pendown()
56 | turtle.begin_fill()
57 | turtle.fillcolor("black")
58 | turtle.circle(3)
59 | turtle.end_fill()
60 |
61 | turtle.penup() # draw the nose
62 | turtle.goto(0,85)
63 | turtle.pendown()
64 | turtle.begin_fill()
65 | turtle.circle(10)
66 | turtle.fillcolor("red")
67 | turtle.end_fill()
68 |
69 |
70 | turtle.goto(0,30) # draw the vertical line
71 |
72 | turtle.penup() # first whisker on the left
73 | turtle.goto(-20,70)
74 | turtle.pendown()
75 | turtle.goto(-45,80)
76 |
77 | turtle.penup() # second whisker on the left
78 | turtle.goto(-20,60)
79 | turtle.pendown()
80 | turtle.goto(-47,60)
81 |
82 | turtle.penup() # third whisker on the left
83 | turtle.goto(-20,50)
84 | turtle.pendown()
85 | turtle.goto(-47,40)
86 |
87 | turtle.penup() # third whisker on the right
88 | turtle.goto(20,50)
89 | turtle.pendown()
90 | turtle.goto(47,40)
91 |
92 |
93 | turtle.penup() # second whisker on the right
94 | turtle.goto(20,60)
95 | turtle.pendown()
96 | turtle.goto(47,60)
97 |
98 |
99 | turtle.penup() # first whisker on the right
100 | turtle.goto(20,70)
101 | turtle.pendown()
102 | turtle.goto(45,80)
103 |
104 | turtle.penup() # right arm, line 1
105 | turtle.goto(50,20)
106 | turtle.pendown()
107 | turtle.goto(100,-10)
108 |
109 |
110 | turtle.penup() # right arm, line 2
111 | turtle.goto(50,-20)
112 | turtle.pendown()
113 | turtle.goto(80,-40)
114 |
115 | turtle.begin_fill()
116 | turtle.goto(100,-10)
117 | turtle.goto(50,20)
118 | turtle.goto(50,-20)
119 | turtle.goto(80,-40)
120 | turtle.fillcolor("yellow")
121 | turtle.end_fill()
122 |
123 |
124 |
125 | turtle.penup() # right hand
126 | turtle.goto(100,-50)
127 | turtle.pendown()
128 | turtle.begin_fill()
129 | turtle.circle(20)
130 | turtle.fillcolor("blue")
131 | turtle.end_fill()
132 |
133 |
134 |
135 | turtle.penup() # left arm, line 1
136 | turtle.goto(-50,20)
137 | turtle.pendown()
138 | turtle.goto(-100,-10)
139 |
140 |
141 | turtle.penup() # left arm, line 2
142 | turtle.goto(-50,-20)
143 | turtle.pendown()
144 | turtle.goto(-80,-40)
145 |
146 | turtle.begin_fill()
147 | turtle.goto(-100,-10)
148 | turtle.goto(-50,20)
149 | turtle.goto(-50,-20)
150 | turtle.goto(-80,-40)
151 | turtle.fillcolor("yellow")
152 | turtle.end_fill()
153 |
154 | turtle.penup() # left hand
155 | turtle.goto(-100,-53)
156 | turtle.pendown()
157 | turtle.begin_fill()
158 | turtle.circle(20)
159 | turtle.fillcolor("blue")
160 | turtle.end_fill()
161 |
162 |
163 | turtle.penup() # left leg
164 | turtle.goto(-50,-20)
165 | turtle.pendown()
166 | turtle.goto(-50,-100)
167 |
168 | turtle.penup() # right leg
169 | turtle.goto(50,-20)
170 | turtle.pendown()
171 | turtle.goto(50,-100)
172 |
173 |
174 | turtle.begin_fill()
175 | turtle.penup()
176 | turtle.goto(50,-120)
177 | turtle.pendown()
178 | turtle.circle(10)
179 | turtle.fillcolor("blue")
180 | turtle.end_fill()
181 |
182 | turtle.begin_fill()
183 | turtle.goto(20,-120)
184 | turtle.circle(10)
185 | turtle.fillcolor("blue")
186 | turtle.end_fill()
187 |
188 |
189 | turtle.penup()
190 | turtle.goto(50,-100)
191 | turtle.pendown()
192 | turtle.goto(20,-100)
193 |
194 |
195 |
196 | turtle.penup()
197 | turtle.goto(-50,-120)
198 | turtle.pendown()
199 | turtle.begin_fill()
200 | turtle.circle(10)
201 | turtle.goto(-20,-120)
202 | turtle.circle(10)
203 | turtle.fillcolor("blue")
204 | turtle.end_fill()
205 |
206 | turtle.penup()
207 | turtle.goto(-20,-100)
208 | turtle.pendown()
209 | turtle.goto(-50,-100)
210 |
211 |
212 | turtle.penup()
213 | turtle.goto(-20,-100)
214 | turtle.pendown()
215 | turtle.goto(-20,-85)
216 |
217 | turtle.goto(20,-85)
218 | turtle.goto(20,-100)
219 |
220 | turtle.penup()
221 | turtle.goto(-50,-20)
222 | turtle.pendown()
223 |
224 | turtle.begin_fill()
225 | turtle.goto(50,-20)
226 | turtle.goto(50,-85)
227 | turtle.goto(-50,-85)
228 | turtle.goto(-50,-20)
229 | turtle.fillcolor("blue")
230 | turtle.end_fill()
231 |
232 |
233 | turtle.penup()
234 | turtle.goto(0,-20) # the bell
235 | turtle.pendown()
236 | turtle.begin_fill()
237 | turtle.circle(10)
238 | turtle.fillcolor("yellow")
239 | turtle.end_fill()
240 |
241 |
242 |
243 | turtle.penup()
244 | turtle.goto(-10,-10)
245 | turtle.pendown()
246 | turtle.goto(10,-10)
247 |
248 |
249 | turtle.penup()
250 | turtle.goto(-50,20)
251 | turtle.pendown()
252 | turtle.begin_fill()
253 | turtle.goto(50,20)
254 | turtle.goto(50,0)
255 | turtle.goto(-50,0)
256 | turtle.goto(-50,20)
257 | turtle.fillcolor("red")
258 | turtle.end_fill()
259 |
260 |
261 | turtle.penup()
262 | turtle.goto(50,0)
263 | turtle.pendown()
264 | turtle.begin_fill()
265 | turtle.circle(10)
266 | turtle.fillcolor("red")
267 | turtle.end_fill()
268 |
269 |
270 | turtle.penup()
271 | turtle.goto(-50,0)
272 | turtle.pendown()
273 | turtle.begin_fill()
274 | turtle.circle(10)
275 | turtle.fillcolor("red")
276 | turtle.end_fill()
277 |
278 |
279 | turtle.penup() #red shorts
280 | turtle.goto(-50,-70)
281 | turtle.pendown()
282 | turtle.begin_fill()
283 | turtle.goto(50,-70)
284 | turtle.goto(50,-50)
285 | turtle.goto(-50,-50)
286 | turtle.goto(-50,-70)
287 | turtle.fillcolor("red")
288 | turtle.end_fill()
289 |
290 | turtle.penup()
291 | turtle.goto(-10,-70)
292 | turtle.pendown()
293 | turtle.begin_fill()
294 | turtle.goto(-10,-85)
295 | turtle.goto(10,-85)
296 | turtle.goto(10,-70)
297 | turtle.goto(-10,-70)
298 | turtle.fillcolor("red")
299 | turtle.end_fill()
300 |
301 | turtle.penup()
302 | turtle.goto(-100,200)
303 | turtle.pendown()
304 | s = "机器猫中的战斗猫"
305 | turtle.write(s,font = ("Arial",20,"normal"))
306 |
307 |
308 | turtle.done()
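
Note: every filled shape above repeats the same penup, goto, pendown, begin_fill, circle, end_fill sequence. A small helper along the lines of the sketch below (the name filled_circle is ours, not part of the original script) would cut most of that repetition; it also sets fillcolor before begin_fill, which is the conventional order.

import turtle

def filled_circle(x, y, radius, color):
    """Move to (x, y) without drawing, then draw a circle filled with `color`."""
    turtle.penup()
    turtle.goto(x, y)
    turtle.pendown()
    turtle.fillcolor(color)
    turtle.begin_fill()
    turtle.circle(radius)
    turtle.end_fill()

# the two hands and the bell from the drawing above, redone with the helper
filled_circle(100, -50, 20, "blue")    # right hand
filled_circle(-100, -53, 20, "blue")   # left hand
filled_circle(0, -20, 10, "yellow")    # bell
turtle.done()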
--------------------------------------------------------------------------------
/基于python的turtle的桌面弹球.py:
--------------------------------------------------------------------------------
1 | from tkinter import *
2 | from random import randint
3 |
4 | def getRandomColor():
5 | color = "#"
6 | for j in range(6):
7 | color += toHexChar(randint(0,15))
8 | return color
9 | def toHexChar(hexValue):
10 | if 0 <= hexValue <= 9:
11 | return chr(hexValue + ord('0'))
12 | else:
13 | return chr(hexValue - 10 + ord("A"))
14 | class Ball:
15 | def __init__(self):
16 | self.x = 0
17 | self.y = 0
18 | self.dx = 2
19 | self.dy = 2
20 | self.radius = 3
21 | self.color = getRandomColor()
22 |
23 | class BounceBalls:
24 | def __init__(self):
25 | self.ballList = []
26 | win = Tk()
27 | win.title("Bouncing Balls")
28 |
29 | self.width = 350
30 | self.height = 150
31 | self.canvas = Canvas(win,bg = "white",width = self.width,height = self.height)
32 | self.canvas.pack()
33 |
34 |
35 | frame = Frame(win)
36 | frame.pack()
37 | btStop = Button(frame,text = "Stop",command = self.stop)
38 | btStop.pack(side = LEFT)
39 | btResume = Button(frame,text = "Resume",command = self.resume)
40 | btResume.pack(side = LEFT)
41 | btAdd = Button(frame,text = "+",command = self.add)
42 | btAdd.pack(side = LEFT)
43 | btRemove = Button(frame,text = "-",command = self.remove)
44 | btRemove.pack(side = LEFT)
45 |
46 |
47 | self.sleepTime = 100
48 | self.isStopped = False
49 | self.animate()
50 | win.mainloop()
51 | def stop(self):
52 | self.isStopped = True
53 | def resume(self):
54 | self.isStopped = False
55 | self.animate()
56 | def add(self):
57 | self.ballList.append(Ball())
58 |     def remove(self):
59 |         if self.ballList: self.ballList.pop()    # do nothing when there is no ball left
60 | def animate(self):
61 | while not self.isStopped:
62 | self.canvas.after(self.sleepTime)
63 | self.canvas.update()
64 | self.canvas.delete("ball")
65 |
66 | for ball in self.ballList:
67 | self.redisplayBall(ball)
68 | def redisplayBall(self,ball):
69 | if ball.x > self.width or ball.x < 0:
70 | ball.dx = -ball.dx
71 | if ball.y > self.height or ball.y < 0:
72 |             ball.dy = -ball.dy          # reverse the vertical direction at the top/bottom edge
73 | ball.x += ball.dx
74 | ball.y += ball.dy
75 | self.canvas.create_oval(ball.x - ball.radius,ball.y - ball.radius,ball.x + ball.radius,ball.y + ball.radius,fill = ball.color,tags = "ball")
76 | BounceBalls()
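
The animate loop above blocks inside `while not self.isStopped`, pausing with `canvas.after(self.sleepTime)` and forcing redraws with `canvas.update()`. Tk can also drive the animation itself through `after(delay, callback)`, which keeps the window responsive without an explicit loop. A minimal sketch of that pattern (the class name AfterLoopDemo and the single red ball are ours, only for illustration):

from tkinter import Tk, Canvas

class AfterLoopDemo:
    """One bouncing ball animated with after(ms, callback) instead of a blocking loop."""
    def __init__(self):
        win = Tk()
        self.width, self.height = 350, 150
        self.canvas = Canvas(win, bg="white", width=self.width, height=self.height)
        self.canvas.pack()
        self.x, self.y, self.dx, self.dy = 0, 0, 2, 2
        self.tick()                     # schedule the first frame
        win.mainloop()

    def tick(self):
        if self.x < 0 or self.x > self.width:
            self.dx = -self.dx
        if self.y < 0 or self.y > self.height:
            self.dy = -self.dy
        self.x += self.dx
        self.y += self.dy
        self.canvas.delete("ball")
        self.canvas.create_oval(self.x - 3, self.y - 3, self.x + 3, self.y + 3,
                                fill="red", tags="ball")
        self.canvas.after(100, self.tick)   # re-schedule; Tk's event loop stays in charge

AfterLoopDemo()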
--------------------------------------------------------------------------------
/基于python的turtle移动的小球.py:
--------------------------------------------------------------------------------
1 | from tkinter import *
2 |
3 | class MovingBall:
4 | def __init__(self):
5 | win = Tk()
6 | win.title("Moving Ball")
7 |
8 | self.width = 250
9 | self.canvas = Canvas(win,width = self.width,height = 200,bg = 'white')
10 | self.canvas.pack()
11 |
12 | frame = Frame(win)
13 | frame.pack()
14 | btLeft = Button(frame,text = "Left",command = self.LeftMoving )
15 | btLeft.pack()
16 | btRight = Button(frame,text = "Right",command = self.RightMoving)
17 | btRight.pack()
18 | btUp = Button(frame,text = "Up",command = self.UpMoving)
19 | btUp.pack()
20 | btDown = Button(frame,text = "Down",command = self.DownMoving)
21 | btDown.pack()
22 | self.x = 0
23 | self.y = 0
24 | self.canvas.create_oval(self.x,self.y,self.x + 10,self.y + 10,fill = "black",tags = "oval")
25 | win.mainloop()
26 |
27 | def LeftMoving(self):
28 | self.canvas.delete("oval")
29 | if self.x > 10:
30 | self.x -= 10
31 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval")
32 | else:
33 | self.x = 250
34 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval")
35 |
36 | def RightMoving(self):
37 | self.canvas.delete("oval")
38 | if self.x < 250:
39 | self.x += 10
40 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval")
41 | else:
42 | self.x = 0
43 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval")
44 | def UpMoving(self):
45 | self.canvas.delete("oval")
46 | if self.y > 10:
47 | self.y -= 10
48 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval")
49 | else:
50 | self.y = 200
51 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval")
52 |
53 | def DownMoving(self):
54 | self.canvas.delete("oval")
55 | if self.y < 200:
56 | self.y += 10
57 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval")
58 | else:
59 | self.y = 0
60 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval")
61 | MovingBall()
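
The four direction handlers above differ only in which coordinate changes, and each deletes and recreates the oval. tkinter's Canvas.move(tagOrId, dx, dy) shifts an existing item in place, so one lambda per button is enough. This is a sketch under the assumption that the wrap-around behaviour at the edges is not required (the class name MovingBallCompact is ours):

from tkinter import Tk, Canvas, Frame, Button, LEFT

class MovingBallCompact:
    """Move one oval with Canvas.move() instead of deleting and redrawing it."""
    def __init__(self):
        win = Tk()
        win.title("Moving Ball")
        self.canvas = Canvas(win, width=250, height=200, bg="white")
        self.canvas.pack()
        self.canvas.create_oval(0, 0, 10, 10, fill="black", tags="oval")
        frame = Frame(win)
        frame.pack()
        for text, dx, dy in (("Left", -10, 0), ("Right", 10, 0), ("Up", 0, -10), ("Down", 0, 10)):
            # bind dx/dy as default arguments so each button keeps its own offsets
            Button(frame, text=text,
                   command=lambda dx=dx, dy=dy: self.canvas.move("oval", dx, dy)).pack(side=LEFT)
        win.mainloop()

MovingBallCompact()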
--------------------------------------------------------------------------------
/抓取财富网股票信息.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import urllib.request
3 | import re
4 | import random
5 | import time
6 | # User-Agent pool used for scraping the pages below
7 | user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)',
8 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
9 | 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
10 | 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
11 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
12 | 'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
13 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
14 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
15 | 'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
16 | 'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
17 | 'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
18 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
19 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
20 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
21 | 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
22 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
23 | 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
24 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
25 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
26 | 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
27 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
28 | stock_total=[]  # stock_total: stock data from every page; stock_page: stock data from one page
29 | for page in range(1,8):
30 | url='http://quote.stockstar.com/stock/ranklist_a_3_1_'+str(page)+'.html'
31 |     request=urllib.request.Request(url=url,headers={"User-Agent":random.choice(user_agent)})  # pick a random User-Agent from the list above
32 | try:
33 |         response=urllib.request.urlopen(request)
34 |     except (urllib.error.HTTPError, urllib.error.URLError) as e:  # error handling
35 |         print('page=',page,'',e)
36 |         continue                          # give up on this page and move on to the next one
37 |     # read the page body (the site serves GBK-encoded HTML)
38 |     content=response.read().decode('gbk')
39 |     print('get page',page)                # report the page that was fetched successfully
40 |     pattern=re.compile('<tbody[\s\S]*</tbody>')   # grab the table body that holds the quotes
41 | body=re.findall(pattern,str(content))
42 | # for i in body:
43 | # print(i)
44 | pattern=re.compile('>(.*?)<')
45 | stock_page=re.findall(pattern,body[0]) #正则匹配
46 | # print(stock_page)
47 | stock_total.extend(stock_page)
48 | # print(stock_total)
49 |     time.sleep(random.randrange(1,4))   # sleep a random few seconds after each page; tune as needed
50 | # drop empty strings
51 | stock_last=stock_total[:]  # stock_last is the final, cleaned stock data
52 | for data in stock_total:
53 | if data=='':
54 | stock_last.remove('')
55 | # print(stock_last)
56 | # print part of the results
57 | print('代码','\t','简称',' ','\t','最新价','\t','涨跌幅','\t','涨跌额','\t','5分钟涨幅')
58 | for i in range(0, len(stock_last) - 5, 6):   # each stock occupies six consecutive cells
59 | print(format(stock_last[i],"6s"),'\t',format(stock_last[i+1],"6s"),' ','\t',format(stock_last[i+2],"6s"),' ','\t',format(stock_last[i+3],"6s"),' ','\t',format(stock_last[i+4],"6s"),' ','\t',format(stock_last[i+5],"6s"))
60 |
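
Because stock_last is a flat list in which every six consecutive cells describe one stock (code, name, latest price, change %, change, 5-minute change), the final printing step is easier to read if the list is first cut into rows. A small chunking sketch with made-up sample data (the helper name chunk is ours):

def chunk(flat, size=6):
    """Yield consecutive rows of `size` cells from a flat list of table cells."""
    for start in range(0, len(flat) - size + 1, size):
        yield flat[start:start + size]

# hypothetical sample shaped like stock_last: code, name, latest price, change %, change, 5-minute change
sample = ['600000', '浦发银行', '12.34', '1.20%', '0.15', '0.30%']
for code, name, price, pct, change, pct5 in chunk(sample):
    print(code, name, price, pct, change, pct5, sep='\t')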
--------------------------------------------------------------------------------
/爬取12306车票信息.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | import ssl
3 | from urllib import parse
4 | import re
5 | import requests
6 | import json
7 | import urllib
8 |
9 | #
10 | ssl._create_default_https_context = ssl._create_unverified_context
11 | # headers = {
12 | # 'Cookie':'JSESSIONID=95820ECC00B038495AC43E949F6D4A69; route=6f50b51faa11b987e576cdb301e545c4; BIGipServerotn=351273482.64545.0000; _jc_save_fromStation=%u5317%u4EAC%2CBJP; _jc_save_toStation=%u5929%u6D25%2CTJP; _jc_save_fromDate=2017-10-25; _jc_save_toDate=2017-10-20; _jc_save_wfdc_flag=dc',
13 | # 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
14 | # }
15 |
16 | # fetch the full list of stations (station_name.js)
17 | def get_station():
18 | url = 'http://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9028'
19 | response = urllib.request.urlopen(url).read()
20 | # print(response)
21 | return response.decode("utf-8")
22 |
23 | # look up the telecodes of the departure and arrival stations
24 | def station(stationinfo,star,end):
25 | str2 = stationinfo[20:][:-2]
26 | str3 = str2.split('|')
27 | order1 = str3.index(star)
28 | order2 = str3.index(end)
29 | starstation = str3[int(order1) + 1]
30 | endstation= str3[int(order2) + 1]
31 | return starstation,endstation
32 |
33 | # query the trains running between the two stations on the given date
34 | def getTrainInfo(start,end,date):
35 |
36 | # params = {
37 | train_date = date
38 | from_station = start
39 | to_station = end
40 | purpose_codes = 'ADULT'
41 | # }
42 | url = 'https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date={}&leftTicketDTO.from_station={}&leftTicketDTO.to_station={}&purpose_codes={}'.format(train_date,from_station,to_station,purpose_codes)
43 | # print(url)
44 | headers = {
45 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
46 | 'Cookie':'JSESSIONID=B201655CD8BCF12D53ADF6CA6D2AA050; route=495c805987d0f5c8c84b14f60212447d; BIGipServerotn=770703882.38945.0000; BIGipServerpool_passport=367854090.50215.0000; _jc_save_fromStation=%u5317%u4EAC%2CBJP; _jc_save_toStation=%u5929%u6D25%2CTJP; _jc_save_fromDate=2017-10-25; _jc_save_toDate=2017-10-21; _jc_save_wfdc_flag=dc'
47 |
48 | }
49 | response = urllib.request.Request(url,headers=headers)
50 | response = urllib.request.urlopen(response).read()
51 | # response = response.urlopen()
52 | return response.decode("utf-8")
53 |
54 | # query ticket prices and print the full information for each train
55 | def getTicketInfo(getTrainInfos,train_date,stationinfo):
56 | # print(getTrainInfos)
57 | getTrainInfos = json.loads(getTrainInfos).get('data').get('result')
58 |
59 | for getTrainInfo in getTrainInfos:
60 | order3 = getTrainInfo.split('|')
61 | train_no = order3[2]
62 | seat_types = str(order3[-1:])[2:5]
63 | if len(seat_types) != 3 :
64 | continue
65 | from_station_no = str(order3[1:][15])
66 | to_station_no = str(order3[1:][16])
67 | url = 'https://kyfw.12306.cn/otn/leftTicket/queryTicketPrice?train_no={}&from_station_no={}&to_station_no={}&seat_types={}&train_date={}'.format(train_no,from_station_no,to_station_no,seat_types,train_date)
68 | # url = 'https://kyfw.12306.cn/otn/leftTicket/queryTicketPrice?train_no=26000K772632&from_station_no=10&to_station_no=11&seat_types=113&train_date=2017-10-25'
69 |         headers = {
70 |             'Accept': '*/*',
71 |             'Accept-Encoding': 'gzip, deflate, br',
72 |             'Accept-Language': 'zh-CN,zh;q=0.8',
73 |             'Cache-Control': 'no-cache',
74 |             'Connection': 'keep-alive',
75 |             'Host': 'kyfw.12306.cn',
76 |             'If-Modified-Since': '0',
77 |             'Referer': 'https://kyfw.12306.cn/otn/leftTicket/init',
78 |             'X-Requested-With': 'XMLHttpRequest',
79 |             'Cookie': 'JSESSIONID=B201655CD8BCF12D53ADF6CA6D2AA050;route=495c805987d0f5c8c84b14f60212447d;BIGipServerotn=770703882.38945.0000;BIGipServerpool_passport=367854090.50215.0000;_jc_save_fromStation=%u5317%u4EAC%2CBJP;_jc_save_toStation=%u5929%u6D25%2CTJP;_jc_save_fromDate=2017-10-25;_jc_save_toDate=2017-10-21;_jc_save_wfdc_flag=dc',
80 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
81 |         }
82 |         response = urllib.request.urlopen(urllib.request.Request(url, headers=headers)).read()   # send the request with the headers above
83 | datas = response.decode("utf-8")
84 | com = re.compile('({.*?}})')
85 | datas = com.findall(datas)
86 | for data in datas:
87 | if len(data) > 30:
88 | data = json.loads(data)
89 | # print(data)
90 | datas = data.get('data')
91 | print("------------------本次列车-----------------------------------")
92 | print('本次列车', order3[3])
93 |
94 | str2 = stationinfo[20:][:-2]
95 | str3 = str2.split('|')
96 | order1 = str3.index(order3[4])
97 | order2 = str3.index(order3[7])
98 | starstation = str3[int(order1) - 1]
99 | endstation = str3[int(order2) - 1]
100 |
101 | print('出发站点', starstation)
102 | print('到达站点', endstation)
103 | print('出发时间', order3[8])
104 | print('到达时间', order3[9])
105 | print('历时时间', order3[10])
106 | # print(type(datas))
107 | for k in datas:
108 |
109 | if k == 'A9':
110 | print('商务座特等座',":",datas[k])
111 | elif k == 'M':
112 | print("一等座",":",datas[k])
113 | elif k == 'O':
114 | print("二等座",":",datas[k])
115 | elif k == 'WZ':
116 | print("无座",":",datas[k])
117 | elif k == 'A4':
118 | print("软卧", ":", datas[k])
121 | elif k == 'F':
122 | print("动卧", ":", datas[k])
123 | elif k == 'A3':
124 | print("硬卧", ":", datas[k])
125 | elif k == 'A1':
126 | print("硬座", ":", datas[k])
127 | elif k == 'A6':
128 | print("高级软卧", ":", datas[k])
129 | elif k == 'OT':
130 | print("其他", ":", datas[k])
131 |
132 | if __name__ == "__main__":
133 | start = input('出发车站:')
134 | end = input('到达车站:')
135 | date = input("出发时间(如2017.10.25):")
136 |     # normalize the date format (2017.10.25 -> 2017-10-25)
137 | date = date.replace('.','-')
138 | stationinfo = get_station()
139 | starstation,endstation = station(stationinfo,start,end)
140 |     train_info = getTrainInfo(starstation,endstation,date)
141 |     getTicketInfo(train_info,date,stationinfo)
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
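
Both station() and getTicketInfo walk the station_name.js payload by index: after splitting on '|', a station's telecode sits right after its Chinese name, and the name sits right before its telecode. Building the two lookups once makes that relationship explicit. This is a sketch that assumes the same '@abbr|name|TELECODE|pinyin|abbr|...' layout the script already relies on; build_station_maps and the two dict names are ours.

def build_station_maps(stationinfo):
    """Build name -> telecode and telecode -> name maps from the station_name.js payload."""
    fields = stationinfo[20:][:-2].split('|')      # same slicing station() itself uses
    name_to_code, code_to_name = {}, {}
    for i, field in enumerate(fields):
        if '@' in field and i + 2 < len(fields):   # '@abbr' (or 'N@abbr') starts one station record
            name, code = fields[i + 1], fields[i + 2]
            name_to_code[name] = code
            code_to_name[code] = name
    return name_to_code, code_to_name

# usage against the functions above:
# name_to_code, code_to_name = build_station_maps(get_station())
# print(name_to_code.get('北京'), code_to_name.get('BJP'))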
--------------------------------------------------------------------------------
/爬取qq音乐歌曲/爬取扣扣音乐文件.py:
--------------------------------------------------------------------------------
1 | import re
2 | import json
3 | import time
4 | import random
5 | import requests
6 | import urllib
7 | import time
8 | import codecs
9 | import urllib3
10 | def songmid():
11 | mid = []
12 | name = []
13 |     url = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp?ct=24&qqmusic_ver=1298&new_json=1&remoteplace=sizer.yqq.song_next&searchid=148958880434449513&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&p=1&n=20&w=%E4%BA%94%E6%9C%88%E5%A4%A9&g_tk=1989554541&jsonpCallback=searchCallbacksong5150&loginUin=1093211972&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0"
14 | response = requests.get(url)
15 | if json.loads(response.text[23:-1])['data']['song']['curnum'] and json.loads(response.text[23:-1])['data']['song']['curpage']:
16 | # if True:
17 | for i in range(20):
18 | # print(i)
19 | jsonpcallback = "searchCallbacksong"+str((random.randint(1000,10000)))
20 | if i == 0:
21 | remoteplace = "txt.yqq.song"
22 | else:
23 | remoteplace = "sizer.yqq.song_next"
24 | # print(i)
25 | params= {
26 | 'ct': "24",
27 |                 'qqmusic_ver': "1298",
28 | 'new_json': "1",
29 | 'remoteplace': "sizer.yqq.song_next",
30 | 'searchid': "148958880434449513",
31 | 't': "0",
32 | 'aggr': "1",
33 | 'cr': "1",
34 | 'catzhida': "1",
35 | 'lossless': "0",
36 | 'flag_qc': "0",
37 | 'p': i+1,
38 | 'n': str(json.loads(response.text[23:-1])['data']['song']['curnum']),
39 | # 'n': 20,
40 | 'w': "%E4%BA%94%E6%9C%88%E5%A4%A9",
41 | 'g_tk': "1989554541",
42 | 'jsonpcallback': jsonpcallback,
43 | 'loginuin': "1093211972",
44 | 'hostuin': "0",
45 | 'format': "jsonp",
46 | 'incharset': "utf8",
47 | 'outcharset': "utf-8",
48 | 'notice': "0",
49 | 'platform': "yqq",
50 | 'neednewcode': "0",
51 | 'cache-control': "no-cache",
52 | }
53 | # url2 = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp?"
54 | url2 = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp"
55 | response2 = requests.get(url2,params=params)
56 | # print(json.loads(response2.text[9:-1]))
57 |             for song in json.loads(response2.content[9:-1])['data']['song']['list']:
58 |                 if song['file']['media_mid']:
59 |                     mid.append(song['file']['media_mid'])
60 |                     name.append(song['name'])
61 | print(set(mid))
62 | print(len(set(mid)))
63 | return mid,name
64 | url = []
65 | file = codecs.open('audio2.txt','w')
66 | def resolve(songmids,name):
67 |
68 | for i in range(len(songmids)):
69 | filename = 'C400' + songmids[i] + '.m4a'
70 | # print(songmids[i])
71 | guid = int(random.random() * 2147483647) * int(time.time() * 1000) % 10000000000
72 |
73 | d = {
74 | 'format': 'json',
75 | 'cid': 205361747,
76 | 'uin': 0,
77 | 'songmid': songmids[i],
78 | 'filename': filename,
79 | 'guid': guid,
80 | 'g_tk':5381,
81 | 'loginUin':0,
82 | 'hostUin':0,
83 | 'notice': '0',
84 | 'platform':'yqq',
85 | 'needNewCode':'0',
86 | }
87 |         headers = {
88 |             'User-Agent': "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/57.0"
89 |         }
90 |         r = requests.get('https://c.y.qq.com/base/fcgi-bin/fcg_music_express_mobile3.fcg', params=d, headers=headers)
91 | try:
92 | vkey = json.loads(r.text)['data']['items'][0]['vkey']
93 |         except (KeyError, IndexError, ValueError):   # skip songs whose vkey cannot be parsed
94 |             continue
95 | if vkey:
96 | audio_url = 'http://dl.stream.qqmusic.qq.com/%s?vkey=%s&guid=%s&uin=0&fromtag=66' % (filename, vkey, guid)
97 | time.sleep(random.random()*1)
98 | url.append(audio_url)
99 | file.write(audio_url+'\n')
100 | if __name__ == "__main__":
101 | songmids,name =songmid()
102 | resolve(songmids,name)
103 | file.close()
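
resolve() only records the signed stream URLs in audio2.txt. If the goal is to save the audio itself, one option (a sketch, not part of the original script; the download helper and the file-naming scheme are ours, and song names are assumed to be valid file names) is to stream each URL to disk with requests:

import requests

def download(urls, names):
    """Stream each resolved audio URL into an .m4a file named after the song."""
    for audio_url, song_name in zip(urls, names):
        resp = requests.get(audio_url, stream=True)
        if resp.status_code != 200:            # skip URLs whose vkey was rejected or has expired
            continue
        with open(song_name + '.m4a', 'wb') as f:
            for piece in resp.iter_content(chunk_size=64 * 1024):
                f.write(piece)

# usage with the lists built above:
# download(url, name)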
--------------------------------------------------------------------------------