├── Python生成验证码文字变图片.py ├── Python的Web和数据分析学习图谱 ├── Django.png ├── flask.png └── 数据分析算法合集.png ├── README.md ├── lagouSpider.py ├── scrapy爬虫简单项目 ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── python实现有道词典.iml │ └── workspace.xml ├── 2.py ├── Qqnews │ ├── Qqnews │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── Qqnews_spider.py │ │ │ ├── __init__.py │ │ │ └── __pycache__ │ │ │ ├── Qqnews_spider.cpython-36.pyc │ │ │ └── __init__.cpython-36.pyc │ └── scrapy.cfg ├── Yustneirong │ ├── Yustneirong │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── settings.pyc │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── __init__.pyc │ └── scrapy.cfg ├── dbtop250 │ ├── dbtop250 │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── dbtop250_spider.cpython-36.pyc │ │ │ └── dbtop250_spider.py │ └── scrapy.cfg ├── douban │ ├── douban │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── douban_spider.cpython-36.pyc │ │ │ └── douban_spider.py │ └── scrapy.cfg ├── ip地址查询工具.py ├── lagou-scrapy │ ├── .idea │ │ ├── dictionaries │ │ │ └── .xml │ │ ├── lagou.iml │ │ ├── misc.xml │ │ ├── modules.xml │ │ └── workspace.xml │ ├── dump.rdb │ ├── geckodriver.log │ ├── lagou │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── middlewares │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── useragent.cpython-36.pyc │ │ │ └── useragent.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── lagouspider.cpython-36.pyc │ │ │ └── lagouspider.py │ └── scrapy.cfg ├── python爬虫实现有道词典.py ├── taobaoclass │ ├── scrapy.cfg │ └── taobaoclass │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ └── __init__.cpython-36.pyc │ │ └── taobao_spider.py ├── tutorial │ ├── scrapy.cfg │ └── tutorial │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── settings.pyc │ │ └── spiders │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── dmoz_spider.py │ │ ├── dmoz_spider.pyc │ │ ├── mydomain.py │ │ └── mydomain.pyc └── zaobao │ ├── scrapy.cfg │ └── zaobao │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── items.cpython-36.pyc │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ 
├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── zaobao_spider.cpython-36.pyc │ └── zaobao_spider.py ├── zhihu.com ├── scrapy.cfg └── zhihuuser │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── items.cpython-36.pyc │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── zhihu.cpython-36.pyc │ └── zhihu.py ├── zhihu ├── scrapy.cfg └── zhihuuser │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── items.cpython-36.pyc │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── zhihu.cpython-36.pyc │ └── zhihu.py ├── 可视化文件显示程序.zip ├── 基于python的turtle画出叮当猫.py ├── 基于python的turtle的桌面弹球.py ├── 基于python的turtle移动的小球.py ├── 抓取财富网股票信息.py ├── 爬取12306车票信息.py └── 爬取qq音乐歌曲 ├── audio2.txt └── 爬取扣扣音乐文件.py /Python生成验证码文字变图片.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pygame 3 | from pygame.locals import * 4 | 5 | 6 | 7 | pygame.init() 8 | text = u"1234" 9 | 10 | font = pygame.font.SysFont("Microsoft YaHei",64) 11 | ftext = font.render(text,True,(65,83,130),(255,255,255)) 12 | pygame.image.save(ftext,"D:/pythontab.jpg") -------------------------------------------------------------------------------- /Python的Web和数据分析学习图谱/Django.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/Python的Web和数据分析学习图谱/Django.png -------------------------------------------------------------------------------- /Python的Web和数据分析学习图谱/flask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/Python的Web和数据分析学习图谱/flask.png -------------------------------------------------------------------------------- /Python的Web和数据分析学习图谱/数据分析算法合集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/Python的Web和数据分析学习图谱/数据分析算法合集.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python 2 | python-爬虫-web-数据分析 3 | -------------------------------------------------------------------------------- /lagouSpider.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | import urllib 5 | import requests 6 | from bs4 import BeautifulSoup 7 | import time 8 | import csv 9 | import codecs 10 | from selenium import webdriver 11 | 12 | 13 | headers = { 14 | 'Accept':'application/json, text/javascript, */*; q=0.01', 15 | 'Accept-Encoding':'gzip, deflate, br', 16 | 'Accept-Language':'zh-CN,zh;q=0.8', 17 | 'Connection':'keep-alive', 18 | 'Content-Length':'25', 19 | 'Content-Type':'application/x-www-form-urlencoded; 20 | 'Cookie':'填上cookie信息', 21 | 'Host':'www.lagou.com', 22 | 'Origin':'https://www.lagou.com', 23 | 'Referer':"h'ttps://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC", 24 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, 
like Gecko) Chrome/61.0.3163.100 Safari/537.36', 25 | 'X-Anit-Forge-Code':'0', 26 | 'X-Anit-Forge-Token':'None', 27 | 'X-Requested-With':'XMLHttpRequest' 28 | } 29 | 30 | #访问网页 获取所有的json数据 31 | def post(url,para,headers=None,proxy=None,timeOut=5,timeOutRetry=5): 32 | if not url or not para: 33 | print("PostError url or para not exit") 34 | print("11111111111111") 35 | return None 36 | try: 37 | if not headers: 38 | headers=headers 39 | response = requests.post(url,data=para,headers=headers) 40 | print(response.status_code) 41 | 42 | print(response.text) 43 | if response.status_code == 200 or response.status_code == 302: 44 | htmlCode = response.text 45 | # print('1111111111') 46 | else: 47 | print("2222222222222") 48 | htmlCode = None 49 | except Exception as e: 50 | if timeOutRetry > 0: 51 | htmlCode = post(url=url,para=para,timeOutRetry=(timeOutRetry-1)) 52 | print('3333333333333333333333333333') 53 | htmlCode = None 54 | return htmlCode 55 | 56 | # url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0' 57 | # url = 'https://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC' 58 | #对获取的json数据进行处理,获取自己需要的信息,获取每个职位数据页数, 59 | def getinfo(url,para): 60 | 61 | htmlCode = post(url,para=para,headers=headers) #获取到网页源码,一大堆的json数据 62 | if htmlCode == None: 63 | return False 64 | companies = json.loads(htmlCode).get('content').get('positionResult').get('result') 65 | totalCount = json.loads(htmlCode).get('content').get('positionResult').get('totalCount') 66 | pagesize = json.loads(htmlCode).get('content').get('pageSize') 67 | pages = 0 68 | if int(totalCount)%int(pagesize) == 0: 69 | pages = int(int(totalCount)/int(pagesize)) 70 | else: 71 | pages = int(int(totalCount) // int(pagesize)) + 1 72 | 73 | return pages,companies 74 | 75 | #写入文件中,不同的职位保存在不同的文件 76 | def writeCsv(filename,companies): 77 | info = {} 78 | csv_file = codecs.open(filename+'.csv', 'ab', 'utf-8', 'ignore') 79 | csv_writer = csv.writer(csv_file) 80 | for i in companies: 81 | info['公司名字'] = i['companyFullName'] #公司名字 82 | # print(info['公司名字']) 83 | info['公司城市'] = i['city'] #职位城市 84 | info['招聘职位'] = i['positionName'] #招聘职位 85 | info['发布时间'] = i['formatCreateTime'] #发布时间 86 | info['薪资待遇'] = i['salary'] #薪资待遇 87 | info['经验要求'] = i['workYear'] #经验要求 88 | info['公司大小'] = i['companySize'] #公司大小 89 | info['公司福利'] = i['positionAdvantage'] #公司福利 90 | info['公司地址'] = i['district'] #公司地址 91 | # print(info) 92 | csv_writer.writerow([i['companyFullName'],i['city'],i['positionName'],i['formatCreateTime'],i['salary'], 93 | i['workYear'],i['companySize'],i['positionAdvantage'],i['district']]) 94 | 95 | 96 | 97 | #获取所有的职位信息 98 | def occupation(): 99 | url = "https://www.lagou.com/" 100 | response = requests.get(url) 101 | soup = BeautifulSoup(response.text, 'html.parser') 102 | ds = soup.find_all("div", attrs=["_class", "menu_sub dn"]) 103 | 104 | occupation_list = [] 105 | for h in ds: 106 | for g in h.find_all('dd'): 107 | for l in g: 108 | if l.string != "\n": 109 | occupation_list.append(l.string) 110 | 111 | # print(occupation_list) 112 | # print(len(occupation_list)) 113 | return occupation_list 114 | 115 | #获取热门城市这些职位的信息 116 | if __name__ == '__main__': 117 | occu_list = occupation() 118 | city_list = ['北京','上海','深圳','广州','杭州','成都','南京','武汉','西安','厦门','长沙','苏州','天津'] 119 | for l in occu_list[:]: 120 | print(l) 121 | for j in city_list: 122 | url = 'https://www.lagou.com/jobs/positionAjax.json?' 
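# Note on the block below (editorial comment, not part of the original file): positionAjax.json
# is queried with POST form data in which 'kd' carries the job keyword, 'city' the city name and
# 'pn' the page number. getinfo() returns the total page count plus the first page of results;
# the inner loop then re-posts with an incremented 'pn', sleeping a random 0-5 s between requests,
# and appends every page to a per-keyword CSV via writeCsv().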
123 | para = {'px': 'default','city':j,'needAddtionalResult': 'false', 'isSchoolJob': 0, 'first': 'true', 'pn': '1', 124 | 'kd':l} 125 | pages,companies = getinfo(url,para) 126 | for i in range(pages): 127 | para['pn'] = str(i+1) 128 | time.sleep(random.random()*5) 129 | print('开始爬取第%s页'%str(i+1)) 130 | try: 131 | pages,companies = getinfo(url,para) 132 | except: 133 | continue 134 | # fina = writeCsv(companies) 135 | if companies == None: 136 | break 137 | writeCsv(l,companies) 138 | # csv_writer.writerow(fina) 139 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/.idea/python实现有道词典.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 38 | 39 | 40 | 67 | 68 | 69 | 70 | 71 | true 72 | DEFINITION_ORDER 73 | 74 | 75 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | Python 89 | 90 | 91 | 92 | 93 | PyUnresolvedReferencesInspection 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 156 | 157 | 174 | 175 | 192 | 193 | 210 | 211 | 228 | 229 | 248 | 249 | 250 | 251 | 252 | 265 | 266 | 279 | 280 | 297 | 298 | 310 | 311 | project 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 346 | 347 | 366 | 367 | 388 | 389 | 411 | 412 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 1508592887941 463 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 503 | 506 | 507 | 508 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 
649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/2.py: -------------------------------------------------------------------------------- 1 | str1 = "/**/jQuery110205057557444126394_1484574357057(" 2 | print(len(str1)) 3 | 4 | "https://ssl.captcha.qq.com/cap_union_new_getcapbysig?aid=522005705&asig=&captype=&protocol=https&clientype=2&disturblevel=&apptype=2&curenv=inner&ua=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV09XNjQpIEFwcGxlV2ViS2l0LzUzNy4zNiAoS0hUTUwsIGxpa2UgR2Vja28pIENocm9tZS82MS4wLjMxNjMuMTAwIFNhZmFyaS81MzcuMzY=&sess=1iW5KCYL2DiqaiJy8K76Am6iwXvqJkGKpwLsItwLEpjoWwy0G0R3y_t1YKNzrr-Ts5j2Knkgh2qfBoWdWYmHDY_tiQXBpB2vT7ttfysXWlz-JltnuOA33JN14umsk_q0oYq3ITlJNR02RDPd_JRNP0iQeNZe8JMMv3x8BD_Sqi-38jNGuIVSD-EZkLDrjztCENIt15GWQCs*&theme=&noBorder=noborder&fb=1&showtype=embed&uid=123456&cap_cd=Kz3KLjvqeqsYRc0aLobTgXc2UjrnVE-vhPOEpygni5x_9E6HTuxT9Q**&lang=2052&rnd=150167&rand=0.482505701756349&vsig=gvzE39T_XEWYaq6gx4EBY250WYUlIgIL_2ypn6K_iE-O-d3Iwb_2XEr5XegIpAikh4qANjt3pf3yvnESAd95nV1qemP9M1hu9800zE1wEvXls0T5ulqE4Die4uYOfQM_J&ischartype=1" -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/settings.cpython-36.pyc 
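A side note on lagouSpider.py earlier in this listing: its getinfo() derives the number of result pages from totalCount and pageSize by hand and unpacks the payload with repeated json.loads calls. A compact sketch of the same request and page-count calculation, with a hypothetical helper name and the cookie-filled headers dict from the script assumed to be supplied by the caller, could look like this:

import math
import requests

def fetch_positions(keyword, city, page, headers):
    """POST one page of the Lagou position listing and unpack the JSON payload."""
    url = 'https://www.lagou.com/jobs/positionAjax.json?'
    para = {'px': 'default', 'city': city, 'needAddtionalResult': 'false',
            'isSchoolJob': 0, 'first': 'true', 'pn': str(page), 'kd': keyword}
    response = requests.post(url, data=para, headers=headers, timeout=5)
    response.raise_for_status()
    content = response.json()['content']
    companies = content['positionResult']['result']
    pages = math.ceil(content['positionResult']['totalCount'] / content['pageSize'])
    return pages, companies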
-------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class QqnewsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | date = scrapy.Field() 16 | author = scrapy.Field() 17 | content = scrapy.Field() 18 | 19 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class QqnewsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from scrapy.conf import settings 9 | class QqnewsPipeline(object): 10 | 11 | def __init__(self): 12 | MONGODN_HOST = settings['MONGODB_HOST'] 13 | MONGODB_PORT = settings['MONGODB_PORT'] 14 | dbName = settings['MONGODB_DBNAME'] 15 | MONGODB_CNAME = settings['MONGODB_CNAME'] 16 | client = pymongo.MongoClient(host=MONGODN_HOST,port=MONGODB_PORT) 17 | tdb = client[dbName] 18 | self.post = tdb[MONGODB_CNAME] 19 | def process_item(self, item, spider): 20 | news = dict(item) 21 | self.post.insert(news) 22 | return item 23 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Qqnews project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Qqnews' 13 | 14 | SPIDER_MODULES = ['Qqnews.spiders'] 15 | NEWSPIDER_MODULE = 'Qqnews.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 20 | 21 | 22 | MONGODB_HOST = '127.0.0.1' 23 | MONGODB_PORT = 27017 24 | MONGODB_DBNAME = 'QQnews' 25 | MONGODB_CNAME = 'military' 26 | 27 | # Obey robots.txt rules 28 | ROBOTSTXT_OBEY = True 29 | 30 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 31 | #CONCURRENT_REQUESTS = 32 32 | 33 | # Configure a delay for requests for the same website (default: 0) 34 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 35 | # See also autothrottle settings and docs 36 | #DOWNLOAD_DELAY = 3 37 | # The download delay setting will honor only one of: 38 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 39 | #CONCURRENT_REQUESTS_PER_IP = 16 40 | 41 | # Disable cookies (enabled by default) 42 | COOKIES_ENABLED = True 43 | 44 | # Disable Telnet Console (enabled by default) 45 | #TELNETCONSOLE_ENABLED = False 46 | 47 | # Override the default request headers: 48 | DEFAULT_REQUEST_HEADERS = { 49 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 50 | 'Accept-Language': 'en', 51 | 'RK=7SNngcUONh':'pgv_pvi=5307348992; pgv_si=s9438232576; ptui_loginuin=123456; logout_page=; dm_login_weixin_rem=; qm_authimgs_id=0; qm_verifyimagesession=h0136aa52fede833d6b76fb7cfff4ac53f406c36fd8b73221c205f4c89e3aed6341ff952f57983a3247; ptisp=cnc; ptcz=369283ac413194e379e190071988967a9e0f56822f9e55c5fb3f7d01cf7f0fd9; pt2gguin=o0252943669; uin=o0252943669; skey=@zCZ8lcmdT; 
qqmusic_uin=; qqmusic_key=; qqmusic_fromtag=; pgv_info=ssid=s4702440319; pgv_pvid=4169365884; o_cookie=252943669; pac_uid=1_252943669' 52 | } 53 | 54 | # Enable or disable spider middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 56 | #SPIDER_MIDDLEWARES = { 57 | # 'Qqnews.middlewares.QqnewsSpiderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable downloader middlewares 61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 62 | #DOWNLOADER_MIDDLEWARES = { 63 | # 'Qqnews.middlewares.MyCustomDownloaderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | ITEM_PIPELINES = { 75 | 'Qqnews.pipelines.QqnewsPipeline': 300, 76 | } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/spiders/Qqnews_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | from Qqnews.items import QqnewsItem 5 | 6 | 7 | class QqnewsSpiderSpider(scrapy.Spider): 8 | name = "Qqnews_spider" 9 | allowed_domains = ["qq.com"] 10 | start_urls = ['http://mil.qq.com/mil_index.htm'] 11 | 12 | def parse(self, response): 13 | for eveUrl in response.xpath('//a[@class="linkto"]/@href'): 14 | yield scrapy.Request(eveUrl.extract(),callback=self.parse_content) 15 | 16 | 17 | def parse_content(self,response): 18 | item = QqnewsItem() 19 | title = response.xpath('//div[@class="hd"]/h1/text()').extract() 20 | date1 = response.xpath('//span[@class="a_time"]/text()').extract() 21 | date2 = response.xpath('//div[@class="md"]/text()').extract() 22 | date3 = response.xpath('//div[@class="time"]/text()').extract() 23 | date = str(date1)+str(date2)+str(date3) 24 | author = response.xpath('//div[@class="content-article"]/p[1]/text()').extract() 25 | content = response.xpath('//div[@class="content-article"]/text()').extract() 26 | print(title,date,author,content) 27 | yield item 28 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy 
project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__pycache__/Qqnews_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__pycache__/Qqnews_spider.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Qqnews.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Qqnews 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Yustneirong/Yustneirong/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Yustneirong/Yustneirong/__init__.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class YustneirongItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class YustneirongSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class YustneirongPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Yustneirong project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Yustneirong' 13 | 14 | SPIDER_MODULES = ['Yustneirong.spiders'] 15 | NEWSPIDER_MODULE = 'Yustneirong.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Yustneirong (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'Yustneirong.middlewares.YustneirongSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'Yustneirong.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'Yustneirong.pipelines.YustneirongPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/settings.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Yustneirong/Yustneirong/settings.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Yustneirong/Yustneirong/spiders/__init__.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Yustneirong.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Yustneirong 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/items.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class Dbtop250Item(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | name = scrapy.Field() 15 | zuto = scrapy.Field() 16 | desc = scrapy.Field() 17 | ping = scrapy.Field() 18 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class Dbtop250SpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | 9 | from scrapy.conf import settings 10 | 11 | 12 | class Dbtop250Pipeline(object): 13 | 14 | def __init__(self): 15 | host = settings['MONGODB_HOST'] 16 | port = settings['MONGODB_PORT'] 17 | dbName = settings['MONGODB_DBNAME'] 18 | client = pymongo.MongoClient(host=host,port=port) 19 | tdb = client[dbName] 20 | self.post = tdb[settings['MONGODB_DOCNAME']] 21 | def process_item(self, item, spider): 22 | movie = dict(item) 23 | self.post.insert(movie) 24 | return item 25 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dbtop250 project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dbtop250' 13 | 14 | SPIDER_MODULES = ['dbtop250.spiders'] 15 | NEWSPIDER_MODULE = 'dbtop250.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 20 | 21 | MONGODB_HOST = 'localhost' 22 | MONGODB_PORT = 27017 23 | MONGODB_DBNAME = 'dbtop250' 24 | MONGODB_DOCNAME = 'top250' 25 | 26 | # Obey robots.txt rules 27 | ROBOTSTXT_OBEY = True 28 | 29 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 30 | #CONCURRENT_REQUESTS = 32 31 | 32 | # Configure a delay for requests for the same website (default: 0) 33 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 34 | # See also autothrottle settings and docs 35 | #DOWNLOAD_DELAY = 3 36 | # The download delay setting will honor only one of: 37 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 38 | #CONCURRENT_REQUESTS_PER_IP = 16 39 | 40 | # Disable cookies (enabled by default) 41 | COOKIES_ENABLED = True 42 | 43 | # Disable Telnet Console (enabled by default) 44 | #TELNETCONSOLE_ENABLED = False 45 | 46 | # Override the default request headers: 47 | DEFAULT_REQUEST_HEADERS = { 48 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 49 | 'Accept-Language': 'en', 50 | 'Cookie':'RK=7SNngcUONh; pgv_pvi=5307348992; pgv_si=s9438232576; ptui_loginuin=123456; logout_page=; dm_login_weixin_rem=; qm_authimgs_id=0; qm_verifyimagesession=h0136aa52fede833d6b76fb7cfff4ac53f406c36fd8b73221c205f4c89e3aed6341ff952f57983a3247; ptisp=cnc; ptcz=369283ac413194e379e190071988967a9e0f56822f9e55c5fb3f7d01cf7f0fd9; pt2gguin=o0252943669; qqmusic_uin=; qqmusic_key=; qqmusic_fromtag=; 
pac_uid=1_252943669; dsp_cookiemapping0=1508662302062; dsp_cookiemapping2=1508662302064; ad_play_index=66; thyls_ad=440; dsp_cookiemapping1=1508662308938; pgv_info=ssid=s4702440319; ts_last=mil.qq.com/mil_index.htm; pgv_pvid=4169365884; o_cookie=252943669; ts_uid=2412234112' 51 | } 52 | 53 | # Enable or disable spider middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 55 | #SPIDER_MIDDLEWARES = { 56 | # 'dbtop250.middlewares.Dbtop250SpiderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable downloader middlewares 60 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 61 | #DOWNLOADER_MIDDLEWARES = { 62 | # 'dbtop250.middlewares.MyCustomDownloaderMiddleware': 543, 63 | #} 64 | 65 | # Enable or disable extensions 66 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 73 | ITEM_PIPELINES = { 74 | 'dbtop250.pipelines.Dbtop250Pipeline': 300, 75 | } 76 | 77 | # Enable and configure the AutoThrottle extension (disabled by default) 78 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 79 | #AUTOTHROTTLE_ENABLED = True 80 | # The initial download delay 81 | #AUTOTHROTTLE_START_DELAY = 5 82 | # The maximum download delay to be set in case of high latencies 83 | #AUTOTHROTTLE_MAX_DELAY = 60 84 | # The average number of requests Scrapy should be sending in parallel to 85 | # each remote server 86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 87 | # Enable showing throttling stats for every response received: 88 | #AUTOTHROTTLE_DEBUG = False 89 | 90 | # Enable and configure HTTP caching (disabled by default) 91 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 92 | #HTTPCACHE_ENABLED = True 93 | #HTTPCACHE_EXPIRATION_SECS = 0 94 | #HTTPCACHE_DIR = 'httpcache' 95 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 97 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
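One observation on the dbtop250 project above: Dbtop250Pipeline pulls its MongoDB configuration from scrapy.conf.settings, an import that newer Scrapy releases dropped in favour of the crawler settings. A hedged variant of the same pipeline (not the repository's code) that reads the MONGODB_* keys defined in settings.py through from_crawler:

import pymongo

class Dbtop250MongoPipeline(object):
    """Stores each scraped item in MongoDB, mirroring the pipeline above."""

    def __init__(self, host, port, db_name, doc_name):
        client = pymongo.MongoClient(host=host, port=port)
        self.post = client[db_name][doc_name]

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        return cls(s['MONGODB_HOST'], s['MONGODB_PORT'],
                   s['MONGODB_DBNAME'], s['MONGODB_DOCNAME'])

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))  # insert_one replaces the deprecated insert()
        return item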
5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__pycache__/dbtop250_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__pycache__/dbtop250_spider.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/spiders/dbtop250_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class Dbtop250SpiderSpider(scrapy.Spider): 6 | name = "dbtop250_spider" 7 | allowed_domains = ["douban.com"] 8 | start_urls = ['https://movie.douban.com/top250?start=0&filter='] 9 | conunt = 0 10 | def parse(self, response): 11 | self.conunt +=1 12 | for eve in response.xpath('//div[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/@href'): 13 | full_url = eve.extract() 14 | yield scrapy.Request(full_url,callback=self.parse_movie) 15 | 16 | if self.conunt * 25 < 250: 17 | full_url = 'https://movie.douban.com/top250?start={}&filter='.format(str(self.conunt*25)) 18 | yield scrapy.Request(full_url,callback=self.parse) 19 | def parse_movie(self,response): 20 | from dbtop250.items import Dbtop250Item 21 | item = Dbtop250Item() 22 | item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract() 23 | # item['auto'] = response.xpath('//*[@id="info"]/span/span[2]/a/text()').extract() 24 | item['ping'] = response.xpath('/html/body/div[3]/div[1]/div[3]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/strong/text()').extract() 25 | yield item 26 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = dbtop250.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dbtop250 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/__pycache__/items.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | name = scrapy.Field() 15 | # auto = scrapy.Field() 16 | desc = scrapy.Field() 17 | ping = scrapy.Field() 18 | 19 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DoubanSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoubanPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for douban project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'douban' 13 | 14 | SPIDER_MODULES = ['douban.spiders'] 15 | NEWSPIDER_MODULE = 'douban.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'douban (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | 'Accept-Language': 'en', 45 | 'Cookie':'bid=iJIjKbBsQZ4; gr_user_id=b3a58668-aa55-4aa3-a212-1d9ed21843e8; viewed="27116300_25862578"; ps=y; ll="108288"; push_noty_num=0; push_doumail_num=0; ap=1; _ga=GA1.2.462587836.1508291602; __yadk_uid=byGuKstnDBAymxz38q9BxYZnm6ibZZbe; _vwo_uuid_v2=F727776224927130F161043B6E8DCD6F|0d3a3a996a0fdc93c651e031901946bb; __utma=30149280.462587836.1508291602.1508501064.1508655486.6; __utmb=30149280.0.10.1508655486; __utmc=30149280; __utmz=30149280.1508501064.5.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.16325; __utma=223695111.462587836.1508291602.1508655486.1508655486.1; __utmb=223695111.0.10.1508655486; __utmc=223695111; __utmz=223695111.1508655486.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _pk_id.100001.4cf6=288cdd8ec5b2cdaf.1508655486.1.1508656155.1508655486.; _pk_ses.100001.4cf6=*', 46 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 47 | } 48 | 49 | # Enable or disable spider middlewares 50 | # See 
http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'douban.middlewares.DoubanSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'douban.middlewares.MyCustomDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | #ITEM_PIPELINES = { 70 | # 'douban.pipelines.DoubanPipeline': 300, 71 | #} 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
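#
# --- Added usage note (illustrative, not part of the original project) -------
# The spider defined in this package is run with the standard Scrapy CLI from
# the project root (the directory containing scrapy.cfg); the output file name
# below is only an example:
#
#     scrapy crawl douban_spider -o top250.json
#
# New spiders are typically scaffolded the same way, e.g.:
#
#     scrapy genspider another_spider douban.com
# ------------------------------------------------------------------------------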
5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/spiders/__pycache__/douban_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/spiders/__pycache__/douban_spider.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/spiders/douban_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from douban.items import DoubanItem 4 | 5 | class DoubanSpiderSpider(scrapy.Spider): 6 | name = "douban_spider" 7 | allowed_domains = ["douban.com"] 8 | print("1111111111111111111111") 9 | start_urls = ['https://movie.douban.com/top250?start=0&filter='] 10 | 11 | def parse(self, response): 12 | 13 | for eve in response.xpath('//div[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/@href'): 14 | full_url = eve.extract() 15 | 16 | print(full_url) 17 | yield scrapy.Request(full_url,callback=self.parse_movie) 18 | 19 | def parse_movie(self,response): 20 | 21 | item = DoubanItem() 22 | item['name'] = response.xpath('//*[@id="content"]/h1/span[1]//text()').extract() 23 | item['auto'] = response.xpath('//*[@id="info"]/span/span[2]/a/text()').extract() 24 | item['ping'] = response.xpath('/html/body/div[3]/div[1]/div[3]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/strong/text()').extract() 25 | print(item) 26 | yield item 27 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = douban.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = douban 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/ip地址查询工具.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | import requests 4 | 5 | 6 | url = 'https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?query=10.0.144.241&co=&resource_id=6006&t=1484574592369&ie=utf8&oe=gbk&cb=op_aladdin_callback&format=json&tn=baidu&cb=jQuery110205057557444126394_1484574357057&_=1484574357071' 7 | 8 | headers = { 9 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 10 | 'Accept-Encoding':'gzip, deflate, br', 11 | 'Accept-Language':'zh-CN,zh;q=0.8', 12 | 'Connection':'keep-alive', 13 | 'Cookie':'BAIDUID=4812092AE366ED4A55C6D8EA6713A635:FG=1; PSTM=1508161904; BIDUPSID=18C54752D18DC057B004465161A28981; 
BDUSS=9XM0M3bnJBYUpRZVBFRDRRWXdpVXdIa0d2WDRJUlVFaVlJcFVSMnVFOE5MUkJhSVFBQUFBJCQAAAAAAAAAAAEAAAC6uzCj0KHKqNfTczAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA2g6FkNoOhZQ2; MCITY=-%3A; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598', 14 | 'Host':'sp0.baidu.com', 15 | 'Upgrade-Insecure-Requests':'1', 16 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 17 | } 18 | ip = input("请输入你的IP地址:") 19 | params = { 20 | 'query':ip, 21 | 'co':'', 22 | 'resource_id':'6006', 23 | 't':'1484574592369', 24 | 'ie':'utf8', 25 | 'oe':'gbk', 26 | 'cb':'op_aladdin_callback', 27 | 'format':'json', 28 | 'tn':'baidu', 29 | 'cb':'jQuery110205057557444126394_1484574357057', 30 | '_':'1484574357071' 31 | } 32 | 33 | response = requests.get(url,params=params).text 34 | # print(response) 35 | response = json.loads(response[46:][:-2]) 36 | # print(response[46:][:-2]) 37 | print("location:"+response.get('data')[0].get("location")) 38 | print("titlecont:"+response.get('data')[0].get("titlecont")) 39 | print("origip:"+response.get('data')[0].get("origip")) 40 | print("origipquery:"+response.get('data')[0].get("origipquery")) 41 | print("showlamp:"+response.get('data')[0].get("showlamp")) 42 | print("showLikeShare:"+str(response.get('data')[0].get("showLikeShare"))) 43 | print("shareImage:"+str(response.get('data')[0].get("shareImage"))) 44 | print("ExtendedLocation:"+response.get('data')[0].get("ExtendedLocation")) 45 | print("QriginQuery:"+str(response.get('data')[0].get("QriginQuery"))) 46 | print("tplt:"+response.get('data')[0].get("tplt")) 47 | print("resourceid:"+str(response.get('data')[0].get("resourceid"))) 48 | print("fetchkey:"+response.get('data')[0].get("fetchkey")) 49 | print("appinfo:"+response.get('data')[0].get("appinfo")) 50 | print("role_id:"+str(response.get('data')[0].get("role_id"))) 51 | print("disp_type:"+str(response.get('data')[0].get("disp_type"))) 52 | 53 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/.idea/dictionaries/.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/.idea/lagou.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | AngularJS 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/dump.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/dump.rdb -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/geckodriver.log: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/geckodriver.log -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy import Field 10 | 11 | class LagouItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | companyFullName = Field() # 公司名字 15 | # print(info['公司名字']) 16 | city = Field() # 职位城市 17 | positionName = Field() # 招聘职位 18 | formatCreateTime = Field() # 发布时间 19 | salary = Field() # 薪资待遇 20 | workYear = Field() # 经验要求 21 | Jobdescriptions = Field() #职位描述 22 | companySize = Field() # 公司大小 23 | positionAdvantage = Field() # 公司福利 24 | district = Field() #公司地址 25 | companyhref = Field() #公司链接 -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class LagouSpiderMiddleware(object): 12 | # Not all methods need to be defined. 
If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | from .useragent import UserAgentMiddleware -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__pycache__/useragent.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__pycache__/useragent.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/useragent.py: -------------------------------------------------------------------------------- 1 | import faker 2 | 3 | 4 | class UserAgentMiddleware(object): 5 | def __init__(self,settings): 6 | self.faker = faker.Faker() 7 | 8 | @classmethod 9 | def from_crawler(cls,crawler): 10 | return cls(crawler.settings) 11 | 12 | def process_request(self,request,spider): 13 | request.headers['User-Agent'] = self.faker.user_agent() 14 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to 
the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | 9 | #存储到mongodb数据库 10 | class LagouPipeline(object): 11 | 12 | def __init__(self, mongo_uri, mongo_db): 13 | self.mongo_uri = mongo_uri 14 | self.mongo_db = mongo_db 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | return cls( 19 | mongo_uri = crawler.settings.get('MONGO_URI'), 20 | mongo_db = crawler.settings.get('MONGO_DATABASE','items') 21 | ) 22 | 23 | def open_spider(self,spider): 24 | self.client = pymongo.MongoClient() 25 | self.db = self.client['lagouzhiwei'] 26 | 27 | def close_spider(self,spider): 28 | self.client.close() 29 | 30 | def process_item(self, item, spider): 31 | self.db['zhiweitest2'].insert(dict(item)) 32 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for lagou project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'lagou' 13 | 14 | SPIDER_MODULES = ['lagou.spiders'] 15 | NEWSPIDER_MODULE = 'lagou.spiders' 16 | LOG_LEVEL= 'INFO' 17 | 18 | 19 | #数据库,我已经写死了,所以这里就不写值了,如果需要在这里调用的可以填写这两个值 20 | # MONGO_URI = '' 21 | # MONGO_DATABASE = '' 22 | 23 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 24 | # USER_AGENT = 'lagou (+http://www.yourdomain.com)' 25 | 26 | # Obey robots.txt rules 27 | ROBOTSTXT_OBEY = False 28 | 29 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 30 | CONCURRENT_REQUESTS = 32 31 | 32 | # Configure a delay for requests for the same website (default: 0) 33 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 34 | # See also autothrottle settings and docs 35 | DOWNLOAD_DELAY = 0 36 | # The download delay setting will honor only one of: 37 | CONCURRENT_REQUESTS_PER_DOMAIN = 1 38 | #CONCURRENT_REQUESTS_PER_IP = 16 39 | 40 | # Disable cookies (enabled by default) 41 | COOKIES_ENABLED = False 42 | # COOKIE = " user_trace_token=20171016205119-74af5f4b-4f58-445f-9516-f8c19f475243; LGUID=20171016205145-c44d7b22-b270-11e7-991d-525400f775ce; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=2; _gat=1; index_location_city=%E5%8C%97%E4%BA%AC; login=false; unick=""; _putrc=""; JSESSIONID=ABAAABAAADEAAFI9B5F7A7C0171C81B55991A6F507BB38C; TG-TRACK-CODE=index_navigation; _gid=GA1.2.1376878689.1512383958; _ga=GA1.2.358203920.1509241265; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511828514,1511828645,1512096311,1512383961; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1512391953; LGSID=20171204201356-99b53ec2-d8ec-11e7-82ca-525400f775ce; LGRID=20171204205308-13e42a93-d8f2-11e7-8301-525400f775ce; SEARCH_ID=b1c5303a69754a66bc97d63dc0fec865" 43 | # Cookie = {'user_trace_token':'20171016205119-74af5f4b-4f58-445f-9516-f8c19f475243',' LGUID':'20171016205145-c44d7b22-b270-11e7-991d-525400f775ce', 'showExpriedIndex':'1', 44 | # 'showExpriedCompanyHome':'1', 'showExpriedMyPublish':'1', 'hasDeliver':'2', 
'_gat':'1','index_location_city':'%E5%8C%97%E4%BA%AC','login':'false', 45 | # 'unick':"", '_putrc':"", 'JSESSIONID':'ABAAABAAADEAAFI9B5F7A7C0171C81B55991A6F507BB38C' ,'TG-TRACK-CODE':'index_navigation','_gid':'GA1.2.1376878689.1512383958','_ga':'GA1.2.358203920.1509241265','Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6':'1511828514,1511828645,1512096311,1512383961','Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6':'1512391953','LGSID':'20171204201356-99b53ec2-d8ec-11e7-82ca-525400f775ce', 46 | # 'LGRID':'20171204205308-13e42a93-d8f2-11e7-8301-525400f775ce','SEARCH_ID':'b1c5303a69754a66bc97d63dc0fec865'} 47 | # Disable Telnet Console (enabled by default) 48 | TELNETCONSOLE_ENABLED = True 49 | 50 | # Override the default request headers: 51 | # DEFAULT_REQUEST_HEADERS = { 52 | # 'Accept':'application/json, text/javascript, */*; q=0.01', 53 | # 'Accept-Encoding':'gzip, deflate, br', 54 | # 'Accept-Language':'zh-CN,zh;q=0.8', 55 | # 'Connection':'keep-alive', 56 | # 'Content-Length':'25', 57 | # 'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8', 58 | # 'Cookie':'user_trace_token=20171016205119-74af5f4b-4f58-445f-9516-f8c19f475243; LGUID=20171016205145-c44d7b22-b270-11e7-991d-525400f775ce; JSESSIONID=ABAAABAAAIAACBIB57C77C4ECB2518EE2D0B9135B108E91; _gat=1; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=http%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E9%2592%25A9%26rsv_spt%3D1%26rsv_iqid%3D0xaa9b6e5d00013b57%26issp%3D1%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3D%26tn%3Dbaiduhome_pg%26ch%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_navigation; _putrc=54D6D44AC87A2A52; login=true; unick=%E6%9D%A8%E9%B9%8F%E5%8D%87; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=2; _ga=GA1.2.358203920.1509241265; _gid=GA1.2.1325253718.1511828511; LGSID=20171128082152-21c7cb9f-d3d2-11e7-ae6a-525400f775ce; LGRID=20171128082206-2a422523-d3d2-11e7-ae6a-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510323969,1511253705,1511253729,1511828514; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511828527; SEARCH_ID=8267a0cd29464ff6b38f86b1d1e17b96; index_location_city=%E5%8C%97%E4%BA%AC', 59 | # 'Host':'www.lagou.com', 60 | # 'Origin':'https://www.lagou.com', 61 | # 'Cookie':'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1512613229,1512613260,1512625404,1512968182; index_location_city=%E5%8C%97%E4%BA%AC; _ga=GA1.2.2037062440.1512613233; user_trace_token=201712011102032-33c95bdc-daf5-11e7-8800-525400f775ce; LGUID=20171207102032-33c95ef6-daf5-11e7-8800-525400f775ce; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=4; JSESSIONID=ABAAABAAADEAAFI7D85FFAA76F7A088717F2BAF4B49DB5A; SEARCH_ID=e00f27cb11504a72a10b8ec58bd5f04f; _gat=1; LGSID=20171211125618-9f9c7c23-de2f-11e7-8e96-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_COCOS2D-X%3Fpx%3Ddefault%26city%3D%25E5%2585%25A8%25E5%259B%25BDstart.firefoxchina.cn; LGRID=20171211125650-b2cc7009-de2f-11e7-8e96-525400f775ce; _putrc=54D6D44AC87A2A52; _gid=GA1.2.834272328.1512968180; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1512968210', 62 | # 'Referer':"https://www.lagou.com", 63 | # 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', 64 | # 'X-Anit-Forge-Code':'0', 65 | # 'X-Anit-Forge-Token':'None', 66 | # 'X-Requested-With':'XMLHttpRequest' 67 | # } 68 | 69 | 70 | # Enable or disable spider middlewares 71 | # See 
http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 72 | #SPIDER_MIDDLEWARES = { 73 | # 'lagou.middlewares.LagouSpiderMiddleware': 543, 74 | #} 75 | 76 | # Enable or disable downloader middlewares 77 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 78 | DOWNLOADER_MIDDLEWARES = { 79 | # 'lagou.middlewares.MyCustomDownloaderMiddleware': 543, 80 | 'lagou.middlewares.UserAgentMiddleware':500, 81 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware':None, 82 | 83 | } 84 | 85 | # Enable or disable extensions 86 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 87 | #EXTENSIONS = { 88 | # 'scrapy.extensions.telnet.TelnetConsole': None, 89 | #} 90 | 91 | # Configure item pipelines 92 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 93 | ITEM_PIPELINES = { 94 | 'lagou.pipelines.LagouPipeline': 300, 95 | } 96 | 97 | # Enable and configure the AutoThrottle extension (disabled by default) 98 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 99 | #AUTOTHROTTLE_ENABLED = True 100 | # The initial download delay 101 | #AUTOTHROTTLE_START_DELAY = 5 102 | # The maximum download delay to be set in case of high latencies 103 | #AUTOTHROTTLE_MAX_DELAY = 60 104 | # The average number of requests Scrapy should be sending in parallel to 105 | # each remote server 106 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 107 | # Enable showing throttling stats for every response received: 108 | #AUTOTHROTTLE_DEBUG = False 109 | 110 | # Enable and configure HTTP caching (disabled by default) 111 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 112 | #HTTPCACHE_ENABLED = True 113 | #HTTPCACHE_EXPIRATION_SECS = 0 114 | #HTTPCACHE_DIR = 'httpcache' 115 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 116 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 117 | 118 | 119 | #开启scrapy-redis分布式 120 | #修改调度器 121 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 122 | #开启去重 123 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
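#
# --- Added note (illustrative, not part of the original project) -------------
# settings.py above switches this project to scrapy-redis (custom SCHEDULER and
# RFPDupeFilter). scrapy-redis talks to Redis on localhost:6379 unless told
# otherwise; if Redis runs elsewhere, the connection is usually configured in
# settings.py as well, for example:
#
#     REDIS_URL = 'redis://127.0.0.1:6379'   # assumed address, adjust as needed
#     SCHEDULER_PERSIST = True               # keep the request queue between runs
# ------------------------------------------------------------------------------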
5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__pycache__/lagouspider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__pycache__/lagouspider.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/lagouspider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | import json 5 | from scrapy.conf import settings 6 | from ..items import LagouItem 7 | import requests 8 | 9 | occupation_list = [] 10 | 11 | class LagouspiderSpider(scrapy.Spider): 12 | name = "lagouspider" 13 | allowed_domains = ["lagou.com"] 14 | start_urls = ['https://www.lagou.com'] 15 | cookie = settings['COOKIE'] 16 | headers = { 17 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 18 | 'Accept-Encoding': 'gzip, deflate, br', 19 | 'Accept-Language': 'zh-CN,zh;q=0.8', 20 | 'Connection': 'keep-alive', 21 | 'Content-Length': '25', 22 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 23 | 'Cookie': 'user_trace_token=20171016205119-74af5f4b-4f58-445f-9516-f8c19f475243; LGUID=20171016205145-c44d7b22-b270-11e7-991d-525400f775ce; JSESSIONID=ABAAABAAAIAACBIB57C77C4ECB2518EE2D0B9135B108E91; _gat=1; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=http%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E9%2592%25A9%26rsv_spt%3D1%26rsv_iqid%3D0xaa9b6e5d00013b57%26issp%3D1%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3D%26tn%3Dbaiduhome_pg%26ch%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_navigation; _putrc=54D6D44AC87A2A52; login=true; unick=%E6%9D%A8%E9%B9%8F%E5%8D%87; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=2; _ga=GA1.2.358203920.1509241265; _gid=GA1.2.1325253718.1511828511; LGSID=20171128082152-21c7cb9f-d3d2-11e7-ae6a-525400f775ce; LGRID=20171128082206-2a422523-d3d2-11e7-ae6a-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510323969,1511253705,1511253729,1511828514; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511828527; SEARCH_ID=8267a0cd29464ff6b38f86b1d1e17b96; index_location_city=%E5%8C%97%E4%BA%AC', 24 | 'Host': 'www.lagou.com', 25 | 'Origin': 'https://www.lagou.com', 26 | 'Referer': "h'ttps://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC", 27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', 28 | 'X-Anit-Forge-Code': '0', 29 | 'X-Anit-Forge-Token': 'None', 30 | 'X-Requested-With': 'XMLHttpRequest' 31 | } 32 | 33 | 34 | def parse(self, response,pn=1): 35 | #获取所有职位 36 | for i in range(1,8): 37 | occos = response.xpath('//*[@id="sidebar"]/div/div[{}]/div/dl/dd/a/text()'.format(i)).extract() 38 | for occo in occos: 39 | # url = 
"https://www.lagou.com/jobs/list_{}?px=default&city=%E5%85%A8%E5%9B%BD#filterBox".format('java') 40 | # yield scrapy.Request(url,callback=self.parse_page) 41 | occu_url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0' 42 | data = { 43 | 'first':'true', 44 | 'pn':pn, 45 | 'kd':'java' 46 | } 47 | #获取返回的json数据 48 | response = requests.post(occu_url, data=data, headers=self.headers) 49 | # positionIds = json.loads(response.text).get('content').get('positionResult').get('result') 50 | try: 51 | pageSize = json.loads(response.text).get('content').get('pageSize') 52 | totalCount = json.loads(response.text).get('content').get('positionResult').get('totalCount') 53 | except json.decoder.JSONDecodeError: 54 | continue 55 | #获取总页数 56 | if int(totalCount) % int(pageSize) == 0: 57 | pages = int(int(totalCount)/int(pageSize)) 58 | else: 59 | pages = int(int(totalCount)/int(pageSize)) + 1 60 | 61 | for page in range(int(pages)): 62 | pn = page + 1 63 | occu_url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0' 64 | data = { 65 | 'first': 'true', 66 | 'pn': pn, 67 | 'kd': occo 68 | } 69 | 70 | response = requests.post(occu_url, data=data, headers=self.headers) 71 | 72 | try: 73 | if 'content' in json.loads(response.text).keys(): 74 | positionIds = json.loads(response.text).get('content').get('positionResult').get('result') 75 | 76 | 77 | for positionId in positionIds: 78 | # try: 79 | position = positionId.get('positionId') 80 | # except: 81 | # continue 82 | # print(positionId) 83 | item = LagouItem() 84 | # self.item = info 85 | item['companyFullName'] = positionId['companyFullName'] # 公司名字 86 | # print(info['公司名字']) 87 | item['city'] = positionId['city'] # 职位城市 88 | item['positionName'] = positionId['positionName'] # 招聘职位 89 | item['formatCreateTime'] = positionId['formatCreateTime'] # 发布时间 90 | item['salary'] = positionId['salary'] # 薪资待遇 91 | item['workYear'] = positionId['workYear'] # 经验要求 92 | item['companySize'] = positionId['companySize'] # 公司大小 93 | item['positionAdvantage'] = positionId['positionAdvantage'] # 公司福利 94 | item['district'] = positionId['district'] # 公司地址 95 | info_url = "https://www.lagou.com/jobs/{}.html".format(position) 96 | # item = LagouItem() 97 | # item['companyhref'] = info_url 98 | print(item) 99 | yield item 100 | # yield scrapy.Request(url=info_url, callback=self.parse_fina) 101 | except json.decoder.JSONDecodeError: 102 | continue 103 | except TimeoutError: 104 | continue 105 | # print(info_url) 106 | # yield item 107 | 108 | #获取详细页面的信息,这里试验了,很慢,只获取了页面链接,我就只获取了json数据里面的信息,大部门差不多了 109 | # def parse_fina(self,response): 110 | # item = LagouItem() 111 | # response = response.text 112 | # print(response.status) 113 | # if response.status == 200: 114 | # try: 115 | # item['companyFullName'] = response.xpath('//*[@id="job_company"]/dt/a/img/@alt').extract() # 公司名字 116 | 117 | # print(info['公司名字']) 118 | # item['city'] = response.xpath('/html/body/div[2]/div/div[1]/dd/p[1]/span[2]/text()').extract() # 职位城市 119 | # item['positionName'] = response.xpath('/html/body/div[2]/div/div[1]/div/span/text()').extract() # 招聘职位 120 | # item['formatCreateTime'] = response.xpath('/html/body/div[2]/div/div[1]/dd/p[2]/text()').extract() # 发布时间 121 | # item['salary'] = response.xpath('/html/body/div[2]/div/div[1]/dd/p[1]/span[1]/text()').extract() # 薪资待遇 122 | # item['workYear'] = response.xpath('/html/body/div[2]/div/div[1]/dd/p[1]/span[3]/text()').extract()[0] # 经验要求 123 | # 
item['Jobdescriptions'] = response.xpath('//*[@id="job_detail"]/dd[2]/div/p/text()').extract() # 职位描述 124 | # item['companySize'] = response.xpath('//*[@id="job_company"]/dd/ul/li[3]/text()').extract() # 公司大小 125 | # item['positionAdvantage'] = response.xpath('//*[@id="job_detail"]/dd[1]/p/text()').extract() # 公司福利 126 | # item['district'] = response.xpath('//*[@id="job_detail"]/dd[3]/div[1]/a/text()').extract() # 公司地址 127 | # item['companyhref'] = response.xpath('//*[@id="job_company"]/dd/ul/li[4]/a/@href').extract() # 公司链接 128 | # except IndexError: 129 | # pass 130 | # print(item) 131 | # yield item -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = lagou.settings 8 | 9 | [deploy:demo] 10 | url = http://localhost:6800/ 11 | project = lagou 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/python爬虫实现有道词典.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import urllib.request 3 | import json 4 | import urllib.parse 5 | 6 | url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null' 7 | headers = { 8 | 'Cookie':'OUTFOX_SEARCH_USER_ID=-763428860@10.168.8.61; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; JSESSIONID=abc31lbWsGNO67M3Fi-8v; OUTFOX_SEARCH_USER_ID_NCOO=1648534080.0892432; _ntes_nnid=bf4e54b134dc8a8b2f65cd59c8ba272e,1508592727589; ___rl__test__cookies=1508593353423', 9 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 10 | } 11 | 12 | dict1 = { 13 | '0':['zh-CHS','en'], 14 | '1':['en','zh-CHS'], 15 | '2':['zh-CHS','ja'], 16 | '3':['ja','zh-CHS'], 17 | '4':['zh-CHS','ko'], 18 | '5':['ko','zh-CHS'], 19 | '6':['zh-CHS','fr'], 20 | '7':['fr','zh-CHS'], 21 | '8':['zh-CHS','ru'], 22 | '9':['ru','zh-CHS'], 23 | '10':['zh-CHS','es'], 24 | '11':['es','zh-CHS'], 25 | '12':['zh-CHS','pt'], 26 | '13':['pt','zh-CHS'], 27 | } 28 | switch = input("请选择语言翻译:0:中文-》英语,1:英语-》中文,2:中文-》日语,3:日语-》中文,\n,4:中文-》韩语,5:韩语-》中文," 29 | "6:中文-》法语,7:法语-》中文,8:中文-》俄语,\n,9:俄语-》中文,10:中文-》西班牙语,\n,11:西班牙语-》中文,12:中文-》葡萄牙语," 30 | "13:葡萄牙语-》中文:") 31 | 32 | star = dict1[switch][0] 33 | end = dict1[switch][1] 34 | # print(star) 35 | # print(end) 36 | word = input("请输入你要翻译的语句:") 37 | data = { 38 | 'i':word, 39 | 'from':star, 40 | 'to':end, 41 | 'smartresult':'dict', 42 | 'client':'fanyideskweb', 43 | 'salt':'1508593351114', 44 | 'sign':'32cded672e5ba31d4f4929650a5ad22e', 45 | 'doctype':'json', 46 | 'version':'2.1', 47 | 'keyfrom':'fanyi.web', 48 | 'action':'FY_BY_CLICKBUTTION', 49 | 'typoResult':'true' 50 | } 51 | 52 | data = urllib.parse.urlencode(data).encode("utf-8") 53 | response = urllib.request.urlopen(url=url,data=data) 54 | datas = json.loads(response.read().decode("utf-8")) 55 | answer = datas.get('translateResult')[0][0]['tgt'] 56 | print(answer) -------------------------------------------------------------------------------- /scrapy爬虫简单项目/taobaoclass/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about 
the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = taobaoclass.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = taobaoclass 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/taobaoclass/taobaoclass/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/taobaoclass/taobaoclass/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/taobaoclass/taobaoclass/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/taobaoclass/taobaoclass/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/taobaoclass/taobaoclass/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/taobaoclass/taobaoclass/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/taobaoclass/taobaoclass/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TaobaoclassItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | image_urls = scrapy.Field() 15 | images = scrapy.Field() 16 | title = scrapy.Field() 17 | price = scrapy.Field() 18 | fukuan = scrapy.Field() 19 | dizhi = scrapy.Field() 20 | url = scrapy.Field() 21 | dianqu = scrapy.Field() 22 | 23 | class Iphone(scrapy.Item): 24 | image_urls = scrapy.Field() 25 | images = scrapy.Field() 26 | title = scrapy.Field() 27 | price = scrapy.Field() 28 | fukuan = scrapy.Field() 29 | dizhi = scrapy.Field() 30 | url = scrapy.Field() 31 | dianqu = scrapy.Field() 32 | 33 | class Samsung(scrapy.Item): 34 | image_urls = scrapy.Field() 35 | images = scrapy.Field() 36 | title = scrapy.Field() 37 | price = scrapy.Field() 38 | fukuan = scrapy.Field() 39 | dizhi = scrapy.Field() 40 | url = scrapy.Field() 41 | dianqu = scrapy.Field() 42 | 43 | 44 | 45 | class HuaWei(scrapy.Item): 46 | image_urls = scrapy.Field() 47 | images = scrapy.Field() 48 | title = scrapy.Field() 49 | price = scrapy.Field() 50 | fukuan = scrapy.Field() 51 | dizhi = scrapy.Field() 52 | url = scrapy.Field() 53 | dianqu = scrapy.Field() 54 | 55 | 56 | class Magic(scrapy.Item): 57 | image_urls = scrapy.Field() 58 | images = scrapy.Field() 59 | title = scrapy.Field() 60 | price = scrapy.Field() 61 | fukuan = scrapy.Field() 62 | dizhi = scrapy.Field() 63 | url = scrapy.Field() 64 | dianqu = scrapy.Field() 65 | 66 | 67 | 68 | class ShouJike(scrapy.Item): 69 | image_urls = scrapy.Field() 70 | images = scrapy.Field() 71 | title = scrapy.Field() 72 | price = scrapy.Field() 73 | fukuan = scrapy.Field() 74 | dizhi = scrapy.Field() 75 | url = scrapy.Field() 76 | dianqu = scrapy.Field() 77 | 
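The six item classes in taobaoclass/items.py declare the same eight fields by hand, so adding or renaming a field means touching every class. A possible refactor (a sketch only, not part of the original repository; the base-class name PhoneItem is made up here) is to declare the fields once and subclass per product category. Scrapy items inherit fields from their parent class, and the pipeline's isinstance() dispatch keeps working because each subclass is still a distinct type:

import scrapy


class PhoneItem(scrapy.Item):
    # fields shared by every product category (names match the original classes)
    image_urls = scrapy.Field()
    images = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    fukuan = scrapy.Field()   # sales / payment count (view_sales)
    dizhi = scrapy.Field()    # seller location (item_loc)
    url = scrapy.Field()
    dianqu = scrapy.Field()   # shop name (nick)


class Iphone(PhoneItem):
    pass


class Samsung(PhoneItem):
    pass


class HuaWei(PhoneItem):
    pass


class Magic(PhoneItem):
    pass


class ShouJike(PhoneItem):
    pass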
-------------------------------------------------------------------------------- /scrapy爬虫简单项目/taobaoclass/taobaoclass/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TaobaoclassSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
57 | 
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 | from scrapy.conf import settings
9 | from taobaoclass.items import Iphone, Samsung, Magic, HuaWei, ShouJike
10 | 
11 | 
12 | class TaobaoclassPipeline(object):
13 |     def process_item(self, item, spider):
14 |         host = settings['MONGODB_HOST']
15 |         port = settings['MONGODB_PORT']
16 |         dbName = settings['MONGODB_DBNAME']
17 |         client = pymongo.MongoClient(host=host, port=port)
18 |         tdb = client[dbName]
19 | 
20 |         # route each item type to its own MongoDB collection
21 |         if isinstance(item, Iphone):
22 |             self.post = tdb[settings['MONGODB_DOCNAME_IP']]
23 |         elif isinstance(item, Samsung):
24 |             self.post = tdb[settings['MONGODB_DOCNAME_SAM']]
25 |         elif isinstance(item, HuaWei):
26 |             self.post = tdb[settings['MONGODB_DOCNAME_HW']]
27 |         elif isinstance(item, ShouJike):
28 |             self.post = tdb[settings['MONGODB_DOCNAME_SJK']]
29 |         elif isinstance(item, Magic):
30 |             self.post = tdb[settings['MONGODB_DOCNAME_MAG']]
31 |         taobao = dict(item)
32 |         self.post.insert_one(taobao)
33 | 
34 |         return item
35 | 
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Scrapy settings for taobaoclass project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'taobaoclass' 13 | 14 | SPIDER_MODULES = ['taobaoclass.spiders'] 15 | NEWSPIDER_MODULE = 'taobaoclass.spiders' 16 | 17 | MONGODB_HOST = '127.0.0.1' 18 | MONGODB_PORT = 27017 19 | MONGODB_DBNAME = 'taobao' 20 | MONGODB_DOCNAME_IP = 'ipad' 21 | MONGODB_DOCNAME_SAM = 'samsung' 22 | MONGODB_DOCNAME_HW = 'huawei' 23 | MONGODB_DOCNAME_MAG = 'magic' 24 | MONGODB_DOCNAME_SJK = 'shoujike' 25 | 26 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 27 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 28 | 29 | # Obey robots.txt rules 30 | ROBOTSTXT_OBEY = False 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | #CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = True 45 | 46 | # Disable Telnet Console (enabled by default) 47 | #TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | DEFAULT_REQUEST_HEADERS = { 51 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | 'Accept-Language': 'en', 53 | } 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'taobaoclass.middlewares.TaobaoclassSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'taobaoclass.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | 'taobaoclass.pipelines.TaobaoclassPipeline': 300, 77 | 'scrapy.contrib.pipeline.images.ImagePipeline':1 78 | } 79 | IMAGES_STORE = 'pic/' 80 | IMAGES_URL_FIELD = 'image_urls' 81 | 82 | # Enable and configure the AutoThrottle extension (disabled by default) 83 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 84 | #AUTOTHROTTLE_ENABLED = True 85 | # The initial download delay 86 | #AUTOTHROTTLE_START_DELAY = 5 87 | # The maximum download delay to be set in case of high latencies 88 | #AUTOTHROTTLE_MAX_DELAY = 60 89 | # The average number of requests Scrapy should be sending in parallel to 90 | # each remote server 91 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 92 | # Enable showing throttling stats for every response received: 93 | #AUTOTHROTTLE_DEBUG = False 94 | 95 | # Enable and configure HTTP caching (disabled by default) 96 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
97 | #HTTPCACHE_ENABLED = True
98 | #HTTPCACHE_EXPIRATION_SECS = 0
99 | #HTTPCACHE_DIR = 'httpcache'
100 | #HTTPCACHE_IGNORE_HTTP_CODES = []
101 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
102 | 
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/taobaoclass/taobaoclass/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/spiders/taobao_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import json
4 | from taobaoclass import items
5 | from taobaoclass.items import Iphone, Samsung, Magic, HuaWei, ShouJike
6 | import urllib.parse
7 | 
8 | 
9 | class TaobaoSpiderSpider(scrapy.Spider):
10 |     name = "taobao_spider"
11 |     totalItem = ['magic', '华为mate9']
12 |     allowed_domains = ["taobao.com"]
13 |     start_urls = []
14 |     count = 0
15 |     total = 0
16 |     # build the paginated search-API start URLs for every keyword
17 |     while(count < 500):
18 |         for eveItem in totalItem:
19 |             count = count + 13
20 |             new_url = 'https://s.taobao.com/api?_ksTS=1488147288907_219&ajax=true&m=customized&q=' + urllib.parse.quote(eveItem) + '&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&s=' + str(count) + '&bcoffset=-3'
21 |             start_urls.append(new_url)
22 |     print(start_urls)
23 | 
24 | 
25 |     def parse(self, response):
26 |         try:
27 |             # strip the JSONP wrapper before parsing the JSON payload
28 |             html = json.loads(response.body.decode().replace('}}})', '}}}').replace("jsonp220(", ''))
29 |             for eve in html['API.CustomizedApi']['itemlist']['auctions']:
30 |                 print("++++++++++++++++++++++++++++++++++++++++")
31 |                 if 'ipad' in str(response.url):
32 |                     item = Iphone()
33 |                     print("ipad")
34 |                 elif 'samsung' in str(response.url):
35 |                     item = Samsung()
36 |                     print("Samsung")
37 |                 elif 'mate9' in str(response.url):
38 |                     item = HuaWei()
39 |                     print('huawei')
40 |                 else:
41 |                     item = ShouJike()
42 |                     print('shoujike')
43 |                 img = []
44 |                 self.total = self.total + 1
45 |                 item['title'] = eve['raw_title']
46 |                 item['price'] = eve['view_price']
47 |                 item['fukuan'] = eve['view_sales']
48 |                 item['dizhi'] = eve['item_loc']
49 |                 item['url'] = 'https:' + eve['comment_url']
50 |                 item['dianqu'] = eve['nick']   # shop name (field is spelled dianqu in items.py)
51 |                 img.append(str('http://' + eve['pic_url']))
52 |                 item['image_urls'] = img
53 |                 yield item
54 |         except Exception as e:
55 |             print(e)
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = tutorial.settings
8 | 
9 | 
[deploy] 10 | #url = http://localhost:6800/ 11 | project = tutorial 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/__init__.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TutorialItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | 16 | class DmozItem(scrapy.Item): 17 | title = scrapy.Field() 18 | link = scrapy.Field() 19 | desc = scrapy.Field() 20 | name = scrapy.Field() 21 | price = scrapy.Field() 22 | last_updated = scrapy.Field(serializer = str) 23 | class TestItem(scrapy.Item): 24 | id = scrapy.Field() 25 | name = scrapy.Field() 26 | description = scrapy.Field() 27 | 28 | 29 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TutorialSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TutorialPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tutorial project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tutorial' 13 | 14 | SPIDER_MODULES = ['tutorial.spiders'] 15 | NEWSPIDER_MODULE = 'tutorial.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'tutorial (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'tutorial.middlewares.TutorialSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'tutorial.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure 
item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'tutorial.pipelines.TutorialPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/settings.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/spiders/__init__.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/spiders/dmoz_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | # from tutorial.tutorial.items import DmozItem 4 | 5 | 6 | class DmozSpider(scrapy.Spider): 7 | 8 | name = "dmoz" 9 | allowed_domains = ['dmoz.org'] 10 | start_urls = [ 11 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", 12 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/" 13 | ] 14 | # def parse(self, response): 15 | # # filename = response.url.split("/")[-2] 16 | # # with open(filename,"wb") as f: 17 | # # f.write(response.body) 18 | # for sel in response.xpath('//ul/li'): 19 | # # title = sel.xpath('a/text()').extract() 20 | # # link = sel.xpath('a/@href').extract() 21 | # # desc = self.xpath('text()').extract() 22 | # # print(title,link,desc) 23 | # item = DmozItem() 24 | # item['title'] = sel.xpath('a/text()').extract() 25 | # item['scrlink'] = sel.xpath('a/@href').extract() 26 | # item['desc'] = sel.xpath('text()').extract() 27 | # yield item -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/spiders/dmoz_spider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/spiders/dmoz_spider.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/spiders/mydomain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class MydomainSpider(scrapy.Spider): 6 | name = 'mydomain' 7 | allowed_domains = ['mydomain.com'] 8 | start_urls = ['http://mydomain.com/'] 9 | 10 | def parse(self, response): 11 | pass 12 | class MySpider(scrapy.Spider): 13 | name = "example.com" 14 | allowed_domains = ['example.com'] 15 | start_urls = [ 16 | 'http://www.example.com/1.html', 17 | 'http://www.example.com/2.html', 18 | 'http://www.example.com/3.html', 19 | ] 20 | 21 | def parse(self, response): 22 | self.log('A response from %s just arrived!' 
% response.url) -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/spiders/mydomain.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/spiders/mydomain.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zaobao.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zaobao 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ZaobaoItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | name = scrapy.Field() 15 | url = scrapy.Field() 16 | data = scrapy.Field() 17 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZaobaoSpiderMiddleware(object): 12 | # Not all methods need to be defined. 
If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ZaobaoPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for zaobao project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'zaobao' 13 | 14 | SPIDER_MODULES = ['zaobao.spiders'] 15 | NEWSPIDER_MODULE = 'zaobao.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'zaobao (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'zaobao.middlewares.ZaobaoSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'zaobao.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'zaobao.pipelines.ZaobaoPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/spiders/__pycache__/zaobao_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/spiders/__pycache__/zaobao_spider.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/spiders/zaobao_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | from zaobao.items import ZaobaoItem 5 | 6 | 7 | class ZaobaoSpiderSpider(scrapy.Spider): 8 | name = "zaobao_spider" 9 | allowed_domains = ["zaobao.com"] 10 | start_urls = ['http://zaobao.com/'] 11 | 12 | def parse(self, response): 13 | for eve in response.xpath('//*[@id="DressUp"]/div/div/div/div/a/@href'): 14 | full_url = response.urljoin(eve.extract()) 15 | yield scrapy.Request(full_url, callback=self.parse_news) 16 | 17 | def parse_news(self, response): 18 | item = ZaobaoItem() 19 | item['name'] = response.xpath('//*[@id="MainCourse"]/div/h1/text()').extract() 20 | item['url'] = response.xpath('//*[@id="MainCourse"]/div/div[2]').extract() 21 | print(item) 22 | yield item -------------------------------------------------------------------------------- /zhihu.com/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zhihuuser.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zhihuuser 12 | -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/__init__.py -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/__pycache__/items.cpython-36.pyc
-------------------------------------------------------------------------------- /zhihu.com/zhihuuser/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy import Item,Field 10 | 11 | class ZhihuuserItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | id = Field() 15 | name = Field() 16 | avatar_url = Field() 17 | user_type = Field() 18 | answer_count = Field() 19 | url = Field() 20 | url_token = Field() 21 | headline = Field() 22 | 23 | -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZhihuuserSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymongo 9 | class ZhihuuserPipeline(object): 10 | def __init__(self, mongo_uri, mongo_db): 11 | self.mongo_uri = mongo_uri 12 | self.mongo_db = mongo_db 13 | 14 | @classmethod 15 | def from_crawler(cls,crawler): 16 | return cls( 17 | mongo_uri = crawler.settings.get('MONGO_URI'), 18 | mongo_db = crawler.settings.get('MONGO_DATABASE','items') 19 | ) 20 | 21 | def open_spider(self,spider): 22 | self.client = pymongo.MongoClient() 23 | self.db = self.client['zhihuuser'] 24 | 25 | def close_spider(self,spider): 26 | self.client.close() 27 | 28 | def process_item(self, item, spider): 29 | # collection_name = item.__class__.__name__ 30 | # self.db[collection_name].insert(dict(item)) 31 | # Deduplicate: update the record if it already exists, insert it otherwise 32 | self.db['user'].update({'url_token':item['url_token']},{'$set':item},True) 33 | return item 34 | -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for zhihuuser project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'zhihuuser' 13 | 14 | SPIDER_MODULES = ['zhihuuser.spiders'] 15 | NEWSPIDER_MODULE = 'zhihuuser.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | 'Accept-Language': 'en', 45 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0', 46 | 'authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20' 47 | } 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 |
#SPIDER_MIDDLEWARES = { 52 | # 'zhihuuser.middlewares.ZhihuuserSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'zhihuuser.middlewares.MyCustomDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'zhihuuser.pipelines.ZhihuuserPipeline': 300, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 94 | 95 | #修改调度器 96 | # SCHEDULER = "scrapy_redis.scheduler.Scheduler" 97 | # DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | 4 | import scrapy 5 | from scrapy import spider,Request 6 | from ..items import ZhihuuserItem 7 | 8 | class ZhihuSpider(scrapy.Spider): 9 | name = "zhihu" 10 | allowed_domains = ["zhihu.com"] 11 | start_urls = ['http://www.zhihu.com/'] 12 | 13 | start_user = 'excited-vczh' 14 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}' 15 | user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics' 16 | 17 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 18 | follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 19 | 20 | #粉丝列表 21 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 22 | followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 23 | 24 | def start_requests(self): 25 | # url = 'https://www.zhihu.com/api/v4/members/wang-qing-qing-70-81?include=allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics' 26 | yield Request(self.user_url.format(user = self.start_user, include = self.user_query), self.parse_user) 27 | yield Request(self.follows_url.format(user = self.start_user, include = self.follows_query,offset = 0, limit = 20), callback = self.parse_follows) 28 | 29 | #用户个人信息 30 | def parse_user(self, response): 31 | result = json.loads(response.text) 32 | item = ZhihuuserItem() 33 | for field in item.fields: 34 | #如果定义的item是获取的键名之一,就赋值 35 | if field in result.keys(): 36 | item[field] = result.get(field) 37 | yield item 38 | 39 | yield Request(self.user_url.format(user = result.get('url_token'), include=self.follows_query, limit=20, offset = 0),callback = self.parse_follows) 40 | yield Request(self.follows_url.format(user = result.get('url_token'), include=self.follows_query, limit=20, offset = 0),callback = self.parse_followers) 41 | yield Request(self.followers_url.format(user = result.get('url_token'), include=self.follows_query, limit=20, offset = 0),callback = self.parse_followers) 42 | 43 | #关注着信息 44 | def parse_follows(self,response): 45 | 46 | results = json.loads(response.text) 47 | 48 | #先判断data键名是否存在 49 | if 'data' in results.keys(): 50 | for result in 
results.get('data'): 51 | yield Request(self.user_url.format(user = result.get('url_token'), include = self.user_query), callback = self.parse_user) 52 | 53 | # Get the next-page link and keep processing the following pages 54 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False: 55 | next_page = results.get('paging').get('next') 56 | yield Request(next_page,self.parse_follows) 57 | 58 | # Follower info 59 | def parse_followers(self, response): 60 | 61 | results = json.loads(response.text) 62 | 63 | # First check whether the 'data' key exists 64 | if 'data' in results.keys(): 65 | for result in results.get('data'): 66 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), 67 | callback=self.parse_user) 68 | 69 | # Get the next-page link and keep processing the following pages 70 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False: 71 | next_page = results.get('paging').get('next') 72 | yield Request(next_page, self.parse_followers) -------------------------------------------------------------------------------- /zhihu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zhihuuser.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | #project = zhihuuser 12 | # Deploy the project to a host for distributed crawling via scrapyd 13 | url = http://localhost:6800/addversion.json 14 | project = zhihuuser -------------------------------------------------------------------------------- /zhihu/zhihuuser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/__init__.py -------------------------------------------------------------------------------- /zhihu/zhihuuser/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu/zhihuuser/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu/zhihuuser/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu/zhihuuser/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy import Item,Field 10 | 11 | class ZhihuuserItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | id = Field() 15 | name = Field() 16 | avatar_url = Field() 17 | user_type = Field() 18 | answer_count = Field() 19 | url
= Field() 20 | url_token = Field() 21 | headline = Field() 22 | 23 | -------------------------------------------------------------------------------- /zhihu/zhihuuser/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZhihuuserSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /zhihu/zhihuuser/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymongo 9 | class ZhihuuserPipeline(object): 10 | def __init__(self, mongo_uri, mongo_db): 11 | self.mongo_uri = mongo_uri 12 | self.mongo_db = mongo_db 13 | 14 | @classmethod 15 | def from_crawler(cls,crawler): 16 | return cls( 17 | mongo_uri = crawler.settings.get('MONGO_URI'), 18 | mongo_db = crawler.settings.get('MONGO_DATABASE','items') 19 | ) 20 | 21 | def open_spider(self,spider): 22 | self.client = pymongo.MongoClient() 23 | self.db = self.client['zhihuuser'] 24 | 25 | def close_spider(self,spider): 26 | self.client.close() 27 | 28 | def process_item(self, item, spider): 29 | # collection_name = item.__class__.__name__ 30 | # self.db[collection_name].insert(dict(item)) 31 | # Deduplicate: update the record if it already exists, insert it otherwise 32 | self.db['user'].update({'url_token':item['url_token']},{'$set':item},True) 33 | return item 34 | -------------------------------------------------------------------------------- /zhihu/zhihuuser/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for zhihuuser project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'zhihuuser' 13 | 14 | SPIDER_MODULES = ['zhihuuser.spiders'] 15 | NEWSPIDER_MODULE = 'zhihuuser.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | 'Accept-Language': 'en', 45 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0', 46 | 'authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20' 47 | } 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 |
#SPIDER_MIDDLEWARES = { 52 | # 'zhihuuser.middlewares.ZhihuuserSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'zhihuuser.middlewares.MyCustomDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'zhihuuser.pipelines.ZhihuuserPipeline': 300, 71 | #加入scrapy_redis中间件,分布式,不注释此行,每台机器爬取的item存储到各自的数据库,网络传输压力大 72 | #为了使每台机器爬取的item不存储到数据库,注释此行 73 | # 'scrapy_redis.pipelines.RedisPipeline':301 74 | } 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 78 | #AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | #HTTPCACHE_ENABLED = True 92 | #HTTPCACHE_EXPIRATION_SECS = 0 93 | #HTTPCACHE_DIR = 'httpcache' 94 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | 97 | 98 | #分布式 99 | 100 | #修改调度器 101 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 102 | #开启去重 103 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 104 | 105 | #redis数据库连接信息 106 | REDIS_URL = 'redis://127.0.0.1:6379' 107 | 108 | #爬取完不清空请求队列和指纹 ,没什么用,一般默认False 109 | # SCHEDULER_PERSIST = True 110 | #在每次爬取的时候,都会把指纹和队列清空,相当于重新进行了爬取 111 | #SCHEDULER_FLUSH_ON_START = True -------------------------------------------------------------------------------- /zhihu/zhihuuser/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /zhihu/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu/zhihuuser/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | 4 | import scrapy 5 | from scrapy import spider,Request 6 | from ..items import ZhihuuserItem 7 | 8 | class ZhihuSpider(scrapy.Spider): 9 | name = "zhihu" 10 | allowed_domains = ["zhihu.com"] 11 | start_urls = ['http://www.zhihu.com/'] 12 | 13 | start_user = 'excited-vczh' 14 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}' 15 | user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics' 16 | 17 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 18 | follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 19 | 20 | #粉丝列表 21 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 22 | followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 23 | 24 | def start_requests(self): 25 | # url = 'https://www.zhihu.com/api/v4/members/wang-qing-qing-70-81?include=allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics' 26 | yield Request(self.user_url.format(user = self.start_user, include = self.user_query), self.parse_user) 27 | yield Request(self.follows_url.format(user = self.start_user, include = self.follows_query,offset = 0, limit = 20), callback = self.parse_follows) 28 | 29 | #用户个人信息 30 | def parse_user(self, response): 31 | result = json.loads(response.text) 32 | item = ZhihuuserItem() 33 | for field in item.fields: 34 | #如果定义的item是获取的键名之一,就赋值 35 | if field in result.keys(): 36 | item[field] = result.get(field) 37 | yield item 38 | 39 | yield Request(self.user_url.format(user = result.get('url_token'), include=self.follows_query, limit=20, offset = 0),callback = self.parse_follows) 40 | yield Request(self.follows_url.format(user = result.get('url_token'), include=self.follows_query, limit=20, offset = 0),callback = self.parse_followers) 41 | yield Request(self.followers_url.format(user = result.get('url_token'), include=self.follows_query, limit=20, offset = 0),callback = self.parse_followers) 42 | 43 | #关注着信息 44 | def parse_follows(self,response): 45 | 46 | results = json.loads(response.text) 47 | 48 | #先判断data键名是否存在 49 | if 'data' in results.keys(): 50 | for result in results.get('data'): 
51 | yield Request(self.user_url.format(user = result.get('url_token'), include = self.user_query), callback = self.parse_user) 52 | 53 | #获取下一页链接,然后继续对下一页数据进行处理 54 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False: 55 | next_page = results.get('paging').get('next') 56 | yield Request(next_page,self.parse_follows) 57 | 58 | #粉丝信息 59 | def parse_followers(self, response): 60 | 61 | results = json.loads(response.text) 62 | 63 | # 先判断data键名是否存在 64 | if 'data' in results.keys(): 65 | for result in results.get('data'): 66 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), 67 | callback=self.parse_user) 68 | 69 | # 获取下一页链接,然后继续对下一页数据进行处理 70 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False: 71 | next_page = results.get('paging').get('next') 72 | yield Request(next_page, self.parse_followers) -------------------------------------------------------------------------------- /可视化文件显示程序.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/可视化文件显示程序.zip -------------------------------------------------------------------------------- /基于python的turtle画出叮当猫.py: -------------------------------------------------------------------------------- 1 | import turtle 2 | 3 | turtle.speed(5) 4 | turtle.circle(50) 5 | turtle.begin_fill() #画头 6 | turtle.circle(85) 7 | turtle.fillcolor("blue") 8 | turtle.end_fill() 9 | 10 | # turtle.penup() 11 | # turtle.goto(0,20) 12 | # turtle.pendown() 13 | 14 | # turtle.begin_fill() 15 | # turtle.circle(35) 16 | # turtle.fillcolor("white") 17 | # turtle.end_fill() 18 | 19 | turtle.begin_fill() #画脸 20 | turtle.circle(60) 21 | turtle.fillcolor("white") 22 | turtle.end_fill() 23 | 24 | 25 | 26 | turtle.penup() 27 | turtle.goto(-20,95) #化左眼眶 28 | turtle.pendown() 29 | turtle.begin_fill() 30 | turtle.circle(19) 31 | turtle.fillcolor("white") 32 | turtle.end_fill() 33 | 34 | 35 | 36 | turtle.penup() #画右眼眶 37 | turtle.goto(20,95) 38 | turtle.pendown() 39 | turtle.begin_fill() 40 | turtle.circle(19) 41 | turtle.fillcolor("white") 42 | turtle.end_fill() 43 | 44 | turtle.penup() #化左眼珠 45 | turtle.goto(-8,111) 46 | turtle.pendown() 47 | turtle.begin_fill() 48 | turtle.fillcolor("black") 49 | turtle.circle(3) 50 | turtle.end_fill() 51 | 52 | 53 | turtle.penup() #画右眼珠 54 | turtle.goto(8,111) 55 | turtle.pendown() 56 | turtle.begin_fill() 57 | turtle.fillcolor("black") 58 | turtle.circle(3) 59 | turtle.end_fill() 60 | 61 | turtle.penup() #画鼻子 62 | turtle.goto(0,85) 63 | turtle.pendown() 64 | turtle.begin_fill() 65 | turtle.circle(10) 66 | turtle.fillcolor("red") 67 | turtle.end_fill() 68 | 69 | 70 | turtle.goto(0,30) #画竖线 71 | 72 | turtle.penup() #左边第一根胡子 73 | turtle.goto(-20,70) 74 | turtle.pendown() 75 | turtle.goto(-45,80) 76 | 77 | turtle.penup() #左边第二根胡子 78 | turtle.goto(-20,60) 79 | turtle.pendown() 80 | turtle.goto(-47,60) 81 | 82 | turtle.penup() #左边第三根胡子 83 | turtle.goto(-20,50) 84 | turtle.pendown() 85 | turtle.goto(-47,40) 86 | 87 | turtle.penup() #右边第三根胡子 88 | turtle.goto(20,50) 89 | turtle.pendown() 90 | turtle.goto(47,40) 91 | 92 | 93 | turtle.penup() #右边第二根胡子 94 | turtle.goto(20,60) 95 | turtle.pendown() 96 | turtle.goto(47,60) 97 | 98 | 99 | turtle.penup() #左边第一根胡子 100 | turtle.goto(20,70) 101 | turtle.pendown() 102 | turtle.goto(45,80) 103 | 104 | turtle.penup() #右边胳膊1 105 | turtle.goto(50,20) 106 | turtle.pendown() 107 | turtle.goto(100,-10) 108 | 109 | 
110 | turtle.penup() #右边胳膊2 111 | turtle.goto(50,-20) 112 | turtle.pendown() 113 | turtle.goto(80,-40) 114 | 115 | turtle.begin_fill() 116 | turtle.goto(100,-10) 117 | turtle.goto(50,20) 118 | turtle.goto(50,-20) 119 | turtle.goto(80,-40) 120 | turtle.fillcolor("yellow") 121 | turtle.end_fill() 122 | 123 | 124 | 125 | turtle.penup() #右手 126 | turtle.goto(100,-50) 127 | turtle.pendown() 128 | turtle.begin_fill() 129 | turtle.circle(20) 130 | turtle.fillcolor("blue") 131 | turtle.end_fill() 132 | 133 | 134 | 135 | turtle.penup() #左边胳膊1 136 | turtle.goto(-50,20) 137 | turtle.pendown() 138 | turtle.goto(-100,-10) 139 | 140 | 141 | turtle.penup() #左边胳膊2 142 | turtle.goto(-50,-20) 143 | turtle.pendown() 144 | turtle.goto(-80,-40) 145 | 146 | turtle.begin_fill() 147 | turtle.goto(-100,-10) 148 | turtle.goto(-50,20) 149 | turtle.goto(-50,-20) 150 | turtle.goto(-80,-40) 151 | turtle.fillcolor("yellow") 152 | turtle.end_fill() 153 | 154 | turtle.penup() #左手 155 | turtle.goto(-100,-53) 156 | turtle.pendown() 157 | turtle.begin_fill() 158 | turtle.circle(20) 159 | turtle.fillcolor("blue") 160 | turtle.end_fill() 161 | 162 | 163 | turtle.penup() #左手 164 | turtle.goto(-50,-20) 165 | turtle.pendown() 166 | turtle.goto(-50,-100) 167 | 168 | turtle.penup() #左手 169 | turtle.goto(50,-20) 170 | turtle.pendown() 171 | turtle.goto(50,-100) 172 | 173 | 174 | turtle.begin_fill() 175 | turtle.penup() 176 | turtle.goto(50,-120) 177 | turtle.pendown() 178 | turtle.circle(10) 179 | turtle.fillcolor("blue") 180 | turtle.end_fill() 181 | 182 | turtle.begin_fill() 183 | turtle.goto(20,-120) 184 | turtle.circle(10) 185 | turtle.fillcolor("blue") 186 | turtle.end_fill() 187 | 188 | 189 | turtle.penup() 190 | turtle.goto(50,-100) 191 | turtle.pendown() 192 | turtle.goto(20,-100) 193 | 194 | 195 | 196 | turtle.penup() 197 | turtle.goto(-50,-120) 198 | turtle.pendown() 199 | turtle.begin_fill() 200 | turtle.circle(10) 201 | turtle.goto(-20,-120) 202 | turtle.circle(10) 203 | turtle.fillcolor("blue") 204 | turtle.end_fill() 205 | 206 | turtle.penup() 207 | turtle.goto(-20,-100) 208 | turtle.pendown() 209 | turtle.goto(-50,-100) 210 | 211 | 212 | turtle.penup() 213 | turtle.goto(-20,-100) 214 | turtle.pendown() 215 | turtle.goto(-20,-85) 216 | 217 | turtle.goto(20,-85) 218 | turtle.goto(20,-100) 219 | 220 | turtle.penup() 221 | turtle.goto(-50,-20) 222 | turtle.pendown() 223 | 224 | turtle.begin_fill() 225 | turtle.goto(50,-20) 226 | turtle.goto(50,-85) 227 | turtle.goto(-50,-85) 228 | turtle.goto(-50,-20) 229 | turtle.fillcolor("blue") 230 | turtle.end_fill() 231 | 232 | 233 | turtle.penup() 234 | turtle.goto(0,-20) #铃铛 235 | turtle.pendown() 236 | turtle.begin_fill() 237 | turtle.circle(10) 238 | turtle.fillcolor("yellow") 239 | turtle.end_fill() 240 | 241 | 242 | 243 | turtle.penup() 244 | turtle.goto(-10,-10) 245 | turtle.pendown() 246 | turtle.goto(10,-10) 247 | 248 | 249 | turtle.penup() 250 | turtle.goto(-50,20) 251 | turtle.pendown() 252 | turtle.begin_fill() 253 | turtle.goto(50,20) 254 | turtle.goto(50,0) 255 | turtle.goto(-50,0) 256 | turtle.goto(-50,20) 257 | turtle.fillcolor("red") 258 | turtle.end_fill() 259 | 260 | 261 | turtle.penup() 262 | turtle.goto(50,0) 263 | turtle.pendown() 264 | turtle.begin_fill() 265 | turtle.circle(10) 266 | turtle.fillcolor("red") 267 | turtle.end_fill() 268 | 269 | 270 | turtle.penup() 271 | turtle.goto(-50,0) 272 | turtle.pendown() 273 | turtle.begin_fill() 274 | turtle.circle(10) 275 | turtle.fillcolor("red") 276 | turtle.end_fill() 277 | 278 | 279 | turtle.penup() #内裤 280 | 
turtle.goto(-50,-70) 281 | turtle.pendown() 282 | turtle.begin_fill() 283 | turtle.goto(50,-70) 284 | turtle.goto(50,-50) 285 | turtle.goto(-50,-50) 286 | turtle.goto(-50,-70) 287 | turtle.fillcolor("red") 288 | turtle.end_fill() 289 | 290 | turtle.penup() 291 | turtle.goto(-10,-70) 292 | turtle.pendown() 293 | turtle.begin_fill() 294 | turtle.goto(-10,-85) 295 | turtle.goto(10,-85) 296 | turtle.goto(10,-70) 297 | turtle.goto(-10,-70) 298 | turtle.fillcolor("red") 299 | turtle.end_fill() 300 | 301 | turtle.penup() 302 | turtle.goto(-100,200) 303 | turtle.pendown() 304 | s = "机器猫中的战斗猫" 305 | turtle.write(s,font = ("Arial",20,"normal")) 306 | 307 | 308 | turtle.done() -------------------------------------------------------------------------------- /基于python的turtle的桌面弹球.py: -------------------------------------------------------------------------------- 1 | from tkinter import * 2 | from random import randint 3 | 4 | def getRandomColor(): 5 | color = "#" 6 | for j in range(6): 7 | color += toHexChar(randint(0,15)) 8 | return color 9 | def toHexChar(hexValue): 10 | if 0 <= hexValue <= 9: 11 | return chr(hexValue + ord('0')) 12 | else: 13 | return chr(hexValue - 10 + ord("A")) 14 | class Ball: 15 | def __init__(self): 16 | self.x = 0 17 | self.y = 0 18 | self.dx = 2 19 | self.dy = 2 20 | self.radius = 3 21 | self.color = getRandomColor() 22 | 23 | class BounceBalls: 24 | def __init__(self): 25 | self.ballList = [] 26 | win = Tk() 27 | win.title("Bouncing Balls") 28 | 29 | self.width = 350 30 | self.height = 150 31 | self.canvas = Canvas(win,bg = "white",width = self.width,height = self.height) 32 | self.canvas.pack() 33 | 34 | 35 | frame = Frame(win) 36 | frame.pack() 37 | btStop = Button(frame,text = "Stop",command = self.stop) 38 | btStop.pack(side = LEFT) 39 | btResume = Button(frame,text = "Resume",command = self.resume) 40 | btResume.pack(side = LEFT) 41 | btAdd = Button(frame,text = "+",command = self.add) 42 | btAdd.pack(side = LEFT) 43 | btRemove = Button(frame,text = "-",command = self.remove) 44 | btRemove.pack(side = LEFT) 45 | 46 | 47 | self.sleepTime = 100 48 | self.isStopped = False 49 | self.animate() 50 | win.mainloop() 51 | def stop(self): 52 | self.isStopped = True 53 | def resume(self): 54 | self.isStopped = False 55 | self.animate() 56 | def add(self): 57 | self.ballList.append(Ball()) 58 | def remove(self): 59 | self.ballList.pop() 60 | def animate(self): 61 | while not self.isStopped: 62 | self.canvas.after(self.sleepTime) 63 | self.canvas.update() 64 | self.canvas.delete("ball") 65 | 66 | for ball in self.ballList: 67 | self.redisplayBall(ball) 68 | def redisplayBall(self,ball): 69 | if ball.x > self.width or ball.x < 0: 70 | ball.dx = -ball.dx 71 | if ball.y > self.height or ball.y < 0: 72 | ball.y = -ball.y 73 | ball.x += ball.dx 74 | ball.y += ball.dy 75 | self.canvas.create_oval(ball.x - ball.radius,ball.y - ball.radius,ball.x + ball.radius,ball.y + ball.radius,fill = ball.color,tags = "ball") 76 | BounceBalls() -------------------------------------------------------------------------------- /基于python的turtle移动的小球.py: -------------------------------------------------------------------------------- 1 | from tkinter import * 2 | 3 | class MovingBall: 4 | def __init__(self): 5 | win = Tk() 6 | win.title("Moving Ball") 7 | 8 | self.width = 250 9 | self.canvas = Canvas(win,width = self.width,height = 200,bg = 'white') 10 | self.canvas.pack() 11 | 12 | frame = Frame(win) 13 | frame.pack() 14 | btLeft = Button(frame,text = "Left",command = self.LeftMoving ) 15 | btLeft.pack() 
16 | btRight = Button(frame,text = "Right",command = self.RightMoving) 17 | btRight.pack() 18 | btUp = Button(frame,text = "Up",command = self.UpMoving) 19 | btUp.pack() 20 | btDown = Button(frame,text = "Down",command = self.DownMoving) 21 | btDown.pack() 22 | self.x = 0 23 | self.y = 0 24 | self.canvas.create_oval(self.x,self.y,self.x + 10,self.y + 10,fill = "black",tags = "oval") 25 | win.mainloop() 26 | 27 | def LeftMoving(self): 28 | self.canvas.delete("oval") 29 | if self.x > 10: 30 | self.x -= 10 31 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 32 | else: 33 | self.x = 250 34 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 35 | 36 | def RightMoving(self): 37 | self.canvas.delete("oval") 38 | if self.x < 250: 39 | self.x += 10 40 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 41 | else: 42 | self.x = 0 43 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 44 | def UpMoving(self): 45 | self.canvas.delete("oval") 46 | if self.y > 10: 47 | self.y -= 10 48 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 49 | else: 50 | self.y = 200 51 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 52 | 53 | def DownMoving(self): 54 | self.canvas.delete("oval") 55 | if self.y < 200: 56 | self.y += 10 57 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 58 | else: 59 | self.y = 0 60 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 61 | MovingBall() -------------------------------------------------------------------------------- /抓取财富网股票信息.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import urllib.request 3 | import re 4 | import random 5 | import time 6 | #抓取所需内容 7 | user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)', 8 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 9 | 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko', 10 | 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36', 11 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)', 12 | 'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1', 13 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3', 14 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12', 15 | 'Opera/9.27 (Windows NT 5.2; U; zh-cn)', 16 | 'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0', 17 | 'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)', 18 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6', 19 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)', 20 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)', 21 | 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)', 22 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 
-------------------------------------------------------------------------------- /抓取财富网股票信息.py: --------------------------------------------------------------------------------
import urllib
import urllib.request
import re
import random
import time

# Scrape the A-share ranking pages from quote.stockstar.com
user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)',
              'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
              'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
              'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
              'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
              'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
              'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
              'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']

stock_total = []   # stock_total: stock data from all pages; stock_page: stock data from one page
for page in range(1, 8):
    url = 'http://quote.stockstar.com/stock/ranklist_a_3_1_' + str(page) + '.html'
    # Pick a random User-Agent from the list for each request
    request = urllib.request.Request(url=url, headers={"User-Agent": random.choice(user_agent)})
    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:       # error handling; skip the page on failure
        print('page=', page, '', e.code)
        continue
    except urllib.error.URLError as e:
        print('page=', page, '', e.reason)
        continue
    content = response.read().decode('gbk')   # read the page content
    print('get page', page)                   # report the page that was fetched
    pattern = re.compile(r'<tbody[\s\S]*</tbody>')   # the table body that holds the quote rows
    body = re.findall(pattern, str(content))
    # for i in body:
    #     print(i)
    pattern = re.compile('>(.*?)<')            # collect the text between tags
    stock_page = re.findall(pattern, body[0])
    # print(stock_page)
    stock_total.extend(stock_page)
    # print(stock_total)
    time.sleep(random.randrange(1, 4))         # sleep a few random seconds between pages; adjust as needed

# Remove the empty strings left over from the tag stripping
stock_last = stock_total[:]    # stock_last holds the final cleaned stock data
for data in stock_total:
    if data == '':
        stock_last.remove('')
# print(stock_last)

# Print part of the result
print('代码', '\t', '简称', ' ', '\t', '最新价', '\t', '涨跌幅', '\t', '涨跌额', '\t', '5分钟涨幅')
# Step through the flat list six fields at a time; the step assumes each row
# contributes exactly six non-empty cells, so adjust it if the page layout differs
for i in range(0, len(stock_last) - 5, 6):
    print(format(stock_last[i], "6s"), '\t', format(stock_last[i + 1], "6s"), ' ', '\t',
          format(stock_last[i + 2], "6s"), ' ', '\t', format(stock_last[i + 3], "6s"), ' ', '\t',
          format(stock_last[i + 4], "6s"), ' ', '\t', format(stock_last[i + 5], "6s"))
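The extraction above hinges on two regular expressions, so a toy example may make their division of labour clearer. The HTML snippet below is invented for illustration and is far simpler than the real ranking page.

import re

sample = '<table><tbody><tr><td>600000</td><td>浦发银行</td></tr></tbody></table>'
body = re.findall(r'<tbody[\s\S]*</tbody>', sample)          # isolate the table body
cells = [c for c in re.findall('>(.*?)<', body[0]) if c]     # text between tags, empties dropped
print(cells)   # ['600000', '浦发银行']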
-------------------------------------------------------------------------------- /爬取12306车票信息.py: --------------------------------------------------------------------------------
# coding:utf-8
import ssl
import re
import json
import urllib.request

# 12306 serves HTTPS with certificates urllib may reject, so skip verification
ssl._create_default_https_context = ssl._create_unverified_context
# headers = {
#     'Cookie': 'JSESSIONID=95820ECC00B038495AC43E949F6D4A69; route=6f50b51faa11b987e576cdb301e545c4; BIGipServerotn=351273482.64545.0000; _jc_save_fromStation=%u5317%u4EAC%2CBJP; _jc_save_toStation=%u5929%u6D25%2CTJP; _jc_save_fromDate=2017-10-25; _jc_save_toDate=2017-10-20; _jc_save_wfdc_flag=dc',
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
# }

# Download the full station list (one long '|'-separated string)
def get_station():
    url = 'http://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9028'
    response = urllib.request.urlopen(url).read()
    # print(response)
    return response.decode("utf-8")

# Look up the telecodes of the departure and arrival stations;
# in station_name.js the telecode immediately follows each Chinese station name
def station(stationinfo, start, end):
    str2 = stationinfo[20:][:-2]
    str3 = str2.split('|')
    order1 = str3.index(start)
    order2 = str3.index(end)
    starstation = str3[int(order1) + 1]
    endstation = str3[int(order2) + 1]
    return starstation, endstation

# Query the trains running between the two stations on the given date
def getTrainInfo(start, end, date):
    train_date = date
    from_station = start
    to_station = end
    purpose_codes = 'ADULT'
    url = 'https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date={}&leftTicketDTO.from_station={}&leftTicketDTO.to_station={}&purpose_codes={}'.format(
        train_date, from_station, to_station, purpose_codes)
    # print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Cookie': 'JSESSIONID=B201655CD8BCF12D53ADF6CA6D2AA050; route=495c805987d0f5c8c84b14f60212447d; BIGipServerotn=770703882.38945.0000; BIGipServerpool_passport=367854090.50215.0000; _jc_save_fromStation=%u5317%u4EAC%2CBJP; _jc_save_toStation=%u5929%u6D25%2CTJP; _jc_save_fromDate=2017-10-25; _jc_save_toDate=2017-10-21; _jc_save_wfdc_flag=dc'
    }
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request).read()
    return response.decode("utf-8")
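# Reading aid only (this hypothetical helper is not called anywhere in the script):
# each row in data.result is a '|'-separated string, and getTicketInfo() below picks
# its fields out by position. These are the indices the code relies on.
def describe_row(row):
    f = row.split('|')
    return {
        'train_no': f[2],          # internal train number used by the price query
        'train_code': f[3],        # the code printed as 本次列车
        'from_telecode': f[4],     # looked up in station_name.js for the Chinese name
        'to_telecode': f[7],
        'depart': f[8], 'arrive': f[9], 'duration': f[10],
        'from_station_no': f[16], 'to_station_no': f[17],
    }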
# Query the ticket prices and print the full information for each train
def getTicketInfo(getTrainInfos, train_date, stationinfo):
    # print(getTrainInfos)
    getTrainInfos = json.loads(getTrainInfos).get('data').get('result')

    for getTrainInfo in getTrainInfos:
        order3 = getTrainInfo.split('|')
        train_no = order3[2]
        seat_types = str(order3[-1:])[2:5]
        if len(seat_types) != 3:
            continue
        from_station_no = str(order3[1:][15])
        to_station_no = str(order3[1:][16])
        url = 'https://kyfw.12306.cn/otn/leftTicket/queryTicketPrice?train_no={}&from_station_no={}&to_station_no={}&seat_types={}&train_date={}'.format(
            train_no, from_station_no, to_station_no, seat_types, train_date)
        # url = 'https://kyfw.12306.cn/otn/leftTicket/queryTicketPrice?train_no=26000K772632&from_station_no=10&to_station_no=11&seat_types=113&train_date=2017-10-25'
        # Note: this headers dict is built but never attached to the request below
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN, zh;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'kyfw.12306.cn',
            'If-Modified-Since': '0',
            'Referer': 'https://kyfw.12306.cn/otn/leftTicket/init',
            'X-Requested-With': 'XMLHttpRequest',
            'Cookie': 'JSESSIONID=B201655CD8BCF12D53ADF6CA6D2AA050;route=495c805987d0f5c8c84b14f60212447d;BIGipServerotn=770703882.38945.0000;BIGipServerpool_passport=367854090.50215.0000;_jc_save_fromStation=%u5317%u4EAC%2CBJP;_jc_save_toStation=%u5929%u6D25%2CTJP;_jc_save_fromDate=2017-10-25;_jc_save_toDate=2017-10-21;_jc_save_wfdc_flag=dc',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        response = urllib.request.urlopen(url).read()
        datas = response.decode("utf-8")
        com = re.compile('({.*?}})')
        datas = com.findall(datas)
        for data in datas:
            if len(data) > 30:
                data = json.loads(data)
                # print(data)
                prices = data.get('data')
                print("------------------本次列车-----------------------------------")
                print('本次列车', order3[3])

                # Reverse lookup: the Chinese station name sits right before its telecode
                str2 = stationinfo[20:][:-2]
                str3 = str2.split('|')
                order1 = str3.index(order3[4])
                order2 = str3.index(order3[7])
                starstation = str3[int(order1) - 1]
                endstation = str3[int(order2) - 1]

                print('出发站点', starstation)
                print('到达站点', endstation)
                print('出发时间', order3[8])
                print('到达时间', order3[9])
                print('历时时间', order3[10])
                # print(type(prices))
                for k in prices:
                    if k == 'A9':
                        print('商务座特等座', ":", prices[k])
                    elif k == 'M':
                        print("一等座", ":", prices[k])
                    elif k == 'O':
                        print("二等座", ":", prices[k])
                    elif k == 'WZ':
                        print("无座", ":", prices[k])
                    elif k == 'A4':
                        print("软卧", ":", prices[k])
                    elif k == 'F':
                        print("动卧", ":", prices[k])
                    elif k == 'A3':
                        print("硬卧", ":", prices[k])
                    elif k == 'A1':
                        print("硬座", ":", prices[k])
                    elif k == 'A6':
                        print("高级软卧", ":", prices[k])
                    elif k == 'OT':
                        print("其他", ":", prices[k])

if __name__ == "__main__":
    start = input('出发车站:')
    end = input('到达车站:')
    date = input("出发时间(如2017.10.25):")
    # Normalise the date format (2017.10.25 -> 2017-10-25)
    date = date.replace('.', '-')
    stationinfo = get_station()
    starstation, endstation = station(stationinfo, start, end)
    trainInfo = getTrainInfo(starstation, endstation, date)
    getTicketInfo(trainInfo, date, stationinfo)
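The long elif chain in getTicketInfo() is easier to extend as a lookup table. The sketch below is only a suggested refactor, with the key/label pairs copied verbatim from the script; print_prices() would take the dict that getTicketInfo() extracts from each price response.

SEAT_LABELS = {'A9': '商务座特等座', 'M': '一等座', 'O': '二等座', 'WZ': '无座',
               'A4': '软卧', 'F': '动卧', 'A3': '硬卧', 'A1': '硬座',
               'A6': '高级软卧', 'OT': '其他'}

def print_prices(prices):
    # Fall back to the raw key for seat types the table does not know about
    for k, v in prices.items():
        print(SEAT_LABELS.get(k, k), ':', v)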
-------------------------------------------------------------------------------- /爬取qq音乐歌曲/爬取扣扣音乐文件.py: --------------------------------------------------------------------------------
import json
import time
import random
import codecs
import requests

# Collect the media_mid and name of every song returned by the QQ Music search API
def songmid():
    mid = []
    name = []
    url = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp?ct=24&qqmusic_ver=1298&new_json=1&remoteplace=sizer.yqq.song_next&searchid=148958880434449513&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&p=1&n=20&w=%E4%BA%94%E6%9C%88%E5%A4%A9&g_tk=1989554541&jsonpCallback=searchCallbacksong5150&loginUin=1093211972&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0"
    response = requests.get(url)
    # Strip the 23-character "searchCallbacksong5150(" JSONP wrapper and the trailing ")"
    result = json.loads(response.text[23:-1])['data']['song']
    if result['curnum'] and result['curpage']:
        for page in range(20):
            jsonpcallback = "searchCallbacksong" + str(random.randint(1000, 10000))
            if page == 0:
                remoteplace = "txt.yqq.song"
            else:
                remoteplace = "sizer.yqq.song_next"
            params = {
                'ct': "24",
                'qqmusic_ver': "1298",
                'new_json': "1",
                'remoteplace': remoteplace,
                'searchid': "148958880434449513",
                't': "0",
                'aggr': "1",
                'cr': "1",
                'catzhida': "1",
                'lossless': "0",
                'flag_qc': "0",
                'p': page + 1,
                'n': str(result['curnum']),
                # 'n': 20,
                'w': "五月天",            # requests URL-encodes the keyword itself
                'g_tk': "1989554541",
                'jsonpcallback': jsonpcallback,
                'loginuin': "1093211972",
                'hostuin': "0",
                'format': "jsonp",
                'incharset': "utf8",
                'outcharset': "utf-8",
                'notice': "0",
                'platform': "yqq",
                'neednewcode': "0",
                'cache-control': "no-cache",
            }
            # url2 = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp?"
            url2 = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp"
            response2 = requests.get(url2, params=params)
            # print(json.loads(response2.text[9:-1]))
            # Strip the 9-character JSONP wrapper and the trailing ")"
            for song in json.loads(response2.content[9:-1])['data']['song']['list']:
                if song['file']['media_mid']:
                    mid.append(song['file']['media_mid'])
                    name.append(song['name'])
    print(set(mid))
    print(len(set(mid)))
    return mid, name

url = []
file = codecs.open('audio2.txt', 'w')

# Resolve each media_mid into a downloadable stream URL and append it to audio2.txt
def resolve(songmids, name):
    for i in range(len(songmids)):
        filename = 'C400' + songmids[i] + '.m4a'
        # print(songmids[i])
        guid = int(random.random() * 2147483647) * int(time.time() * 1000) % 10000000000

        d = {
            'format': 'json',
            'cid': 205361747,
            'uin': 0,
            'songmid': songmids[i],
            'filename': filename,
            'guid': guid,
            'g_tk': 5381,
            'loginUin': 0,
            'hostUin': 0,
            'notice': '0',
            'platform': 'yqq',
            'needNewCode': '0',
        }
        # Note: this headers dict is not passed to requests.get below
        headers = {
            'User-Agent': "Mozilla / 5.0(WindowsNT10.0; …) Gecko / 20100101Firefox / 57.0"
        }
        r = requests.get('https://c.y.qq.com/base/fcgi-bin/fcg_music_express_mobile3.fcg', params=d)
        try:
            vkey = json.loads(r.text)['data']['items'][0]['vkey']
        except (KeyError, IndexError, ValueError):
            continue
        if vkey:
            audio_url = 'http://dl.stream.qqmusic.qq.com/%s?vkey=%s&guid=%s&uin=0&fromtag=66' % (filename, vkey, guid)
            time.sleep(random.random() * 1)
            url.append(audio_url)
            file.write(audio_url + '\n')

if __name__ == "__main__":
    songmids, name = songmid()
    resolve(songmids, name)
    file.close()
--------------------------------------------------------------------------------
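A closing note on the QQ Music script: its two JSONP responses are unwrapped with fixed slices ([23:-1] and [9:-1]). A more tolerant helper, sketched here on the assumption that the responses really have the form callbackName({...}), cuts at the parentheses instead of counting characters.

def strip_jsonp(text):
    # Keep whatever sits between the first "(" and the last ")"
    return text[text.index('(') + 1: text.rindex(')')]

print(strip_jsonp('searchCallbacksong5150({"code": 0})'))   # {"code": 0}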