├── scrapy_auto ├── __init__.py ├── service │ ├── templates │ │ ├── test_movie.html │ │ ├── index.html │ │ └── show_data.html │ ├── static │ │ ├── img │ │ │ ├── word_cloud.png │ │ │ ├── work_count.png │ │ │ └── education_percentage.png │ │ └── js │ │ │ └── index_js2.js │ ├── __init__.py │ └── service.py ├── .DS_Store ├── items.pyc ├── __init__.pyc ├── config.pyc ├── settings.pyc ├── pipelines.pyc ├── middlewares.pyc ├── tools │ ├── .DS_Store │ ├── convers.pyc │ ├── __init__.pyc │ ├── common_parser.pyc │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── convers.cpython-37.pyc │ │ └── common_parser.cpython-37.pyc │ ├── __init__.py │ ├── convers.py │ ├── common_parser.py │ └── data_show.py ├── unit_test │ ├── demo.jpg │ ├── __init__.py │ ├── mydemo.py │ ├── crawl_crack.py │ ├── demo.py │ └── httpsProxys.py ├── spiders │ ├── __init__.pyc │ ├── article_spiders.pyc │ ├── employ_spiders.pyc │ ├── media_spiders.pyc │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── demo_spider.cpython-37.pyc │ │ ├── SearchSpiders.cpython-37.pyc │ │ ├── employ_spiders.cpython-37.pyc │ │ ├── media_spiders.cpython-37.pyc │ │ ├── FreeProxySpider.cpython-37.pyc │ │ ├── article_spiders.cpython-37.pyc │ │ └── company_kandian_spiders.cpython-37.pyc │ ├── __init__.py │ ├── demo_spider.py │ ├── bili_spider.py │ ├── FreeProxySpider.py │ ├── SearchSpiders.py │ ├── media_spiders.py │ ├── employ_spiders.py │ ├── company_kandian_spiders.py │ └── article_spiders.py ├── __pycache__ │ ├── items.cpython-37.pyc │ ├── config.cpython-37.pyc │ ├── __init__.cpython-37.pyc │ ├── pipelines.cpython-37.pyc │ └── settings.cpython-37.pyc ├── config.py ├── items.py ├── settings.py ├── pipelines.py └── middlewares.py ├── .DS_Store ├── .gitattributes ├── .idea ├── vcs.xml ├── encodings.xml ├── modules.xml ├── misc.xml ├── deployment.xml ├── webServers.xml ├── scrapy_pro.iml ├── codeStyles │ └── Project.xml └── dbnavigator.xml ├── proxy_list.json ├── scrapy.cfg ├── begin.py ├── requirements.txt └── README.md /scrapy_auto/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy_auto/service/templates/test_movie.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/.DS_Store -------------------------------------------------------------------------------- /scrapy_auto/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/.DS_Store -------------------------------------------------------------------------------- /scrapy_auto/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/items.pyc -------------------------------------------------------------------------------- /scrapy_auto/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/__init__.pyc -------------------------------------------------------------------------------- /scrapy_auto/config.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/config.pyc -------------------------------------------------------------------------------- /scrapy_auto/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/settings.pyc -------------------------------------------------------------------------------- /scrapy_auto/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/pipelines.pyc -------------------------------------------------------------------------------- /scrapy_auto/middlewares.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/middlewares.pyc -------------------------------------------------------------------------------- /scrapy_auto/tools/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/tools/.DS_Store -------------------------------------------------------------------------------- /scrapy_auto/tools/convers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/tools/convers.pyc -------------------------------------------------------------------------------- /scrapy_auto/tools/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/tools/__init__.pyc -------------------------------------------------------------------------------- /scrapy_auto/unit_test/demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/unit_test/demo.jpg -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.js linguist-language=python 2 | *.css linguist-language=python 3 | *.html linguist-language=python 4 | -------------------------------------------------------------------------------- /scrapy_auto/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/spiders/__init__.pyc -------------------------------------------------------------------------------- /scrapy_auto/tools/common_parser.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/tools/common_parser.pyc -------------------------------------------------------------------------------- /scrapy_auto/spiders/article_spiders.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/spiders/article_spiders.pyc -------------------------------------------------------------------------------- /scrapy_auto/spiders/employ_spiders.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/spiders/employ_spiders.pyc -------------------------------------------------------------------------------- /scrapy_auto/spiders/media_spiders.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/spiders/media_spiders.pyc -------------------------------------------------------------------------------- /scrapy_auto/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/service/static/img/word_cloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/service/static/img/word_cloud.png -------------------------------------------------------------------------------- /scrapy_auto/service/static/img/work_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/service/static/img/work_count.png -------------------------------------------------------------------------------- /scrapy_auto/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/tools/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/tools/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/tools/__pycache__/convers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/tools/__pycache__/convers.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/service/static/img/education_percentage.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/service/static/img/education_percentage.png -------------------------------------------------------------------------------- /scrapy_auto/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/spiders/__pycache__/demo_spider.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/spiders/__pycache__/demo_spider.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/tools/__pycache__/common_parser.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/tools/__pycache__/common_parser.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/spiders/__pycache__/SearchSpiders.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/spiders/__pycache__/SearchSpiders.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/spiders/__pycache__/employ_spiders.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/spiders/__pycache__/employ_spiders.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/spiders/__pycache__/media_spiders.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/spiders/__pycache__/media_spiders.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/spiders/__pycache__/FreeProxySpider.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/spiders/__pycache__/FreeProxySpider.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/spiders/__pycache__/article_spiders.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/spiders/__pycache__/article_spiders.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/spiders/__pycache__/company_kandian_spiders.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangge11/scrapy_pro/HEAD/scrapy_auto/spiders/__pycache__/company_kandian_spiders.cpython-37.pyc -------------------------------------------------------------------------------- /scrapy_auto/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for 
information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /scrapy_auto/service/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf8 3 | # Copyright 2017 SARRS Inc. All Rights Reserved. 4 | 5 | # @Time : 2019/3/20 14:49 6 | # @Author : 504747754@qq.com(ZengYang) 7 | # @File : __init__.py.py 8 | # @Software: PyCharm 9 | # @ToUse : -------------------------------------------------------------------------------- /scrapy_auto/tools/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf8 3 | # Copyright 2017 SARRS Inc. All Rights Reserved. 4 | 5 | # @Time : 2019/1/24 16:37 6 | # @Author : zengyang@tv365.net(ZengYang) 7 | # @File : __init__.py.py 8 | # @Software: PyCharm 9 | # @ToUse : -------------------------------------------------------------------------------- /scrapy_auto/unit_test/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf8 3 | # Copyright 2017 SARRS Inc. All Rights Reserved. 4 | 5 | # @Time : 2019/2/20 17:23 6 | # @Author : zengyang@tv365.net(ZengYang) 7 | # @File : __init__.py.py 8 | # @Software: PyCharm 9 | # @ToUse : -------------------------------------------------------------------------------- /proxy_list.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"proxy_scheme": "http", "proxy": "http://218.91.112.250:9999"}, 3 | {"proxy_scheme": "https", "proxy": "https://116.209.56.111:9999"}, 4 | {"proxy_scheme": "http", "proxy": "http://42.51.42.201:808"}, 5 | {"proxy_scheme": "https", "proxy": "https://222.135.92.68:38094"} 6 | ] -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scrapy_auto.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrapy_auto 12 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | 10 | -------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /.idea/webServers.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 14 | 15 | -------------------------------------------------------------------------------- /.idea/scrapy_pro.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 14 | -------------------------------------------------------------------------------- /scrapy_auto/service/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Echarts 6 | 9 | 12 | 13 | 14 |

Other data charts

15 |

Background: distribution of different job positions across cities nationwide
Current progress: only aggregate statistics across all positions so far

16 |
17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /scrapy_auto/service/templates/show_data.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title 6 | 7 | 8 |

Data source: the official Boss Zhipin website


9 |

Disclaimer: this project is purely a technical exploration; if any data infringes your rights, please contact <504747754@qq.com> to have it taken down
Time is limited, so the project is paused here for now; it will be updated when time allows

10 |

Education-level distribution chart
Background: distribution of education requirements across different positions
Current progress: only aggregate statistics across all positions so far

11 | 12 |
13 |

Skill word cloud (sample chart)
Background: keyword ranking of the skills required for different positions
Current progress: only Python positions so far

14 | 15 |
16 |

Position vs. count chart
Background: nationwide demand counts for different positions
Current progress: not all data has been imported yet; only part is covered so far

17 | 18 |
19 | 23 | 24 | -------------------------------------------------------------------------------- /scrapy_auto/service/service.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf8 3 | # Copyright 2017 SARRS Inc. All Rights Reserved. 4 | 5 | # @Time : 2019/3/20 14:49 6 | # @Author : 504747754@qq.com(ZengYang) 7 | # @File : service.py 8 | # @Software: PyCharm 9 | # @ToUse : data visualization service 10 | from flask import Flask, render_template 11 | 12 | app = Flask(__name__) 13 | 14 | 15 | @app.route('/', methods=["GET"]) 16 | def index(): 17 | return render_template('index.html') 18 | 19 | 20 | @app.route("/show_data") 21 | def education(): 22 | return render_template('show_data.html') 23 | 24 | 25 | @app.route("/test_movie") 26 | def test_movie(): 27 | return render_template('test_movie.html') 28 | 29 | 30 | if __name__ == '__main__': 31 | app.run(host='0.0.0.0', port=1080, debug=True) 32 |
-------------------------------------------------------------------------------- /begin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf8 3 | # Copyright 2017 SARRS Inc. All Rights Reserved. 4 | 5 | # @Time : 2019/3/19 23:39 6 | # @Author : 504747754@qq.com(ZengYang) 7 | # @File : begin.py 8 | # @Software: PyCharm 9 | # @ToUse : 10 | 11 | 12 | from scrapy import cmdline 13 | 14 | # cmdline.execute("scrapy crawl boss_spider".split()) 15 | # cmdline.execute("scrapy crawl xici_spider -o proxy_list.json".split()) 16 | # cmdline.execute("scrapy crawl demo_spider".split()) 17 | # cmdline.execute("scrapy crawl toutiao_add_spider -o items.json".split()) 18 | # cmdline.execute("scrapy crawl toutiao_all_spider -o items.json".split()) 19 | # cmdline.execute("scrapy crawl lanzhou_spider -o items.json".split()) 20 | cmdline.execute("scrapy crawl bili_spider".split()) 21 | 22 | 23 | 24 | 25 | 26 | 27 |
-------------------------------------------------------------------------------- /scrapy_auto/unit_test/mydemo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf8 3 | # Copyright 2017 SARRS Inc. All Rights Reserved. 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | plt.rcParams['font.sans-serif'] = ['FangSong'] 8 | plt.rcParams['axes.unicode_minus'] = False 9 | # set the figure size 10 | label = '超载', '船员责任心不强', '船员驾驶技术太差', '通航环境差', '海事、港航监管不到位', '船舶过于老旧', '冒险航行' # category labels 11 | color = 'red', 'orange', 'yellow', 'green', 'blue', 'gray', 'goldenrod' # category colors 12 | size = [34, 5, 6, 14, 1, 10, 23] # share of each category 13 | explode = (0.2, 0, 0, 0, 0, 0, 0) # radial offset per category (must match the number of categories) 14 | 15 | pie = plt.pie(size, colors=color, explode=explode, labels=label, shadow=True, autopct='%1.1f%%') 16 | # for digit in pie[2]: 17 | # digit.set_size(8) 18 | 19 | plt.axis('equal') 20 | plt.title('你认为砂石船发生事故的主要原因在于', fontsize=12) 21 | 22 | plt.legend(loc=0, bbox_to_anchor=(0.82, 1)) # legend 23 | # set the legend font size 24 | leg = plt.gca().get_legend() 25 | ltext = leg.get_texts() 26 | plt.setp(ltext, fontsize=6) 27 | plt.show() 28 | pass 29 |
-------------------------------------------------------------------------------- /scrapy_auto/spiders/demo_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf8 3 | # Copyright 2017 SARRS Inc. All Rights Reserved.
4 | 5 | # @Time : 2019/1/23 18:44 6 | # @Author : zengyang@tv365.net(ZengYang) 7 | # @File : demo_spider.py 8 | # @Software: PyCharm 9 | # @ToUse : 10 | import scrapy 11 | from scrapy import Request 12 | 13 | from scrapy_auto.items import JobItem 14 | 15 | 16 | class Demo1(scrapy.Spider): 17 | """ 18 | 测试spider的各种操作 19 | """ 20 | name = 'demo_spider' 21 | start_urls = [ 22 | 'https://www.baidu.com/' 23 | ] 24 | custom_settings = { 25 | 'CONCURRENT_REQUESTS': 50, 26 | 'DOWNLOAD_DELAY': 0.1, 27 | } 28 | 29 | def parse(self, response): 30 | yield Request(url='https://www.baidu.com/', callback=self.demo_item) 31 | 32 | def demo_item(self, response): 33 | while True: 34 | item = JobItem() 35 | for filed in list(item.fields.keys()): 36 | item[filed] = 'demo' 37 | yield item 38 | pass 39 | -------------------------------------------------------------------------------- /scrapy_auto/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf8 3 | # Copyright 2017 SARRS Inc. All Rights Reserved. 4 | 5 | # @Time : 2019/1/28 15:04 6 | # @Author : zengyang@tv365.net(ZengYang) 7 | # @File : config.py 8 | # @Software: PyCharm 9 | # @ToUse : 10 | 11 | 12 | parser_config = { 13 | 'all_spider': { 14 | 'title': '//title/text()', 15 | 'descr': '//meta[@name="description"]/text()|//meta[@name="Description"]/text()', 16 | 'keywords': '//meta[@name="keywords"]/text()|//meta[@name="Keywords"]/text()', 17 | }, 18 | 'cnys_spider': { 19 | 'content_original': '//div[@class="reads"]', 20 | }, 21 | 'w39_spider': { 22 | 'content_original': '//div[@class="art_con"]', 23 | }, 24 | 'verywellhealth_spider': { 25 | 'content_original': '//div[@class="loc chop-content "]|//div[@class="comp right-rail__offset taxonomy article-content expert-content"]', 26 | # 'loc content l-main',//article 27 | }, 28 | 'health_spider': { 29 | 'content_original': '//div[@class="article-content-container two-col-content-container"]', 30 | }, 31 | 'wsj_spider': { 32 | 'content_original': '//div[@class="wsj-snippet-body"]', 33 | }, 34 | } 35 | -------------------------------------------------------------------------------- /.idea/codeStyles/Project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 11 | 12 | 13 | 14 | 15 | 21 | 22 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /scrapy_auto/tools/convers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf8 3 | # Copyright 2017 SARRS Inc. All Rights Reserved. 
4 | 5 | # @Time : 2019/3/25 18:37 6 | # @Author : 504747754@qq.com(ZengYang) 7 | # @File : convers.py 8 | # @Software: PyCharm 9 | # @ToUse : 10 | import json 11 | import traceback 12 | 13 | 14 | def from_string_to_json(content): 15 | json_dict = {} 16 | try: 17 | json_dict = json.loads(content) 18 | json_dict = normalize_dict(json_dict) 19 | except Exception as e: 20 | traceback.print_exc() 21 | return json_dict 22 | 23 | 24 | def from_json_to_string(data): 25 | data = normalize_dict(data) 26 | return json.dumps(data, ensure_ascii=False) 27 | 28 | 29 | def normalize_dict(data): 30 | if type(data) == dict: 31 | new_data = {} 32 | for k in data: 33 | data[k] = normalize_dict(data[k]) 34 | if type(k) == str: 35 | new_data[k.encode('utf-8')] = data[k] 36 | else: 37 | new_data[k] = data[k] 38 | data = new_data 39 | elif type(data) == list: 40 | for i in range(0, len(data)): 41 | data[i] = normalize_dict(data[i]) 42 | elif type(data) == str: 43 | data = data.encode('utf-8') 44 | else: 45 | data = str(data) 46 | return data 47 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asn1crypto==0.24.0 2 | attrs==19.1.0 3 | Automat==0.7.0 4 | beautifulsoup4==4.7.1 5 | bs4==0.0.1 6 | certifi==2019.3.9 7 | cffi==1.12.2 8 | chardet==3.0.4 9 | Click==7.0 10 | constantly==15.1.0 11 | cryptography==2.6.1 12 | cssselect==1.0.3 13 | cycler==0.10.0 14 | Django==2.1.4 15 | dukpy==0.2.2 16 | echarts-china-cities-pypkg==0.0.9 17 | echarts-china-provinces-pypkg==0.0.3 18 | echarts-countries-pypkg==0.1.6 19 | et-xmlfile==1.0.1 20 | Flask==1.0.2 21 | future==0.17.1 22 | hyperlink==18.0.0 23 | idna==2.8 24 | incremental==17.5.0 25 | itsdangerous==1.1.0 26 | javascripthon==0.10 27 | jdcal==1.4.1 28 | jieba==0.39 29 | Jinja2==2.10 30 | jupyter-echarts-pypkg==0.1.2 31 | kiwisolver==1.0.1 32 | lml==0.0.2 33 | lxml==4.3.3 34 | macropy3==1.1.0b2 35 | MarkupSafe==1.1.1 36 | matplotlib==3.0.3 37 | numpy==1.16.2 38 | openpyxl==3.0.2 39 | pandas==0.24.2 40 | parsel==1.5.1 41 | Pillow==6.0.0 42 | pyasn1==0.4.5 43 | pyasn1-modules==0.2.4 44 | pycparser==2.19 45 | PyDispatcher==2.0.5 46 | pyecharts==0.5.11 47 | pyecharts-javascripthon==0.0.6 48 | pyecharts-jupyter-installer==0.0.3 49 | PyHamcrest==1.9.0 50 | pymongo==3.7.2 51 | PyMySQL==0.9.3 52 | pyOpenSSL==19.0.0 53 | pyparsing==2.3.1 54 | pypiwin32==223 55 | python-dateutil==2.8.0 56 | pytz==2019.1 57 | pywin32==224 58 | queuelib==1.5.0 59 | readability==0.3.1 60 | requests==2.22.0 61 | Scrapy==1.6.0 62 | selenium==3.141.0 63 | service-identity==18.1.0 64 | sh==1.12.14 65 | simplejson==3.16.0 66 | six==1.12.0 67 | soupsieve==1.9 68 | sqlparse==0.3.0 69 | tushare==1.2.35 70 | Twisted==18.9.0 71 | urllib3==1.24.1 72 | w3lib==1.20.0 73 | Werkzeug==0.15.2 74 | zope.interface==4.6.0 75 | -------------------------------------------------------------------------------- /scrapy_auto/spiders/bili_spider.py: -------------------------------------------------------------------------------- 1 | # @Time : 2019/12/19 8:51 PM 2 | # @Author : 504747754@qq.com(ZengYang) 3 | # @File : bili_spider.py 4 | # @Software : PyCharm 5 | # @ToUse : 6 | import json 7 | import re 8 | 9 | from scrapy import Request 10 | from scrapy.spiders import CrawlSpider 11 | 12 | # 1.了解爬虫执行原理 2.了解爬虫脚本 3.了解爬虫的框架 4.了解各种反爬 13 | from scrapy_auto.items import BiliItem 14 | 15 | 16 | class BiliSpider(CrawlSpider): 17 | """ 18 | 需求:采集b站番剧索引的151页的数据 19 | """ 20 | name = 'bili_spider' 21 | url = 
'https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page={page}&season_type=1&pagesize=20&type=1' 22 | custom_settings = { # per-spider custom settings; settings.py holds the global configuration 23 | 'ITEM_PIPELINES': { # pipelines control how scraped items are output; lower values mean higher priority 24 | 'scrapy_auto.pipelines.ExcelBiliPipeline': 10, 25 | }, 26 | } 27 | 28 | def start_requests(self): 29 | for pg in range(1, 2): 30 | yield Request(url=self.url.format(page=pg)) 31 | 32 | def parse(self, response): 33 | item_list = json.loads(response.text)['data']['list'] 34 | for item1 in item_list: 35 | item = BiliItem() 36 | item['is_vip'] = 1 if item1['badge'] else 0 37 | item['thumb'] = item1['cover'] 38 | item['episode'] = re.findall(r'\d+', item1['index_show'])[0] 39 | item['is_finish'] = item1['is_finish'] 40 | item['link_detail'] = item1['link'] 41 | item['fans_info'] = item1['order'] 42 | item['title'] = item1['title'] 43 | yield item 44 | pass 45 |
-------------------------------------------------------------------------------- /scrapy_auto/spiders/FreeProxySpider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf8 3 | # Copyright 2017 SARRS Inc. All Rights Reserved. 4 | 5 | # @Time : 2019/4/1 20:40 6 | # @Author : 504747754@qq.com(ZengYang) 7 | # @File : FreeProxySpider.py 8 | # @Software: PyCharm 9 | # @ToUse : crawl proxies from free proxy sites 10 | 11 | """ 12 | 1. Fetch proxies 13 | 2. Validate them 14 | 3. Store them for later use 15 | """ 16 | 17 | import scrapy 18 | from scrapy import Request 19 | from scrapy.exporters import JsonItemExporter 20 | 21 | 22 | class XiCiSpider(scrapy.Spider): 23 | name = 'xici_spider' 24 | allowed_domains = ['www.xicidaili.com'] 25 | start_urls = [ 26 | 'https://www.xicidaili.com/nn' 27 | ] 28 | custom_settings = { 29 | 'ITEM_PIPELINES': { 30 | }, 31 | 32 | } 33 | 34 | def parse(self, response): 35 | for sel in response.xpath('//table[@id="ip_list"]/tr[position()>1]'): 36 | ip = sel.css('td:nth-child(2)::text').extract_first() 37 | port = sel.css('td:nth-child(3)::text').extract_first() 38 | scheme = sel.css('td:nth-child(6)::text').extract_first().lower() 39 | proxy = '%s://%s:%s' % (scheme, ip, port) 40 | meta = { 41 | 'proxy': proxy, 'dont_retry': True, 'download_timeout': 10, '_proxy_scheme': scheme, '_proxy_ip': ip, 42 | } 43 | yield Request(url='%s://httpbin.org/ip' % scheme, callback=self.check_available, dont_filter=True, 44 | meta=meta) 45 | 46 | def check_available(self, response): 47 | proxy_ip = response.meta['_proxy_ip'] 48 | if proxy_ip in response.text: 49 | yield {'proxy_scheme': response.meta['_proxy_scheme'], 'proxy': response.meta['proxy'], } 50 |
-------------------------------------------------------------------------------- /scrapy_auto/unit_test/crawl_crack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf8 3 | # Copyright 2017 SARRS Inc. All Rights Reserved.
4 | 5 | # @Time : 2019/2/20 17:24 6 | # @Author : zengyang@tv365.net(ZengYang) 7 | # @File : crawl_crack.py 8 | # @Software: PyCharm 9 | # @ToUse : anti-crawling countermeasures for various spiders 10 | 11 | 12 | # slider-captcha handling 13 | import time 14 | import traceback 15 | 16 | from selenium.webdriver import ActionChains 17 | 18 | 19 | def move_slider(driver): 20 | while True: 21 | try: 22 | # locate the slider element 23 | slider = driver.find_element_by_xpath("//span[@id='nc_1_n1z']") 24 | track = get_track() 25 | move_to_gap(driver, slider, track) 26 | # check whether verification succeeded by reading the status text 27 | while True: 28 | try: 29 | text = driver.find_element_by_xpath("//span[@class='nc-lang-cnt']") 30 | break 31 | except: 32 | traceback.print_exc() 33 | continue 34 | # only three cases observed so far: success ("please enter the captcha below" / "please click the image"); no response ("please hold and drag the slider"); failure ("oops, it failed, please refresh") 35 | if text.text.startswith('验证通过'): 36 | break 37 | elif text.text.startswith('哎呀,出错了,点击刷新再来一次'): 38 | driver.find_element_by_xpath("//span[@class='nc-lang-cnt']/a").click() 39 | pass 40 | except Exception as e: 41 | traceback.print_exc() 42 | time.sleep(5) 43 | 44 | 45 | def get_track(distance=200): 46 | track = [] 47 | current = 0 48 | mid = distance * 3 / 4 49 | t = 0.2 50 | t = 0.9 51 | v = 0 52 | while current < distance: 53 | if current < mid: 54 | a = 2 55 | else: 56 | a = -3 57 | v0 = v 58 | v = v0 + a * t 59 | move = v0 * t + 1 / 2 * a * t * t 60 | current += move 61 | track.append(round(move)) 62 | return track 63 | 64 | 65 | def move_to_gap(driver, slider, track): 66 | try: 67 | ActionChains(driver).click_and_hold(slider).perform() 68 | for x in track: 69 | ActionChains(driver).move_by_offset(xoffset=x, yoffset=0).perform() 70 | time.sleep(0.1) 71 | ActionChains(driver).release().perform() 72 | except: 73 | traceback.print_exc() 74 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scrapy_pro 2 | 3 | #### Project background 4 | I have been doing crawler development for quite a while, handling scrapers for all kinds of sites and running into all kinds of problems along the way; 5 | Browsing communities and chat groups, I still see beginners asking simple questions that end up unanswered, usually for two reasons: 6 | 1. The question is poorly phrased, so experienced people who see it do not know how to answer 7 | 2. The question is fine, but everyone is busy; people mean to reply, get distracted, and then forget 8 | So I hope this project can cover the main technical points of crawling (see the technical points section) and serve as a reasonably good summary 9 | About the project name scrapy_auto: 10 | "Scrapy automation": Scrapy is a very efficient and practical framework, so this project starts from Scrapy and aims to be more efficient, faster, and more practical, solving the various problems that come up when using Scrapy in a targeted way 11 | 12 | #### Version 13 | 1. Basic architecture, jieba word segmentation, cleaning of tag attributes in scraped data 14 | 2. Version 1.0. 15 | Added a Boss Zhipin spider that crawls job postings by category and builds word-cloud statistics; 16 | Added a polling interface for job-search keywords; for now it only crawls a fixed set of job keywords; 17 | 3. Version 1.0.1. 18 | Boss Zhipin spider finished; crawling is pinned to fixed page levels to reduce the number of requests and improve performance 19 | 4. Version 1.0.2. 20 | Data persisted to the database; 21 | Due to limited time, data updates are full refreshes for now (a better approach would be to compare an MD5 of the content newly fetched for each URL against the previously stored content) 22 | 5. Version 1.0.3. 23 | Added visual analysis of the data 24 | 6. Version 1.0.4. 25 | Added visualization HTML pages 26 | 7. Version 1.0.5. 27 | Major change: given the hassle of Python 2 encodings and the end of official Python 2 maintenance on 2020-01-01, the project was converted from Python 2 to Python 3 28 | 8. Version 1.0.6. 29 | Added incremental and full crawling of Toutiao articles, integrated with scrapy-splash 30 | 9. Version 1.0.7. 31 | Added Lanzou cloud-drive data collection 32 | Version 1.0.8. Installation and startup guide 33 | Version 1.0.9. Bilibili anime (bangumi) crawling demo 34 | 35 | #TODO: 36 | 1. Generate per-position distribution charts and keyword dictionaries (time is limited; only a few demo datasets exist for now) —— 37 | 2. Migration from scrapy to scrapy-redis, and proxy configuration —— 38 | 39 | #### Related crawling technical points 40 | As an open-source project, this repository analyzes concrete examples of scraping many different kinds of sites, hoping to help readers develop their crawling skills further 41 | 1. Crawler system architecture (currently based on Scrapy) 42 | 2. Distributed crawling (scrapy-redis) 43 | 3. Anti-crawling countermeasures for various sites (login, cookie checks, User-Agent, request-payload encryption, cracking encrypted page data, JS deobfuscation, JS dynamic loading, AJAX-loaded data, IP bans, multi-layer encryption, captchas, etc.) 44 | 4. Log monitoring 45 | 5. Crawler performance optimization 46 | 6. Data storage options and their performance 47 | 7. Breadth-first vs. depth-first crawling 48 | 8. Monitoring crawler status and the data being collected 49 | 50 | #### Common problems and fixes 51 | 52 | q: ImportError: No module named win32api 53 | a: pip install pypiwin32 54 | 55 | q: error: Microsoft Visual C++ 14.0 is required.
Get it with "Microsoft Visual C++ Build Tools": http://landinghub.visualstudio.com/visual-cpp-build-tools 56 | a: https://segmentfault.com/a/1190000014782698 57 | 58 | q: AttributeError: module 'pip' has no attribute 'main' 59 | a: python -m pip install --upgrade pip==9.0.3 60 | 61 | q: how to batch-convert Python 2 code to Python 3 62 | a: https://blog.csdn.net/u012211419/article/details/51136232 63 | 64 | q: jinja2.exceptions.TemplateSyntaxError: unexpected char '\x9d' at 734926 65 | a: time is limited and there is no good fix yet; see: https://blog.csdn.net/qq_39241986/article/details/80680392 66 | 67 | q: distutils.errors.DistutilsError: Could not find suitable distribution for Requirement.parse('pytest-runner') 68 | a: pip install pytest-runner 69 | 70 | ### Requirements background: 71 | 1. Crawl recruitment-site data to compute per-position salary statistics, regional distribution, and skill-keyword rankings ==> demo_spider.py 72 | 2. Crawl Toutiao article data: 73 | 1) Freshness: poll the article directory roughly every 5 minutes, using the article URL as the unique key 74 | 2) Fields to capture (to be determined): 75 | 76 | ### Technical points 77 | Handling JS-rendered pages: 78 | headless browser (poor performance); scrapy-splash 79 | 80 | 81 | ### Installation and startup 82 | Using the Lanzou cloud-drive spider as an example: 83 | cd <your code directory>/scrapy_pro/ 84 | pip install -r requirements.txt 85 | scrapy crawl lanzhou_spider -o items.json 86 | After the run finishes, the scraped data can be inspected in items.json in the project directory 87 | 88 |
-------------------------------------------------------------------------------- /scrapy_auto/tools/common_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf8 3 | # Copyright 2017 SARRS Inc. All Rights Reserved. 4 | 5 | # @Time : 2019/1/24 16:37 6 | # @Author : zengyang@tv365.net(ZengYang) 7 | # @File : common_parser.py 8 | # @Software: PyCharm 9 | # @ToUse : 10 | import re 11 | 12 | 13 | def del_html_attr(page_source): 14 | # todo:
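A note on the incremental-update idea from the README: the Version 1.0.2 entry suggests comparing an MD5 of the content newly fetched for each URL against what was stored previously instead of doing full refreshes. The sketch below is a hypothetical illustration of that idea as a Scrapy item pipeline, not code from this repository; the field names url and content_original and the in-memory hash store are assumptions for illustration, and a real version would persist the digests alongside the stored data.

import hashlib

from scrapy.exceptions import DropItem


class ContentHashPipeline(object):
    """Drop items whose fetched content is unchanged since the previous crawl."""

    def __init__(self):
        # url -> MD5 digest of the content seen on the last run (kept in memory for illustration only)
        self.seen_hashes = {}

    def process_item(self, item, spider):
        url = item.get('url')  # assumed item field
        content = item.get('content_original') or ''  # assumed item field
        digest = hashlib.md5(content.encode('utf-8')).hexdigest()
        if self.seen_hashes.get(url) == digest:
            # unchanged since last run, skip re-storing it
            raise DropItem('content unchanged for %s' % url)
        self.seen_hashes[url] = digest
        return item

Hypothetically, such a pipeline would be enabled the same way the project enables its other pipelines, e.g. 'ITEM_PIPELINES': {'scrapy_auto.pipelines.ContentHashPipeline': 50} in a spider's custom_settings.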