├── lagoucrawl ├── __init__.py ├── book.txt ├── spiders │ ├── __init__.py │ └── lgcrawl.py ├── pipelines.py ├── items.py ├── settings.py ├── ip.csv └── middlewares.py ├── .idea ├── vcs.xml ├── misc.xml ├── modules.xml └── lgcrawl.iml ├── scrapy.cfg ├── README.MD ├── analysis.py ├── .gitignore ├── LagouJobCrawl.py └── LICENSE /lagoucrawl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lagoucrawl/book.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lagoucrawl/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = lagoucrawl.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = lagoucrawl 12 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # scrapy+splash 爬取拉勾全站职位信息 2 | 3 | ## 使用方法: 4 |
 5 | 1. Install Docker for your platform.
 6 | 2. Pull the Splash image: `sudo docker pull scrapinghub/splash`
 7 | 3. Start Splash: `sudo docker run -p 8050:8050 -p 8051:8051 scrapinghub/splash`
 8 | 4. `cd` into the project directory and run `scrapy crawl lgcrawl -o jobs_all.csv` to start the crawl (a script-based alternative is sketched below).
 9 | 
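If you prefer launching the crawl from Python instead of the `scrapy crawl` command line, here is a minimal sketch. The file name `run_crawl.py` is hypothetical; it assumes the project root (with `scrapy.cfg`) is the working directory and that Splash is already listening on port 8050. The feed settings use the older `FEED_URI`/`FEED_FORMAT` keys that match this project's Scrapy generation; newer Scrapy releases use the `FEEDS` dict instead.

```python
# run_crawl.py -- hypothetical helper, roughly equivalent to:
#   scrapy crawl lgcrawl -o jobs_all.csv
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():
    settings = get_project_settings()          # picks up lagoucrawl/settings.py via scrapy.cfg
    settings.set('FEED_URI', 'jobs_all.csv')   # legacy feed-export keys; newer Scrapy uses FEEDS
    settings.set('FEED_FORMAT', 'csv')

    process = CrawlerProcess(settings)
    process.crawl('lgcrawl')                   # spider name registered in spiders/lgcrawl.py
    process.start()                            # blocks until the crawl finishes


if __name__ == '__main__':
    main()
```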
10 | 11 | # 本项目代码尽供学习使用 12 | 13 | -------------------------------------------------------------------------------- /lagoucrawl/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class LagoucrawlPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | from os import path 4 | 5 | 6 | def parse_csv(): 7 | dir = path.dirname('.') 8 | print("dir:" + path.abspath('')) 9 | 10 | df = pd.read_csv(path.join(dir, 'jobs_all.csv')) 11 | r = df.groupby('classify_name') 12 | v = r.money.agg(['count', 'max', 'min']) 13 | print(v) 14 | 15 | 16 | if __name__ == "__main__": 17 | parse_csv() 18 | -------------------------------------------------------------------------------- /.idea/lgcrawl.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /lagoucrawl/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class LagoucrawlItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | job_name = scrapy.Field() 15 | money = scrapy.Field() 16 | company = scrapy.Field() 17 | classify_name = scrapy.Field() 18 | advantage = scrapy.Field() 19 | requirements = scrapy.Field() 20 | info = scrapy.Field() 21 | pass 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python,pycharm 3 | 4 | ### PyCharm ### 5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 7 | 8 | # User-specific stuff 9 | .idea/**/workspace.xml 10 | .idea/**/tasks.xml 11 | .idea/**/usage.statistics.xml 12 | .idea/**/dictionaries 13 | .idea/**/shelf 14 | 15 | # Generated files 16 | .idea/**/contentModel.xml 17 | 18 | # Sensitive or high-churn files 19 | .idea/**/dataSources/ 20 | .idea/**/dataSources.ids 21 | .idea/**/dataSources.local.xml 22 | .idea/**/sqlDataSources.xml 23 | .idea/**/dynamic.xml 24 | .idea/**/uiDesigner.xml 25 | .idea/**/dbnavigator.xml 26 | 27 | # Gradle 28 | .idea/**/gradle.xml 29 | .idea/**/libraries 30 | 31 | # Gradle and Maven with auto-import 32 | 
# When using Gradle or Maven with auto-import, you should exclude module files, 33 | # since they will be recreated, and may cause churn. Uncomment if using 34 | # auto-import. 35 | # .idea/modules.xml 36 | # .idea/*.iml 37 | # .idea/modules 38 | 39 | # CMake 40 | cmake-build-*/ 41 | 42 | # Mongo Explorer plugin 43 | .idea/**/mongoSettings.xml 44 | 45 | # File-based project format 46 | *.iws 47 | 48 | # IntelliJ 49 | out/ 50 | 51 | # mpeltonen/sbt-idea plugin 52 | .idea_modules/ 53 | 54 | # JIRA plugin 55 | atlassian-ide-plugin.xml 56 | 57 | # Cursive Clojure plugin 58 | .idea/replstate.xml 59 | 60 | # Crashlytics plugin (for Android Studio and IntelliJ) 61 | com_crashlytics_export_strings.xml 62 | crashlytics.properties 63 | crashlytics-build.properties 64 | fabric.properties 65 | 66 | # Editor-based Rest Client 67 | .idea/httpRequests 68 | 69 | ### PyCharm Patch ### 70 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 71 | 72 | # *.iml 73 | # modules.xml 74 | # .idea/misc.xml 75 | # *.ipr 76 | 77 | # Sonarlint plugin 78 | .idea/sonarlint 79 | 80 | ### Python ### 81 | # Byte-compiled / optimized / DLL files 82 | __pycache__/ 83 | *.py[cod] 84 | *$py.class 85 | 86 | # C extensions 87 | *.so 88 | 89 | # Distribution / packaging 90 | .Python 91 | build/ 92 | develop-eggs/ 93 | dist/ 94 | downloads/ 95 | eggs/ 96 | .eggs/ 97 | lib/ 98 | lib64/ 99 | parts/ 100 | sdist/ 101 | var/ 102 | wheels/ 103 | *.egg-info/ 104 | .installed.cfg 105 | *.egg 106 | MANIFEST 107 | 108 | # PyInstaller 109 | # Usually these files are written by a python script from a template 110 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 111 | *.manifest 112 | *.spec 113 | 114 | # Installer logs 115 | pip-log.txt 116 | pip-delete-this-directory.txt 117 | 118 | # Unit test / coverage reports 119 | htmlcov/ 120 | .tox/ 121 | .coverage 122 | .coverage.* 123 | .cache 124 | nosetests.xml 125 | coverage.xml 126 | *.cover 127 | .hypothesis/ 128 | .pytest_cache/ 129 | 130 | # Translations 131 | *.mo 132 | *.pot 133 | 134 | # Django stuff: 135 | *.log 136 | local_settings.py 137 | db.sqlite3 138 | 139 | # Flask stuff: 140 | instance/ 141 | .webassets-cache 142 | 143 | # Scrapy stuff: 144 | .scrapy 145 | 146 | # Sphinx documentation 147 | docs/_build/ 148 | 149 | # PyBuilder 150 | target/ 151 | 152 | # Jupyter Notebook 153 | .ipynb_checkpoints 154 | 155 | # IPython 156 | profile_default/ 157 | ipython_config.py 158 | 159 | # pyenv 160 | .python-version 161 | 162 | # celery beat schedule file 163 | celerybeat-schedule 164 | 165 | # SageMath parsed files 166 | *.sage.py 167 | 168 | # Environments 169 | .env 170 | .venv 171 | env/ 172 | venv/ 173 | ENV/ 174 | env.bak/ 175 | venv.bak/ 176 | 177 | # Spyder project settings 178 | .spyderproject 179 | .spyproject 180 | 181 | # Rope project settings 182 | .ropeproject 183 | 184 | # mkdocs documentation 185 | /site 186 | 187 | # mypy 188 | .mypy_cache/ 189 | .dmypy.json 190 | dmypy.json 191 | 192 | ### Python Patch ### 193 | .venv/ 194 | 195 | ### Python.VirtualEnv Stack ### 196 | # Virtualenv 197 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 198 | [Bb]in 199 | [Ii]nclude 200 | [Ll]ib 201 | [Ll]ib64 202 | [Ll]ocal 203 | [Ss]cripts 204 | pyvenv.cfg 205 | pip-selfcheck.json 206 | 207 | 208 | # End of https://www.gitignore.io/api/python,pycharm -------------------------------------------------------------------------------- /lagoucrawl/settings.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for lagoucrawl project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'lagoucrawl' 13 | 14 | SPIDER_MODULES = ['lagoucrawl.spiders'] 15 | NEWSPIDER_MODULE = 'lagoucrawl.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'lagoucrawl (+http://www.yourdomain.com)' 19 | 20 | 21 | 22 | SPLASH_COOKIES_DEBUG = True 23 | 24 | # Obey robots.txt rules 25 | ROBOTSTXT_OBEY = False 26 | 27 | # splash 服务器地址 28 | SPLASH_URL = "http://localhost:8050" 29 | 30 | 31 | DOWNLOAD_DELAY = 3 32 | 33 | HEADERS = { 34 | 'Connection': 'keep-alive', 35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36' 36 | } 37 | META = { 38 | 'dont_redirect': True, 39 | 'handle_httpstatus_list': [301, 302, 503, 400, 404] 40 | 41 | } 42 | 43 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 44 | # CONCURRENT_REQUESTS = 32 45 | 46 | # Configure a delay for requests for the same website (default: 0) 47 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 48 | # See also autothrottle settings and docs 49 | # DOWNLOAD_DELAY = 3 50 | # The download delay setting will honor only one of: 51 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 52 | # CONCURRENT_REQUESTS_PER_IP = 16 53 | 54 | # Disable cookies (enabled by default) 55 | COOKIES_ENABLED = True 56 | 57 | # Disable Telnet Console (enabled by default) 58 | # TELNETCONSOLE_ENABLED = False 59 | 60 | # Override the default request headers: 61 | # DEFAULT_REQUEST_HEADERS = { 62 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 63 | # 'Accept-Language': 'en', 64 | # } 65 | 66 | 67 | # user agent 68 | USER_AGENT_POOL = [ 69 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", 70 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36", 71 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393" 72 | , 73 | 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36' 74 | ] 75 | # Enable or disable spider middlewares 76 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 77 | SPIDER_MIDDLEWARES = { 78 | # 'lagou.middlewares.LagouSpiderMiddleware': 543,HTTPCACHE_STORAGE ='scrapy_splash.SplashAwareFSCacheStorage' 79 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 99, 80 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100 81 | } 82 | HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' 83 | 84 | # Enable or disable downloader middlewares 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 86 | DOWNLOADER_MIDDLEWARES = { 87 | # 'lagou.middlewares.LagouDownloaderMiddleware': 543, 88 | 'scrapy_splash.SplashCookiesMiddleware': 723, 89 | 'scrapy_splash.SplashMiddleware': 725, 90 | 
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 91 | 'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': None, 92 | # 'lagoucrawl.middlewares.MyproxiesSpiderMiddleware': 125, 93 | 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None, 94 | 'lagoucrawl.middlewares.MyUserAgentMiddleware': 100, 95 | 'lagoucrawl.middlewares.MyRetryMiddleware': 1 96 | 97 | } 98 | DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter" 99 | RETRY_ENABLED = True 100 | RETRY_TIMES = 10 101 | RETRY_HTTP_CODES = { 102 | 400, 404, 500, 503 103 | } 104 | # Enable and configure the AutoThrottle extension (disabled by default) 105 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 106 | # AUTOTHROTTLE_ENABLED = True 107 | # The initial download delay 108 | # AUTOTHROTTLE_START_DELAY = 5 109 | # The maximum download delay to be set in case of high latencies 110 | # AUTOTHROTTLE_MAX_DELAY = 60 111 | # The average number of requests Scrapy should be sending in parallel to 112 | # each remote server 113 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 114 | # Enable showing throttling stats for every response received: 115 | # AUTOTHROTTLE_DEBUG = False 116 | 117 | # Enable and configure HTTP caching (disabled by default) 118 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 119 | # HTTPCACHE_ENABLED = True 120 | # HTTPCACHE_EXPIRATION_SECS = 0 121 | # HTTPCACHE_DIR = 'httpcache' 122 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 123 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 124 | -------------------------------------------------------------------------------- /lagoucrawl/spiders/lgcrawl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | import scrapy 5 | import time 6 | from scrapy_splash import SplashRequest 7 | 8 | from lagoucrawl.items import LagoucrawlItem 9 | from scrapy.conf import settings 10 | import browsercookie 11 | 12 | lua_script = ''' 13 | function main(splash) 14 | print("cookies.type:",type(splash.args.cookies)) 15 | for k, v in pairs(splash.args.cookies) do 16 | 17 | local cookie = splash:add_cookie{k, v,"/",domain=".lagou.com"} 18 | print("add after cookie:",cookie) 19 | end 20 | splash:init_cookies(splash:get_cookies()) 21 | splash:go(splash.args.url) 22 | 23 | splash:wait(2) 24 | 25 | return splash:html() 26 | end 27 | ''' 28 | 29 | 30 | class LgcrawlSpider(scrapy.Spider): 31 | name = 'lgcrawl' 32 | allowed_domains = ['www.lagou.com'] 33 | start_urls = ['http://www.lagou.com/'] 34 | baseurl = "https://www.lagou.com/zhaopin/" 35 | meta = settings['META'] 36 | splash_args = {} 37 | cookies = {} 38 | 39 | def start_requests(self): 40 | yield scrapy.Request(url=self.start_urls[0], callback=self.start_parse_job) 41 | 42 | def start_parse_job(self, response): 43 | url_jobs = response.css('.sidebar .mainNavs .menu_box .menu_sub dd a.curr') 44 | 45 | cookie_k = [] 46 | cookie_v = [] 47 | for cookie in browsercookie.chrome(): 48 | if ('www.lagou.com'.rfind(str(cookie.domain)) != -1): 49 | # print("cookie:" + str(cookie.domain)) 50 | # print("cookie:" + str(cookie.name)) 51 | cookie_k.append(cookie.name) 52 | cookie_v.append(cookie.value) 53 | self.cookies = dict(zip(cookie_k, cookie_v)) 54 | 55 | headers = { 56 | "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', 57 | 58 | } 59 | 
splash_args = { 60 | 'wait': 5, 61 | "http_method": "GET", 62 | # "images":0, 63 | 64 | "render_all": 1, 65 | "headers": headers, 66 | 'lua_source': lua_script, 67 | "cookies": self.cookies, 68 | 69 | } 70 | self.splash_args = splash_args 71 | 72 | for url_job in url_jobs: 73 | classify_href = url_job.xpath('@href').extract_first() 74 | classify_name = url_job.xpath('text()').extract_first() 75 | url = classify_href + "1/?filterOption=2" 76 | 77 | yield SplashRequest(url=url, endpoint='execute', 78 | meta={'classify_name': classify_name, 'classify_href': classify_href}, 79 | callback=self.parse_total_page, 80 | dont_filter=True, 81 | args=splash_args, cache_args=['lua_source']) 82 | 83 | def parse_total_page(self, response): 84 | total_page = '0' 85 | try: 86 | total_page = response.xpath('//*[@id="order"]/li/div[4]/div/span[2]/text()').extract_first() 87 | print("total_page:" + total_page) 88 | except Exception as e: 89 | total_page = '0' 90 | classify_href = response.meta['classify_href'] 91 | for i in range(1, int(total_page) + 1): 92 | url = classify_href + "/%s/?filterOption=2" % i 93 | if i % random.randint(1, 9) == 0: # 随机延时 94 | time.sleep(random.randint(1, 2)) 95 | yield SplashRequest(url=url, endpoint='execute', meta={'classify_name': response.meta['classify_name']}, 96 | callback=self.parse_item, 97 | dont_filter=True, 98 | args=self.splash_args, cache_args=['lua_source']) 99 | 100 | def parse_item(self, response): 101 | 102 | list = response.xpath('//*[@class="con_list_item default_list"]') 103 | print("parse,response.length:" + str(len(response.text)) + ",list.length:" + str( 104 | len(list.extract())) + ",url:" + response.url) 105 | title = response.xpath('/html/head/title/text()').extract_first() 106 | if len(list.extract()) == 0: 107 | print("list 0,title:%s" % title) 108 | else: 109 | print("list %d title %s" % (len(list), title)) 110 | for li in list: 111 | position = li.xpath('./div[@class="list_item_top"]/div[@class="position"]') 112 | job_name = position.xpath('./div[@class="p_top"]/a/h3/text()').extract_first() 113 | job_info_url = position.xpath('./div[@class="p_top"]/a/@href').extract_first() 114 | money = position.xpath('./div[@class="p_bot"]/div[@class="li_b_l"]/span/text()').extract_first() 115 | company = li.xpath( 116 | './div[@class="list_item_top"]/div[@class="company"]/div[@class="company_name"]/a/text()').extract_first() 117 | 118 | yield SplashRequest(url=job_info_url, endpoint='execute', 119 | meta={'job_name': job_name, 'money': money, 'company': company, 120 | 'classify_name': response.meta['classify_name']}, 121 | callback=self.parse_info, dont_filter=True, 122 | args=self.splash_args, cache_args=['lua_source']) 123 | 124 | def parse_info(self, response): 125 | item = LagoucrawlItem() 126 | item['job_name'] = response.meta['job_name'] 127 | item['money'] = response.meta['money'] 128 | item['company'] = response.meta['company'] 129 | item['classify_name'] = response.meta['classify_name'] 130 | item['advantage'] = str(response.css('.job-advantage p::text').extract()) 131 | item['requirements'] = str(response.css('.job_bt p::text').extract()) 132 | item['info'] = str(response.css('.position-head .position-content .position-content-l .job_request p').xpath( 133 | './span/text()').extract()) 134 | 135 | print('item:' + str(item)) 136 | yield item 137 | -------------------------------------------------------------------------------- /LagouJobCrawl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 
2 | import requests 3 | import browsercookie 4 | import json 5 | import time 6 | import xlwt 7 | import sys 8 | from urllib.parse import urlencode 9 | 10 | class LgCrawl(object): 11 | def __init__(self, city,job, pageNum): 12 | """ 13 | 14 | :param job: 工作名字 15 | :param pageNum: 爬取页数 16 | """ 17 | self.job = job 18 | self.city = urlencode({"city": city}) 19 | self.excelName = u'%s-%s.xls' % (self.job, int(time.time())) 20 | self.pageNum = pageNum 21 | self.currentRow = 0 22 | self.book = xlwt.Workbook(encoding='utf-8', style_compression=0) 23 | self.sheet = self.book.add_sheet(self.job, cell_overwrite_ok=True) 24 | 25 | def go(self): 26 | for page in range(self.pageNum): 27 | self.crawl(str(page + 1)) 28 | 29 | def crawl(self, page): 30 | cookie_k = [] 31 | cookie_v = [] 32 | for cookie in browsercookie.chrome(): 33 | if 'www.lagou.com'.rfind(str(cookie.domain)) != -1: 34 | # print("cookie:" + str(cookie.domain)) 35 | # print("cookie:" + str(cookie.name)) 36 | cookie_k.append(cookie.name) 37 | cookie_v.append(cookie.value) 38 | cookies = dict(zip(cookie_k, cookie_v)) 39 | 40 | head = dict() 41 | head['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8' 42 | head[ 43 | 'User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36' 44 | head['Accept'] = 'application/json, text/javascript, */*; q=0.01' 45 | head['Accept-Encoding'] = 'gzip, deflate, br' 46 | head['Accept-Language'] = 'zh-CN,zh;q=0.9' 47 | head['X-Requested-With'] = 'XMLHttpRequest' 48 | head['X-Anit-Forge-Token'] = 'None' 49 | head['X-Anit-Forge-Code'] = '0' 50 | head['X-Requested-With'] = 'XMLHttpRequest' 51 | head['Referer'] = 'https://www.lagou.com/jobs/list_%s?labelWords=&fromSearch=true&suginput='%self.job 52 | head['Origin'] = 'https://www.lagou.com' 53 | data = dict() 54 | if page == '1': 55 | data['first'] = 'true' 56 | else: 57 | data['first'] = 'false' 58 | data['pn'] = page 59 | data['kd'] = self.job 60 | print("cookies:" + str(cookies)) 61 | print("header:" + str(head)) 62 | print('data:' + str(data)) 63 | resp = requests.post( 64 | url="https://www.lagou.com/jobs/positionAjax.json?px=default&%s&needAddtionalResult=false"%self.city, 65 | cookies=cookies, headers=head, data=data) 66 | print("resp:" + str(resp.content)) 67 | # result = json.loads(resp.content)['content']['positionResult']['result'] 68 | if 'success' in json.loads(resp.content): 69 | result = json.loads(resp.content)['content']['positionResult']['result'] 70 | for r in result: 71 | self.writeExcel(r) 72 | # print("excelName:"+self.excelName) 73 | self.book.save(self.excelName) 74 | else: 75 | print("error:" + json.loads(resp.content)['msg']) 76 | 77 | def writeExcel(self, r): 78 | companyShortName = r['companyShortName'] 79 | industryField = r['industryField'] 80 | education = r['education'] 81 | workYear = r['workYear'] 82 | positionAdvantage = r['positionAdvantage'] 83 | createTime = r['createTime'] 84 | salary = r['salary'] 85 | positionName = r['positionName'] 86 | companySize = r['companySize'] 87 | financeStage = r['financeStage'] 88 | companyLabelList = r['companyLabelList'] 89 | district = r['district'] 90 | positionLables = r['positionLables'] 91 | industryLables = r['industryLables'] 92 | businessZones = r['businessZones'] 93 | companyFullName = r['companyFullName'] 94 | hitags = r['hitags'] 95 | subwayline = r['subwayline'] 96 | stationname = r['stationname'] 97 | skillLables = r['skillLables'] 98 | linestaion = r['linestaion'] 99 | firstType = r['firstType'] 
100 | secondType = r['secondType'] 101 | thirdType = r['thirdType'] 102 | print(str(r) + ",currentRow:" + str(self.currentRow)) 103 | 104 | self.sheet.write(self.currentRow, 0, companyShortName) 105 | self.sheet.write(self.currentRow, 1, industryField) 106 | self.sheet.write(self.currentRow, 2, education) 107 | self.sheet.write(self.currentRow, 3, workYear) 108 | self.sheet.write(self.currentRow, 4, positionAdvantage) 109 | self.sheet.write(self.currentRow, 5, createTime) 110 | self.sheet.write(self.currentRow, 6, salary) 111 | self.sheet.write(self.currentRow, 7, positionName) 112 | self.sheet.write(self.currentRow, 8, companySize) 113 | self.sheet.write(self.currentRow, 9, financeStage) 114 | self.sheet.write(self.currentRow, 10, companyLabelList) 115 | self.sheet.write(self.currentRow, 11, district) 116 | self.sheet.write(self.currentRow, 12, positionLables) 117 | self.sheet.write(self.currentRow, 13, industryLables) 118 | self.sheet.write(self.currentRow, 14, skillLables) 119 | self.sheet.write(self.currentRow, 15, companyFullName) 120 | self.sheet.write(self.currentRow, 16, businessZones) 121 | self.sheet.write(self.currentRow, 17, hitags) 122 | self.sheet.write(self.currentRow, 18, subwayline) 123 | self.sheet.write(self.currentRow, 19, stationname) 124 | 125 | self.sheet.write(self.currentRow, 20, linestaion) 126 | self.sheet.write(self.currentRow, 21, firstType) 127 | self.sheet.write(self.currentRow, 22, secondType) 128 | self.sheet.write(self.currentRow, 23, thirdType) 129 | self.sheet.write(self.currentRow, 24, companySize) 130 | self.currentRow += 1 131 | 132 | 133 | ###python LagouJobCrawl.py 城市 职业 页数开始爬取 134 | if __name__ == "__main__": 135 | city=sys.argv[1] 136 | job=sys.argv[2] 137 | page=sys.argv[3] 138 | LgCrawl(city=city,job=job, pageNum=int(page)).go() 139 | 140 | -------------------------------------------------------------------------------- /lagoucrawl/ip.csv: -------------------------------------------------------------------------------- 1 | ip_port 2 | 115.223.232.230:9000 3 | 120.77.247.147:80 4 | 118.24.185.22:80 5 | 124.42.68.152:90 6 | 182.111.64.8:32926 7 | 27.198.95.199:8118 8 | 210.72.14.142:80 9 | 117.90.1.227:9000 10 | 218.75.70.3:8118 11 | 222.214.248.13:9000 12 | 115.196.50.218:9000 13 | 101.80.105.128:58072 14 | 180.213.175.228:8118 15 | 117.87.176.131:9000 16 | 114.235.23.235:9000 17 | 118.178.227.171:80 18 | 112.240.183.30:9000 19 | 218.65.64.136:9000 20 | 116.55.177.174:27552 21 | 117.90.2.237:9000 22 | 180.118.247.168:9000 23 | 117.90.0.181:9000 24 | 42.87.18.128:80 25 | 183.129.207.80:36127 26 | 183.158.207.170:9000 27 | 59.62.41.163:9000 28 | 115.193.99.13:9000 29 | 117.90.252.123:9000 30 | 42.242.51.193:37735 31 | 223.150.38.232:9000 32 | 115.193.101.133:9000 33 | 115.154.43.242:8123 34 | 202.104.113.35:53281 35 | 117.90.137.154:9000 36 | 180.118.86.126:9000 37 | 180.118.134.91:9000 38 | 223.150.38.30:9000 39 | 115.218.222.183:9000 40 | 118.24.16.115:8118 41 | 112.243.171.37:53281 42 | 223.150.38.159:9000 43 | 101.200.50.18:8118 44 | 218.65.68.229:9000 45 | 182.88.188.26:8123 46 | 112.240.177.202:9000 47 | 115.196.54.93:9000 48 | 115.218.216.157:9000 49 | 222.94.7.123:8123 50 | 114.235.22.218:9000 51 | 202.107.195.217:80 52 | 47.106.92.90:8081 53 | 49.70.209.135:9000 54 | 180.118.86.211:9000 55 | 180.118.134.56:9000 56 | 180.118.135.127:9000 57 | 117.64.236.108:808 58 | 218.95.82.89:9000 59 | 125.110.74.130:9000 60 | 121.232.194.171:9000 61 | 115.210.29.167:9000 62 | 113.109.25.245:8118 63 | 211.136.127.125:80 64 | 115.218.223.176:9000 65 | 
180.104.63.35:9000 66 | 180.118.86.224:9000 67 | 110.73.8.87:8123 68 | 101.76.218.58:1080 69 | 112.240.183.251:9000 70 | 115.218.210.3:9000 71 | 60.208.32.201:80 72 | 59.62.7.79:9000 73 | 115.151.229.7:9000 74 | 183.129.244.17:10800 75 | 180.118.134.21:9000 76 | 218.95.82.51:9000 77 | 59.62.25.86:9000 78 | 180.118.247.129:9000 79 | 180.104.62.123:9000 80 | 125.119.219.176:9000 81 | 183.147.223.98:9000 82 | 218.64.147.93:9000 83 | 180.118.86.207:9000 84 | 115.223.211.222:9000 85 | 110.72.193.11:80 86 | 110.73.2.225:8123 87 | 115.218.215.82:9000 88 | 223.150.38.189:9000 89 | 116.55.174.91:38724 90 | 182.129.243.93:9000 91 | 60.191.57.79:10080 92 | 180.118.128.244:9000 93 | 121.232.194.196:9000 94 | 120.34.102.75:53281 95 | 110.188.0.56:47830 96 | 183.158.203.74:9000 97 | 115.193.100.227:9000 98 | 122.234.204.44:9000 99 | 122.77.155.214:8123 100 | 180.118.86.10:9000 101 | 106.14.93.74:8118 102 | 115.218.222.75:9000 103 | 117.90.0.215:9000 104 | 114.235.22.52:9000 105 | 115.218.219.1:9000 106 | 47.96.12.3:8118 107 | 182.87.143.81:9000 108 | 182.105.10.97:9000 109 | 115.211.41.154:9000 110 | 112.74.108.145:80 111 | 112.193.128.202:8118 112 | 220.176.90.215:9000 113 | 223.150.38.210:9000 114 | 58.22.210.182:9000 115 | 220.176.90.198:9000 116 | 121.232.148.206:9000 117 | 117.90.137.131:9000 118 | 59.62.35.92:9000 119 | 180.118.134.38:9000 120 | 49.69.125.62:53281 121 | 58.18.225.234:80 122 | 223.150.38.10:9000 123 | 211.136.127.125:80 124 | 180.118.134.103:9000 125 | 60.188.38.186:9000 126 | 223.150.38.60:9000 127 | 114.235.22.128:9000 128 | 114.234.82.149:9000 129 | 182.96.245.111:9000 130 | 211.136.127.125:80 131 | 125.110.111.198:9000 132 | 106.12.22.41:8118 133 | 60.191.57.78:10800 134 | 123.249.88.153:9000 135 | 182.92.102.29:80 136 | 202.104.113.35:53281 137 | 106.14.206.26:8118 138 | 115.223.215.144:9000 139 | 123.133.206.4:9000 140 | 27.214.49.138:9000 141 | 114.234.83.179:9000 142 | 210.72.14.142:80 143 | 113.251.218.218:8123 144 | 218.64.148.182:9000 145 | 115.223.212.32:9000 146 | 183.158.203.57:9000 147 | 119.187.74.78:9000 148 | 182.105.1.151:9000 149 | 115.210.31.204:9000 150 | 111.160.123.110:80 151 | 118.117.137.76:9000 152 | 115.218.223.202:9000 153 | 114.234.82.183:9000 154 | 183.129.244.17:31773 155 | 221.202.72.250:53281 156 | 223.150.39.39:9000 157 | 124.42.68.152:90 158 | 115.223.193.13:9000 159 | 117.90.1.220:9000 160 | 183.129.207.80:21776 161 | 121.69.70.182:8118 162 | 180.118.92.88:9000 163 | 47.94.230.42:9999 164 | 139.196.51.201:8118 165 | 115.223.231.108:9000 166 | 183.129.244.17:35618 167 | 117.90.1.53:9000 168 | 218.6.16.233:8118 169 | 218.64.147.130:9000 170 | 115.212.39.248:9000 171 | 223.150.39.202:9000 172 | 114.235.23.13:9000 173 | 49.81.125.60:9000 174 | 117.90.5.58:9000 175 | 117.90.2.138:9000 176 | 115.153.173.148:9000 177 | 115.218.215.250:9000 178 | 180.76.136.77:9999 179 | 183.3.221.186:8118 180 | 114.234.81.111:9000 181 | 59.62.25.98:9000 182 | 182.106.193.46:9000 183 | 49.81.125.90:9000 184 | 218.95.82.27:9000 185 | 115.211.35.197:9000 186 | 180.118.247.188:9000 187 | 182.139.110.130:9000 188 | 117.90.2.109:9000 189 | 27.192.174.49:9000 190 | 115.153.167.165:9000 191 | 117.87.176.117:9000 192 | 123.133.204.49:9000 193 | 117.90.137.209:9000 194 | 27.206.178.39:9000 195 | 218.95.51.131:9000 196 | 175.44.155.54:9000 197 | 218.64.147.207:9000 198 | 211.136.127.125:80 199 | 182.87.142.138:9000 200 | 115.211.33.69:9000 201 | 121.232.199.76:9000 202 | 122.234.205.137:9000 203 | 139.196.51.201:8118 204 | 219.136.252.124:53281 205 | 115.213.230.107:9000 206 | 
115.223.244.222:9000 207 | 59.62.6.103:9000 208 | 223.150.38.211:9000 209 | 121.69.70.182:8118 210 | 58.49.72.141:8888 211 | 121.69.70.182:8118 212 | 114.234.83.213:9000 213 | 183.129.244.17:45745 214 | 119.167.153.50:8118 215 | 117.90.1.88:9000 216 | 180.118.73.114:9000 217 | 180.104.63.204:9000 218 | 60.191.57.78:10800 219 | 183.158.206.33:9000 220 | 101.251.240.194:8080 221 | 117.90.3.6:9000 222 | 58.18.225.234:80 223 | 117.90.5.37:9000 224 | 18.213.137.107:8080 225 | 114.119.41.60:8118 226 | 115.223.232.230:9000 227 | 123.249.88.153:9000 228 | 125.117.133.169:9000 229 | 115.223.214.26:9000 230 | 180.118.93.22:9000 231 | 115.223.228.86:9000 232 | 123.249.88.153:9000 233 | 117.90.7.13:9000 234 | 113.200.26.173:37605 235 | 27.206.76.61:9000 236 | 114.235.23.64:9000 237 | 175.10.25.132:8118 238 | 123.133.205.133:9000 239 | 182.107.13.85:9000 240 | 117.90.137.190:9000 241 | 223.150.38.68:9000 242 | 163.125.30.169:8118 243 | 121.31.155.67:8123 244 | 125.117.132.175:9000 245 | 120.236.178.117:8118 246 | 117.65.46.11:41766 247 | 180.104.62.223:9000 248 | 223.150.38.234:9000 249 | 27.220.49.106:9000 250 | 115.211.41.213:9000 251 | 36.106.188.200:8118 252 | 180.118.93.27:9000 253 | 117.65.45.163:41766 254 | 115.151.242.168:9000 255 | 60.191.57.78:10080 256 | 117.87.177.110:9000 257 | 180.118.134.145:9000 258 | 180.118.86.210:9000 259 | 180.76.187.102:1080 260 | 115.193.96.26:9000 261 | 183.154.213.117:9000 262 | 110.72.242.203:53281 263 | 222.214.249.174:9000 264 | 182.141.44.197:9000 265 | 47.105.50.84:80 266 | 182.87.139.142:9000 267 | 121.31.177.4:8123 268 | 182.141.46.2:9000 269 | 115.193.96.140:9000 270 | 117.90.6.153:9000 271 | 58.49.72.141:8888 272 | 121.232.194.73:9000 273 | 171.39.2.56:8123 274 | 115.211.38.180:9000 275 | 115.211.43.228:9000 276 | 121.232.199.108:9000 277 | 117.90.3.65:9000 278 | 117.90.252.13:9000 279 | 183.158.207.151:9000 280 | 180.118.134.209:9000 281 | 115.216.37.35:9000 282 | 119.190.15.34:53281 283 | 223.150.38.62:9000 284 | 183.158.203.94:9000 285 | 175.44.152.226:9000 286 | 218.95.51.226:9000 287 | -------------------------------------------------------------------------------- /lagoucrawl/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | import time 8 | import random 9 | from scrapy import signals 10 | 11 | from lagoucrawl.settings import USER_AGENT_POOL 12 | 13 | from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware 14 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 15 | 16 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 17 | import logging 18 | from scrapy.exceptions import NotConfigured 19 | from scrapy.utils.response import response_status_message 20 | from os import path 21 | import csv 22 | from itertools import islice 23 | 24 | ips = [] 25 | 26 | 27 | def input_ips(): 28 | dir = path.dirname('.') 29 | print("dir:" + path.abspath(dir)) 30 | print("start ips.length") 31 | with open(path.join(dir, 'lagoucrawl/ip.csv'), 'r') as f: 32 | lines = csv.reader(f) 33 | for line in islice(lines, 1, None): 34 | # print("line.length:" + str(len(line))) 35 | if len(line) == 1: 36 | # print("line:" + str(line[0])) 37 | ips.append(line[0]) 38 | 39 | print("ips.length:" + str(len(ips))) 40 | 41 | 42 | class MyRetryMiddleware(RetryMiddleware): 43 | logger = 
logging.getLogger(__name__) 44 | 45 | def __init__(self, settings): 46 | 47 | if not settings.getbool('RETRY_ENABLED'): 48 | raise NotConfigured 49 | input_ips() 50 | self.max_retry_times = settings.getint('RETRY_TIMES') 51 | self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES')) 52 | self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST') 53 | 54 | def delete_proxy(self, proxy): 55 | if proxy: 56 | print("proxy,before ip:" + proxy) 57 | proxy = proxy[7:] 58 | print("proxy,after ip:" + proxy) 59 | ips.remove(proxy) 60 | print("proxy delete ip length:" + str(len(ips))) 61 | 62 | def process_response(self, request, response, spider): 63 | if request.meta.get('dont_retry', False): 64 | return response 65 | logging.info("response.status:" + str(response.status)) 66 | if response.status in self.retry_http_codes: 67 | reason = response_status_message(response.status) 68 | self.delete_proxy(request.meta.get('proxy', False)) 69 | time.sleep(random.randint(10, 12)) 70 | self.logger.warning('返回值异常, 进行重试...') 71 | return self._retry(request, reason, spider) or response 72 | return response 73 | 74 | def process_exception(self, request, exception, spider): 75 | if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \ 76 | and not request.meta.get('dont_retry', False): 77 | self.delete_proxy(request.meta.get('proxy', False)) 78 | time.sleep(random.randint(3, 5)) 79 | self.logger.warning('连接异常, 进行重试...') 80 | 81 | return self._retry(request, exception, spider) 82 | 83 | def spider_opened(self, spider): 84 | spider.logger.info('start Spider opened: %s' % spider.name) 85 | 86 | 87 | 88 | class MyUserAgentMiddleware(UserAgentMiddleware): 89 | def __init__(self, user_agent=''): 90 | self.user_agent = user_agent 91 | 92 | def process_request(self, request, spider): 93 | thisua = random.choice(USER_AGENT_POOL) 94 | request.headers.setdefault('User-Agent', thisua) 95 | 96 | 97 | class MyproxiesSpiderMiddleware(HttpProxyMiddleware): 98 | 99 | def __init__(self, ip=''): 100 | self.ip = ip 101 | 102 | def process_request(self, request, spider): 103 | thisip = random.choice(ips) 104 | print("this is ip:" + thisip) 105 | request.meta["proxy"] = "http://" + thisip 106 | 107 | 108 | class LagoucrawlSpiderMiddleware(object): 109 | # Not all methods need to be defined. If a method is not defined, 110 | # scrapy acts as if the spider middleware does not modify the 111 | # passed objects. 112 | 113 | @classmethod 114 | def from_crawler(cls, crawler): 115 | # This method is used by Scrapy to create your spiders. 116 | s = cls() 117 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 118 | return s 119 | 120 | def process_spider_input(self, response, spider): 121 | # Called for each response that goes through the spider 122 | # middleware and into the spider. 123 | 124 | # Should return None or raise an exception. 125 | return None 126 | 127 | def process_spider_output(self, response, result, spider): 128 | # Called with the results returned from the Spider, after 129 | # it has processed the response. 130 | 131 | # Must return an iterable of Request, dict or Item objects. 132 | for i in result: 133 | yield i 134 | 135 | def process_spider_exception(self, response, exception, spider): 136 | # Called when a spider or process_spider_input() method 137 | # (from other spider middleware) raises an exception. 138 | 139 | # Should return either None or an iterable of Response, dict 140 | # or Item objects. 
141 | pass 142 | 143 | def process_start_requests(self, start_requests, spider): 144 | # Called with the start requests of the spider, and works 145 | # similarly to the process_spider_output() method, except 146 | # that it doesn’t have a response associated. 147 | 148 | # Must return only requests (not items). 149 | for r in start_requests: 150 | yield r 151 | 152 | def spider_opened(self, spider): 153 | spider.logger.info('Spider opened: %s' % spider.name) 154 | 155 | 156 | class LagoucrawlDownloaderMiddleware(object): 157 | # Not all methods need to be defined. If a method is not defined, 158 | # scrapy acts as if the downloader middleware does not modify the 159 | # passed objects. 160 | 161 | @classmethod 162 | def from_crawler(cls, crawler): 163 | # This method is used by Scrapy to create your spiders. 164 | s = cls() 165 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 166 | return s 167 | 168 | def process_request(self, request, spider): 169 | # Called for each request that goes through the downloader 170 | # middleware. 171 | 172 | # Must either: 173 | # - return None: continue processing this request 174 | # - or return a Response object 175 | # - or return a Request object 176 | # - or raise IgnoreRequest: process_exception() methods of 177 | # installed downloader middleware will be called 178 | return None 179 | 180 | def process_response(self, request, response, spider): 181 | # Called with the response returned from the downloader. 182 | 183 | # Must either; 184 | # - return a Response object 185 | # - return a Request object 186 | # - or raise IgnoreRequest 187 | return response 188 | 189 | def process_exception(self, request, exception, spider): 190 | # Called when a download handler or a process_request() 191 | # (from other downloader middleware) raises an exception. 192 | 193 | # Must either: 194 | # - return None: continue processing this exception 195 | # - return a Response object: stops process_exception() chain 196 | # - return a Request object: stops process_exception() chain 197 | pass 198 | 199 | def spider_opened(self, spider): 200 | spider.logger.info('Spider opened: %s' % spider.name) 201 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------