├── .gitignore ├── LICENSE ├── README.md ├── job.sql ├── run.bat ├── scrapy.cfg └── www_job_com ├── __init__.py ├── commands ├── __init__.py └── crawlall.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders ├── __init__.py ├── chinahr_spider.py ├── dajie_spider.py ├── ganji_spider.py ├── job51_spider.py ├── job58_spider.py ├── lagou_spider.py ├── neitui_spider.py ├── zhaopin_spider.py └── zhipin_spider.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # www_job_com -------------------------------------------------------------------------------- /job.sql: -------------------------------------------------------------------------------- 1 | SET FOREIGN_KEY_CHECKS=0; 2 | 3 | -- ---------------------------- 4 | -- Table structure for jobs 5 | -- ---------------------------- 6 | DROP TABLE IF EXISTS `jobs`; 7 | CREATE TABLE `jobs` ( 8 | `id` int(11) NOT NULL AUTO_INCREMENT, 9 | `position_id` varchar(255) DEFAULT '', 10 | `position_name` varchar(255) DEFAULT '', 11 | `position_lables` varchar(255) DEFAULT '', 12 | `work_year` varchar(255) DEFAULT '', 13 | `salary` varchar(255) DEFAULT NULL, 14 | `city` varchar(255) DEFAULT '', 15 | `education` varchar(255) DEFAULT '', 16 | `company_name` varchar(255) DEFAULT '', 17 | `industry_field` varchar(255) DEFAULT '', 18 | `finance_stage` varchar(255) DEFAULT '', 19 | `company_size` varchar(255) DEFAULT '', 20 | `updated_at` varchar(255) DEFAULT '', 21 | `time` varchar(255) DEFAULT '', 22 | `platform` varchar(255) DEFAULT '', 23 | `avg_salary` float(6,3) DEFAULT '0.000', 24 | PRIMARY KEY (`id`) 25 | ) ENGINE=MyISAM AUTO_INCREMENT=9623 DEFAULT CHARSET=utf8; 26 | -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | scrapy crawlall --nolog -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = www_job_com.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = www_job_com 12 | -------------------------------------------------------------------------------- /www_job_com/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chjw8016/www_job_com/20d20139e6f65d0b2b06cf56e49a5243a489d161/www_job_com/__init__.py -------------------------------------------------------------------------------- /www_job_com/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chjw8016/www_job_com/20d20139e6f65d0b2b06cf56e49a5243a489d161/www_job_com/commands/__init__.py -------------------------------------------------------------------------------- /www_job_com/commands/crawlall.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | 4 | class Command(ScrapyCommand): 5 | requires_project = True 6 | 7 | def syntax(self): 8 | return '[options]' 9 | 10 | def short_desc(self): 11 | return 'Runs all of the spiders' 12 | 13 | def run(self, args, opts): 14 | spider_list = self.crawler_process.spiders.list() 15 | for name in spider_list: 16 | print("*********"+name+"************") 17 | self.crawler_process.crawl(name, **opts.__dict__) 18 | self.crawler_process.start() 19 | -------------------------------------------------------------------------------- /www_job_com/items.py: -------------------------------------------------------------------------------- 1 | # 
-*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WwwJobComItem(scrapy.Item): 12 | position_id = scrapy.Field() 13 | position_name = scrapy.Field() 14 | position_lables = scrapy.Field() 15 | work_year = scrapy.Field() 16 | salary = scrapy.Field() 17 | avg_salary = scrapy.Field() 18 | city = scrapy.Field() 19 | education = scrapy.Field() 20 | company_name = scrapy.Field() 21 | industry_field = scrapy.Field() 22 | finance_stage = scrapy.Field() 23 | company_size = scrapy.Field() 24 | time = scrapy.Field() 25 | updated_at = scrapy.Field() 26 | platform = scrapy.Field() -------------------------------------------------------------------------------- /www_job_com/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class WwwJobComSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class WwwJobComDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /www_job_com/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymysql 8 | from twisted.enterprise import adbapi 9 | 10 | 11 | class WwwJobComPipeline(object): 12 | @classmethod 13 | def from_settings(cls, settings): 14 | dbparams = dict( 15 | host=settings['MYSQL_HOST'], 16 | db=settings['MYSQL_DBNAME'], 17 | user=settings['MYSQL_USER'], 18 | passwd=settings['MYSQL_PASSWD'], 19 | charset='utf8', 20 | cursorclass=pymysql.cursors.DictCursor, 21 | use_unicode=False, 22 | ) 23 | dbpool = adbapi.ConnectionPool('pymysql', **dbparams) 24 | return cls(dbpool) 25 | 26 | def __init__(self, dbpool): 27 | self.dbpool = dbpool 28 | 29 | def process_item(self, item, spider): 30 | query = self.dbpool.runInteraction(self._conditional_insert, item) 31 | query.addErrback(self._handle_error, item, spider) 32 | return item 33 | 34 | def _conditional_insert(self, tx, item): 35 | # print item['name'] 36 | sql = "select * from jobs where position_id=%s and platform=%s" 37 | position_id = (item["position_id"], item["platform"]) 38 | result = tx.execute(sql, position_id) 39 | if (result == 0): 40 | sql = "insert into jobs(position_id,position_name,position_lables,work_year,salary,city,education,company_name,industry_field,finance_stage,company_size,updated_at,`time`,platform,avg_salary) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 41 | params = ( 42 | item["position_id"], item["position_name"], item["position_lables"], item["work_year"], item["salary"], 43 | item["city"], item["education"], item["company_name"], item["industry_field"], 44 | item["finance_stage"], item["company_size"], item["updated_at"], item["time"], 45 | item["platform"], item["avg_salary"]) 46 | tx.execute(sql, params) 47 | 48 | def _handle_error(self, failue, item, spider): 49 | print(item) 50 | print(failue) 51 | -------------------------------------------------------------------------------- /www_job_com/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for www_job_com project 4 | # 5 | # For simplicity, this file contains 
only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'www_job_com' 13 | 14 | SPIDER_MODULES = ['www_job_com.spiders'] 15 | NEWSPIDER_MODULE = 'www_job_com.spiders' 16 | COMMANDS_MODULE = 'www_job_com.commands' 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | # USER_AGENT = 'www_job_com (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | # CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | # DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | # CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | # COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | # TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | # DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | # } 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | # SPIDER_MIDDLEWARES = { 50 | # 'www_job_com.middlewares.WwwJobComSpiderMiddleware': 543, 51 | # } 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | # DOWNLOADER_MIDDLEWARES = { 56 | # 'www_job_com.middlewares.WwwJobComDownloaderMiddleware': 543, 57 | # } 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | # EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | # } 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'www_job_com.pipelines.WwwJobComPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | # AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | # AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | # AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | # AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | # HTTPCACHE_ENABLED = True 87 | # HTTPCACHE_EXPIRATION_SECS = 0 88 | # HTTPCACHE_DIR = 'httpcache' 89 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | # Mysql数据库的配置信息 93 | MYSQL_HOST = 'dev.gammainfo.com' 94 | MYSQL_DBNAME = 'job' # 数据库名字 95 
| MYSQL_USER = 'root' # 数据库账号 96 | MYSQL_PASSWD = 'Gamma0903' # 数据库密码 97 | 98 | MYSQL_PORT = 3306 # 数据库端口 99 | -------------------------------------------------------------------------------- /www_job_com/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /www_job_com/spiders/chinahr_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import WwwJobComItem 5 | 6 | 7 | class ZhipinSpider(scrapy.Spider): 8 | name = 'chinahr' 9 | allowed_domains = ['www.chinahr.com'] 10 | start_urls = ['http://www.chinahr.com/'] 11 | positionUrl = '' 12 | curPage = 0 13 | headers = {} 14 | 15 | def start_requests(self): 16 | return [self.next_request()] 17 | 18 | def parse(self, response): 19 | print("request -> " + response.url) 20 | job_list = response.css('div.jobList > ul') 21 | if (len(job_list) > 0): 22 | print("chinahr Nums:" + str(len(job_list))) 23 | for job in job_list: 24 | item = WwwJobComItem() 25 | item['position_id'] = job.css('li.l1 > span.e1 > a::attr(href)').extract_first().strip().replace( 26 | ".html?searchplace=22,247", "").replace("http://www.chinahr.com/job/", "") 27 | item["position_name"] = job.css('li.l1 > span.e1 > a::text').extract_first().strip() 28 | salary = job.css('li.l2 > span.e2::text').extract_first().strip().split("-") 29 | item["salary"] = str(int(int(salary[0]) / 1000)) + "K-" + str(int(int(salary[1]) / 1000)) + "K" 30 | item["avg_salary"] = (int(salary[0]) + int(salary[1])) / 2000 31 | info_primary = job.css('li.l2 > span.e1::text').extract_first().strip().split("/") 32 | item['city'] = "河南/郑州" 33 | item['work_year'] = info_primary[2].replace("]\r\n\t\t\t\t\t\t\t", "") 34 | item['education'] = info_primary[3] 35 | item['company_name'] = job.css('li.l1 > span.e3 > a::text').extract_first().strip() 36 | 37 | item['industry_field'] = "" 38 | item['finance_stage'] = "" 39 | item['company_size'] = "" 40 | 41 | item['position_lables'] = "" 42 | item['time'] = job.css('li.l1 > span.e2::text').extract_first().strip() 43 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 44 | item['platform'] = "chinahr" 45 | yield item 46 | yield self.next_request() 47 | 48 | # 发送请求 49 | def next_request(self): 50 | self.curPage += 1 51 | self.positionUrl = "http://www.chinahr.com/sou/?orderField=relate&keyword=php&city=22,247&page=" + str( 52 | self.curPage) 53 | print("chinahr page:" + str(self.curPage)) 54 | time.sleep(10) 55 | return scrapy.http.FormRequest( 56 | self.positionUrl, 57 | headers=self.headers, 58 | callback=self.parse) 59 | -------------------------------------------------------------------------------- /www_job_com/spiders/dajie_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | import json 5 | from www_job_com.items import WwwJobComItem 6 | import urllib.parse 7 | import http.cookiejar 8 | import urllib.request 9 | import re 10 | 11 | 12 | class DajieSpider(scrapy.Spider): 13 | name = 'dajie' 14 | allowed_domains = ['so.dajie.com'] 15 | start_urls = ['https://so.dajie.com/'] 16 | 17 | curPage = 1 18 | city_id = 
"410100" 19 | city_name = "%E9%83%91%E5%B7%9E" 20 | job_name = "php" 21 | cookie = ""; 22 | url = 'https://so.dajie.com/job/ajax/search/filter?keyword=php&order=0&city=410100&recruitType=&salary=&experience=&page=1&positionFunction=&_CSRFToken=&ajax=1' 23 | headers = { 24 | "accept": "application/json, text/javascript, */*; q=0.01", 25 | "accept-encoding": "gzip, deflate, br", 26 | "content-type": "text/html;charset=UTF-8", 27 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36", 28 | "referer": "https://so.dajie.com/job/search?cityId=410100&cname=%E9%83%91%E5%B7%9E&from=job" 29 | } 30 | formData = {} 31 | 32 | def start_requests(self): 33 | cookie = http.cookiejar.CookieJar() 34 | opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie)) 35 | session = opener.open( 36 | "https://so.dajie.com/job/search?cityId=" + self.city_id + "&cname=" + self.city_name + "&from=job") 37 | session_cookie = re.findall(r"SO_COOKIE_V2=.+?;", str(session.info()))[0].split("=")[1] 38 | self.cookie = session_cookie.strip(";") 39 | return [self.next_request()] 40 | 41 | def parse(self, response): 42 | print("request -> " + response.url) 43 | try: 44 | html = json.loads(response.body.decode("utf-8")) 45 | except ValueError: 46 | print(response.body) 47 | yield self.next_request() 48 | 49 | if (html.get("result") == 0): 50 | print("dajie Num:" + str(html.get('data').get('total'))) 51 | results = html.get('data').get('list') 52 | if len(results) > 0: 53 | for result in results: 54 | item = WwwJobComItem() 55 | item['salary'] = result.get('salary').replace(" ", "").replace("/月", "") 56 | if (item["salary"].find("-") > -1): 57 | salary = item["salary"].split("-") 58 | item["avg_salary"] = (int(salary[0].replace("K", "")) + int(salary[1].replace("K", ""))) / 2 59 | else: 60 | item["avg_salary"] = item["salary"].replace("K", "") 61 | item['city'] = result.get('pubCity') 62 | item['finance_stage'] = "" 63 | item['industry_field'] = result.get('industryName') 64 | item['position_lables'] = "" 65 | item['position_id'] = result.get('jobseq') 66 | item['company_size'] = result.get('scaleName') 67 | item['position_name'] = result.get('jobName') 68 | item['work_year'] = result.get('pubEx') 69 | item['education'] = result.get('pubEdu') 70 | item['company_name'] = result.get('compName') 71 | item['time'] = result.get("time") 72 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 73 | item['platform'] = "dajie" 74 | yield item 75 | totalPage = html.get('data').get("totalPage") 76 | self.curPage = self.curPage + 1 77 | if (self.curPage <= totalPage): 78 | self.url = 'https://so.dajie.com/job/ajax/search/filter?keyword=' + self.job_name + '&order=0&city=' + self.city_id + '&recruitType=&salary=&experience=&page=' + str( 79 | self.curPage) + '&positionFunction=&_CSRFToken=&ajax=1' 80 | yield self.next_request() 81 | else: 82 | time.sleep(10) 83 | yield self.next_request() 84 | 85 | def next_request(self): 86 | print("dajie page:" + str(self.curPage)) 87 | return scrapy.http.FormRequest(url=self.url, cookies={"SO_COOKIE_V2": self.cookie}, 88 | formdata=self.formData, headers=self.headers, method="GET") 89 | -------------------------------------------------------------------------------- /www_job_com/spiders/ganji_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import 
WwwJobComItem 5 | import math 6 | 7 | 8 | class GanjiSpider(scrapy.Spider): 9 | name = 'ganji' 10 | allowed_domains = ['zz.ganji.com'] 11 | start_urls = ['http://zz.ganji.com/'] 12 | positionUrl = '' 13 | curPage = 0 14 | headers = {} 15 | 16 | def start_requests(self): 17 | return [self.next_request()] 18 | 19 | def parse(self, response): 20 | print("request -> " + response.url) 21 | job_list = response.css('div.job-parttime > dl') 22 | if (len(job_list) > 0): 23 | print("ganji Nums:" + str(len(job_list))) 24 | for job in job_list: 25 | item = WwwJobComItem() 26 | item['position_id'] = job.css('dt > div > input::attr(value)').extract_first().strip().split(",")[0] 27 | item["position_name"] = "php开发工程师" 28 | salary = job.css('em.unit::text').extract_first().strip() 29 | if (salary == "面议"): 30 | item["salary"] = "面议" 31 | item["avg_salary"] = 0 32 | else: 33 | salary = job.css('dt > div > p > em.lipay > i > strong::text').extract_first().strip().split("-") 34 | item["salary"] = str(math.ceil(int(salary[0]) / 1000)) + "K-" + str( 35 | math.ceil(int(salary[1]) / 1000)) + "K" 36 | item["avg_salary"] = (int(salary[0]) + int(salary[1])) / 2000 37 | item['city'] = job.css('dt > div > p.site > a::text').extract_first().strip().replace("地址:", "") 38 | item['work_year'] = job.css('dt > div > p > em.liexp::text').extract_first().strip().replace("经验:", 39 | "") 40 | item['education'] = "" 41 | item['company_name'] = job.css('div.j-comp > a::text').extract_first().strip() 42 | item['industry_field'] = "" 43 | item['finance_stage'] = "" 44 | item['company_size'] = "" 45 | item['position_lables'] = "" 46 | item['time'] = job.css('p.time::text').extract_first().strip() 47 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 48 | item['platform'] = "ganji" 49 | yield item 50 | yield self.next_request() 51 | 52 | # 发送请求 53 | def next_request(self): 54 | self.curPage += 1 55 | num = (self.curPage - 1) * 32 56 | self.positionUrl = 'http://zz.ganji.com/zhaopin/s/f' + str(num) + '/_php/' 57 | print("ganji page:" + str(self.curPage)) 58 | time.sleep(10) 59 | return scrapy.http.FormRequest( 60 | self.positionUrl, 61 | headers=self.headers, 62 | callback=self.parse) 63 | -------------------------------------------------------------------------------- /www_job_com/spiders/job51_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import WwwJobComItem 5 | 6 | 7 | class Job51Spider(scrapy.Spider): 8 | name = 'job51' 9 | allowed_domains = ['search.51job.com'] 10 | start_urls = ['http://search.51job.com/'] 11 | positionUrl = '' 12 | curPage = 0 13 | headers = {} 14 | 15 | def start_requests(self): 16 | return [self.next_request()] 17 | 18 | def parse(self, response): 19 | print("request -> " + response.url) 20 | job_list = response.css('div.dw_table > div.el') 21 | if (len(job_list) > 1): 22 | print("51job Nums:" + str(len(job_list))) 23 | for job in job_list: 24 | item = WwwJobComItem() 25 | str_time = job.css('span.t5::text').extract_first().strip() 26 | if (str_time == "发布时间"): 27 | continue 28 | else: 29 | item['position_id'] = job.css('p.t1 > input::attr(value)').extract_first().strip() 30 | item["position_name"] = job.css('p.t1 > span > a::text').extract_first().strip() 31 | salary = job.css('span.t4::text').extract_first().strip() 32 | if (salary.find("万/月") > -1): 33 | salary = salary.replace("万/月", "").split("-") 34 | item["salary"] = str(float(salary[0]) * 10) 
+ "K-" + str(float(salary[1]) * 10) + "K" 35 | item["avg_salary"] = (float(salary[0]) * 10 + float(salary[1]) * 10) / 2 36 | elif (salary.find("万/年") > -1): 37 | salary = salary.replace("万/年", "").split("-") 38 | item["salary"] = str(float(salary[0]) / 12) + "K-" + str(float(salary[1]) / 12) + "K" 39 | item["avg_salary"] = (float(salary[0]) / 12 + float(salary[1]) / 12) / 2 40 | elif (salary.find("元/天") > -1): 41 | continue 42 | else: 43 | salary = salary.replace("千/月", "").split("-") 44 | item["salary"] = salary[0] + "K-" + salary[1] + "K" 45 | item["avg_salary"] = (float(salary[0]) + float(salary[1])) / 2 46 | item['city'] = job.css('span.t3::text').extract_first().strip() 47 | item['work_year'] = "" 48 | item['education'] = "" 49 | item['company_name'] = job.css('span.t2 > a::text').extract_first().strip() 50 | 51 | item['industry_field'] = "" 52 | item['finance_stage'] = "" 53 | item['company_size'] = "" 54 | item['position_lables'] = "" 55 | item['time'] = str_time 56 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 57 | item['platform'] = "51job" 58 | yield item 59 | yield self.next_request() 60 | 61 | # 发送请求 62 | def next_request(self): 63 | self.curPage += 1 64 | self.positionUrl = "http://search.51job.com/list/170200,000000,0000,00,9,99,php,2," + str( 65 | self.curPage) + ".html" 66 | print("51job page:" + str(self.curPage)) 67 | time.sleep(10) 68 | return scrapy.http.FormRequest(self.positionUrl, 69 | headers=self.headers, 70 | callback=self.parse) 71 | -------------------------------------------------------------------------------- /www_job_com/spiders/job58_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import WwwJobComItem 5 | import math 6 | 7 | 8 | class Job58Spider(scrapy.Spider): 9 | name = 'job58' 10 | allowed_domains = ['zz.58.com'] 11 | start_urls = ['http://zz.58.com/'] 12 | positionUrl = 'http://zz.58.com/job/?key=php&final=1&jump=1' 13 | curPage = 0 14 | headers = {} 15 | 16 | def start_requests(self): 17 | return [self.next_request()] 18 | 19 | def parse(self, response): 20 | print("request -> " + response.url) 21 | job_list = response.css('li.job_item') 22 | if (len(job_list) > 0): 23 | print("job58 Nums:" + str(len(job_list))) 24 | for job in job_list: 25 | item = WwwJobComItem() 26 | item['time'] = job.css('span.sign::text').extract_first().strip() 27 | if (item['time'] == "优选" or item['time'] == "精准"): 28 | continue 29 | else: 30 | item['position_id'] = job.css('div.job_name > a::attr(urlparams)').extract_first().strip().replace( 31 | "psid=", "").replace("&entinfo=", "").replace("_p", "").replace("_j", "") 32 | item[ 33 | "position_name"] = job.css('div.job_comp > p.job_require >span::text').extract()[ 34 | 0].strip() 35 | salary = job.css('p.job_salary::text').extract_first().strip() 36 | if (salary == "面议"): 37 | new_salary = salary 38 | item["avg_salary"] = 0 39 | elif (salary == "1000"): 40 | new_salary = "1K" 41 | item["avg_salary"] = 1.0 42 | else: 43 | salary = salary.split("-") 44 | new_salary = str(math.ceil(int(salary[0]) / 1000)) + "K-" + str( 45 | math.ceil(int(salary[1]) / 1000)) + "K" 46 | item["avg_salary"] = (int(salary[0]) + int(salary[1])) / 2000 47 | item["salary"] = new_salary 48 | item['city'] = "郑州" 49 | item['work_year'] = job.css("div.job_comp > p.job_require > span::text").extract()[2].strip() 50 | item['education'] = job.css("div.job_comp > p.job_require > 
span::text").extract()[1].strip() 51 | item['company_name'] = job.css('div.comp_name > a::text').extract_first().strip() 52 | 53 | item['industry_field'] = "" 54 | item['finance_stage'] = "" 55 | item['company_size'] = "" 56 | label = job.css("div.job_wel > span::text").extract() 57 | item['position_lables'] = ",".join(label) 58 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 59 | item['platform'] = "job58" 60 | yield item 61 | yield self.next_request() 62 | 63 | # 发送请求 64 | def next_request(self): 65 | self.curPage += 1 66 | if (self.curPage > 1): 67 | self.positionUrl = "http://zz.58.com/job/pn" + str(self.curPage) + "/?key=php&final=1&jump=1" 68 | print("job58 page:" + str(self.curPage)) 69 | time.sleep(10) 70 | return scrapy.http.FormRequest( 71 | self.positionUrl, 72 | headers=self.headers, 73 | callback=self.parse) 74 | -------------------------------------------------------------------------------- /www_job_com/spiders/lagou_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | import json 5 | from www_job_com.items import WwwJobComItem 6 | import math 7 | 8 | 9 | class LagouSpider(scrapy.Spider): 10 | name = 'lagou' 11 | allowed_domains = ['www.lagou.com'] 12 | start_urls = ['https://www.lagou.com/'] 13 | 14 | curPage = 1 15 | city_name = "郑州" 16 | job_name = "PHP" 17 | url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=郑州&needAddtionalResult=false' 18 | headers = { 19 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 20 | 'Accept-Encoding': 'gzip, deflate', 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', 22 | "Referer": "https://www.lagou.com/jobs/list_php?cl=false&fromSearch=true&labelWords=&suginput=&city=郑州"} 23 | 24 | def start_requests(self): 25 | return [self.next_request()] 26 | 27 | def parse(self, response): 28 | print("request -> " + response.url) 29 | try: 30 | html = json.loads(response.body) 31 | except ValueError: 32 | print(response.body) 33 | yield self.next_request() 34 | 35 | if (html.get("success")): 36 | if html.get('content').get('positionResult').get('resultSize') != 0: 37 | results = html.get('content').get('positionResult').get('result') 38 | print('lagou Nums:' + str(len(results))) 39 | for result in results: 40 | item = WwwJobComItem() 41 | item['salary'] = result.get('salary').replace("k", "K") 42 | salary = item["salary"].split("-") 43 | item["avg_salary"] = (int(salary[0].replace("K", "")) + int(salary[1].replace("K", ""))) / 2 44 | item['city'] = result.get('city') 45 | item['finance_stage'] = result.get('financeStage') 46 | item['industry_field'] = result.get('industryField') 47 | item['position_lables'] = result.get('positionAdvantage') 48 | item['position_id'] = result.get('positionId') 49 | item['company_size'] = result.get('companySize') 50 | item['position_name'] = result.get('positionName') 51 | item['work_year'] = result.get('workYear') 52 | item['education'] = result.get('education') 53 | item['company_name'] = result.get('companyShortName') 54 | item['time'] = result.get("formatCreateTime") 55 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 56 | item['platform'] = "lagou" 57 | yield item 58 | totalPage = math.floor(int(html.get('content').get('positionResult').get("totalCount")) / int( 59 | html.get('content').get("pageSize"))) 60 | self.curPage = self.curPage + 1 
61 | if (self.curPage <= totalPage): 62 | yield self.next_request() 63 | else: 64 | time.sleep(60) 65 | yield self.next_request() 66 | 67 | def next_request(self): 68 | print("lagou page:" + str(self.curPage)) 69 | return scrapy.FormRequest(url=self.url, formdata={'pn': str(self.curPage), 'kd': self.job_name}, 70 | method='POST', 71 | headers=self.headers, meta={'page': self.curPage, 'kd': self.job_name}, 72 | dont_filter=True) 73 | -------------------------------------------------------------------------------- /www_job_com/spiders/neitui_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import WwwJobComItem 5 | 6 | 7 | class NeituiSpider(scrapy.Spider): 8 | name = 'neitui' 9 | allowed_domains = ['www.neitui.me'] 10 | start_urls = ['http://www.neitui.me'] 11 | positionUrl = 'http://www.neitui.me/?name=job&handle=lists' 12 | curPage = 0 13 | headers = { 14 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36" 15 | } 16 | 17 | def start_requests(self): 18 | return [self.next_request()] 19 | 20 | def parse(self, response): 21 | print("request -> " + response.url) 22 | job_list = response.css('ul.list-items > li') 23 | if (len(job_list) > 0): 24 | print("neitui Nums:" + str(len(job_list))) 25 | for job in job_list: 26 | item = WwwJobComItem() 27 | job_primary = job.css('div.positionleft > div') 28 | item['position_id'] = job_primary[0].css('a::attr(href)').extract_first().strip().replace("/j/", "") 29 | item["position_name"] = job_primary[0].css('a::text').extract_first().strip() 30 | item['time'] = job_primary[0].css('span::text').extract_first().strip() 31 | item["salary"] = job_primary[1].css('span.mr10::text').extract_first().strip().replace("k", "K") 32 | salary = item["salary"].split("-") 33 | item["avg_salary"] = (int(salary[0].replace("K", "")) + int(salary[1].replace("K", ""))) / 2 34 | info_primary = job_primary[1].css('span::text').extract() 35 | item['city'] = info_primary[5].strip() 36 | item['work_year'] = info_primary[1].strip() 37 | item['education'] = info_primary[3].strip() 38 | item['company_name'] = job_primary[2].css('span >a::text').extract_first().strip() 39 | item['finance_stage'] = job_primary[2].css('span::text').extract()[1].strip() 40 | item['industry_field'] = "" 41 | item['company_size'] = "" 42 | item['position_lables'] = "" 43 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 44 | item['platform'] = "neitui" 45 | yield item 46 | yield self.next_request() 47 | 48 | # 发送请求 49 | def next_request(self): 50 | self.curPage += 1 51 | self.positionUrl += "&keyword=PHP&city=%E9%83%91%E5%B7%9E&page=" + str(self.curPage) 52 | print("neitui page:" + str(self.curPage)) 53 | time.sleep(0) 54 | return scrapy.http.FormRequest( 55 | self.positionUrl, 56 | headers=self.headers, 57 | callback=self.parse) 58 | -------------------------------------------------------------------------------- /www_job_com/spiders/zhaopin_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import WwwJobComItem 5 | 6 | 7 | class ZhaopinSpider(scrapy.Spider): 8 | name = 'zhaopin' 9 | allowed_domains = ['sou.zhaopin.com'] 10 | start_urls = ['http://sou.zhaopin.com/'] 11 | positionUrl = '' 12 | curPage = 0 13 | headers = {} 14 | 15 | def 
start_requests(self): 16 | return [self.next_request()] 17 | 18 | def parse(self, response): 19 | print("request -> " + response.url) 20 | job_list = response.css('table.newlist > tr') 21 | if (len(job_list) > 1): 22 | print("zhaopin Nums:" + str(len(job_list))) 23 | i = 0 24 | for job in job_list: 25 | i += 1 26 | if (i > 1 and (i % 2) == 0): 27 | item = WwwJobComItem() 28 | item['position_id'] = job.css('td.zwmc > input::attr(data-monitor)').extract_first().strip().replace("|", "") 29 | name = job.css('td.zwmc > div > a').extract_first().strip() 30 | if (name.find("php") > -1 or name.find("Php") > -1 or name.find("PHP") > -1): 31 | item["position_name"] = "php研发工程师" 32 | salary = job.css('td.zwyx::text').extract_first().strip().split("-") 33 | item["salary"] = str(int(int(salary[0]) / 1000)) + "K-" + str(int(int(salary[1]) / 1000)) + "K" # both bounds are monthly CNY, so divide by 1000 to get "K" 34 | item["avg_salary"] = (int(salary[0]) + int(salary[1])) / 2000 35 | item['city'] = "郑州" 36 | item['work_year'] = "" 37 | item['education'] = "" 38 | item['company_name'] = job.css('td.gsmc > a::text').extract_first().strip() 39 | item['industry_field'] = "" 40 | item['finance_stage'] = "" 41 | item['company_size'] = "" 42 | item['position_lables'] = "" 43 | item['time'] = job.css('td.gxsj > span::text').extract_first().strip() 44 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 45 | item['platform'] = "zhaopin" 46 | yield item 47 | yield self.next_request() 48 | 49 | # Build the request for the next results page 50 | def next_request(self): 51 | self.curPage += 1 52 | if (self.curPage <= 10): 53 | self.positionUrl = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%83%91%E5%B7%9E&kw=php&sm=0&fl=719&isadv=0&sb=1&isfilter=1&et=2&p=" + str( 54 | self.curPage) 55 | print("zhaopin page:" + str(self.curPage)) 56 | time.sleep(10) 57 | return scrapy.http.FormRequest(self.positionUrl, 58 | headers=self.headers, 59 | callback=self.parse) -------------------------------------------------------------------------------- /www_job_com/spiders/zhipin_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import WwwJobComItem 5 | 6 | 7 | class ZhipinSpider(scrapy.Spider): 8 | name = 'zhipin' 9 | allowed_domains = ['www.zhipin.com'] 10 | start_urls = ['https://www.zhipin.com/'] 11 | positionUrl = 'https://www.zhipin.com/c101180100-p100103/h_101180100/?query=' 12 | curPage = 0 13 | headers = {} 14 | 15 | def start_requests(self): 16 | return [self.next_request()] 17 | 18 | def parse(self, response): 19 | print("request -> " + response.url) 20 | job_list = response.css('div.job-list > ul > li') 21 | if (len(job_list) > 0): 22 | print("zhipin Nums:" + str(len(job_list))) 23 | for job in job_list: 24 | item = WwwJobComItem() 25 | job_primary = job.css('div.job-primary') 26 | item['position_id'] = job.css('div.info-primary > h3 > a::attr(data-jobid)').extract_first().strip() 27 | item["position_name"] = job_primary.css('div.info-primary > h3 > a > div::text').extract_first().strip() 28 | item["salary"] = job_primary.css('div.info-primary > h3 > a > span::text').extract_first().strip() 29 | salary = item["salary"].split("-") 30 | item["avg_salary"] = (int(salary[0].replace("K", "")) + int(salary[1].replace("K", ""))) / 2 31 | info_primary = job_primary.css('div.info-primary > p::text').extract() 32 | item['city'] = info_primary[0].strip() 33 | item['work_year'] = info_primary[1].strip() 34 | item['education'] = info_primary[2].strip() 35 |
item['company_name'] = job_primary.css( 36 | 'div.info-company > div.company-text > h3 > a::text').extract_first().strip() 37 | company_infos = job_primary.css('div.info-company > div.company-text > p::text').extract() 38 | if len(company_infos) == 3: 39 | item['industry_field'] = company_infos[0].strip() 40 | item['finance_stage'] = company_infos[1].strip() 41 | item['company_size'] = company_infos[2].strip() 42 | else: 43 | item['industry_field'] = company_infos[0].strip() 44 | item['finance_stage'] = "" 45 | item['company_size'] = company_infos[1].strip() 46 | 47 | item['position_lables'] = "" # job_primary.css('div.info-detail > div.tags > span::text').extract() 48 | item['time'] = job.css('div.info-publis > p::text').extract_first().strip() 49 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 50 | item['platform'] = "zhipin" 51 | yield item 52 | yield self.next_request() 53 | 54 | # Build the request for the next results page 55 | def next_request(self): 56 | self.curPage += 1 57 | print("zhipin page:" + str(self.curPage)) 58 | time.sleep(10) 59 | return scrapy.http.FormRequest( 60 | self.positionUrl + ("&page=%d&ka=page-%d" % 61 | (self.curPage, self.curPage)), 62 | headers=self.headers, 63 | callback=self.parse) 64 | --------------------------------------------------------------------------------
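Running the project: run.bat drives everything through the custom crawlall command defined in www_job_com/commands/crawlall.py, which schedules every registered spider on the shared crawler process and starts the reactor once. For debugging outside the Scrapy CLI, roughly the same behavior can be reproduced with a standalone script along the lines of the sketch below; the run_all.py file name is illustrative, and it assumes the script is executed from the project root next to scrapy.cfg so that get_project_settings() picks up www_job_com.settings.

# run_all.py - minimal sketch, not part of the repository
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():
    # Load the project settings (item pipeline, MySQL credentials, etc.)
    process = CrawlerProcess(get_project_settings())
    # Schedule every spider found under www_job_com.spiders, then start the reactor once
    for name in process.spider_loader.list():
        print("*********" + name + "************")
        process.crawl(name)
    process.start()


if __name__ == "__main__":
    main()

Like crawlall, this runs all site spiders concurrently in a single process, so the jobs table receives rows from every platform in one pass.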