├── .gitignore ├── LICENSE ├── README.md ├── job.sql ├── run.bat ├── scrapy.cfg └── www_job_com ├── __init__.py ├── commands ├── __init__.py └── crawlall.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders ├── __init__.py ├── chinahr_spider.py ├── dajie_spider.py ├── ganji_spider.py ├── job51_spider.py ├── job58_spider.py ├── lagou_spider.py ├── neitui_spider.py ├── zhaopin_spider.py └── zhipin_spider.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # www_job_com -------------------------------------------------------------------------------- /job.sql: -------------------------------------------------------------------------------- 1 | SET FOREIGN_KEY_CHECKS=0; 2 | 3 | -- ---------------------------- 4 | -- Table structure for jobs 5 | -- ---------------------------- 6 | DROP TABLE IF EXISTS `jobs`; 7 | CREATE TABLE `jobs` ( 8 | `id` int(11) NOT NULL AUTO_INCREMENT, 9 | `position_id` varchar(255) DEFAULT '', 10 | `position_name` varchar(255) DEFAULT '', 11 | `position_lables` varchar(255) DEFAULT '', 12 | `work_year` varchar(255) DEFAULT '', 13 | `salary` varchar(255) DEFAULT NULL, 14 | `city` varchar(255) DEFAULT '', 15 | `education` varchar(255) DEFAULT '', 16 | `company_name` varchar(255) DEFAULT '', 17 | `industry_field` varchar(255) DEFAULT '', 18 | `finance_stage` varchar(255) DEFAULT '', 19 | `company_size` varchar(255) DEFAULT '', 20 | `updated_at` varchar(255) DEFAULT '', 21 | `time` varchar(255) DEFAULT '', 22 | `platform` varchar(255) DEFAULT '', 23 | `avg_salary` float(6,3) DEFAULT '0.000', 24 | PRIMARY KEY (`id`) 25 | ) ENGINE=MyISAM AUTO_INCREMENT=9623 DEFAULT CHARSET=utf8; 26 | -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | scrapy crawlall --nolog -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = www_job_com.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = www_job_com 12 | -------------------------------------------------------------------------------- /www_job_com/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chjw8016/www_job_com/20d20139e6f65d0b2b06cf56e49a5243a489d161/www_job_com/__init__.py -------------------------------------------------------------------------------- /www_job_com/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chjw8016/www_job_com/20d20139e6f65d0b2b06cf56e49a5243a489d161/www_job_com/commands/__init__.py -------------------------------------------------------------------------------- /www_job_com/commands/crawlall.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | 4 | class Command(ScrapyCommand): 5 | requires_project = True 6 | 7 | def syntax(self): 8 | return '[options]' 9 | 10 | def short_desc(self): 11 | return 'Runs all of the spiders' 12 | 13 | def run(self, args, opts): 14 | spider_list = self.crawler_process.spiders.list() 15 | for name in spider_list: 16 | print("*********"+name+"************") 17 | self.crawler_process.crawl(name, **opts.__dict__) 18 | self.crawler_process.start() 19 | -------------------------------------------------------------------------------- /www_job_com/items.py: -------------------------------------------------------------------------------- 1 | # 
-*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WwwJobComItem(scrapy.Item): 12 | position_id = scrapy.Field() 13 | position_name = scrapy.Field() 14 | position_lables = scrapy.Field() 15 | work_year = scrapy.Field() 16 | salary = scrapy.Field() 17 | avg_salary = scrapy.Field() 18 | city = scrapy.Field() 19 | education = scrapy.Field() 20 | company_name = scrapy.Field() 21 | industry_field = scrapy.Field() 22 | finance_stage = scrapy.Field() 23 | company_size = scrapy.Field() 24 | time = scrapy.Field() 25 | updated_at = scrapy.Field() 26 | platform = scrapy.Field() -------------------------------------------------------------------------------- /www_job_com/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class WwwJobComSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class WwwJobComDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /www_job_com/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymysql 8 | from twisted.enterprise import adbapi 9 | 10 | 11 | class WwwJobComPipeline(object): 12 | @classmethod 13 | def from_settings(cls, settings): 14 | dbparams = dict( 15 | host=settings['MYSQL_HOST'], 16 | db=settings['MYSQL_DBNAME'], 17 | user=settings['MYSQL_USER'], 18 | passwd=settings['MYSQL_PASSWD'], 19 | charset='utf8', 20 | cursorclass=pymysql.cursors.DictCursor, 21 | use_unicode=False, 22 | ) 23 | dbpool = adbapi.ConnectionPool('pymysql', **dbparams) 24 | return cls(dbpool) 25 | 26 | def __init__(self, dbpool): 27 | self.dbpool = dbpool 28 | 29 | def process_item(self, item, spider): 30 | query = self.dbpool.runInteraction(self._conditional_insert, item) 31 | query.addErrback(self._handle_error, item, spider) 32 | return item 33 | 34 | def _conditional_insert(self, tx, item): 35 | # print item['name'] 36 | sql = "select * from jobs where position_id=%s and platform=%s" 37 | position_id = (item["position_id"], item["platform"]) 38 | result = tx.execute(sql, position_id) 39 | if (result == 0): 40 | sql = "insert into jobs(position_id,position_name,position_lables,work_year,salary,city,education,company_name,industry_field,finance_stage,company_size,updated_at,`time`,platform,avg_salary) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 41 | params = ( 42 | item["position_id"], item["position_name"], item["position_lables"], item["work_year"], item["salary"], 43 | item["city"], item["education"], item["company_name"], item["industry_field"], 44 | item["finance_stage"], item["company_size"], item["updated_at"], item["time"], 45 | item["platform"], item["avg_salary"]) 46 | tx.execute(sql, params) 47 | 48 | def _handle_error(self, failue, item, spider): 49 | print(item) 50 | print(failue) 51 | -------------------------------------------------------------------------------- /www_job_com/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for www_job_com project 4 | # 5 | # For simplicity, this file contains 
only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'www_job_com' 13 | 14 | SPIDER_MODULES = ['www_job_com.spiders'] 15 | NEWSPIDER_MODULE = 'www_job_com.spiders' 16 | COMMANDS_MODULE = 'www_job_com.commands' 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | # USER_AGENT = 'www_job_com (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | # CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | # DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | # CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | # COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | # TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | # DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | # } 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | # SPIDER_MIDDLEWARES = { 50 | # 'www_job_com.middlewares.WwwJobComSpiderMiddleware': 543, 51 | # } 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | # DOWNLOADER_MIDDLEWARES = { 56 | # 'www_job_com.middlewares.WwwJobComDownloaderMiddleware': 543, 57 | # } 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | # EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | # } 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'www_job_com.pipelines.WwwJobComPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | # AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | # AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | # AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | # AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | # HTTPCACHE_ENABLED = True 87 | # HTTPCACHE_EXPIRATION_SECS = 0 88 | # HTTPCACHE_DIR = 'httpcache' 89 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | # Mysql数据库的配置信息 93 | MYSQL_HOST = 'dev.gammainfo.com' 94 | MYSQL_DBNAME = 'job' # 数据库名字 95 
| MYSQL_USER = 'root' # 数据库账号 96 | MYSQL_PASSWD = 'Gamma0903' # 数据库密码 97 | 98 | MYSQL_PORT = 3306 # 数据库端口 99 | -------------------------------------------------------------------------------- /www_job_com/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /www_job_com/spiders/chinahr_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import WwwJobComItem 5 | 6 | 7 | class ZhipinSpider(scrapy.Spider): 8 | name = 'chinahr' 9 | allowed_domains = ['www.chinahr.com'] 10 | start_urls = ['http://www.chinahr.com/'] 11 | positionUrl = '' 12 | curPage = 0 13 | headers = {} 14 | 15 | def start_requests(self): 16 | return [self.next_request()] 17 | 18 | def parse(self, response): 19 | print("request -> " + response.url) 20 | job_list = response.css('div.jobList > ul') 21 | if (len(job_list) > 0): 22 | print("chinahr Nums:" + str(len(job_list))) 23 | for job in job_list: 24 | item = WwwJobComItem() 25 | item['position_id'] = job.css('li.l1 > span.e1 > a::attr(href)').extract_first().strip().replace( 26 | ".html?searchplace=22,247", "").replace("http://www.chinahr.com/job/", "") 27 | item["position_name"] = job.css('li.l1 > span.e1 > a::text').extract_first().strip() 28 | salary = job.css('li.l2 > span.e2::text').extract_first().strip().split("-") 29 | item["salary"] = str(int(int(salary[0]) / 1000)) + "K-" + str(int(int(salary[1]) / 1000)) + "K" 30 | item["avg_salary"] = (int(salary[0]) + int(salary[1])) / 2000 31 | info_primary = job.css('li.l2 > span.e1::text').extract_first().strip().split("/") 32 | item['city'] = "河南/郑州" 33 | item['work_year'] = info_primary[2].replace("]\r\n\t\t\t\t\t\t\t", "") 34 | item['education'] = info_primary[3] 35 | item['company_name'] = job.css('li.l1 > span.e3 > a::text').extract_first().strip() 36 | 37 | item['industry_field'] = "" 38 | item['finance_stage'] = "" 39 | item['company_size'] = "" 40 | 41 | item['position_lables'] = "" 42 | item['time'] = job.css('li.l1 > span.e2::text').extract_first().strip() 43 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 44 | item['platform'] = "chinahr" 45 | yield item 46 | yield self.next_request() 47 | 48 | # 发送请求 49 | def next_request(self): 50 | self.curPage += 1 51 | self.positionUrl = "http://www.chinahr.com/sou/?orderField=relate&keyword=php&city=22,247&page=" + str( 52 | self.curPage) 53 | print("chinahr page:" + str(self.curPage)) 54 | time.sleep(10) 55 | return scrapy.http.FormRequest( 56 | self.positionUrl, 57 | headers=self.headers, 58 | callback=self.parse) 59 | -------------------------------------------------------------------------------- /www_job_com/spiders/dajie_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | import json 5 | from www_job_com.items import WwwJobComItem 6 | import urllib.parse 7 | import http.cookiejar 8 | import urllib.request 9 | import re 10 | 11 | 12 | class DajieSpider(scrapy.Spider): 13 | name = 'dajie' 14 | allowed_domains = ['so.dajie.com'] 15 | start_urls = ['https://so.dajie.com/'] 16 | 17 | curPage = 1 18 | city_id = 
"410100" 19 | city_name = "%E9%83%91%E5%B7%9E" 20 | job_name = "php" 21 | cookie = ""; 22 | url = 'https://so.dajie.com/job/ajax/search/filter?keyword=php&order=0&city=410100&recruitType=&salary=&experience=&page=1&positionFunction=&_CSRFToken=&ajax=1' 23 | headers = { 24 | "accept": "application/json, text/javascript, */*; q=0.01", 25 | "accept-encoding": "gzip, deflate, br", 26 | "content-type": "text/html;charset=UTF-8", 27 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36", 28 | "referer": "https://so.dajie.com/job/search?cityId=410100&cname=%E9%83%91%E5%B7%9E&from=job" 29 | } 30 | formData = {} 31 | 32 | def start_requests(self): 33 | cookie = http.cookiejar.CookieJar() 34 | opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie)) 35 | session = opener.open( 36 | "https://so.dajie.com/job/search?cityId=" + self.city_id + "&cname=" + self.city_name + "&from=job") 37 | session_cookie = re.findall(r"SO_COOKIE_V2=.+?;", str(session.info()))[0].split("=")[1] 38 | self.cookie = session_cookie.strip(";") 39 | return [self.next_request()] 40 | 41 | def parse(self, response): 42 | print("request -> " + response.url) 43 | try: 44 | html = json.loads(response.body.decode("utf-8")) 45 | except ValueError: 46 | print(response.body) 47 | yield self.next_request() 48 | 49 | if (html.get("result") == 0): 50 | print("dajie Num:" + str(html.get('data').get('total'))) 51 | results = html.get('data').get('list') 52 | if len(results) > 0: 53 | for result in results: 54 | item = WwwJobComItem() 55 | item['salary'] = result.get('salary').replace(" ", "").replace("/月", "") 56 | if (item["salary"].find("-") > -1): 57 | salary = item["salary"].split("-") 58 | item["avg_salary"] = (int(salary[0].replace("K", "")) + int(salary[1].replace("K", ""))) / 2 59 | else: 60 | item["avg_salary"] = item["salary"].replace("K", "") 61 | item['city'] = result.get('pubCity') 62 | item['finance_stage'] = "" 63 | item['industry_field'] = result.get('industryName') 64 | item['position_lables'] = "" 65 | item['position_id'] = result.get('jobseq') 66 | item['company_size'] = result.get('scaleName') 67 | item['position_name'] = result.get('jobName') 68 | item['work_year'] = result.get('pubEx') 69 | item['education'] = result.get('pubEdu') 70 | item['company_name'] = result.get('compName') 71 | item['time'] = result.get("time") 72 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 73 | item['platform'] = "dajie" 74 | yield item 75 | totalPage = html.get('data').get("totalPage") 76 | self.curPage = self.curPage + 1 77 | if (self.curPage <= totalPage): 78 | self.url = 'https://so.dajie.com/job/ajax/search/filter?keyword=' + self.job_name + '&order=0&city=' + self.city_id + '&recruitType=&salary=&experience=&page=' + str( 79 | self.curPage) + '&positionFunction=&_CSRFToken=&ajax=1' 80 | yield self.next_request() 81 | else: 82 | time.sleep(10) 83 | yield self.next_request() 84 | 85 | def next_request(self): 86 | print("dajie page:" + str(self.curPage)) 87 | return scrapy.http.FormRequest(url=self.url, cookies={"SO_COOKIE_V2": self.cookie}, 88 | formdata=self.formData, headers=self.headers, method="GET") 89 | -------------------------------------------------------------------------------- /www_job_com/spiders/ganji_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import 
WwwJobComItem 5 | import math 6 | 7 | 8 | class GanjiSpider(scrapy.Spider): 9 | name = 'ganji' 10 | allowed_domains = ['zz.ganji.com'] 11 | start_urls = ['http://zz.ganji.com/'] 12 | positionUrl = '' 13 | curPage = 0 14 | headers = {} 15 | 16 | def start_requests(self): 17 | return [self.next_request()] 18 | 19 | def parse(self, response): 20 | print("request -> " + response.url) 21 | job_list = response.css('div.job-parttime > dl') 22 | if (len(job_list) > 0): 23 | print("ganji Nums:" + str(len(job_list))) 24 | for job in job_list: 25 | item = WwwJobComItem() 26 | item['position_id'] = job.css('dt > div > input::attr(value)').extract_first().strip().split(",")[0] 27 | item["position_name"] = "php开发工程师" 28 | salary = job.css('em.unit::text').extract_first().strip() 29 | if (salary == "面议"): 30 | item["salary"] = "面议" 31 | item["avg_salary"] = 0 32 | else: 33 | salary = job.css('dt > div > p > em.lipay > i > strong::text').extract_first().strip().split("-") 34 | item["salary"] = str(math.ceil(int(salary[0]) / 1000)) + "K-" + str( 35 | math.ceil(int(salary[1]) / 1000)) + "K" 36 | item["avg_salary"] = (int(salary[0]) + int(salary[1])) / 2000 37 | item['city'] = job.css('dt > div > p.site > a::text').extract_first().strip().replace("地址:", "") 38 | item['work_year'] = job.css('dt > div > p > em.liexp::text').extract_first().strip().replace("经验:", 39 | "") 40 | item['education'] = "" 41 | item['company_name'] = job.css('div.j-comp > a::text').extract_first().strip() 42 | item['industry_field'] = "" 43 | item['finance_stage'] = "" 44 | item['company_size'] = "" 45 | item['position_lables'] = "" 46 | item['time'] = job.css('p.time::text').extract_first().strip() 47 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 48 | item['platform'] = "ganji" 49 | yield item 50 | yield self.next_request() 51 | 52 | # 发送请求 53 | def next_request(self): 54 | self.curPage += 1 55 | num = (self.curPage - 1) * 32 56 | self.positionUrl = 'http://zz.ganji.com/zhaopin/s/f' + str(num) + '/_php/' 57 | print("ganji page:" + str(self.curPage)) 58 | time.sleep(10) 59 | return scrapy.http.FormRequest( 60 | self.positionUrl, 61 | headers=self.headers, 62 | callback=self.parse) 63 | -------------------------------------------------------------------------------- /www_job_com/spiders/job51_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import WwwJobComItem 5 | 6 | 7 | class Job51Spider(scrapy.Spider): 8 | name = 'job51' 9 | allowed_domains = ['search.51job.com'] 10 | start_urls = ['http://search.51job.com/'] 11 | positionUrl = '' 12 | curPage = 0 13 | headers = {} 14 | 15 | def start_requests(self): 16 | return [self.next_request()] 17 | 18 | def parse(self, response): 19 | print("request -> " + response.url) 20 | job_list = response.css('div.dw_table > div.el') 21 | if (len(job_list) > 1): 22 | print("51job Nums:" + str(len(job_list))) 23 | for job in job_list: 24 | item = WwwJobComItem() 25 | str_time = job.css('span.t5::text').extract_first().strip() 26 | if (str_time == "发布时间"): 27 | continue 28 | else: 29 | item['position_id'] = job.css('p.t1 > input::attr(value)').extract_first().strip() 30 | item["position_name"] = job.css('p.t1 > span > a::text').extract_first().strip() 31 | salary = job.css('span.t4::text').extract_first().strip() 32 | if (salary.find("万/月") > -1): 33 | salary = salary.replace("万/月", "").split("-") 34 | item["salary"] = str(float(salary[0]) * 10) 
+ "K-" + str(float(salary[1]) * 10) + "K" 35 | item["avg_salary"] = (float(salary[0]) * 10 + float(salary[1]) * 10) / 2 36 | elif (salary.find("万/年") > -1): 37 | salary = salary.replace("万/年", "").split("-") 38 | item["salary"] = str(float(salary[0]) / 12) + "K-" + str(float(salary[1]) / 12) + "K" 39 | item["avg_salary"] = (float(salary[0]) / 12 + float(salary[1]) / 12) / 2 40 | elif (salary.find("元/天") > -1): 41 | continue 42 | else: 43 | salary = salary.replace("千/月", "").split("-") 44 | item["salary"] = salary[0] + "K-" + salary[1] + "K" 45 | item["avg_salary"] = (float(salary[0]) + float(salary[1])) / 2 46 | item['city'] = job.css('span.t3::text').extract_first().strip() 47 | item['work_year'] = "" 48 | item['education'] = "" 49 | item['company_name'] = job.css('span.t2 > a::text').extract_first().strip() 50 | 51 | item['industry_field'] = "" 52 | item['finance_stage'] = "" 53 | item['company_size'] = "" 54 | item['position_lables'] = "" 55 | item['time'] = str_time 56 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 57 | item['platform'] = "51job" 58 | yield item 59 | yield self.next_request() 60 | 61 | # 发送请求 62 | def next_request(self): 63 | self.curPage += 1 64 | self.positionUrl = "http://search.51job.com/list/170200,000000,0000,00,9,99,php,2," + str( 65 | self.curPage) + ".html" 66 | print("51job page:" + str(self.curPage)) 67 | time.sleep(10) 68 | return scrapy.http.FormRequest(self.positionUrl, 69 | headers=self.headers, 70 | callback=self.parse) 71 | -------------------------------------------------------------------------------- /www_job_com/spiders/job58_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import WwwJobComItem 5 | import math 6 | 7 | 8 | class Job58Spider(scrapy.Spider): 9 | name = 'job58' 10 | allowed_domains = ['zz.58.com'] 11 | start_urls = ['http://zz.58.com/'] 12 | positionUrl = 'http://zz.58.com/job/?key=php&final=1&jump=1' 13 | curPage = 0 14 | headers = {} 15 | 16 | def start_requests(self): 17 | return [self.next_request()] 18 | 19 | def parse(self, response): 20 | print("request -> " + response.url) 21 | job_list = response.css('li.job_item') 22 | if (len(job_list) > 0): 23 | print("job58 Nums:" + str(len(job_list))) 24 | for job in job_list: 25 | item = WwwJobComItem() 26 | item['time'] = job.css('span.sign::text').extract_first().strip() 27 | if (item['time'] == "优选" or item['time'] == "精准"): 28 | continue 29 | else: 30 | item['position_id'] = job.css('div.job_name > a::attr(urlparams)').extract_first().strip().replace( 31 | "psid=", "").replace("&entinfo=", "").replace("_p", "").replace("_j", "") 32 | item[ 33 | "position_name"] = job.css('div.job_comp > p.job_require >span::text').extract()[ 34 | 0].strip() 35 | salary = job.css('p.job_salary::text').extract_first().strip() 36 | if (salary == "面议"): 37 | new_salary = salary 38 | item["avg_salary"] = 0 39 | elif (salary == "1000"): 40 | new_salary = "1K" 41 | item["avg_salary"] = 1.0 42 | else: 43 | salary = salary.split("-") 44 | new_salary = str(math.ceil(int(salary[0]) / 1000)) + "K-" + str( 45 | math.ceil(int(salary[1]) / 1000)) + "K" 46 | item["avg_salary"] = (int(salary[0]) + int(salary[1])) / 2000 47 | item["salary"] = new_salary 48 | item['city'] = "郑州" 49 | item['work_year'] = job.css("div.job_comp > p.job_require > span::text").extract()[2].strip() 50 | item['education'] = job.css("div.job_comp > p.job_require > 
span::text").extract()[1].strip() 51 | item['company_name'] = job.css('div.comp_name > a::text').extract_first().strip() 52 | 53 | item['industry_field'] = "" 54 | item['finance_stage'] = "" 55 | item['company_size'] = "" 56 | label = job.css("div.job_wel > span::text").extract() 57 | item['position_lables'] = ",".join(label) 58 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 59 | item['platform'] = "job58" 60 | yield item 61 | yield self.next_request() 62 | 63 | # 发送请求 64 | def next_request(self): 65 | self.curPage += 1 66 | if (self.curPage > 1): 67 | self.positionUrl = "http://zz.58.com/job/pn" + str(self.curPage) + "/?key=php&final=1&jump=1" 68 | print("job58 page:" + str(self.curPage)) 69 | time.sleep(10) 70 | return scrapy.http.FormRequest( 71 | self.positionUrl, 72 | headers=self.headers, 73 | callback=self.parse) 74 | -------------------------------------------------------------------------------- /www_job_com/spiders/lagou_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | import json 5 | from www_job_com.items import WwwJobComItem 6 | import math 7 | 8 | 9 | class LagouSpider(scrapy.Spider): 10 | name = 'lagou' 11 | allowed_domains = ['www.lagou.com'] 12 | start_urls = ['https://www.lagou.com/'] 13 | 14 | curPage = 1 15 | city_name = "郑州" 16 | job_name = "PHP" 17 | url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=郑州&needAddtionalResult=false' 18 | headers = { 19 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 20 | 'Accept-Encoding': 'gzip, deflate', 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', 22 | "Referer": "https://www.lagou.com/jobs/list_php?cl=false&fromSearch=true&labelWords=&suginput=&city=郑州"} 23 | 24 | def start_requests(self): 25 | return [self.next_request()] 26 | 27 | def parse(self, response): 28 | print("request -> " + response.url) 29 | try: 30 | html = json.loads(response.body) 31 | except ValueError: 32 | print(response.body) 33 | yield self.next_request() 34 | 35 | if (html.get("success")): 36 | if html.get('content').get('positionResult').get('resultSize') != 0: 37 | results = html.get('content').get('positionResult').get('result') 38 | print('lagou Nums:' + str(len(results))) 39 | for result in results: 40 | item = WwwJobComItem() 41 | item['salary'] = result.get('salary').replace("k", "K") 42 | salary = item["salary"].split("-") 43 | item["avg_salary"] = (int(salary[0].replace("K", "")) + int(salary[1].replace("K", ""))) / 2 44 | item['city'] = result.get('city') 45 | item['finance_stage'] = result.get('financeStage') 46 | item['industry_field'] = result.get('industryField') 47 | item['position_lables'] = result.get('positionAdvantage') 48 | item['position_id'] = result.get('positionId') 49 | item['company_size'] = result.get('companySize') 50 | item['position_name'] = result.get('positionName') 51 | item['work_year'] = result.get('workYear') 52 | item['education'] = result.get('education') 53 | item['company_name'] = result.get('companyShortName') 54 | item['time'] = result.get("formatCreateTime") 55 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 56 | item['platform'] = "lagou" 57 | yield item 58 | totalPage = math.floor(int(html.get('content').get('positionResult').get("totalCount")) / int( 59 | html.get('content').get("pageSize"))) 60 | self.curPage = self.curPage + 1 
61 | if (self.curPage <= totalPage): 62 | yield self.next_request() 63 | else: 64 | time.sleep(60) 65 | yield self.next_request() 66 | 67 | def next_request(self): 68 | print("lagou page:" + str(self.curPage)) 69 | return scrapy.FormRequest(url=self.url, formdata={'pn': str(self.curPage), 'kd': self.job_name}, 70 | method='POST', 71 | headers=self.headers, meta={'page': self.curPage, 'kd': self.job_name}, 72 | dont_filter=True) 73 | -------------------------------------------------------------------------------- /www_job_com/spiders/neitui_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import WwwJobComItem 5 | 6 | 7 | class NeituiSpider(scrapy.Spider): 8 | name = 'neitui' 9 | allowed_domains = ['www.neitui.me'] 10 | start_urls = ['http://www.neitui.me'] 11 | positionUrl = 'http://www.neitui.me/?name=job&handle=lists' 12 | curPage = 0 13 | headers = { 14 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36" 15 | } 16 | 17 | def start_requests(self): 18 | return [self.next_request()] 19 | 20 | def parse(self, response): 21 | print("request -> " + response.url) 22 | job_list = response.css('ul.list-items > li') 23 | if (len(job_list) > 0): 24 | print("neitui Nums:" + str(len(job_list))) 25 | for job in job_list: 26 | item = WwwJobComItem() 27 | job_primary = job.css('div.positionleft > div') 28 | item['position_id'] = job_primary[0].css('a::attr(href)').extract_first().strip().replace("/j/", "") 29 | item["position_name"] = job_primary[0].css('a::text').extract_first().strip() 30 | item['time'] = job_primary[0].css('span::text').extract_first().strip() 31 | item["salary"] = job_primary[1].css('span.mr10::text').extract_first().strip().replace("k", "K") 32 | salary = item["salary"].split("-") 33 | item["avg_salary"] = (int(salary[0].replace("K", "")) + int(salary[1].replace("K", ""))) / 2 34 | info_primary = job_primary[1].css('span::text').extract() 35 | item['city'] = info_primary[5].strip() 36 | item['work_year'] = info_primary[1].strip() 37 | item['education'] = info_primary[3].strip() 38 | item['company_name'] = job_primary[2].css('span >a::text').extract_first().strip() 39 | item['finance_stage'] = job_primary[2].css('span::text').extract()[1].strip() 40 | item['industry_field'] = "" 41 | item['company_size'] = "" 42 | item['position_lables'] = "" 43 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 44 | item['platform'] = "neitui" 45 | yield item 46 | yield self.next_request() 47 | 48 | # 发送请求 49 | def next_request(self): 50 | self.curPage += 1 51 | self.positionUrl += "&keyword=PHP&city=%E9%83%91%E5%B7%9E&page=" + str(self.curPage) 52 | print("neitui page:" + str(self.curPage)) 53 | time.sleep(0) 54 | return scrapy.http.FormRequest( 55 | self.positionUrl, 56 | headers=self.headers, 57 | callback=self.parse) 58 | -------------------------------------------------------------------------------- /www_job_com/spiders/zhaopin_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import WwwJobComItem 5 | 6 | 7 | class ZhaopinSpider(scrapy.Spider): 8 | name = 'zhaopin' 9 | allowed_domains = ['sou.zhaopin.com'] 10 | start_urls = ['http://sou.zhaopin.com/'] 11 | positionUrl = '' 12 | curPage = 0 13 | headers = {} 14 | 15 | def 
start_requests(self): 16 | return [self.next_request()] 17 | 18 | def parse(self, response): 19 | print("request -> " + response.url) 20 | job_list = response.css('table.newlist > tr') 21 | if (len(job_list) > 1): 22 | print("zhaopin Nums:" + str(len(job_list))) 23 | i = 0 24 | for job in job_list: 25 | i += 1 26 | if (i > 1 and (i % 2) == 0): 27 | item = WwwJobComItem() 28 | item['position_id'] = job.css('td.zwmc > input::attr(data-monitor)').extract_first().strip().replace("|", "") 29 | name = job.css('td.zwmc > div > a').extract_first().strip() 30 | if (name.find("php") > -1 or name.find("Php") > -1 or name.find("PHP") > -1): 31 | item["position_name"] = "php研发工程师" 32 | salary = job.css('td.zwyx::text').extract_first().strip().split("-") 33 | item["salary"] = str(int(int(salary[0]) / 1000)) + "K-" + str(int(int(salary[1]) / 1000)) + "K" # both bounds are monthly CNY, so divide by 1000 to get "K" 34 | item["avg_salary"] = (int(salary[0]) + int(salary[1])) / 2000 35 | item['city'] = "郑州" 36 | item['work_year'] = "" 37 | item['education'] = "" 38 | item['company_name'] = job.css('td.gsmc > a::text').extract_first().strip() 39 | item['industry_field'] = "" 40 | item['finance_stage'] = "" 41 | item['company_size'] = "" 42 | item['position_lables'] = "" 43 | item['time'] = job.css('td.gxsj > span::text').extract_first().strip() 44 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 45 | item['platform'] = "zhaopin" 46 | yield item 47 | yield self.next_request() 48 | 49 | # Build the request for the next results page 50 | def next_request(self): 51 | self.curPage += 1 52 | if (self.curPage <= 10): 53 | self.positionUrl = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%83%91%E5%B7%9E&kw=php&sm=0&fl=719&isadv=0&sb=1&isfilter=1&et=2&p=" + str( 54 | self.curPage) 55 | print("zhaopin page:" + str(self.curPage)) 56 | time.sleep(10) 57 | return scrapy.http.FormRequest(self.positionUrl, 58 | headers=self.headers, 59 | callback=self.parse) -------------------------------------------------------------------------------- /www_job_com/spiders/zhipin_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import time 4 | from www_job_com.items import WwwJobComItem 5 | 6 | 7 | class ZhipinSpider(scrapy.Spider): 8 | name = 'zhipin' 9 | allowed_domains = ['www.zhipin.com'] 10 | start_urls = ['https://www.zhipin.com/'] 11 | positionUrl = 'https://www.zhipin.com/c101180100-p100103/h_101180100/?query=' 12 | curPage = 0 13 | headers = {} 14 | 15 | def start_requests(self): 16 | return [self.next_request()] 17 | 18 | def parse(self, response): 19 | print("request -> " + response.url) 20 | job_list = response.css('div.job-list > ul > li') 21 | if (len(job_list) > 0): 22 | print("zhipin Nums:" + str(len(job_list))) 23 | for job in job_list: 24 | item = WwwJobComItem() 25 | job_primary = job.css('div.job-primary') 26 | item['position_id'] = job.css('div.info-primary > h3 > a::attr(data-jobid)').extract_first().strip() 27 | item["position_name"] = job_primary.css('div.info-primary > h3 > a > div::text').extract_first().strip() 28 | item["salary"] = job_primary.css('div.info-primary > h3 > a > span::text').extract_first().strip() 29 | salary = item["salary"].split("-") 30 | item["avg_salary"] = (int(salary[0].replace("K", "")) + int(salary[1].replace("K", ""))) / 2 31 | info_primary = job_primary.css('div.info-primary > p::text').extract() 32 | item['city'] = info_primary[0].strip() 33 | item['work_year'] = info_primary[1].strip() 34 | item['education'] = info_primary[2].strip() 35 |
item['company_name'] = job_primary.css( 36 | 'div.info-company > div.company-text > h3 > a::text').extract_first().strip() 37 | company_infos = job_primary.css('div.info-company > div.company-text > p::text').extract() 38 | if len(company_infos) == 3: 39 | item['industry_field'] = company_infos[0].strip() 40 | item['finance_stage'] = company_infos[1].strip() 41 | item['company_size'] = company_infos[2].strip() 42 | else: 43 | item['industry_field'] = company_infos[0].strip() 44 | item['finance_stage'] = "" 45 | item['company_size'] = company_infos[1].strip() 46 | 47 | item['position_lables'] = "" # job_primary.css('div.info-detail > div.tags > span::text').extract() 48 | item['time'] = job.css('div.info-publis > p::text').extract_first().strip() 49 | item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 50 | item['platform'] = "zhipin" 51 | yield item 52 | yield self.next_request() 53 | 54 | # Build the request for the next results page 55 | def next_request(self): 56 | self.curPage += 1 57 | print("zhipin page:" + str(self.curPage)) 58 | time.sleep(10) 59 | return scrapy.http.FormRequest( 60 | self.positionUrl + ("&page=%d&ka=page-%d" % 61 | (self.curPage, self.curPage)), 62 | headers=self.headers, 63 | callback=self.parse) 64 | --------------------------------------------------------------------------------
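Running the project: run.bat drives everything through the custom crawlall command defined in www_job_com/commands/crawlall.py, which schedules every registered spider on the shared crawler process and starts the reactor once. For debugging outside the Scrapy CLI, roughly the same behavior can be reproduced with a standalone script along the lines of the sketch below; the run_all.py file name is illustrative, and it assumes the script is executed from the project root next to scrapy.cfg so that get_project_settings() picks up www_job_com.settings.

# run_all.py - minimal sketch, not part of the repository
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():
    # Load the project settings (item pipeline, MySQL credentials, etc.)
    process = CrawlerProcess(get_project_settings())
    # Schedule every spider found under www_job_com.spiders, then start the reactor once
    for name in process.spider_loader.list():
        print("*********" + name + "************")
        process.crawl(name)
    process.start()


if __name__ == "__main__":
    main()

Like crawlall, this runs all site spiders concurrently in a single process, so the jobs table receives rows from every platform in one pass.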