├── .gitignore ├── LICENSE ├── README.md └── spider ├── __init__.py ├── settings.py └── zl_spider.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 
32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 
62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 
85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 
111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 
145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "{}" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. 
We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright 2018 James 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zhilian_spider 2 | 智联招聘关键词搜索职位信息爬虫 -------------------------------------------------------------------------------- /spider/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | @author:James 4 | Created on:18-2-12 19:43 5 | """ 6 | -------------------------------------------------------------------------------- /spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | """ 3 | @author:James 4 | Created on:18-2-12 19:43 5 | """ 6 | # 搜索结果页 7 | URL_RESULT = 'http://sou.zhaopin.com/jobs/searchresult.ashx' 8 | 9 | # 搜索请求参数名 10 | 11 | # 1.职位关键字 12 | KEY_KEYWORD = 'kw' 13 | 14 | # 2.地域范围 15 | KEY_AREA = 'jl' 16 | 17 | # 3.页码 18 | KYE_PAGENUM = 'p' 19 | 20 | # 关键字 21 | VALUE_KEYWORD = 'Java' 22 | 23 | # 范围 24 | VALUE_AREA = '全国' 25 | 26 | # 页数限制 27 | PAGE_LIMIT = 1 28 | 
# -*- coding:utf-8 -*-
"""
Zhaopin (zhaopin.com) keyword-search result collector.

Repository path of this module: spider/zl_spider.py

Runs the search configured in ``spider.settings``, gathers the detail-page
links found on the result pages, then downloads every detail page and parses
it into a ``JobItem``.

NOTE(review): the downloads are spawned as gevent greenlets, but nothing in
this module calls ``gevent.monkey.patch_all()``. Unless the standard library
is patched elsewhere, ``requests`` blocks and the greenlets presumably run
one after another -- confirm.

@author:James
Created on:18-2-12 19:48
"""
import logging
import re
import requests
import gevent
import sqlalchemy
from bs4 import BeautifulSoup

from spider import settings

logger = logging.getLogger(__name__)

# Seconds to wait for a single HTTP response before giving up.
REQUEST_TIMEOUT = 10
# Seconds to wait for all result-page greenlets to finish.
JOIN_TIMEOUT = 10
# Number of postings per result page assumed by the page-count computation.
RESULTS_PER_PAGE = 60
# Number of <strong> fields read from the detail page's summary list.
DETAIL_FIELD_COUNT = 8
# Matches the "N positions satisfy the criteria" banner in the page *text*
# (markup already stripped), so a count wrapped in a tag is still found.
RESULT_COUNT_PATTERN = re.compile(r"共\s*([\d,]+)\s*个职位满足条件")


# Job posting record
class JobItem:
    """Plain record holding the fields scraped from one job detail page."""

    def __init__(self, job_name, job_corporation, job_monthly_salary, job_work_place, job_release_date, job_category,
                 job_work_experience, job_minimum_education_requirements, job_recruiting_numbers, job_job_category):
        # Title
        self.name = job_name
        # Company
        self.corporation = job_corporation
        # Monthly salary
        self.monthly_salary = job_monthly_salary
        # Work place
        self.work_place = job_work_place
        # Release date
        self.release_date = job_release_date
        # Nature of the job
        self.job_category = job_category
        # Work experience
        self.work_experience = job_work_experience
        # Minimum education
        self.min_edu_requirements = job_minimum_education_requirements
        # Number of people being recruited
        self.recruiting_number = job_recruiting_numbers
        # Position category
        self.category = job_job_category

    def __repr__(self):
        return '%s(name=%r, corporation=%r, work_place=%r)' % (
            type(self).__name__, self.name, self.corporation, self.work_place)


# Detail collector
class GetDetailInfo:
    """Download a list of detail pages and parse each one into a ``JobItem``."""

    def __init__(self, urls):
        self.urls = urls
        # Items parsed by the most recent get_detail_info() call.
        self.items = []

    def get_detail_info(self):
        """Fetch every URL in ``self.urls`` and return the parsed items.

        Returns:
            list of ``JobItem``; pages that failed to download or parse are
            omitted.
        """
        works = [gevent.spawn(self.get_detail_info_page, url) for url in self.urls]
        gevent.joinall(works)
        # A greenlet's value is None when it raised or returned None, so
        # filtering on it keeps only successfully parsed pages.
        self.items = [work.value for work in works if work.value is not None]
        return self.items

    def get_detail_info_page(self, url):
        """Fetch one detail page and parse it.

        Args:
            url: address of the job detail page.

        Returns:
            A ``JobItem``, or ``None`` when the request fails or the page
            does not contain the expected elements.
        """
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            logger.warning('Request for detail page failed: %s (%s)', url, err)
            return None
        soup = BeautifulSoup(response.content, 'lxml')
        title = soup.find('h1')
        company_links = soup.select('h2 > a')
        job_details = soup.select('div.terminalpage-left > ul > li > strong')
        # Guard every positional lookup: a changed layout or an error page
        # would otherwise raise AttributeError / IndexError.
        if title is None or not company_links or len(job_details) < DETAIL_FIELD_COUNT:
            logger.warning('Unexpected detail page layout, skipped: %s', url)
            return None
        (job_monthly_salary, job_work_place, job_release_date, job_category,
         job_work_experience, job_minimum_education_requirements,
         job_recruiting_numbers, job_job_category) = [
            node.get_text() for node in job_details[:DETAIL_FIELD_COUNT]]
        return JobItem(job_name=title.get_text(), job_corporation=company_links[0].get_text(),
                       job_monthly_salary=job_monthly_salary,
                       job_work_place=job_work_place, job_release_date=job_release_date,
                       job_category=job_category,
                       job_work_experience=job_work_experience,
                       job_minimum_education_requirements=job_minimum_education_requirements,
                       job_recruiting_numbers=job_recruiting_numbers, job_job_category=job_job_category)


# Detail URL collector
class GetResultUrls:
    """Collect detail-page links from the search result pages."""

    def __init__(self):
        self.url_repository = UrlRepository()
        self.page_limit = settings.PAGE_LIMIT
        self.url_search = settings.URL_RESULT
        # Number of result pages reported by the site; 0 until known.
        self.page_maximum = 0

    @staticmethod
    def _parse_result_count(page_text):
        """Return the total result count found in *page_text*, or ``None``."""
        match = RESULT_COUNT_PATTERN.search(page_text)
        if match is None:
            return None
        return int(match.group(1).replace(',', ''))

    # Collect all detail links for the configured search
    def get_detail_urls(self):
        """Run the configured search and return the collected detail URLs.

        The number of pages fetched is ``settings.PAGE_LIMIT``, capped by the
        number of pages the site reports when that can be determined.

        Returns:
            list of detail-page URLs (the repository's ``urls`` list).
        """
        data = {
            settings.KEY_KEYWORD: settings.VALUE_KEYWORD,
            settings.KEY_AREA: settings.VALUE_AREA,
        }
        response = requests.get(self.url_search, params=data, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, 'lxml')
        result_count = self._parse_result_count(soup.get_text())
        if result_count is None:
            # Banner not found: fall back to the configured limit instead of
            # crashing on a missing match.
            logger.warning('Could not determine the result count; using page limit %s', self.page_limit)
            page_total = self.page_limit
        else:
            # Ceiling division so a partially filled last page is counted.
            self.page_maximum = (result_count + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE
            page_total = min(self.page_limit, self.page_maximum)
        # NOTE(review): pages are requested starting at 1 (the original
        # started at 0); presumably the site numbers pages from 1 -- confirm.
        works = [gevent.spawn(self.get_detail_urls_page, page) for page in range(1, page_total + 1)]
        gevent.joinall(works, timeout=JOIN_TIMEOUT)
        return self.url_repository.urls

    # Collect the detail links on one result page
    def get_detail_urls_page(self, page_number):
        """Fetch one result page and store the detail links it contains.

        Args:
            page_number: value sent as the page parameter of the search.

        Returns:
            list of the detail URLs found on this page (also pushed into
            ``self.url_repository``); empty when the request fails.
        """
        url_result = []
        data = {
            settings.KEY_KEYWORD: settings.VALUE_KEYWORD,
            settings.KEY_AREA: settings.VALUE_AREA,
            # The setting name is spelled "KYE_PAGENUM" in spider/settings.py.
            settings.KYE_PAGENUM: page_number
        }
        try:
            response = requests.get(self.url_search, params=data, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            logger.warning('Request for result page %s failed (%s)', page_number, err)
            return url_result
        soup = BeautifulSoup(response.content, 'lxml')
        detail_links = soup.select("table.newlist > tr > td.zwmc > div > a")
        keyword = settings.VALUE_KEYWORD.lower()
        for link in detail_links:
            href = link.get('href')
            if not href:
                continue
            text = link.getText()
            # Keep only detail links: hrefs containing '.do' are skipped
            if '.do' in href:
                continue
            # Keep only postings whose title contains the keyword
            if keyword not in text.lower():
                continue
            print(text)
            print(href)
            url_result.append(href)
            self.url_repository.push(href)
        return url_result


# URL repository
class UrlRepository:
    """Append-only store for the collected detail-page URLs."""

    def __init__(self):
        self.urls = []

    def push(self, url):
        """Append *url* to the repository."""
        self.urls.append(url)


# Main class
class SpiderMain:
    """Entry point: collect detail URLs, then scrape every detail page."""

    def __init__(self):
        self.url_result = []
        self.url_search = settings.URL_RESULT
        # JobItem objects produced by the most recent run().
        self.items = []

    def run(self):
        """Execute the whole crawl and return the list of parsed ``JobItem``s."""
        url_collector = GetResultUrls()
        self.url_result = url_collector.get_detail_urls()
        print(self.url_result)
        collector = GetDetailInfo(self.url_result)
        self.items = collector.get_detail_info()
        return self.items


if __name__ == '__main__':
    app = SpiderMain()
    app.run()