├── .gitignore ├── LICENSE ├── README.md ├── city_data.json ├── crawl_36kr.py ├── crawl_company_detail.py ├── crawl_lagou.py ├── crawl_topic&article.py ├── get_city_data.py ├── requirements.txt └── screenshots ├── choose_city.gif ├── edit_resume.gif ├── home.gif ├── loading.gif ├── login.gif ├── read.gif ├── recruit.gif ├── recurit_want.gif ├── setting.gif ├── sort.gif └── sort_2.gif /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### JetBrains template 3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 5 | 6 | # User-specific stuff 7 | .idea/**/workspace.xml 8 | .idea/**/tasks.xml 9 | .idea/**/usage.statistics.xml 10 | .idea/**/dictionaries 11 | .idea/**/shelf 12 | .idea/* 13 | # Sensitive or high-churn files 14 | .idea/**/dataSources/ 15 | .idea/**/dataSources.ids 16 | .idea/**/dataSources.local.xml 17 | .idea/**/sqlDataSources.xml 18 | .idea/**/dynamic.xml 19 | .idea/**/uiDesigner.xml 20 | .idea/**/dbnavigator.xml 21 | 22 | # Gradle 23 | .idea/**/gradle.xml 24 | .idea/**/libraries 25 | 26 | # Gradle and Maven with auto-import 27 | # When using Gradle or Maven with auto-import, you should exclude module files, 28 | # since they will be recreated, and may cause churn. Uncomment if using 29 | # auto-import. 30 | # .idea/modules.xml 31 | # .idea/*.iml 32 | # .idea/modules 33 | 34 | # CMake 35 | cmake-build-*/ 36 | 37 | # Mongo Explorer plugin 38 | .idea/**/mongoSettings.xml 39 | 40 | # File-based project format 41 | *.iws 42 | 43 | # IntelliJ 44 | out/ 45 | 46 | # mpeltonen/sbt-idea plugin 47 | .idea_modules/ 48 | 49 | # JIRA plugin 50 | atlassian-ide-plugin.xml 51 | 52 | # Cursive Clojure plugin 53 | .idea/replstate.xml 54 | 55 | # Crashlytics plugin (for Android Studio and IntelliJ) 56 | com_crashlytics_export_strings.xml 57 | crashlytics.properties 58 | crashlytics-build.properties 59 | fabric.properties 60 | 61 | # Editor-based Rest Client 62 | .idea/httpRequests 63 | ### Python template 64 | # Byte-compiled / optimized / DLL files 65 | __pycache__/ 66 | *.py[cod] 67 | *$py.class 68 | 69 | # C extensions 70 | *.so 71 | 72 | # Distribution / packaging 73 | .Python 74 | build/ 75 | develop-eggs/ 76 | dist/ 77 | downloads/ 78 | eggs/ 79 | .eggs/ 80 | lib/ 81 | lib64/ 82 | parts/ 83 | sdist/ 84 | var/ 85 | wheels/ 86 | *.egg-info/ 87 | .installed.cfg 88 | *.egg 89 | MANIFEST 90 | 91 | # PyInstaller 92 | # Usually these files are written by a python script from a template 93 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
94 | *.manifest 95 | *.spec 96 | 97 | # Installer logs 98 | pip-log.txt 99 | pip-delete-this-directory.txt 100 | 101 | # Unit test / coverage reports 102 | htmlcov/ 103 | .tox/ 104 | .coverage 105 | .coverage.* 106 | .cache 107 | nosetests.xml 108 | coverage.xml 109 | *.cover 110 | .hypothesis/ 111 | .pytest_cache/ 112 | 113 | # Translations 114 | *.mo 115 | *.pot 116 | 117 | # Django stuff: 118 | *.log 119 | local_settings.py 120 | db.sqlite3 121 | 122 | # Flask stuff: 123 | instance/ 124 | .webassets-cache 125 | 126 | # Scrapy stuff: 127 | .scrapy 128 | 129 | # Sphinx documentation 130 | docs/_build/ 131 | 132 | # PyBuilder 133 | target/ 134 | 135 | # Jupyter Notebook 136 | .ipynb_checkpoints 137 | 138 | # pyenv 139 | .python-version 140 | 141 | # celery beat schedule file 142 | celerybeat-schedule 143 | 144 | # SageMath parsed files 145 | *.sage.py 146 | 147 | # Environments 148 | .env 149 | .venv 150 | env/ 151 | venv/ 152 | lagou_venv/* 153 | ENV/ 154 | env.bak/ 155 | venv.bak/ 156 | 157 | # Spyder project settings 158 | .spyderproject 159 | .spyproject 160 | 161 | # Rope project settings 162 | .ropeproject 163 | 164 | # mkdocs documentation 165 | /site 166 | 167 | # mypy 168 | .mypy_cache/ 169 | 170 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Qian Bin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Full-stack series: a Vue clone of the Lagou app — come and have a look
2 | Lagou app clone series --- Python crawler series
3 | ### Preface
4 | This is a beginner-oriented guide project written in my spare time. Small as it is, it has all the essential parts, and most of what it uses will come up in real development work, but there are inevitably many places that still need polish.
5 |
6 | I am currently preparing for the national judicial exam and busy at work, so I have no time to maintain the project. There are still plenty of bugs and things worth optimizing — please do point them out (I will fix them when I find time), and a star would of course be even better.
7 |
8 | To make it easier to try out, mock data is also included, but it is not complete; for the full experience, follow the steps below.
9 |
10 | The front-end project is written with Vue; the other parts involve Node, Python, etc. — see the repositories listed below or look them up yourself.
11 | ### Note: this is a personal practice project and is not to be used for any commercial purpose
12 |
13 | # Todo list
14 | + ~~Job data crawling~~ √
15 | + ~~Company data crawling~~ √
16 | + ~~Comment data crawling~~ √
17 | + ~~User data crawling~~ √
18 | + ~~Article data crawling~~ √
19 | + ~~Topic data crawling~~ √
20 | + ~~City and subway data crawling~~ √
21 | + ~~Data formatting and related processing~~ √
22 |
23 | # Tech stack
24 | Front end:
25 | + the full Vue ecosystem
26 | + es6
27 | + scss
28 | + mint-ui
29 | + mockjs
30 | + jquery
31 |
32 | Forwarding server:
33 | + node
34 | + express
35 |
36 | Actual API server:
37 | + python3
38 | + mongodb
39 |
40 | Crawler:
41 | + python3
42 |
43 | # Demo
44 | ### First load
45 | ![](screenshots/loading.gif)
46 | ### Login / registration
47 | ![](screenshots/login.gif)
48 | ### Home
49 | ![](screenshots/home.gif)
50 | ### Article reading
51 | ![](screenshots/read.gif)
52 | ### City selection
53 | ![](screenshots/choose_city.gif)
54 | ### Job browsing
55 | ![](screenshots/recruit.gif)
56 | ### Filtering
57 | ![](screenshots/recurit_want.gif)
58 | ### Sorting
59 | ![](screenshots/sort.gif)
60 | ### Sorting 2
61 | ![](screenshots/sort_2.gif)
62 | ### Resume editing
63 | ![](screenshots/edit_resume.gif)
64 | ### My settings
65 | ![](screenshots/setting.gif)
66 |
67 | PS: there are more settings screens that are not captured here (the GIFs get large); clone the project and take a look if you are interested.
68 |
69 | # Live demo
70 |
71 | # Project links
72 | Front end: https://github.com/qianbin01/lagou_vue
73 |
74 | Proxy API: https://github.com/qianbin01/lagou_node
75 |
76 | API: https://github.com/qianbin01/lagou_python_api
77 |
78 | Crawler: https://github.com/qianbin01/lagou_spider
79 | # Environment
80 | Ubuntu 16.04
81 | # How to run
82 | Required steps:
83 | 1. Run the crawler project
84 | 2. Run the Python API project
85 | 3. Run the Node API forwarding project
86 | 4. Run this project
87 |
88 | Steps for this project:
89 |
90 | 1. git clone https://github.com/qianbin01/lagou_spider.git
91 | 2. cd lagou_spider
92 | 3. pip install -r requirements.txt
93 | 4. Set up a MongoDB instance
94 | 5. Change the proxy-server IP to your own; for how to set up your own proxy server, click
95 | here (the MongoDB and proxy settings live in a `config` module — see the sketch below)
96 | 6. Set up a scheduled task, depending on your operating system
97 | #### Windows
98 | schtasks syntax: schtasks /create /tn <task name> /tr "<program to run>" /sc daily /st <time>
99 | Demo:
100 | schtasks /create /tn daily_run /tr "notepad" /sc daily /st 12:30 (runs Notepad every day at 12:30)
101 | #### Linux (Ubuntu) / macOS
102 | crontab -e   edit the schedule
103 | crontab -l   list the schedule
104 | Format:
105 | minute hour day-of-month month day-of-week command (* means every value of that field)
106 | * * * * * command
107 | To run once a day at 09:00:
108 | 0 9 * * * /usr/bin/python3 /home/qb/do_something.py
109 |
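All of the crawler scripts (`crawl_36kr.py`, `crawl_lagou.py`, `crawl_company_detail.py`, `crawl_topic&article.py`, `get_city_data.py`) do `import config` and read `config.MONGO_HOST`, `config.MONGO_PORT`, `config.MONGO_DB`, `config.MONGO_AUTH_NAME` and `config.MONGO_AUTH_PASSWORD`, but no `config.py` ships with the repository, so you need to create one next to the scripts before step 1 will run. Below is a minimal sketch of what such a module could look like — the attribute names are taken from the scripts themselves, while every value is a placeholder you must replace with your own deployment details:

```python
# config.py — placeholder values only; replace them with your own settings.
# The attribute names match what the crawler scripts import; the values are made up.

MONGO_HOST = '127.0.0.1'            # MongoDB host; get_proxy()/delete_proxy() also expect the
                                    # proxy-pool service (port 5010) to be reachable on this host
MONGO_PORT = 27017                  # default MongoDB port
MONGO_DB = 'lagou'                  # database the collections are created in (name is a guess)
MONGO_AUTH_NAME = 'lagou_user'      # MongoDB user with readWrite access to MONGO_DB
MONGO_AUTH_PASSWORD = 'change_me'   # that user's password
```

If your MongoDB instance runs without authentication, you would also need to comment out the `db.authenticate(...)` call at the top of each script (as `get_city_data.py` already does).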
110 | # Other
111 | 1. Test address: http://114.67.151.31:5010 (a single machine — please don't stress-test it. Thanks)
112 | 2. PS: I could not find an endpoint for Lagou Toutiao, so 36kr is used as a substitute for now
113 |
114 |
115 | # Give these a little click
116 | Zhihu column: https://zhuanlan.zhihu.com/c_1010582778160779264
117 |
118 | Juejin: https://juejin.im/user/5b8291bce51d4538ab043911
119 |
120 | SegmentFault: https://segmentfault.com/u/qishidexinxin
121 |
122 | I hope this helps
123 |
124 | ![](http://oh343spqg.bkt.clouddn.com/dianzan.jpg)
125 |
126 | Donations toward the server renewal are very welcome
127 |
128 |
129 |
130 |
131 |
132 |
133 | # License
134 | MIT
135 |
136 |
--------------------------------------------------------------------------------
/crawl_36kr.py:
--------------------------------------------------------------------------------
1 | import requests 2 | import pymongo 3 | import config 4 | 5 | client = pymongo.MongoClient(host=config.MONGO_HOST, port=config.MONGO_PORT) 6 | db = client[config.MONGO_DB] 7 | db.authenticate(config.MONGO_AUTH_NAME, config.MONGO_AUTH_PASSWORD) 8 | news = db['news_36kr'] 9 | 10 | 11 | def get_news_by_36kr():  # crawl the 36kr column list plus article details into MongoDB 12 | list_base_url = 'https://36kr.com/api/search-column/218?per_page=100&page={}' 13 | detail_base_url = 'https://36kr.com/api/post/{}' 14 | for i in range(1, 2): 15 | url = list_base_url.format(i) 16 | r = requests.get(url) 17 | for item in r.json().get('data').get('items'): 18 | detail_url = detail_base_url.format(item['id']) 19 | detail_r = requests.get(detail_url) 20 | detail_data = detail_r.json().get('data') 21 | single_data = { 22 | 'nid': item['id'], 23 | 'summary': item['summary'], 24 | 'title': item['title'], 25 | 'publish_item': detail_data['published_at'], 26 | 'extraction_tags': item['extraction_tags'], 27 | 'cover': item['cover'], 28 | 'content': detail_data['content'], 29 | 'count': detail_data.get('counters').get('view_count') 30 | } 31 | insert_data = news.find_one({'nid': item['id']}) 32 | if not insert_data: 33 | print(single_data) 34 | news.insert(single_data) 35 | 36 | 37 | def format_news():  # normalize extraction_tags stored as strings into lists of tags 38 | import re 39 | p = re.compile('\"(.*?)\"') 40 | for item in news.find(): 41 | if type(item['extraction_tags']) == str: 42 | match = p.findall(item['extraction_tags']) 43 | news.update({'_id': item['_id']}, {'$set': {'extraction_tags': match}}) 44 | print(match) 45 | 46 | 47 | if __name__ == '__main__': 48 | get_news_by_36kr() 49 | format_news() 50 |
--------------------------------------------------------------------------------
/crawl_company_detail.py:
--------------------------------------------------------------------------------
1 | import requests 2 | import pymongo 3 | import config 4 | from bs4 import BeautifulSoup 5 | import time 6 | 7 | client = pymongo.MongoClient(host=config.MONGO_HOST, port=config.MONGO_PORT) 8 | db = client[config.MONGO_DB] 9 | db.authenticate(config.MONGO_AUTH_NAME, config.MONGO_AUTH_PASSWORD) 10 | company = db['company'] 11 | company_detail = db['company_detail'] 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' 14 | '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 15 | 'Host': 'www.lagou.com' 16 | } 17 | base_url = 'https://www.lagou.com/gongsi/{}.html' 18 | recruit_url = 'https://www.lagou.com/gongsi/searchPosition.json' 19 | reply_url = 'https://www.lagou.com/gongsi/searchInterviewExperiences.json' 20 | question_url = 'https://www.lagou.com/gongsi/q{}.html' 21 | 22 | 23 | # Fetch a proxy from the proxy pool 24 | def get_proxy(): 25 | return requests.get("http://{}:5010/get/".format(config.MONGO_HOST)).content # replace the IP with your own server address 26 | 27 | 28 | # Remove a dead proxy from the pool 29 | def delete_proxy(proxy): 30 | requests.get("http://{}:5010/delete/?proxy={}".format(config.MONGO_HOST, proxy)) # replace the IP with your own server address 31 | 32 | 33 | def get_html(doc, cid): 34 | 
company_detail_one = company_detail.find_one({'companyId': cid}) 35 | if company_detail_one: 36 | print(doc['companyShortName'] + '已经存在,直接跳过') 37 | return False 38 | proxy = str(get_proxy(), encoding='utf-8') 39 | proxies = { 40 | 'http': 'http://{}'.format(proxy), 41 | 'https': 'http://{}'.format(proxy), 42 | } # 获取并设置代理 43 | url = base_url.format(cid) 44 | r = requests.get(url, headers=headers, proxies=proxies) 45 | soup = BeautifulSoup(r.text, 'lxml') 46 | span = soup.find('span', class_='company_content') 47 | if not span: 48 | while True: 49 | proxy = str(get_proxy(), encoding='utf-8') 50 | proxies = { 51 | 'http': 'http://{}'.format(proxy), 52 | 'https': 'http://{}'.format(proxy), 53 | } # 获取并设置代理 54 | try: 55 | r = requests.get(url, headers=headers, proxies=proxies, allow_redirects=False) 56 | soup = BeautifulSoup(r.text, 'lxml') 57 | span = soup.find('span', class_='company_content') 58 | if span: 59 | break 60 | except Exception as e: 61 | if 'HTTPSConnectionPool' in str(e): 62 | print('这个代理不能用,我删了你 {}'.format(proxy)) # 代理本身不可用则删除该代理 63 | delete_proxy(proxy) 64 | detail_doc = doc.copy() 65 | # 公司介绍 66 | detail_doc['companyIntroduce'] = span.text 67 | # 图片 68 | img_ul = soup.find('ul', class_='company_img') 69 | img_list = [] 70 | if img_ul: 71 | for li in img_ul.find_all('li'): 72 | img_list.append({'src': li.get('data-item')}) 73 | detail_doc['imgList'] = img_list 74 | # 地址 75 | address_ul = soup.find('ul', class_='con_mlist_ul') 76 | address_list = [] 77 | if address_ul: 78 | for li in address_ul.find_all('li'): 79 | address_list.append({ 80 | 'bigAddress': li.find('p', class_='mlist_li_title').text.strip().replace('\n', '').replace(' ', ''), 81 | 'smallAddress': li.find('p', class_='mlist_li_desc').text.strip() 82 | }) 83 | detail_doc['addressList'] = address_list 84 | # 历史记载 85 | history_ul = soup.find('ul', class_='history_ul') 86 | history_list = [] 87 | if history_ul: 88 | for li in history_ul.find_all('li'): 89 | history_list.append({ 90 | 'historyDate': li.find('div', class_='li_date').text.strip().replace('\n', '.'), 91 | 'historyText': li.find('span', class_='desc_real_title').text.strip() 92 | }) 93 | detail_doc['historyList'] = history_list 94 | 95 | # 问题记载 96 | question_r = requests.get(question_url.format(cid), allow_redirects=False) 97 | question_soup = BeautifulSoup(question_r.text, 'lxml') 98 | question_ul = question_soup.find('ul', id='question-answer-list') 99 | question_list = [] 100 | if question_ul: 101 | for li in question_ul.find_all('li'): 102 | try: 103 | question_list.append({ 104 | 'itemTitle': li.find('h4', class_='item-title').text.strip().replace('\n', '.'), 105 | 'itemTime': li.find('span', class_='item-time').text.strip(), 106 | 'itemStatus': li.find('div', class_='item-status').text.strip(), 107 | }) 108 | except Exception as e: 109 | print(e) 110 | continue 111 | detail_doc['questionList'] = question_list 112 | 113 | # 反馈记载 114 | reply_data = { 115 | 'companyId': cid, 116 | 'positionType': '', 117 | 'pageNo': 1, 118 | 'pageSize': 10 119 | } 120 | reply_header = headers 121 | reply_header['Referer'] = 'https://www.lagou.com/gongsi/interviewExperiences.html?companyId={}'.format(cid) 122 | try: 123 | reply_r = requests.post(reply_url, data=reply_data, headers=reply_header, allow_redirects=False) 124 | reply_list = reply_r.json().get('content').get('data').get('page').get('result') 125 | detail_doc['replyList'] = reply_list 126 | except Exception as e: 127 | print(e) 128 | print('这里请求太快,代理不够用,等3分钟再请求吧') 129 | time.sleep(180) 130 | reply_r = 
requests.post(reply_url, data=reply_data, headers=reply_header, allow_redirects=False) 131 | reply_list = reply_r.json().get('content').get('data').get('page').get('result') 132 | detail_doc['replyList'] = reply_list 133 | # 职位记载 134 | recruit_data = { 135 | 'companyId': cid, 136 | 'positionFirstType': '全部', 137 | 'schoolJob': False, 138 | 'pageNo': 1, 139 | 'pageSize': 100 140 | } 141 | recruit_header = headers 142 | recruit_header['Referer'] = 'https://www.lagou.com/gongsi/j{}.html'.format(cid) 143 | try: 144 | recruit_r = requests.post(recruit_url, data=recruit_data, headers=recruit_header, allow_redirects=False) 145 | recruit_list = recruit_r.json().get('content').get('data').get('page').get('result') 146 | detail_doc['recruitList'] = recruit_list 147 | except Exception as e: 148 | print(e) 149 | print('这里请求太快,代理不够用,等3分钟再请求吧') 150 | time.sleep(180) 151 | recruit_r = requests.post(recruit_url, data=recruit_data, headers=recruit_header, allow_redirects=False) 152 | recruit_list = recruit_r.json().get('content').get('data').get('page').get('result') 153 | detail_doc['recruitList'] = recruit_list 154 | print(detail_doc) 155 | company_detail.insert(detail_doc) 156 | 157 | 158 | def get_cid_from_db(): 159 | companies = company.find(no_cursor_timeout=True) 160 | for item in companies: 161 | del item['_id'] 162 | print(item['companyShortName']) 163 | get_html(item, item['companyId']) 164 | companies.close() 165 | 166 | 167 | if __name__ == '__main__': 168 | get_cid_from_db() 169 | -------------------------------------------------------------------------------- /crawl_lagou.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pymongo 3 | import config 4 | 5 | # 基本信息 6 | client = pymongo.MongoClient(host=config.MONGO_HOST, port=config.MONGO_PORT) 7 | db = client[config.MONGO_DB] 8 | db.authenticate(config.MONGO_AUTH_NAME, config.MONGO_AUTH_PASSWORD) 9 | recruit_data = db['recruit'] 10 | topic_data = db['topic'] 11 | company_data = db['company'] 12 | article_data = db['article'] 13 | headers = { 14 | "Referer": "https://www.lagou.com/jobs/list_", 15 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " 16 | "(KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36", 17 | } 18 | keywords = [ 19 | 'java后端', 20 | 'java', 21 | 'java web', 22 | 'java 实习', 23 | 'java 分布式', 24 | '前端', 25 | '前端实习', 26 | 'javascript', 27 | 'web', 28 | 'vue', 29 | 'html5', 30 | '全栈', 31 | 'node', 32 | 'node.js', 33 | 'web实习', 34 | 'react', 35 | 'angular', 36 | 'reactnative', 37 | 'python', 38 | 'python爬虫', 39 | '大数据', 40 | 'django', 41 | 'flask', 42 | 'python实习', 43 | '量化交易', 44 | 'mongodb', 45 | 'redis', 46 | '机器学习', 47 | '算法', 48 | '计算机视觉', 49 | '人工智能', 50 | '自然语言', 51 | '程序员', 52 | '设计师', 53 | 'ui', 54 | '产品经理', 55 | '运维', 56 | '运营', 57 | '互联网运营' 58 | ] 59 | 60 | 61 | # 获取代理 62 | def get_proxy(): 63 | return requests.get("http://{}:5010/get/".format(config.MONGO_HOST)).content # ip替换成自己的服务器地址 64 | 65 | 66 | # 删除服务器无用代理 67 | def delete_proxy(proxy): 68 | requests.get("http://{}:5010/delete/?proxy={}".format(config.MONGO_HOST, proxy)) # ip替换成自己的服务器地址 69 | 70 | 71 | # 获取求职岗位数据 72 | def get_data_by_crawl(city, kw): 73 | url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&city={}'.format(city) 74 | proxy = str(get_proxy(), encoding='utf-8') 75 | proxies = { 76 | 'http': 'http://{}'.format(proxy), 77 | 'https': 'http://{}'.format(proxy), 78 | } # 获取并设置代理 79 | for i in range(1, 100): 80 | data = {"first": "true", "pn": i, 
"kd": kw} 81 | try: 82 | base_request = requests.post(url, data=data, headers=headers, timeout=3, proxies=proxies) 83 | base_request.json().get('content') 84 | except Exception as e: 85 | print(e) 86 | continue 87 | if not base_request.json().get('content', ''): 88 | flag = False 89 | while not flag: # 若代理ip没走通则换一个 90 | try: 91 | r = requests.post(url, data=data, headers=headers, timeout=3, proxies=proxies) 92 | if not r.json().get('content', ''): 93 | raise Exception('这个ip不能用') 94 | save_to_db(r.json().get('content', ''), 'data') # 存入数据库 95 | flag = True # 成功获取数据跳出循环 96 | except Exception as e: 97 | if 'HTTPSConnectionPool' in str(e): 98 | print('这个代理不能用,我删了你 {}'.format(proxy)) # 代理本身不可用则删除该代理 99 | delete_proxy(proxy) 100 | proxy = str(get_proxy(), encoding='utf-8') 101 | proxies = { 102 | 'http': 'http://{}'.format(proxy), 103 | 'https': 'http://{}'.format(proxy), 104 | } # 切换代理 105 | else: 106 | save_to_db(base_request.json().get('content', ''), 'data') # 存入数据库 107 | 108 | 109 | # 获取公司数据 110 | def get_company_by_crawl(): 111 | headers['Referer'] = 'https://www.lagou.com/gongsi/0-0-0-0' 112 | url = 'https://www.lagou.com/gongsi/0-0-0-0.json' 113 | proxy = str(get_proxy(), encoding='utf-8') 114 | proxies = { 115 | 'http': 'http://{}'.format(proxy), 116 | 'https': 'http://{}'.format(proxy), 117 | } # 获取并设置代理 118 | for i in range(1, 100): 119 | data = { 120 | 'first': False, 121 | 'pn': i, 122 | 'sortField': 1, 123 | 'havemark': 0 124 | } 125 | flag = False 126 | while not flag: # 若代理ip没走通则换一个 127 | try: 128 | r = requests.post(url, data=data, headers=headers, timeout=3, proxies=proxies) 129 | if not r.json().get('result', ''): 130 | if not r.json().get('totalCount'): 131 | raise Exception('这个ip不能用') 132 | else: 133 | return False 134 | save_to_db(r.json().get('result', ''), 'company') # 存入数据库 135 | flag = True # 成功获取数据跳出循环 136 | except Exception as e: 137 | if 'HTTPSConnectionPool' in str(e): 138 | print('这个代理不能用,我删了你 {}'.format(proxy)) # 代理本身不可用则删除该代理 139 | delete_proxy(proxy) 140 | proxy = str(get_proxy(), encoding='utf-8') 141 | proxies = { 142 | 'http': 'http://{}'.format(proxy), 143 | 'https': 'http://{}'.format(proxy), 144 | } # 切换代理 145 | 146 | 147 | # 存储数据 148 | def save_to_db(content, now_type): 149 | if now_type == 'company': 150 | data_list = content 151 | for item in data_list: 152 | print(item) 153 | find_data = company_data.find_one( 154 | {'companyId': item.get('companyId')}) 155 | if not find_data: # 查重后插入数据库 156 | company_data.insert(item) 157 | elif now_type == 'data': 158 | data_list = content.get('positionResult').get('result') 159 | for item in data_list: 160 | find_data = recruit_data.find_one( 161 | {'companyId': item.get('companyId'), 'createTime': item.get('createTime')}) 162 | if not find_data: # 查重后插入数据库 163 | print(item) 164 | recruit_data.insert(item) 165 | 166 | 167 | def format_img(): 168 | recruits = recruit_data.find(no_cursor_timeout=True) 169 | for recruit in recruits: 170 | try: 171 | company_logo = recruit['companyLogo'] 172 | if 'http://www.lgstatic.com' not in company_logo and 'https://static.lagou' not in company_logo: 173 | company_logo = 'http://www.lgstatic.com/' + company_logo 174 | recruit_data.update({'_id': recruit['_id']}, {'$set': {'companyLogo': company_logo}}) 175 | except Exception as e: 176 | print(e) 177 | continue 178 | recruits.close() 179 | companies = company_data.find(no_cursor_timeout=True) 180 | for company in companies: 181 | try: 182 | company_logo2 = company['companyLogo'] 183 | if 'http://www.lgstatic.com' not in 
company_logo2 and 'https://static.lagou' not in company_logo2: 184 | company_logo2 = 'http://www.lgstatic.com/' + company_logo2 185 | recruit_data.update({'_id': company['_id']}, {'$set': {'companyLogo': company_logo2}}) 186 | except Exception as e: 187 | print(e) 188 | continue 189 | companies.close() 190 | 191 | 192 | if __name__ == '__main__': 193 | get_company_by_crawl() 194 | for keyword in keywords: 195 | get_data_by_crawl('全国', keyword) 196 | format_img() 197 | -------------------------------------------------------------------------------- /crawl_topic&article.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pymongo 3 | import config 4 | 5 | # 基本信息 6 | client = pymongo.MongoClient(host=config.MONGO_HOST, port=config.MONGO_PORT) 7 | db = client[config.MONGO_DB] 8 | db.authenticate(config.MONGO_AUTH_NAME, config.MONGO_AUTH_PASSWORD) 9 | topic_data = db['topic'] 10 | article_data = db['article'] 11 | comment_data = db['comment'] 12 | comment_user = db['comment_user'] 13 | headers = { 14 | "Referer": "https://www.lagou.com/jobs/list_", 15 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " 16 | "(KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36" 17 | } 18 | 19 | 20 | # 获取话题数据 21 | def get_topic_by_crawl(): 22 | for i in range(1, 100): 23 | url = 'https://yanzhi.lagou.com/topic/getTopicList.json?categoryId=&pageNo={}&pageSize=20'.format(i) 24 | r = requests.get(url) 25 | try: 26 | save_to_db(r.json().get('content').get('data').get('topicPage'), 'topic', '') 27 | if not r.json().get('content').get('data').get('hasMoreTopic'): 28 | return False 29 | except Exception as e: 30 | print(e) 31 | 32 | 33 | # 获取文章数据 34 | def get_article_by_crawl(): 35 | topics = topic_data.find(no_cursor_timeout=True) 36 | for item in topics: 37 | topic_id = item.get('id') 38 | article_base_url = 'https://yanzhi.lagou.com/topic/moreTopicNewsList.json?topicId={}&pageNo={}&pageSize=20' 39 | for i in range(1, 100): 40 | url = article_base_url.format(topic_id, i) 41 | print(url) 42 | r = requests.get(url) 43 | try: 44 | save_to_db(r.json().get('content').get('data').get('topicNewsList'), 'article', topic_id) 45 | if not r.json().get('content').get('data').get('hasMore'): 46 | break 47 | except Exception as e: 48 | print(e) 49 | if r.json(): 50 | if r.json().get('content'): 51 | if r.json().get('content').get('data'): 52 | if not r.json().get('content').get('data').get('hasMore'): 53 | break 54 | else: 55 | break 56 | else: 57 | break 58 | else: 59 | break 60 | topics.close() 61 | 62 | 63 | # 存储数据 64 | def save_to_db(content, now_type, topic_id): 65 | if now_type == 'topic': 66 | data_list = content.get('result') 67 | for item in data_list: 68 | find_data = topic_data.find_one( 69 | {'id': item.get('id'), 'title': item.get('title')}) 70 | if not find_data: # 查重后插入数据库 71 | topic_data.insert(item) 72 | elif now_type == 'article': 73 | data_list = content 74 | for item in data_list: 75 | find_data = article_data.find_one( 76 | {'questionId': item.get('news').get('questionId'), 'time': item.get('news').get('time')}) 77 | if not find_data: # 查重后插入数据库 78 | item['topic_id'] = topic_id 79 | item['questionId'] = str(item.get('news').get('questionId')) 80 | item['time'] = item.get('news').get('time') 81 | item['news']['topic_id'] = topic_id 82 | article_data.insert(item) 83 | 84 | 85 | # 将评论单独划出 86 | def get_comment_from_article(): 87 | articles = article_data.find(no_cursor_timeout=True) 88 | for item in articles: 89 | comment_list = 
item.get('news').get('answerInfoList') 90 | if comment_list: 91 | for sub_item in comment_list: 92 | comment_item = comment_data.find_one({'answerId': sub_item.get('answerId')}) 93 | if not comment_item: 94 | sub_item['article_id'] = str(item['questionId']) 95 | comment_data.insert(sub_item) 96 | article_data.update({'_id': item['_id']}, {'$set': {'answerInfoList': []}}) 97 | articles.close() 98 | 99 | 100 | # 将评论用户从评论中单独划出 101 | def get_user_from_comment(): 102 | comments = comment_data.find(no_cursor_timeout=True) 103 | for item in comments: 104 | sub_item = item.get('answerUser') 105 | comment_user_item = comment_user.find_one({'id': sub_item.get('id')}) 106 | if not comment_user_item: 107 | sub_item['answerId'] = item['answerId'] 108 | comment_user.insert(sub_item) 109 | comment_data.update({'_id': item['_id']}, {'$set': {'answerUser': ''}}) 110 | comments.close() 111 | 112 | 113 | def format_img(): 114 | users = comment_user.find(no_cursor_timeout=True) 115 | for user in users: 116 | try: 117 | if user.get('portrait'): 118 | portrait = user['portrait'] 119 | if 'http://www.lgstatic.com' not in portrait and 'https://static.lagou' not in portrait: 120 | portrait = 'http://www.lgstatic.com/' + portrait 121 | comment_user.update({'_id': user['_id']}, {'$set': {'portrait': portrait}}) 122 | except Exception as e: 123 | print(e) 124 | users.close() 125 | topics = topic_data.find(no_cursor_timeout=True) 126 | for topic in topics: 127 | try: 128 | logo = topic['logo'] 129 | if 'http://www.lgstatic.com' not in logo and 'https://static.lagou' not in logo: 130 | logo = 'http://www.lgstatic.com/' + logo 131 | topic_data.update({'_id': topic['_id']}, {'$set': {'logo': logo}}) 132 | except Exception as e: 133 | print(e) 134 | topics.close() 135 | 136 | 137 | if __name__ == '__main__': 138 | get_topic_by_crawl() 139 | get_article_by_crawl() 140 | get_comment_from_article() 141 | get_user_from_comment() 142 | format_img() 143 | -------------------------------------------------------------------------------- /get_city_data.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | import pymongo 4 | import config 5 | 6 | # 7 | client = pymongo.MongoClient(host=config.MONGO_HOST, port=config.MONGO_PORT) 8 | db = client[config.MONGO_DB] 9 | # db.authenticate(config.MONGO_AUTH_NAME, config.MONGO_AUTH_PASSWORD) 10 | city_districts = db['city_districts'] 11 | district_areas = db['district_areas'] 12 | 13 | subways_lines = db['subways_lines'] 14 | line_stops = db['line_stops'] 15 | 16 | 17 | def get_subway_data(): 18 | city_with_subway_url = 'http://map.baidu.com/?qt=subwayscity' 19 | subway_detail_url = 'http://map.baidu.com/?qt=bsi&c={}' 20 | r = requests.get(city_with_subway_url) 21 | for item in r.json().get('subways_city').get('cities'): 22 | if item['code'] < 10000: 23 | url = subway_detail_url.format(item['code']) 24 | r = requests.get(url) 25 | subways_line = { 26 | 'cityName': item['cn_name'], 27 | 'subWayList': [] 28 | } 29 | for subway in r.json().get('content'): 30 | if subway['line_name'].split('(')[0] not in subways_line['subWayList']: 31 | subways_line['subWayList'].append(subway['line_name'].split('(')[0]) 32 | 33 | line = line_stops.find_one( 34 | { 35 | 'cityName': item['cn_name'], 36 | 'lineName': subway['line_name'].split('(')[0] 37 | }) 38 | if not line: 39 | line = { 40 | 'cityName': item['cn_name'], 41 | 'lineName': subway['line_name'].split('(')[0], 42 | 'stops': [] 43 | } 44 | for stop in subway['stops']: 45 | 
line['stops'].append(stop['name']) 46 | line_stops.insert(line) 47 | else: 48 | stops = line['stops'] 49 | for stop in subway['stops']: 50 | if stop['name'] not in stops: 51 | stops.append(stop['name']) 52 | line_stops.update({'_id': line['_id']}, {'$set': {'stops': stops}}) 53 | subway_line = subways_lines.find_one({ 54 | 'cityName': item['cn_name'], 55 | }) 56 | if not subway_line: 57 | subways_lines.insert(subways_line) 58 | 59 | 60 | def combine_data(): 61 | import json 62 | with open('city_data.json', 'r', encoding='utf-8') as f: 63 | data = json.load(f) 64 | cities = data.get('data') 65 | for city in cities: 66 | for area in city['cities']: 67 | city_district = { 68 | 'cityName': area['name'], 69 | 'districts': [] 70 | } 71 | for country in area.get('counties'): 72 | city_district['districts'].append(country['name']) 73 | district_area = { 74 | 'cityName': area['name'], 75 | 'districts': country['name'], 76 | 'areas': [], 77 | } 78 | for circle in country.get('circles'): 79 | district_area['areas'].append(circle['name']) 80 | area_collection = district_areas.find_one({ 81 | 'cityName': district_area['cityName'], 82 | 'districts': district_area['districts'], 83 | }) 84 | if not area_collection: 85 | district_areas.insert(district_area) 86 | city_collection = city_districts.find_one({ 87 | 'cityName': city_district['cityName'], 88 | }) 89 | if not city_collection: 90 | city_districts.insert(city_district) 91 | 92 | 93 | if __name__ == '__main__': 94 | combine_data() 95 | get_subway_data() 96 | 97 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bs4 2 | requests==2.18.4 3 | pymongo==3.7.1 4 | -------------------------------------------------------------------------------- /screenshots/choose_city.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qianbin01/lagou_spider/55c466331b84a0ea6f0fae6544cbd10e10e71947/screenshots/choose_city.gif -------------------------------------------------------------------------------- /screenshots/edit_resume.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qianbin01/lagou_spider/55c466331b84a0ea6f0fae6544cbd10e10e71947/screenshots/edit_resume.gif -------------------------------------------------------------------------------- /screenshots/home.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qianbin01/lagou_spider/55c466331b84a0ea6f0fae6544cbd10e10e71947/screenshots/home.gif -------------------------------------------------------------------------------- /screenshots/loading.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qianbin01/lagou_spider/55c466331b84a0ea6f0fae6544cbd10e10e71947/screenshots/loading.gif -------------------------------------------------------------------------------- /screenshots/login.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qianbin01/lagou_spider/55c466331b84a0ea6f0fae6544cbd10e10e71947/screenshots/login.gif -------------------------------------------------------------------------------- /screenshots/read.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/qianbin01/lagou_spider/55c466331b84a0ea6f0fae6544cbd10e10e71947/screenshots/read.gif -------------------------------------------------------------------------------- /screenshots/recruit.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qianbin01/lagou_spider/55c466331b84a0ea6f0fae6544cbd10e10e71947/screenshots/recruit.gif -------------------------------------------------------------------------------- /screenshots/recurit_want.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qianbin01/lagou_spider/55c466331b84a0ea6f0fae6544cbd10e10e71947/screenshots/recurit_want.gif -------------------------------------------------------------------------------- /screenshots/setting.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qianbin01/lagou_spider/55c466331b84a0ea6f0fae6544cbd10e10e71947/screenshots/setting.gif -------------------------------------------------------------------------------- /screenshots/sort.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qianbin01/lagou_spider/55c466331b84a0ea6f0fae6544cbd10e10e71947/screenshots/sort.gif -------------------------------------------------------------------------------- /screenshots/sort_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qianbin01/lagou_spider/55c466331b84a0ea6f0fae6544cbd10e10e71947/screenshots/sort_2.gif --------------------------------------------------------------------------------