├── README.md
├── __init__.py
├── doc
│   ├── demo1.jpg
│   └── demo2.jpg
├── lagou
│   ├── __init__.py
│   ├── config
│   │   └── myservice.conf
│   ├── db.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── runweb
│   │   ├── __init__.py
│   │   ├── runweb.py
│   │   ├── static
│   │   │   └── echarts.min.js
│   │   └── templates
│   │       ├── jobstocity.html
│   │       └── jobstomoney.html
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── lagouSpider.py
│   └── utils.py
├── requirements.txt
└── scrapy.cfg

/README.md:
--------------------------------------------------------------------------------
1 | Lagou job crawler and visualization
2 | ==========
3 | 
4 | [![Python 2.6](https://img.shields.io/badge/python-2.6-yellow.svg)](https://www.python.org/)
5 | [![](https://img.shields.io/badge/flask-0.10.1-green.svg)](http://docs.jinkan.org/docs/flask/)
6 | [![](https://img.shields.io/badge/Scrapy-1.4.0-green.svg)](http://scrapy-chs.readthedocs.io/zh_CN/0.24/intro/tutorial.html)
7 | [![](https://img.shields.io/badge/bs4-0.0.1-brightgreen.svg)](https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html)
8 | [![](https://img.shields.io/badge/requests-2.9.1-brightgreen.svg)](http://docs.python-requests.org/zh_CN/latest/user/quickstart.html)
9 | 
10 | 
11 | 
12 | ### 1. About findjob
13 | 
14 | - Crawls Lagou postings for DevOps and ops-development positions
15 | - Visualizes the crawled data
16 | - The goal is to make job hunting easier: scrape Lagou, then analyze the data visually to help you find a job you are happy with
17 | 
18 | 
19 | 
20 | ### 2. Dependencies
21 | 
22 | ```bash
23 | pip install -r requirements.txt
24 | ```
25 | 
26 | ### 3. Deployment steps
27 | 
28 | > 1. Go to the project's config directory
29 | 
30 | ```bash
31 | cd findjob/lagou/config/
32 | ```
33 | 
34 | > 2. Create the database `lagou` and the table `jobinfo`
35 | 
36 | ```sql
37 | CREATE DATABASE lagou;
38 | 
39 | CREATE TABLE `jobinfo` (
40 |   `positionId` varchar(255) DEFAULT NULL,
41 |   `positionName` varchar(255) DEFAULT NULL,
42 |   `city` varchar(255) DEFAULT NULL,
43 |   `createTime` varchar(255) DEFAULT NULL,
44 |   `salary` varchar(255) DEFAULT NULL,
45 |   `companyId` varchar(50) DEFAULT NULL,
46 |   `companyName` varchar(255) DEFAULT NULL,
47 |   `companyFullName` varchar(255) DEFAULT NULL,
48 |   `minsalary` double DEFAULT NULL,
49 |   `munsalary` double DEFAULT NULL,
50 |   `maxsalary` double DEFAULT NULL
51 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
52 | ```
53 | 
54 | > 3. Edit myservice.conf: set the database host and credentials, plus the address and port Flask binds to
55 | 
56 | ```ini
57 | [common]
58 | mysql_host = 127.0.0.1
59 | mysql_port = 3306
60 | mysql_user = root
61 | mysql_passwd = 123456
62 | mysql_db = lagou
63 | mysql_charset = utf8
64 | passport_key = 123456
65 | [web]
66 | bind = 0.0.0.0
67 | port = 8000
68 | ```
69 | 
70 | 
71 | ### 4. Running and viewing the charts
72 | 
73 | ```bash
74 | 1. cd lagou
75 | 2. scrapy crawl lagou
76 | 3. cd lagou/runweb
77 | 4. python runweb.py
78 | 5. open http://IP:8000/jobstomoney and http://IP:8000/jobstocity
79 | ```
80 | 
81 | ### 5. License
82 | 
83 | The code is released under the [new BSD license](LICENSE).
84 | The documentation is released under [CC BY-ND 4.0 International](https://creativecommons.org/licenses/by-nd/4.0/deed.zh).
85 | 
86 | ### 6. TODO
87 | 
88 | - [ ] Integrate data from other platforms such as BOSS直聘 for a combined comparison
89 | - [ ] Allow filtering by position
90 | 
91 | ### 7. Acknowledgements
92 | 
93 | - Core code contributor: 刘老师, chief "server-carrying" engineer at a major Chinese internet company
94 | - This project was born out of a real-world need
95 | 
96 | 
97 | ### 8. Demo
98 | 
99 | ![](./doc/demo1.jpg "demo 1")
100 | ![](./doc/demo2.jpg "demo 2")
101 | 
102 | ### 9. Notes
103 | 
104 | - This project is for learning and exchange only; please do not use it for illegal purposes
105 | - Pull requests are welcome
106 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/__init__.py
--------------------------------------------------------------------------------
/doc/demo1.jpg:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/doc/demo1.jpg -------------------------------------------------------------------------------- /doc/demo2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/doc/demo2.jpg -------------------------------------------------------------------------------- /lagou/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/lagou/__init__.py -------------------------------------------------------------------------------- /lagou/config/myservice.conf: -------------------------------------------------------------------------------- 1 | [common] 2 | mysql_host = 127.0.0.1 3 | mysql_port = 3306 4 | mysql_user = root 5 | mysql_passwd = 123456 6 | mysql_db = lagou 7 | mysql_charset = utf8 8 | passport_key = 123456 9 | 10 | [web] 11 | bind = 0.0.0.0 12 | port = 8000 13 | -------------------------------------------------------------------------------- /lagou/db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding:utf-8 3 | 4 | import MySQLdb as mysql 5 | import sys 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | 9 | 10 | class Cursor(): 11 | def __init__(self, config): 12 | self.config = dict([(k[6:], config[k]) for k in config if k.startswith('mysql_')]) 13 | if 'port' in self.config: 14 | self.config['port'] = int(self.config['port']) 15 | if self.config: 16 | self._connect_db() 17 | 18 | def _connect_db(self): 19 | self.db = mysql.connect(**self.config) 20 | self.db.autocommit(True) 21 | self.cur = self.db.cursor() 22 | 23 | def _close_db(self): 24 | self.cur.close() 25 | self.db.close() 26 | 27 | def _execute(self, sql): 28 | try: 29 | return self.cur.execute(sql) 30 | except: 31 | self._close_db() 32 | self._connect_db() 33 | return self.cur.execute(sql) 34 | 35 | def _fetchone(self): 36 | return self.cur.fetchone() 37 | 38 | def _fetchall(self): 39 | return self.cur.fetchall() 40 | 41 | def _insert_sql(self, table_name, data): 42 | fields, values = [], [] 43 | for k, v in data.items(): 44 | fields.append(k) 45 | values.append("'%s'" % v) 46 | sql = "INSERT INTO %s (%s) VALUES (%s)" % (table_name, ','.join(fields), ','.join(values)) 47 | return sql 48 | 49 | def execute_insert_sql(self, table_name, data): 50 | sql = self._insert_sql(table_name, data) 51 | return self._execute(sql) 52 | 53 | def _select_sql(self, table_name, fields, where=None, order=None, asc_order=True, limit=None): 54 | if isinstance(where, dict) and where: 55 | conditions = [] 56 | for k, v in where.items(): 57 | if isinstance(v, list): 58 | conditions.append("%s IN (%s)" % (k, ','.join(v))) 59 | elif isinstance(v, str) or isinstance(v, unicode): 60 | conditions.append("%s='%s'" % (k, v)) 61 | elif isinstance(v, int): 62 | conditions.append("%s=%s" % (k, v)) 63 | 64 | sql = "SELECT %s FROM %s WHERE %s" % (','.join(fields), table_name, ' AND '.join(conditions)) 65 | 66 | elif not where: 67 | sql = "SELECT %s FROM %s" % (','.join(fields), table_name) 68 | else: 69 | sql = "" 70 | if order and (isinstance(order, str) or isinstance(order, unicode)): 71 | sql = "%s ORDER BY %s %s" % (sql, order, 'ASC' if asc_order else 'DESC') 72 | if limit and 
isinstance(limit, tuple) and len(limit) == 2: 73 | sql = "%s LIMIT %s,%s" % (sql, limit[0], limit[1]) 74 | return sql 75 | 76 | def get_one_result(self, table_name, fields, where=None, order=None, asc_order=True, limit=None): 77 | sql = self._select_sql(table_name, fields, where, order, asc_order, limit) 78 | if not sql: 79 | return None 80 | self._execute(sql) 81 | result_set = self._fetchone() 82 | if result_set: 83 | return dict([(k, '' if result_set[i] is None else result_set[i]) for i, k in enumerate(fields)]) 84 | else: 85 | return {} 86 | 87 | def get_results(self, table_name, fields, where=None, order=None, asc_order=True, limit=None): 88 | sql = self._select_sql(table_name, fields, where, order, asc_order, limit) 89 | self._execute(sql) 90 | result_sets = self._fetchall() 91 | return [dict([(k, '' if row[i] is None else row[i]) for i, k in enumerate(fields)]) for row in result_sets] 92 | 93 | def _update_sql(self, table_name, data, where, fields=None): 94 | if not (where and isinstance(where, dict)): 95 | return "" 96 | where_cond = ["%s='%s'" % (k, v) for k, v in where.items()] 97 | if fields: 98 | conditions = ["%s='%s'" % (k, data[k]) for k in fields] 99 | else: 100 | conditions = ["%s='%s'" % (k, data[k]) for k in data] 101 | sql = "UPDATE %s SET %s WHERE %s" % (table_name, ','.join(conditions), ' AND '.join(where_cond)) 102 | return sql 103 | 104 | def execute_update_sql(self, table_name, data, where, fields=None): 105 | sql = self._update_sql(table_name, data, where, fields) 106 | if sql: 107 | return self._execute(sql) 108 | else: 109 | return "" 110 | 111 | def _delete_sql(self, table_name, where): 112 | if not (where and isinstance(where, dict)): 113 | return "" 114 | where_cond = ["%s='%s'" % (k, v) for k, v in where.items()] 115 | sql = "DELETE FROM %s WHERE %s" % (table_name, ' AND '.join(where_cond)) 116 | return sql 117 | 118 | def execute_delete_sql(self, table_name, where): 119 | sql = self._delete_sql(table_name, where) 120 | if sql: 121 | return self._execute(sql) 122 | else: 123 | return "" 124 | 125 | def if_id_exist(self, table_name, field_id): 126 | if isinstance(field_id, list): 127 | id_num=len(field_id) 128 | result = self.get_results(table_name, ['id'], {'id': field_id}) 129 | if id_num !=len(result): 130 | result=False 131 | else: 132 | result = self.get_one_result(table_name, ['id'], {'id': field_id}) 133 | if result: 134 | return True 135 | else: 136 | return False 137 | -------------------------------------------------------------------------------- /lagou/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class LagouzpItem(scrapy.Item): 12 | positionId = scrapy.Field() 13 | positionName= scrapy.Field() 14 | city= scrapy.Field() 15 | createTime= scrapy.Field() 16 | companyId= scrapy.Field() 17 | companyName= scrapy.Field() 18 | companyFullName= scrapy.Field() 19 | salary= scrapy.Field() 20 | minsalary = scrapy.Field() 21 | munsalary = scrapy.Field() 22 | maxsalary = scrapy.Field() 23 | -------------------------------------------------------------------------------- /lagou/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # 
http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class LagouSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /lagou/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | # class LagouPipeline(object): 10 | # def process_item(self, item, spider): 11 | # return item 12 | 13 | #import MySQLdb 14 | from db import Cursor 15 | from utils import get_config 16 | 17 | 18 | class LagouzpPipeline(object): 19 | def __init__(self): 20 | self.config = get_config("common") 21 | self.cursor = Cursor(self.config) 22 | 23 | def process_item(self, item, spider): 24 | data = { 25 | "positionId": item['positionId'], 26 | "positionName": item['positionName'], 27 | "city": item['city'], 28 | "createTime": item['createTime'], 29 | "companyId": item['companyId'], 30 | "companyName": item['companyName'], 31 | "companyFullName": item['companyFullName'], 32 | "salary": item['salary'], 33 | "minsalary": item['minsalary'], 34 | "munsalary": item['munsalary'], 35 | "maxsalary": item['maxsalary'] 36 | } 37 | self.cursor.execute_insert_sql('jobinfo', data) 38 | return item 39 | -------------------------------------------------------------------------------- /lagou/runweb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/lagou/runweb/__init__.py -------------------------------------------------------------------------------- /lagou/runweb/runweb.py: 
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from flask import Flask, render_template
5 | from collections import Counter
6 | import sys
7 | from os import path
8 | # Make the parent package directory (lagou/) importable when running this file directly.
9 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
10 | import db
11 | import utils
12 | 
13 | 
14 | app = Flask(__name__)
15 | configweb = utils.get_config('web')
16 | configdb = utils.get_config('common')
17 | cursor = db.Cursor(configdb)
18 | 
19 | 
20 | @app.route('/jobstocity')
21 | def jobstocity():
22 |     output = [
23 |         'positionId', 'positionName', 'city', 'createTime', 'salary', 'companyId', 'companyName', 'companyFullName', 'minsalary', 'munsalary', 'maxsalary'
24 |     ]
25 |     result = cursor.get_results('jobinfo', output)
26 |     # Look the field up by key; dict.values() has no guaranteed order.
27 |     city_list = [item['city'] for item in result]
28 |     city_dict = Counter(city_list)
29 | 
30 |     return render_template('jobstocity.html', jobstocity=city_dict)
31 | 
32 | 
33 | @app.route('/jobstomoney')
34 | def jobstomoney():
35 |     output = [
36 |         'positionId', 'positionName', 'city', 'createTime', 'salary', 'companyId', 'companyName', 'companyFullName', 'minsalary', 'munsalary', 'maxsalary'
37 |     ]
38 |     result = cursor.get_results('jobinfo', output)
39 |     # Look the field up by key; dict.values() has no guaranteed order.
40 |     money_list = [item['salary'] for item in result]
41 |     money_dict = Counter(money_list)
42 |     _money_dict = [{'value': v, 'name': k.encode("utf-8")} for k, v in money_dict.items()]
43 | 
44 |     return render_template('jobstomoney.html', jobstomoney_dict=_money_dict, jobstomoney_list=money_list)
45 | 
46 | 
47 | if __name__ == '__main__':
48 |     app.run(host=configweb.get('bind', '0.0.0.0'), port=int(configweb.get('port', 8000)), debug=True)
49 | 
--------------------------------------------------------------------------------
/lagou/runweb/templates/jobstocity.html:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | ECharts 6 | 7 | 8 | 9 | 10 | 11 |
12 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /lagou/runweb/templates/jobstomoney.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ECharts 6 | 7 | 8 | 9 | 10 | 11 |
12 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /lagou/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for lagou project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'lagou' 13 | 14 | SPIDER_MODULES = ['lagou.spiders'] 15 | NEWSPIDER_MODULE = 'lagou.spiders' 16 | ITEM_PIPELINES = { 17 | 'lagou.pipelines.LagouzpPipeline': 200, # 200是为了设置工序顺序 18 | } 19 | 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'lagou (+http://www.yourdomain.com)' 23 | 24 | # Obey robots.txt rules 25 | ROBOTSTXT_OBEY = True 26 | 27 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 28 | #CONCURRENT_REQUESTS = 32 29 | 30 | # Configure a delay for requests for the same website (default: 0) 31 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 32 | # See also autothrottle settings and docs 33 | #DOWNLOAD_DELAY = 3 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | #CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | #COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | #TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | #DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | #} 49 | 50 | # Enable or disable spider middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 52 | #SPIDER_MIDDLEWARES = { 53 | # 'lagou.middlewares.LagouSpiderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable downloader middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | #DOWNLOADER_MIDDLEWARES = { 59 | # 'lagou.middlewares.MyCustomDownloaderMiddleware': 543, 60 | #} 61 | 62 | # Enable or disable extensions 63 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 64 | #EXTENSIONS = { 65 | # 'scrapy.extensions.telnet.TelnetConsole': None, 66 | #} 67 | 68 | # Configure item pipelines 69 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 70 | #ITEM_PIPELINES = { 71 | # 'lagou.pipelines.LagouPipeline': 300, 72 | #} 73 | 74 | # Enable and configure the AutoThrottle extension (disabled by default) 75 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 76 | #AUTOTHROTTLE_ENABLED = True 77 | # The initial download delay 78 | #AUTOTHROTTLE_START_DELAY = 5 79 | # The maximum download delay to be set in case of high latencies 80 | #AUTOTHROTTLE_MAX_DELAY = 60 81 | # The average number of requests Scrapy should be sending in parallel to 82 | # each remote server 83 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 84 | # Enable showing throttling stats for every response received: 85 | #AUTOTHROTTLE_DEBUG = False 86 | 87 | # Enable and configure HTTP caching (disabled by default) 88 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
89 | #HTTPCACHE_ENABLED = True
90 | #HTTPCACHE_EXPIRATION_SECS = 0
91 | #HTTPCACHE_DIR = 'httpcache'
92 | #HTTPCACHE_IGNORE_HTTP_CODES = []
93 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
94 | 
--------------------------------------------------------------------------------
/lagou/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/lagou/spiders/lagouSpider.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | # -*- coding:utf-8 -*-
5 | import scrapy
6 | from ..items import LagouzpItem
7 | import requests
8 | from bs4 import BeautifulSoup
9 | import json
10 | import re
11 | 
12 | 
13 | class Spider(scrapy.Spider):
14 |     name = 'lagou'
15 |     cookies = {
16 |         'user_trace_token': '20170329220541-535dcc08aa394057884d3de6c06da2aa',
17 |         'JSESSIONID': 'ABAAABAAAFCAAEG36FCE164D221C1CEB89E796234273C56',
18 |         'PRE_UTM': '',
19 |         'BAIDUID': '45A2EA96C0D623AADAE825E1A3DE41F8:FG=1',
20 |         'PRE_SITE': '',
21 |         'PRE_HOST': '',
22 |         'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F',
23 |         'index_location_city': '%E5%8C%97%E4%BA%AC',
24 |         'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1491116405,1491116452,1493122880,1493122898',
25 |         'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1493123186',
26 |         '_ga': 'GA1.2.1412866745.1489497427',
27 |         'LGUID': '20170819003739-8cd023cd-8433-11e7-9d86-525400f775ce',
28 |         'LGSID': '20170819002347-9caa6942-8431-11e7-8a38-5254005c3644',
29 |         'TG-TRACK-CODE': 'index_search',
30 |         'SEARCH_ID': 'b3638e10d2a5464598572ce7dfb66e1b'
31 |     }
32 |     headers = {
33 |         "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
34 |     }
35 | 
36 |     def start_requests(self):
37 |         kd = ['运维开发']
38 |         city = ['北京', '上海', '广州', '深圳', '杭州']
39 |         urls_kd = ['https://www.lagou.com/jobs/list_{}?px=default&city='.format(one) for one in kd]
40 |         for urls in urls_kd:
41 |             urls_city = [urls + one for one in city]
42 |             for url in urls_city:
43 |                 # Fetch the HTML listing page synchronously, only to read the total page count.
44 |                 response = requests.get(url, headers=self.headers, cookies=self.cookies)
45 |                 location = url.split('&')[-1].split('=')[1]
46 |                 key = url.split('/')[-1].split('?')[0].split('_')[1]
47 |                 soup = BeautifulSoup(response.text, 'lxml')
48 |                 pages = soup.find('span', {'class': 'span totalNum'}).get_text()
49 |                 # Then crawl the mobile JSON API page by page through Scrapy.
50 |                 for i in range(1, int(pages) + 1):
51 |                     url = "https://m.lagou.com/search.json?city={}&positionName={}&pageNo={}".format(location, key, i)
52 |                     yield scrapy.Request(url, callback=self.parse)
53 | 
54 |     def parse(self, response):
55 |         data = json.loads(response.text)
56 |         content = data['content']
57 |         positionResult = content['data']['page']['result']
58 | 
59 |         for one in positionResult:
60 |             # Create a fresh item per posting rather than reusing a single instance.
61 |             item = LagouzpItem()
62 |             item['positionId'] = one['positionId']
63 |             item['positionName'] = one['positionName']
64 |             item['city'] = one['city']
65 |             item['createTime'] = one['createTime'].split(' ')[0]
66 |             item['companyId'] = one['companyId']
67 |             item['companyName'] = one['companyName']
68 |             item['companyFullName'] = one['companyFullName']
69 |             item['salary'] = one['salary']
70 |             # Salary strings look like '10k-20k'; with the capturing group, re.split
71 |             # returns ['10', 'k', '', '-', '20', 'k', ''], so index 0 is the lower
72 |             # bound, index 4 the upper, and 'munsalary' holds the midpoint.
73 |             item['minsalary'] = re.split(r'(k|-|K)', one['salary'])[0]
74 |             item['munsalary'] = (int(re.split(r'(k|-|K)', one['salary'])[0]) + int(re.split(r'(k|-|K)', one['salary'])[4])) / 2
75 |             item['maxsalary'] = re.split(r'(k|-|K)', one['salary'])[4]
76 |             yield item
77 | 
--------------------------------------------------------------------------------
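The `minsalary` / `munsalary` / `maxsalary` fields in the spider above are derived by splitting Lagou's salary strings with a capturing regex. As a standalone illustration of that parsing, here is a minimal sketch (a hypothetical `parse_salary` helper, not part of the project, assuming the salary field always follows the `10k-20k` pattern):

```python
# -*- coding: utf-8 -*-
import re


def parse_salary(salary):
    # The capturing group makes re.split keep the delimiters:
    # '10k-20k' -> ['10', 'k', '', '-', '20', 'k', '']
    # so index 0 is the lower bound and index 4 the upper bound.
    parts = re.split(r'(k|-|K)', salary)
    low, high = int(parts[0]), int(parts[4])
    return low, (low + high) / 2.0, high


if __name__ == '__main__':
    print(parse_salary('10k-20k'))  # -> (10, 15.0, 20)
```

Open-ended variants of the salary field would need extra handling before the values are written into the `double` columns of `jobinfo`.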
/lagou/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import os
5 | import ConfigParser
6 | 
7 | work_dir = os.path.dirname(os.path.realpath(__file__))
8 | 
9 | 
10 | def get_config(section=''):
11 |     config = ConfigParser.ConfigParser()
12 |     service_conf = os.path.join(work_dir, 'config/myservice.conf')
13 |     config.read(service_conf)
14 | 
15 |     conf_items = dict(config.items('common')) if config.has_section('common') else {}
16 |     if section and config.has_section(section):
17 |         conf_items.update(config.items(section))
18 |     return conf_items
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     print get_config('common')
23 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Requirements automatically generated by pigar.
2 | # https://github.com/Damnever/pigar
3 | 
4 | # runweb/runweb.py: 4
5 | Flask == 0.10.1
6 | 
7 | # db.py: 4
8 | MySQL_python == 1.2.5
9 | 
10 | # items.py: 8
11 | # middlewares.py: 8
12 | # spiders/lagouSpider.py: 5
13 | Scrapy == 1.4.0
14 | 
15 | # spiders/lagouSpider.py: 8
16 | bs4 == 0.0.1
17 | 
18 | # spiders/lagouSpider.py: 7
19 | requests == 2.9.1
20 | 
21 | # Note: 'db' and 'utils' are local modules of this project, not PyPI packages;
22 | # pigar picked them up by mistake, so they are intentionally not listed here.
23 | 
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = lagou.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = lagou
12 | 
--------------------------------------------------------------------------------
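Beyond the two Flask views described in the README, the `Cursor` helper in `lagou/db.py` can also be used directly for a quick sanity check after a crawl. A minimal sketch (a hypothetical script, not part of the repository; it assumes the spider has already populated `jobinfo`, that `lagou/config/myservice.conf` holds valid credentials, and that it is run from the project root under the same Python 2 environment):

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Quick post-crawl check: count postings per city straight from MySQL,
# bypassing the Flask views. Hypothetical helper, not part of the repo.
from collections import Counter

from lagou.db import Cursor
from lagou.utils import get_config


def top_cities(limit=10):
    # Reuse the project's own config loader and DB wrapper.
    cursor = Cursor(get_config('common'))
    rows = cursor.get_results('jobinfo', ['city'])
    return Counter(row['city'] for row in rows).most_common(limit)


if __name__ == '__main__':
    for city, count in top_cities():
        print '%s\t%s' % (city, count)
```

If the counts look reasonable, the `/jobstocity` and `/jobstomoney` pages should render the same data.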