├── README.md
├── __init__.py
├── doc
│   ├── demo1.jpg
│   └── demo2.jpg
├── lagou
│   ├── __init__.py
│   ├── config
│   │   └── myservice.conf
│   ├── db.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── runweb
│   │   ├── __init__.py
│   │   ├── runweb.py
│   │   ├── static
│   │   │   └── echarts.min.js
│   │   └── templates
│   │       ├── jobstocity.html
│   │       └── jobstomoney.html
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── lagouSpider.py
│   └── utils.py
├── requirements.txt
└── scrapy.cfg

/README.md:
--------------------------------------------------------------------------------
1 | Lagou job crawler and visualization
2 | ==========
3 | 
4 | [![Python 2.6](https://img.shields.io/badge/python-2.6-yellow.svg)](https://www.python.org/)
5 | [![](https://img.shields.io/badge/flask-0.10.1-green.svg)](http://docs.jinkan.org/docs/flask/)
6 | [![](https://img.shields.io/badge/Scrapy-1.4.0-green.svg)](http://scrapy-chs.readthedocs.io/zh_CN/0.24/intro/tutorial.html)
7 | [![](https://img.shields.io/badge/bs4-0.0.1-brightgreen.svg)](https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html)
8 | [![](https://img.shields.io/badge/requests-2.9.1-brightgreen.svg)](http://docs.python-requests.org/zh_CN/latest/user/quickstart.html)
9 | 
10 | 
11 | 
12 | ### 1. About findjob
13 | 
14 | - Crawls Lagou postings for DevOps and ops-development positions
15 | - Visualizes the crawled data
16 | - The goal is to make job hunting easier: scrape Lagou, then analyze the data visually to help you find a job you are happy with
17 | 
18 | 
19 | 
20 | ### 2. Dependencies
21 | 
22 | ```bash
23 | pip install -r requirements.txt
24 | ```
25 | 
26 | ### 3. Deployment steps
27 | 
28 | > 1. Go to the project's config directory
29 | 
30 | ```bash
31 | cd findjob/lagou/config/
32 | ```
33 | 
34 | > 2. Create the database `lagou` and the table `jobinfo`
35 | 
36 | ```sql
37 | CREATE DATABASE lagou;
38 | 
39 | CREATE TABLE `jobinfo` (
40 |   `positionId` varchar(255) DEFAULT NULL,
41 |   `positionName` varchar(255) DEFAULT NULL,
42 |   `city` varchar(255) DEFAULT NULL,
43 |   `createTime` varchar(255) DEFAULT NULL,
44 |   `salary` varchar(255) DEFAULT NULL,
45 |   `companyId` varchar(50) DEFAULT NULL,
46 |   `companyName` varchar(255) DEFAULT NULL,
47 |   `companyFullName` varchar(255) DEFAULT NULL,
48 |   `minsalary` double DEFAULT NULL,
49 |   `munsalary` double DEFAULT NULL,
50 |   `maxsalary` double DEFAULT NULL
51 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
52 | ```
53 | 
54 | > 3. Edit myservice.conf: set the database host and credentials, plus the address and port Flask binds to
55 | 
56 | ```ini
57 | [common]
58 | mysql_host = 127.0.0.1
59 | mysql_port = 3306
60 | mysql_user = root
61 | mysql_passwd = 123456
62 | mysql_db = lagou
63 | mysql_charset = utf8
64 | passport_key = 123456
65 | [web]
66 | bind = 0.0.0.0
67 | port = 8000
68 | ```
69 | 
70 | 
71 | ### 4. Running and viewing the charts
72 | 
73 | ```bash
74 | 1. cd lagou
75 | 2. scrapy crawl lagou
76 | 3. cd lagou/runweb
77 | 4. python runweb.py
78 | 5. open http://IP:8000/jobstomoney and http://IP:8000/jobstocity
79 | ```
80 | 
81 | ### 5. License
82 | 
83 | The code is released under the [new BSD license](LICENSE).
84 | The documentation is released under [CC BY-ND 4.0 International](https://creativecommons.org/licenses/by-nd/4.0/deed.zh).
85 | 
86 | ### 6. TODO
87 | 
88 | - [ ] Integrate data from other platforms such as BOSS直聘 for a combined comparison
89 | - [ ] Allow filtering by position
90 | 
91 | ### 7. Acknowledgements
92 | 
93 | - Core code contributor: 刘老师, chief "server-carrying" engineer at a major Chinese internet company
94 | - This project was born out of a real-world need
95 | 
96 | 
97 | ### 8. Demo
98 | 
99 | ![](./doc/demo1.jpg "demo 1")
100 | ![](./doc/demo2.jpg "demo 2")
101 | 
102 | ### 9. Notes
103 | 
104 | - This project is for learning and exchange only; please do not use it for illegal purposes
105 | - Pull requests are welcome
106 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/__init__.py
--------------------------------------------------------------------------------
/doc/demo1.jpg:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/doc/demo1.jpg -------------------------------------------------------------------------------- /doc/demo2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/doc/demo2.jpg -------------------------------------------------------------------------------- /lagou/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/lagou/__init__.py -------------------------------------------------------------------------------- /lagou/config/myservice.conf: -------------------------------------------------------------------------------- 1 | [common] 2 | mysql_host = 127.0.0.1 3 | mysql_port = 3306 4 | mysql_user = root 5 | mysql_passwd = 123456 6 | mysql_db = lagou 7 | mysql_charset = utf8 8 | passport_key = 123456 9 | 10 | [web] 11 | bind = 0.0.0.0 12 | port = 8000 13 | -------------------------------------------------------------------------------- /lagou/db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding:utf-8 3 | 4 | import MySQLdb as mysql 5 | import sys 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | 9 | 10 | class Cursor(): 11 | def __init__(self, config): 12 | self.config = dict([(k[6:], config[k]) for k in config if k.startswith('mysql_')]) 13 | if 'port' in self.config: 14 | self.config['port'] = int(self.config['port']) 15 | if self.config: 16 | self._connect_db() 17 | 18 | def _connect_db(self): 19 | self.db = mysql.connect(**self.config) 20 | self.db.autocommit(True) 21 | self.cur = self.db.cursor() 22 | 23 | def _close_db(self): 24 | self.cur.close() 25 | self.db.close() 26 | 27 | def _execute(self, sql): 28 | try: 29 | return self.cur.execute(sql) 30 | except: 31 | self._close_db() 32 | self._connect_db() 33 | return self.cur.execute(sql) 34 | 35 | def _fetchone(self): 36 | return self.cur.fetchone() 37 | 38 | def _fetchall(self): 39 | return self.cur.fetchall() 40 | 41 | def _insert_sql(self, table_name, data): 42 | fields, values = [], [] 43 | for k, v in data.items(): 44 | fields.append(k) 45 | values.append("'%s'" % v) 46 | sql = "INSERT INTO %s (%s) VALUES (%s)" % (table_name, ','.join(fields), ','.join(values)) 47 | return sql 48 | 49 | def execute_insert_sql(self, table_name, data): 50 | sql = self._insert_sql(table_name, data) 51 | return self._execute(sql) 52 | 53 | def _select_sql(self, table_name, fields, where=None, order=None, asc_order=True, limit=None): 54 | if isinstance(where, dict) and where: 55 | conditions = [] 56 | for k, v in where.items(): 57 | if isinstance(v, list): 58 | conditions.append("%s IN (%s)" % (k, ','.join(v))) 59 | elif isinstance(v, str) or isinstance(v, unicode): 60 | conditions.append("%s='%s'" % (k, v)) 61 | elif isinstance(v, int): 62 | conditions.append("%s=%s" % (k, v)) 63 | 64 | sql = "SELECT %s FROM %s WHERE %s" % (','.join(fields), table_name, ' AND '.join(conditions)) 65 | 66 | elif not where: 67 | sql = "SELECT %s FROM %s" % (','.join(fields), table_name) 68 | else: 69 | sql = "" 70 | if order and (isinstance(order, str) or isinstance(order, unicode)): 71 | sql = "%s ORDER BY %s %s" % (sql, order, 'ASC' if asc_order else 'DESC') 72 | if limit and 
isinstance(limit, tuple) and len(limit) == 2: 73 | sql = "%s LIMIT %s,%s" % (sql, limit[0], limit[1]) 74 | return sql 75 | 76 | def get_one_result(self, table_name, fields, where=None, order=None, asc_order=True, limit=None): 77 | sql = self._select_sql(table_name, fields, where, order, asc_order, limit) 78 | if not sql: 79 | return None 80 | self._execute(sql) 81 | result_set = self._fetchone() 82 | if result_set: 83 | return dict([(k, '' if result_set[i] is None else result_set[i]) for i, k in enumerate(fields)]) 84 | else: 85 | return {} 86 | 87 | def get_results(self, table_name, fields, where=None, order=None, asc_order=True, limit=None): 88 | sql = self._select_sql(table_name, fields, where, order, asc_order, limit) 89 | self._execute(sql) 90 | result_sets = self._fetchall() 91 | return [dict([(k, '' if row[i] is None else row[i]) for i, k in enumerate(fields)]) for row in result_sets] 92 | 93 | def _update_sql(self, table_name, data, where, fields=None): 94 | if not (where and isinstance(where, dict)): 95 | return "" 96 | where_cond = ["%s='%s'" % (k, v) for k, v in where.items()] 97 | if fields: 98 | conditions = ["%s='%s'" % (k, data[k]) for k in fields] 99 | else: 100 | conditions = ["%s='%s'" % (k, data[k]) for k in data] 101 | sql = "UPDATE %s SET %s WHERE %s" % (table_name, ','.join(conditions), ' AND '.join(where_cond)) 102 | return sql 103 | 104 | def execute_update_sql(self, table_name, data, where, fields=None): 105 | sql = self._update_sql(table_name, data, where, fields) 106 | if sql: 107 | return self._execute(sql) 108 | else: 109 | return "" 110 | 111 | def _delete_sql(self, table_name, where): 112 | if not (where and isinstance(where, dict)): 113 | return "" 114 | where_cond = ["%s='%s'" % (k, v) for k, v in where.items()] 115 | sql = "DELETE FROM %s WHERE %s" % (table_name, ' AND '.join(where_cond)) 116 | return sql 117 | 118 | def execute_delete_sql(self, table_name, where): 119 | sql = self._delete_sql(table_name, where) 120 | if sql: 121 | return self._execute(sql) 122 | else: 123 | return "" 124 | 125 | def if_id_exist(self, table_name, field_id): 126 | if isinstance(field_id, list): 127 | id_num=len(field_id) 128 | result = self.get_results(table_name, ['id'], {'id': field_id}) 129 | if id_num !=len(result): 130 | result=False 131 | else: 132 | result = self.get_one_result(table_name, ['id'], {'id': field_id}) 133 | if result: 134 | return True 135 | else: 136 | return False 137 | -------------------------------------------------------------------------------- /lagou/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class LagouzpItem(scrapy.Item): 12 | positionId = scrapy.Field() 13 | positionName= scrapy.Field() 14 | city= scrapy.Field() 15 | createTime= scrapy.Field() 16 | companyId= scrapy.Field() 17 | companyName= scrapy.Field() 18 | companyFullName= scrapy.Field() 19 | salary= scrapy.Field() 20 | minsalary = scrapy.Field() 21 | munsalary = scrapy.Field() 22 | maxsalary = scrapy.Field() 23 | -------------------------------------------------------------------------------- /lagou/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # 
http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class LagouSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /lagou/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | # class LagouPipeline(object): 10 | # def process_item(self, item, spider): 11 | # return item 12 | 13 | #import MySQLdb 14 | from db import Cursor 15 | from utils import get_config 16 | 17 | 18 | class LagouzpPipeline(object): 19 | def __init__(self): 20 | self.config = get_config("common") 21 | self.cursor = Cursor(self.config) 22 | 23 | def process_item(self, item, spider): 24 | data = { 25 | "positionId": item['positionId'], 26 | "positionName": item['positionName'], 27 | "city": item['city'], 28 | "createTime": item['createTime'], 29 | "companyId": item['companyId'], 30 | "companyName": item['companyName'], 31 | "companyFullName": item['companyFullName'], 32 | "salary": item['salary'], 33 | "minsalary": item['minsalary'], 34 | "munsalary": item['munsalary'], 35 | "maxsalary": item['maxsalary'] 36 | } 37 | self.cursor.execute_insert_sql('jobinfo', data) 38 | return item 39 | -------------------------------------------------------------------------------- /lagou/runweb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/lagou/runweb/__init__.py -------------------------------------------------------------------------------- /lagou/runweb/runweb.py: 
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from flask import Flask, render_template
5 | from collections import Counter
6 | import sys
7 | from os import path
8 | # Make the parent package directory (lagou/) importable when running this file directly.
9 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
10 | import db
11 | import utils
12 | 
13 | 
14 | app = Flask(__name__)
15 | configweb = utils.get_config('web')
16 | configdb = utils.get_config('common')
17 | cursor = db.Cursor(configdb)
18 | 
19 | 
20 | @app.route('/jobstocity')
21 | def jobstocity():
22 |     output = [
23 |         'positionId', 'positionName', 'city', 'createTime', 'salary', 'companyId', 'companyName', 'companyFullName', 'minsalary', 'munsalary', 'maxsalary'
24 |     ]
25 |     result = cursor.get_results('jobinfo', output)
26 |     # Look the field up by key; dict.values() has no guaranteed order.
27 |     city_list = [item['city'] for item in result]
28 |     city_dict = Counter(city_list)
29 | 
30 |     return render_template('jobstocity.html', jobstocity=city_dict)
31 | 
32 | 
33 | @app.route('/jobstomoney')
34 | def jobstomoney():
35 |     output = [
36 |         'positionId', 'positionName', 'city', 'createTime', 'salary', 'companyId', 'companyName', 'companyFullName', 'minsalary', 'munsalary', 'maxsalary'
37 |     ]
38 |     result = cursor.get_results('jobinfo', output)
39 |     # Look the field up by key; dict.values() has no guaranteed order.
40 |     money_list = [item['salary'] for item in result]
41 |     money_dict = Counter(money_list)
42 |     _money_dict = [{'value': v, 'name': k.encode("utf-8")} for k, v in money_dict.items()]
43 | 
44 |     return render_template('jobstomoney.html', jobstomoney_dict=_money_dict, jobstomoney_list=money_list)
45 | 
46 | 
47 | if __name__ == '__main__':
48 |     app.run(host=configweb.get('bind', '0.0.0.0'), port=int(configweb.get('port', 8000)), debug=True)
49 | 
--------------------------------------------------------------------------------
/lagou/runweb/templates/jobstocity.html:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | ECharts 6 | 7 | 8 | 9 | 10 | 11 |
12 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /lagou/runweb/templates/jobstomoney.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ECharts 6 | 7 | 8 | 9 | 10 | 11 |
12 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /lagou/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for lagou project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'lagou' 13 | 14 | SPIDER_MODULES = ['lagou.spiders'] 15 | NEWSPIDER_MODULE = 'lagou.spiders' 16 | ITEM_PIPELINES = { 17 | 'lagou.pipelines.LagouzpPipeline': 200, # 200是为了设置工序顺序 18 | } 19 | 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'lagou (+http://www.yourdomain.com)' 23 | 24 | # Obey robots.txt rules 25 | ROBOTSTXT_OBEY = True 26 | 27 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 28 | #CONCURRENT_REQUESTS = 32 29 | 30 | # Configure a delay for requests for the same website (default: 0) 31 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 32 | # See also autothrottle settings and docs 33 | #DOWNLOAD_DELAY = 3 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | #CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | #COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | #TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | #DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | #} 49 | 50 | # Enable or disable spider middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 52 | #SPIDER_MIDDLEWARES = { 53 | # 'lagou.middlewares.LagouSpiderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable downloader middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | #DOWNLOADER_MIDDLEWARES = { 59 | # 'lagou.middlewares.MyCustomDownloaderMiddleware': 543, 60 | #} 61 | 62 | # Enable or disable extensions 63 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 64 | #EXTENSIONS = { 65 | # 'scrapy.extensions.telnet.TelnetConsole': None, 66 | #} 67 | 68 | # Configure item pipelines 69 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 70 | #ITEM_PIPELINES = { 71 | # 'lagou.pipelines.LagouPipeline': 300, 72 | #} 73 | 74 | # Enable and configure the AutoThrottle extension (disabled by default) 75 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 76 | #AUTOTHROTTLE_ENABLED = True 77 | # The initial download delay 78 | #AUTOTHROTTLE_START_DELAY = 5 79 | # The maximum download delay to be set in case of high latencies 80 | #AUTOTHROTTLE_MAX_DELAY = 60 81 | # The average number of requests Scrapy should be sending in parallel to 82 | # each remote server 83 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 84 | # Enable showing throttling stats for every response received: 85 | #AUTOTHROTTLE_DEBUG = False 86 | 87 | # Enable and configure HTTP caching (disabled by default) 88 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
89 | #HTTPCACHE_ENABLED = True
90 | #HTTPCACHE_EXPIRATION_SECS = 0
91 | #HTTPCACHE_DIR = 'httpcache'
92 | #HTTPCACHE_IGNORE_HTTP_CODES = []
93 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
94 | 
--------------------------------------------------------------------------------
/lagou/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/lagou/spiders/lagouSpider.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | # -*- coding:utf-8 -*-
5 | import scrapy
6 | from ..items import LagouzpItem
7 | import requests
8 | from bs4 import BeautifulSoup
9 | import json
10 | import re
11 | 
12 | 
13 | class Spider(scrapy.Spider):
14 |     name = 'lagou'
15 |     cookies = {
16 |         'user_trace_token': '20170329220541-535dcc08aa394057884d3de6c06da2aa',
17 |         'JSESSIONID': 'ABAAABAAAFCAAEG36FCE164D221C1CEB89E796234273C56',
18 |         'PRE_UTM': '',
19 |         'BAIDUID': '45A2EA96C0D623AADAE825E1A3DE41F8:FG=1',
20 |         'PRE_SITE': '',
21 |         'PRE_HOST': '',
22 |         'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2F',
23 |         'index_location_city': '%E5%8C%97%E4%BA%AC',
24 |         'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1491116405,1491116452,1493122880,1493122898',
25 |         'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1493123186',
26 |         '_ga': 'GA1.2.1412866745.1489497427',
27 |         'LGUID': '20170819003739-8cd023cd-8433-11e7-9d86-525400f775ce',
28 |         'LGSID': '20170819002347-9caa6942-8431-11e7-8a38-5254005c3644',
29 |         'TG-TRACK-CODE': 'index_search',
30 |         'SEARCH_ID': 'b3638e10d2a5464598572ce7dfb66e1b'
31 |     }
32 |     headers = {
33 |         "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
34 |     }
35 | 
36 |     def start_requests(self):
37 |         kd = ['运维开发']
38 |         city = ['北京', '上海', '广州', '深圳', '杭州']
39 |         urls_kd = ['https://www.lagou.com/jobs/list_{}?px=default&city='.format(one) for one in kd]
40 |         for urls in urls_kd:
41 |             urls_city = [urls + one for one in city]
42 |             for url in urls_city:
43 |                 # Fetch the HTML listing page synchronously, only to read the total page count.
44 |                 response = requests.get(url, headers=self.headers, cookies=self.cookies)
45 |                 location = url.split('&')[-1].split('=')[1]
46 |                 key = url.split('/')[-1].split('?')[0].split('_')[1]
47 |                 soup = BeautifulSoup(response.text, 'lxml')
48 |                 pages = soup.find('span', {'class': 'span totalNum'}).get_text()
49 |                 # Then crawl the mobile JSON API page by page through Scrapy.
50 |                 for i in range(1, int(pages) + 1):
51 |                     url = "https://m.lagou.com/search.json?city={}&positionName={}&pageNo={}".format(location, key, i)
52 |                     yield scrapy.Request(url, callback=self.parse)
53 | 
54 |     def parse(self, response):
55 |         data = json.loads(response.text)
56 |         content = data['content']
57 |         positionResult = content['data']['page']['result']
58 | 
59 |         for one in positionResult:
60 |             # Create a fresh item per posting rather than reusing a single instance.
61 |             item = LagouzpItem()
62 |             item['positionId'] = one['positionId']
63 |             item['positionName'] = one['positionName']
64 |             item['city'] = one['city']
65 |             item['createTime'] = one['createTime'].split(' ')[0]
66 |             item['companyId'] = one['companyId']
67 |             item['companyName'] = one['companyName']
68 |             item['companyFullName'] = one['companyFullName']
69 |             item['salary'] = one['salary']
70 |             # Salary strings look like '10k-20k'; with the capturing group, re.split
71 |             # returns ['10', 'k', '', '-', '20', 'k', ''], so index 0 is the lower
72 |             # bound, index 4 the upper, and 'munsalary' holds the midpoint.
73 |             item['minsalary'] = re.split(r'(k|-|K)', one['salary'])[0]
74 |             item['munsalary'] = (int(re.split(r'(k|-|K)', one['salary'])[0]) + int(re.split(r'(k|-|K)', one['salary'])[4])) / 2
75 |             item['maxsalary'] = re.split(r'(k|-|K)', one['salary'])[4]
76 |             yield item
77 | 
--------------------------------------------------------------------------------
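The `minsalary` / `munsalary` / `maxsalary` fields in the spider above are derived by splitting Lagou's salary strings with a capturing regex. As a standalone illustration of that parsing, here is a minimal sketch (a hypothetical `parse_salary` helper, not part of the project, assuming the salary field always follows the `10k-20k` pattern):

```python
# -*- coding: utf-8 -*-
import re


def parse_salary(salary):
    # The capturing group makes re.split keep the delimiters:
    # '10k-20k' -> ['10', 'k', '', '-', '20', 'k', '']
    # so index 0 is the lower bound and index 4 the upper bound.
    parts = re.split(r'(k|-|K)', salary)
    low, high = int(parts[0]), int(parts[4])
    return low, (low + high) / 2.0, high


if __name__ == '__main__':
    print(parse_salary('10k-20k'))  # -> (10, 15.0, 20)
```

Open-ended variants of the salary field would need extra handling before the values are written into the `double` columns of `jobinfo`.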
/lagou/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import os
5 | import ConfigParser
6 | 
7 | work_dir = os.path.dirname(os.path.realpath(__file__))
8 | 
9 | 
10 | def get_config(section=''):
11 |     config = ConfigParser.ConfigParser()
12 |     service_conf = os.path.join(work_dir, 'config/myservice.conf')
13 |     config.read(service_conf)
14 | 
15 |     conf_items = dict(config.items('common')) if config.has_section('common') else {}
16 |     if section and config.has_section(section):
17 |         conf_items.update(config.items(section))
18 |     return conf_items
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     print get_config('common')
23 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Requirements automatically generated by pigar.
2 | # https://github.com/Damnever/pigar
3 | 
4 | # runweb/runweb.py: 4
5 | Flask == 0.10.1
6 | 
7 | # db.py: 4
8 | MySQL_python == 1.2.5
9 | 
10 | # items.py: 8
11 | # middlewares.py: 8
12 | # spiders/lagouSpider.py: 5
13 | Scrapy == 1.4.0
14 | 
15 | # spiders/lagouSpider.py: 8
16 | bs4 == 0.0.1
17 | 
18 | # spiders/lagouSpider.py: 7
19 | requests == 2.9.1
20 | 
21 | # Note: 'db' and 'utils' are local modules of this project, not PyPI packages;
22 | # pigar picked them up by mistake, so they are intentionally not listed here.
23 | 
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = lagou.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = lagou
12 | 
--------------------------------------------------------------------------------
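Beyond the two Flask views described in the README, the `Cursor` helper in `lagou/db.py` can also be used directly for a quick sanity check after a crawl. A minimal sketch (a hypothetical script, not part of the repository; it assumes the spider has already populated `jobinfo`, that `lagou/config/myservice.conf` holds valid credentials, and that it is run from the project root under the same Python 2 environment):

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Quick post-crawl check: count postings per city straight from MySQL,
# bypassing the Flask views. Hypothetical helper, not part of the repo.
from collections import Counter

from lagou.db import Cursor
from lagou.utils import get_config


def top_cities(limit=10):
    # Reuse the project's own config loader and DB wrapper.
    cursor = Cursor(get_config('common'))
    rows = cursor.get_results('jobinfo', ['city'])
    return Counter(row['city'] for row in rows).most_common(limit)


if __name__ == '__main__':
    for city, count in top_cities():
        print '%s\t%s' % (city, count)
```

If the counts look reasonable, the `/jobstocity` and `/jobstomoney` pages should render the same data.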