├── README.md ├── __init__.py ├── doc ├── demo1.jpg └── demo2.jpg ├── lagou ├── __init__.py ├── config │ └── myservice.conf ├── db.py ├── items.py ├── middlewares.py ├── pipelines.py ├── runweb │ ├── __init__.py │ ├── runweb.py │ ├── static │ │ └── echarts.min.js │ └── templates │ │ ├── jobstocity.html │ │ └── jobstomoney.html ├── settings.py ├── spiders │ ├── __init__.py │ └── lagouSpider.py └── utils.py ├── requirements.txt └── scrapy.cfg /README.md: -------------------------------------------------------------------------------- 1 | 拉钩爬虫出图 2 | ========== 3 | 4 | [](https://www.python.org/) 5 | [](http://docs.jinkan.org/docs/flask/) 6 | [](http://scrapy-chs.readthedocs.io/zh_CN/0.24/intro/tutorial.html) 7 | [](https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html) 8 | [](http://docs.python-requests.org/zh_CN/latest/user/quickstart.html) 9 | 10 | 11 | 12 | ### 一、关于findjob 13 | 14 | - 爬取拉勾devops以及运维开发的数据 15 | - 进行可视化显示 16 | - 目的是为了方便找工作,爬去拉勾数据进行数据可视化展示分析以便找到一个称心如意的工作 17 | 18 | 19 | 20 | ### 二、依赖环境 21 | 22 | ```cpp 23 | pip install -r requirments.txt 24 | ``` 25 | 26 | ### 三、具体部署步骤 27 | 28 | > 1. 进入项目目录 29 | 30 | ``` cpp 31 | cd findjob/config/ 32 | ``` 33 | 34 | > 2.创建数据库lagou,创建表lagou 35 | 36 | ```cpp 37 | CREATE DATABASE lagou; 38 | 39 | CREATE TABLE `jobinfo` ( 40 | `positionId` varchar(255) DEFAULT NULL, 41 | `positionName` varchar(255) DEFAULT NULL, 42 | `city` varchar(255) DEFAULT NULL, 43 | `createTime` varchar(255) DEFAULT NULL, 44 | `salary` varchar(255) DEFAULT NULL, 45 | `companyId` varchar(50) DEFAULT NULL, 46 | `companyName` varchar(255) DEFAULT NULL, 47 | `companyFullName` varchar(255) DEFAULT NULL, 48 | `minsalary` double DEFAULT NULL, 49 | `munsalary` double DEFAULT NULL, 50 | `maxsalary` double DEFAULT NULL 51 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 52 | ``` 53 | 54 | > 3.编辑service.conf, 配置数据库地址以及账号密码和flask启动绑定的端口和地址 55 | 56 | ```cpp 57 | [common] 58 | mysql_host = 127.0.0.1 59 | mysql_port = 3306 60 | mysql_user = root 61 | mysql_passwd = 123456 62 | mysql_db = lagou 63 | mysql_charset = utf8 64 | passport_key = 123456 65 | [web] 66 | bind = 0.0.0.0 67 | port = 8000 68 | ``` 69 | 70 | 71 | ### 四、如何访问 72 | 73 | ```cpp 74 | 75 | 1.cd lagou 76 | 2.scrapy crawl lagou 77 | 3.cd lagou/runweb 78 | 4.python runweb.py 79 | 5.访问http://IP:8000/jobstomoney && 访问http://IP:8000/jobstocity 80 | ``` 81 | 82 | ### 五、版权 83 | 84 | 代码内容采用 [新 BSD 许可](LICENSE) 85 | 文档内容采用 [署名-禁止演绎 4.0 国际协议许可](https://creativecommons.org/licenses/by-nd/4.0/deed.zh) 86 | 87 | ### 六、TODO 88 | 89 | - [ ] 集成BOSS直聘等平台数据,进行综合评测 90 | - [ ] 针对不同的岗位可进行筛选操作 91 | 92 | ### 七、鸣谢 93 | 94 | - 核心代码贡献者: 国内大型互联网公司首席扛服务器工程师--刘老师 95 | - 场景的需求才会让这个项目诞生 96 | 97 | 98 | ### 八、Demo 99 | 100 |  101 |  102 | 103 | ### 九、特别提示 104 | 105 | - 本项目仅做沟通交流使用,请勿用作非法用途,谢谢 106 | - 欢迎大家提pr 107 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/__init__.py -------------------------------------------------------------------------------- /doc/demo1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/doc/demo1.jpg -------------------------------------------------------------------------------- /doc/demo2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/doc/demo2.jpg -------------------------------------------------------------------------------- /lagou/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/lagou/__init__.py -------------------------------------------------------------------------------- /lagou/config/myservice.conf: -------------------------------------------------------------------------------- 1 | [common] 2 | mysql_host = 127.0.0.1 3 | mysql_port = 3306 4 | mysql_user = root 5 | mysql_passwd = 123456 6 | mysql_db = lagou 7 | mysql_charset = utf8 8 | passport_key = 123456 9 | 10 | [web] 11 | bind = 0.0.0.0 12 | port = 8000 13 | -------------------------------------------------------------------------------- /lagou/db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding:utf-8 3 | 4 | import MySQLdb as mysql 5 | import sys 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | 9 | 10 | class Cursor(): 11 | def __init__(self, config): 12 | self.config = dict([(k[6:], config[k]) for k in config if k.startswith('mysql_')]) 13 | if 'port' in self.config: 14 | self.config['port'] = int(self.config['port']) 15 | if self.config: 16 | self._connect_db() 17 | 18 | def _connect_db(self): 19 | self.db = mysql.connect(**self.config) 20 | self.db.autocommit(True) 21 | self.cur = self.db.cursor() 22 | 23 | def _close_db(self): 24 | self.cur.close() 25 | self.db.close() 26 | 27 | def _execute(self, sql): 28 | try: 29 | return self.cur.execute(sql) 30 | except: 31 | self._close_db() 32 | self._connect_db() 33 | return self.cur.execute(sql) 34 | 35 | def _fetchone(self): 36 | return self.cur.fetchone() 37 | 38 | def _fetchall(self): 39 | return self.cur.fetchall() 40 | 41 | def _insert_sql(self, table_name, data): 42 | fields, values = [], [] 43 | for k, v in data.items(): 44 | fields.append(k) 45 | values.append("'%s'" % v) 46 | sql = "INSERT INTO %s (%s) VALUES (%s)" % (table_name, ','.join(fields), ','.join(values)) 47 | return sql 48 | 49 | def execute_insert_sql(self, table_name, data): 50 | sql = self._insert_sql(table_name, data) 51 | return self._execute(sql) 52 | 53 | def _select_sql(self, table_name, fields, where=None, order=None, asc_order=True, limit=None): 54 | if isinstance(where, dict) and where: 55 | conditions = [] 56 | for k, v in where.items(): 57 | if isinstance(v, list): 58 | conditions.append("%s IN (%s)" % (k, ','.join(v))) 59 | elif isinstance(v, str) or isinstance(v, unicode): 60 | conditions.append("%s='%s'" % (k, v)) 61 | elif isinstance(v, int): 62 | conditions.append("%s=%s" % (k, v)) 63 | 64 | sql = "SELECT %s FROM %s WHERE %s" % (','.join(fields), table_name, ' AND '.join(conditions)) 65 | 66 | elif not where: 67 | sql = "SELECT %s FROM %s" % (','.join(fields), table_name) 68 | else: 69 | sql = "" 70 | if order and (isinstance(order, str) or isinstance(order, unicode)): 71 | sql = "%s ORDER BY %s %s" % (sql, order, 'ASC' if asc_order else 'DESC') 72 | if limit and isinstance(limit, tuple) and len(limit) == 2: 73 | sql = "%s LIMIT %s,%s" % (sql, limit[0], limit[1]) 74 | return sql 75 | 76 | def get_one_result(self, table_name, fields, where=None, order=None, asc_order=True, limit=None): 77 | sql = self._select_sql(table_name, fields, where, order, asc_order, limit) 78 | if not sql: 79 | return None 80 | self._execute(sql) 81 | result_set = self._fetchone() 82 | if result_set: 83 | return dict([(k, '' if result_set[i] is None else result_set[i]) for i, k in enumerate(fields)]) 84 | else: 85 | return {} 86 | 87 | def get_results(self, table_name, fields, where=None, order=None, asc_order=True, limit=None): 88 | sql = self._select_sql(table_name, fields, where, order, asc_order, limit) 89 | self._execute(sql) 90 | result_sets = self._fetchall() 91 | return [dict([(k, '' if row[i] is None else row[i]) for i, k in enumerate(fields)]) for row in result_sets] 92 | 93 | def _update_sql(self, table_name, data, where, fields=None): 94 | if not (where and isinstance(where, dict)): 95 | return "" 96 | where_cond = ["%s='%s'" % (k, v) for k, v in where.items()] 97 | if fields: 98 | conditions = ["%s='%s'" % (k, data[k]) for k in fields] 99 | else: 100 | conditions = ["%s='%s'" % (k, data[k]) for k in data] 101 | sql = "UPDATE %s SET %s WHERE %s" % (table_name, ','.join(conditions), ' AND '.join(where_cond)) 102 | return sql 103 | 104 | def execute_update_sql(self, table_name, data, where, fields=None): 105 | sql = self._update_sql(table_name, data, where, fields) 106 | if sql: 107 | return self._execute(sql) 108 | else: 109 | return "" 110 | 111 | def _delete_sql(self, table_name, where): 112 | if not (where and isinstance(where, dict)): 113 | return "" 114 | where_cond = ["%s='%s'" % (k, v) for k, v in where.items()] 115 | sql = "DELETE FROM %s WHERE %s" % (table_name, ' AND '.join(where_cond)) 116 | return sql 117 | 118 | def execute_delete_sql(self, table_name, where): 119 | sql = self._delete_sql(table_name, where) 120 | if sql: 121 | return self._execute(sql) 122 | else: 123 | return "" 124 | 125 | def if_id_exist(self, table_name, field_id): 126 | if isinstance(field_id, list): 127 | id_num=len(field_id) 128 | result = self.get_results(table_name, ['id'], {'id': field_id}) 129 | if id_num !=len(result): 130 | result=False 131 | else: 132 | result = self.get_one_result(table_name, ['id'], {'id': field_id}) 133 | if result: 134 | return True 135 | else: 136 | return False 137 | -------------------------------------------------------------------------------- /lagou/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class LagouzpItem(scrapy.Item): 12 | positionId = scrapy.Field() 13 | positionName= scrapy.Field() 14 | city= scrapy.Field() 15 | createTime= scrapy.Field() 16 | companyId= scrapy.Field() 17 | companyName= scrapy.Field() 18 | companyFullName= scrapy.Field() 19 | salary= scrapy.Field() 20 | minsalary = scrapy.Field() 21 | munsalary = scrapy.Field() 22 | maxsalary = scrapy.Field() 23 | -------------------------------------------------------------------------------- /lagou/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class LagouSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /lagou/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | # class LagouPipeline(object): 10 | # def process_item(self, item, spider): 11 | # return item 12 | 13 | #import MySQLdb 14 | from db import Cursor 15 | from utils import get_config 16 | 17 | 18 | class LagouzpPipeline(object): 19 | def __init__(self): 20 | self.config = get_config("common") 21 | self.cursor = Cursor(self.config) 22 | 23 | def process_item(self, item, spider): 24 | data = { 25 | "positionId": item['positionId'], 26 | "positionName": item['positionName'], 27 | "city": item['city'], 28 | "createTime": item['createTime'], 29 | "companyId": item['companyId'], 30 | "companyName": item['companyName'], 31 | "companyFullName": item['companyFullName'], 32 | "salary": item['salary'], 33 | "minsalary": item['minsalary'], 34 | "munsalary": item['munsalary'], 35 | "maxsalary": item['maxsalary'] 36 | } 37 | self.cursor.execute_insert_sql('jobinfo', data) 38 | return item 39 | -------------------------------------------------------------------------------- /lagou/runweb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuima/findjob/8beb50d9370843875e2f0df6231c3795e64cf1af/lagou/runweb/__init__.py -------------------------------------------------------------------------------- /lagou/runweb/runweb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from flask import Flask, render_template 5 | from collections import Counter 6 | from os import sys 7 | from os import path 8 | sys.path.append(path.dirname(path.dirname( path.abspath(__file__)))) 9 | import db 10 | import utils 11 | 12 | 13 | app = Flask(__name__) 14 | configweb = utils.get_config('web') 15 | configdb = utils.get_config('common') 16 | cursor = db.Cursor(configdb) 17 | 18 | 19 | @app.route('/jobstocity') 20 | def jobstocity(): 21 | output = [ 22 | 'positionId', 'positionName', 'city', 'createTime', 'salary', 'companyId', 'companyName', 'companyFullName', 'minsalary', 'munsalary', 'maxsalary' 23 | ] 24 | result = cursor.get_results('jobinfo', output) 25 | city_list = [item.values()[1] for item in result] 26 | city_dict = Counter(city_list) 27 | 28 | return render_template('jobstocity.html', jobstocity=city_dict) 29 | 30 | 31 | @app.route('/jobstomoney') 32 | def jobstomoney(): 33 | output = [ 34 | 'positionId', 'positionName', 'city', 'createTime', 'salary', 'companyId', 'companyName', 'companyFullName', 'minsalary', 'munsalary', 'maxsalary' 35 | ] 36 | result = cursor.get_results('jobinfo', output) 37 | money_list = [item.values()[4] for item in result] 38 | money_dict = Counter(money_list) 39 | _money_dict = [{'value': v, 'name': k.encode("utf-8")}for k, v in money_dict.items()] 40 | 41 | return render_template('jobstomoney.html', jobstomoney_dict=_money_dict, jobstomoney_list=money_list) 42 | 43 | 44 | if __name__ == '__main__': 45 | app.run(host=configweb.get('bind', '0.0.0.0'), port=int(configweb.get('port', 8000)), debug=True) 46 | -------------------------------------------------------------------------------- /lagou/runweb/templates/jobstocity.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 |