├── lagou
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── lagou_spider.py
│   ├── pipelines.py
│   ├── items.py
│   └── settings.py
├── requirements.txt
├── main.py
├── scrapy.cfg
├── README.md
└── .gitignore

/lagou/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Scrapy==1.1.0
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy.cmdline as cmd

cmd.execute('scrapy crawl LagouSpider'.split())
--------------------------------------------------------------------------------
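main.py simply hands `scrapy crawl LagouSpider` to Scrapy's command-line machinery. To launch the crawl from inside Python instead (from an IDE, a debugger, or another script), an equivalent launcher could use `CrawlerProcess`. This is a sketch, not a file in the repository, and it assumes it is run from the project root so that `scrapy.cfg` and the `lagou` package are found:

```python
# run.py -- hypothetical alternative to main.py, not part of the repo
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    # get_project_settings() loads lagou/settings.py via scrapy.cfg, so the
    # timestamped FEED_URI, USER_AGENT, etc. apply exactly as with `scrapy crawl`.
    process = CrawlerProcess(get_project_settings())
    process.crawl('LagouSpider')  # the spider name declared in lagou_spider.py
    process.start()               # blocks until the crawl finishes
```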
/lagou/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = lagou.settings

[deploy]
url = http://localhost:6800/
project = lagou
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# jobSpider

jobSpider is a Scrapy spider that collects job postings.

Currently covered:

* [Lagou](https://www.lagou.com)


# Features
1. Crawls job postings from Lagou (the 5,000 most recent listings)


# Installation and dependencies
* git clone https://github.com/wwj718/jobSpider
* cd jobSpider
* pip install -r requirements.txt
* mongodb (optional)
* Adjust the CSV output path (the FEED_URI setting) in settings.py; it defaults to the current directory
* Run: scrapy crawl LagouSpider (starts the crawl)


# Development environment
OS X, Python 2.7

Also tested and working on Windows 7

### Optional features

To store results in MongoDB, uncomment ITEM_PIPELINES in settings.py and make sure pymongo is installed (it is not listed in requirements.txt)

# Code style
[yapf](https://github.com/google/yapf) keeps the code style consistent:

`yapf -i filename.py`
--------------------------------------------------------------------------------
/lagou/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

# settings are configured much as in Django
# de-duplication: http://scrapy-chs.readthedocs.io/zh_CN/1.0/topics/item-pipeline.html#id3
# reference: https://realpython.com/blog/python/web-scraping-with-scrapy-and-mongodb/

from scrapy.conf import settings
import pymongo


class LagouMongodbPipeline(object):
    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DB_NAME']
        collection = settings['MONGODB_COLLECTION']
        connection = pymongo.MongoClient(host, port)
        db = connection[dbname]
        self.collection = db[collection]

    def process_item(self, item, spider):
        # convert the item to a plain dict before inserting;
        # this is also the place to add cleaning/validation
        self.collection.insert(dict(item))
        return item
--------------------------------------------------------------------------------
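The comments in pipelines.py point to the duplicate-filter recipe in the Scrapy item-pipeline docs. A hedged sketch of what such a pipeline could look like for this project, keyed on `positionId` (this class is not in the repository; if used, it would be registered in ITEM_PIPELINES with a value below the MongoDB pipeline's 10, so duplicates are dropped before they are stored):

```python
from scrapy.exceptions import DropItem


class DuplicatePositionPipeline(object):
    """Illustrative only: drop items whose positionId was already seen this run."""

    def __init__(self):
        self.seen_ids = set()

    def process_item(self, item, spider):
        position_id = item.get('positionId')
        if position_id in self.seen_ids:
            raise DropItem("duplicate position {}".format(position_id))
        self.seen_ids.add(position_id)
        return item
```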
/.gitignore:
--------------------------------------------------------------------------------
### OSX ###
.DS_Store
.AppleDouble
.LSOverride

# csv
*.csv
### SublimeText ###
# cache files for sublime text
*.tmlanguage.cache
*.tmPreferences.cache
*.stTheme.cache

# workspace files are user-specific
*.sublime-workspace

# project files should be checked into the repository, unless a significant
# proportion of contributors will probably not be using SublimeText
# *.sublime-project

# sftp configuration file
sftp-config.json

# Basics
*.py[cod]
__pycache__

# Logs
*.log
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml
htmlcov

# Translations
*.mo
*.pot

# Pycharm
.idea/*


# Vim

*~
*.swp
*.swo

# npm
node_modules/

# Compass
.sass-cache

# virtual environments
.env

# User-uploaded media
cas_test/media/

# Hitch directory
tests/.hitch
--------------------------------------------------------------------------------
/lagou/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LagouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    city = scrapy.Field()
    companySize = scrapy.Field()
    companyShortName = scrapy.Field()
    positionName = scrapy.Field()
    salaryMax = scrapy.Field()
    salaryMin = scrapy.Field()
    salaryAvg = scrapy.Field()
    #positionType = scrapy.Field()
    positionAdvantage = scrapy.Field()
    companyLabelList = scrapy.Field()
    companyLogo = scrapy.Field()
    workYear = scrapy.Field()  # required years of experience
    education = scrapy.Field()  # required education
    jobNature = scrapy.Field()  # full-time or part-time
    financeStage = scrapy.Field()  # funding stage: growth / series C / series D, ...
    district = scrapy.Field()  # district, e.g. Chaoyang
    deliver = scrapy.Field()  # number of résumés already submitted
    createTime = scrapy.Field()  # posting creation time
    industryField = scrapy.Field()  # industry
    #showCount = scrapy.Field()  # view count
    appShow = scrapy.Field()
    pcShow = scrapy.Field()
    positionId = scrapy.Field()  # position id
    score = scrapy.Field()
--------------------------------------------------------------------------------
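A LagouItem behaves like a dictionary, which is how the spider fills it in and how the MongoDB pipeline converts it with `dict(item)`. A tiny illustration with made-up values, assuming the project root is on PYTHONPATH:

```python
from lagou.items import LagouItem

item = LagouItem()
item['positionName'] = u'Python engineer'       # fields use dict-style access
item['salaryMin'], item['salaryMax'] = 10, 20   # units: 1k RMB / month
item['salaryAvg'] = (item['salaryMin'] + item['salaryMax']) / 2
print(dict(item))  # converts cleanly to a plain dict, as pipelines.py relies on
```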
/lagou/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for lagou project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Chinese documentation: http://scrapy-chs.readthedocs.io/zh_CN/1.0/intro/overview.html
import time


BOT_NAME = 'lagou'

SPIDER_MODULES = ['lagou.spiders']
NEWSPIDER_MODULE = 'lagou.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0'

timestr = time.strftime("%Y%m%d-%H%M%S")
FEED_URI = './lagouSpider-{}.csv'.format(timestr)  # a new CSV named after the time the crawl starts

FEED_FORMAT = 'csv'

COOKIES_ENABLED = True

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DB_NAME = 'Lagou'
MONGODB_COLLECTION = 'position'  # job positions

# Uncomment the next line to store items in MongoDB via lagou/pipelines.py;
# leave it commented out to skip the database entirely.
# ITEM_PIPELINES = {'lagou.pipelines.LagouMongodbPipeline': 10}

# scrapy-redis (optional, not enabled)
# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# SCHEDULER_PERSIST = True
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# REDIS_URL = None
# REDIS_HOST = '127.0.0.1'
# REDIS_PORT = 6379

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
#COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'lagou.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'lagou.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'lagou.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
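Before enabling the MongoDB pipeline it can be worth confirming that the MONGODB_* values above actually reach a running mongod. A minimal sketch (a hypothetical helper, not part of the repository; assumes pymongo is installed and mongod is listening on the configured host and port):

```python
# check_mongo.py -- hypothetical connectivity check
import pymongo

from lagou import settings

client = pymongo.MongoClient(settings.MONGODB_HOST, settings.MONGODB_PORT)
collection = client[settings.MONGODB_DB_NAME][settings.MONGODB_COLLECTION]

# a cheap round-trip: prints one stored position, or None if the collection is empty
print(collection.find_one())
```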
/lagou/spiders/lagou_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
import json
from lagou.items import LagouItem
#from scrapy_redis.spiders import RedisSpider


class LagoupositonSpider(scrapy.Spider):
    name = "LagouSpider"
    #allowed_domains = ["lagou.com/zhaopin/"]
    start_urls = ('http://www.lagou.com/zhaopin/',)
    totalPageCount = 0
    curpage = 1
    curkd = 1  # index of the current keyword
    position_url = 'http://www.lagou.com/jobs/positionAjax.json?'

    #city = u'北京'
    #kds = [u'java', 'python', 'PHP', '.NET', 'JavaScript', 'C#', 'C++', 'C', 'VB', 'Dephi', 'Perl', 'Ruby', 'Go', 'ASP', 'Shell']
    #kds = [u'大数据', u'云计算', u'docker', u'中间件', 'Node.js', u'数据挖掘', u'自然语言处理', u'搜索算法', u'精准推荐', u'全栈工程师', u'图像处理', u'机器学习', u'语音识别']
    #kds = ['HTML5', 'Android', 'iOS', u'web前端', 'Flash', 'U3D', 'COCOS2D-X']
    #kds = [u'spark', 'MySQL', 'SQLServer', 'Oracle', 'DB2', 'MongoDB', 'ETL', 'Hive', u'数据仓库', 'Hadoop']
    #kds = [u'大数据', u'云计算', u'docker', u'中间件']
    #kd = kds[0]

    def start_requests(self):
        # To crawl specific keywords (or a city), POST them as well:
        # for self.kd in self.kds:
        #     scrapy.http.FormRequest(self.position_url,
        #                             formdata={'pn': str(self.curpage), 'kd': self.kd},
        #                             callback=self.parse)
        return [scrapy.http.FormRequest(self.position_url,
                                        formdata={'pn': str(self.curpage)},  # first page
                                        callback=self.parse)]

    def parse(self, response):
        #print response.body
        jdict = json.loads(response.body)
        jcontent = jdict["content"]
        jposresult = jcontent["positionResult"]
        jresult = jposresult["result"]
        # totalCount claims ~230,000 positions, but everything beyond the newest
        # ~5000 is honeypot data (duplicates), so cap the count at 5000.
        self.totalPageCount = min(jposresult['totalCount'], 5000) / 15 + 1  # /15 normally; /150 was used for testing
        print(self.totalPageCount)
        for each in jresult:
            item = LagouItem()  # a fresh item per position
            item['city'] = each['city']
            item['positionId'] = each['positionId']
            item['companyLogo'] = each['companyLogo']
            item['workYear'] = each['workYear']
            item['education'] = each['education']
            item['jobNature'] = each['jobNature']
            item['financeStage'] = each['financeStage']
            item['district'] = each['district']
            item['deliver'] = each['deliver']
            item['createTime'] = each['createTime']
            item['industryField'] = each['industryField']
            #item['showCount'] = each['showCount']
            item['pcShow'] = each['pcShow']
            item['appShow'] = each['appShow']
            item['score'] = each['score']
            item['companyShortName'] = each['companyShortName']
            item['companySize'] = each['companySize']
            item['positionName'] = each['positionName']
            #item['positionType'] = each['positionType']
            # turn a salary string "ak-bk" into min/max/avg values (a, b, (a+b)/2)
            # todo: move into a standalone util function (a sketch, parse_salary, is appended below)
            salary = each['salary'].split('-')
            if len(salary) == 1:
                item['salaryMax'] = int(salary[0][:salary[0].find('k')])
                item['salaryMin'] = item['salaryMax']  # single figure: min == max
            else:
                item['salaryMax'] = int(salary[1][:salary[1].find('k')])
                item['salaryMin'] = int(salary[0][:salary[0].find('k')])
            item['salaryAvg'] = (item['salaryMin'] + item['salaryMax']) / 2
            item['positionAdvantage'] = each['positionAdvantage']
            item['companyLabelList'] = each['companyLabelList']
            # item['keyword'] = self.kd
            yield item
        if self.curpage <= self.totalPageCount:
            self.curpage += 1  # move on to the next page
            #if self.curpage == 335:
            #    self.curpage = 5000
            print("current page {}".format(self.curpage))
            yield scrapy.http.FormRequest(
                self.position_url,
                # formdata={'pn': str(self.curpage), 'kd': self.kd}, callback=self.parse)
                formdata={'pn': str(self.curpage)},  # pages 335 through 5000 come back empty
                callback=self.parse)
        # crawling several keywords in one run (disabled):
        '''
        elif self.curkd < len(self.kds):
            self.curpage = 1
            self.totalPageCount = 0
            self.curkd += 1  # advance to the next keyword
            self.kd = self.kds[self.curkd]
            yield scrapy.http.FormRequest(self.position_url,
                                          formdata={'pn': str(self.curpage), 'kd': self.kd},
                                          callback=self.parse)
        '''
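
# A standalone salary parser, sketched by the editor for the "todo" above; it is
# not part of the original project.  It mirrors the in-line logic: "10k-20k"
# yields (10, 20, 15), and a single figure such as "15k以上" yields (15, 15, 15).
def parse_salary(salary):
    """Return (salary_min, salary_max, salary_avg) in units of 1k RMB per month."""
    parts = salary.split('-')
    low = int(parts[0][:parts[0].find('k')])
    high = int(parts[-1][:parts[-1].find('k')]) if len(parts) > 1 else low
    return low, high, (low + high) / 2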
--------------------------------------------------------------------------------