├── lagou
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── lagou_spider.py
│   ├── pipelines.py
│   ├── items.py
│   └── settings.py
├── requirements.txt
├── main.py
├── scrapy.cfg
├── README.md
└── .gitignore

/lagou/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Scrapy==1.1.0
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy.cmdline as cmd

cmd.execute('scrapy crawl LagouSpider'.split())
--------------------------------------------------------------------------------
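main.py simply hands `scrapy crawl LagouSpider` to Scrapy's command-line machinery. To launch the crawl from inside Python instead (from an IDE, a debugger, or another script), an equivalent launcher could use `CrawlerProcess`. This is a sketch, not a file in the repository, and it assumes it is run from the project root so that `scrapy.cfg` and the `lagou` package are found:

```python
# run.py -- hypothetical alternative to main.py, not part of the repo
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    # get_project_settings() loads lagou/settings.py via scrapy.cfg, so the
    # timestamped FEED_URI, USER_AGENT, etc. apply exactly as with `scrapy crawl`.
    process = CrawlerProcess(get_project_settings())
    process.crawl('LagouSpider')  # the spider name declared in lagou_spider.py
    process.start()               # blocks until the crawl finishes
```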
/lagou/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = lagou.settings

[deploy]
url = http://localhost:6800/
project = lagou
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# jobSpider

jobSpider is a Scrapy spider that collects job postings.

Currently covered:

* [Lagou](https://www.lagou.com)


# Features
1. Crawls job postings from Lagou (the 5,000 most recent listings)


# Installation and dependencies
* git clone https://github.com/wwj718/jobSpider
* cd jobSpider
* pip install -r requirements.txt
* mongodb (optional)
* Adjust the CSV output path (the FEED_URI setting) in settings.py; it defaults to the current directory
* Run: scrapy crawl LagouSpider (starts the crawl)


# Development environment
OS X, Python 2.7

Also tested and working on Windows 7

### Optional features

To store results in MongoDB, uncomment ITEM_PIPELINES in settings.py and make sure pymongo is installed (it is not listed in requirements.txt)

# Code style
[yapf](https://github.com/google/yapf) keeps the code style consistent:

`yapf -i filename.py`
--------------------------------------------------------------------------------
/lagou/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

# settings are configured much as in Django
# de-duplication: http://scrapy-chs.readthedocs.io/zh_CN/1.0/topics/item-pipeline.html#id3
# reference: https://realpython.com/blog/python/web-scraping-with-scrapy-and-mongodb/

from scrapy.conf import settings
import pymongo


class LagouMongodbPipeline(object):
    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DB_NAME']
        collection = settings['MONGODB_COLLECTION']
        connection = pymongo.MongoClient(host, port)
        db = connection[dbname]
        self.collection = db[collection]

    def process_item(self, item, spider):
        # convert the item to a plain dict before inserting;
        # this is also the place to add cleaning/validation
        self.collection.insert(dict(item))
        return item
--------------------------------------------------------------------------------
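The comments in pipelines.py point to the duplicate-filter recipe in the Scrapy item-pipeline docs. A hedged sketch of what such a pipeline could look like for this project, keyed on `positionId` (this class is not in the repository; if used, it would be registered in ITEM_PIPELINES with a value below the MongoDB pipeline's 10, so duplicates are dropped before they are stored):

```python
from scrapy.exceptions import DropItem


class DuplicatePositionPipeline(object):
    """Illustrative only: drop items whose positionId was already seen this run."""

    def __init__(self):
        self.seen_ids = set()

    def process_item(self, item, spider):
        position_id = item.get('positionId')
        if position_id in self.seen_ids:
            raise DropItem("duplicate position {}".format(position_id))
        self.seen_ids.add(position_id)
        return item
```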
/.gitignore:
--------------------------------------------------------------------------------
### OSX ###
.DS_Store
.AppleDouble
.LSOverride

# csv
*.csv
### SublimeText ###
# cache files for sublime text
*.tmlanguage.cache
*.tmPreferences.cache
*.stTheme.cache

# workspace files are user-specific
*.sublime-workspace

# project files should be checked into the repository, unless a significant
# proportion of contributors will probably not be using SublimeText
# *.sublime-project

# sftp configuration file
sftp-config.json

# Basics
*.py[cod]
__pycache__

# Logs
*.log
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml
htmlcov

# Translations
*.mo
*.pot

# Pycharm
.idea/*


# Vim

*~
*.swp
*.swo

# npm
node_modules/

# Compass
.sass-cache

# virtual environments
.env

# User-uploaded media
cas_test/media/

# Hitch directory
tests/.hitch
--------------------------------------------------------------------------------
/lagou/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LagouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    city = scrapy.Field()
    companySize = scrapy.Field()
    companyShortName = scrapy.Field()
    positionName = scrapy.Field()
    salaryMax = scrapy.Field()
    salaryMin = scrapy.Field()
    salaryAvg = scrapy.Field()
    #positionType = scrapy.Field()
    positionAdvantage = scrapy.Field()
    companyLabelList = scrapy.Field()
    companyLogo = scrapy.Field()
    workYear = scrapy.Field()  # required years of experience
    education = scrapy.Field()  # required education
    jobNature = scrapy.Field()  # full-time or part-time
    financeStage = scrapy.Field()  # funding stage: growth / series C / series D, ...
    district = scrapy.Field()  # district, e.g. Chaoyang
    deliver = scrapy.Field()  # number of résumés already submitted
    createTime = scrapy.Field()  # posting creation time
    industryField = scrapy.Field()  # industry
    #showCount = scrapy.Field()  # view count
    appShow = scrapy.Field()
    pcShow = scrapy.Field()
    positionId = scrapy.Field()  # position id
    score = scrapy.Field()
--------------------------------------------------------------------------------
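A LagouItem behaves like a dictionary, which is how the spider fills it in and how the MongoDB pipeline converts it with `dict(item)`. A tiny illustration with made-up values, assuming the project root is on PYTHONPATH:

```python
from lagou.items import LagouItem

item = LagouItem()
item['positionName'] = u'Python engineer'       # fields use dict-style access
item['salaryMin'], item['salaryMax'] = 10, 20   # units: 1k RMB / month
item['salaryAvg'] = (item['salaryMin'] + item['salaryMax']) / 2
print(dict(item))  # converts cleanly to a plain dict, as pipelines.py relies on
```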
/lagou/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for lagou project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Chinese documentation: http://scrapy-chs.readthedocs.io/zh_CN/1.0/intro/overview.html
import time


BOT_NAME = 'lagou'

SPIDER_MODULES = ['lagou.spiders']
NEWSPIDER_MODULE = 'lagou.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0'

timestr = time.strftime("%Y%m%d-%H%M%S")
FEED_URI = './lagouSpider-{}.csv'.format(timestr)  # a new CSV named after the time the crawl starts

FEED_FORMAT = 'csv'

COOKIES_ENABLED = True

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DB_NAME = 'Lagou'
MONGODB_COLLECTION = 'position'  # job positions

# Uncomment the next line to store items in MongoDB via lagou/pipelines.py;
# leave it commented out to skip the database entirely.
# ITEM_PIPELINES = {'lagou.pipelines.LagouMongodbPipeline': 10}

# scrapy-redis (optional, not enabled)
# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# SCHEDULER_PERSIST = True
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# REDIS_URL = None
# REDIS_HOST = '127.0.0.1'
# REDIS_PORT = 6379

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
#COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'lagou.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'lagou.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'lagou.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
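Before enabling the MongoDB pipeline it can be worth confirming that the MONGODB_* values above actually reach a running mongod. A minimal sketch (a hypothetical helper, not part of the repository; assumes pymongo is installed and mongod is listening on the configured host and port):

```python
# check_mongo.py -- hypothetical connectivity check
import pymongo

from lagou import settings

client = pymongo.MongoClient(settings.MONGODB_HOST, settings.MONGODB_PORT)
collection = client[settings.MONGODB_DB_NAME][settings.MONGODB_COLLECTION]

# a cheap round-trip: prints one stored position, or None if the collection is empty
print(collection.find_one())
```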
/lagou/spiders/lagou_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
import json
from lagou.items import LagouItem
#from scrapy_redis.spiders import RedisSpider


class LagoupositonSpider(scrapy.Spider):
    name = "LagouSpider"
    #allowed_domains = ["lagou.com/zhaopin/"]
    start_urls = ('http://www.lagou.com/zhaopin/',)
    totalPageCount = 0
    curpage = 1
    curkd = 1  # index of the current keyword
    position_url = 'http://www.lagou.com/jobs/positionAjax.json?'

    #city = u'北京'
    #kds = [u'java', 'python', 'PHP', '.NET', 'JavaScript', 'C#', 'C++', 'C', 'VB', 'Dephi', 'Perl', 'Ruby', 'Go', 'ASP', 'Shell']
    #kds = [u'大数据', u'云计算', u'docker', u'中间件', 'Node.js', u'数据挖掘', u'自然语言处理', u'搜索算法', u'精准推荐', u'全栈工程师', u'图像处理', u'机器学习', u'语音识别']
    #kds = ['HTML5', 'Android', 'iOS', u'web前端', 'Flash', 'U3D', 'COCOS2D-X']
    #kds = [u'spark', 'MySQL', 'SQLServer', 'Oracle', 'DB2', 'MongoDB', 'ETL', 'Hive', u'数据仓库', 'Hadoop']
    #kds = [u'大数据', u'云计算', u'docker', u'中间件']
    #kd = kds[0]

    def start_requests(self):
        # To crawl specific keywords (or a city), POST them as well:
        # for self.kd in self.kds:
        #     scrapy.http.FormRequest(self.position_url,
        #                             formdata={'pn': str(self.curpage), 'kd': self.kd},
        #                             callback=self.parse)
        return [scrapy.http.FormRequest(self.position_url,
                                        formdata={'pn': str(self.curpage)},  # first page
                                        callback=self.parse)]

    def parse(self, response):
        #print response.body
        jdict = json.loads(response.body)
        jcontent = jdict["content"]
        jposresult = jcontent["positionResult"]
        jresult = jposresult["result"]
        # totalCount claims ~230,000 positions, but everything beyond the newest
        # ~5000 is honeypot data (duplicates), so cap the count at 5000.
        self.totalPageCount = min(jposresult['totalCount'], 5000) / 15 + 1  # /15 normally; /150 was used for testing
        print(self.totalPageCount)
        for each in jresult:
            item = LagouItem()  # a fresh item per position
            item['city'] = each['city']
            item['positionId'] = each['positionId']
            item['companyLogo'] = each['companyLogo']
            item['workYear'] = each['workYear']
            item['education'] = each['education']
            item['jobNature'] = each['jobNature']
            item['financeStage'] = each['financeStage']
            item['district'] = each['district']
            item['deliver'] = each['deliver']
            item['createTime'] = each['createTime']
            item['industryField'] = each['industryField']
            #item['showCount'] = each['showCount']
            item['pcShow'] = each['pcShow']
            item['appShow'] = each['appShow']
            item['score'] = each['score']
            item['companyShortName'] = each['companyShortName']
            item['companySize'] = each['companySize']
            item['positionName'] = each['positionName']
            #item['positionType'] = each['positionType']
            # turn a salary string "ak-bk" into min/max/avg values (a, b, (a+b)/2)
            # todo: move into a standalone util function (a sketch, parse_salary, is appended below)
            salary = each['salary'].split('-')
            if len(salary) == 1:
                item['salaryMax'] = int(salary[0][:salary[0].find('k')])
                item['salaryMin'] = item['salaryMax']  # single figure: min == max
            else:
                item['salaryMax'] = int(salary[1][:salary[1].find('k')])
                item['salaryMin'] = int(salary[0][:salary[0].find('k')])
            item['salaryAvg'] = (item['salaryMin'] + item['salaryMax']) / 2
            item['positionAdvantage'] = each['positionAdvantage']
            item['companyLabelList'] = each['companyLabelList']
            # item['keyword'] = self.kd
            yield item
        if self.curpage <= self.totalPageCount:
            self.curpage += 1  # move on to the next page
            #if self.curpage == 335:
            #    self.curpage = 5000
            print("current page {}".format(self.curpage))
            yield scrapy.http.FormRequest(
                self.position_url,
                # formdata={'pn': str(self.curpage), 'kd': self.kd}, callback=self.parse)
                formdata={'pn': str(self.curpage)},  # pages 335 through 5000 come back empty
                callback=self.parse)
        # crawling several keywords in one run (disabled):
        '''
        elif self.curkd < len(self.kds):
            self.curpage = 1
            self.totalPageCount = 0
            self.curkd += 1  # advance to the next keyword
            self.kd = self.kds[self.curkd]
            yield scrapy.http.FormRequest(self.position_url,
                                          formdata={'pn': str(self.curpage), 'kd': self.kd},
                                          callback=self.parse)
        '''
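
# A standalone salary parser, sketched by the editor for the "todo" above; it is
# not part of the original project.  It mirrors the in-line logic: "10k-20k"
# yields (10, 20, 15), and a single figure such as "15k以上" yields (15, 15, 15).
def parse_salary(salary):
    """Return (salary_min, salary_max, salary_avg) in units of 1k RMB per month."""
    parts = salary.split('-')
    low = int(parts[0][:parts[0].find('k')])
    high = int(parts[-1][:parts[-1].find('k')]) if len(parts) > 1 else low
    return low, high, (low + high) / 2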
--------------------------------------------------------------------------------