├── jd
│   ├── data
│   │   └── .gitkeep
│   ├── jd
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   ├── identity.py
│   │   │   └── comment.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── parse.py
│   │   ├── middlewares.py
│   │   └── settings.py
│   ├── README.md
│   └── scrapy.cfg
├── lagou
│   ├── __init__.py
│   ├── README.md
│   ├── parse.py
│   ├── manage.py
│   ├── https.py
│   └── setting.py
├── README.md
├── requirements.txt
└── .gitignore
/jd/data/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/jd/jd/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/lagou/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/jd/README.md:
--------------------------------------------------------------------------------
## www.jd.com
Crawler for JD.com product ids and product comment data.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Overview
A small collection of web crawlers, updated from time to time.

## Crawler list
### LaGou (www.lagou.com) job listings
### JD.com product comments
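## Quick start (sketch)
A minimal way to run everything, assuming the layout above with the dependencies
from requirements.txt installed; the exact commands below are an illustration,
not part of the original project:

```python
# Run the JD spiders first (identity collects product ids, comment needs them),
# then the LaGou crawler. Assumes Scrapy's command-line tool is on PATH and that
# this snippet is started from the repository root.
import subprocess

subprocess.run(["scrapy", "crawl", "identity"], cwd="jd", check=True)
subprocess.run(["scrapy", "crawl", "comment"], cwd="jd", check=True)
subprocess.run(["python", "-m", "lagou.manage"], check=True)
```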
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
demjson>=2.2.4
lxml>=3.7.3
multiprocessing>=2.6.2.1
requests>=2.14.2
scrapy>=1.4.0
--------------------------------------------------------------------------------
/lagou/README.md:
--------------------------------------------------------------------------------
# www.lagou.com-spider
Crawler for LaGou (www.lagou.com) job listings.
See my blog post for details: http://blog.csdn.net/sinat_33741547/article/details/54847950
--------------------------------------------------------------------------------
/jd/jd/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# project temp files
*.pyc
*.log
_build/
temp/

# pycharm temp files
.idea/

# mac temp files
.DS_Store

# wheel temp files
build/
*.egg-info/

# Vim temp files
*.swp
*.swo
--------------------------------------------------------------------------------
/jd/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = jd.settings

[deploy]
#url = http://localhost:6800/
project = jd
--------------------------------------------------------------------------------
/jd/jd/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class JdItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    info = scrapy.Field()
--------------------------------------------------------------------------------
/jd/jd/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os


class JdPipeline(object):
    def open_spider(self, spider):
        # each spider appends to its own file under ./data, e.g. data/identity.txt
        path = os.path.join(os.getcwd(), 'data', '{}.txt'.format(spider.name))
        self.fa = open(path, 'a+', encoding='utf-8')

    def close_spider(self, spider):
        self.fa.close()

    def process_item(self, item, spider):
        self.fa.write('{}\n'.format(str(item['info'])))
        return item  # keep the item available to any later pipeline
--------------------------------------------------------------------------------
/jd/jd/spiders/identity.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from jd.parse import parse_info
from jd.items import JdItem
from jd.settings import PD_MAPS


class IdentitySpider(scrapy.Spider):
    name = 'identity'
    # the category landing pages live on several *.jd.com subdomains
    allowed_domains = ['jd.com']
    start_urls = []

    def start_requests(self):
        for _, url in PD_MAPS.items():
            yield Request(url)

    def parse(self, response):
        # collect every product id that appears as item.jd.com/<id>.html
        ids = parse_info(response.text, r'item\.jd\.com.*?(\d+)\.html', 1)
        for i in ids:
            item = JdItem()
            item['info'] = i.strip()
            yield item
--------------------------------------------------------------------------------
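For reference, a toy, hedged illustration of how the parse_info helper used by the
spider above behaves. parse_info is defined in jd/jd/parse.py further down; the HTML
here is made up, not real JD markup, and the snippet assumes it is run from the jd/
project root:

```python
from jd.parse import parse_info

html = ('<a href="https://item.jd.com/123456.html">a</a> '
        '<a href="https://item.jd.com/654321.html">b</a>')
print(parse_info(html, r'item\.jd\.com.*?(\d+)\.html', 1))  # ['123456', '654321']
print(parse_info(html, r'item\.jd\.com.*?(\d+)\.html'))     # '123456' (i=0 returns only the first match)
```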
/jd/jd/spiders/comment.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
import demjson
import scrapy
from scrapy.http import Request
from jd.items import JdItem
from jd.settings import CM_URL, SC_MAPS


class CommentSpider(scrapy.Spider):
    name = 'comment'
    allowed_domains = []
    start_urls = ['http://www.jd.com/']

    def parse(self, response):
        # product ids collected beforehand by the identity spider, one per line
        path = os.path.join(os.getcwd(), 'data', 'identity.txt')
        with open(path, 'r') as fr:
            urls = [(CM_URL.format(i.strip(), s, 0), s)
                    for i in fr.readlines() for _, s in SC_MAPS.items()]
        for url, score in urls:
            yield Request(url, meta={'page': 0, 'score': score}, callback=self.parse_comment)

    def parse_comment(self, response):
        meta = response.meta
        response_json = demjson.decode(txt=response.text, encoding='utf-8')
        # request the next comment page ("page" is the last query parameter of CM_URL)
        if meta['page'] < int(response_json['maxPage']) and meta['page'] < 100:
            meta['page'] += 1
            url = u'='.join(response.url.split(u'=')[:-1]) + u'=' + str(meta['page'])
            yield Request(url, meta=meta, callback=self.parse_comment)
        for c in response_json['comments']:
            content = ''.join(c['content']).strip().replace(u'\n', u'').replace(u'\r', u'')
            item = JdItem()
            item['info'] = '{} {}'.format(meta['score'], content)
            yield item
--------------------------------------------------------------------------------
/jd/jd/parse.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import re
import hashlib


def hash_md5(string):
    md5 = hashlib.md5()
    try:
        md5.update(string.encode('utf-8'))
    except Exception:
        md5.update(string)
    return md5.hexdigest()


def parse_tool(content):
    '''
    Strip HTML tags, entities and stray whitespace from a string
    :return: the cleaned string
    '''
    if type(content) != str: return content
    # regex patterns for the markup that should be removed
    sublist = ['<script.*?>', '</script>', '<style.*?>', '</style>',
               '<p.*?>', '</p>', '<div.*?>', '</div>', '<span.*?>', '</span>',
               '<a.*?>', '</a>', '<img.*?>', '<b>', '</b>', '<strong>', '</strong>',
               '<em>', '</em>', '<br.*?>', '</br>', '<ul.*?>', '</ul>',
               '<li.*?>', '</li>', '<table.*?>', '</table>', '<tr.*?>', '</tr>',
               '<td.*?>', '</td>', '<input.*?>', '</input>',
               '&nbsp;', '\r', '\n', '&.*?;', '&', '#.*?;']
    try:
        for substring in [re.compile(string, re.S) for string in sublist]:
            content = re.sub(substring, "", content).strip()
    except Exception as e:
        print('parse_tool:' + str(e))
    finally:
        return content


def extract_str(items):
    content = ''
    for item in items:
        content += item if isinstance(item, str) else str(item)
    content = parse_tool(content)
    return content


def parse_info(r, p, i=0):
    '''
    Run regex pattern p over the text r.
    i == 0 returns only the first match; any other value returns the full list.
    :return: matched string, list of matches, or '' on error
    '''
    try:
        pattern = re.compile(p, re.S)
        items = re.findall(pattern, r)
        if not i:
            item = items[i]
        else:
            item = items
    except Exception as e:
        print('parse_info:' + str(e))
        item = ''
    finally:
        return item
--------------------------------------------------------------------------------
/jd/jd/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class JdSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
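Before moving on to the lagou files, a short, hedged sketch (not a file from the
repo) of the hand-off the two jd spiders rely on: the identity spider writes one
product id per line to data/identity.txt through JdPipeline, and the comment
spider expands each id into comment-API URLs using CM_URL and SC_MAPS from
jd/jd/settings.py (listed further below). The snippet assumes it is run from the
jd/ project root after `scrapy crawl identity` has finished:

```python
from jd.settings import CM_URL, SC_MAPS

with open('data/identity.txt') as fr:
    ids = [line.strip() for line in fr if line.strip()]

# page 0 of every score bucket (good / neutral / bad) for every product id
first_pages = [CM_URL.format(pid, score, 0)
               for pid in ids for score in SC_MAPS.values()]
print(len(first_pages), first_pages[:1])
```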
/lagou/parse.py:
--------------------------------------------------------------------------------
import re
import demjson

class Parse:
    '''
    Parse the JSON returned by LaGou's position search endpoint
    '''
    def __init__(self, htmlCode):
        self.htmlCode = htmlCode
        self.json = demjson.decode(htmlCode)


    def parseTool(self, content):
        '''
        Strip HTML tags and entities from a string
        '''
        if type(content) != str: return content
        # regex patterns for the markup that should be removed
        sublist = ['<p.*?>', '</p>', '<div.*?>', '</div>', '<span.*?>', '</span>',
                   '<br.*?>', '</br>', '<ul.*?>', '<li.*?>', '</li>', '</ul>',
                   '<b>', '</b>', '<strong>', '</strong>', '<em>', '</em>',
                   '\r', '\n', '&.*?;', '&', '#.*?;', '<a.*?>', '</a>']
        try:
            for substring in [re.compile(string, re.S) for string in sublist]:
                content = re.sub(substring, "", content).strip()
        except:
            raise Exception('Error ' + str(substring.pattern))
        return content


    def parsePage(self):
        '''
        Work out how many result pages there are
        :return: page count
        '''
        totalCount = self.json['content']['positionResult']['totalCount']   # total number of positions
        resultSize = self.json['content']['positionResult']['resultSize']   # positions shown per page
        pageCount = int(totalCount) // int(resultSize) + 1                  # number of pages
        return pageCount


    def parseInfo(self):
        '''
        Pull the interesting fields out of every position in the result list
        '''
        info = []
        for position in self.json['content']['positionResult']['result']:
            i = {}
            i['companyName'] = position['companyFullName']
            i['companyDistrict'] = position['district']
            i['companyLabel'] = position['companyLabelList']
            i['companySize'] = position['companySize']
            i['companyStage'] = position['financeStage']
            i['companyType'] = position['industryField']
            i['positionType'] = position['firstType']
            i['positionEducation'] = position['education']
            i['positionAdvantage'] = position['positionAdvantage']
            i['positionSalary'] = position['salary']
            i['positionWorkYear'] = position['workYear']
            info.append(i)
        return info
--------------------------------------------------------------------------------
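As a hedged illustration of the JSON shape Parse expects: the field names below are
exactly the ones read by parsePage() and parseInfo() above, but the sample response
itself is invented and far smaller than a real positionAjax.json payload.

```python
from lagou.parse import Parse

sample = '''{"content": {"positionResult": {"totalCount": 31, "resultSize": 15,
  "result": [{"companyFullName": "ExampleCo", "district": "Nanshan",
              "companyLabelList": [], "companySize": "50-150", "financeStage": "A",
              "industryField": "Internet", "firstType": "Backend", "education": "BSc",
              "positionAdvantage": "flexible hours", "salary": "15k-25k",
              "workYear": "1-3"}]}}}'''

p = Parse(sample)
print(p.parsePage())                     # 31 // 15 + 1 == 3
print(p.parseInfo()[0]['companyName'])   # 'ExampleCo'
```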
/lagou/manage.py:
--------------------------------------------------------------------------------
from lagou.https import Http
from lagou.parse import Parse
from lagou.setting import headers as hd
from lagou.setting import cookies as ck
import time
import logging
logging.basicConfig(level=logging.ERROR,
                    format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename='diary.log',
                    filemode='a')


def getInfo(url, para):
    """
    Fetch every result page for one query
    """
    generalHttp = Http()
    htmlCode = generalHttp.post(url, para=para, headers=hd, cookies=ck)
    generalParse = Parse(htmlCode)
    pageCount = generalParse.parsePage()
    info = []
    for i in range(1, pageCount+1):
        print('Page %s' % i)
        para['pn'] = str(i)
        htmlCode = generalHttp.post(url, para=para, headers=hd, cookies=ck)
        generalParse = Parse(htmlCode)
        info = info + getInfoDetail(generalParse)
        time.sleep(2)
    return info


def getInfoDetail(generalParse):
    """
    Parse one page of results
    """
    info = generalParse.parseInfo()
    return info


def processInfo(info, para):
    """
    Write the results to <city>.txt as comma-separated lines
    """
    logging.error('Process start')
    try:
        title = 'companyName,companyType,companyStage,companyLabel,companySize,companyDistrict,' \
                'positionType,positionEducation,positionAdvantage,positionSalary,positionWorkYear\n'
        file = open('%s.txt' % para['city'], 'a', encoding='utf-8')
        file.write(title)
        for p in info:
            line = str(p['companyName']) + ',' + str(p['companyType']) + ',' + str(p['companyStage']) + ',' + \
                   str(p['companyLabel']) + ',' + str(p['companySize']) + ',' + str(p['companyDistrict']) + ',' + \
                   str(p['positionType']) + ',' + str(p['positionEducation']) + ',' + str(p['positionAdvantage']) + ',' + \
                   str(p['positionSalary']) + ',' + str(p['positionWorkYear']) + '\n'
            file.write(line)
        file.close()
        return True
    except:
        logging.error('Process except')
        return None


def main(url, para):
    """
    Main entry point: fetch, then store
    """
    logging.error('Main start')
    if url:
        info = getInfo(url, para)        # fetch the listings
        flag = processInfo(info, para)   # store them
        return flag
    else:
        return None


if __name__ == '__main__':
    kdList = [u'数据分析']
    cityList = [u'广州', u'深圳']
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    for city in cityList:
        print('Crawling %s' % city)
        para = {'first': 'true', 'pn': '1', 'kd': kdList[0], 'city': city}
        flag = main(url, para)
        if flag: print('%s crawled successfully' % city)
        else: print('%s crawl failed' % city)
--------------------------------------------------------------------------------
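A hedged sketch of reading back the per-city file that processInfo() writes (not
part of the repo). Two caveats are visible in the code above: the header row is
appended on every run, and fields such as companyLabel hold Python lists whose
repr can itself contain commas, so this is only a rough reader. It also assumes
the file was written as UTF-8, as in the version of processInfo() shown above:

```python
import csv

with open('广州.txt', encoding='utf-8') as f:      # the file is named after para['city']
    for row in csv.reader(f):
        if row and row[0] != 'companyName':        # skip the repeated header rows
            print(row[0], row[-2])                 # companyName, positionSalary
```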
/lagou/https.py:
--------------------------------------------------------------------------------
from lagou.setting import IP, UA
import requests, random
import logging
logging.basicConfig(level=logging.ERROR,
                    format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename='diary.log',
                    filemode='a')
class Http:
    '''
    Helpers for making HTTP requests
    '''
    def __init__(self):
        pass

    def get(self, url, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
        '''
        Fetch a page with GET
        url: page URL
        headers: request headers
        cookies: cookies
        proxy: proxy settings
        timeOut: request timeout in seconds
        timeOutRetry: how many times to retry after a failure
        return: page source, or the string 'None' on failure
        '''
        if not url:
            logging.error('GetError url does not exist')
            return 'None'
        logging.error('Get %s' % url)
        try:
            if not headers: headers = {'User-Agent': UA[random.randint(0, len(UA)-1)]}
            #if not proxy: proxy = {'http': "http://"+IP[random.randint(0, len(IP)-1)]}
            response = requests.get(url, headers=headers, cookies=cookies, proxies=proxy, timeout=timeOut)
            if response.status_code == 200 or response.status_code == 302:
                htmlCode = response.text
            else:
                htmlCode = 'None'
                logging.error('Get %s %s' % (str(response.status_code), url))
        except Exception as e:
            logging.error('GetExcept %s' % str(e))
            if timeOutRetry > 0:
                htmlCode = self.get(url=url, timeOutRetry=(timeOutRetry-1))
            else:
                logging.error('GetTimeOut %s' % url)
                htmlCode = 'None'
        return htmlCode

    def post(self, url, para, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
        '''
        Fetch a response with POST
        url: target URL
        para: form parameters
        headers: request headers
        cookies: cookies
        proxy: proxy settings
        timeOut: request timeout in seconds
        timeOutRetry: how many times to retry after a failure
        return: response body, or None on failure
        '''
        if not url or not para:
            logging.error('PostError url or para does not exist')
            return None
        logging.error('Post %s' % url)
        try:
            if not headers:
                headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3'}
            response = requests.post(url, data=para, headers=headers, cookies=cookies, proxies=proxy, timeout=timeOut)
            if response.status_code == 200 or response.status_code == 302:
                htmlCode = response.text
            else:
                htmlCode = None
                logging.error('Post %s %s' % (str(response.status_code), url))
        except Exception as e:
            logging.error('PostExcept %s' % str(e))
            if timeOutRetry > 0:
                htmlCode = self.post(url=url, para=para, timeOutRetry=(timeOutRetry-1))
            else:
                logging.error('PostTimeOut %s' % url)
                htmlCode = None
        return htmlCode

    def confirm(self, htmlCode, url, headers, cookies, proxy, catch_retry=5):
        '''
        Anti-crawling check of a fetched page
        htmlCode: page source
        return: page source
        '''
        # inspect the page title to decide whether the crawler has been banned (not implemented yet)
        return htmlCode

    def urlprocess(self, items):
        # Manual percent-encoding of characters that are reserved in URLs
        # (a standard-library alternative, urllib.parse.quote, is sketched at the
        # very end of this document):
        # +       stands for a space inside a URL          -> %2B
        # space   may be written as '+' or percent-encoded -> %20
        # /       separates directories                    -> %2F
        # ?       separates the URL from its parameters    -> %3F
        # %       introduces an escaped character          -> %25
        # #       marks a fragment / bookmark              -> %23
        # &       separates query parameters               -> %26
        # =       separates a parameter from its value     -> %3D
        # note: '%' itself is not escaped here, so it must not occur in `items`
        content = items.replace('/', '%2F').replace('=', '%3D').replace('+', '%2B').replace(
            ' ', '%20').replace('?', '%3F')
        return content
--------------------------------------------------------------------------------
/jd/jd/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for jd project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jd'

SPIDER_MODULES = ['jd.spiders']
NEWSPIDER_MODULE = 'jd.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jd (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'jd.middlewares.JdSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'jd.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'jd.pipelines.JdPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download
delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 93 | # 各类商品链接 94 | PD_MAPS = {'家电': 'https://jiadian.jd.com/', 95 | '手机': 'https://shouji.jd.com/', 96 | '运营商': 'https://wt.jd.com/', 97 | '数码': 'https://shuma.jd.com/', 98 | '电脑': 'https://diannao.jd.com/', 99 | '办公': 'https://bg.jd.com/', 100 | '家居': 'https://channel.jd.com/home.html', 101 | '家具': 'https://channel.jd.com/furniture.html', 102 | '家装': 'https://channel.jd.com/decoration.html'} 103 | 104 | # 评论 105 | CM_URL = 'https://sclub.jd.com/comment/productPageComments.action?productId={}&score={}&sortType=5&pageSize=10&isShadowSku=0&fold=1&page={}' 106 | SC_MAPS = {'好评': '3', 107 | '中评': '2', 108 | '差评': '1'} 109 | -------------------------------------------------------------------------------- /lagou/setting.py: -------------------------------------------------------------------------------- 1 | #HEADER 2 | headers = {'content-type': 'application/x-www-form-urlencoded; charset=UTF-8', 3 | 'Accept-Encoding': 'gzip, deflate', 4 | 'Host': 'www.lagou.com', 5 | 'Origin': 'http://www.lagou.com', 6 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36', 7 | 'X-Requested-With': 'XMLHttpRequest', 8 | 'Referer': 'http://www.lagou.com', 9 | 'Proxy-Connection': 'keep-alive', 10 | 'X-Anit-Forge-Code': '0', 11 | 'X-Anit-Forge-Token': None} 12 | 13 | #COOKIES 14 | cookies = {'JSESSIONID': '99021FFD6F8EC6B6CD209754427DEA93', 15 | '_gat': '1', 16 | 'user_trace_token': '20170203041008-9835aec2-e983-11e6-8a36-525400f775ce', 17 | 'PRE_UTM': '', 18 | 'PRE_HOST': '', 19 | 'PRE_SITE': '', 20 | 'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2Fzhaopin%2F', 21 | 'LGUID': '20170203041008-9835b1c9-e983-11e6-8a36-525400f775ce', 22 | 'SEARCH_ID': 'bfed7faa3a0244cc8dc1bb361f0e8e35', 23 | 'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1486066203', 24 | 'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1486066567', 25 | '_ga': 'GA1.2.2003702965.1486066203', 26 | 'LGSID': '20170203041008-9835b03a-e983-11e6-8a36-525400f775ce', 27 | 'LGRID': '20170203041612-714b1ea3-e984-11e6-8a36-525400f775ce'} 28 | 29 | # IP池 30 | # 0(pay) or 1(free) or 2(None) 31 | TAGIP = 0 32 | 33 | # IP 34 | IP = [] 35 | 36 | # UA 37 | UA = ['Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.5 Safari/534.55.3', 38 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0;\ 39 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1))', 40 | 41 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; \ 42 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)', 43 | 44 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; \ 45 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; QIHU 360EE)', 46 | 47 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 
Trident/4.0; \ 48 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; 360SE)', 49 | 50 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; 360SE)', 51 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6', 52 | 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13', 53 | 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13', 54 | 'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)', 55 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3', 56 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 57 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1', 58 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)', 59 | 60 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; \ 61 | SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)', 62 | 63 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)', 64 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)', 65 | 66 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 \ 67 | (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11', 68 | 69 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', 70 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)', 71 | 72 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) \ 73 | Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11', 74 | 75 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) \ 76 | Chrome/21.0.1180.71 Safari/537.1 LBBROWSER', 77 | 78 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; \ 79 | .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ', 80 | 81 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; \ 82 | .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)', 83 | 84 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)', 85 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ', 86 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1', 87 | 88 | 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) \ 89 | Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0', 90 | 91 | 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0', 92 | 93 | 'Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) \ 94 | Version/5.0.2 Mobile/8C148 Safari/6533.18.5', 95 | 96 | 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre', 97 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 98 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)'] 99 | --------------------------------------------------------------------------------
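One closing, hedged note on urlprocess() in lagou/https.py above: the hand-rolled
replace chain covers only some reserved characters and never escapes '%' itself.
The standard library's urllib.parse.quote does the same job more safely; the call
below is an illustration, not code from the repo:

```python
from urllib.parse import quote

print(quote('数据分析 c++/golang?', safe=''))  # '/', '?', ' ', '+' and non-ASCII all get percent-encoded
```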