├── jd
    ├── data
    │   └── .gitkeep
    ├── jd
    │   ├── __init__.py
    │   ├── spiders
    │   │   ├── __init__.py
    │   │   ├── identity.py
    │   │   └── comment.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── parse.py
    │   ├── middlewares.py
    │   └── settings.py
    ├── README.md
    └── scrapy.cfg
├── lagou
    ├── __init__.py
    ├── README.md
    ├── parse.py
    ├── manage.py
    ├── https.py
    └── setting.py
├── README.md
├── requirements.txt
└── .gitignore


/jd/data/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/jd/jd/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/lagou/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 


--------------------------------------------------------------------------------
/jd/README.md:
--------------------------------------------------------------------------------
1 | ## www.jd.com
2 | 京东商城物品id及评论信息爬虫


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## 前言
2 | 一些小爬虫汇总，不定期更新
3 | 
4 | ## 爬虫列表
5 | ### 拉勾网职位信息
6 | ### 京东商城物品评论
7 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | demjson>=2.2.4
2 | lxml>=3.7.3
3 | multiprocessing>=2.6.2.1
4 | requests>=2.14.2
5 | scrapy>=1.4.0


--------------------------------------------------------------------------------
/lagou/README.md:
--------------------------------------------------------------------------------
1 | # www.lagou.com-spider
2 | 拉勾网职位信息爬虫<br>详细见我的博客http://blog.csdn.net/sinat_33741547/article/details/54847950
3 | 


--------------------------------------------------------------------------------
/jd/jd/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # project temp files
 2 | *.pyc
 3 | *.log
 4 | _build/
 5 | temp/
 6 | 
 7 | # pycharm temp files
 8 | .idea/
 9 | 
10 | # mac temp files
11 | .DS_Store
12 | 
13 | # wheel temp files
14 | build/
15 | *.egg-info/
16 | 
17 | # Vim temp files
18 | *.swp
19 | *.swo
20 | 


--------------------------------------------------------------------------------
/jd/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = jd.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = jd
12 | 


--------------------------------------------------------------------------------
/jd/jd/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class JdItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     info = scrapy.Field()
15 | 


--------------------------------------------------------------------------------
/jd/jd/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | import os
 8 | 
 9 | 
10 | class JdPipeline(object):
11 |     def open_spider(self, spider):
12 |         path = os.path.join(os.getcwd(), 'data', '{}.txt'.format(spider.name))
13 |         self.fa = open(path, 'a+')
14 | 
15 |     def close_spider(self, spider):
16 |         self.fa.close()
17 | 
18 |     def process_item(self, item, spider):
19 |         self.fa.write('{}\n'.format(str(item['info'])))
20 | 


--------------------------------------------------------------------------------
/jd/jd/spiders/identity.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | from scrapy.http import Request
 4 | from jd.parse import parse_info
 5 | from jd.items import JdItem
 6 | from jd.settings import PD_MAPS
 7 | 
 8 | 
 9 | class IdentitySpider(scrapy.Spider):
10 |     name = 'identity'
11 |     allowed_domains = ['www.jd.com']
12 |     start_urls = []
13 | 
14 |     def start_requests(self):
15 |         for _, url in PD_MAPS.items():
16 |             yield Request(url)
17 | 
18 |     def parse(self, response):
19 |         item = JdItem()
20 |         ids = parse_info(response.body, 'item.jd.com.*?(\\d+)\.html', 1)
21 |         for i in ids:
22 |             item['info'] = i.strip()
23 |             yield item
24 | 


--------------------------------------------------------------------------------
/jd/jd/spiders/comment.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | import demjson
 4 | import scrapy
 5 | from scrapy.http import Request
 6 | from jd.items import JdItem
 7 | from jd.settings import CM_URL, SC_MAPS
 8 | 
 9 | 
10 | class CommentSpider(scrapy.Spider):
11 |     name = 'comment'
12 |     allowed_domains = []
13 |     start_urls = ['http://www.jd.com/']
14 | 
15 |     def parse(self, response):
16 |         path = os.path.join(os.getcwd(), 'data', 'identity.txt')
17 |         fr = open(path, 'r')
18 |         urls = [(CM_URL.format(i.strip(), s, 0), s) for i in fr.readlines() for _, s in SC_MAPS.items()]
19 |         for url, score in urls:
20 |             yield Request(url, meta={'page': 0, 'score': score}, callback=self.parse_comment)
21 | 
22 |     def parse_comment(self, response):
23 |         item = JdItem()
24 |         meta = response.meta
25 |         response_json = demjson.decode(txt=response.text, encoding='utf-8')
26 |         if meta['page'] < int(response_json['maxPage']) and meta['page'] < 100:
27 |             meta['page'] += 1
28 |             url = u'='.join(response._url.split(u'=')[:-1])+u'='+str(meta['page'])
29 |             yield Request(url, meta=meta, callback=self.parse_comment)
30 |         for c in response_json['comments']:
31 |             content = ''.join(c['content']).strip().replace(u'\n', u'').replace(u'\r', u'').encode('utf-8')
32 |             item['info'] = '{} {}'.format(meta['score'], content)
33 |             yield item
34 | 


--------------------------------------------------------------------------------
/jd/jd/parse.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import re
 3 | import hashlib
 4 | 
 5 | 
 6 | def hash_md5(string):
 7 |     md5 = hashlib.md5()
 8 |     try:
 9 |         md5.update(string.encode('utf-8'))
10 |     except Exception as e:
11 |         md5.update(string)
12 |     return md5.hexdigest()
13 | 
14 | 
15 | def parse_tool(content):
16 |     '''
17 |     清除html标签
18 |     :return:
19 |     '''
20 |     if type(content) != str: return content
21 |     sublist = ['<p.*?>', '</p.*?>', '<b.*?>', '</b.*?>', '<div.*?>', '</div.*?>',
22 |                '</br>', '<br />', '<ul>', '</ul>', '<li>', '</li>', '<strong>',
23 |                '</strong>', '<table.*?>', '<tr.*?>', '</tr>', '<td.*?>', '</td>',
24 |                '\r', '\n', '&.*?;', '&', '#.*?;', '<em>', '</em>', '<dt>', '</dt>',
25 |                '<dd>', '</dd>', '<a.*?>', '</a.*?>', '<span.*?>', '</span.*?>',
26 |                '<th.*?>', '</th.*?>', '<label.*?>', '</label.*?>', '<h4.*?>',
27 |                '<font.*?>', '</font.*?>', '<thread.*?>', '</thread.*?>',
28 |                '</tbody.*?>', '<tbody.*?>', '</table.*?>', '</h4.*?>']
29 |     try:
30 |         for substring in [re.compile(string, re.S) for string in sublist]:
31 |             content = re.sub(substring, "", content).strip()
32 |     except Exception as e:
33 |         print('parse_tool:' + str(e))
34 |     finally:
35 |         return content
36 | 
37 | 
38 | def extract_str(items):
39 |     content = ''
40 |     for item in items:
41 |         content += str(item.encode('utf-8'))
42 |     content = parse_tool(content)
43 |     return content
44 | 
45 | 
46 | def parse_info(r, p, i=0):
47 |     '''
48 |     解析信息
49 |     :return:
50 |     '''
51 |     try:
52 |         pattern = re.compile(p, re.S)
53 |         items = re.findall(pattern, r)
54 |         if not i:
55 |             item = items[i]
56 |         else:
57 |             item = items
58 |     except Exception as e:
59 |         print('parse_info:' + str(e))
60 |         item = ''
61 |     finally:
62 |         return item
63 | 


--------------------------------------------------------------------------------
/jd/jd/middlewares.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your spider middleware
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
 7 | 
 8 | from scrapy import signals
 9 | 
10 | 
11 | class JdSpiderMiddleware(object):
12 |     # Not all methods need to be defined. If a method is not defined,
13 |     # scrapy acts as if the spider middleware does not modify the
14 |     # passed objects.
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         # This method is used by Scrapy to create your spiders.
19 |         s = cls()
20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 |         return s
22 | 
23 |     def process_spider_input(self, response, spider):
24 |         # Called for each response that goes through the spider
25 |         # middleware and into the spider.
26 | 
27 |         # Should return None or raise an exception.
28 |         return None
29 | 
30 |     def process_spider_output(self, response, result, spider):
31 |         # Called with the results returned from the Spider, after
32 |         # it has processed the response.
33 | 
34 |         # Must return an iterable of Request, dict or Item objects.
35 |         for i in result:
36 |             yield i
37 | 
38 |     def process_spider_exception(self, response, exception, spider):
39 |         # Called when a spider or process_spider_input() method
40 |         # (from other spider middleware) raises an exception.
41 | 
42 |         # Should return either None or an iterable of Response, dict
43 |         # or Item objects.
44 |         pass
45 | 
46 |     def process_start_requests(self, start_requests, spider):
47 |         # Called with the start requests of the spider, and works
48 |         # similarly to the process_spider_output() method, except
49 |         # that it doesn’t have a response associated.
50 | 
51 |         # Must return only requests (not items).
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
57 | 


--------------------------------------------------------------------------------
/lagou/parse.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import demjson
 3 | 
 4 | class Parse:
 5 |     '''
 6 |     解析网页信息
 7 |     '''
 8 |     def __init__(self, htmlCode):
 9 |         self.htmlCode = htmlCode
10 |         self.json = demjson.decode(htmlCode)
11 |         pass
12 | 
13 | 
14 |     def parseTool(self,content):
15 |         '''
16 |         清除html标签
17 |         '''
18 |         if type(content) != str: return content
19 |         sublist = ['<p.*?>','</p.*?>','<b.*?>','</b.*?>','<div.*?>','</div.*?>',
20 |                    '</br>','<br />','<ul>','</ul>','<li>','</li>','<strong>',
21 |                    '</strong>','<table.*?>','<tr.*?>','</tr>','<td.*?>','</td>',
22 |                    '\r','\n','&.*?;','&','#.*?;','<em>','</em>']
23 |         try:
24 |             for substring in [re.compile(string, re.S) for string in sublist]:
25 |                 content = re.sub(substring, "", content).strip()
26 |         except:
27 |             raise Exception('Error '+str(substring.pattern))
28 |         return content
29 | 
30 | 
31 |     def parsePage(self):
32 |         '''
33 |         解析并计算页面数量
34 |         :return: 页面数量
35 |         '''
36 |         totalCount = self.json['content']['positionResult']['totalCount']      #职位总数量
37 |         resultSize = self.json['content']['positionResult']['resultSize']      #每一页显示的数量
38 |         pageCount = int(totalCount) // int(resultSize) + 1          #页面数量
39 |         return pageCount
40 | 
41 | 
42 |     def parseInfo(self):
43 |         '''
44 |         解析信息
45 |         '''
46 |         info = []
47 |         for position in self.json['content']['positionResult']['result']:
48 |             i = {}
49 |             i['companyName'] = position['companyFullName']
50 |             i['companyDistrict'] = position['district']
51 |             i['companyLabel'] = position['companyLabelList']
52 |             i['companySize'] = position['companySize']
53 |             i['companyStage'] = position['financeStage']
54 |             i['companyType'] = position['industryField']
55 |             i['positionType'] = position['firstType']
56 |             i['positionEducation'] = position['education']
57 |             i['positionAdvantage'] = position['positionAdvantage']
58 |             i['positionSalary'] = position['salary']
59 |             i['positionWorkYear'] = position['workYear']
60 |             info.append(i)
61 |         return info
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/lagou/manage.py:
--------------------------------------------------------------------------------
 1 | from lagou.https import Http
 2 | from lagou.parse import Parse
 3 | from lagou.setting import headers as hd
 4 | from lagou.setting import cookies as ck
 5 | import time
 6 | import logging
 7 | logging.basicConfig(level=logging.ERROR,
 8 |                     format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
 9 |                     datefmt='%Y-%m-%d %H:%M:%S',
10 |                     filename='diary.log',
11 |                     filemode='a')
12 | 
13 | 
14 | def getInfo(url, para):
15 |     """
16 |     获取信息
17 |     """
18 |     generalHttp = Http()
19 |     htmlCode = generalHttp.post(url, para=para, headers=hd, cookies=ck)
20 |     generalParse = Parse(htmlCode)
21 |     pageCount = generalParse.parsePage()
22 |     info = []
23 |     for i in range(1, pageCount+1):
24 |         print('第%s页' % i)
25 |         para['pn'] = str(i)
26 |         htmlCode = generalHttp.post(url, para=para, headers=hd, cookies=ck)
27 |         generalParse = Parse(htmlCode)
28 |         info = info + getInfoDetail(generalParse)
29 |         time.sleep(2)
30 |     return info
31 | 
32 | 
33 | def getInfoDetail(generalParse):
34 |     """
35 |     信息解析
36 |     """
37 |     info = generalParse.parseInfo()
38 |     return info
39 | 
40 | 
41 | def processInfo(info, para):
42 |     """
43 |     信息存储
44 |     """
45 |     logging.error('Process start')
46 |     try:
47 |         title = 'companyName,companyType,companyStage,companyLabel,companySize,companyDistrict,' \
48 |                 'positionType,positionEducation,positionAdvantage,positionSalary,positionWorkYear\n'
49 |         file = open('%s.txt' % para['city'], 'a')
50 |         file.write(title)
51 |         for p in info:
52 |             line = str(p['companyName']) + ',' + str(p['companyType']) + ',' + str(p['companyStage']) + ',' + \
53 |                    str(p['companyLabel']) + ',' + str(p['companySize']) + ',' + str(p['companyDistrict']) + ',' + \
54 |                    str(p['positionType']) + ',' + str(p['positionEducation']) + ',' + str(p['positionAdvantage']) + ',' +\
55 |                    str(p['positionSalary']) + ',' + str(p['positionWorkYear']) + '\n'
56 |             file.write(line)
57 |         file.close()
58 |         return True
59 |     except:
60 |         logging.error('Process except')
61 |         return None
62 | 
63 | 
64 | def main(url, para):
65 |     """
66 |     主函数逻辑
67 |     """
68 |     logging.error('Main start')
69 |     if url:
70 |         info = getInfo(url, para)             # 获取信息
71 |         flag = processInfo(info, para)             # 信息储存
72 |         return flag
73 |     else:
74 |         return None
75 | 
76 | 
77 | if __name__ == '__main__':
78 |     kdList = [u'数据分析']
79 |     cityList = [u'广州', u'深圳']
80 |     url = 'https://www.lagou.com/jobs/positionAjax.json'
81 |     for city in cityList:
82 |         print('爬取%s' % city)
83 |         para = {'first': 'true','pn': '1', 'kd': kdList[0], 'city': city}
84 |         flag = main(url, para)
85 |         if flag: print('%s爬取成功' % city)
86 |         else: print('%s爬取失败' % city)
87 | 


--------------------------------------------------------------------------------
/lagou/https.py:
--------------------------------------------------------------------------------
  1 | from lagou.setting import IP,UA
  2 | import requests,random
  3 | import logging
  4 | logging.basicConfig(level=logging.ERROR,
  5 |                     format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
  6 |                     datefmt='%Y-%m-%d %H:%M:%S',
  7 |                     filename='diary.log',
  8 |                     filemode='a')
  9 | class Http:
 10 |     '''
 11 |     http请求相关的操作
 12 |     '''
 13 |     def __init__(self):
 14 |         pass
 15 | 
 16 |     def get(self, url, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
 17 |         '''
 18 |         获取网页源码
 19 |         url: 网页链接
 20 |         headers: headers
 21 |         cookies: cookies
 22 |         proxy: 代理
 23 |         timeOut: 请求超时时间
 24 |         timeOutRetry: 超时重试次数
 25 |         return: 源码
 26 |         '''
 27 |         if not url:
 28 |             logging.error('GetError url not exit')
 29 |             return 'None'
 30 |         logging.error('Get %s' % url)
 31 |         try:
 32 |             if not headers: headers = {'User-Agent': UA[random.randint(0, len(UA)-1)]}
 33 |             #if not proxy: proxy = {'http':"http://"+IP[random.randint(0, len(IP)-1)]}
 34 |             response = requests.get(url, headers=headers, cookies=cookies, proxies=proxy, timeout=timeOut)
 35 |             if response.status_code == 200 or response.status_code == 302:
 36 |                 htmlCode = response.text
 37 |             else:
 38 |                 htmlCode = 'None'
 39 |             logging.error('Get %s %s' % (str(response.status_code), url))
 40 |         except Exception as e:
 41 |             logging.error('GetExcept %s' % str(e))
 42 |             if timeOutRetry > 0:
 43 |                 htmlCode = self.get(url=url, timeOutRetry=(timeOutRetry-1))
 44 |             else:
 45 |                 logging.error('GetTimeOut %s' % url)
 46 |                 htmlCode = 'None'
 47 |         return htmlCode
 48 | 
 49 |     def post(self,url, para, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
 50 |         '''
 51 |         post获取响应
 52 |         url: 目标链接
 53 |         para: 参数
 54 |         headers: headers
 55 |         cookies: cookies
 56 |         proxy: 代理
 57 |         timeOut: 请求超时时间
 58 |         timeOutRetry: 超时重试次数
 59 |         return: 响应
 60 |         '''
 61 |         if not url or not para:
 62 |             logging.error('PostError url or para not exit')
 63 |             return None
 64 |         logging.error('Post %s' % url)
 65 |         try:
 66 |             if not headers:
 67 |                 headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3'}
 68 |             response = requests.post(url, data=para, headers=headers, cookies=cookies, proxies=proxy, timeout=timeOut)
 69 |             if response.status_code == 200 or response.status_code == 302:
 70 |                 htmlCode = response.text
 71 |             else:
 72 |                 htmlCode = None
 73 |             logging.error('Post %s %s' % (str(response.status_code), url))
 74 |         except Exception as e:
 75 |             logging.error('PostExcept %s' % str(e))
 76 |             if timeOutRetry > 0:
 77 |                 htmlCode = self.post(url=url, para=para, timeOutRetry=(timeOutRetry-1))
 78 |             else:
 79 |                 logging.error('PostTimeOut %s' % url)
 80 |                 htmlCode = None
 81 |         return htmlCode
 82 | 
 83 |     def confirm(self, htmlCode, url, headers, cookies,proxy,catch_retry=5):
 84 |         '''
 85 |         反爬，验证页面
 86 |         htmlCode:网页源码
 87 |         return:网页源码
 88 |         '''
 89 |         #获取网页title判断是否被ban
 90 |         return htmlCode
 91 | 
 92 |     def urlprocess(self,items):
 93 |         # +    URL 中+号表示空格               %2B
 94 |         # 空格 URL中的空格可以用+号或者编码    %20
 95 |         # /    分隔目录和子目录                %2F
 96 |         # ?    分隔实际的URL和参数             %3F
 97 |         # %    指定特殊字符                    %25
 98 |         # #    表示书签                        %23
 99 |         # &    URL 中指定的参数间的分隔符      %26
100 |         # =    URL 中指定参数的值              %3D
101 |         content = items.replace('&#047;','%2F').replace('&#061;','%3D').replace('+','%2B').replace(\
102 |                                 ' ','%20').replace('/','%2F').replace('?','%3F').replace('=','%3D')
103 |         return content
104 | 


--------------------------------------------------------------------------------
/jd/jd/settings.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Scrapy settings for jd project
  4 | #
  5 | # For simplicity, this file contains only settings considered important or
  6 | # commonly used. You can find more settings consulting the documentation:
  7 | #
  8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
  9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 11 | 
 12 | BOT_NAME = 'jd'
 13 | 
 14 | SPIDER_MODULES = ['jd.spiders']
 15 | NEWSPIDER_MODULE = 'jd.spiders'
 16 | 
 17 | 
 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
 19 | #USER_AGENT = 'jd (+http://www.yourdomain.com)'
 20 | 
 21 | # Obey robots.txt rules
 22 | ROBOTSTXT_OBEY = False
 23 | 
 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
 25 | #CONCURRENT_REQUESTS = 32
 26 | 
 27 | # Configure a delay for requests for the same website (default: 0)
 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 29 | # See also autothrottle settings and docs
 30 | DOWNLOAD_DELAY = 1
 31 | # The download delay setting will honor only one of:
 32 | CONCURRENT_REQUESTS_PER_DOMAIN = 16
 33 | #CONCURRENT_REQUESTS_PER_IP = 16
 34 | 
 35 | # Disable cookies (enabled by default)
 36 | #COOKIES_ENABLED = False
 37 | 
 38 | # Disable Telnet Console (enabled by default)
 39 | #TELNETCONSOLE_ENABLED = False
 40 | 
 41 | # Override the default request headers:
 42 | DEFAULT_REQUEST_HEADERS = {
 43 |     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 44 |     'Accept-Language': 'en',
 45 |     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
 46 | }
 47 | 
 48 | # Enable or disable spider middlewares
 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 50 | # SPIDER_MIDDLEWARES = {
 51 | #    'jd.middlewares.JdSpiderMiddleware': 543,
 52 | # }
 53 | 
 54 | # Enable or disable downloader middlewares
 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 56 | #DOWNLOADER_MIDDLEWARES = {
 57 | #    'jd.middlewares.MyCustomDownloaderMiddleware': 543,
 58 | #}
 59 | 
 60 | # Enable or disable extensions
 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
 62 | #EXTENSIONS = {
 63 | #    'scrapy.extensions.telnet.TelnetConsole': None,
 64 | #}
 65 | 
 66 | # Configure item pipelines
 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 68 | ITEM_PIPELINES = {
 69 |    'jd.pipelines.JdPipeline': 300,
 70 | }
 71 | 
 72 | # Enable and configure the AutoThrottle extension (disabled by default)
 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
 74 | #AUTOTHROTTLE_ENABLED = True
 75 | # The initial download delay
 76 | #AUTOTHROTTLE_START_DELAY = 5
 77 | # The maximum download delay to be set in case of high latencies
 78 | #AUTOTHROTTLE_MAX_DELAY = 60
 79 | # The average number of requests Scrapy should be sending in parallel to
 80 | # each remote server
 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 82 | # Enable showing throttling stats for every response received:
 83 | #AUTOTHROTTLE_DEBUG = False
 84 | 
 85 | # Enable and configure HTTP caching (disabled by default)
 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 87 | #HTTPCACHE_ENABLED = True
 88 | #HTTPCACHE_EXPIRATION_SECS = 0
 89 | #HTTPCACHE_DIR = 'httpcache'
 90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 92 | 
 93 | # 各类商品链接
 94 | PD_MAPS = {'家电': 'https://jiadian.jd.com/',
 95 |         '手机': 'https://shouji.jd.com/',
 96 |         '运营商': 'https://wt.jd.com/',
 97 |         '数码': 'https://shuma.jd.com/',
 98 |         '电脑': 'https://diannao.jd.com/',
 99 |         '办公': 'https://bg.jd.com/',
100 |         '家居': 'https://channel.jd.com/home.html',
101 |         '家具': 'https://channel.jd.com/furniture.html',
102 |         '家装': 'https://channel.jd.com/decoration.html'}
103 | 
104 | # 评论
105 | CM_URL = 'https://sclub.jd.com/comment/productPageComments.action?productId={}&score={}&sortType=5&pageSize=10&isShadowSku=0&fold=1&page={}'
106 | SC_MAPS = {'好评': '3',
107 |            '中评': '2',
108 |            '差评': '1'}
109 | 


--------------------------------------------------------------------------------
/lagou/setting.py:
--------------------------------------------------------------------------------
 1 | #HEADER
 2 | headers = {'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
 3 |            'Accept-Encoding': 'gzip, deflate',
 4 |            'Host': 'www.lagou.com',
 5 |            'Origin': 'http://www.lagou.com',
 6 |            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
 7 |            'X-Requested-With': 'XMLHttpRequest',
 8 |            'Referer': 'http://www.lagou.com',
 9 |            'Proxy-Connection': 'keep-alive',
10 |            'X-Anit-Forge-Code': '0',
11 |            'X-Anit-Forge-Token': None}
12 | 
13 | #COOKIES
14 | cookies = {'JSESSIONID': '99021FFD6F8EC6B6CD209754427DEA93',
15 |            '_gat': '1',
16 |            'user_trace_token': '20170203041008-9835aec2-e983-11e6-8a36-525400f775ce',
17 |            'PRE_UTM': '',
18 |            'PRE_HOST': '',
19 |            'PRE_SITE': '',
20 |            'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2Fzhaopin%2F',
21 |            'LGUID': '20170203041008-9835b1c9-e983-11e6-8a36-525400f775ce',
22 |            'SEARCH_ID': 'bfed7faa3a0244cc8dc1bb361f0e8e35',
23 |            'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1486066203',
24 |            'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1486066567',
25 |            '_ga': 'GA1.2.2003702965.1486066203',
26 |            'LGSID': '20170203041008-9835b03a-e983-11e6-8a36-525400f775ce',
27 |            'LGRID': '20170203041612-714b1ea3-e984-11e6-8a36-525400f775ce'}
28 | 
29 | # IP池
30 | # 0(pay) or 1(free) or 2(None)
31 | TAGIP = 0
32 | 
33 | # IP
34 | IP = []
35 | 
36 | # UA
37 | UA = ['Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.5 Safari/534.55.3',
38 |       'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0;\
39 |        Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1))',
40 | 
41 |       'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; \
42 |       Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)',
43 | 
44 |       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; \
45 |       Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ;  QIHU 360EE)',
46 | 
47 |       'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; \
48 |       Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; 360SE)',
49 | 
50 |       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; 360SE)',
51 |       'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
52 |       'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
53 |       'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13',
54 |       'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
55 |       'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
56 |       'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
57 |       'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1',
58 |       'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
59 | 
60 |       'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; \
61 |       SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
62 | 
63 |       'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
64 |       'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
65 | 
66 |       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 \
67 |       (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
68 | 
69 |       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
70 |       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
71 | 
72 |       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) \
73 |       Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
74 | 
75 |       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) \
76 |       Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
77 | 
78 |       'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; \
79 |       .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ',
80 | 
81 |       'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; \
82 |       .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
83 | 
84 |       'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)',
85 |       'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ',
86 |       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',
87 | 
88 |       'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) \
89 |       Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
90 | 
91 |       'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0',
92 | 
93 |       'Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) \
94 |       Version/5.0.2 Mobile/8C148 Safari/6533.18.5',
95 | 
96 |       'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
97 |       'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
98 |       'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)']
99 | 


--------------------------------------------------------------------------------