├── jd
│   ├── data
│   │   └── .gitkeep
│   ├── jd
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   ├── identity.py
│   │   │   └── comment.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── parse.py
│   │   ├── middlewares.py
│   │   └── settings.py
│   ├── README.md
│   └── scrapy.cfg
├── lagou
│   ├── __init__.py
│   ├── README.md
│   ├── parse.py
│   ├── manage.py
│   ├── https.py
│   └── setting.py
├── README.md
├── requirements.txt
└── .gitignore
/jd/data/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/jd/jd/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lagou/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/jd/README.md:
--------------------------------------------------------------------------------
1 | ## www.jd.com
2 | Spider for product ids and comment data on JD.com
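3 |
4 | ### Usage (sketch)
5 | Run the `identity` spider first: `JdPipeline` appends each item to `data/identity.txt`, which the `comment` spider then reads. The snippet below is a minimal, hedged sketch that drives a spider from Python rather than the `scrapy crawl` CLI; run it from this directory (the one containing `scrapy.cfg`).
6 |
7 | ```python
8 | # Minimal sketch: run the `identity` spider programmatically.
9 | from scrapy.crawler import CrawlerProcess
10 | from scrapy.utils.project import get_project_settings
11 |
12 | process = CrawlerProcess(get_project_settings())
13 | process.crawl('identity')   # product ids are written to data/identity.txt
14 | process.start()             # blocks until the crawl finishes
15 | ```
16 | Run the `comment` spider the same way afterwards, in a fresh process.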
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Preface
2 | A collection of small web crawlers, updated from time to time.
3 |
4 | ## Crawler list
5 | ### lagou.com job listings
6 | ### JD.com product comments
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | demjson>=2.2.4
2 | lxml>=3.7.3
3 | requests>=2.14.2
4 | scrapy>=1.4.0
--------------------------------------------------------------------------------
/lagou/README.md:
--------------------------------------------------------------------------------
1 | # www.lagou.com-spider
2 | Spider for job listings on lagou.com
3 | For details, see my blog post: http://blog.csdn.net/sinat_33741547/article/details/54847950
4 |
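5 | ### Usage (sketch)
6 | `manage.py` is the entry point: it POSTs to the positionAjax endpoint through `Http`, parses the JSON with `Parse`, and appends results to `<city>.txt`. Below is a minimal, hedged sketch, assuming it is run from the repository root so that the `lagou.*` imports resolve.
7 |
8 | ```python
9 | # Minimal sketch: reuse manage.main() for a single keyword/city query.
10 | from lagou.manage import main
11 |
12 | url = 'https://www.lagou.com/jobs/positionAjax.json'
13 | para = {'first': 'true', 'pn': '1', 'kd': u'数据分析', 'city': u'广州'}
14 | if main(url, para):      # results are appended to '广州.txt'
15 |     print('crawl finished')
16 | ```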
--------------------------------------------------------------------------------
/jd/jd/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # project temp files
2 | *.pyc
3 | *.log
4 | _build/
5 | temp/
6 |
7 | # pycharm temp files
8 | .idea/
9 |
10 | # mac temp files
11 | .DS_Store
12 |
13 | # wheel temp files
14 | build/
15 | *.egg-info/
16 |
17 | # Vim temp files
18 | *.swp
19 | *.swo
20 |
--------------------------------------------------------------------------------
/jd/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = jd.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = jd
12 |
--------------------------------------------------------------------------------
/jd/jd/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class JdItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | info = scrapy.Field()
15 |
--------------------------------------------------------------------------------
/jd/jd/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import os
8 |
9 |
10 | class JdPipeline(object):
11 | def open_spider(self, spider):
12 | path = os.path.join(os.getcwd(), 'data', '{}.txt'.format(spider.name))
13 | self.fa = open(path, 'a+')
14 |
15 | def close_spider(self, spider):
16 | self.fa.close()
17 |
18 | def process_item(self, item, spider):
19 | self.fa.write('{}\n'.format(str(item['info'])))
20 | return item
--------------------------------------------------------------------------------
/jd/jd/spiders/identity.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.http import Request
4 | from jd.parse import parse_info
5 | from jd.items import JdItem
6 | from jd.settings import PD_MAPS
7 |
8 |
9 | class IdentitySpider(scrapy.Spider):
10 | name = 'identity'
11 | allowed_domains = ['jd.com']
12 | start_urls = []
13 |
14 | def start_requests(self):
15 | for _, url in PD_MAPS.items():
16 | yield Request(url)
17 |
18 | def parse(self, response):
19 | item = JdItem()
20 | ids = parse_info(response.text, r'item\.jd\.com.*?(\d+)\.html', 1)
21 | for i in ids:
22 | item['info'] = i.strip()
23 | yield item
24 |
--------------------------------------------------------------------------------
/jd/jd/spiders/comment.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import demjson
4 | import scrapy
5 | from scrapy.http import Request
6 | from jd.items import JdItem
7 | from jd.settings import CM_URL, SC_MAPS
8 |
9 |
10 | class CommentSpider(scrapy.Spider):
11 | name = 'comment'
12 | allowed_domains = []
13 | start_urls = ['http://www.jd.com/']
14 |
15 | def parse(self, response):
16 | path = os.path.join(os.getcwd(), 'data', 'identity.txt')
17 | with open(path, 'r') as fr:
18 | urls = [(CM_URL.format(i.strip(), s, 0), s) for i in fr.readlines() for _, s in SC_MAPS.items()]
19 | for url, score in urls:
20 | yield Request(url, meta={'page': 0, 'score': score}, callback=self.parse_comment)
21 |
22 | def parse_comment(self, response):
23 | item = JdItem()
24 | meta = response.meta
25 | response_json = demjson.decode(txt=response.text, encoding='utf-8')
26 | if meta['page'] < int(response_json['maxPage']) and meta['page'] < 100:
27 | meta['page'] += 1
28 | url = u'='.join(response.url.split(u'=')[:-1])+u'='+str(meta['page'])
29 | yield Request(url, meta=meta, callback=self.parse_comment)
30 | for c in response_json['comments']:
31 | content = ''.join(c['content']).strip().replace(u'\n', u'').replace(u'\r', u'').encode('utf-8')
32 | item['info'] = '{} {}'.format(meta['score'], content)
33 | yield item
34 |
--------------------------------------------------------------------------------
/jd/jd/parse.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 | import hashlib
4 |
5 |
6 | def hash_md5(string):
7 | md5 = hashlib.md5()
8 | try:
9 | md5.update(string.encode('utf-8'))
10 | except Exception as e:
11 | md5.update(string)
12 | return md5.hexdigest()
13 |
14 |
15 | def parse_tool(content):
16 | '''
17 | Strip HTML tags, entities and line breaks from a string
18 | :return: the cleaned string
19 | '''
20 | if type(content) != str: return content
21 | # patterns to strip: any HTML tag, HTML entities (e.g. '&nbsp;'),
22 | # bare '&', numeric character references ('#...;'),
23 | # and carriage returns / newlines
24 | sublist = ['<.*?>',
25 | '\r', '\n',
26 | '&.*?;',
27 | '&',
28 | '#.*?;']
29 | try:
30 | for substring in [re.compile(string, re.S) for string in sublist]:
31 | content = re.sub(substring, "", content).strip()
32 | except Exception as e:
33 | print('parse_tool:' + str(e))
34 | finally:
35 | return content
36 |
37 |
38 | def extract_str(items):
39 | content = ''
40 | for item in items:
41 | content += str(item.encode('utf-8'))
42 | content = parse_tool(content)
43 | return content
44 |
45 |
46 | def parse_info(r, p, i=0):
47 | '''
48 | Extract regex matches of pattern p from text r: the first match when i == 0, otherwise the full list
49 | :return: a string or a list of strings
50 | '''
51 | try:
52 | pattern = re.compile(p, re.S)
53 | items = re.findall(pattern, r)
54 | if not i:
55 | item = items[i]
56 | else:
57 | item = items
58 | except Exception as e:
59 | print('parse_info:' + str(e))
60 | item = ''
61 | finally:
62 | return item
63 |
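64 | # Example (hypothetical input, assumed usage):
65 | # parse_info('<a href="//item.jd.com/123.html">x</a>', r'item\.jd\.com.*?(\d+)\.html', 1)
66 | # -> ['123']  (with i != 0 the full list of matches is returned)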
--------------------------------------------------------------------------------
/jd/jd/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class JdSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/lagou/parse.py:
--------------------------------------------------------------------------------
1 | import re
2 | import demjson
3 |
4 | class Parse:
5 | '''
6 | Parse the JSON response returned by lagou.com
7 | '''
8 | def __init__(self, htmlCode):
9 | self.htmlCode = htmlCode
10 | self.json = demjson.decode(htmlCode)
11 | pass
12 |
13 |
14 | def parseTool(self,content):
15 | '''
16 | Strip HTML tags from a string
17 | '''
18 | if type(content) != str: return content
19 | # patterns to strip: any HTML tag, HTML entities,
20 | # bare '&', numeric character references and line breaks
21 | sublist = ['<.*?>',
22 | '\r','\n','&.*?;','&','#.*?;']
23 | try:
24 | for substring in [re.compile(string, re.S) for string in sublist]:
25 | content = re.sub(substring, "", content).strip()
26 | except:
27 | raise Exception('Error '+str(substring.pattern))
28 | return content
29 |
30 |
31 | def parsePage(self):
32 | '''
33 | Parse the response and compute the number of result pages
34 | :return: page count
35 | '''
36 | totalCount = self.json['content']['positionResult']['totalCount']  # total number of positions
37 | resultSize = self.json['content']['positionResult']['resultSize']  # results per page
38 | pageCount = (int(totalCount) + int(resultSize) - 1) // int(resultSize)  # page count (ceiling division)
39 | return pageCount
40 |
41 |
42 | def parseInfo(self):
43 | '''
44 | Extract the company and position fields for each result
45 | '''
46 | info = []
47 | for position in self.json['content']['positionResult']['result']:
48 | i = {}
49 | i['companyName'] = position['companyFullName']
50 | i['companyDistrict'] = position['district']
51 | i['companyLabel'] = position['companyLabelList']
52 | i['companySize'] = position['companySize']
53 | i['companyStage'] = position['financeStage']
54 | i['companyType'] = position['industryField']
55 | i['positionType'] = position['firstType']
56 | i['positionEducation'] = position['education']
57 | i['positionAdvantage'] = position['positionAdvantage']
58 | i['positionSalary'] = position['salary']
59 | i['positionWorkYear'] = position['workYear']
60 | info.append(i)
61 | return info
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
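70 | # Example (assumed usage): htmlCode is the JSON text returned by the
71 | # positionAjax endpoint (see manage.py / https.py):
72 | # p = Parse(htmlCode); pages = p.parsePage(); rows = p.parseInfo()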
--------------------------------------------------------------------------------
/lagou/manage.py:
--------------------------------------------------------------------------------
1 | from lagou.https import Http
2 | from lagou.parse import Parse
3 | from lagou.setting import headers as hd
4 | from lagou.setting import cookies as ck
5 | import time
6 | import logging
7 | logging.basicConfig(level=logging.ERROR,
8 | format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
9 | datefmt='%Y-%m-%d %H:%M:%S',
10 | filename='diary.log',
11 | filemode='a')
12 |
13 |
14 | def getInfo(url, para):
15 | """
16 | Fetch all result pages and collect the position info
17 | """
18 | generalHttp = Http()
19 | htmlCode = generalHttp.post(url, para=para, headers=hd, cookies=ck)
20 | generalParse = Parse(htmlCode)
21 | pageCount = generalParse.parsePage()
22 | info = []
23 | for i in range(1, pageCount+1):
24 | print('Page %s' % i)
25 | para['pn'] = str(i)
26 | htmlCode = generalHttp.post(url, para=para, headers=hd, cookies=ck)
27 | generalParse = Parse(htmlCode)
28 | info = info + getInfoDetail(generalParse)
29 | time.sleep(2)
30 | return info
31 |
32 |
33 | def getInfoDetail(generalParse):
34 | """
35 | Parse position info from the page
36 | """
37 | info = generalParse.parseInfo()
38 | return info
39 |
40 |
41 | def processInfo(info, para):
42 | """
43 | Write the collected info to a text file (CSV-style)
44 | """
45 | logging.error('Process start')
46 | try:
47 | title = 'companyName,companyType,companyStage,companyLabel,companySize,companyDistrict,' \
48 | 'positionType,positionEducation,positionAdvantage,positionSalary,positionWorkYear\n'
49 | file = open('%s.txt' % para['city'], 'a')
50 | file.write(title)
51 | for p in info:
52 | line = str(p['companyName']) + ',' + str(p['companyType']) + ',' + str(p['companyStage']) + ',' + \
53 | str(p['companyLabel']) + ',' + str(p['companySize']) + ',' + str(p['companyDistrict']) + ',' + \
54 | str(p['positionType']) + ',' + str(p['positionEducation']) + ',' + str(p['positionAdvantage']) + ',' +\
55 | str(p['positionSalary']) + ',' + str(p['positionWorkYear']) + '\n'
56 | file.write(line)
57 | file.close()
58 | return True
59 | except:
60 | logging.error('Process except')
61 | return None
62 |
63 |
64 | def main(url, para):
65 | """
66 | Main workflow: fetch, then store
67 | """
68 | logging.error('Main start')
69 | if url:
70 | info = getInfo(url, para)  # fetch info
71 | flag = processInfo(info, para)  # store info
72 | return flag
73 | else:
74 | return None
75 |
76 |
77 | if __name__ == '__main__':
78 | kdList = [u'数据分析']
79 | cityList = [u'广州', u'深圳']
80 | url = 'https://www.lagou.com/jobs/positionAjax.json'
81 | for city in cityList:
82 | print('Crawling %s' % city)
83 | para = {'first': 'true','pn': '1', 'kd': kdList[0], 'city': city}
84 | flag = main(url, para)
85 | if flag: print('%s crawled successfully' % city)
86 | else: print('%s crawl failed' % city)
87 |
--------------------------------------------------------------------------------
/lagou/https.py:
--------------------------------------------------------------------------------
1 | from lagou.setting import IP,UA
2 | import requests,random
3 | import logging
4 | logging.basicConfig(level=logging.ERROR,
5 | format='%(asctime)s Process%(process)d:%(thread)d %(message)s',
6 | datefmt='%Y-%m-%d %H:%M:%S',
7 | filename='diary.log',
8 | filemode='a')
9 | class Http:
10 | '''
11 | Helpers for HTTP requests
12 | '''
13 | def __init__(self):
14 | pass
15 |
16 | def get(self, url, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
17 | '''
18 | Fetch page source via GET
19 | url: page URL
20 | headers: request headers
21 | cookies: cookies
22 | proxy: proxy settings
23 | timeOut: request timeout in seconds
24 | timeOutRetry: retries on timeout/exception
25 | return: page source, or 'None' on failure
26 | '''
27 | if not url:
28 | logging.error('GetError url does not exist')
29 | return 'None'
30 | logging.error('Get %s' % url)
31 | try:
32 | if not headers: headers = {'User-Agent': UA[random.randint(0, len(UA)-1)]}
33 | #if not proxy: proxy = {'http':"http://"+IP[random.randint(0, len(IP)-1)]}
34 | response = requests.get(url, headers=headers, cookies=cookies, proxies=proxy, timeout=timeOut)
35 | if response.status_code == 200 or response.status_code == 302:
36 | htmlCode = response.text
37 | else:
38 | htmlCode = 'None'
39 | logging.error('Get %s %s' % (str(response.status_code), url))
40 | except Exception as e:
41 | logging.error('GetExcept %s' % str(e))
42 | if timeOutRetry > 0:
43 | htmlCode = self.get(url=url, timeOutRetry=(timeOutRetry-1))
44 | else:
45 | logging.error('GetTimeOut %s' % url)
46 | htmlCode = 'None'
47 | return htmlCode
48 |
49 | def post(self,url, para, headers=None, cookies=None, proxy=None, timeOut=5, timeOutRetry=5):
50 | '''
51 | Fetch a response via POST
52 | url: target URL
53 | para: form parameters
54 | headers: request headers
55 | cookies: cookies
56 | proxy: proxy settings
57 | timeOut: request timeout in seconds
58 | timeOutRetry: retries on timeout/exception
59 | return: response text, or None on failure
60 | '''
61 | if not url or not para:
62 | logging.error('PostError url or para does not exist')
63 | return None
64 | logging.error('Post %s' % url)
65 | try:
66 | if not headers:
67 | headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3'}
68 | response = requests.post(url, data=para, headers=headers, cookies=cookies, proxies=proxy, timeout=timeOut)
69 | if response.status_code == 200 or response.status_code == 302:
70 | htmlCode = response.text
71 | else:
72 | htmlCode = None
73 | logging.error('Post %s %s' % (str(response.status_code), url))
74 | except Exception as e:
75 | logging.error('PostExcept %s' % str(e))
76 | if timeOutRetry > 0:
77 | htmlCode = self.post(url=url, para=para, timeOutRetry=(timeOutRetry-1))
78 | else:
79 | logging.error('PostTimeOut %s' % url)
80 | htmlCode = None
81 | return htmlCode
82 |
83 | def confirm(self, htmlCode, url, headers, cookies,proxy,catch_retry=5):
84 | '''
85 | Anti-crawl check: verify the page was served normally
86 | htmlCode: page source
87 | return: page source
88 | '''
89 | # check the page title to decide whether the request was blocked
90 | return htmlCode
91 |
92 | def urlprocess(self,items):
93 | # '+'  stands for a space in a URL               -> %2B
94 | # ' '  a space can be written as '+' or %20      -> %20
95 | # '/'  separates directories and subdirectories  -> %2F
96 | # '?'  separates the URL from its parameters     -> %3F
97 | # '%'  introduces an escaped character           -> %25
98 | # '#'  marks an anchor/fragment                  -> %23
99 | # '&'  separates parameters                      -> %26
100 | # '='  separates a parameter name from its value -> %3D
101 | content = items.replace('+','%2B').replace(' ','%20').replace(\
102 | '/','%2F').replace('?','%3F').replace('=','%3D')
103 | return content
104 |
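105 | # Example (assumed usage):
106 | # http = Http()
107 | # html = http.get('https://www.lagou.com/', timeOut=10)
108 | # resp = http.post('https://www.lagou.com/jobs/positionAjax.json',
109 | #                  para={'first': 'true', 'pn': '1', 'kd': u'数据分析', 'city': u'广州'})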
--------------------------------------------------------------------------------
/jd/jd/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for jd project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'jd'
13 |
14 | SPIDER_MODULES = ['jd.spiders']
15 | NEWSPIDER_MODULE = 'jd.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'jd (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 1
31 | # The download delay setting will honor only one of:
32 | CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | DEFAULT_REQUEST_HEADERS = {
43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | 'Accept-Language': 'en',
45 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
46 | }
47 |
48 | # Enable or disable spider middlewares
49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
50 | # SPIDER_MIDDLEWARES = {
51 | # 'jd.middlewares.JdSpiderMiddleware': 543,
52 | # }
53 |
54 | # Enable or disable downloader middlewares
55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
56 | #DOWNLOADER_MIDDLEWARES = {
57 | # 'jd.middlewares.MyCustomDownloaderMiddleware': 543,
58 | #}
59 |
60 | # Enable or disable extensions
61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 |
66 | # Configure item pipelines
67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | 'jd.pipelines.JdPipeline': 300,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
93 | # Product category landing pages (scanned for product ids)
94 | PD_MAPS = {'家电': 'https://jiadian.jd.com/',
95 | '手机': 'https://shouji.jd.com/',
96 | '运营商': 'https://wt.jd.com/',
97 | '数码': 'https://shuma.jd.com/',
98 | '电脑': 'https://diannao.jd.com/',
99 | '办公': 'https://bg.jd.com/',
100 | '家居': 'https://channel.jd.com/home.html',
101 | '家具': 'https://channel.jd.com/furniture.html',
102 | '家装': 'https://channel.jd.com/decoration.html'}
103 |
104 | # Comment API URL template and score map
105 | CM_URL = 'https://sclub.jd.com/comment/productPageComments.action?productId={}&score={}&sortType=5&pageSize=10&isShadowSku=0&fold=1&page={}'
106 | SC_MAPS = {'好评': '3',
107 | '中评': '2',
108 | '差评': '1'}
109 |
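110 | # Example (hypothetical product id):
111 | # CM_URL.format('1234567', SC_MAPS['好评'], 0) requests page 0 of the positive comments for product 1234567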
--------------------------------------------------------------------------------
/lagou/setting.py:
--------------------------------------------------------------------------------
1 | #HEADER
2 | headers = {'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
3 | 'Accept-Encoding': 'gzip, deflate',
4 | 'Host': 'www.lagou.com',
5 | 'Origin': 'http://www.lagou.com',
6 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
7 | 'X-Requested-With': 'XMLHttpRequest',
8 | 'Referer': 'http://www.lagou.com',
9 | 'Proxy-Connection': 'keep-alive',
10 | 'X-Anit-Forge-Code': '0',
11 | 'X-Anit-Forge-Token': None}
12 |
13 | #COOKIES
14 | cookies = {'JSESSIONID': '99021FFD6F8EC6B6CD209754427DEA93',
15 | '_gat': '1',
16 | 'user_trace_token': '20170203041008-9835aec2-e983-11e6-8a36-525400f775ce',
17 | 'PRE_UTM': '',
18 | 'PRE_HOST': '',
19 | 'PRE_SITE': '',
20 | 'PRE_LAND': 'https%3A%2F%2Fwww.lagou.com%2Fzhaopin%2F',
21 | 'LGUID': '20170203041008-9835b1c9-e983-11e6-8a36-525400f775ce',
22 | 'SEARCH_ID': 'bfed7faa3a0244cc8dc1bb361f0e8e35',
23 | 'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1486066203',
24 | 'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1486066567',
25 | '_ga': 'GA1.2.2003702965.1486066203',
26 | 'LGSID': '20170203041008-9835b03a-e983-11e6-8a36-525400f775ce',
27 | 'LGRID': '20170203041612-714b1ea3-e984-11e6-8a36-525400f775ce'}
28 |
29 | # IP pool
30 | # 0(pay) or 1(free) or 2(None)
31 | TAGIP = 0
32 |
33 | # IP
34 | IP = []
35 |
36 | # UA
37 | UA = ['Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.5 Safari/534.55.3',
38 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0;\
39 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1))',
40 |
41 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; \
42 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)',
43 |
44 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; \
45 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; QIHU 360EE)',
46 |
47 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; \
48 | Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; 360SE)',
49 |
50 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; 360SE)',
51 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
52 | 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
53 | 'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13',
54 | 'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
55 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
56 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
57 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1',
58 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
59 |
60 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; \
61 | SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
62 |
63 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
64 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
65 |
66 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 \
67 | (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
68 |
69 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
70 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
71 |
72 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) \
73 | Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
74 |
75 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) \
76 | Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
77 |
78 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; \
79 | .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ',
80 |
81 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; \
82 | .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
83 |
84 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)',
85 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ',
86 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',
87 |
88 | 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) \
89 | Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
90 |
91 | 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0',
92 |
93 | 'Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) \
94 | Version/5.0.2 Mobile/8C148 Safari/6533.18.5',
95 |
96 | 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
97 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
98 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)']
99 |
--------------------------------------------------------------------------------