├── .gitignore
├── 58pbdn.py
├── CookiesOfZhihu
│   ├── README.rst
│   ├── config.ini
│   └── login.py
├── JDspider
│   ├── JDspider
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── SpiderofJD.py
│   │       └── __init__.py
│   └── scrapy.cfg
├── exercisebs4.py
├── pbdnof58
│   ├── .idea
│   │   ├── modules.xml
│   │   ├── pbdnof58.iml
│   │   └── workspace.xml
│   ├── README.rst
│   ├── pbdnof58
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── 58Urlspider.py
│   │       ├── 58spider-redis.py
│   │       └── __init__.py
│   └── scrapy.cfg
└── tayleswift.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *.pyc
3 | .DS_Store
4 |
--------------------------------------------------------------------------------
/58pbdn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'qcl'
3 |
4 | from bs4 import BeautifulSoup
5 | import requests
6 |
7 | ListOfUrl = []
8 | def GetUrl(url):
9 | web_info = requests.get(url).text
10 | soup = BeautifulSoup(web_info, 'html5lib')
11 | listofa = soup.select('#infolist > table > tbody > tr')
12 | for i in listofa:
13 | for j in i.find_all('td'):
14 | if j['class'] == ['tc']:
15 | if j.get_text() == '':
16 | for k in i.find_all('td'):
17 | if k['class'] == ['t']:
18 | ListOfUrl.append(k.a['href'])
19 |
20 | def GetAllInfo(start, end):
21 | for i in xrange(start, end + 1):
22 |         url = 'http://bj.58.com/pbdn/0/pn' + str(i)
23 | GetUrl(url)
24 | for i in ListOfUrl:
25 | wb_info = requests.get(i).text
26 | soup = BeautifulSoup(wb_info, 'lxml')
27 | # titles = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1 ')
28 | title = soup.title.text
29 | # time = soup.select('#index_show > ul.mtit_con_left.fl > li.time')
30 | time = soup.select('.time')
31 | price = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li > div.su_con > span')
32 | quality = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span')
33 | area = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span > a')
34 | print title, time[0].get_text(), price[0].get_text(), quality[0].get_text().strip()
35 | if len(area) == 2:
36 | print area[0].get_text() + '-' + area[1].get_text()
37 | elif len(area) == 1:
38 | print area[0].get_text()
39 | print '----------------------------------------------------------------------------'
40 | print 'done', len(ListOfUrl)
41 |
42 |
43 | GetAllInfo(1, 20)
44 |
45 |
--------------------------------------------------------------------------------
/CookiesOfZhihu/README.rst:
--------------------------------------------------------------------------------
1 | Zhihu may change its cookie values over time, so you have to update the values in the [cookies] section of config.ini by hand.
2 |
--------------------------------------------------------------------------------
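The README above says the cookie values in config.ini go stale and have to be refreshed by hand. For reference, here is a minimal sketch of how those values get used, mirroring what login.py does and assuming config.ini sits in the working directory: the whole [cookies] section is read with ConfigParser, turned into a plain dict, and handed to requests.

    # Sketch (not a file in this repo): reuse the [cookies] section of config.ini with requests.
    import ConfigParser
    import requests

    cf = ConfigParser.ConfigParser()
    cf.read('config.ini')                        # assumes config.ini is in the current directory
    cookies = dict(cf.items('cookies'))          # e.g. {'q_c1': '...', '_xsrf': '...', ...}

    session = requests.session()
    resp = session.get('http://www.zhihu.com', cookies=cookies)
    print resp.status_code                       # quick check that the request went through

If the response turns out to be the login page instead of your feed, copy fresh cookie values from the browser back into the [cookies] section.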
/CookiesOfZhihu/config.ini:
--------------------------------------------------------------------------------
1 | [info]
2 | email = xxxxxxx
3 | password = xxxx
4 |
5 | [cookies]
6 | l_n_c=1
7 | q_c1=650ddc07bd754017820871e5188ac1db|1463311811000|1463311811000
8 | _xsrf=a5b69090b2bdc67fd4987318958219e4
9 | cap_id="MDVhMzIzZDQ1MWFhNDk3ZDkwMjU3MmNkNWI1MDUzMzk=|1463311811|595e134256102926f52ce541ee1b0b302e832e93"
10 | l_cap_id="YmZkYWYxN2M1MjUwNDgwZmJiZDNkMWIxMzg2NzI0ZDc=|1463311811|639b2df5e9affa6aa901ca5e6f9354e185a32e3b"
11 | d_c0="ABCAZD0f7QmPTtosO-72HbNdU6o9uCkyom8=|1463311812"
12 | _zap=23eb1f89-b696-4775-92e0-11a289d24cc8
13 | _za=d7b6603b-6391-435d-b83f-ae5eef570fda
14 | __utmt=1
15 | login="MjJjNjdhNTlmZGIzNGJmYzkzMmYwZGMzZmExYjdkZGQ=|1463311835|7820d87c772e131fd637936a0eaa93b76cc0a3db"
16 | z_c0=Mi4wQUFBQVYyODFBQUFBRUlCa1BSX3RDUmNBQUFCaEFsVk44T3BmVndCZGlod2ktVVh2WDluM1oxa2xlb1FKMWt3R1FR|1463311856|b45e94045da643cff4775b4a8036070dcff8bfd4
17 | __utma=51854390.635799030.1463311831.1463311831.1463311831.1
18 | __utmb=51854390.4.10.1463311831
19 | __utmc=51854390
20 | __utmz=51854390.1463311831.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)
21 | __utmv=51854390.100-1|2=registration_date=20140820=1^3=entry_date=20140820=1
--------------------------------------------------------------------------------
/CookiesOfZhihu/login.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''
3 | Crawler login with username/password and captcha: scraping zhihu.com
4 | '''
5 | import requests
6 | import ConfigParser
7 |
8 | def create_session():
9 | cf = ConfigParser.ConfigParser()
10 | cf.read('config.ini')
11 | cookies = cf.items('cookies')
12 | cookies = dict(cookies)
13 | print cookies
14 | email = cf.get('info', 'email')
15 | password = cf.get('info', 'password')
16 |
17 | session = requests.session()
18 | login_data = {'email': email, 'password': password}
19 | header = {
20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36',
21 | 'Host': 'www.zhihu.com',
22 | 'Referer': 'http://www.zhihu.com/'
23 | }
24 | r = session.post('http://www.zhihu.com/login/email', data=login_data, headers=header)
25 | print r.json()
26 | if r.json()['r'] == 1:
27 |         print 'Login failed (see the response above for the reason),'
28 |         print 'falling back to the cookies from config.ini...'
29 | has_cookies = False
30 | for key in cookies:
31 | if key != '__name__' and cookies[key] != '':
32 | has_cookies = True
33 | break
34 | if has_cookies is False:
35 |             raise ValueError('Please fill in the [cookies] section of config.ini.')
36 | else:
37 |             # r = requests.get('http://www.zhihu.com/login/email', cookies=cookies)  # log in with the saved cookies
38 |             r = session.get('http://www.zhihu.com/login/email', cookies=cookies)  # log in with the saved cookies
39 |
40 | with open('login.html', 'w') as fp:
41 | fp.write(r.content)
42 |
43 | return session, cookies
44 |
45 |
46 | if __name__ == '__main__':
47 | requests_session, requests_cookies = create_session()
48 |
49 | # url = 'http://www.zhihu.com/login/email'
50 | url = 'http://www.zhihu.com/topic/19552832'
51 |     # content = requests_session.get(url).content  # not logged in
52 |     # content = requests.get(url, cookies=requests_cookies).content  # logged in
53 |     content = requests_session.get(url, cookies=requests_cookies).content  # logged in
54 | with open('url.html', 'w') as fp:
55 | fp.write(content)
56 |
--------------------------------------------------------------------------------
/JDspider/JDspider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qcl643062/spider/d83ab1d597440bae27c90bf1436e620cfdd233f2/JDspider/JDspider/__init__.py
--------------------------------------------------------------------------------
/JDspider/JDspider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 | from scrapy.loader import ItemLoader
10 | from scrapy.loader.processors import MapCompose, TakeFirst, Join
11 |
12 |
13 | class JdspiderItem(scrapy.Item):
14 | # define the fields for your item here like:
15 | title = scrapy.Field()
16 | desc = scrapy.Field()
17 | price = scrapy.Field()
18 | haoping = scrapy.Field()
19 | zhongping = scrapy.Field()
20 | chaping = scrapy.Field()
21 | shaitu = scrapy.Field()
22 |
23 | class JDspiderLoader(ItemLoader):
24 | default_item_class = JdspiderItem
25 | default_input_processor = MapCompose(lambda s: s.strip())
26 | default_output_processor = TakeFirst()
27 | description_out = Join()
28 |
--------------------------------------------------------------------------------
/JDspider/JDspider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class JdspiderPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/JDspider/JDspider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for JDspider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'JDspider'
13 |
14 | SPIDER_MODULES = ['JDspider.spiders']
15 | NEWSPIDER_MODULE = 'JDspider.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'JDspider (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'JDspider.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'JDspider.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'JDspider.pipelines.JdspiderPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/JDspider/JDspider/spiders/SpiderofJD.py:
--------------------------------------------------------------------------------
1 | from scrapy_redis.spiders import RedisSpider
2 | from JDspider.items import JDspiderLoader
3 | from splinter import Browser
4 | from scrapy import log
5 |
6 |
7 |
8 | class Myspider(RedisSpider):
9 | '''spider that reads urls from redis queue (myspider:start_urls).'''
10 | name = 'jdspider'
11 | redis_key = 'jdspider_urls'
12 |
13 | def __init__(self, *args, **kwargs):
14 | domain = kwargs.pop('domain', '')
15 |         self.allowed_domains = filter(None, domain.split(','))
16 | super(Myspider, self).__init__(*args, **kwargs)
17 |
18 | def parse(self, response):
19 | el = JDspiderLoader(response=response)
20 | el.add_xpath('title', '//*[@id="name"]/h1/text()')
21 | with Browser() as browser:
22 | url = response.url
23 | browser.visit(url)
24 | price = browser.find_by_id('jd-price')
25 | if price == []:
26 | price = browser.find_by_xpath('//*[@id="price"]/strong')
27 | # self.log(price[0].value, level=log.DEBUG)
28 | el.add_value('price', price[0].value[1:])
29 | with Browser() as browser:
30 | number = response.url.split('/')[-1].split('.')[0]
31 |             url = 'http://club.jd.com/review/' + number + '-2-1.html'  # product review page, used for the rating counts below
32 | browser.visit(url)
33 | shaitu = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[5]/a/em')
34 | el.add_value('shaitu', shaitu[0].value[1:-1])
35 | haoping = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[2]/a/em')
36 | el.add_value('haoping', haoping[0].value[1:-1])
37 | zhongping = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[3]/a/em')
38 | el.add_value('zhongping', zhongping[0].value[1:-1])
39 | chaping = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[4]/a/em')
40 | el.add_value('chaping', chaping[0].value[1:-1])
41 | return el.load_item()
42 |
43 |
--------------------------------------------------------------------------------
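Because Myspider is a scrapy_redis RedisSpider, it sits idle until product URLs appear under the jdspider_urls key it declares. A minimal seeding sketch with the redis-py client; the product URL is only an example of the item.jd.com/<sku>.html pattern that the parse method splits apart:

    # Sketch (not a file in this repo): push one JD product page onto the spider's queue.
    from redis import Redis

    r = Redis()  # assumes a local Redis on the default host/port
    r.lpush('jdspider_urls', 'http://item.jd.com/1217500.html')  # example SKU URL
    # then start the spider with:  scrapy crawl jdspider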
/JDspider/JDspider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/JDspider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = JDspider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = JDspider
12 |
--------------------------------------------------------------------------------
/exercisebs4.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'qcl'
3 |
4 | # Data comes from xiaozhu.com short-term rentals in Hangzhou for April 14-16. 300 listings are collected; if you end up with fewer than 300,
5 | # remember to remove duplicates. Known issue: some hosts post the same listing several times and no deduplication is done for that yet (a small dedup sketch follows this file).
6 |
7 |
8 | from bs4 import BeautifulSoup
9 | import requests
10 |
11 | ListOfUrls = []
12 | page = 1
13 | LenOfUrls = 0
14 | while len(ListOfUrls) != 300:
15 | url = 'http://hz.xiaozhu.com/search-duanzufang-p'+ str(page) + '-0/?startDate=2016-04-15&endDate=2016-04-16'
16 | print url
17 | duanzu = requests.get(url)
18 | soup = BeautifulSoup(duanzu.text, 'html5lib')
19 | urls = soup.select('#page_list > ul > li > a')
20 | try:
21 | for url1 in urls:
22 | ListOfUrls.append(url1['href'])
23 | if len(ListOfUrls) == 300:
24 | break
25 | except:
26 | print url1, page
27 | finally:
28 | page += 1
29 | for i in ListOfUrls:
30 | data = requests.get(i)
31 | soup = BeautifulSoup(data.text, 'lxml')
32 | title = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
33 | address = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5')
34 | price = soup.select('#pricePart > div.day_l > span')
35 | PhotoOfHome = soup.select('#detailImageBox > div.pho_show_r > div > ul > li:nth-of-type(1) > img')
36 | NameOfHoster = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
37 | sex = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
38 | PhotoOfHoster = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
39 | print '网址 : %s' % (i.encode('utf-8'))
40 | print '标题 : %s' % (title[0].get_text().encode('utf-8'))
41 | print '地址 : %s' % (address[0].get_text().encode('utf-8'))
42 | print '价钱 : %s' % (price[0].get_text().encode('utf-8'))
43 | print '房间照片 : %s' % (PhotoOfHome[0]['data-bigimg'].encode('utf-8'))
44 | print '房东名字 : %s' % (NameOfHoster[0].get_text().encode('utf-8'))
45 | print '房东照片 : %s' % (PhotoOfHoster[0]['src'].encode('utf-8'))
46 | if sex[0]['class'] == ['member_ico']:
47 | sex = '男'
48 | else:
49 | sex = '女'
50 |     print '房东性别 : %s' % sex
51 | print '_______________________________________________________________________________________________'
52 |
53 |
--------------------------------------------------------------------------------
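The header comment of exercisebs4.py notes that the same listing can be posted several times and that nothing filters the duplicates out. A small sketch of how ListOfUrls could be deduplicated before the detail-page loop, kept here as a suggestion rather than edited into the script:

    # Sketch (not part of exercisebs4.py): drop repeated URLs while keeping first-seen order.
    def dedupe(urls):
        seen = set()
        unique = []
        for u in urls:
            if u not in seen:
                seen.add(u)
                unique.append(u)
        return unique

    # usage, right before the 'for i in ListOfUrls:' loop:
    # ListOfUrls = dedupe(ListOfUrls)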
/pbdnof58/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/pbdnof58/.idea/pbdnof58.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/pbdnof58/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/pbdnof58/README.rst:
--------------------------------------------------------------------------------
1 | A distributed crawler built with scrapy-redis; the scraped data is stored in MySQL. Redis and MySQL must be installed and configured beforehand.
2 |
3 | From the project directory, running scrapy crawl myspider_58page crawls the list pages one after another until the last one. The initial list page has to be seeded into Redis first (see the sketch after this file).
4 |
5 | Running scrapy crawl myspider_58 crawls the individual sellers' listings and stores them in the corresponding MySQL table.
6 |
7 | Both spiders can be started in several instances at once for multi-process crawling.
8 |
9 |
--------------------------------------------------------------------------------
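As the README says, nothing happens until an initial list page is pushed into Redis. A minimal seeding sketch with the redis-py client the spiders already import; the key name comes from redis_key = 'myspider:58_urls' in 58Urlspider.py, and the pn1 URL is the first list page of the same section that 58pbdn.py crawls:

    # Sketch (not a file in this repo): seed the queue that myspider_58page reads from.
    from redis import Redis

    r = Redis()  # assumes a local Redis on the default host/port
    r.lpush('myspider:58_urls', 'http://bj.58.com/pbdn/0/pn1')
    # myspider_58page then pushes detail-page links onto 'myspider:start_urls',
    # which is the queue myspider_58 (58spider-redis.py) consumes.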
/pbdnof58/pbdnof58/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qcl643062/spider/d83ab1d597440bae27c90bf1436e620cfdd233f2/pbdnof58/pbdnof58/__init__.py
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 |
9 | import scrapy
10 | from scrapy.loader import ItemLoader
11 | from scrapy.loader.processors import MapCompose, TakeFirst, Join
12 |
13 |
14 | class Pbdnof58Item(scrapy.Item):
15 | # define the fields for your item here like:
16 | title = scrapy.Field()
17 | time = scrapy.Field()
18 | price = scrapy.Field()
19 | quality = scrapy.Field()
20 | area = scrapy.Field()
21 | UrlofPage = scrapy.Field()
22 |
23 | class Pbdnof58Loader(ItemLoader):
24 | default_item_class = Pbdnof58Item
25 | default_input_processor = MapCompose(lambda s: s.strip())
26 | default_output_processor = TakeFirst()
27 | description_out = Join()
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from scrapy.exceptions import DropItem
8 | from twisted.enterprise import adbapi
9 | import MySQLdb
10 | import MySQLdb.cursors
11 | from scrapy import log
12 |
13 |
14 | class CleanPipeline(object):
15 |
16 | def __init__(self):
17 | self.has = set()
18 |
19 |     def process_item(self, item, spider):
20 |         if len(item) < 5:  # drop items that are missing any of the expected fields
21 |             raise DropItem("Incomplete item: %s" % item)
22 |         key = tuple(sorted(item.items()))  # scrapy Items are not hashable, so dedupe on their values
23 |         if key in self.has:
24 |             raise DropItem("Duplicate item found: %s" % item)
25 |         self.has.add(key)
26 |         return item
27 |
28 | class MySQLPipeline(object):
29 | def __init__(self, dbpool):
30 | self.dbpool = dbpool
31 |
32 | @classmethod
33 | def from_settings(cls, settings):
34 | dbargs = dict(
35 | host=settings['MYSQL_HOST'],
36 | db=settings['MYSQL_DBNAME'],
37 | user=settings['MYSQL_USER'],
38 | passwd=settings['MYSQL_PASSWD'],
39 | charset='utf8',
40 | cursorclass = MySQLdb.cursors.DictCursor,
41 | use_unicode= True,
42 | )
43 | dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
44 | return cls(dbpool)
45 |
46 | def process_item(self, item, spider):
47 | d = self.dbpool.runInteraction(self.__do__insert, item, spider)
48 | d.addBoth(lambda _: item)
49 | return d
50 |
51 | def __do__insert(self, conn, item, spider):
52 | try:
53 | conn.execute("""
54 | insert into 58pbdndb set title = %s, area = %s, price = %s, quality = %s, time = %s
55 | """, (item['title'], item['area'], item['price'], item['quality'], item['time']))
56 |
57 | except MySQLdb.Error, e:
58 | spider.log("Mysql Error %d: %s" % (e.args[0], e.args[1]), level=log.DEBUG)
59 |
60 |
61 |
--------------------------------------------------------------------------------
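MySQLPipeline above assumes that the 58pbdndb table already exists in the qcl database configured in settings.py. A sketch of a compatible schema; the table and column names are fixed by the INSERT statement, but the column types are only assumptions:

    # Sketch (not a file in this repo): create a table the pipeline can insert into.
    import MySQLdb

    conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='qcl', charset='utf8')
    cur = conn.cursor()
    cur.execute("""
        CREATE TABLE IF NOT EXISTS `58pbdndb` (
            title   VARCHAR(255),
            time    VARCHAR(64),
            price   VARCHAR(64),
            quality VARCHAR(64),
            area    VARCHAR(128)
        ) DEFAULT CHARSET=utf8
    """)
    conn.commit()
    conn.close()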
/pbdnof58/pbdnof58/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for pbdnof58 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'pbdnof58'
13 |
14 | SPIDER_MODULES = ['pbdnof58.spiders']
15 | NEWSPIDER_MODULE = 'pbdnof58.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'pbdnof58 (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'pbdnof58.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'pbdnof58.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'pbdnof58.pipelines.CleanPipeline': 300,
69 | 'pbdnof58.pipelines.MySQLPipeline': 400,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
93 | # start MySQL database configure setting
94 | MYSQL_HOST = 'localhost'
95 | MYSQL_DBNAME = 'qcl'
96 | MYSQL_USER = 'root'
97 | MYSQL_PASSWD = ''
98 | # end of MySQL database configure setting
99 |
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/spiders/58Urlspider.py:
--------------------------------------------------------------------------------
1 | from scrapy_redis.spiders import RedisSpider
2 | from pbdnof58.items import Pbdnof58Loader
3 | from redis import Redis
4 | from scrapy import log
5 | from time import sleep
6 |
7 | class Myspider(RedisSpider):
8 | '''spider that reads urls from redis queue (myspider:start_urls).'''
9 | name = 'myspider_58page'
10 | redis_key = 'myspider:58_urls'
11 |
12 | def __init__(self, *args, **kwargs):
13 | domain = kwargs.pop('domain', '')
14 |         self.allowed_domains = filter(None, domain.split(','))
15 | super(Myspider, self).__init__(*args, **kwargs)
16 | self.url = 'http://bj.58.com'
17 |
18 | def parse(self, response):
19 | el = Pbdnof58Loader(response=response)
20 | PageUrl = response.xpath('//a[contains(@class, "next")]/@href').extract()
21 | self.log(PageUrl, level=log.DEBUG)
22 | r = Redis()
23 | if PageUrl != []:
24 | r.lpush('myspider:58_urls', self.url + PageUrl[0])
25 | sleep(1)
26 | el.add_value('UrlofPage', self.url + PageUrl[0])
27 | urls = response.xpath('//table[contains(@class, "tbimg")]/tr')
28 | for url in urls:
29 | url = url.xpath('td[contains(@class, "t")]/a/@href').extract()
30 | if len(url) == 1 and 'zhuan' not in url[0]:
31 | r.lpush('myspider:start_urls', url[0])
32 | return el.load_item()
33 |
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/spiders/58spider-redis.py:
--------------------------------------------------------------------------------
1 | from scrapy_redis.spiders import RedisSpider
2 | from pbdnof58.items import Pbdnof58Loader
3 | # from scrapy import log
4 |
5 |
6 | class Myspider(RedisSpider):
7 | '''spider that reads urls from redis queue (myspider:start_urls).'''
8 | name = 'myspider_58'
9 | redis_key = 'myspider:start_urls'
10 |
11 | def __init__(self, *args, **kwargs):
12 | domain = kwargs.pop('domain', '')
13 |         self.allowed_domains = filter(None, domain.split(','))
14 | super(Myspider, self).__init__(*args, **kwargs)
15 |
16 |
17 | def parse(self, response):
18 |
19 | el = Pbdnof58Loader(response=response)
20 | el.add_xpath('title', '//h1/text()')
21 |         el.add_xpath('price', '//span[contains(@class, "price c_f50")]/text()')
22 | quality = response.xpath('//ul[contains(@class, "suUl")]/li')
23 | quality = quality[1].xpath('div[contains(@class, "su_con")]/span/text()').extract()[0].strip()
24 | el.add_value('quality', quality)
25 |         area = response.xpath('//span[contains(@class, "c_25d")]/a/text()')
26 | if area == []:
27 | area = 'None'
28 | elif len(area) == 1:
29 | area = area[0].extract()
30 | else:
31 | area = area[0].extract() + '-' + area[1].extract()
32 | el.add_value('area', area)
33 | el.add_xpath('time', '//li[contains(@class, "time")]/text()')
34 | return el.load_item()
35 |
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/pbdnof58/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = pbdnof58.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = pbdnof58
12 |
--------------------------------------------------------------------------------
/tayleswift.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'qcl'
3 |
4 | from urllib import urlretrieve
5 | from bs4 import BeautifulSoup
6 | import requests
7 |
8 | listofpic = []
9 | def getpiclist(url):
10 | wb_data = requests.get(url).text
11 | soup = BeautifulSoup(wb_data, 'html5lib')
12 | urls = soup.select('#main-container > div > div.grid-thumb.grid-responsive > div > div > div > a > img')
13 | for url in urls:
14 | listofpic.append(url['src'])
15 | def getpicture(first, last):
16 | for i in xrange(first, last + 1):
17 | url = 'http://weheartit.com/inspirations/taylorswift?page=' + str(i)
18 | getpiclist(url)
19 | for i in xrange(len(listofpic)):
20 | postfix = listofpic[i].split('.')[-1]
21 | local = 'E:\\tayleswift\\tayleswift' + str(i) + '.' + postfix
22 | urlretrieve(listofpic[i], local)
23 | print 'done'
24 | getpicture(1, 20)
25 |
26 |
--------------------------------------------------------------------------------