├── .gitignore
├── 58pbdn.py
├── CookiesOfZhihu
│   ├── README.rst
│   ├── config.ini
│   └── login.py
├── JDspider
│   ├── JDspider
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── SpiderofJD.py
│   │       └── __init__.py
│   └── scrapy.cfg
├── exercisebs4.py
├── pbdnof58
│   ├── .idea
│   │   ├── modules.xml
│   │   ├── pbdnof58.iml
│   │   └── workspace.xml
│   ├── README.rst
│   ├── pbdnof58
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── 58Urlspider.py
│   │       ├── 58spider-redis.py
│   │       └── __init__.py
│   └── scrapy.cfg
└── tayleswift.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *.pyc
3 | .DS_Store
4 |
--------------------------------------------------------------------------------
/58pbdn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'qcl'
3 |
4 | from bs4 import BeautifulSoup
5 | import requests
6 |
7 | ListOfUrl = []
8 | def GetUrl(url):
9 | web_info = requests.get(url).text
10 | soup = BeautifulSoup(web_info, 'html5lib')
11 | listofa = soup.select('#infolist > table > tbody > tr')
12 | for i in listofa:
13 | for j in i.find_all('td'):
14 | if j['class'] == ['tc']:
15 | if j.get_text() == '':
16 | for k in i.find_all('td'):
17 | if k['class'] == ['t']:
18 | ListOfUrl.append(k.a['href'])
19 |
20 | def GetAllInfo(start, end):
21 | for i in xrange(start, end + 1):
22 |         url = 'http://bj.58.com/pbdn/0/pn' + str(i)
23 | GetUrl(url)
24 | for i in ListOfUrl:
25 | wb_info = requests.get(i).text
26 | soup = BeautifulSoup(wb_info, 'lxml')
27 | # titles = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1 ')
28 | title = soup.title.text
29 | # time = soup.select('#index_show > ul.mtit_con_left.fl > li.time')
30 | time = soup.select('.time')
31 | price = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li > div.su_con > span')
32 | quality = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span')
33 | area = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span > a')
34 | print title, time[0].get_text(), price[0].get_text(), quality[0].get_text().strip()
35 | if len(area) == 2:
36 | print area[0].get_text() + '-' + area[1].get_text()
37 | elif len(area) == 1:
38 | print area[0].get_text()
39 | print '----------------------------------------------------------------------------'
40 | print 'done', len(ListOfUrl)
41 |
42 |
43 | GetAllInfo(1, 20)
44 |
45 |
--------------------------------------------------------------------------------
/CookiesOfZhihu/README.rst:
--------------------------------------------------------------------------------
1 | Zhihu may change its cookie values over time, so you have to update the values in the [cookies] section of config.ini by hand.
2 |
--------------------------------------------------------------------------------
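The README above says the cookie values in config.ini go stale and have to be refreshed by hand. For reference, here is a minimal sketch of how those values get used, mirroring what login.py does and assuming config.ini sits in the working directory: the whole [cookies] section is read with ConfigParser, turned into a plain dict, and handed to requests.

    # Sketch (not a file in this repo): reuse the [cookies] section of config.ini with requests.
    import ConfigParser
    import requests

    cf = ConfigParser.ConfigParser()
    cf.read('config.ini')                        # assumes config.ini is in the current directory
    cookies = dict(cf.items('cookies'))          # e.g. {'q_c1': '...', '_xsrf': '...', ...}

    session = requests.session()
    resp = session.get('http://www.zhihu.com', cookies=cookies)
    print resp.status_code                       # quick check that the request went through

If the response turns out to be the login page instead of your feed, copy fresh cookie values from the browser back into the [cookies] section.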
/CookiesOfZhihu/config.ini:
--------------------------------------------------------------------------------
1 | [info]
2 | email = xxxxxxx
3 | password = xxxx
4 |
5 | [cookies]
6 | l_n_c=1
7 | q_c1=650ddc07bd754017820871e5188ac1db|1463311811000|1463311811000
8 | _xsrf=a5b69090b2bdc67fd4987318958219e4
9 | cap_id="MDVhMzIzZDQ1MWFhNDk3ZDkwMjU3MmNkNWI1MDUzMzk=|1463311811|595e134256102926f52ce541ee1b0b302e832e93"
10 | l_cap_id="YmZkYWYxN2M1MjUwNDgwZmJiZDNkMWIxMzg2NzI0ZDc=|1463311811|639b2df5e9affa6aa901ca5e6f9354e185a32e3b"
11 | d_c0="ABCAZD0f7QmPTtosO-72HbNdU6o9uCkyom8=|1463311812"
12 | _zap=23eb1f89-b696-4775-92e0-11a289d24cc8
13 | _za=d7b6603b-6391-435d-b83f-ae5eef570fda
14 | __utmt=1
15 | login="MjJjNjdhNTlmZGIzNGJmYzkzMmYwZGMzZmExYjdkZGQ=|1463311835|7820d87c772e131fd637936a0eaa93b76cc0a3db"
16 | z_c0=Mi4wQUFBQVYyODFBQUFBRUlCa1BSX3RDUmNBQUFCaEFsVk44T3BmVndCZGlod2ktVVh2WDluM1oxa2xlb1FKMWt3R1FR|1463311856|b45e94045da643cff4775b4a8036070dcff8bfd4
17 | __utma=51854390.635799030.1463311831.1463311831.1463311831.1
18 | __utmb=51854390.4.10.1463311831
19 | __utmc=51854390
20 | __utmz=51854390.1463311831.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)
21 | __utmv=51854390.100-1|2=registration_date=20140820=1^3=entry_date=20140820=1
--------------------------------------------------------------------------------
/CookiesOfZhihu/login.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''
3 | Crawler login with username/password and captcha: scraping zhihu.com
4 | '''
5 | import requests
6 | import ConfigParser
7 |
8 | def create_session():
9 | cf = ConfigParser.ConfigParser()
10 | cf.read('config.ini')
11 | cookies = cf.items('cookies')
12 | cookies = dict(cookies)
13 | print cookies
14 | email = cf.get('info', 'email')
15 | password = cf.get('info', 'password')
16 |
17 | session = requests.session()
18 | login_data = {'email': email, 'password': password}
19 | header = {
20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36',
21 | 'Host': 'www.zhihu.com',
22 | 'Referer': 'http://www.zhihu.com/'
23 | }
24 | r = session.post('http://www.zhihu.com/login/email', data=login_data, headers=header)
25 | print r.json()
26 | if r.json()['r'] == 1:
27 |         print 'Login failed (see the response above for the reason),'
28 |         print 'falling back to the cookies from config.ini...'
29 | has_cookies = False
30 | for key in cookies:
31 | if key != '__name__' and cookies[key] != '':
32 | has_cookies = True
33 | break
34 | if has_cookies is False:
35 |             raise ValueError('Please fill in the [cookies] section of config.ini.')
36 | else:
37 |             # r = requests.get('http://www.zhihu.com/login/email', cookies=cookies)  # log in with the saved cookies
38 |             r = session.get('http://www.zhihu.com/login/email', cookies=cookies)  # log in with the saved cookies
39 |
40 | with open('login.html', 'w') as fp:
41 | fp.write(r.content)
42 |
43 | return session, cookies
44 |
45 |
46 | if __name__ == '__main__':
47 | requests_session, requests_cookies = create_session()
48 |
49 | # url = 'http://www.zhihu.com/login/email'
50 | url = 'http://www.zhihu.com/topic/19552832'
51 |     # content = requests_session.get(url).content  # not logged in
52 |     # content = requests.get(url, cookies=requests_cookies).content  # logged in
53 |     content = requests_session.get(url, cookies=requests_cookies).content  # logged in
54 | with open('url.html', 'w') as fp:
55 | fp.write(content)
56 |
--------------------------------------------------------------------------------
/JDspider/JDspider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qcl643062/spider/d83ab1d597440bae27c90bf1436e620cfdd233f2/JDspider/JDspider/__init__.py
--------------------------------------------------------------------------------
/JDspider/JDspider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 | from scrapy.loader import ItemLoader
10 | from scrapy.loader.processors import MapCompose, TakeFirst, Join
11 |
12 |
13 | class JdspiderItem(scrapy.Item):
14 | # define the fields for your item here like:
15 | title = scrapy.Field()
16 | desc = scrapy.Field()
17 | price = scrapy.Field()
18 | haoping = scrapy.Field()
19 | zhongping = scrapy.Field()
20 | chaping = scrapy.Field()
21 | shaitu = scrapy.Field()
22 |
23 | class JDspiderLoader(ItemLoader):
24 | default_item_class = JdspiderItem
25 | default_input_processor = MapCompose(lambda s: s.strip())
26 | default_output_processor = TakeFirst()
27 | description_out = Join()
28 |
--------------------------------------------------------------------------------
/JDspider/JDspider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class JdspiderPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/JDspider/JDspider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for JDspider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'JDspider'
13 |
14 | SPIDER_MODULES = ['JDspider.spiders']
15 | NEWSPIDER_MODULE = 'JDspider.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'JDspider (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'JDspider.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'JDspider.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'JDspider.pipelines.JdspiderPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/JDspider/JDspider/spiders/SpiderofJD.py:
--------------------------------------------------------------------------------
1 | from scrapy_redis.spiders import RedisSpider
2 | from JDspider.items import JDspiderLoader
3 | from splinter import Browser
4 | from scrapy import log
5 |
6 |
7 |
8 | class Myspider(RedisSpider):
9 | '''spider that reads urls from redis queue (myspider:start_urls).'''
10 | name = 'jdspider'
11 | redis_key = 'jdspider_urls'
12 |
13 | def __init__(self, *args, **kwargs):
14 | domain = kwargs.pop('domain', '')
15 |         self.allowed_domains = filter(None, domain.split(','))
16 | super(Myspider, self).__init__(*args, **kwargs)
17 |
18 | def parse(self, response):
19 | el = JDspiderLoader(response=response)
20 | el.add_xpath('title', '//*[@id="name"]/h1/text()')
21 | with Browser() as browser:
22 | url = response.url
23 | browser.visit(url)
24 | price = browser.find_by_id('jd-price')
25 | if price == []:
26 | price = browser.find_by_xpath('//*[@id="price"]/strong')
27 | # self.log(price[0].value, level=log.DEBUG)
28 | el.add_value('price', price[0].value[1:])
29 | with Browser() as browser:
30 | number = response.url.split('/')[-1].split('.')[0]
31 |             url = 'http://club.jd.com/review/' + number + '-2-1.html'  # product review page, used for the rating counts below
32 | browser.visit(url)
33 | shaitu = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[5]/a/em')
34 | el.add_value('shaitu', shaitu[0].value[1:-1])
35 | haoping = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[2]/a/em')
36 | el.add_value('haoping', haoping[0].value[1:-1])
37 | zhongping = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[3]/a/em')
38 | el.add_value('zhongping', zhongping[0].value[1:-1])
39 | chaping = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[4]/a/em')
40 | el.add_value('chaping', chaping[0].value[1:-1])
41 | return el.load_item()
42 |
43 |
--------------------------------------------------------------------------------
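Because Myspider is a scrapy_redis RedisSpider, it sits idle until product URLs appear under the jdspider_urls key it declares. A minimal seeding sketch with the redis-py client; the product URL is only an example of the item.jd.com/<sku>.html pattern that the parse method splits apart:

    # Sketch (not a file in this repo): push one JD product page onto the spider's queue.
    from redis import Redis

    r = Redis()  # assumes a local Redis on the default host/port
    r.lpush('jdspider_urls', 'http://item.jd.com/1217500.html')  # example SKU URL
    # then start the spider with:  scrapy crawl jdspider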
/JDspider/JDspider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/JDspider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = JDspider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = JDspider
12 |
--------------------------------------------------------------------------------
/exercisebs4.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'qcl'
3 |
4 | # Data comes from xiaozhu.com short-term rentals in Hangzhou for April 14-16. 300 listings are collected; if you end up with fewer than 300,
5 | # remember to remove duplicates. Known issue: some hosts post the same listing several times and no deduplication is done for that yet (a small dedup sketch follows this file).
6 |
7 |
8 | from bs4 import BeautifulSoup
9 | import requests
10 |
11 | ListOfUrls = []
12 | page = 1
13 | LenOfUrls = 0
14 | while len(ListOfUrls) != 300:
15 | url = 'http://hz.xiaozhu.com/search-duanzufang-p'+ str(page) + '-0/?startDate=2016-04-15&endDate=2016-04-16'
16 | print url
17 | duanzu = requests.get(url)
18 | soup = BeautifulSoup(duanzu.text, 'html5lib')
19 | urls = soup.select('#page_list > ul > li > a')
20 | try:
21 | for url1 in urls:
22 | ListOfUrls.append(url1['href'])
23 | if len(ListOfUrls) == 300:
24 | break
25 | except:
26 | print url1, page
27 | finally:
28 | page += 1
29 | for i in ListOfUrls:
30 | data = requests.get(i)
31 | soup = BeautifulSoup(data.text, 'lxml')
32 | title = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
33 | address = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5')
34 | price = soup.select('#pricePart > div.day_l > span')
35 | PhotoOfHome = soup.select('#detailImageBox > div.pho_show_r > div > ul > li:nth-of-type(1) > img')
36 | NameOfHoster = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
37 | sex = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
38 | PhotoOfHoster = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
39 | print '网址 : %s' % (i.encode('utf-8'))
40 | print '标题 : %s' % (title[0].get_text().encode('utf-8'))
41 | print '地址 : %s' % (address[0].get_text().encode('utf-8'))
42 | print '价钱 : %s' % (price[0].get_text().encode('utf-8'))
43 | print '房间照片 : %s' % (PhotoOfHome[0]['data-bigimg'].encode('utf-8'))
44 | print '房东名字 : %s' % (NameOfHoster[0].get_text().encode('utf-8'))
45 | print '房东照片 : %s' % (PhotoOfHoster[0]['src'].encode('utf-8'))
46 | if sex[0]['class'] == ['member_ico']:
47 | sex = '男'
48 | else:
49 | sex = '女'
50 |     print '房东性别 : %s' % sex
51 | print '_______________________________________________________________________________________________'
52 |
53 |
--------------------------------------------------------------------------------
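The header comment of exercisebs4.py notes that the same listing can be posted several times and that nothing filters the duplicates out. A small sketch of how ListOfUrls could be deduplicated before the detail-page loop, kept here as a suggestion rather than edited into the script:

    # Sketch (not part of exercisebs4.py): drop repeated URLs while keeping first-seen order.
    def dedupe(urls):
        seen = set()
        unique = []
        for u in urls:
            if u not in seen:
                seen.add(u)
                unique.append(u)
        return unique

    # usage, right before the 'for i in ListOfUrls:' loop:
    # ListOfUrls = dedupe(ListOfUrls)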
/pbdnof58/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/pbdnof58/.idea/pbdnof58.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/pbdnof58/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/pbdnof58/README.rst:
--------------------------------------------------------------------------------
1 | A distributed crawler built with scrapy-redis; the scraped data is stored in MySQL. Redis and MySQL must be installed and configured beforehand.
2 |
3 | From the project directory, running scrapy crawl myspider_58page crawls the list pages one after another until the last one. The initial list page has to be seeded into Redis first (see the sketch after this file).
4 |
5 | Running scrapy crawl myspider_58 crawls the individual sellers' listings and stores them in the corresponding MySQL table.
6 |
7 | Both spiders can be started in several instances at once for multi-process crawling.
8 |
9 |
--------------------------------------------------------------------------------
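As the README says, nothing happens until an initial list page is pushed into Redis. A minimal seeding sketch with the redis-py client the spiders already import; the key name comes from redis_key = 'myspider:58_urls' in 58Urlspider.py, and the pn1 URL is the first list page of the same section that 58pbdn.py crawls:

    # Sketch (not a file in this repo): seed the queue that myspider_58page reads from.
    from redis import Redis

    r = Redis()  # assumes a local Redis on the default host/port
    r.lpush('myspider:58_urls', 'http://bj.58.com/pbdn/0/pn1')
    # myspider_58page then pushes detail-page links onto 'myspider:start_urls',
    # which is the queue myspider_58 (58spider-redis.py) consumes.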
/pbdnof58/pbdnof58/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qcl643062/spider/d83ab1d597440bae27c90bf1436e620cfdd233f2/pbdnof58/pbdnof58/__init__.py
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 |
9 | import scrapy
10 | from scrapy.loader import ItemLoader
11 | from scrapy.loader.processors import MapCompose, TakeFirst, Join
12 |
13 |
14 | class Pbdnof58Item(scrapy.Item):
15 | # define the fields for your item here like:
16 | title = scrapy.Field()
17 | time = scrapy.Field()
18 | price = scrapy.Field()
19 | quality = scrapy.Field()
20 | area = scrapy.Field()
21 | UrlofPage = scrapy.Field()
22 |
23 | class Pbdnof58Loader(ItemLoader):
24 | default_item_class = Pbdnof58Item
25 | default_input_processor = MapCompose(lambda s: s.strip())
26 | default_output_processor = TakeFirst()
27 | description_out = Join()
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from scrapy.exceptions import DropItem
8 | from twisted.enterprise import adbapi
9 | import MySQLdb
10 | import MySQLdb.cursors
11 | from scrapy import log
12 |
13 |
14 | class CleanPipeline(object):
15 |
16 | def __init__(self):
17 | self.has = set()
18 |
19 |     def process_item(self, item, spider):
20 |         if len(item) < 5:  # drop items that are missing any of the expected fields
21 |             raise DropItem("Incomplete item: %s" % item)
22 |         key = tuple(sorted(item.items()))  # scrapy Items are not hashable, so dedupe on their values
23 |         if key in self.has:
24 |             raise DropItem("Duplicate item found: %s" % item)
25 |         self.has.add(key)
26 |         return item
27 |
28 | class MySQLPipeline(object):
29 | def __init__(self, dbpool):
30 | self.dbpool = dbpool
31 |
32 | @classmethod
33 | def from_settings(cls, settings):
34 | dbargs = dict(
35 | host=settings['MYSQL_HOST'],
36 | db=settings['MYSQL_DBNAME'],
37 | user=settings['MYSQL_USER'],
38 | passwd=settings['MYSQL_PASSWD'],
39 | charset='utf8',
40 | cursorclass = MySQLdb.cursors.DictCursor,
41 | use_unicode= True,
42 | )
43 | dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
44 | return cls(dbpool)
45 |
46 | def process_item(self, item, spider):
47 | d = self.dbpool.runInteraction(self.__do__insert, item, spider)
48 | d.addBoth(lambda _: item)
49 | return d
50 |
51 | def __do__insert(self, conn, item, spider):
52 | try:
53 | conn.execute("""
54 | insert into 58pbdndb set title = %s, area = %s, price = %s, quality = %s, time = %s
55 | """, (item['title'], item['area'], item['price'], item['quality'], item['time']))
56 |
57 | except MySQLdb.Error, e:
58 | spider.log("Mysql Error %d: %s" % (e.args[0], e.args[1]), level=log.DEBUG)
59 |
60 |
61 |
--------------------------------------------------------------------------------
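MySQLPipeline above assumes that the 58pbdndb table already exists in the qcl database configured in settings.py. A sketch of a compatible schema; the table and column names are fixed by the INSERT statement, but the column types are only assumptions:

    # Sketch (not a file in this repo): create a table the pipeline can insert into.
    import MySQLdb

    conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='qcl', charset='utf8')
    cur = conn.cursor()
    cur.execute("""
        CREATE TABLE IF NOT EXISTS `58pbdndb` (
            title   VARCHAR(255),
            time    VARCHAR(64),
            price   VARCHAR(64),
            quality VARCHAR(64),
            area    VARCHAR(128)
        ) DEFAULT CHARSET=utf8
    """)
    conn.commit()
    conn.close()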
/pbdnof58/pbdnof58/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for pbdnof58 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'pbdnof58'
13 |
14 | SPIDER_MODULES = ['pbdnof58.spiders']
15 | NEWSPIDER_MODULE = 'pbdnof58.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'pbdnof58 (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'pbdnof58.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'pbdnof58.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'pbdnof58.pipelines.CleanPipeline': 300,
69 | 'pbdnof58.pipelines.MySQLPipeline': 400,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
93 | # start MySQL database configure setting
94 | MYSQL_HOST = 'localhost'
95 | MYSQL_DBNAME = 'qcl'
96 | MYSQL_USER = 'root'
97 | MYSQL_PASSWD = ''
98 | # end of MySQL database configure setting
99 |
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/spiders/58Urlspider.py:
--------------------------------------------------------------------------------
1 | from scrapy_redis.spiders import RedisSpider
2 | from pbdnof58.items import Pbdnof58Loader
3 | from redis import Redis
4 | from scrapy import log
5 | from time import sleep
6 |
7 | class Myspider(RedisSpider):
8 | '''spider that reads urls from redis queue (myspider:start_urls).'''
9 | name = 'myspider_58page'
10 | redis_key = 'myspider:58_urls'
11 |
12 | def __init__(self, *args, **kwargs):
13 | domain = kwargs.pop('domain', '')
14 |         self.allowed_domains = filter(None, domain.split(','))
15 | super(Myspider, self).__init__(*args, **kwargs)
16 | self.url = 'http://bj.58.com'
17 |
18 | def parse(self, response):
19 | el = Pbdnof58Loader(response=response)
20 | PageUrl = response.xpath('//a[contains(@class, "next")]/@href').extract()
21 | self.log(PageUrl, level=log.DEBUG)
22 | r = Redis()
23 | if PageUrl != []:
24 | r.lpush('myspider:58_urls', self.url + PageUrl[0])
25 | sleep(1)
26 | el.add_value('UrlofPage', self.url + PageUrl[0])
27 | urls = response.xpath('//table[contains(@class, "tbimg")]/tr')
28 | for url in urls:
29 | url = url.xpath('td[contains(@class, "t")]/a/@href').extract()
30 | if len(url) == 1 and 'zhuan' not in url[0]:
31 | r.lpush('myspider:start_urls', url[0])
32 | return el.load_item()
33 |
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/spiders/58spider-redis.py:
--------------------------------------------------------------------------------
1 | from scrapy_redis.spiders import RedisSpider
2 | from pbdnof58.items import Pbdnof58Loader
3 | # from scrapy import log
4 |
5 |
6 | class Myspider(RedisSpider):
7 | '''spider that reads urls from redis queue (myspider:start_urls).'''
8 | name = 'myspider_58'
9 | redis_key = 'myspider:start_urls'
10 |
11 | def __init__(self, *args, **kwargs):
12 | domain = kwargs.pop('domain', '')
13 |         self.allowed_domains = filter(None, domain.split(','))
14 | super(Myspider, self).__init__(*args, **kwargs)
15 |
16 |
17 | def parse(self, response):
18 |
19 | el = Pbdnof58Loader(response=response)
20 | el.add_xpath('title', '//h1/text()')
21 |         el.add_xpath('price', '//span[contains(@class, "price c_f50")]/text()')
22 | quality = response.xpath('//ul[contains(@class, "suUl")]/li')
23 | quality = quality[1].xpath('div[contains(@class, "su_con")]/span/text()').extract()[0].strip()
24 | el.add_value('quality', quality)
25 |         area = response.xpath('//span[contains(@class, "c_25d")]/a/text()')
26 | if area == []:
27 | area = 'None'
28 | elif len(area) == 1:
29 | area = area[0].extract()
30 | else:
31 | area = area[0].extract() + '-' + area[1].extract()
32 | el.add_value('area', area)
33 | el.add_xpath('time', '//li[contains(@class, "time")]/text()')
34 | return el.load_item()
35 |
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/pbdnof58/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = pbdnof58.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = pbdnof58
12 |
--------------------------------------------------------------------------------
/tayleswift.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | __author__ = 'qcl'
3 |
4 | from urllib import urlretrieve
5 | from bs4 import BeautifulSoup
6 | import requests
7 |
8 | listofpic = []
9 | def getpiclist(url):
10 | wb_data = requests.get(url).text
11 | soup = BeautifulSoup(wb_data, 'html5lib')
12 | urls = soup.select('#main-container > div > div.grid-thumb.grid-responsive > div > div > div > a > img')
13 | for url in urls:
14 | listofpic.append(url['src'])
15 | def getpicture(first, last):
16 | for i in xrange(first, last + 1):
17 | url = 'http://weheartit.com/inspirations/taylorswift?page=' + str(i)
18 | getpiclist(url)
19 | for i in xrange(len(listofpic)):
20 | postfix = listofpic[i].split('.')[-1]
21 | local = 'E:\\tayleswift\\tayleswift' + str(i) + '.' + postfix
22 | urlretrieve(listofpic[i], local)
23 | print 'done'
24 | getpicture(1, 20)
25 |
26 |
--------------------------------------------------------------------------------