├── .idea
│   ├── dianping_data.iml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── anjuke
│   ├── README.md
│   ├── anjuke
│   │   ├── __init__.py
│   │   ├── __init__.pyc
│   │   ├── items.py
│   │   ├── items.pyc
│   │   ├── middlewares.py
│   │   ├── middlewares.pyc
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   ├── settings.pyc
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __init__.pyc
│   │       ├── anjuke_spider.py
│   │       └── anjuke_spider.pyc
│   └── scrapy.cfg
├── book_rank
│   ├── .idea
│   │   ├── book_rank.iml
│   │   ├── encodings.xml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── workspace.xml
│   ├── book_rank
│   │   ├── __init__.py
│   │   ├── __init__.pyc
│   │   ├── items.py
│   │   ├── items.pyc
│   │   ├── pipelines.py
│   │   ├── run.py
│   │   ├── settings.py
│   │   ├── settings.pyc
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __init__.pyc
│   │       ├── bookspider.py
│   │       └── bookspider.pyc
│   ├── issue.txt
│   └── scrapy.cfg
├── img_recognize
│   ├── captcha-1.jpg
│   ├── captcha-2.jpg
│   └── readme.txt
└── love_food
    ├── .DS_Store
    ├── .idea
    │   ├── love_food.iml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── vcs.xml
    │   └── workspace.xml
    ├── love_food
    │   ├── .DS_Store
    │   ├── __init__.py
    │   ├── __init__.pyc
    │   ├── items.py
    │   ├── items.pyc
    │   ├── middlewares.py
    │   ├── middlewares.pyc
    │   ├── pipelines.py
    │   ├── pipelines.pyc
    │   ├── settings.py
    │   ├── settings.pyc
    │   └── spiders
    │       ├── __init__.py
    │       ├── __init__.pyc
    │       ├── foodspider.py
    │       └── foodspider.pyc
    └── scrapy.cfg
/.idea/dianping_data.iml:
--------------------------------------------------------------------------------
(IDE project file; contents not captured in this dump)
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
(IDE project file; contents not captured in this dump)
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
(IDE project file; contents not captured in this dump)
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
(IDE project file; contents not captured in this dump)
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
(IDE workspace file; contents not captured in this dump)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dianping_data
2 | Python crawler exercise 1 (love_food): collect detailed merchant information from Dianping for a given area (scrapy, csv)
3 |
4 | Python crawler exercise 2 (book_rank): simulated login (no captcha) to a local library system to fetch its popular-book loan ranking (scrapy, mongodb)
5 |
6 | Python crawler exercise 3 (img_recognize): learning captcha recognition (python, PIL)
7 |
8 | Python crawler exercise 4 (anjuke): crawl Shanghai rental listings from Anjuke (scrapy, csv), with anti-scraping countermeasures
9 |
--------------------------------------------------------------------------------
/anjuke/README.md:
--------------------------------------------------------------------------------
1 | anjuke_spider
2 |
3 | Crawls all rental listings under the Anjuke rental-listing URL.
4 | 1. Uses a random User-Agent and saves the results as a CSV file.
5 | 2. Crawling too frequently gets the IP banned by Anjuke for several hours.
6 | ---Countermeasures considered: 1) Use a proxy IP pool, but most free proxies are unusable, so this was dropped.
7 | 2) Increase DOWNLOAD_DELAY; this made little difference.
8 | 3) Use several machines, each crawling different pages.
9 |
10 | ---Chosen approach:
11 | Use the Google cache: just find the cache URL corresponding to each page to be crawled.
12 |
13 | PS: Whatever site a crawler targets, the data is collected only to practice Python and do some light analysis; no malicious requests are ever made against the site.
--------------------------------------------------------------------------------
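
The README's chosen workaround is easy to sketch. Assuming the standard webcache.googleusercontent.com URL scheme (an assumption, not something stated in the repository), a listing URL can be mapped to its Google cache URL before being handed to the spider's start_urls:

    # -*- coding: utf-8 -*-
    # Hypothetical helper, not part of the repository: builds the Google cache URL
    # for a page so the spider fetches Google's cached copy instead of anjuke.com.
    def google_cache_url(target_url):
        return 'http://webcache.googleusercontent.com/search?q=cache:' + target_url

    # e.g. in anjuke_spider.py one might write:
    # start_urls = [google_cache_url('http://sh.zu.anjuke.com/')]

Responses then come from Google's cache rather than Anjuke itself, which is why the IP ban no longer matters; the trade-off is that cached pages can be stale or missing.
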
/anjuke/anjuke/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/anjuke/anjuke/__init__.py
--------------------------------------------------------------------------------
/anjuke/anjuke/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/anjuke/anjuke/__init__.pyc
--------------------------------------------------------------------------------
/anjuke/anjuke/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class AnjukeItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | house_type = scrapy.Field() # room layout
15 | rent_type = scrapy.Field() # rental type: whole flat or shared
16 | renovation = scrapy.Field() # renovation condition
17 | address = scrapy.Field() # address
18 | owner = scrapy.Field() # contact person
19 | price = scrapy.Field() # rental price
20 |
--------------------------------------------------------------------------------
/anjuke/anjuke/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/anjuke/anjuke/items.pyc
--------------------------------------------------------------------------------
/anjuke/anjuke/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import random
3 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
4 |
5 | class RotateUserAgentMiddleware(UserAgentMiddleware):
6 |
7 | def __init__(self, user_agent=''):
8 | self.user_agent = user_agent
9 |
10 | def process_request(self, request, spider):
11 | ua = random.choice(self.user_agent_list)
12 | if ua:
13 | print ua, '----------------------user_agent chosen-------------------'
14 | request.headers.setdefault('User-Agent', ua)
15 | # the default user_agent_list covers Chrome, IE, Firefox, Mozilla, Opera and Netscape
16 | # more user-agent strings can be found at http://www.useragentstring.com/pages/useragentstring.php
17 | user_agent_list = [
18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
19 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
20 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
21 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
22 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
23 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
24 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
25 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
26 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
27 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
28 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
29 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
30 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
31 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
32 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
33 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
34 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
35 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
36 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
37 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
38 | "Mozilla/5.0 (compatible; MSIE 9.0; AOL 9.7; AOLBuild 4343.19; Windows NT 6.1; WOW64; Trident/5.0; FunWebProducts),"
39 | "Mozilla/4.0 (compatible; MSIE 8.0; AOL 9.6; AOLBuild 4340.5004; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
40 | ]
41 |
--------------------------------------------------------------------------------
/anjuke/anjuke/middlewares.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/anjuke/anjuke/middlewares.pyc
--------------------------------------------------------------------------------
/anjuke/anjuke/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class AnjukePipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/anjuke/anjuke/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for anjuke project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'anjuke'
13 |
14 | SPIDER_MODULES = ['anjuke.spiders']
15 | NEWSPIDER_MODULE = 'anjuke.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'anjuke (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'anjuke.middlewares.AnjukeSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'anjuke.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | # 'anjuke.pipelines.AnjukePipeline': 300,
69 | #}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | HTTPCACHE_ENABLED = False
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | # Save the feed as a CSV file
92 | #FEED_URI = u'file:/Users/zhangrui/Desktop/anjuke_data.csv'
93 | FEED_URI = u'file:///E:/anjuke_data.csv'
94 | FEED_FORMAT = 'CSV'
95 |
96 | COOKIES_ENABLED = False
97 | DOWNLOAD_DELAY = 3
98 | # Obey robots.txt rules
99 |
100 | # Save the feed as a CSV file
101 | # FEED_URI = u'file:///D:/food_data.csv'
102 | # FEED_FORMAT = 'CSV'
103 |
104 | # Random User-Agent rotation setup
105 | DOWNLOADER_MIDDLEWARES = {
106 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
107 | 'anjuke.middlewares.RotateUserAgentMiddleware': 400,
108 |
109 | }
110 |
--------------------------------------------------------------------------------
/anjuke/anjuke/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/anjuke/anjuke/settings.pyc
--------------------------------------------------------------------------------
/anjuke/anjuke/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/anjuke/anjuke/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/anjuke/anjuke/spiders/__init__.pyc
--------------------------------------------------------------------------------
/anjuke/anjuke/spiders/anjuke_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #author zhangr
3 |
4 | import scrapy
5 | from scrapy.contrib.spiders import CrawlSpider
6 | from scrapy.http import request, Request
7 | from scrapy.selector import Selector
8 | from anjuke.items import AnjukeItem # import the item class defined in items.py
9 |
10 | class Anjuke(CrawlSpider):
11 | name = "anjuke_spider"
12 | #allowed_domains = ["sh.zu.anjuke.com"]
13 | start_urls = ['google cache url'] # Anjuke rental-listing URL (via its cache URL)
14 |
15 | def parse(self, response):
16 | item = AnjukeItem() # holds all scraped fields
17 | selector = Selector(response)
18 | HouseData = selector.xpath('//*[@id="list-content"]/div') # the leading divs (div[1], div[2]) must be discarded
19 | for eachhouse in HouseData[3:]:
20 | house_type = eachhouse.xpath('div[1]/p[1]/text()[1]').extract()
21 | rent_type = eachhouse.xpath('div[1]/p[1]/text()[2]').extract()
22 | renovation = eachhouse.xpath('div[1]/p[1]/text()[3]').extract()
23 | address = eachhouse.xpath('div[1]/address/text()').extract()
24 | owner = eachhouse.xpath('div[1]/p[2]/span/text()').extract()
25 | price = eachhouse.xpath('div[2]/p/strong/text()').extract() # do not write it as /div[2]/p/... (leading slash); not noticing this tripped me up
26 |
27 | if house_type:
28 | item['house_type'] = house_type
29 | else:
30 | item['house_type'] = None
31 | if rent_type:
32 | item['rent_type'] = rent_type
33 | else:
34 | item['rent_type'] = None
35 | if renovation:
36 | item['renovation'] = renovation
37 | else:
38 | item['renovation'] = None
39 | if address:
40 | item['address'] = address
41 | else:
42 | item['address'] = None
43 | if owner:
44 | item['owner'] = owner
45 | else:
46 | item['owner'] = None
47 | if price:
48 | item['price'] = price
49 | else:
50 | item['price'] = None
51 | yield item
52 |
53 | nextpage = selector.xpath('//div[@class="multi-page"]/a/@href').extract()[-1] # take the last href; the links cannot be taken in page order
54 | print nextpage
55 | if nextpage:
56 | yield Request(nextpage,callback=self.parse)
57 |
58 |
59 |
--------------------------------------------------------------------------------
/anjuke/anjuke/spiders/anjuke_spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/anjuke/anjuke/spiders/anjuke_spider.pyc
--------------------------------------------------------------------------------
/anjuke/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = anjuke.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = anjuke
12 |
--------------------------------------------------------------------------------
/book_rank/.idea/book_rank.iml:
--------------------------------------------------------------------------------
(IDE project file; contents not captured in this dump)
--------------------------------------------------------------------------------
/book_rank/.idea/encodings.xml:
--------------------------------------------------------------------------------
(IDE project file; contents not captured in this dump)
--------------------------------------------------------------------------------
/book_rank/.idea/misc.xml:
--------------------------------------------------------------------------------
(IDE project file; contents not captured in this dump)
--------------------------------------------------------------------------------
/book_rank/.idea/modules.xml:
--------------------------------------------------------------------------------
(IDE project file; contents not captured in this dump)
--------------------------------------------------------------------------------
/book_rank/.idea/workspace.xml:
--------------------------------------------------------------------------------
(IDE workspace file; contents not captured in this dump)
--------------------------------------------------------------------------------
/book_rank/book_rank/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/book_rank/book_rank/__init__.py
--------------------------------------------------------------------------------
/book_rank/book_rank/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/book_rank/book_rank/__init__.pyc
--------------------------------------------------------------------------------
/book_rank/book_rank/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | import scrapy
8 | from scrapy.item import Item,Field
9 |
10 | class BookRankItem(scrapy.Item):
11 | # define the fields to be scraped
12 | # name = scrapy.Field()
13 | rank = Field()
14 | name = Field()
15 | author = Field()
16 | press = Field()
17 | publish_time = Field()
18 | view_number = Field()
19 |
--------------------------------------------------------------------------------
/book_rank/book_rank/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/book_rank/book_rank/items.pyc
--------------------------------------------------------------------------------
/book_rank/book_rank/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from pymongo import MongoClient
8 | from scrapy.conf import settings
9 | from scrapy.exceptions import DropItem
10 | from scrapy import log
11 |
12 | class BookRankPipeline(object):
13 | def __init__(self):
14 | client = MongoClient(host=settings['MONGODB_SERVER'], port=settings['MONGODB_PORT'])
15 | db = client[settings['MONGODB_DB']]
16 | self.collection = db[settings['MONGODB_COLLECTION']]
17 |
18 | def process_item(self, item, spider):
19 | valid = True
20 | for data in item:
21 | if not item[data]:
22 | valid = False
23 | raise DropItem('Missing {0}!'.format(data))
24 | if valid:
25 | self.collection.insert(dict(item))
26 | log.msg('data added to mongodb database', level=log.DEBUG, spider=spider)
27 |
28 | return item
29 |
--------------------------------------------------------------------------------
/book_rank/book_rank/run.py:
--------------------------------------------------------------------------------
1 |
2 | from scrapy import cmdline
3 |
4 |
5 | cmdline.execute("scrapy crawl bookspider".split())
--------------------------------------------------------------------------------
/book_rank/book_rank/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for book_rank project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'book_rank'
13 |
14 | SPIDER_MODULES = ['book_rank.spiders']
15 | NEWSPIDER_MODULE = 'book_rank.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'book_rank (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
25 | COOKIES_ENABLED = False
26 | DOWNLOAD_DELAY = 3
27 | # Obey robots.txt rules
28 |
29 | # Save the feed as a CSV file
30 | FEED_URI = u'file:///E:/book_rank.csv'
31 | FEED_FORMAT = 'CSV'
32 |
--------------------------------------------------------------------------------
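
BookRankPipeline (defined in pipelines.py above) reads the MONGODB_* settings and must be registered in ITEM_PIPELINES, but neither appears in this settings.py, so as captured the project only writes the CSV feed. A sketch of the extra settings that would wire the pipeline up, mirroring love_food/settings.py; the database and collection names are assumptions:

    # Hypothetical additions to book_rank/settings.py (names are placeholders)
    ITEM_PIPELINES = {
        'book_rank.pipelines.BookRankPipeline': 300,
    }
    MONGODB_SERVER = 'localhost'
    MONGODB_PORT = 27017
    MONGODB_DB = 'mongo'
    MONGODB_COLLECTION = 'book_rank'
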
/book_rank/book_rank/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/book_rank/book_rank/settings.pyc
--------------------------------------------------------------------------------
/book_rank/book_rank/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/book_rank/book_rank/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/book_rank/book_rank/spiders/__init__.pyc
--------------------------------------------------------------------------------
/book_rank/book_rank/spiders/bookspider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #author zhangr
3 |
4 | import scrapy
5 | from scrapy.contrib.spiders import CrawlSpider
6 | from scrapy.http import request,Request
7 | from scrapy.selector import Selector
8 | import sys
9 | import urllib
10 | import urllib2
11 | import cookielib
12 | from book_rank.items import BookRankItem # import the item class defined in items.py
13 |
14 | class Book(CrawlSpider):
15 | name = "bookspider"
16 | start_urls = [
17 | "http://opac.zjgtsg.com/opac/ranking/bookLoanRank"
18 | ]
19 | ReadID = '' # account used to log into the system (a national ID number here)
20 | ReadPasswd = '14e52634c81e53e0ef7f87b034eab171' # encrypted login password, taken from the captured POST data
21 |
22 | def login_url(self):
23 | self.loginUrl = 'http://opac.zjgtsg.com/opac/reader/space'
24 | self.cookies = cookielib.CookieJar()
25 | # analyze the POST data yourself; this system requires no captcha
26 | self.postdata = urllib.urlencode({
27 | 'rdid': self.ReadID,
28 | 'rdPasswd': self.ReadPasswd
29 | })
30 | self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookies))
31 |
32 | def parse(self, response):
33 | item = BookRankItem()
34 | selector = Selector(response)
35 | Books = selector.xpath('//table[@id="contentTable"]/tr') # all book rows on the page; note: leave tbody out of the XPath or nothing matches
36 |
37 | for eachBook in Books:
38 | rank = eachBook.xpath('td[1]/text()').extract()
39 | name = eachBook.xpath('td[2]/a/text()').extract() # the value inside the <a> tag
40 | author = eachBook.xpath('td[3]/text()').extract()
41 | press = eachBook.xpath('td[4]/text()').extract()
42 | publish_time = eachBook.xpath('td[5]/text()').extract()
43 | view_number = eachBook.xpath('td[6]/text()').extract()
44 | if(rank and name and author and press and publish_time and view_number): # skip the header row (the first tr)
45 | item['rank'] = rank
46 | item['name'] = name
47 | item['author'] = author
48 | item['press'] = press
49 | item['publish_time'] = publish_time
50 | item['view_number'] = view_number
51 | else:
52 | item['rank'] = None
53 | item['name'] = None
54 | item['author'] = None
55 | item['press'] = None
56 | item['publish_time'] = None
57 | item['view_number'] = None
58 |
59 | yield item
--------------------------------------------------------------------------------
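
The login_url helper above builds the POST by hand with urllib/urllib2 and a CookieJar, but it is never called from the crawl, so the ranking page is fetched without a session. A minimal sketch of doing the same login inside Scrapy with FormRequest, assuming the rdid/rdPasswd form fields noted above; it also assumes COOKIES_ENABLED is left at its default (True), unlike this project's settings:

    # Hypothetical variant, not part of the repository: log in through Scrapy
    # itself so the session cookie is reused for the ranking request.
    import scrapy
    from scrapy.http import FormRequest

    class BookLoginSpider(scrapy.Spider):
        name = 'bookspider_login_sketch'

        def start_requests(self):
            # POST the login form; field names follow the captured POST data
            yield FormRequest(
                'http://opac.zjgtsg.com/opac/reader/space',
                formdata={'rdid': '<reader id>', 'rdPasswd': '<password hash>'},
                callback=self.after_login)

        def after_login(self, response):
            # the session cookie set here is carried along automatically
            yield scrapy.Request('http://opac.zjgtsg.com/opac/ranking/bookLoanRank',
                                 callback=self.parse_rank)

        def parse_rank(self, response):
            # parse the ranking table as in Book.parse above
            pass
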
/book_rank/book_rank/spiders/bookspider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/book_rank/book_rank/spiders/bookspider.pyc
--------------------------------------------------------------------------------
/book_rank/issue.txt:
--------------------------------------------------------------------------------
1 | Leave tbody out of the XPath; otherwise the expression looks correct but still returns no data.
--------------------------------------------------------------------------------
/book_rank/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = book_rank.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = book_rank
12 |
--------------------------------------------------------------------------------
/img_recognize/captcha-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/img_recognize/captcha-1.jpg
--------------------------------------------------------------------------------
/img_recognize/captcha-2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/img_recognize/captcha-2.jpg
--------------------------------------------------------------------------------
/img_recognize/readme.txt:
--------------------------------------------------------------------------------
1 | # Simple captcha recognition with Python
2 |
3 | Covers two fairly common kinds of captcha: one like captcha-1.jpg and one like captcha-2.jpg.
4 |
5 | 1. Dependencies:
6 | PIL, pytesseract, tesseract-ocr
7 | 2. Recommended to run in a Linux environment.
8 | 3. Reference: https://my.oschina.net/jhao104/blog/647326
9 |
10 |
11 | Note: the machine-learning side is not covered here; it will be analyzed when a concrete case calls for it.
12 |
--------------------------------------------------------------------------------
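
A minimal sketch of the PIL + pytesseract flow the readme describes, for a captcha like captcha-1.jpg; the grayscale threshold of 140 is an assumption and typically needs tuning per captcha style:

    # -*- coding: utf-8 -*-
    from PIL import Image
    import pytesseract

    img = Image.open('captcha-1.jpg')
    gray = img.convert('L')                                  # grayscale
    binary = gray.point(lambda px: 255 if px > 140 else 0)   # crude binarization
    print(pytesseract.image_to_string(binary))               # OCR via tesseract-ocr

More distorted captchas usually need extra denoising before the OCR output becomes usable.
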
/love_food/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/love_food/.DS_Store
--------------------------------------------------------------------------------
/love_food/.idea/love_food.iml:
--------------------------------------------------------------------------------
(IDE project file; contents not captured in this dump)
--------------------------------------------------------------------------------
/love_food/.idea/misc.xml:
--------------------------------------------------------------------------------
(IDE project file; contents not captured in this dump)
--------------------------------------------------------------------------------
/love_food/.idea/modules.xml:
--------------------------------------------------------------------------------
(IDE project file; contents not captured in this dump)
--------------------------------------------------------------------------------
/love_food/.idea/vcs.xml:
--------------------------------------------------------------------------------
(IDE project file; contents not captured in this dump)
--------------------------------------------------------------------------------
/love_food/.idea/workspace.xml:
--------------------------------------------------------------------------------
(IDE workspace file; contents not captured in this dump)
--------------------------------------------------------------------------------
/love_food/love_food/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/love_food/love_food/.DS_Store
--------------------------------------------------------------------------------
/love_food/love_food/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/love_food/love_food/__init__.py
--------------------------------------------------------------------------------
/love_food/love_food/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/love_food/love_food/__init__.pyc
--------------------------------------------------------------------------------
/love_food/love_food/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item, Field
9 |
10 |
11 | class LoveFoodItem(Item):
12 | restaurant = Field()
13 | star = Field()
14 | average_price = Field()
15 | foodtype = Field()
16 | addr = Field()
17 |
18 |
--------------------------------------------------------------------------------
/love_food/love_food/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/love_food/love_food/items.pyc
--------------------------------------------------------------------------------
/love_food/love_food/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import random
3 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
4 |
5 |
6 | class RotateUserAgentMiddleware(UserAgentMiddleware):
7 |
8 | def __init__(self, user_agent=''):
9 | self.user_agent = user_agent
10 |
11 | def process_request(self, request, spider):
12 | ua = random.choice(self.user_agent_list)
13 | if ua:
14 | print ua, '----------------------user_agent chosen-------------------'
15 | request.headers.setdefault('User-Agent', ua)
16 | # the default user_agent_list covers Chrome, IE, Firefox, Mozilla, Opera and Netscape
17 | # more user-agent strings can be found at http://www.useragentstring.com/pages/useragentstring.php
18 | user_agent_list = [
19 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
20 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
21 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
22 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
23 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
24 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
25 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
26 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
27 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
28 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
29 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
30 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
31 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
32 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
33 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
34 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
35 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
36 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
37 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
38 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
39 | "Mozilla/5.0 (compatible; MSIE 9.0; AOL 9.7; AOLBuild 4343.19; Windows NT 6.1; WOW64; Trident/5.0; FunWebProducts),"
40 | "Mozilla/4.0 (compatible; MSIE 8.0; AOL 9.6; AOLBuild 4340.5004; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
41 | ]
42 |
43 |
--------------------------------------------------------------------------------
/love_food/love_food/middlewares.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/love_food/love_food/middlewares.pyc
--------------------------------------------------------------------------------
/love_food/love_food/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from pymongo import MongoClient
8 | from scrapy.conf import settings
9 | from scrapy.exceptions import DropItem
10 | from scrapy import log
11 |
12 |
13 | class LoveFoodPipeline(object):
14 | def __init__(self):
15 | client = MongoClient(host=settings['MONGODB_SERVER'], port=settings['MONGODB_PORT'])
16 | db = client[settings['MONGODB_DB']]
17 | self.collection = db[settings['MONGODB_COLLECTION']]
18 |
19 | def process_item(self, item, spider):
20 | valid = True
21 | for data in item:
22 | if not item[data]:
23 | valid = False
24 | raise DropItem('Missing {0}!'.format(data))
25 | if valid:
26 | self.collection.insert(dict(item))
27 | log.msg('data added to mongodb database', level=log.DEBUG, spider=spider)
28 |
29 | return item
30 |
--------------------------------------------------------------------------------
/love_food/love_food/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/love_food/love_food/pipelines.pyc
--------------------------------------------------------------------------------
/love_food/love_food/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for love_food project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'love_food'
13 |
14 | SPIDER_MODULES = ['love_food.spiders']
15 | NEWSPIDER_MODULE = 'love_food.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 |
20 | # USER_AGENT = 'User-Agent:Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
21 | COOKIES_ENABLED = False
22 | DOWNLOAD_DELAY = 3
23 | # Obey robots.txt rules
24 | ROBOTSTXT_OBEY = True
25 |
26 | # Save the feed as a CSV file
27 | # FEED_URI = u'file:///D:/food_data.csv'
28 | # FEED_FORMAT = 'CSV'
29 |
30 | # Random User-Agent rotation setup
31 | DOWNLOADER_MIDDLEWARES = {
32 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
33 | 'love_food.middlewares.RotateUserAgentMiddleware': 400,
34 | }
35 |
36 | # MongoDB settings
37 | ITEM_PIPELINES = {
38 | 'love_food.pipelines.LoveFoodPipeline': 300,
39 | }
40 | MONGODB_SERVER = "localhost"
41 | MONGODB_PORT = 27017
42 | MONGODB_DB = 'mongo'
43 | MONGODB_COLLECTION = 'dazongdianpin'
44 |
--------------------------------------------------------------------------------
/love_food/love_food/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/love_food/love_food/settings.pyc
--------------------------------------------------------------------------------
/love_food/love_food/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/love_food/love_food/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/love_food/love_food/spiders/__init__.pyc
--------------------------------------------------------------------------------
/love_food/love_food/spiders/foodspider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #author zhangr
3 | #thanks to Lving
4 | import scrapy
5 | from scrapy.contrib.spiders import CrawlSpider
6 | from scrapy.http import request, Request
7 | from scrapy.selector import Selector
8 | from love_food.items import LoveFoodItem # import the item class defined in items.py
9 |
10 |
11 | class Food(CrawlSpider):
12 | name = "foodspider"
13 | redis_key = 'foodspider:start_urls'
14 | start_urls = ['http://www.dianping.com/search/category/418/10']
15 | url = 'http://www.dianping.com/search/category/418/10'
16 |
17 | def parse(self, response):
18 | item = LoveFoodItem() # holds all scraped page data
19 | selector = Selector(response)
20 | Foods = selector.xpath('//*[@id="shop-all-list"]/ul/li')
21 | for eachFood in Foods:
22 | restaurant = eachFood.xpath('div[2]/div[1]/a/h4/text()').extract()
23 | star = eachFood.xpath('div[2]/div[2]/span/@title').extract()
24 | average_price = eachFood.xpath('div[2]/div[2]/a[2]/b/text()').extract()
25 | foodtype = eachFood.xpath('div[2]/div[3]/a[1]/span/text()').extract()
26 | addr = eachFood.xpath('div[2]/div[3]/a[2]/span/text()').extract()
27 | if restaurant:
28 | print restaurant[0]
29 | item['restaurant'] = restaurant[0]
30 | else:
31 | item['restaurant'] = None
32 | if star:
33 | print star[0]
34 | item['star'] = star[0]
35 | else:
36 | item['star'] = None
37 | if average_price:
38 | print average_price[0]
39 | item['average_price'] = average_price[0]
40 | else:
41 | item['average_price'] = None
42 | if foodtype:
43 | print foodtype[0]
44 | item['foodtype'] = foodtype[0]
45 | else:
46 | item['foodtype'] = None
47 | if addr:
48 | print addr[0]
49 | item['addr'] = addr[0]
50 | else:
51 | item['addr'] = None
52 |
53 | yield item
54 | nextpage = selector.xpath('//*[@id="top"]/div[6]/div[3]/div[1]/div[2]/a/@href').extract()[-1]
55 | # the tag index of the "next page" link tends to change between pages
56 | # //*[@id="top"]/div[6]/div[3]/div[1]/div[2]/a[11] page1
57 | # //*[@id="top"]/div[6]/div[3]/div[1]/div[2]/a[12] page2
58 | # //*[@id="top"]/div[6]/div[3]/div[1]/div[2]/a[12] page3
59 | if nextpage:
60 | print nextpage, '*************************next page******************************'
61 | # slice off the site prefix, then re-append the path to self.url
62 | nextpage = nextpage[23:]
63 | yield Request(self.url+nextpage, callback=self.parse)
64 |
--------------------------------------------------------------------------------
/love_food/love_food/spiders/foodspider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PyCN/dianping_data/5318ff821a268ff62baeed091fd5cf7d74f8d9f8/love_food/love_food/spiders/foodspider.pyc
--------------------------------------------------------------------------------
/love_food/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = love_food.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = love_food
12 |
--------------------------------------------------------------------------------