├── .idea/
├── README.md
├── anjuke/
│   ├── README.md
│   ├── anjuke/
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders/
│   │       ├── __init__.py
│   │       └── anjuke_spider.py
│   └── scrapy.cfg
├── book_rank/
│   ├── .idea/
│   ├── book_rank/
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── run.py
│   │   ├── settings.py
│   │   └── spiders/
│   │       ├── __init__.py
│   │       └── bookspider.py
│   ├── issue.txt
│   └── scrapy.cfg
├── img_recognize/
│   ├── captcha-1.jpg
│   ├── captcha-2.jpg
│   └── readme.txt
└── love_food/
    ├── .idea/
    ├── love_food/
    │   ├── __init__.py
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders/
    │       ├── __init__.py
    │       └── foodspider.py
    └── scrapy.cfg
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# dianping_data

Python crawler exercise 1 (love_food): collect detailed merchant information from Dianping for one district (Scrapy, CSV).

Python crawler exercise 2 (book_rank): log in to a regional library catalogue (no captcha) and fetch the popular-book loan ranking (Scrapy, MongoDB).

Python crawler exercise 3 (img_recognize): captcha recognition practice (Python, PIL).

Python crawler exercise 4 (anjuke): crawl Anjuke rental listings for Shanghai (Scrapy, CSV), with countermeasures against anti-crawling.

--------------------------------------------------------------------------------
/anjuke/README.md:
--------------------------------------------------------------------------------
anjuke_spider

Crawls every rental listing under the Anjuke rental section.

1. Uses a random User-Agent and saves the results to a CSV file.
2. Crawling too frequently gets the IP banned by Anjuke for several hours.
   Options considered:
   1) a proxy IP pool, but most free proxies are unusable, so this was abandoned;
   2) raising DOWNLOAD_DELAY, which made little difference;
   3) crawling different pages from several machines.

   Chosen approach: use the Google cache and request the cache URL that corresponds to each target page (see the sketch below).

PS: whichever site a crawler targets, the data here is collected only to practise Python and do some light analysis; no malicious requests are made against the site.
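A rough sketch of that cache-URL trick: the webcache.googleusercontent.com prefix is the usual form of a Google cache link, but whether a cached copy exists for a given page has to be checked by hand, and the Anjuke URL below is only an example, not the exact page the author crawled.

# Hypothetical helper: turn a listing URL into its Google cache URL.
def to_google_cache_url(page_url):
    return 'http://webcache.googleusercontent.com/search?q=cache:' + page_url

# e.g. used to seed the spider instead of hitting Anjuke directly
start_urls = [to_google_cache_url('http://sh.zu.anjuke.com/')]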
--------------------------------------------------------------------------------
/anjuke/anjuke/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class AnjukeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    house_type = scrapy.Field()   # layout (number of rooms)
    rent_type = scrapy.Field()    # rental type: whole flat or shared
    renovation = scrapy.Field()   # state of renovation
    address = scrapy.Field()      # address
    owner = scrapy.Field()        # contact person
    price = scrapy.Field()        # rent

--------------------------------------------------------------------------------
/anjuke/anjuke/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class RotateUserAgentMiddleware(UserAgentMiddleware):

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # pick a random User-Agent for every outgoing request
        ua = random.choice(self.user_agent_list)
        if ua:
            print ua, '----------------------user_agent chosen-------------------'
            request.headers.setdefault('User-Agent', ua)

    # the default user_agent_list covers Chrome, IE, Firefox, Mozilla, Opera and Netscape
    # more user-agent strings: http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
        "Mozilla/5.0 (compatible; MSIE 9.0; AOL 9.7; AOLBuild 4343.19; Windows NT 6.1; WOW64; Trident/5.0; FunWebProducts)",
        "Mozilla/4.0 (compatible; MSIE 8.0; AOL 9.6; AOLBuild 4340.5004; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
    ]
--------------------------------------------------------------------------------
/anjuke/anjuke/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class AnjukePipeline(object):
    def process_item(self, item, spider):
        return item

--------------------------------------------------------------------------------
/anjuke/anjuke/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for anjuke project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'anjuke'

SPIDER_MODULES = ['anjuke.spiders']
NEWSPIDER_MODULE = 'anjuke.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'anjuke (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'anjuke.middlewares.AnjukeSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'anjuke.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'anjuke.pipelines.AnjukePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = False
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Save the output as a CSV file
#FEED_URI = u'file:/Users/zhangrui/Desktop/anjuke_data.csv'
FEED_URI = u'file:///E:/anjuke_data.csv'
FEED_FORMAT = 'csv'

COOKIES_ENABLED = False
DOWNLOAD_DELAY = 3

# leftover from the love_food project, kept disabled
# FEED_URI = u'file:///D:/food_data.csv'
# FEED_FORMAT = 'csv'

# random User-Agent middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'anjuke.middlewares.RotateUserAgentMiddleware': 400,
}
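Because AnjukePipeline above is a no-op, everything the spider yields ends up in the CSV feed configured by FEED_URI / FEED_FORMAT. A minimal sketch for loading that export afterwards, assuming the crawl has already written E:/anjuke_data.csv; the project targets Python 2, hence the 'rb' mode (on Python 3, open the file in text mode with newline=''):

import csv

# path taken from FEED_URI in the settings above
with open('E:/anjuke_data.csv', 'rb') as f:
    rows = list(csv.reader(f))

header, listings = rows[0], rows[1:]
print(header)
print('%d listings exported' % len(listings))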
--------------------------------------------------------------------------------
/anjuke/anjuke/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/anjuke/anjuke/spiders/anjuke_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#author zhangr

import scrapy
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import request, Request
from scrapy.selector import Selector
from anjuke.items import AnjukeItem  # the item class defined in items.py

class Anjuke(CrawlSpider):
    name = "anjuke_spider"
    #allowed_domains = ["sh.zu.anjuke.com"]
    start_urls = ['google cache url']  # Google cache URL of the Anjuke rental listing page

    def parse(self, response):
        item = AnjukeItem()  # holds the fields of one listing
        selector = Selector(response)
        HouseData = selector.xpath('//*[@id="list-content"]/div')  # div[1] and div[2] are not listings and are skipped
        for eachhouse in HouseData[3:]:
            house_type = eachhouse.xpath('div[1]/p[1]/text()[1]').extract()
            rent_type = eachhouse.xpath('div[1]/p[1]/text()[2]').extract()
            renovation = eachhouse.xpath('div[1]/p[1]/text()[3]').extract()
            address = eachhouse.xpath('div[1]/address/text()').extract()
            owner = eachhouse.xpath('div[1]/p[2]/span/text()').extract()
            price = eachhouse.xpath('div[2]/p/strong/text()').extract()  # relative path; do not write /div[2]/p/... with a leading slash

            if house_type:
                item['house_type'] = house_type
            else:
                item['house_type'] = None
            if rent_type:
                item['rent_type'] = rent_type
            else:
                item['rent_type'] = None
            if renovation:
                item['renovation'] = renovation
            else:
                item['renovation'] = None
            if address:
                item['address'] = address
            else:
                item['address'] = None
            if owner:
                item['owner'] = owner
            else:
                item['owner'] = None
            if price:
                item['price'] = price
            else:
                item['price'] = None
            yield item

        nextpage = selector.xpath('//div[@class="multi-page"]/a/@href').extract()[-1]  # take the last href; the anchor position is not stable
        print nextpage
        if nextpage:
            yield Request(nextpage, callback=self.parse)

--------------------------------------------------------------------------------
/anjuke/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = anjuke.settings

[deploy]
#url = http://localhost:6800/
project = anjuke
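The anjuke project has no run helper, but book_rank's run.py (further down) shows the pattern; an equivalent file for this project, assuming it sits next to scrapy.cfg, would simply be:

# hypothetical run.py for the anjuke project, mirroring book_rank/book_rank/run.py
from scrapy import cmdline

cmdline.execute("scrapy crawl anjuke_spider".split())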
--------------------------------------------------------------------------------
/book_rank/book_rank/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.item import Item, Field

class BookRankItem(scrapy.Item):
    # fields to be scraped
    rank = Field()
    name = Field()
    author = Field()
    press = Field()
    publish_time = Field()
    view_number = Field()

--------------------------------------------------------------------------------
/book_rank/book_rank/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log

# Note: this pipeline is not enabled in settings.py (there is no ITEM_PIPELINES
# or MONGODB_* entry there); the project currently exports CSV via FEED_URI instead.
class BookRankPipeline(object):
    def __init__(self):
        client = MongoClient(host=settings['MONGODB_SERVER'], port=settings['MONGODB_PORT'])
        db = client[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem('Missing {0}!'.format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg('data added to mongodb database', level=log.DEBUG, spider=spider)

        return item

--------------------------------------------------------------------------------
/book_rank/book_rank/run.py:
--------------------------------------------------------------------------------
from scrapy import cmdline


cmdline.execute("scrapy crawl bookspider".split())
--------------------------------------------------------------------------------
/book_rank/book_rank/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for book_rank project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'book_rank'

SPIDER_MODULES = ['book_rank.spiders']
NEWSPIDER_MODULE = 'book_rank.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'book_rank (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
COOKIES_ENABLED = False
DOWNLOAD_DELAY = 3

# Save the output as a CSV file
FEED_URI = u'file:///E:/book_rank.csv'
FEED_FORMAT = 'csv'

--------------------------------------------------------------------------------
/book_rank/book_rank/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/book_rank/book_rank/spiders/bookspider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#author zhangr

import scrapy
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import request, Request
from scrapy.selector import Selector
import sys
import urllib
import urllib2
import cookielib
from book_rank.items import BookRankItem  # the item class defined in items.py

class Book(CrawlSpider):
    name = "bookspider"
    start_urls = [
        "http://opac.zjgtsg.com/opac/ranking/bookLoanRank"
    ]
    ReadID = ''  # account used to log in to the system; here it is the ID-card number
    ReadPasswd = '14e52634c81e53e0ef7f87b034eab171'  # password digest, taken from the captured POST data

    def login_url(self):
        # helper for the simulated login; note that the crawl never calls it
        self.loginUrl = 'http://opac.zjgtsg.com/opac/reader/space'
        self.cookies = cookielib.CookieJar()
        # POST fields worked out by hand; this system has no captcha
        self.postdata = urllib.urlencode({
            'rdid': self.ReadID,
            'rdPasswd': self.ReadPasswd
        })
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookies))

    def parse(self, response):
        item = BookRankItem()
        selector = Selector(response)
        Books = selector.xpath('//table[@id="contentTable"]/tr')  # all book rows on the page; leave tbody out of the XPath or nothing matches

        for eachBook in Books:
            rank = eachBook.xpath('td[1]/text()').extract()
            name = eachBook.xpath('td[2]/a/text()').extract()  # the title text inside the <a> tag
            author = eachBook.xpath('td[3]/text()').extract()
            press = eachBook.xpath('td[4]/text()').extract()
            publish_time = eachBook.xpath('td[5]/text()').extract()
            view_number = eachBook.xpath('td[6]/text()').extract()
            if (rank and name and author and press and publish_time and view_number):  # skip the header row (the first tr)
                item['rank'] = rank
                item['name'] = name
                item['author'] = author
                item['press'] = press
                item['publish_time'] = publish_time
                item['view_number'] = view_number
            else:
                item['rank'] = None
                item['name'] = None
                item['author'] = None
                item['press'] = None
                item['publish_time'] = None
                item['view_number'] = None

            yield item
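The login_url helper above sticks to urllib2 and is never wired into the crawl. If the reader-space page did need authentication, a more Scrapy-native route would be a FormRequest issued before fetching the ranking page. A minimal sketch, assuming the rdid / rdPasswd field names captured above, no captcha, and cookies switched back on (settings.py currently sets COOKIES_ENABLED = False):

# -*- coding: utf-8 -*-
# Hypothetical login-first variant of the Book spider; not part of the repository.
import scrapy
from scrapy.http import FormRequest, Request


class BookLoginSpider(scrapy.Spider):
    name = "bookspider_login"
    ReadID = ''      # library account, left blank as in bookspider.py
    ReadPasswd = ''  # password digest captured from the login POST

    def start_requests(self):
        # submit the login form first; field names come from bookspider.py
        yield FormRequest('http://opac.zjgtsg.com/opac/reader/space',
                          formdata={'rdid': self.ReadID, 'rdPasswd': self.ReadPasswd},
                          callback=self.after_login)

    def after_login(self, response):
        # the session cookies are carried over automatically once COOKIES_ENABLED is True
        yield Request('http://opac.zjgtsg.com/opac/ranking/bookLoanRank',
                      callback=self.parse_rank)

    def parse_rank(self, response):
        # same row extraction as Book.parse; omitted here
        pass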
--------------------------------------------------------------------------------
/book_rank/issue.txt:
--------------------------------------------------------------------------------
Leave tbody out of the XPath; otherwise the expression looks correct but returns no data (browsers insert tbody into the rendered DOM, while the raw HTML Scrapy downloads often does not contain it).

--------------------------------------------------------------------------------
/book_rank/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = book_rank.settings

[deploy]
#url = http://localhost:6800/
project = book_rank
--------------------------------------------------------------------------------
/img_recognize/readme.txt:
--------------------------------------------------------------------------------
# Simple captcha recognition with Python

Covers the two most common kinds of captcha: one like captcha-1.jpg and one like captcha-2.jpg.

1. Dependencies: PIL, pytesseract, tesseract-ocr
2. Recommended to run on Linux.
3. Reference: https://my.oschina.net/jhao104/blog/647326

Note: the machine-learning side is not covered here; it will be analysed when a concrete case calls for it.
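A minimal sketch of the PIL + pytesseract pipeline the note above refers to: grey-scale, binarise, then OCR. The threshold value and the file name are illustrative, and the result depends heavily on how noisy the captcha is.

# -*- coding: utf-8 -*-
# Illustrative only: basic clean-up of a captcha image followed by OCR.
from PIL import Image
import pytesseract

img = Image.open('captcha-1.jpg').convert('L')       # grey-scale
img = img.point(lambda px: 255 if px > 140 else 0)   # simple threshold; 140 is a guess
text = pytesseract.image_to_string(img)
print(text.strip())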
--------------------------------------------------------------------------------
/love_food/love_food/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class LoveFoodItem(Item):
    restaurant = Field()
    star = Field()
    average_price = Field()
    foodtype = Field()
    addr = Field()

--------------------------------------------------------------------------------
/love_food/love_food/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # pick a random User-Agent for every outgoing request
        ua = random.choice(self.user_agent_list)
        if ua:
            print ua, '----------------------user_agent chosen-------------------'
            request.headers.setdefault('User-Agent', ua)

    # the default user_agent_list covers Chrome, IE, Firefox, Mozilla, Opera and Netscape
    # more user-agent strings: http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
        "Mozilla/5.0 (compatible; MSIE 9.0; AOL 9.7; AOLBuild 4343.19; Windows NT 6.1; WOW64; Trident/5.0; FunWebProducts)",
        "Mozilla/4.0 (compatible; MSIE 8.0; AOL 9.6; AOLBuild 4340.5004; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
    ]
--------------------------------------------------------------------------------
/love_food/love_food/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log


class LoveFoodPipeline(object):
    def __init__(self):
        client = MongoClient(host=settings['MONGODB_SERVER'], port=settings['MONGODB_PORT'])
        db = client[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem('Missing {0}!'.format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg('data added to mongodb database', level=log.DEBUG, spider=spider)

        return item

--------------------------------------------------------------------------------
/love_food/love_food/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for love_food project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'love_food'

SPIDER_MODULES = ['love_food.spiders']
NEWSPIDER_MODULE = 'love_food.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
COOKIES_ENABLED = False
DOWNLOAD_DELAY = 3
# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Save the output as a CSV file (disabled; MongoDB is used instead)
# FEED_URI = u'file:///D:/food_data.csv'
# FEED_FORMAT = 'csv'

# random User-Agent middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'love_food.middlewares.RotateUserAgentMiddleware': 400,
}

# MongoDB settings
ITEM_PIPELINES = {
    'love_food.pipelines.LoveFoodPipeline': 300,
}
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = 'mongo'
MONGODB_COLLECTION = 'dazongdianpin'
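With the pipeline and MongoDB settings above, scraped restaurants land in the dazongdianpin collection of the mongo database. A minimal sketch for inspecting the result after a crawl, assuming MongoDB is running locally on the default port; count() is fine on the pymongo versions contemporary with this project (newer versions prefer count_documents({})):

# -*- coding: utf-8 -*-
# Quick look at what LoveFoodPipeline stored; connection values mirror settings.py.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
collection = client['mongo']['dazongdianpin']

print('%d restaurants stored' % collection.count())
for doc in collection.find().limit(5):
    print('%s  %s' % (doc.get('restaurant'), doc.get('average_price')))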
--------------------------------------------------------------------------------
/love_food/love_food/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/love_food/love_food/spiders/foodspider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#author zhangr
#thanks to Lving
import scrapy
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import request, Request
from scrapy.selector import Selector
from love_food.items import LoveFoodItem  # the item class defined in items.py


class Food(CrawlSpider):
    name = "foodspider"
    redis_key = 'foodspider:start_urls'
    start_urls = ['http://www.dianping.com/search/category/418/10']
    url = 'http://www.dianping.com/search/category/418/10'

    def parse(self, response):
        item = LoveFoodItem()  # holds the fields of one shop
        selector = Selector(response)
        Foods = selector.xpath('//*[@id="shop-all-list"]/ul/li')
        for eachFood in Foods:
            restaurant = eachFood.xpath('div[2]/div[1]/a/h4/text()').extract()
            star = eachFood.xpath('div[2]/div[2]/span/@title').extract()
            average_price = eachFood.xpath('div[2]/div[2]/a[2]/b/text()').extract()
            foodtype = eachFood.xpath('div[2]/div[3]/a[1]/span/text()').extract()
            addr = eachFood.xpath('div[2]/div[3]/a[2]/span/text()').extract()
            if restaurant:
                print restaurant[0]
                item['restaurant'] = restaurant[0]
            else:
                item['restaurant'] = None
            if star:
                print star[0]
                item['star'] = star[0]
            else:
                item['star'] = None
            if average_price:
                print average_price[0]
                item['average_price'] = average_price[0]
            else:
                item['average_price'] = None
            if foodtype:
                print foodtype[0]
                item['foodtype'] = foodtype[0]
            else:
                item['foodtype'] = None
            if addr:
                print addr[0]
                item['addr'] = addr[0]
            else:
                item['addr'] = None

            yield item

        nextpage = selector.xpath('//*[@id="top"]/div[6]/div[3]/div[1]/div[2]/a/@href').extract()[-1]
        # the next-page anchor's index shifts between pages:
        # //*[@id="top"]/div[6]/div[3]/div[1]/div[2]/a[11]  page 1
        # //*[@id="top"]/div[6]/div[3]/div[1]/div[2]/a[12]  page 2
        # //*[@id="top"]/div[6]/div[3]/div[1]/div[2]/a[12]  page 3
        if nextpage:
            print nextpage, '*************************next page******************************'
            # slice off the URL prefix and join the remainder onto the base URL
            nextpage = nextpage[23:]
            yield Request(self.url + nextpage, callback=self.parse)
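The string slicing above (nextpage[23:]) hard-codes the length of the prefix it strips. A less brittle sketch of the same page-following step, reusing the spider's own XPath and assuming a Scrapy version that provides response.urljoin (1.0 or later), would end the parse method like this:

    def parse(self, response):
        # ... field extraction unchanged ...
        nextpage = response.xpath('//*[@id="top"]/div[6]/div[3]/div[1]/div[2]/a/@href').extract()
        if nextpage:
            # urljoin resolves relative or absolute hrefs against the current page URL
            yield Request(response.urljoin(nextpage[-1]), callback=self.parse)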
--------------------------------------------------------------------------------
/love_food/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = love_food.settings

[deploy]
#url = http://localhost:6800/
project = love_food