├── .DS_Store
├── .gitignore
├── README.md
├── images360
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── images.py
└── scrapy.cfg
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Python3WebSpider/Images360/cd077d04cf18b680ab392af1b674ce018dd174d1/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/images
.idea
*.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Images360

Download images from 360 Image Search (image.so.com) using Scrapy.
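The crawl is normally started from the repository root with `scrapy crawl images`; image files are saved under `./images` (`IMAGES_STORE`) and item metadata goes to the MongoDB instance named in `settings.py`, so MongoDB should be reachable at `MONGO_URI` first. As a minimal sketch (not part of the original README, and the file name `run.py` is just an assumed convention), the spider can also be launched programmatically:

```python
# run.py -- hypothetical helper, placed next to scrapy.cfg in the project root
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# get_project_settings() locates images360/settings.py through scrapy.cfg,
# so this script must be executed from the directory that contains scrapy.cfg.
process = CrawlerProcess(get_project_settings())
process.crawl('images')  # spider name defined in images360/spiders/images.py
process.start()          # blocks until the crawl finishes
```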
--------------------------------------------------------------------------------
/images360/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Python3WebSpider/Images360/cd077d04cf18b680ab392af1b674ce018dd174d1/images360/__init__.py
--------------------------------------------------------------------------------
/images360/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class ImageItem(Item):
    # 'collection' is the MongoDB collection name, 'table' the MySQL table name.
    collection = table = 'images'

    id = Field()
    url = Field()
    title = Field()
    thumb = Field()
--------------------------------------------------------------------------------
/images360/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class Images360SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/images360/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


import pymongo
import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class MongoPipeline(object):
    # Stores each item in the MongoDB collection named by ImageItem.collection.

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        name = item.collection
        # insert() was removed in pymongo 4; insert_one() is the supported call.
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()


class MysqlPipeline(object):
    # Writes each item into the MySQL table named by ImageItem.table.

    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # pymysql 1.0+ no longer accepts positional connection arguments,
        # so pass them as keywords.
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password, database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        data = dict(item)
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        # Table and column names cannot be parameterised, so they are interpolated
        # here; the values themselves go through the driver's placeholders.
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item


class ImagePipeline(ImagesPipeline):
    # Downloads the picture for each item and drops items whose download failed.

    def file_path(self, request, response=None, info=None, *, item=None):
        # Save each file under its original name instead of the default hash.
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image download failed')
        return item

    def get_media_requests(self, item, info):
        yield Request(item['url'])
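`MysqlPipeline` is disabled by default in `ITEM_PIPELINES`, and nothing in the project creates the `images` table it writes to (the name comes from `ImageItem.table`). The sketch below is an editor's assumption rather than part of the repository: it creates a schema matching the four item fields, with column types and lengths that are guesses.

```python
# init_mysql.py -- hypothetical one-off setup script; mirrors the MYSQL_* settings
import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456',
                     port=3306, charset='utf8')
cursor = db.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS images360 DEFAULT CHARACTER SET utf8')
cursor.execute(
    'CREATE TABLE IF NOT EXISTS images360.images ('
    '  id VARCHAR(64) NOT NULL PRIMARY KEY,'  # assumed: API ids are short strings
    '  url VARCHAR(255),'
    '  title VARCHAR(255),'
    '  thumb VARCHAR(255)'
    ')'
)
db.close()
```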
--------------------------------------------------------------------------------
/images360/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for images360 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'images360'

SPIDER_MODULES = ['images360.spiders']
NEWSPIDER_MODULE = 'images360.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'images360 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'images360.middlewares.Images360SpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'images360.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'images360.pipelines.ImagePipeline': 300,
    'images360.pipelines.MongoPipeline': 301,
    # 'images360.pipelines.MysqlPipeline': 302,
}

IMAGES_STORE = './images'

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Number of result pages the spider requests (the API returns 30 images per page).
MAX_PAGE = 50

MONGO_URI = 'localhost'
MONGO_DB = 'images360'

MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'images360'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_PORT = 3306
--------------------------------------------------------------------------------
/images360/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/images360/spiders/images.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import json
from urllib.parse import urlencode

from scrapy import Spider, Request

from images360.items import ImageItem


class ImagesSpider(Spider):
    name = 'images'
    # The API lives on image.so.com; listing only images.so.com here would make
    # the offsite middleware drop every request generated in start_requests().
    allowed_domains = ['image.so.com']
    start_urls = ['https://image.so.com/']

    def start_requests(self):
        data = {'ch': 'photography', 'listtype': 'new'}
        base_url = 'https://image.so.com/zjl?'
        for page in range(1, self.settings.get('MAX_PAGE') + 1):
            data['sn'] = page * 30  # each page of results starts 30 images later
            params = urlencode(data)
            url = base_url + params
            yield Request(url, self.parse)

    def parse(self, response):
        result = json.loads(response.text)
        for image in result.get('list') or []:
            item = ImageItem()
            item['id'] = image.get('id')
            item['url'] = image.get('qhimg_url')
            item['title'] = image.get('title')
            item['thumb'] = image.get('qhimg_thumb')
            self.logger.debug('Scraped item: %s', item)
            yield item
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = images360.settings

[deploy]
#url = http://localhost:6800/
project = images360
--------------------------------------------------------------------------------
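Because `parse()` only decodes the JSON body of each response, it can be smoke-tested offline with a hand-built response. The field names (`list`, `id`, `qhimg_url`, `title`, `qhimg_thumb`) are taken from the spider itself; the sample values below are placeholders, not real API output, and the script is an editor's sketch rather than part of the repository.

```python
# parse_smoke_test.py -- assumed standalone check of ImagesSpider.parse
import json

from scrapy.http import Request, TextResponse

from images360.spiders.images import ImagesSpider

# Payload shaped like the fields parse() reads; every value is made up.
payload = {
    'list': [{
        'id': '0001',
        'qhimg_url': 'https://example.com/full.jpg',
        'title': 'placeholder title',
        'qhimg_thumb': 'https://example.com/thumb.jpg',
    }]
}

url = 'https://image.so.com/zjl?ch=photography&listtype=new&sn=30'
response = TextResponse(url=url, body=json.dumps(payload), encoding='utf-8',
                        request=Request(url))

for item in ImagesSpider().parse(response):
    print(item['id'], item['url'], item['title'], item['thumb'])
```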