├── .DS_Store
├── .gitignore
├── README.md
├── images360
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── images.py
└── scrapy.cfg
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Python3WebSpider/Images360/cd077d04cf18b680ab392af1b674ce018dd174d1/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/images
.idea
*.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Images360

Download images from 360 Image Search (image.so.com) using Scrapy.
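The crawl is normally started from the repository root with `scrapy crawl images`; image files are saved under `./images` (`IMAGES_STORE`) and item metadata goes to the MongoDB instance named in `settings.py`, so MongoDB should be reachable at `MONGO_URI` first. As a minimal sketch (not part of the original README, and the file name `run.py` is just an assumed convention), the spider can also be launched programmatically:

```python
# run.py -- hypothetical helper, placed next to scrapy.cfg in the project root
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# get_project_settings() locates images360/settings.py through scrapy.cfg,
# so this script must be executed from the directory that contains scrapy.cfg.
process = CrawlerProcess(get_project_settings())
process.crawl('images')  # spider name defined in images360/spiders/images.py
process.start()          # blocks until the crawl finishes
```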
--------------------------------------------------------------------------------
/images360/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Python3WebSpider/Images360/cd077d04cf18b680ab392af1b674ce018dd174d1/images360/__init__.py
--------------------------------------------------------------------------------
/images360/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class ImageItem(Item):
    # 'collection' is the MongoDB collection name, 'table' the MySQL table name.
    collection = table = 'images'

    id = Field()
    url = Field()
    title = Field()
    thumb = Field()
--------------------------------------------------------------------------------
/images360/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class Images360SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/images360/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


import pymongo
import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class MongoPipeline(object):
    # Stores each item in the MongoDB collection named by ImageItem.collection.

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        name = item.collection
        # insert() was removed in pymongo 4; insert_one() is the supported call.
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()


class MysqlPipeline(object):
    # Writes each item into the MySQL table named by ImageItem.table.

    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # pymysql 1.0+ no longer accepts positional connection arguments,
        # so pass them as keywords.
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password, database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        data = dict(item)
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        # Table and column names cannot be parameterised, so they are interpolated
        # here; the values themselves go through the driver's placeholders.
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item


class ImagePipeline(ImagesPipeline):
    # Downloads the picture for each item and drops items whose download failed.

    def file_path(self, request, response=None, info=None, *, item=None):
        # Save each file under its original name instead of the default hash.
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image download failed')
        return item

    def get_media_requests(self, item, info):
        yield Request(item['url'])
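`MysqlPipeline` is disabled by default in `ITEM_PIPELINES`, and nothing in the project creates the `images` table it writes to (the name comes from `ImageItem.table`). The sketch below is an editor's assumption rather than part of the repository: it creates a schema matching the four item fields, with column types and lengths that are guesses.

```python
# init_mysql.py -- hypothetical one-off setup script; mirrors the MYSQL_* settings
import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456',
                     port=3306, charset='utf8')
cursor = db.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS images360 DEFAULT CHARACTER SET utf8')
cursor.execute(
    'CREATE TABLE IF NOT EXISTS images360.images ('
    '  id VARCHAR(64) NOT NULL PRIMARY KEY,'  # assumed: API ids are short strings
    '  url VARCHAR(255),'
    '  title VARCHAR(255),'
    '  thumb VARCHAR(255)'
    ')'
)
db.close()
```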
--------------------------------------------------------------------------------
/images360/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for images360 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'images360'

SPIDER_MODULES = ['images360.spiders']
NEWSPIDER_MODULE = 'images360.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'images360 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'images360.middlewares.Images360SpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'images360.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'images360.pipelines.ImagePipeline': 300,
    'images360.pipelines.MongoPipeline': 301,
    # 'images360.pipelines.MysqlPipeline': 302,
}

IMAGES_STORE = './images'

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Number of result pages the spider requests (the API returns 30 images per page).
MAX_PAGE = 50

MONGO_URI = 'localhost'
MONGO_DB = 'images360'

MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'images360'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_PORT = 3306
--------------------------------------------------------------------------------
/images360/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/images360/spiders/images.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import json
from urllib.parse import urlencode

from scrapy import Spider, Request

from images360.items import ImageItem


class ImagesSpider(Spider):
    name = 'images'
    # The API lives on image.so.com; listing only images.so.com here would make
    # the offsite middleware drop every request generated in start_requests().
    allowed_domains = ['image.so.com']
    start_urls = ['https://image.so.com/']

    def start_requests(self):
        data = {'ch': 'photography', 'listtype': 'new'}
        base_url = 'https://image.so.com/zjl?'
        for page in range(1, self.settings.get('MAX_PAGE') + 1):
            data['sn'] = page * 30  # each page of results starts 30 images later
            params = urlencode(data)
            url = base_url + params
            yield Request(url, self.parse)

    def parse(self, response):
        result = json.loads(response.text)
        for image in result.get('list') or []:
            item = ImageItem()
            item['id'] = image.get('id')
            item['url'] = image.get('qhimg_url')
            item['title'] = image.get('title')
            item['thumb'] = image.get('qhimg_thumb')
            self.logger.debug('Scraped item: %s', item)
            yield item
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = images360.settings

[deploy]
#url = http://localhost:6800/
project = images360
--------------------------------------------------------------------------------
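Because `parse()` only decodes the JSON body of each response, it can be smoke-tested offline with a hand-built response. The field names (`list`, `id`, `qhimg_url`, `title`, `qhimg_thumb`) are taken from the spider itself; the sample values below are placeholders, not real API output, and the script is an editor's sketch rather than part of the repository.

```python
# parse_smoke_test.py -- assumed standalone check of ImagesSpider.parse
import json

from scrapy.http import Request, TextResponse

from images360.spiders.images import ImagesSpider

# Payload shaped like the fields parse() reads; every value is made up.
payload = {
    'list': [{
        'id': '0001',
        'qhimg_url': 'https://example.com/full.jpg',
        'title': 'placeholder title',
        'qhimg_thumb': 'https://example.com/thumb.jpg',
    }]
}

url = 'https://image.so.com/zjl?ch=photography&listtype=new&sn=30'
response = TextResponse(url=url, body=json.dumps(payload), encoding='utf-8',
                        request=Request(url))

for item in ImagesSpider().parse(response):
    print(item['id'], item['url'], item['title'], item['thumb'])
```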