├── fun
│   ├── test.py
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── coser2.py
│   │   ├── coser.py
│   │   └── meizitu.py
│   ├── settings.py
│   ├── items.py
│   └── pipelines.py
├── run.py
├── scrapy.cfg
├── setup.py
├── README.md
├── .gitignore
└── worldcosplay.py
--------------------------------------------------------------------------------
/fun/test.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/fun/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

# Forwards the command-line arguments to Scrapy, so `python run.py crawl <spider>`
# behaves like `scrapy crawl <spider>` run from the project root.
from scrapy.cmdline import execute
execute()
--------------------------------------------------------------------------------
/fun/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = fun.settings

[deploy]
url = http://localhost:6800/
project = fun
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# Automatically created by: scrapy deploy

from setuptools import setup, find_packages

setup(
    name='fun_crawler',
    version='1.0',
    packages=find_packages(),
    entry_points={'scrapy': ['settings = fun.settings']},
    requires=['requests', 'scrapy'],
)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Image Crawlers

By [Bohan](https://github.com/ZhangBohan).

## Description

This project is for learning purposes only.

The goal is to crawl pretty pictures from various sites. Crawling of [Meizitu](http://www.meizitu.com/) is implemented so far, plus two spiders that crawl cosplay photos.

## Installation

> git clone https://github.com/ZhangBohan/fun_crawler.git
> cd fun_crawler
> sudo easy_install virtualenv
> virtualenv venv
> source venv/bin/activate
> python setup.py --requires | xargs pip install

## Usage

* Meizitu: `python run.py crawl meizitu`
* coser: `scrapy crawl coser -o items.csv -t csv`
* WorldCosplay: `python worldcosplay.py 53056`
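
`run.py` simply hands its arguments to Scrapy's command line, so `python run.py crawl meizitu` is equivalent to running `scrapy crawl meizitu` from the project root. A spider can also be started from plain Python; a minimal sketch, not part of this project, assuming a reasonably recent Scrapy and that it is run from the project directory:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load fun/settings.py and run the "meizitu" spider in-process.
process = CrawlerProcess(get_project_settings())
process.crawl('meizitu')
process.start()  # blocks until the crawl finishes
```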
--------------------------------------------------------------------------------
/fun/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for fun project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'fun'

SPIDER_MODULES = ['fun.spiders']
NEWSPIDER_MODULE = 'fun.spiders'

# Route scraped items through the custom image download pipeline.
ITEM_PIPELINES = {'fun.pipelines.ImageDownloadPipeline': 1}

IMAGES_STORE = '/tmp/images'


DOWNLOAD_DELAY = 0.25  # 250 ms of delay

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
--------------------------------------------------------------------------------
/fun/spiders/coser2.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.loader import ItemLoader
from fun.items import CoserItem


class CoserSpider(scrapy.Spider):
    name = "coser2"
    allowed_domains = ["bcy.net"]
    start_urls = (
        'http://bcy.net/coser/detail/9495/130440',
    )

    def parse(self, response):
        l = ItemLoader(item=CoserItem(), response=response)
        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        # Drop the /w650 suffix to request the full-size image.
        urls = [url.replace('/w650', '') for url in urls]
        l.add_value('image_urls', urls)
        l.add_value('url', response.url)
        return l.load_item()
--------------------------------------------------------------------------------
/fun/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import MapCompose, TakeFirst, Join


class MeizituItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    tags = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()


class CoserItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    info = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()


class MyItemLoader(ItemLoader):
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
--------------------------------------------------------------------------------
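MyItemLoader above is defined but not used by the spiders; its processors strip whitespace on input and collapse a list of values to a single value on output. A minimal sketch of that behaviour, with hypothetical values rather than data from the spiders:

    from fun.items import MeizituItem, MyItemLoader

    l = MyItemLoader(item=MeizituItem())
    l.add_value('name', u'  Some title  ')      # MapCompose(strip) cleans each value
    l.add_value('tags', [u'first', u'second'])  # TakeFirst keeps only the first value
    item = l.load_item()
    # item == {'name': u'Some title', 'tags': u'first'}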
/.gitignore:
--------------------------------------------------------------------------------
.idea

### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
--------------------------------------------------------------------------------
/fun/spiders/coser.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
import scrapy
from scrapy.contrib.loader import ItemLoader
from fun.items import CoserItem


class CoserSpider(scrapy.Spider):
    name = "coser"
    allowed_domains = ["bcy.net"]
    start_urls = (
        'http://bcy.net/cn125101',
        'http://bcy.net/cn126487',
        'http://bcy.net/cn126173'
    )

    def parse(self, response):
        sel = Selector(response)

        # Follow every work linked from the member's profile page.
        for link in sel.xpath("//ul[@class='js-articles l-works']/li[@class='l-work--big']/article[@class='work work--second-created']/h2[@class='work__title']/a/@href").extract():
            link = 'http://bcy.net%s' % link
            request = scrapy.Request(link, callback=self.parse_item)
            yield request

    def parse_item(self, response):
        l = ItemLoader(item=CoserItem(), response=response)
        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        # Drop the /w650 suffix to request the full-size image.
        urls = [url.replace('/w650', '') for url in urls]
        l.add_value('image_urls', urls)
        l.add_value('url', response.url)
        return l.load_item()
--------------------------------------------------------------------------------
/fun/spiders/meizitu.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
import scrapy
from scrapy.contrib.loader import ItemLoader, Identity
from fun.items import MeizituItem


class MeizituSpider(scrapy.Spider):
    name = "meizitu"
    allowed_domains = ["meizitu.com"]
    start_urls = (
        'http://www.meizitu.com/',
    )

    def parse(self, response):
        sel = Selector(response)
        # Each gallery is linked from an <h2> heading on the listing page.
        for link in sel.xpath('//h2/a/@href').extract():
            request = scrapy.Request(link, callback=self.parse_item)
            yield request

        # Pagination: queue the second-to-last link from the pager for another listing parse.
        pages = sel.xpath("//div[@class='navigation']/div[@id='wp_page_numbers']/ul/li/a/@href").extract()
        print('pages: %s' % pages)
        if len(pages) > 2:
            page_link = pages[-2]
            page_link = page_link.replace('/a/', '')
            request = scrapy.Request('http://www.meizitu.com/a/%s' % page_link, callback=self.parse)
            yield request

    def parse_item(self, response):
        l = ItemLoader(item=MeizituItem(), response=response)
        l.add_xpath('name', '//h2/a/text()')
        l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
        l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())

        l.add_value('url', response.url)
        return l.load_item()
--------------------------------------------------------------------------------
/worldcosplay.py:
--------------------------------------------------------------------------------
# coding=utf-8
import json
from sys import argv
import os
import urllib
import urllib2


def main(member_id, page=1, index=0):
    url = 'http://worldcosplay.net/en/api/member/photos?member_id=%s&page=%s&limit=100000&rows=16&p3_photo_list=1' % (member_id, page)
    r = urllib2.urlopen(url)

    if r.code == 200:
        data = json.loads(r.read())
        if data['has_error'] != 0:
            print u'The API returned an error'
            exit(1)

        photo_data_list = data['list']
        if not photo_data_list:
            print u'Nothing left. Stopped at page %s after downloading %s images.' % (page, index)
            exit(0)
        for photo_data in photo_data_list:
            url = photo_data['photo']['sq300_url']
            subject = photo_data['photo']['subject']
            # Drop the /sq300 suffix to get the full-size image and make the
            # subject safe to use in a file name.
            url = url.replace('/sq300', '')
            subject = subject.replace('/', '_')

            if not os.path.exists(member_id):
                os.makedirs(member_id)

            filename = '%s/%s_%s_%s.jpg' % (member_id, member_id, index, subject)
            try:
                urllib.urlretrieve(url=url, filename=filename)
                print u'Downloaded %s images' % (index + 1)
                index += 1
            except Exception:
                print(u'Failed to download this image: %s' % url)

        # Recurse into the next page of the member's photo list.
        page += 1
        main(member_id, page=page, index=index)

    else:
        print u'Request failed'
        exit(1)


if __name__ == '__main__':
    if len(argv) < 2:
        print(u'Please pass a coser ID, e.g. 53056')
        exit(1)
    member_id = argv[1]
    main(member_id)
--------------------------------------------------------------------------------
/fun/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import requests
from fun import settings
import os


class ImageDownloadPipeline(object):
    def process_item(self, item, spider):
        if 'image_urls' in item:
            images = []
            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)

            # Template for the requests call; only 'url' changes per image.
            request_data = {'allow_redirects': False,
                            'auth': None,
                            'cert': None,
                            'data': {},
                            'files': {},
                            'headers': {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'},
                            'method': 'get',
                            'params': {},
                            'proxies': {},
                            'stream': True,
                            'timeout': 30,
                            'url': '',
                            'verify': True}

            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            for image_url in item['image_urls']:
                request_data['url'] = image_url
                # Build a file name from the URL path segments.
                us = image_url.split('/')[3:]
                image_file_name = '_'.join(us)
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)
                if os.path.exists(file_path):
                    continue

                # Stream the image to disk in 1 KB chunks.
                with open(file_path, 'wb') as handle:
                    response = requests.request(**request_data)
                    for block in response.iter_content(1024):
                        if not block:
                            break

                        handle.write(block)

            item['images'] = images
        return item
--------------------------------------------------------------------------------
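A note on the pipeline above: it is enabled through the ITEM_PIPELINES setting in fun/settings.py and downloads each image_urls entry itself with requests. Scrapy also ships a built-in images pipeline that reuses the existing IMAGES_STORE setting; a minimal sketch of switching to it (requires Pillow; the import path shown matches the older scrapy.contrib layout this project targets, while newer releases expose it as scrapy.pipelines.images.ImagesPipeline):

    # fun/settings.py -- hypothetical alternative to ImageDownloadPipeline
    ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
    IMAGES_STORE = '/tmp/images'  # already set in this project's settings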