├── fun
│   ├── test.py
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── coser2.py
│   │   ├── coser.py
│   │   └── meizitu.py
│   ├── settings.py
│   ├── items.py
│   └── pipelines.py
├── run.py
├── scrapy.cfg
├── setup.py
├── README.md
├── .gitignore
└── worldcosplay.py
--------------------------------------------------------------------------------
/fun/test.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/fun/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

# Forwards the command-line arguments to Scrapy, so `python run.py crawl <spider>`
# behaves like `scrapy crawl <spider>` run from the project root.
from scrapy.cmdline import execute
execute()
--------------------------------------------------------------------------------
/fun/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = fun.settings

[deploy]
url = http://localhost:6800/
project = fun
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# Automatically created by: scrapy deploy

from setuptools import setup, find_packages

setup(
    name='fun_crawler',
    version='1.0',
    packages=find_packages(),
    entry_points={'scrapy': ['settings = fun.settings']},
    requires=['requests', 'scrapy'],
)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Image Crawlers

By [Bohan](https://github.com/ZhangBohan).

## Description

This project is for learning purposes only.

The goal is to crawl pretty pictures from various sites. Crawling of [Meizitu](http://www.meizitu.com/) is implemented so far, plus two spiders that crawl cosplay photos.

## Installation

> git clone https://github.com/ZhangBohan/fun_crawler.git
> cd fun_crawler
> sudo easy_install virtualenv
> virtualenv venv
> source venv/bin/activate
> python setup.py --requires | xargs pip install

## Usage

* Meizitu: `python run.py crawl meizitu`
* coser: `scrapy crawl coser -o items.csv -t csv`
* WorldCosplay: `python worldcosplay.py 53056`
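
`run.py` simply hands its arguments to Scrapy's command line, so `python run.py crawl meizitu` is equivalent to running `scrapy crawl meizitu` from the project root. A spider can also be started from plain Python; a minimal sketch, not part of this project, assuming a reasonably recent Scrapy and that it is run from the project directory:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load fun/settings.py and run the "meizitu" spider in-process.
process = CrawlerProcess(get_project_settings())
process.crawl('meizitu')
process.start()  # blocks until the crawl finishes
```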
--------------------------------------------------------------------------------
/fun/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for fun project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'fun'

SPIDER_MODULES = ['fun.spiders']
NEWSPIDER_MODULE = 'fun.spiders'

# Route scraped items through the custom image download pipeline.
ITEM_PIPELINES = {'fun.pipelines.ImageDownloadPipeline': 1}

IMAGES_STORE = '/tmp/images'


DOWNLOAD_DELAY = 0.25  # 250 ms of delay

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
--------------------------------------------------------------------------------
/fun/spiders/coser2.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.loader import ItemLoader
from fun.items import CoserItem


class CoserSpider(scrapy.Spider):
    name = "coser2"
    allowed_domains = ["bcy.net"]
    start_urls = (
        'http://bcy.net/coser/detail/9495/130440',
    )

    def parse(self, response):
        l = ItemLoader(item=CoserItem(), response=response)
        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        # Drop the /w650 suffix to request the full-size image.
        urls = [url.replace('/w650', '') for url in urls]
        l.add_value('image_urls', urls)
        l.add_value('url', response.url)
        return l.load_item()
--------------------------------------------------------------------------------
/fun/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import MapCompose, TakeFirst, Join


class MeizituItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    tags = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()


class CoserItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    info = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()


class MyItemLoader(ItemLoader):
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
--------------------------------------------------------------------------------
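MyItemLoader above is defined but not used by the spiders; its processors strip whitespace on input and collapse a list of values to a single value on output. A minimal sketch of that behaviour, with hypothetical values rather than data from the spiders:

    from fun.items import MeizituItem, MyItemLoader

    l = MyItemLoader(item=MeizituItem())
    l.add_value('name', u'  Some title  ')      # MapCompose(strip) cleans each value
    l.add_value('tags', [u'first', u'second'])  # TakeFirst keeps only the first value
    item = l.load_item()
    # item == {'name': u'Some title', 'tags': u'first'}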
/.gitignore:
--------------------------------------------------------------------------------
.idea

### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
--------------------------------------------------------------------------------
/fun/spiders/coser.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
import scrapy
from scrapy.contrib.loader import ItemLoader
from fun.items import CoserItem


class CoserSpider(scrapy.Spider):
    name = "coser"
    allowed_domains = ["bcy.net"]
    start_urls = (
        'http://bcy.net/cn125101',
        'http://bcy.net/cn126487',
        'http://bcy.net/cn126173'
    )

    def parse(self, response):
        sel = Selector(response)

        # Follow every work linked from the member's profile page.
        for link in sel.xpath("//ul[@class='js-articles l-works']/li[@class='l-work--big']/article[@class='work work--second-created']/h2[@class='work__title']/a/@href").extract():
            link = 'http://bcy.net%s' % link
            request = scrapy.Request(link, callback=self.parse_item)
            yield request

    def parse_item(self, response):
        l = ItemLoader(item=CoserItem(), response=response)
        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        # Drop the /w650 suffix to request the full-size image.
        urls = [url.replace('/w650', '') for url in urls]
        l.add_value('image_urls', urls)
        l.add_value('url', response.url)
        return l.load_item()
--------------------------------------------------------------------------------
/fun/spiders/meizitu.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
import scrapy
from scrapy.contrib.loader import ItemLoader, Identity
from fun.items import MeizituItem


class MeizituSpider(scrapy.Spider):
    name = "meizitu"
    allowed_domains = ["meizitu.com"]
    start_urls = (
        'http://www.meizitu.com/',
    )

    def parse(self, response):
        sel = Selector(response)
        # Each gallery is linked from an <h2> heading on the listing page.
        for link in sel.xpath('//h2/a/@href').extract():
            request = scrapy.Request(link, callback=self.parse_item)
            yield request

        # Pagination: queue the second-to-last link from the pager for another listing parse.
        pages = sel.xpath("//div[@class='navigation']/div[@id='wp_page_numbers']/ul/li/a/@href").extract()
        print('pages: %s' % pages)
        if len(pages) > 2:
            page_link = pages[-2]
            page_link = page_link.replace('/a/', '')
            request = scrapy.Request('http://www.meizitu.com/a/%s' % page_link, callback=self.parse)
            yield request

    def parse_item(self, response):
        l = ItemLoader(item=MeizituItem(), response=response)
        l.add_xpath('name', '//h2/a/text()')
        l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
        l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())

        l.add_value('url', response.url)
        return l.load_item()
--------------------------------------------------------------------------------
/worldcosplay.py:
--------------------------------------------------------------------------------
# coding=utf-8
import json
from sys import argv
import os
import urllib
import urllib2


def main(member_id, page=1, index=0):
    url = 'http://worldcosplay.net/en/api/member/photos?member_id=%s&page=%s&limit=100000&rows=16&p3_photo_list=1' % (member_id, page)
    r = urllib2.urlopen(url)

    if r.code == 200:
        data = json.loads(r.read())
        if data['has_error'] != 0:
            print u'The API returned an error'
            exit(1)

        photo_data_list = data['list']
        if not photo_data_list:
            print u'Nothing left. Stopped at page %s after downloading %s images.' % (page, index)
            exit(0)
        for photo_data in photo_data_list:
            url = photo_data['photo']['sq300_url']
            subject = photo_data['photo']['subject']
            # Drop the /sq300 suffix to get the full-size image and make the
            # subject safe to use in a file name.
            url = url.replace('/sq300', '')
            subject = subject.replace('/', '_')

            if not os.path.exists(member_id):
                os.makedirs(member_id)

            filename = '%s/%s_%s_%s.jpg' % (member_id, member_id, index, subject)
            try:
                urllib.urlretrieve(url=url, filename=filename)
                print u'Downloaded %s images' % (index + 1)
                index += 1
            except Exception:
                print(u'Failed to download this image: %s' % url)

        # Recurse into the next page of the member's photo list.
        page += 1
        main(member_id, page=page, index=index)

    else:
        print u'Request failed'
        exit(1)


if __name__ == '__main__':
    if len(argv) < 2:
        print(u'Please pass a coser ID, e.g. 53056')
        exit(1)
    member_id = argv[1]
    main(member_id)
--------------------------------------------------------------------------------
/fun/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import requests
from fun import settings
import os


class ImageDownloadPipeline(object):
    def process_item(self, item, spider):
        if 'image_urls' in item:
            images = []
            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)

            # Template for the requests call; only 'url' changes per image.
            request_data = {'allow_redirects': False,
                            'auth': None,
                            'cert': None,
                            'data': {},
                            'files': {},
                            'headers': {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'},
                            'method': 'get',
                            'params': {},
                            'proxies': {},
                            'stream': True,
                            'timeout': 30,
                            'url': '',
                            'verify': True}

            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            for image_url in item['image_urls']:
                request_data['url'] = image_url
                # Build a file name from the URL path segments.
                us = image_url.split('/')[3:]
                image_file_name = '_'.join(us)
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)
                if os.path.exists(file_path):
                    continue

                # Stream the image to disk in 1 KB chunks.
                with open(file_path, 'wb') as handle:
                    response = requests.request(**request_data)
                    for block in response.iter_content(1024):
                        if not block:
                            break

                        handle.write(block)

            item['images'] = images
        return item
--------------------------------------------------------------------------------
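A note on the pipeline above: it is enabled through the ITEM_PIPELINES setting in fun/settings.py and downloads each image_urls entry itself with requests. Scrapy also ships a built-in images pipeline that reuses the existing IMAGES_STORE setting; a minimal sketch of switching to it (requires Pillow; the import path shown matches the older scrapy.contrib layout this project targets, while newer releases expose it as scrapy.pipelines.images.ImagesPipeline):

    # fun/settings.py -- hypothetical alternative to ImageDownloadPipeline
    ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
    IMAGES_STORE = '/tmp/images'  # already set in this project's settings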