├── huaban ├── huaban │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── UserBoardsSpider.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ └── settings.py ├── run.py └── scrapy.cfg ├── vmgirls ├── vmgirls │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── vmgirl.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── middlewares.py ├── run.py └── scrapy.cfg ├── requirements.txt ├── .gitignore ├── capturer ├── main.py ├── LICENSE ├── telegram └── telegram_spider.py ├── README.md ├── fabiaoqing └── fabiaoqing_spider.py ├── netbian └── netbian_spider.py ├── sina └── sina_spider.py ├── toutiao └── toutiao_spider.py ├── lofter └── lofter_spider.py └── qqzone └── qqzone_spider.py /huaban/huaban/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huaban/run.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | cmdline.execute('scrapy crawl UserBoardSpider'.split()) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lxml==4.6.5 2 | selenium==3.141.0 3 | Scrapy==2.5.1 4 | requests==2.20.0 5 | beautifulsoup4==4.9.1 6 | Pillow==8.3.2 7 | -------------------------------------------------------------------------------- /vmgirls/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from scrapy import cmdline 5 | 6 | cmdline.execute('scrapy crawl vmgirl'.split()) 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | *.swp 3 | *.html 4 | *.old 5 | *.log 6 | download/ 7 | build/ 8 | dist/ 9 | *.spec 10 | __pycache__/ 11 | .DS_Store 12 | */.DS_Store 13 | *.session 14 | -------------------------------------------------------------------------------- /huaban/huaban/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /capturer: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # author: 5km(smslit) 3 | # date: 2018.06.27 4 | # description: 5 | # a shell script to run main.py 6 | # this script can solve the problem 7 | # that python3 may be installed in a different path 8 | 9 | python3 ./main.py 10 | -------------------------------------------------------------------------------- /huaban/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = huaban.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = huaban 12 | -------------------------------------------------------------------------------- /vmgirls/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = vmgirls.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = vmgirls 12 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item 9 | from scrapy.item import Field 10 | 11 | 12 | class VmgirlsItem(Item): 13 | # define the fields for your item here like: 14 | # name = scrapy.Field() 15 | theme_urls = Field() 16 | theme_titles = Field() 17 | pass 18 | 19 | 20 | class VmgirlsImagesItem(Item): 21 | image_urls = Field() 22 | title = Field() 23 | pass 24 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # author: litreily 3 | # date: 2018.03.06 4 | # description: capture pictures from websites 5 | 6 | from importlib import import_module 7 | 8 | if __name__ == '__main__': 9 | webs = { 10 | '1': "sina", 11 | '2': "lofter", 12 | '3': "toutiao", 13 | '4': "qqzone", 14 | '5': 'telegram', 15 | '6': 'netbian' 16 | } 17 | 18 | tips = 'please select the web you want to capture (1-{0}, default=1)\n'.format(len(webs)) + \ 19 | ''.join(["\t{0} - {1}\n".format(i, webs.get(i)) for i in webs]) + \ 20 | 'You want to capture from: ' 21 | 22 | select = input(tips) 23 | if select not in webs: 24 | select = '1' 25 | 26 | module = "{0}.{0}_spider".format(webs.get(select)) 27 | spider = import_module(module) 28 | spider.main() 29 | -------------------------------------------------------------------------------- /huaban/huaban/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item 9 | from scrapy.item import Field 10 | 11 | 12 | class HuabanItem(Item): 13 | # define the fields for your item here 
like: 14 | # name = scrapy.Field() 15 | pass 16 | 17 | 18 | class BoardItem(Item): 19 | # define the fields for your item here like: 20 | # name = Field() 21 | title = Field() 22 | board_id = Field() 23 | category_id = Field() 24 | pin_count = Field() 25 | follow_count = Field() 26 | like_count = Field() 27 | pass 28 | 29 | 30 | class PinItem(Item): 31 | pin_id = Field() 32 | board_id = Field() 33 | board_title = Field() 34 | file_id = Field() 35 | file_key = Field() 36 | source = Field() 37 | tags = Field() 38 | pass 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 litreily@outlook.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/spiders/vmgirl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | from vmgirls.items import VmgirlsItem 5 | from vmgirls.items import VmgirlsImagesItem 6 | 7 | from scrapy.http import Request 8 | from scrapy.utils.project import get_project_settings 9 | 10 | import os 11 | 12 | 13 | class VmgirlSpider(scrapy.Spider): 14 | name = 'vmgirl' 15 | allowed_domains = ['vmgirls.com'] 16 | start_urls = ['https://www.vmgirls.com/sitemap.shtml/'] 17 | 18 | def __init__(self): 19 | settings = get_project_settings() 20 | self.user_data_dir = settings.get('USER_DATA_DIR') 21 | 22 | def parse(self, response): 23 | '''Parse sitemap''' 24 | urls = response.xpath('//*[@id="content"][1]/ul/li/a/@href').extract() 25 | titles = response.xpath( 26 | '//*[@id="content"][1]/ul/li/a/text()').extract() 27 | 28 | item = VmgirlsItem() 29 | item['theme_urls'] = urls 30 | item['theme_titles'] = titles 31 | yield item 32 | 33 | for url, title in zip(urls, titles): 34 | save_path = os.path.join(self.user_data_dir, title) 35 | if not os.path.isdir(save_path): 36 | os.makedirs(save_path) 37 | 38 | yield Request(url, meta={'title': title}, callback=self.parse_page) 39 | 40 | def parse_page(self, response): 41 | '''Parse each page of girls''' 42 | urls = response.xpath( 43 | '//*[@class="post-content"]//img/@data-src').extract() 44 | item = VmgirlsImagesItem() 45 | item['image_urls'] = urls 46 | item['title'] = response.meta['title'] 47 | yield item 48 | -------------------------------------------------------------------------------- /telegram/telegram_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | '''Capture pictures from a telegram channel 4 | The parameters below need to be replaced with your own: 5 | - api_id 6 | - api_hash 7 | - proxy 8 | Notice: 9 | The first time you may need to enter your phone number and a login code 10 | ''' 11 | 12 | import os 13 | import sys 14 | import socks 15 | 16 | from telethon import TelegramClient, sync 17 | from telethon.tl.types import InputMessagesFilterPhotos 18 | from telethon.helpers import TotalList 19 | 20 | 21 | def get_path(channel): 22 | home_path = os.path.expanduser('~') 23 | path = os.path.join(home_path, 'Pictures/python/telegram', channel) 24 | if not os.path.isdir(path): 25 | os.makedirs(path) 26 | return path 27 | 28 | 29 | def open_client(): 30 | # get api_id and api_hash from https://my.telegram.org/apps 31 | api_id = None 32 | api_hash = None 33 | 34 | if not api_id or not api_hash: 35 | print('Please set api_id and api_hash, you can get them from https://my.telegram.org/apps') 36 | sys.exit(1) 37 | 38 | # socks5 proxy, can be set to None if not needed 39 | proxy = (socks.SOCKS5, "localhost", 1080) 40 | return TelegramClient('tg_session', api_id=api_id, 41 | api_hash=api_hash, proxy=proxy).start() 42 | 43 | 44 | def get_photos(client, channel): 45 | tg_link = "https://t.me/" + channel 46 | 47 | # get photos 48 | print('Getting photos from ' + tg_link) 49 | return client.get_messages(tg_link, None, filter=InputMessagesFilterPhotos) 50 | 51 | 52 | def main(): 53 | tg_client = open_client() 54 | tg_channel = input('Please input telegram channel name: ') 55 | 56 | photos = get_photos(tg_client, tg_channel) 57 | total = photos.total 58 | 59 | save_path = get_path(tg_channel) 60 | 61 | print('Start downloading 
photos...') 62 | index = 0 63 | for photo in photos: 64 | filename = os.path.join(save_path, str(photo.id) + '.jpg') 65 | index = index + 1 66 | print("downloading {}/{} : {}".format(index, total, filename)) 67 | tg_client.download_media(photo, filename) 68 | 69 | tg_client.disconnect() 70 | print("Done.") 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # What's Capturer 2 | 3 | A capture tool used to capture pictures from websites like Sina, LOFTER, huaban and so on. 4 | 5 | > **If you have any suggestions or awesome picture websites you want to capture, please let me know!!!** 6 | 7 | ## Support Websites 8 | 9 | - [Sina](https://weibo.com/) 10 | - [Lofter](http://www.lofter.com/) 11 | - [Toutiao](https://www.toutiao.com) 12 | - ~~[QQZone](https://qzone.qq.com/)~~: Needs captcha verification 13 | - [Huaban](https://huaban.com/) 14 | - ~~[Vmgirls](https://www.vmgirls.com/)~~: Website has been upgraded 15 | - [Fabiaoqing](https://www.fabiaoqing.com/) 16 | - `telegram` 17 | - [NetBian](http://pic.netbian.com/) 18 | 19 | ## How to use 20 | 21 | - install `python3` and the libs listed in `requirements.txt` 22 | - update the [Parameters](#parameters) of each website you want to capture 23 | - run `./capturer` or run `main.py` or `***_spider.py` to capture images from 24 | - `sina` 25 | - `lofter` 26 | - `toutiao` 27 | - `qqzone` 28 | - `telegram` 29 | - `netbian` 30 | - run `huaban/run.py` to capture images from `huaban` 31 | - run `vmgirls/run.py` to capture images from `vmgirls` 32 | - run `fabiaoqing/fabiaoqing_spider.py key1 [key2] [key3] ...` 33 | 34 | ## Notices 35 | 36 | Almost all file paths are based on `~/Pictures/python`, where `~` means your home directory. 37 | 38 | ## Parameters 39 | 40 | ### huaban 41 | 42 | - `USERNAME`: username of the huaban account you want to capture 43 | - `ROOT_DIR`: directory where the images are stored 44 | 45 | ### Sina 46 | 47 | - `uid`: user id (10 digits) of the sina weibo account you want to capture 48 | - `cookies`: your cookies after logging in to sina weibo 49 | - `path`: directory to save the pictures 50 | 51 | ### Lofter 52 | 53 | - `username`: username of the lofter account you want to capture 54 | - `path`: directory to save the pictures, see the function `_get_path` in `lofter_spider.py` 55 | - `query_number`: number of blogs in each query packet, default value is 40 56 | 57 | ### Telegram 58 | 59 | - `api_id`: you can get it from <https://my.telegram.org/apps> 60 | - `api_hash`: you can get it from <https://my.telegram.org/apps> 61 | - `socks proxy`: set proxy ip and port, default is `localhost:1080` 62 | 63 | ## Blogs 64 | 65 | You can find all the related blogs at <https://www.litreily.top>. 
66 | 67 | - Lofter - [爬取网易LOFTER图片](https://www.litreily.top/2018/03/17/lofter/) 68 | - Sina - [爬取新浪微博用户图片](https://www.litreily.top/2018/04/10/sina/) 69 | - qqzone - [爬取QQ空间相册](https://www.litreily.top/2019/03/03/qqzone/) 70 | - Vmgirls - [Scrapy爬取vmgirls](https://www.litreily.top/2019/08/09/vmgirls/) 71 | - Netbian - [爬取彼岸图网美图](https://www.litreily.top/2020/08/09/netbian/) 72 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.exporters import JsonLinesItemExporter 9 | from scrapy.pipelines.images import ImagesPipeline 10 | from scrapy.exceptions import DropItem 11 | from scrapy.http import Request 12 | 13 | from vmgirls.items import VmgirlsItem 14 | from vmgirls.items import VmgirlsImagesItem 15 | 16 | import os 17 | 18 | 19 | class VmgirlsPipeline(object): 20 | '''Pipeline for every url of one theme, save theme info to json file''' 21 | 22 | def __init__(self, user_data_dir): 23 | '''Open file to save the exported Items''' 24 | self.user_data_dir = user_data_dir 25 | 26 | if not os.path.isdir(self.user_data_dir): 27 | os.makedirs(self.user_data_dir) 28 | 29 | @classmethod 30 | def from_crawler(cls, crawler): 31 | '''Get user dir from global settings.py''' 32 | settings = crawler.settings 33 | return cls(settings.get('USER_DATA_DIR')) 34 | 35 | def process_item(self, item, spider): 36 | '''Save item info to loacl file''' 37 | if isinstance(item, VmgirlsItem): 38 | self.girls_info = open( 39 | os.path.join(self.user_data_dir, 'vmgirls.json'), 'w+b') 40 | self.girls_exporter = JsonLinesItemExporter( 41 | self.girls_info, encoding='utf-8', indent=4) 42 | 43 | self.girls_exporter.start_exporting() 44 | 45 | for url, title in zip(item['theme_urls'], item['theme_titles']): 46 | single_item = {'theme_url': url, 'title': title} 47 | self.girls_exporter.export_item(single_item) 48 | 49 | self.girls_exporter.finish_exporting() 50 | self.girls_info.close() 51 | return item 52 | 53 | 54 | class VmgirlsImagesPipeline(ImagesPipeline): 55 | '''Get images from one theme''' 56 | 57 | def get_media_requests(self, item, info): 58 | if isinstance(item, VmgirlsImagesItem): 59 | for image_url in item['image_urls']: 60 | yield Request(image_url, meta={'item': item}) 61 | 62 | def file_path(self, request, response=None, info=None): 63 | '''Set image dir to IMAGES_STORE/title/base_url''' 64 | url = request.url 65 | item = request.meta['item'] 66 | path = os.path.join(item['title'], url.split('/')[-1]) 67 | return path 68 | 69 | def item_completed(self, results, item, info): 70 | if isinstance(item, VmgirlsImagesItem): 71 | image_paths = [x['path'] for ok, x in results if ok] 72 | 73 | if not image_paths: 74 | raise DropItem("Item contains no images") 75 | return item 76 | -------------------------------------------------------------------------------- /fabiaoqing/fabiaoqing_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # author: litreily 4 | # date: 2019.09.15 5 | '''按关键词爬取发表情网fabiaoqing.com的表情包''' 6 | 7 | import requests 8 | import os 9 | import sys 10 | from lxml import html 11 | 12 | 13 | base_url = 
'https://fabiaoqing.com/search/search/keyword/' 14 | headers = { 15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36' 16 | } 17 | 18 | 19 | def get_path(keyword): 20 | '''生成指定关键词对应的表情包存储路径''' 21 | home_path = os.path.expanduser('~') 22 | path = os.path.join(home_path, 'Pictures/python/表情包/' + keyword) 23 | if not os.path.isdir(path): 24 | os.makedirs(path) 25 | 26 | return os.path.realpath(path) 27 | 28 | 29 | def get_imgs(keyword): 30 | '''爬取某一个关键词相关的所有表情包 31 | 32 | Args: 33 | keyword: 表情包关键词 34 | ''' 35 | page_index = 0 36 | img_cnts = 0 37 | save_dir = get_path(keyword) 38 | while True: 39 | page_index = page_index + 1 40 | # https://fabiaoqing.com/search/search/keyword/抱抱/type/bq/page/1.html 41 | url = '{}{}/type/bq/page/{}.html'.format(base_url, keyword, page_index) 42 | response = requests.get(url, headers=headers).content 43 | page = html.fromstring(response) 44 | imgs = page.xpath( 45 | '//div[@class="searchbqppdiv tagbqppdiv"]//img/@data-original') 46 | 47 | print('爬取 "{}" 相关表情包第 {} 页:'.format(keyword, page_index)) 48 | img_cnts = download_imgs(imgs, img_cnts, save_dir) 49 | 50 | if page_index == 20 or len(imgs) == 0: 51 | break 52 | 53 | return img_cnts, save_dir 54 | 55 | 56 | def download_imgs(img_urls, starti, save_dir): 57 | '''下载单个页面内所有图片 58 | 59 | Args: 60 | img_urls: 关键词相关表情包某一分页的所有图片链接 61 | starti: 当前页面首个图片命名id 62 | save_dir: 图片存储路径 63 | ''' 64 | fid = starti 65 | for img in img_urls: 66 | print('\t' + img) 67 | fid = fid + 1 68 | file_name = '{}.{}'.format(fid, os.path.basename(img).split('.')[-1]) 69 | save_path = os.path.join(save_dir, file_name) 70 | 71 | try: 72 | with open(save_path, 'wb') as f: 73 | f.write(requests.get(img, headers=headers, timeout=20).content) 74 | except requests.exceptions.ConnectionError as ce: 75 | print(ce.strerror()) 76 | except requests.exceptions.MissingSchema: 77 | print(img + ' missing schema') 78 | except requests.exceptions.ReadTimeout: 79 | print('get {} timeout, skip this item.'.format(img)) 80 | finally: 81 | pass 82 | 83 | return fid 84 | 85 | 86 | def usage(): 87 | print('Usage:\n\t' + os.path.basename(sys.argv[0]) + 88 | ' [key1] [key2] [key3] ...\n') 89 | 90 | 91 | def main(): 92 | if len(sys.argv) < 2: 93 | usage() 94 | sys.exit(0) 95 | 96 | print('============================================') 97 | for keyword in sys.argv[1:]: 98 | print('开始爬取关键词为 "{}" 的表情包:'.format(keyword)) 99 | count, save_dir = get_imgs(keyword) 100 | print('共爬取 "{}" 表情包 {} 个'.format(keyword, count)) 101 | print('文件存储于"{}"'.format(save_dir)) 102 | print('\n爬取完成!') 103 | print('============================================') 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for vmgirls project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | import os 13 | BOT_NAME = 'vmgirls' 14 | 15 | SPIDER_MODULES = ['vmgirls.spiders'] 16 | NEWSPIDER_MODULE = 'vmgirls.spiders' 17 | 18 | USER_DIR = os.path.expanduser('~') 19 | USER_DATA_DIR = os.path.join(USER_DIR, 'Pictures/python/vmgirls') 20 | IMAGES_STORE = USER_DATA_DIR 21 | 22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 23 | #USER_AGENT = 'vmgirls (+http://www.yourdomain.com)' 24 | 25 | # Obey robots.txt rules 26 | ROBOTSTXT_OBEY = False 27 | 28 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 29 | #CONCURRENT_REQUESTS = 32 30 | 31 | # Configure a delay for requests for the same website (default: 0) 32 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 33 | # See also autothrottle settings and docs 34 | #DOWNLOAD_DELAY = 3 35 | # The download delay setting will honor only one of: 36 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 37 | #CONCURRENT_REQUESTS_PER_IP = 16 38 | 39 | # Disable cookies (enabled by default) 40 | #COOKIES_ENABLED = False 41 | 42 | # Disable Telnet Console (enabled by default) 43 | #TELNETCONSOLE_ENABLED = False 44 | 45 | # Override the default request headers: 46 | # DEFAULT_REQUEST_HEADERS = { 47 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 48 | # 'Accept-Language': 'en', 49 | # } 50 | 51 | # Enable or disable spider middlewares 52 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 53 | # SPIDER_MIDDLEWARES = { 54 | # 'vmgirls.middlewares.VmgirlsSpiderMiddleware': 543, 55 | # } 56 | 57 | # Enable or disable downloader middlewares 58 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 59 | # DOWNLOADER_MIDDLEWARES = { 60 | # 'vmgirls.middlewares.VmgirlsDownloaderMiddleware': 543, 61 | # } 62 | 63 | # Enable or disable extensions 64 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 65 | # EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | # } 68 | 69 | # Configure item pipelines 70 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 71 | ITEM_PIPELINES = { 72 | 'vmgirls.pipelines.VmgirlsPipeline': 300, 73 | 'vmgirls.pipelines.VmgirlsImagesPipeline': 400 74 | } 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 78 | #AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | #HTTPCACHE_ENABLED = True 92 | #HTTPCACHE_EXPIRATION_SECS = 0 93 | #HTTPCACHE_DIR = 'httpcache' 94 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | 
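(Sketch, not a file from this repository: the `vmgirl` spider configured by the settings above is normally started through `cmdline.execute` in `vmgirls/run.py`; as an alternative it can also be launched in-process with Scrapy's `CrawlerProcess`. This assumes the script is run from the `vmgirls/` project directory, next to `scrapy.cfg`, so that `get_project_settings()` can locate `vmgirls/settings.py`.)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Sketch only, not part of the repository: start the vmgirl spider in-process.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def run():
    # Load vmgirls/settings.py (USER_DATA_DIR, IMAGES_STORE, ITEM_PIPELINES, ...)
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    # 'vmgirl' is the spider name declared in vmgirls/spiders/vmgirl.py
    process.crawl('vmgirl')
    # Blocks until the crawl has finished
    process.start()


if __name__ == '__main__':
    run()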
-------------------------------------------------------------------------------- /huaban/huaban/spiders/UserBoardsSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider 3 | from scrapy.http import Request 4 | from scrapy.http import Headers 5 | from scrapy.utils.project import get_project_settings 6 | 7 | import re 8 | 9 | import json 10 | 11 | from huaban.items import BoardItem 12 | from huaban.items import PinItem 13 | 14 | 15 | class UserboardsspiderSpider(Spider): 16 | name = 'UserBoardSpider' 17 | allowed_domains = ['huaban.com'] 18 | 19 | def __init__(self): 20 | settings = get_project_settings() 21 | self.username = settings.get('USERNAME') 22 | self.hostname = 'http://huaban.com' 23 | self.start_urls = ['{0}/{1}/'.format(self.hostname, self.username)] 24 | 25 | def parse(self, response): 26 | '''Get boards info from home page of user''' 27 | # Enable below comments need disable huabanDownloaderMiddleware 28 | # data = response.xpath('body/script[1]').extract_first() 29 | # user_page = re.search('app\.page\["user"\]\s*=\s*({.*});', data)[1] 30 | # info = json.loads(user_page, encoding='utf-8') 31 | # boards = info.get('boards') 32 | 33 | info = json.loads(response.text, encoding='utf-8') 34 | boards = info['user'].get('boards') 35 | 36 | if not boards: 37 | return 38 | 39 | # Get BoardItem and capture all boards 40 | for board in boards: 41 | item = BoardItem() 42 | item['title'] = board['title'] 43 | item['board_id'] = board['board_id'] 44 | item['category_id'] = board['category_id'] 45 | item['pin_count'] = board['pin_count'] 46 | item['follow_count'] = board['follow_count'] 47 | item['like_count'] = board['like_count'] 48 | yield item 49 | 50 | board_url = '{0}/boards/{1}'.format(self.hostname, 51 | board['board_id']) 52 | yield Request(board_url, meta={'board_title': board['title']}, callback=self.parse_pins) 53 | 54 | # Get more boards info 55 | # Request parameters: 56 | # max: the last board_id get from boards 57 | # limit: default 10, it's the limit number of boards, can be modified 58 | board_req = '{0}/{1}/?jg0gcj0&max={2}&limit={3}&wfl=1'.format(self.hostname, 59 | self.username, boards[-1]['board_id'], 10) 60 | yield Request(board_req, callback=self.parse) 61 | 62 | def parse_pins(self, response): 63 | board_data = json.loads(response.text, encoding='utf-8') 64 | pins = board_data['board'].get('pins') 65 | board_title = response.meta['board_title'] 66 | 67 | if not pins: 68 | return 69 | 70 | for pin in pins: 71 | item = PinItem() 72 | item['pin_id'] = pin['pin_id'] 73 | item['board_id'] = pin['board_id'] 74 | item['board_title'] = board_title 75 | item['file_id'] = pin['file_id'] 76 | item['file_key'] = pin['file']['key'] 77 | item['source'] = pin['source'] 78 | item['tags'] = pin['tags'] 79 | yield item 80 | 81 | # Get more pins info 82 | # Request parameters: 83 | # max: the last pin_id get from pins 84 | # limit: default 20, it's the limit number of pins, can be modified 85 | pin_req = '{0}/boards/{1}/?jg6nr2rm&max={2}&limit={3}&wfl=1'.format( 86 | self.hostname, pins[-1]['board_id'], pins[-1]['pin_id'], 20) 87 | yield Request(pin_req, meta={'board_title': board_title}, callback=self.parse_pins) 88 | -------------------------------------------------------------------------------- /netbian/netbian_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import 
requests 6 | from lxml import html 7 | from multiprocessing import Pool, cpu_count 8 | 9 | 10 | class NetbianSpider(object): 11 | def __init__(self): 12 | self.index = 'http://pic.netbian.com' 13 | self.headers = { 14 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' 15 | } 16 | 17 | def get_path(self, name): 18 | home_path = os.path.expanduser('~') 19 | path = os.path.join(home_path, 'Pictures/python/netbian/' + name) 20 | if not os.path.isdir(path): 21 | os.makedirs(path) 22 | 23 | return os.path.realpath(path) 24 | 25 | def get_categories(self): 26 | '''get categories of website''' 27 | res = requests.get(self.index, headers=self.headers) 28 | doc = html.fromstring(res.content) 29 | categories = doc.xpath('//div[contains(@class, "classify")]/a') 30 | 31 | for category in categories: 32 | name = category.xpath('text()')[0] 33 | url = category.xpath('@href')[0] 34 | yield name, url 35 | 36 | def spider_by_category(self, category, url): 37 | '''Process function which use to capture images base on category''' 38 | path_category = self.get_path(category) 39 | detail_pages, page_cnt = self.parse_thumb_page(url, first_page=True) 40 | 41 | img_cnt = 0 42 | page_num = 1 43 | while True: 44 | for page in detail_pages: 45 | img_cnt += 1 46 | 47 | print('[{} page-{} img-{}] Parsing page {}'.format( 48 | category, page_num, img_cnt, page)) 49 | img_url = self.parse_detail_page(page) 50 | self.download_image(img_url, path_category) 51 | 52 | page_num += 1 53 | if page_num > page_cnt: 54 | break 55 | detail_pages = self.parse_thumb_page( 56 | '{}index_{}.html'.format(url, page_num)) 57 | 58 | def parse_thumb_page(self, url, first_page=False): 59 | '''parse thumbnail page and get all the detail pages url''' 60 | res = requests.get(self.index + url, headers=self.headers) 61 | doc = html.fromstring(res.content) 62 | detail_pages = doc.xpath('//div[@class="slist"]//a/@href') 63 | 64 | if first_page: 65 | page_cnt = doc.xpath( 66 | '//span[@class="slh"]/following-sibling::a[1]/text()')[0] 67 | return detail_pages, int(page_cnt) 68 | else: 69 | return detail_pages 70 | 71 | def parse_detail_page(self, url): 72 | '''parse detail page and get source image url''' 73 | res = requests.get(self.index + url, headers=self.headers) 74 | doc = html.fromstring(res.content) 75 | img_url = doc.xpath('//*[@id="img"]/img/@src')[0] 76 | 77 | return img_url 78 | 79 | def download_image(self, url, path): 80 | img_name = url.split('/')[-1] 81 | save_path = os.path.join(path, img_name) 82 | 83 | res = requests.get(self.index + url, headers=self.headers, timeout=20) 84 | if res.status_code == 200: 85 | with open(save_path, 'wb') as f: 86 | f.write(res.content) 87 | 88 | 89 | def main(): 90 | spider = NetbianSpider() 91 | categories = spider.get_categories() 92 | 93 | p = Pool(cpu_count()) 94 | for name, url in categories: 95 | p.apply_async(spider.spider_by_category, args=(name, url)) 96 | 97 | p.close() 98 | p.join() 99 | print('All Done!') 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class 
VmgirlsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class VmgirlsDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /huaban/huaban/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class HuabanSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class HuabanDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | request.headers['Accept'] = 'application/json' 76 | request.headers['X-Request'] = 'JSON' 77 | request.headers['X-Requested-With'] = 'XMLHttpRequest' 78 | 79 | # Must either: 80 | # - return None: continue processing this request 81 | # - or return a Response object 82 | # - or return a Request object 83 | # - or raise IgnoreRequest: process_exception() methods of 84 | # installed downloader middleware will be called 85 | return None 86 | 87 | def process_response(self, request, response, spider): 88 | # Called with the response returned from the downloader. 89 | 90 | # Must either; 91 | # - return a Response object 92 | # - return a Request object 93 | # - or raise IgnoreRequest 94 | return response 95 | 96 | def process_exception(self, request, exception, spider): 97 | # Called when a download handler or a process_request() 98 | # (from other downloader middleware) raises an exception. 99 | 100 | # Must either: 101 | # - return None: continue processing this exception 102 | # - return a Response object: stops process_exception() chain 103 | # - return a Request object: stops process_exception() chain 104 | pass 105 | 106 | def spider_opened(self, spider): 107 | spider.logger.info('Spider opened: %s' % spider.name) 108 | -------------------------------------------------------------------------------- /huaban/huaban/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.exporters import JsonItemExporter 9 | from scrapy.exporters import JsonLinesItemExporter 10 | 11 | from scrapy.pipelines.images import ImagesPipeline 12 | from scrapy.http import Request 13 | from scrapy.exceptions import DropItem 14 | 15 | from huaban.items import BoardItem 16 | from huaban.items import PinItem 17 | 18 | import os 19 | from os.path import join, basename 20 | 21 | from urllib.parse import urlparse 22 | 23 | from PIL import Image 24 | 25 | try: 26 | from cStringIO import StringIO as BytesIO 27 | except ImportError: 28 | from io import BytesIO 29 | 30 | 31 | class HuabanPipeline(object): 32 | def __init__(self, user_data_dir): 33 | '''Open file to save the exported Items''' 34 | self.user_data_dir = user_data_dir 35 | 36 | if not os.path.isdir(self.user_data_dir): 37 | os.makedirs(self.user_data_dir) 38 | 39 | # save info of BoardItem 40 | self.board_info = open(self.user_data_dir + 'boards.json', 'w+b') 41 | self.board_exporter = JsonItemExporter( 42 | self.board_info, encoding='utf-8', indent=4) 43 | 44 | # save info of PinItem 45 | self.pin_info = open(self.user_data_dir + 'pins.json', 'w+b') 46 | self.pin_exporter = JsonLinesItemExporter( 47 | self.pin_info, encoding='utf-8', indent=4) 48 | 49 | @classmethod 50 | def from_crawler(cls, crawler): 51 | '''get some global settings from settings.py''' 52 | settings = crawler.settings 53 | return cls(settings.get('USER_DATA_DIR')) 54 | 55 | def open_spider(self, spider): 56 | '''Start exporting BoardItem''' 57 | self.board_exporter.start_exporting() 58 | self.pin_exporter.start_exporting() 59 | 60 | def process_item(self, item, spider): 61 | if isinstance(item, BoardItem): 62 | self.board_exporter.export_item(item) 63 | elif isinstance(item, PinItem): 64 | self.pin_exporter.export_item(item) 65 | 66 | return item 67 | 68 | def close_spider(self, spider): 69 | '''finish exporting 
and close files''' 70 | self.board_exporter.finish_exporting() 71 | self.pin_exporter.finish_exporting() 72 | self.board_info.close() 73 | self.pin_info.close() 74 | 75 | 76 | class HuabanImagesPipeline(ImagesPipeline): 77 | '''Implement image downloader by inherit class ImagesPipeline''' 78 | 79 | def get_media_requests(self, item, info): 80 | if isinstance(item, PinItem): 81 | image_url = 'http://img.hb.aicdn.com/' + item['file_key'] 82 | yield Request(image_url, meta={'item': item}) 83 | 84 | def file_path(self, request, response=None, info=None): 85 | url_path = urlparse(request.url).path 86 | item = request.meta['item'] 87 | board_title = item['board_title'] 88 | # file path: IMAGE_STORE/images/[BOARD_TITLE]/[URL_PATH].jpg 89 | return join('images', board_title.replace(':', '-'), basename(url_path)) 90 | 91 | def check_gif(self, image): 92 | if image.format == 'GIF': 93 | return True 94 | else: 95 | return image.info.get('version') in ['GIF89a', 'GIF87a'] 96 | 97 | def get_images(self, response, request, info): 98 | path = self.file_path(request, response=response, info=info) 99 | orig_image = Image.open(BytesIO(response.body)) 100 | 101 | if self.check_gif(orig_image): 102 | path += '.gif' 103 | abs_path = self.store._get_filesystem_path(path) 104 | self.store._mkdir(os.path.dirname(abs_path), info) 105 | 106 | # save gif image from reponse 107 | with open(abs_path, 'wb') as f: 108 | f.write(response.body) 109 | return None 110 | else: 111 | path += '.jpg' 112 | image, buf = self.convert_image(orig_image) 113 | 114 | yield path, image, buf 115 | 116 | def item_completed(self, results, item, info): 117 | image_paths = [x['path'] for ok, x in results if ok] 118 | if not image_paths: 119 | raise DropItem("Item contains no images") 120 | return item 121 | -------------------------------------------------------------------------------- /huaban/huaban/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for huaban project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | import time 13 | import os 14 | 15 | BOT_NAME = 'huaban' 16 | 17 | SPIDER_MODULES = ['huaban.spiders'] 18 | NEWSPIDER_MODULE = 'huaban.spiders' 19 | 20 | # Custom settings 21 | # USERNAME = 'meirijingxuan' 22 | # USERNAME = 'dsk1985' 23 | USERNAME = 'litreily' 24 | ROOT_DIR = '{0}/Pictures/python/huaban/'.format(os.path.expanduser('~')) 25 | USER_DIR = ROOT_DIR + USERNAME 26 | USER_DATA_DIR = USER_DIR + '/json/' 27 | 28 | # Log 29 | SAVE_LOG = False 30 | if SAVE_LOG: 31 | LOG_DIR = ROOT_DIR + '.log' 32 | LOG_ENCODING = 'utf-8' 33 | 34 | if not os.path.isdir(LOG_DIR): 35 | os.makedirs(LOG_DIR) 36 | 37 | localtime = time.strftime('%Y%m%d_%H%M%S', time.localtime()) 38 | LOG_FILE = '{0}/{1}_{2}.log'.format(LOG_DIR, USERNAME, localtime) 39 | 40 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 41 | USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' 42 | ' (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36') 43 | 44 | # Obey robots.txt rules 45 | ROBOTSTXT_OBEY = True 46 | 47 | # Configure a item exporter to save items 48 | # FEED_FORMAT = 'json' 49 | # FEED_URI = 'file:///D:/litreily/Pictures/python/huaban/boards.json' 50 | # FEED_EXPORTERS_BASE = { 51 | # 'json': 'scrapy.exporters.JsonItemExporter', 52 | # 'jsonlines': 'scrapy.exporters.JsonLinesItemExporter', 53 | # } 54 | 55 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 56 | #CONCURRENT_REQUESTS = 32 57 | 58 | # Configure a delay for requests for the same website (default: 0) 59 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 60 | # See also autothrottle settings and docs 61 | #DOWNLOAD_DELAY = 3 62 | # The download delay setting will honor only one of: 63 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 64 | #CONCURRENT_REQUESTS_PER_IP = 16 65 | 66 | # Disable cookies (enabled by default) 67 | #COOKIES_ENABLED = False 68 | 69 | # Disable Telnet Console (enabled by default) 70 | #TELNETCONSOLE_ENABLED = False 71 | 72 | # Override the default request headers: 73 | # DEFAULT_REQUEST_HEADERS = { 74 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 75 | # 'Accept-Language': 'en', 76 | # } 77 | 78 | # Enable or disable spider middlewares 79 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 80 | # SPIDER_MIDDLEWARES = { 81 | # 'huaban.middlewares.HuabanSpiderMiddleware': 543, 82 | # } 83 | 84 | # Enable or disable downloader middlewares 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 86 | DOWNLOADER_MIDDLEWARES = { 87 | 'huaban.middlewares.HuabanDownloaderMiddleware': 543, 88 | } 89 | 90 | # Enable or disable extensions 91 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 92 | # EXTENSIONS = { 93 | # 'scrapy.extensions.telnet.TelnetConsole': None, 94 | # } 95 | 96 | # Configure item pipelines 97 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 98 | ITEM_PIPELINES = { 99 | 'huaban.pipelines.HuabanPipeline': 300, 100 | 'huaban.pipelines.HuabanImagesPipeline': 400, 101 | } 102 | IMAGES_STORE = USER_DIR 103 | 104 | # Enable and configure the AutoThrottle extension (disabled by default) 105 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 106 
| #AUTOTHROTTLE_ENABLED = True 107 | # The initial download delay 108 | #AUTOTHROTTLE_START_DELAY = 5 109 | # The maximum download delay to be set in case of high latencies 110 | #AUTOTHROTTLE_MAX_DELAY = 60 111 | # The average number of requests Scrapy should be sending in parallel to 112 | # each remote server 113 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 114 | # Enable showing throttling stats for every response received: 115 | #AUTOTHROTTLE_DEBUG = False 116 | 117 | # Enable and configure HTTP caching (disabled by default) 118 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 119 | #HTTPCACHE_ENABLED = True 120 | #HTTPCACHE_EXPIRATION_SECS = 0 121 | #HTTPCACHE_DIR = 'httpcache' 122 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 123 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 124 | -------------------------------------------------------------------------------- /sina/sina_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # author: litreily 4 | # date: 2018.02.05 5 | """Capture pictures from sina-weibo with user_id.""" 6 | 7 | import re 8 | import os 9 | 10 | import socket 11 | import urllib.request 12 | 13 | from bs4 import BeautifulSoup 14 | 15 | 16 | def _get_path(uid): 17 | home_path = os.path.expanduser('~') 18 | path = os.path.join(home_path, 'Pictures/python/sina', uid) 19 | if not os.path.isdir(path): 20 | os.makedirs(path) 21 | return path 22 | 23 | 24 | def _get_html(url, headers): 25 | try: 26 | req = urllib.request.Request(url, headers=headers) 27 | page = urllib.request.urlopen(req) 28 | html = page.read().decode('UTF-8') 29 | except Exception as e: 30 | print("get %s failed" % url) 31 | return None 32 | return html 33 | 34 | 35 | def _capture_images(uid, headers, path): 36 | filter_mode = 1 # 0-all 1-original 2-pictures 37 | num_pages = 1 38 | num_blogs = 0 39 | num_imgs = 0 40 | 41 | # regular expression of imgList and img 42 | imglist_reg = r'href="(https://weibo.cn/mblog/picAll/.{9}\?rl=2)"' 43 | imglist_pattern = re.compile(imglist_reg) 44 | img_reg = r'src="(http://w.{2}\.sinaimg.cn/(.{6,8})/.{32,33}.(jpg|gif))"' 45 | img_pattern = re.compile(img_reg) 46 | 47 | print('start capture picture of uid:' + uid) 48 | while True: 49 | url = 'https://weibo.cn/%s/profile?filter=%s&page=%d' % ( 50 | uid, filter_mode, num_pages) 51 | 52 | # 1. get html of each page url 53 | html = _get_html(url, headers) 54 | if html == None: 55 | print('\nPlease check your user id or cookies in sina_spider.py!\n') 56 | os.removedirs(path) 57 | break 58 | 59 | # 2. parse the html and find all the imgList Url of each page 60 | soup = BeautifulSoup(html, "lxml") 61 | #
62 | blogs = soup.body.find_all( 63 | attrs={'id': re.compile(r'^M_')}, recursive=False) 64 | num_blogs += len(blogs) 65 | 66 | if num_pages == 1: 67 | # get number of pages 68 | max_pages = soup.find('input', attrs={'name': 'mp'}).attrs['value'] 69 | 70 | imgurls = [] 71 | for blog in blogs: 72 | blog = str(blog) 73 | imglist_url = imglist_pattern.findall(blog) 74 | if not imglist_url: 75 | # 2.1 get img-url from blog that have only one pic 76 | imgurls += img_pattern.findall(blog) 77 | else: 78 | # 2.2 get img-urls from blog that have group pics 79 | html = _get_html(imglist_url[0], headers) 80 | imgurls += img_pattern.findall(html) 81 | 82 | if num_pages > int(max_pages): 83 | print('capture complete!') 84 | print('captured pages:%d, blogs:%d, imgs:%d' % 85 | (num_pages - 1, num_blogs, num_imgs)) 86 | print('directory:' + path) 87 | break 88 | 89 | # 3. download all the imgs from each imgList 90 | print('PAGE {}/{} with {} images'.format(num_pages, max_pages, len(imgurls))) 91 | for img in imgurls: 92 | imgurl = img[0].replace(img[1], 'large') 93 | num_imgs += 1 94 | count = 1 95 | try: 96 | urllib.request.urlretrieve( 97 | imgurl, '{}/{}.{}'.format(path, num_imgs, img[2])) 98 | except socket.timeout: 99 | while count <= 3: 100 | try: 101 | urllib.request.urlretrieve( 102 | imgurl, '{}/{}.{}'.format(path, num_imgs, img[2])) 103 | break 104 | except socket.timeout: 105 | count += 1 106 | finally: 107 | # display the raw url of images 108 | print('\t%d\t%s' % (num_imgs, imgurl)) 109 | if count > 3: 110 | print('\t%d\t%s failed' % (num_imgs, imgurl)) 111 | pass 112 | num_pages += 1 113 | print('') 114 | 115 | 116 | def main(): 117 | uid = input('please input user id (e.g. 1969308311) :') 118 | path = _get_path(uid) 119 | socket.setdefaulttimeout(20) 120 | 121 | # cookie is form the above url->network->request headers 122 | cookies = '' 123 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', 124 | 'Cookie': cookies} 125 | 126 | # capture imgs from sina 127 | _capture_images(uid, headers, path) 128 | 129 | 130 | if __name__ == '__main__': 131 | main() 132 | -------------------------------------------------------------------------------- /toutiao/toutiao_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Description: Get pisture collections from www.toutiao.com 4 | Tip: This script comes from https://github.com/smslit/spider-collection/tree/master/ttpic 5 | Link: https://www.smslit.top/2018/06/21/spider-practice-pic-dog/ 6 | """ 7 | __author__ = '5km(smslit)' 8 | __date__ = '20180627' 9 | 10 | import os 11 | import requests 12 | from hashlib import md5 13 | from functools import partial 14 | from urllib.parse import urlencode 15 | from multiprocessing.pool import Pool 16 | 17 | PAGE_NUM = 5 18 | 19 | HEADERS = { 20 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15', 21 | 'X-Requested-With': 'XMLHttpRequest' 22 | } 23 | 24 | 25 | def get_page_json(offset, keyword): 26 | '''生成ajax请求,发出get请求,并获取响应结果,以字典的形式返回json数据 27 | :param offset: 页面请求偏移值 28 | :type offset: 能被20整除的整数 29 | :param keyword: 图片搜索的关键词 30 | :type keyword: unicode字符串 31 | :return: 响应数据 32 | :rtype: dict 33 | ''' 34 | params = { 35 | 'aid': 24, 36 | 'app_name': 'web_search', 37 | 'offset': offset, 38 | 'format': 'json', 39 | 'keyword': keyword, 40 | 'autoload': 'true', 41 | 
'count': '20', 42 | 'cur_tab': '3', 43 | 'from': 'gallery' 44 | } 45 | 46 | url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params) 47 | try: 48 | response = requests.get(url, headers=HEADERS) 49 | if response.status_code == 200: 50 | return response.json() 51 | except requests.ConnectionError as e: 52 | print('Error', e) 53 | 54 | 55 | def parse_images_url(json): 56 | '''解析json字典,获得对应条目标题和图片链接 57 | :param json: 页面请求响应结果对应的字典数据 58 | :type json: dict 59 | ''' 60 | data = json['data'] 61 | if data: 62 | for item in data: 63 | try: 64 | title = item['title'] 65 | images = item['image_list'] 66 | except KeyError: 67 | continue 68 | if images: 69 | for image in images: 70 | yield { 71 | 'image': image['url'].replace('list', 'origin').replace('190x124/', ''), 72 | 'title': title 73 | } 74 | 75 | 76 | def save_image_from(image_info, to_dir=''): 77 | '''根据链接获取图片,保存到标题命名的目录中,图片以md5码命名 78 | :param image_info: 包含标题和链接信息的字典数据 79 | :type image_info: dict 80 | :param to_dir: 要保存到的目录,默认值是空字符串,可指定目录,格式如: '狗狗' 81 | :type to_dir: unicode 字符串 82 | ''' 83 | if image_info: 84 | print(image_info) 85 | image_dir = to_dir + '/' + image_info['title'] 86 | if not os.path.exists(image_dir): 87 | os.makedirs(image_dir) 88 | try: 89 | response = requests.get(image_info['image']) 90 | if response.status_code == 200: 91 | image_path = '{0}/{1}.{2}'.format(image_dir, 92 | md5(response.content).hexdigest(), 'jpg') 93 | print(image_path) 94 | if not os.path.exists(image_path): 95 | with open(image_path, 'wb') as f: 96 | f.write(response.content) 97 | print('图片下载完成!') 98 | else: 99 | print('图片已下载 -> ', image_path) 100 | except requests.ConnectionError as e: 101 | print('图片下载失败!') 102 | except: 103 | print('出现异常!') 104 | 105 | 106 | def get_images_of(offset, keyword): 107 | '''主函数,用于多进程调度 108 | :param offset: 页面链接offset参数的值 109 | :type offset: 能被20整除的整数 110 | :param keyword: 图片搜索关键词 111 | :type keyword: unicode字符串 112 | ''' 113 | print('获取第', offset + 1, '~', offset + 20, '个条目...') 114 | json = get_page_json(offset, keyword) 115 | for image in parse_images_url(json): 116 | path = os.path.join(os.path.expanduser( 117 | '~'), 'Pictures/python/toutiao', keyword) 118 | save_image_from(image, path) 119 | 120 | 121 | def main(): 122 | print('\nWelcome here to get pictures from www.toutiao.com!') 123 | keyword = input('Please input your search keywords > ') 124 | count = None 125 | while count == None: 126 | number_str = input( 127 | 'Please input count of picture collection that you want(Divisible by 20 ) > ') 128 | try: 129 | count = int(number_str) 130 | except ValueError: 131 | print('Please input a valid number!') 132 | 133 | if count > 0: 134 | print('Getting %s pictures...' 
% keyword) 135 | page_num = count // 20 + (0 if count % 20 == 0 else 1) 136 | offset_list = [x * 20 for x in range(0, page_num)] 137 | pool = Pool() 138 | partial_getter = partial(get_images_of, keyword=keyword) 139 | pool.map(partial_getter, offset_list) 140 | pool.close() 141 | pool.join() 142 | else: 143 | print('Get Cancel!') 144 | 145 | 146 | if __name__ == '__main__': 147 | main() 148 | -------------------------------------------------------------------------------- /lofter/lofter_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # author: litreily 4 | # date: 2018.03.07 5 | """Capture pictures from lofter with username.""" 6 | 7 | import re 8 | import os 9 | import sys 10 | 11 | import requests 12 | 13 | import time 14 | import random 15 | 16 | 17 | def _get_path(uid): 18 | home_path = os.path.expanduser('~') 19 | path = os.path.join(home_path, 'Pictures/python/lofter', uid) 20 | if not os.path.isdir(path): 21 | os.makedirs(path) 22 | return path 23 | 24 | 25 | def _get_html(url, data, headers): 26 | try: 27 | html = requests.post(url, data, headers=headers) 28 | except Exception as e: 29 | print("get %s failed\n%s" % (url, str(e))) 30 | return None 31 | finally: 32 | pass 33 | return html 34 | 35 | 36 | def _get_blogid(username): 37 | try: 38 | html = requests.get('http://%s.lofter.com' % username) 39 | id_reg = r'src="//www.lofter.com/control\?blogId=(.*)"' 40 | blogid = re.search(id_reg, html.text).group(1) 41 | print('The blogid of %s is: %s' % (username, blogid)) 42 | return blogid 43 | except Exception as e: 44 | print('get blogid from http://%s.lofter.com failed' % username) 45 | print('please check your username.') 46 | exit(1) 47 | 48 | 49 | def _get_timestamp(html, time_pattern): 50 | if not html: 51 | timestamp = round(time.time() * 1000) # first timestamp(ms) 52 | else: 53 | timestamp = time_pattern.search(html).group(1) 54 | return str(timestamp) 55 | 56 | 57 | def _get_imgurls(username, blog, headers): 58 | blog_url = 'http://%s.lofter.com/post/%s' % (username, blog) 59 | blog_html = requests.get(blog_url, headers=headers).text 60 | imgurls = re.findall(r'bigimgsrc="(.*?)"', blog_html) 61 | print('Blog\t%s\twith %d\tpictures' % (blog_url, len(imgurls))) 62 | return imgurls 63 | 64 | 65 | def _capture_images(imgurl, path): 66 | headers = { 67 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'} 68 | for i in range(1, 3): 69 | try: 70 | image_request = requests.get(imgurl, headers=headers, timeout=20) 71 | if image_request.status_code == 200: 72 | open(path, 'wb').write(image_request.content) 73 | break 74 | except requests.exceptions.ConnectionError as e: 75 | print('\tGet %s failed\n\terror:%s' % (imgurl, e)) 76 | if i == 1: 77 | imgurl = re.sub(r'^http://img.*?\.', 'http://img.', imgurl) 78 | print('\tRetry ' + imgurl) 79 | else: 80 | print('\tRetry fail') 81 | except Exception as e: 82 | print(e) 83 | finally: 84 | pass 85 | 86 | 87 | def _create_query_data(blogid, timestamp, query_number): 88 | data = {'callCount': '1', 89 | 'scriptSessionId': '${scriptSessionId}187', 90 | 'httpSessionId': '', 91 | 'c0-scriptName': 'ArchiveBean', 92 | 'c0-methodName': 'getArchivePostByTime', 93 | 'c0-id': '0', 94 | 'c0-param0': 'boolean:false', 95 | 'c0-param1': 'number:' + blogid, 96 | 'c0-param2': 'number:' + timestamp, 97 | 'c0-param3': 'number:' + query_number, 98 | 'c0-param4': 'boolean:false', 99 | 
100 |     return data
101 | 
102 | 
103 | def main(argv):
104 |     # prepare parameters
105 |     if len(argv) < 2:
106 |         print('Usage: %s username' % os.path.basename(argv[0]))
107 |         sys.exit(1)
108 |     username = argv[1]
109 |     blogid = _get_blogid(username)
110 |     query_number = 40
111 |     time_pattern = re.compile(r's%d\.time=(.*);s.*type' % (query_number-1))
112 |     blog_url_pattern = re.compile(r's[\d]*\.permalink="([\w_]*)"')
113 | 
114 |     # create path to save images
115 |     path = _get_path(username)
116 | 
117 |     # parameters of post packet
118 |     url = 'http://%s.lofter.com/dwr/call/plaincall/ArchiveBean.getArchivePostByTime.dwr' % username
119 |     data = _create_query_data(blogid, _get_timestamp(
120 |         None, time_pattern), str(query_number))
121 |     headers = {
122 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
123 |         'Host': username + '.lofter.com',
124 |         'Referer': 'http://%s.lofter.com/view' % username,
125 |         'Accept-Encoding': 'gzip, deflate'
126 |     }
127 | 
128 |     num_blogs = 0
129 |     num_imgs = 0
130 |     index_img = 0
131 |     print('------------------------------- start line ------------------------------')
132 |     while True:
133 |         html = _get_html(url, data, headers).text
134 |         # get urls of blogs: s3.permalink="44fbca_19a6b1b"
135 |         new_blogs = blog_url_pattern.findall(html)
136 |         num_new_blogs = len(new_blogs)
137 |         num_blogs += num_new_blogs
138 | 
139 |         if num_new_blogs != 0:
140 |             print('NewBlogs:%d\tTotalBlogs:%d' % (num_new_blogs, num_blogs))
141 |             # get imgurls from new_blogs
142 |             imgurls = []
143 |             for blog in new_blogs:
144 |                 imgurls.extend(_get_imgurls(username, blog, headers))
145 |             num_imgs += len(imgurls)
146 | 
147 |             # download imgs
148 |             for imgurl in imgurls:
149 |                 index_img += 1
150 |                 paths = '%s/%d.%s' % (path, index_img,
151 |                                       re.search(r'(jpg|png|gif)', imgurl).group(0))
152 |                 print('{}\t{}'.format(index_img, paths))
153 |                 _capture_images(imgurl, paths)
154 | 
155 |         if num_new_blogs != query_number:
156 |             print(
157 |                 '------------------------------- stop line -------------------------------')
158 |             print('Capture complete!')
159 |             print('Captured blogs:%d images:%d' % (num_blogs, num_imgs))
160 |             print('Download path: ' + path)
161 |             print(
162 |                 '-------------------------------------------------------------------------')
163 |             break
164 | 
165 |         data['c0-param2'] = 'number:' + _get_timestamp(html, time_pattern)
166 |         print('The next timestamp is: %s\n' % data['c0-param2'].split(':')[1])
167 |         # wait a few seconds
168 |         time.sleep(random.randint(5, 10))
169 | 
170 | 
171 | if __name__ == '__main__':
172 |     main(sys.argv)
173 | 
--------------------------------------------------------------------------------
/qqzone/qqzone_spider.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | """A crawler that downloads all photos from a user's Qzone (QQ空间) albums."""
4 | 
5 | from selenium import webdriver
6 | from selenium.webdriver.common.keys import Keys
7 | from selenium.common.exceptions import WebDriverException
8 | 
9 | import os
10 | import re
11 | import sys
12 | import time
13 | import logging
14 | import requests
15 | from json import loads
16 | 
17 | 
18 | class qqzone(object):
19 |     """Qzone album crawler."""
20 | 
21 |     def __init__(self, user):
22 |         self.username = user['username']
23 |         self.password = user['password']
24 | 
25 |     @staticmethod
26 |     def get_path(album_name):
27 |         home_path = os.path.expanduser('~')
28 |         path = os.path.join(home_path, 'Pictures/python/qqzone', album_name)
29 |         if not os.path.isdir(path):
30 |             os.makedirs(path)
31 |         return path
32 | 
33 |     def _login_and_get_args(self):
34 |         """Log in to QQ and get the cookies and the g_tk token."""
35 |         opt = webdriver.ChromeOptions()
36 |         opt.add_argument('--headless')
37 | 
38 |         driver = webdriver.Chrome(options=opt)
39 |         driver.get('https://i.qq.com/')
40 | 
41 |         logging.info('User {} logging in...'.format(self.username))
42 |         driver.switch_to.frame('login_frame')
43 |         driver.find_element_by_id('switcher_plogin').click()
44 |         driver.find_element_by_id('u').clear()
45 |         driver.find_element_by_id('u').send_keys(self.username)
46 |         driver.find_element_by_id('p').clear()
47 |         driver.find_element_by_id('p').send_keys(self.password)
48 |         driver.find_element_by_id('login_button').click()
49 | 
50 |         time.sleep(1)
51 |         driver.get('https://user.qzone.qq.com/{}'.format(self.username))
52 | 
53 |         try:
54 |             logging.info('Getting g_tk...')
55 |             self.g_tk = driver.execute_script(
56 |                 'return QZONE.FP.getACSRFToken()')
57 |             logging.debug('g_tk: {}'.format(self.g_tk))
58 |         except WebDriverException:
59 |             logging.error(
60 |                 'Getting g_tk failed, please check your QQ number and password')
61 |             driver.close()
62 |             driver.quit()
63 |             sys.exit(1)
64 | 
65 |         logging.info('Getting Cookies...')
66 |         self.cookies = driver.get_cookies()
67 | 
68 |         driver.close()
69 |         driver.quit()
70 | 
71 |     def _init_session(self):
72 |         self.session = requests.Session()
73 |         for cookie in self.cookies:
74 |             self.session.cookies.set(cookie['name'], cookie['value'])
75 |         self.session.headers = {
76 |             'Referer': 'https://qzs.qq.com/qzone/photo/v7/page/photo.html?init=photo.v7/module/albumList/index&navBar=1',
77 |             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
78 |         }
79 | 
80 |     def _get_query_for_request(self, topicId=None, pageStart=0, pageNum=100):
81 |         """Build the query parameters for album-info or photo-info requests.
82 | 
83 |         Args:
84 |             topicId: unique identifier of an album
85 |             pageStart: start index used when requesting an album's photo list
86 |             pageNum: number of photos requested per call
87 | 
88 |         Returns:
89 |             A string joining all request parameters.
90 |         """
91 |         query = {
92 |             'g_tk': self.g_tk,
93 |             'hostUin': self.username,
94 |             'uin': self.username,
95 |             'appid': 4,
96 |             'inCharset': 'utf-8',
97 |             'outCharset': 'utf-8',
98 |             'source': 'qzone',
99 |             'plat': 'qzone',
100 |             'format': 'jsonp'
101 |         }
102 |         if topicId:
103 |             query['topicId'] = topicId
104 |             query['pageStart'] = pageStart
105 |             query['pageNum'] = pageNum
106 |         return '&'.join('{}={}'.format(key, val) for key, val in query.items())
107 | 
108 |     def _load_callback_data(self, resp):
109 |         """Parse the returned jsonp data as JSON."""
110 |         try:
111 |             resp.encoding = 'utf-8'
112 |             data = loads(re.search(r'.*?\(({.*}).*?\).*', resp.text, re.S)[1])
113 |             return data
114 |         except ValueError:
115 |             logging.error('Failed to parse the jsonp response')
116 | 
117 |     def _get_ablum_list(self):
118 |         """Get the list of albums."""
119 |         album_url = 'https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/fcg_list_album_v3?' + \
120 |             self._get_query_for_request()
121 | 
122 |         logging.info('Getting album list id...')
123 |         resp = self.session.get(album_url)
124 |         data = self._load_callback_data(resp)
125 | 
126 |         album_list = {}
127 |         for item in data['data']['albumListModeSort']:
128 |             album_list[item['name']] = item['id']
129 | 
130 |         return album_list
131 | 
132 |     def _get_photo(self, album_name, album_id):
133 |         """Get the photo list of one album and download all of its photos."""
134 |         photo_list_url = 'https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/cgi_list_photo?' + \
135 |             self._get_query_for_request(topicId=album_id)
136 | 
137 |         logging.info('Getting photo list for album {}...'.format(album_name))
138 |         resp = self.session.get(photo_list_url)
139 |         data = self._load_callback_data(resp)
140 |         if data['data']['totalInPage'] == 0:
141 |             return None
142 | 
143 |         file_dir = self.get_path(album_name)
144 |         for item in data['data']['photoList']:
145 |             path = '{}/{}.jpg'.format(file_dir, item['name'])
146 |             logging.info('Downloading {}-{}'.format(album_name, item['name']))
147 |             self._download_image(item['url'], path)
148 | 
149 |     def _download_image(self, url, path):
150 |         """Download a single photo."""
151 |         try:
152 |             resp = self.session.get(url, timeout=15)
153 |             if resp.status_code == 200:
154 |                 open(path, 'wb').write(resp.content)
155 |         except requests.exceptions.Timeout:
156 |             logging.warning('get {} timed out'.format(url))
157 |         except requests.exceptions.ConnectionError as e:
158 |             logging.error(str(e))
159 |         finally:
160 |             pass
161 | 
162 |     def start(self):
163 |         """Entry point of the crawler."""
164 |         self._login_and_get_args()
165 |         self._init_session()
166 |         album_list = self._get_ablum_list()
167 |         for name, album_id in album_list.items():
168 |             self._get_photo(name, album_id)
169 | 
170 | 
171 | def get_user():
172 |     """Get the QQ number and password from the terminal."""
173 |     username = input('please input QQ number: ').strip()
174 |     if not re.match(r'^[1-9][0-9]{4,9}$', username):
175 |         logging.error('\033[31mInvalid QQ number!\033[0m')
176 |         sys.exit(1)
177 | 
178 |     import getpass
179 |     password = getpass.getpass('password: ')
180 | 
181 |     return {
182 |         'username': username,
183 |         'password': password
184 |     }
185 | 
186 | 
187 | def main():
188 |     FORMAT = '%(asctime)s [%(levelname)s] %(message)s'
189 |     logging.basicConfig(format=FORMAT, level=logging.INFO)
190 | 
191 |     # default QQ account info
192 |     user = {
193 |         'username': '123456789',
194 |         'password': '*********'
195 |     }
196 | 
197 |     # pass -d to use the default account above; edit it to match your own
198 |     if not (len(sys.argv) > 1 and sys.argv[1] == '-d'):
199 |         user = get_user()
200 | 
201 |     qz = qqzone(user)
202 |     qz.start()
203 | 
204 | 
205 | if __name__ == '__main__':
206 |     main()
207 | 
--------------------------------------------------------------------------------
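
Note: the Qzone endpoints used above return jsonp rather than plain JSON, so qqzone_spider.py's _load_callback_data captures everything between the outermost parentheses with a regex before handing it to json.loads. The lines below are a minimal standalone sketch of that one step; the callback name and sample payload are invented for illustration, and only the regex is taken from qqzone_spider.py.

#!/usr/bin/env python3
# Minimal sketch of the jsonp-unwrapping step in qqzone_spider.py.
# 'demo_Callback' and the payload below are made-up sample data.
import re
from json import loads

sample = 'demo_Callback({"code": 0, "data": {"albumListModeSort": []}});'
match = re.search(r'.*?\(({.*}).*?\).*', sample, re.S)
data = loads(match.group(1))  # capture group 1 holds the JSON object
print(data['data'])           # {'albumListModeSort': []}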