├── huaban ├── huaban │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── UserBoardsSpider.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ └── settings.py ├── run.py └── scrapy.cfg ├── vmgirls ├── vmgirls │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── vmgirl.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── middlewares.py ├── run.py └── scrapy.cfg ├── requirements.txt ├── .gitignore ├── capturer ├── main.py ├── LICENSE ├── telegram └── telegram_spider.py ├── README.md ├── fabiaoqing └── fabiaoqing_spider.py ├── netbian └── netbian_spider.py ├── sina └── sina_spider.py ├── toutiao └── toutiao_spider.py ├── lofter └── lofter_spider.py └── qqzone └── qqzone_spider.py /huaban/huaban/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /huaban/run.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | cmdline.execute('scrapy crawl UserBoardSpider'.split()) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lxml==4.6.5 2 | selenium==3.141.0 3 | Scrapy==2.5.1 4 | requests==2.20.0 5 | beautifulsoup4==4.9.1 6 | Pillow==8.3.2 7 | -------------------------------------------------------------------------------- /vmgirls/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from scrapy import cmdline 5 | 6 | cmdline.execute('scrapy crawl vmgirl'.split()) 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | *.swp 3 | *.html 4 | *.old 5 | *.log 6 | download/ 7 | build/ 8 | dist/ 9 | *.spec 10 | __pycache__/ 11 | .DS_Store 12 | */.DS_Store 13 | *.session 14 | -------------------------------------------------------------------------------- /huaban/huaban/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /capturer: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # author: 5km(smslit) 3 | # date: 2018.06.27 4 | # description: 5 | # a shell script to run main.py 6 | # this script can solve the problem 7 | # that python3 may be installed in a different path 8 | 9 | python3 ./main.py 10 | -------------------------------------------------------------------------------- /huaban/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = huaban.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = huaban 12 | -------------------------------------------------------------------------------- /vmgirls/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = vmgirls.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = vmgirls 12 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item 9 | from scrapy.item import Field 10 | 11 | 12 | class VmgirlsItem(Item): 13 | # define the fields for your item here like: 14 | # name = scrapy.Field() 15 | theme_urls = Field() 16 | theme_titles = Field() 17 | pass 18 | 19 | 20 | class VmgirlsImagesItem(Item): 21 | image_urls = Field() 22 | title = Field() 23 | pass 24 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # author: litreily 3 | # date: 2018.03.06 4 | # description: capture pictures from websites 5 | 6 | from importlib import import_module 7 | 8 | if __name__ == '__main__': 9 | webs = { 10 | '1': "sina", 11 | '2': "lofter", 12 | '3': "toutiao", 13 | '4': "qqzone", 14 | '5': 'telegram', 15 | '6': 'netbian' 16 | } 17 | 18 | tips = 'please select the web you want to capture (1-{0}, default=1)\n'.format(len(webs)) + \ 19 | ''.join(["\t{0} - {1}\n".format(i, webs.get(i)) for i in webs]) + \ 20 | 'You want to capture from: ' 21 | 22 | select = input(tips) 23 | if select not in webs: 24 | select = '1' 25 | 26 | module = "{0}.{0}_spider".format(webs.get(select)) 27 | spider = import_module(module) 28 | spider.main() 29 | -------------------------------------------------------------------------------- /huaban/huaban/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item 9 | from scrapy.item import Field 10 | 11 | 12 | class HuabanItem(Item): 13 | # define the fields for your item here 
like: 14 | # name = scrapy.Field() 15 | pass 16 | 17 | 18 | class BoardItem(Item): 19 | # define the fields for your item here like: 20 | # name = Field() 21 | title = Field() 22 | board_id = Field() 23 | category_id = Field() 24 | pin_count = Field() 25 | follow_count = Field() 26 | like_count = Field() 27 | pass 28 | 29 | 30 | class PinItem(Item): 31 | pin_id = Field() 32 | board_id = Field() 33 | board_title = Field() 34 | file_id = Field() 35 | file_key = Field() 36 | source = Field() 37 | tags = Field() 38 | pass 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 litreily@outlook.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/spiders/vmgirl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | from vmgirls.items import VmgirlsItem 5 | from vmgirls.items import VmgirlsImagesItem 6 | 7 | from scrapy.http import Request 8 | from scrapy.utils.project import get_project_settings 9 | 10 | import os 11 | 12 | 13 | class VmgirlSpider(scrapy.Spider): 14 | name = 'vmgirl' 15 | allowed_domains = ['vmgirls.com'] 16 | start_urls = ['https://www.vmgirls.com/sitemap.shtml/'] 17 | 18 | def __init__(self): 19 | settings = get_project_settings() 20 | self.user_data_dir = settings.get('USER_DATA_DIR') 21 | 22 | def parse(self, response): 23 | '''Parse sitemap''' 24 | urls = response.xpath('//*[@id="content"][1]/ul/li/a/@href').extract() 25 | titles = response.xpath( 26 | '//*[@id="content"][1]/ul/li/a/text()').extract() 27 | 28 | item = VmgirlsItem() 29 | item['theme_urls'] = urls 30 | item['theme_titles'] = titles 31 | yield item 32 | 33 | for url, title in zip(urls, titles): 34 | save_path = os.path.join(self.user_data_dir, title) 35 | if not os.path.isdir(save_path): 36 | os.makedirs(save_path) 37 | 38 | yield Request(url, meta={'title': title}, callback=self.parse_page) 39 | 40 | def parse_page(self, response): 41 | '''Parse each page of girls''' 42 | urls = response.xpath( 43 | '//*[@class="post-content"]//img/@data-src').extract() 44 | item = VmgirlsImagesItem() 45 | item['image_urls'] = urls 46 | item['title'] = response.meta['title'] 47 | yield item 48 | -------------------------------------------------------------------------------- /telegram/telegram_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | '''Capture pictures from a telegram channel 4 | The parameters below need to be replaced with your own: 5 | - api_id 6 | - api_hash 7 | - proxy 8 | Notice: 9 | The first time you may need to enter your phone number and a login code 10 | ''' 11 | 12 | import os 13 | import sys 14 | import socks 15 | 16 | from telethon import TelegramClient, sync 17 | from telethon.tl.types import InputMessagesFilterPhotos 18 | from telethon.helpers import TotalList 19 | 20 | 21 | def get_path(channel): 22 | home_path = os.path.expanduser('~') 23 | path = os.path.join(home_path, 'Pictures/python/telegram', channel) 24 | if not os.path.isdir(path): 25 | os.makedirs(path) 26 | return path 27 | 28 | 29 | def open_client(): 30 | # get api_id and api_hash from https://my.telegram.org/apps 31 | api_id = None 32 | api_hash = None 33 | 34 | if not api_id or not api_hash: 35 | print('Please set api_id and api_hash, you can get them from https://my.telegram.org/apps') 36 | sys.exit(1) 37 | 38 | # socks5 proxy, can be set to None if not needed 39 | proxy = (socks.SOCKS5, "localhost", 1080) 40 | return TelegramClient('tg_session', api_id=api_id, 41 | api_hash=api_hash, proxy=proxy).start() 42 | 43 | 44 | def get_photos(client, channel): 45 | tg_link = "https://t.me/" + channel 46 | 47 | # get photos 48 | print('Getting photos from ' + tg_link) 49 | return client.get_messages(tg_link, None, filter=InputMessagesFilterPhotos) 50 | 51 | 52 | def main(): 53 | tg_client = open_client() 54 | tg_channel = input('Please input telegram channel name: ') 55 | 56 | photos = get_photos(tg_client, tg_channel) 57 | total = photos.total 58 | 59 | save_path = get_path(tg_channel) 60 | 61 | print('Start downloading 
photos...') 62 | index = 0 63 | for photo in photos: 64 | filename = os.path.join(save_path, str(photo.id) + '.jpg') 65 | index = index + 1 66 | print("downloading {}/{} : {}".format(index, total, filename)) 67 | tg_client.download_media(photo, filename) 68 | 69 | tg_client.disconnect() 70 | print("Done.") 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # What's Capturer 2 | 3 | A capture tool used to capture pictures from websites like Sina, LOFTER, huaban and so on. 4 | 5 | > **If you have any suggestions or awesome picture websites you want to capture, please let me know!!!** 6 | 7 | ## Support Websites 8 | 9 | - [Sina](https://weibo.com/) 10 | - [Lofter](http://www.lofter.com/) 11 | - [Toutiao](https://www.toutiao.com) 12 | - ~~[QQZone](https://qzone.qq.com/)~~: Needs captcha verification 13 | - [Huaban](https://huaban.com/) 14 | - ~~[Vmgirls](https://www.vmgirls.com/)~~: Website has been upgraded 15 | - [Fabiaoqing](https://www.fabiaoqing.com/) 16 | - `telegram` 17 | - [NetBian](http://pic.netbian.com/) 18 | 19 | ## How to use 20 | 21 | - install `python3` and the libs listed in `requirements.txt` 22 | - update the [Parameters](#parameters) of each website you want to capture 23 | - run `./capturer` or run `main.py` or `***_spider.py` to capture images from 24 | - `sina` 25 | - `lofter` 26 | - `toutiao` 27 | - `qqzone` 28 | - `telegram` 29 | - `netbian` 30 | - run `huaban/run.py` to capture images from `huaban` 31 | - run `vmgirls/run.py` to capture images from `vmgirls` 32 | - run `fabiaoqing/fabiaoqing_spider.py key1 [key2] [key3] ...` 33 | 34 | ## Notices 35 | 36 | Almost all file paths are based on `~/Pictures/python`, where `~` means your home directory. 37 | 38 | ## Parameters 39 | 40 | ### huaban 41 | 42 | - `USERNAME`: username of the huaban account you want to capture 43 | - `ROOT_DIR`: directory where the images are stored 44 | 45 | ### Sina 46 | 47 | - `uid`: user id (10 digits) of the sina weibo account you want to capture 48 | - `cookies`: your cookies after logging in to sina weibo 49 | - `path`: directory to save the pictures 50 | 51 | ### Lofter 52 | 53 | - `username`: username of the lofter account you want to capture 54 | - `path`: directory to save the pictures, see the function `_get_path` in `lofter_spider.py` 55 | - `query_number`: number of blogs in each query packet, default value is 40 56 | 57 | ### Telegram 58 | 59 | - `api_id`: you can get it from <https://my.telegram.org/apps> 60 | - `api_hash`: you can get it from <https://my.telegram.org/apps> 61 | - `socks proxy`: set proxy ip and port, default is `localhost:1080` 62 | 63 | ## Blogs 64 | 65 | You can find all the related blogs at <https://www.litreily.top>. 
66 | 67 | - Lofter - [爬取网易LOFTER图片](https://www.litreily.top/2018/03/17/lofter/) 68 | - Sina - [爬取新浪微博用户图片](https://www.litreily.top/2018/04/10/sina/) 69 | - qqzone - [爬取QQ空间相册](https://www.litreily.top/2019/03/03/qqzone/) 70 | - Vmgirls - [Scrapy爬取vmgirls](https://www.litreily.top/2019/08/09/vmgirls/) 71 | - Netbian - [爬取彼岸图网美图](https://www.litreily.top/2020/08/09/netbian/) 72 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.exporters import JsonLinesItemExporter 9 | from scrapy.pipelines.images import ImagesPipeline 10 | from scrapy.exceptions import DropItem 11 | from scrapy.http import Request 12 | 13 | from vmgirls.items import VmgirlsItem 14 | from vmgirls.items import VmgirlsImagesItem 15 | 16 | import os 17 | 18 | 19 | class VmgirlsPipeline(object): 20 | '''Pipeline for every url of one theme, save theme info to json file''' 21 | 22 | def __init__(self, user_data_dir): 23 | '''Open file to save the exported Items''' 24 | self.user_data_dir = user_data_dir 25 | 26 | if not os.path.isdir(self.user_data_dir): 27 | os.makedirs(self.user_data_dir) 28 | 29 | @classmethod 30 | def from_crawler(cls, crawler): 31 | '''Get user dir from global settings.py''' 32 | settings = crawler.settings 33 | return cls(settings.get('USER_DATA_DIR')) 34 | 35 | def process_item(self, item, spider): 36 | '''Save item info to loacl file''' 37 | if isinstance(item, VmgirlsItem): 38 | self.girls_info = open( 39 | os.path.join(self.user_data_dir, 'vmgirls.json'), 'w+b') 40 | self.girls_exporter = JsonLinesItemExporter( 41 | self.girls_info, encoding='utf-8', indent=4) 42 | 43 | self.girls_exporter.start_exporting() 44 | 45 | for url, title in zip(item['theme_urls'], item['theme_titles']): 46 | single_item = {'theme_url': url, 'title': title} 47 | self.girls_exporter.export_item(single_item) 48 | 49 | self.girls_exporter.finish_exporting() 50 | self.girls_info.close() 51 | return item 52 | 53 | 54 | class VmgirlsImagesPipeline(ImagesPipeline): 55 | '''Get images from one theme''' 56 | 57 | def get_media_requests(self, item, info): 58 | if isinstance(item, VmgirlsImagesItem): 59 | for image_url in item['image_urls']: 60 | yield Request(image_url, meta={'item': item}) 61 | 62 | def file_path(self, request, response=None, info=None): 63 | '''Set image dir to IMAGES_STORE/title/base_url''' 64 | url = request.url 65 | item = request.meta['item'] 66 | path = os.path.join(item['title'], url.split('/')[-1]) 67 | return path 68 | 69 | def item_completed(self, results, item, info): 70 | if isinstance(item, VmgirlsImagesItem): 71 | image_paths = [x['path'] for ok, x in results if ok] 72 | 73 | if not image_paths: 74 | raise DropItem("Item contains no images") 75 | return item 76 | -------------------------------------------------------------------------------- /fabiaoqing/fabiaoqing_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # author: litreily 4 | # date: 2019.09.15 5 | '''按关键词爬取发表情网fabiaoqing.com的表情包''' 6 | 7 | import requests 8 | import os 9 | import sys 10 | from lxml import html 11 | 12 | 13 | base_url = 
'https://fabiaoqing.com/search/search/keyword/' 14 | headers = { 15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36' 16 | } 17 | 18 | 19 | def get_path(keyword): 20 | '''生成指定关键词对应的表情包存储路径''' 21 | home_path = os.path.expanduser('~') 22 | path = os.path.join(home_path, 'Pictures/python/表情包/' + keyword) 23 | if not os.path.isdir(path): 24 | os.makedirs(path) 25 | 26 | return os.path.realpath(path) 27 | 28 | 29 | def get_imgs(keyword): 30 | '''爬取某一个关键词相关的所有表情包 31 | 32 | Args: 33 | keyword: 表情包关键词 34 | ''' 35 | page_index = 0 36 | img_cnts = 0 37 | save_dir = get_path(keyword) 38 | while True: 39 | page_index = page_index + 1 40 | # https://fabiaoqing.com/search/search/keyword/抱抱/type/bq/page/1.html 41 | url = '{}{}/type/bq/page/{}.html'.format(base_url, keyword, page_index) 42 | response = requests.get(url, headers=headers).content 43 | page = html.fromstring(response) 44 | imgs = page.xpath( 45 | '//div[@class="searchbqppdiv tagbqppdiv"]//img/@data-original') 46 | 47 | print('爬取 "{}" 相关表情包第 {} 页:'.format(keyword, page_index)) 48 | img_cnts = download_imgs(imgs, img_cnts, save_dir) 49 | 50 | if page_index == 20 or len(imgs) == 0: 51 | break 52 | 53 | return img_cnts, save_dir 54 | 55 | 56 | def download_imgs(img_urls, starti, save_dir): 57 | '''下载单个页面内所有图片 58 | 59 | Args: 60 | img_urls: 关键词相关表情包某一分页的所有图片链接 61 | starti: 当前页面首个图片命名id 62 | save_dir: 图片存储路径 63 | ''' 64 | fid = starti 65 | for img in img_urls: 66 | print('\t' + img) 67 | fid = fid + 1 68 | file_name = '{}.{}'.format(fid, os.path.basename(img).split('.')[-1]) 69 | save_path = os.path.join(save_dir, file_name) 70 | 71 | try: 72 | with open(save_path, 'wb') as f: 73 | f.write(requests.get(img, headers=headers, timeout=20).content) 74 | except requests.exceptions.ConnectionError as ce: 75 | print(ce.strerror()) 76 | except requests.exceptions.MissingSchema: 77 | print(img + ' missing schema') 78 | except requests.exceptions.ReadTimeout: 79 | print('get {} timeout, skip this item.'.format(img)) 80 | finally: 81 | pass 82 | 83 | return fid 84 | 85 | 86 | def usage(): 87 | print('Usage:\n\t' + os.path.basename(sys.argv[0]) + 88 | ' [key1] [key2] [key3] ...\n') 89 | 90 | 91 | def main(): 92 | if len(sys.argv) < 2: 93 | usage() 94 | sys.exit(0) 95 | 96 | print('============================================') 97 | for keyword in sys.argv[1:]: 98 | print('开始爬取关键词为 "{}" 的表情包:'.format(keyword)) 99 | count, save_dir = get_imgs(keyword) 100 | print('共爬取 "{}" 表情包 {} 个'.format(keyword, count)) 101 | print('文件存储于"{}"'.format(save_dir)) 102 | print('\n爬取完成!') 103 | print('============================================') 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for vmgirls project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | import os 13 | BOT_NAME = 'vmgirls' 14 | 15 | SPIDER_MODULES = ['vmgirls.spiders'] 16 | NEWSPIDER_MODULE = 'vmgirls.spiders' 17 | 18 | USER_DIR = os.path.expanduser('~') 19 | USER_DATA_DIR = os.path.join(USER_DIR, 'Pictures/python/vmgirls') 20 | IMAGES_STORE = USER_DATA_DIR 21 | 22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 23 | #USER_AGENT = 'vmgirls (+http://www.yourdomain.com)' 24 | 25 | # Obey robots.txt rules 26 | ROBOTSTXT_OBEY = False 27 | 28 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 29 | #CONCURRENT_REQUESTS = 32 30 | 31 | # Configure a delay for requests for the same website (default: 0) 32 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 33 | # See also autothrottle settings and docs 34 | #DOWNLOAD_DELAY = 3 35 | # The download delay setting will honor only one of: 36 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 37 | #CONCURRENT_REQUESTS_PER_IP = 16 38 | 39 | # Disable cookies (enabled by default) 40 | #COOKIES_ENABLED = False 41 | 42 | # Disable Telnet Console (enabled by default) 43 | #TELNETCONSOLE_ENABLED = False 44 | 45 | # Override the default request headers: 46 | # DEFAULT_REQUEST_HEADERS = { 47 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 48 | # 'Accept-Language': 'en', 49 | # } 50 | 51 | # Enable or disable spider middlewares 52 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 53 | # SPIDER_MIDDLEWARES = { 54 | # 'vmgirls.middlewares.VmgirlsSpiderMiddleware': 543, 55 | # } 56 | 57 | # Enable or disable downloader middlewares 58 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 59 | # DOWNLOADER_MIDDLEWARES = { 60 | # 'vmgirls.middlewares.VmgirlsDownloaderMiddleware': 543, 61 | # } 62 | 63 | # Enable or disable extensions 64 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 65 | # EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | # } 68 | 69 | # Configure item pipelines 70 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 71 | ITEM_PIPELINES = { 72 | 'vmgirls.pipelines.VmgirlsPipeline': 300, 73 | 'vmgirls.pipelines.VmgirlsImagesPipeline': 400 74 | } 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 78 | #AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | #HTTPCACHE_ENABLED = True 92 | #HTTPCACHE_EXPIRATION_SECS = 0 93 | #HTTPCACHE_DIR = 'httpcache' 94 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | 
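(Sketch, not a file from this repository: the `vmgirl` spider configured by the settings above is normally started through `cmdline.execute` in `vmgirls/run.py`; as an alternative it can also be launched in-process with Scrapy's `CrawlerProcess`. This assumes the script is run from the `vmgirls/` project directory, next to `scrapy.cfg`, so that `get_project_settings()` can locate `vmgirls/settings.py`.)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Sketch only, not part of the repository: start the vmgirl spider in-process.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def run():
    # Load vmgirls/settings.py (USER_DATA_DIR, IMAGES_STORE, ITEM_PIPELINES, ...)
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    # 'vmgirl' is the spider name declared in vmgirls/spiders/vmgirl.py
    process.crawl('vmgirl')
    # Blocks until the crawl has finished
    process.start()


if __name__ == '__main__':
    run()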
-------------------------------------------------------------------------------- /huaban/huaban/spiders/UserBoardsSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider 3 | from scrapy.http import Request 4 | from scrapy.http import Headers 5 | from scrapy.utils.project import get_project_settings 6 | 7 | import re 8 | 9 | import json 10 | 11 | from huaban.items import BoardItem 12 | from huaban.items import PinItem 13 | 14 | 15 | class UserboardsspiderSpider(Spider): 16 | name = 'UserBoardSpider' 17 | allowed_domains = ['huaban.com'] 18 | 19 | def __init__(self): 20 | settings = get_project_settings() 21 | self.username = settings.get('USERNAME') 22 | self.hostname = 'http://huaban.com' 23 | self.start_urls = ['{0}/{1}/'.format(self.hostname, self.username)] 24 | 25 | def parse(self, response): 26 | '''Get boards info from home page of user''' 27 | # Enable below comments need disable huabanDownloaderMiddleware 28 | # data = response.xpath('body/script[1]').extract_first() 29 | # user_page = re.search('app\.page\["user"\]\s*=\s*({.*});', data)[1] 30 | # info = json.loads(user_page, encoding='utf-8') 31 | # boards = info.get('boards') 32 | 33 | info = json.loads(response.text, encoding='utf-8') 34 | boards = info['user'].get('boards') 35 | 36 | if not boards: 37 | return 38 | 39 | # Get BoardItem and capture all boards 40 | for board in boards: 41 | item = BoardItem() 42 | item['title'] = board['title'] 43 | item['board_id'] = board['board_id'] 44 | item['category_id'] = board['category_id'] 45 | item['pin_count'] = board['pin_count'] 46 | item['follow_count'] = board['follow_count'] 47 | item['like_count'] = board['like_count'] 48 | yield item 49 | 50 | board_url = '{0}/boards/{1}'.format(self.hostname, 51 | board['board_id']) 52 | yield Request(board_url, meta={'board_title': board['title']}, callback=self.parse_pins) 53 | 54 | # Get more boards info 55 | # Request parameters: 56 | # max: the last board_id get from boards 57 | # limit: default 10, it's the limit number of boards, can be modified 58 | board_req = '{0}/{1}/?jg0gcj0&max={2}&limit={3}&wfl=1'.format(self.hostname, 59 | self.username, boards[-1]['board_id'], 10) 60 | yield Request(board_req, callback=self.parse) 61 | 62 | def parse_pins(self, response): 63 | board_data = json.loads(response.text, encoding='utf-8') 64 | pins = board_data['board'].get('pins') 65 | board_title = response.meta['board_title'] 66 | 67 | if not pins: 68 | return 69 | 70 | for pin in pins: 71 | item = PinItem() 72 | item['pin_id'] = pin['pin_id'] 73 | item['board_id'] = pin['board_id'] 74 | item['board_title'] = board_title 75 | item['file_id'] = pin['file_id'] 76 | item['file_key'] = pin['file']['key'] 77 | item['source'] = pin['source'] 78 | item['tags'] = pin['tags'] 79 | yield item 80 | 81 | # Get more pins info 82 | # Request parameters: 83 | # max: the last pin_id get from pins 84 | # limit: default 20, it's the limit number of pins, can be modified 85 | pin_req = '{0}/boards/{1}/?jg6nr2rm&max={2}&limit={3}&wfl=1'.format( 86 | self.hostname, pins[-1]['board_id'], pins[-1]['pin_id'], 20) 87 | yield Request(pin_req, meta={'board_title': board_title}, callback=self.parse_pins) 88 | -------------------------------------------------------------------------------- /netbian/netbian_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import 
requests 6 | from lxml import html 7 | from multiprocessing import Pool, cpu_count 8 | 9 | 10 | class NetbianSpider(object): 11 | def __init__(self): 12 | self.index = 'http://pic.netbian.com' 13 | self.headers = { 14 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' 15 | } 16 | 17 | def get_path(self, name): 18 | home_path = os.path.expanduser('~') 19 | path = os.path.join(home_path, 'Pictures/python/netbian/' + name) 20 | if not os.path.isdir(path): 21 | os.makedirs(path) 22 | 23 | return os.path.realpath(path) 24 | 25 | def get_categories(self): 26 | '''get categories of website''' 27 | res = requests.get(self.index, headers=self.headers) 28 | doc = html.fromstring(res.content) 29 | categories = doc.xpath('//div[contains(@class, "classify")]/a') 30 | 31 | for category in categories: 32 | name = category.xpath('text()')[0] 33 | url = category.xpath('@href')[0] 34 | yield name, url 35 | 36 | def spider_by_category(self, category, url): 37 | '''Process function which use to capture images base on category''' 38 | path_category = self.get_path(category) 39 | detail_pages, page_cnt = self.parse_thumb_page(url, first_page=True) 40 | 41 | img_cnt = 0 42 | page_num = 1 43 | while True: 44 | for page in detail_pages: 45 | img_cnt += 1 46 | 47 | print('[{} page-{} img-{}] Parsing page {}'.format( 48 | category, page_num, img_cnt, page)) 49 | img_url = self.parse_detail_page(page) 50 | self.download_image(img_url, path_category) 51 | 52 | page_num += 1 53 | if page_num > page_cnt: 54 | break 55 | detail_pages = self.parse_thumb_page( 56 | '{}index_{}.html'.format(url, page_num)) 57 | 58 | def parse_thumb_page(self, url, first_page=False): 59 | '''parse thumbnail page and get all the detail pages url''' 60 | res = requests.get(self.index + url, headers=self.headers) 61 | doc = html.fromstring(res.content) 62 | detail_pages = doc.xpath('//div[@class="slist"]//a/@href') 63 | 64 | if first_page: 65 | page_cnt = doc.xpath( 66 | '//span[@class="slh"]/following-sibling::a[1]/text()')[0] 67 | return detail_pages, int(page_cnt) 68 | else: 69 | return detail_pages 70 | 71 | def parse_detail_page(self, url): 72 | '''parse detail page and get source image url''' 73 | res = requests.get(self.index + url, headers=self.headers) 74 | doc = html.fromstring(res.content) 75 | img_url = doc.xpath('//*[@id="img"]/img/@src')[0] 76 | 77 | return img_url 78 | 79 | def download_image(self, url, path): 80 | img_name = url.split('/')[-1] 81 | save_path = os.path.join(path, img_name) 82 | 83 | res = requests.get(self.index + url, headers=self.headers, timeout=20) 84 | if res.status_code == 200: 85 | with open(save_path, 'wb') as f: 86 | f.write(res.content) 87 | 88 | 89 | def main(): 90 | spider = NetbianSpider() 91 | categories = spider.get_categories() 92 | 93 | p = Pool(cpu_count()) 94 | for name, url in categories: 95 | p.apply_async(spider.spider_by_category, args=(name, url)) 96 | 97 | p.close() 98 | p.join() 99 | print('All Done!') 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /vmgirls/vmgirls/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class 
VmgirlsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class VmgirlsDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /huaban/huaban/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class HuabanSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class HuabanDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | request.headers['Accept'] = 'application/json' 76 | request.headers['X-Request'] = 'JSON' 77 | request.headers['X-Requested-With'] = 'XMLHttpRequest' 78 | 79 | # Must either: 80 | # - return None: continue processing this request 81 | # - or return a Response object 82 | # - or return a Request object 83 | # - or raise IgnoreRequest: process_exception() methods of 84 | # installed downloader middleware will be called 85 | return None 86 | 87 | def process_response(self, request, response, spider): 88 | # Called with the response returned from the downloader. 89 | 90 | # Must either; 91 | # - return a Response object 92 | # - return a Request object 93 | # - or raise IgnoreRequest 94 | return response 95 | 96 | def process_exception(self, request, exception, spider): 97 | # Called when a download handler or a process_request() 98 | # (from other downloader middleware) raises an exception. 99 | 100 | # Must either: 101 | # - return None: continue processing this exception 102 | # - return a Response object: stops process_exception() chain 103 | # - return a Request object: stops process_exception() chain 104 | pass 105 | 106 | def spider_opened(self, spider): 107 | spider.logger.info('Spider opened: %s' % spider.name) 108 | -------------------------------------------------------------------------------- /huaban/huaban/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.exporters import JsonItemExporter 9 | from scrapy.exporters import JsonLinesItemExporter 10 | 11 | from scrapy.pipelines.images import ImagesPipeline 12 | from scrapy.http import Request 13 | from scrapy.exceptions import DropItem 14 | 15 | from huaban.items import BoardItem 16 | from huaban.items import PinItem 17 | 18 | import os 19 | from os.path import join, basename 20 | 21 | from urllib.parse import urlparse 22 | 23 | from PIL import Image 24 | 25 | try: 26 | from cStringIO import StringIO as BytesIO 27 | except ImportError: 28 | from io import BytesIO 29 | 30 | 31 | class HuabanPipeline(object): 32 | def __init__(self, user_data_dir): 33 | '''Open file to save the exported Items''' 34 | self.user_data_dir = user_data_dir 35 | 36 | if not os.path.isdir(self.user_data_dir): 37 | os.makedirs(self.user_data_dir) 38 | 39 | # save info of BoardItem 40 | self.board_info = open(self.user_data_dir + 'boards.json', 'w+b') 41 | self.board_exporter = JsonItemExporter( 42 | self.board_info, encoding='utf-8', indent=4) 43 | 44 | # save info of PinItem 45 | self.pin_info = open(self.user_data_dir + 'pins.json', 'w+b') 46 | self.pin_exporter = JsonLinesItemExporter( 47 | self.pin_info, encoding='utf-8', indent=4) 48 | 49 | @classmethod 50 | def from_crawler(cls, crawler): 51 | '''get some global settings from settings.py''' 52 | settings = crawler.settings 53 | return cls(settings.get('USER_DATA_DIR')) 54 | 55 | def open_spider(self, spider): 56 | '''Start exporting BoardItem''' 57 | self.board_exporter.start_exporting() 58 | self.pin_exporter.start_exporting() 59 | 60 | def process_item(self, item, spider): 61 | if isinstance(item, BoardItem): 62 | self.board_exporter.export_item(item) 63 | elif isinstance(item, PinItem): 64 | self.pin_exporter.export_item(item) 65 | 66 | return item 67 | 68 | def close_spider(self, spider): 69 | '''finish exporting 
and close files''' 70 | self.board_exporter.finish_exporting() 71 | self.pin_exporter.finish_exporting() 72 | self.board_info.close() 73 | self.pin_info.close() 74 | 75 | 76 | class HuabanImagesPipeline(ImagesPipeline): 77 | '''Implement image downloader by inherit class ImagesPipeline''' 78 | 79 | def get_media_requests(self, item, info): 80 | if isinstance(item, PinItem): 81 | image_url = 'http://img.hb.aicdn.com/' + item['file_key'] 82 | yield Request(image_url, meta={'item': item}) 83 | 84 | def file_path(self, request, response=None, info=None): 85 | url_path = urlparse(request.url).path 86 | item = request.meta['item'] 87 | board_title = item['board_title'] 88 | # file path: IMAGE_STORE/images/[BOARD_TITLE]/[URL_PATH].jpg 89 | return join('images', board_title.replace(':', '-'), basename(url_path)) 90 | 91 | def check_gif(self, image): 92 | if image.format == 'GIF': 93 | return True 94 | else: 95 | return image.info.get('version') in ['GIF89a', 'GIF87a'] 96 | 97 | def get_images(self, response, request, info): 98 | path = self.file_path(request, response=response, info=info) 99 | orig_image = Image.open(BytesIO(response.body)) 100 | 101 | if self.check_gif(orig_image): 102 | path += '.gif' 103 | abs_path = self.store._get_filesystem_path(path) 104 | self.store._mkdir(os.path.dirname(abs_path), info) 105 | 106 | # save gif image from reponse 107 | with open(abs_path, 'wb') as f: 108 | f.write(response.body) 109 | return None 110 | else: 111 | path += '.jpg' 112 | image, buf = self.convert_image(orig_image) 113 | 114 | yield path, image, buf 115 | 116 | def item_completed(self, results, item, info): 117 | image_paths = [x['path'] for ok, x in results if ok] 118 | if not image_paths: 119 | raise DropItem("Item contains no images") 120 | return item 121 | -------------------------------------------------------------------------------- /huaban/huaban/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for huaban project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | import time 13 | import os 14 | 15 | BOT_NAME = 'huaban' 16 | 17 | SPIDER_MODULES = ['huaban.spiders'] 18 | NEWSPIDER_MODULE = 'huaban.spiders' 19 | 20 | # Custom settings 21 | # USERNAME = 'meirijingxuan' 22 | # USERNAME = 'dsk1985' 23 | USERNAME = 'litreily' 24 | ROOT_DIR = '{0}/Pictures/python/huaban/'.format(os.path.expanduser('~')) 25 | USER_DIR = ROOT_DIR + USERNAME 26 | USER_DATA_DIR = USER_DIR + '/json/' 27 | 28 | # Log 29 | SAVE_LOG = False 30 | if SAVE_LOG: 31 | LOG_DIR = ROOT_DIR + '.log' 32 | LOG_ENCODING = 'utf-8' 33 | 34 | if not os.path.isdir(LOG_DIR): 35 | os.makedirs(LOG_DIR) 36 | 37 | localtime = time.strftime('%Y%m%d_%H%M%S', time.localtime()) 38 | LOG_FILE = '{0}/{1}_{2}.log'.format(LOG_DIR, USERNAME, localtime) 39 | 40 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 41 | USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' 42 | ' (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36') 43 | 44 | # Obey robots.txt rules 45 | ROBOTSTXT_OBEY = True 46 | 47 | # Configure a item exporter to save items 48 | # FEED_FORMAT = 'json' 49 | # FEED_URI = 'file:///D:/litreily/Pictures/python/huaban/boards.json' 50 | # FEED_EXPORTERS_BASE = { 51 | # 'json': 'scrapy.exporters.JsonItemExporter', 52 | # 'jsonlines': 'scrapy.exporters.JsonLinesItemExporter', 53 | # } 54 | 55 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 56 | #CONCURRENT_REQUESTS = 32 57 | 58 | # Configure a delay for requests for the same website (default: 0) 59 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 60 | # See also autothrottle settings and docs 61 | #DOWNLOAD_DELAY = 3 62 | # The download delay setting will honor only one of: 63 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 64 | #CONCURRENT_REQUESTS_PER_IP = 16 65 | 66 | # Disable cookies (enabled by default) 67 | #COOKIES_ENABLED = False 68 | 69 | # Disable Telnet Console (enabled by default) 70 | #TELNETCONSOLE_ENABLED = False 71 | 72 | # Override the default request headers: 73 | # DEFAULT_REQUEST_HEADERS = { 74 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 75 | # 'Accept-Language': 'en', 76 | # } 77 | 78 | # Enable or disable spider middlewares 79 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 80 | # SPIDER_MIDDLEWARES = { 81 | # 'huaban.middlewares.HuabanSpiderMiddleware': 543, 82 | # } 83 | 84 | # Enable or disable downloader middlewares 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 86 | DOWNLOADER_MIDDLEWARES = { 87 | 'huaban.middlewares.HuabanDownloaderMiddleware': 543, 88 | } 89 | 90 | # Enable or disable extensions 91 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 92 | # EXTENSIONS = { 93 | # 'scrapy.extensions.telnet.TelnetConsole': None, 94 | # } 95 | 96 | # Configure item pipelines 97 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 98 | ITEM_PIPELINES = { 99 | 'huaban.pipelines.HuabanPipeline': 300, 100 | 'huaban.pipelines.HuabanImagesPipeline': 400, 101 | } 102 | IMAGES_STORE = USER_DIR 103 | 104 | # Enable and configure the AutoThrottle extension (disabled by default) 105 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 106 
| #AUTOTHROTTLE_ENABLED = True 107 | # The initial download delay 108 | #AUTOTHROTTLE_START_DELAY = 5 109 | # The maximum download delay to be set in case of high latencies 110 | #AUTOTHROTTLE_MAX_DELAY = 60 111 | # The average number of requests Scrapy should be sending in parallel to 112 | # each remote server 113 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 114 | # Enable showing throttling stats for every response received: 115 | #AUTOTHROTTLE_DEBUG = False 116 | 117 | # Enable and configure HTTP caching (disabled by default) 118 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 119 | #HTTPCACHE_ENABLED = True 120 | #HTTPCACHE_EXPIRATION_SECS = 0 121 | #HTTPCACHE_DIR = 'httpcache' 122 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 123 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 124 | -------------------------------------------------------------------------------- /sina/sina_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # author: litreily 4 | # date: 2018.02.05 5 | """Capture pictures from sina-weibo with user_id.""" 6 | 7 | import re 8 | import os 9 | 10 | import socket 11 | import urllib.request 12 | 13 | from bs4 import BeautifulSoup 14 | 15 | 16 | def _get_path(uid): 17 | home_path = os.path.expanduser('~') 18 | path = os.path.join(home_path, 'Pictures/python/sina', uid) 19 | if not os.path.isdir(path): 20 | os.makedirs(path) 21 | return path 22 | 23 | 24 | def _get_html(url, headers): 25 | try: 26 | req = urllib.request.Request(url, headers=headers) 27 | page = urllib.request.urlopen(req) 28 | html = page.read().decode('UTF-8') 29 | except Exception as e: 30 | print("get %s failed" % url) 31 | return None 32 | return html 33 | 34 | 35 | def _capture_images(uid, headers, path): 36 | filter_mode = 1 # 0-all 1-original 2-pictures 37 | num_pages = 1 38 | num_blogs = 0 39 | num_imgs = 0 40 | 41 | # regular expression of imgList and img 42 | imglist_reg = r'href="(https://weibo.cn/mblog/picAll/.{9}\?rl=2)"' 43 | imglist_pattern = re.compile(imglist_reg) 44 | img_reg = r'src="(http://w.{2}\.sinaimg.cn/(.{6,8})/.{32,33}.(jpg|gif))"' 45 | img_pattern = re.compile(img_reg) 46 | 47 | print('start capture picture of uid:' + uid) 48 | while True: 49 | url = 'https://weibo.cn/%s/profile?filter=%s&page=%d' % ( 50 | uid, filter_mode, num_pages) 51 | 52 | # 1. get html of each page url 53 | html = _get_html(url, headers) 54 | if html == None: 55 | print('\nPlease check your user id or cookies in sina_spider.py!\n') 56 | os.removedirs(path) 57 | break 58 | 59 | # 2. parse the html and find all the imgList Url of each page 60 | soup = BeautifulSoup(html, "lxml") 61 | #
62 | blogs = soup.body.find_all( 63 | attrs={'id': re.compile(r'^M_')}, recursive=False) 64 | num_blogs += len(blogs) 65 | 66 | if num_pages == 1: 67 | # get number of pages 68 | max_pages = soup.find('input', attrs={'name': 'mp'}).attrs['value'] 69 | 70 | imgurls = [] 71 | for blog in blogs: 72 | blog = str(blog) 73 | imglist_url = imglist_pattern.findall(blog) 74 | if not imglist_url: 75 | # 2.1 get img-url from blog that have only one pic 76 | imgurls += img_pattern.findall(blog) 77 | else: 78 | # 2.2 get img-urls from blog that have group pics 79 | html = _get_html(imglist_url[0], headers) 80 | imgurls += img_pattern.findall(html) 81 | 82 | if num_pages > int(max_pages): 83 | print('capture complete!') 84 | print('captured pages:%d, blogs:%d, imgs:%d' % 85 | (num_pages - 1, num_blogs, num_imgs)) 86 | print('directory:' + path) 87 | break 88 | 89 | # 3. download all the imgs from each imgList 90 | print('PAGE {}/{} with {} images'.format(num_pages, max_pages, len(imgurls))) 91 | for img in imgurls: 92 | imgurl = img[0].replace(img[1], 'large') 93 | num_imgs += 1 94 | count = 1 95 | try: 96 | urllib.request.urlretrieve( 97 | imgurl, '{}/{}.{}'.format(path, num_imgs, img[2])) 98 | except socket.timeout: 99 | while count <= 3: 100 | try: 101 | urllib.request.urlretrieve( 102 | imgurl, '{}/{}.{}'.format(path, num_imgs, img[2])) 103 | break 104 | except socket.timeout: 105 | count += 1 106 | finally: 107 | # display the raw url of images 108 | print('\t%d\t%s' % (num_imgs, imgurl)) 109 | if count > 3: 110 | print('\t%d\t%s failed' % (num_imgs, imgurl)) 111 | pass 112 | num_pages += 1 113 | print('') 114 | 115 | 116 | def main(): 117 | uid = input('please input user id (e.g. 1969308311) :') 118 | path = _get_path(uid) 119 | socket.setdefaulttimeout(20) 120 | 121 | # cookie is form the above url->network->request headers 122 | cookies = '' 123 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', 124 | 'Cookie': cookies} 125 | 126 | # capture imgs from sina 127 | _capture_images(uid, headers, path) 128 | 129 | 130 | if __name__ == '__main__': 131 | main() 132 | -------------------------------------------------------------------------------- /toutiao/toutiao_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Description: Get pisture collections from www.toutiao.com 4 | Tip: This script comes from https://github.com/smslit/spider-collection/tree/master/ttpic 5 | Link: https://www.smslit.top/2018/06/21/spider-practice-pic-dog/ 6 | """ 7 | __author__ = '5km(smslit)' 8 | __date__ = '20180627' 9 | 10 | import os 11 | import requests 12 | from hashlib import md5 13 | from functools import partial 14 | from urllib.parse import urlencode 15 | from multiprocessing.pool import Pool 16 | 17 | PAGE_NUM = 5 18 | 19 | HEADERS = { 20 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15', 21 | 'X-Requested-With': 'XMLHttpRequest' 22 | } 23 | 24 | 25 | def get_page_json(offset, keyword): 26 | '''生成ajax请求,发出get请求,并获取响应结果,以字典的形式返回json数据 27 | :param offset: 页面请求偏移值 28 | :type offset: 能被20整除的整数 29 | :param keyword: 图片搜索的关键词 30 | :type keyword: unicode字符串 31 | :return: 响应数据 32 | :rtype: dict 33 | ''' 34 | params = { 35 | 'aid': 24, 36 | 'app_name': 'web_search', 37 | 'offset': offset, 38 | 'format': 'json', 39 | 'keyword': keyword, 40 | 'autoload': 'true', 41 | 
'count': '20', 42 | 'cur_tab': '3', 43 | 'from': 'gallery' 44 | } 45 | 46 | url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params) 47 | try: 48 | response = requests.get(url, headers=HEADERS) 49 | if response.status_code == 200: 50 | return response.json() 51 | except requests.ConnectionError as e: 52 | print('Error', e) 53 | 54 | 55 | def parse_images_url(json): 56 | '''解析json字典,获得对应条目标题和图片链接 57 | :param json: 页面请求响应结果对应的字典数据 58 | :type json: dict 59 | ''' 60 | data = json['data'] 61 | if data: 62 | for item in data: 63 | try: 64 | title = item['title'] 65 | images = item['image_list'] 66 | except KeyError: 67 | continue 68 | if images: 69 | for image in images: 70 | yield { 71 | 'image': image['url'].replace('list', 'origin').replace('190x124/', ''), 72 | 'title': title 73 | } 74 | 75 | 76 | def save_image_from(image_info, to_dir=''): 77 | '''根据链接获取图片,保存到标题命名的目录中,图片以md5码命名 78 | :param image_info: 包含标题和链接信息的字典数据 79 | :type image_info: dict 80 | :param to_dir: 要保存到的目录,默认值是空字符串,可指定目录,格式如: '狗狗' 81 | :type to_dir: unicode 字符串 82 | ''' 83 | if image_info: 84 | print(image_info) 85 | image_dir = to_dir + '/' + image_info['title'] 86 | if not os.path.exists(image_dir): 87 | os.makedirs(image_dir) 88 | try: 89 | response = requests.get(image_info['image']) 90 | if response.status_code == 200: 91 | image_path = '{0}/{1}.{2}'.format(image_dir, 92 | md5(response.content).hexdigest(), 'jpg') 93 | print(image_path) 94 | if not os.path.exists(image_path): 95 | with open(image_path, 'wb') as f: 96 | f.write(response.content) 97 | print('图片下载完成!') 98 | else: 99 | print('图片已下载 -> ', image_path) 100 | except requests.ConnectionError as e: 101 | print('图片下载失败!') 102 | except: 103 | print('出现异常!') 104 | 105 | 106 | def get_images_of(offset, keyword): 107 | '''主函数,用于多进程调度 108 | :param offset: 页面链接offset参数的值 109 | :type offset: 能被20整除的整数 110 | :param keyword: 图片搜索关键词 111 | :type keyword: unicode字符串 112 | ''' 113 | print('获取第', offset + 1, '~', offset + 20, '个条目...') 114 | json = get_page_json(offset, keyword) 115 | for image in parse_images_url(json): 116 | path = os.path.join(os.path.expanduser( 117 | '~'), 'Pictures/python/toutiao', keyword) 118 | save_image_from(image, path) 119 | 120 | 121 | def main(): 122 | print('\nWelcome here to get pictures from www.toutiao.com!') 123 | keyword = input('Please input your search keywords > ') 124 | count = None 125 | while count == None: 126 | number_str = input( 127 | 'Please input count of picture collection that you want(Divisible by 20 ) > ') 128 | try: 129 | count = int(number_str) 130 | except ValueError: 131 | print('Please input a valid number!') 132 | 133 | if count > 0: 134 | print('Getting %s pictures...' 
% keyword) 135 | page_num = count // 20 + (0 if count % 20 == 0 else 1) 136 | offset_list = [x * 20 for x in range(0, page_num)] 137 | pool = Pool() 138 | partial_getter = partial(get_images_of, keyword=keyword) 139 | pool.map(partial_getter, offset_list) 140 | pool.close() 141 | pool.join() 142 | else: 143 | print('Get Cancel!') 144 | 145 | 146 | if __name__ == '__main__': 147 | main() 148 | -------------------------------------------------------------------------------- /lofter/lofter_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # author: litreily 4 | # date: 2018.03.07 5 | """Capture pictures from lofter with username.""" 6 | 7 | import re 8 | import os 9 | import sys 10 | 11 | import requests 12 | 13 | import time 14 | import random 15 | 16 | 17 | def _get_path(uid): 18 | home_path = os.path.expanduser('~') 19 | path = os.path.join(home_path, 'Pictures/python/lofter', uid) 20 | if not os.path.isdir(path): 21 | os.makedirs(path) 22 | return path 23 | 24 | 25 | def _get_html(url, data, headers): 26 | try: 27 | html = requests.post(url, data, headers=headers) 28 | except Exception as e: 29 | print("get %s failed\n%s" % (url, str(e))) 30 | return None 31 | finally: 32 | pass 33 | return html 34 | 35 | 36 | def _get_blogid(username): 37 | try: 38 | html = requests.get('http://%s.lofter.com' % username) 39 | id_reg = r'src="//www.lofter.com/control\?blogId=(.*)"' 40 | blogid = re.search(id_reg, html.text).group(1) 41 | print('The blogid of %s is: %s' % (username, blogid)) 42 | return blogid 43 | except Exception as e: 44 | print('get blogid from http://%s.lofter.com failed' % username) 45 | print('please check your username.') 46 | exit(1) 47 | 48 | 49 | def _get_timestamp(html, time_pattern): 50 | if not html: 51 | timestamp = round(time.time() * 1000) # first timestamp(ms) 52 | else: 53 | timestamp = time_pattern.search(html).group(1) 54 | return str(timestamp) 55 | 56 | 57 | def _get_imgurls(username, blog, headers): 58 | blog_url = 'http://%s.lofter.com/post/%s' % (username, blog) 59 | blog_html = requests.get(blog_url, headers=headers).text 60 | imgurls = re.findall(r'bigimgsrc="(.*?)"', blog_html) 61 | print('Blog\t%s\twith %d\tpictures' % (blog_url, len(imgurls))) 62 | return imgurls 63 | 64 | 65 | def _capture_images(imgurl, path): 66 | headers = { 67 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'} 68 | for i in range(1, 3): 69 | try: 70 | image_request = requests.get(imgurl, headers=headers, timeout=20) 71 | if image_request.status_code == 200: 72 | open(path, 'wb').write(image_request.content) 73 | break 74 | except requests.exceptions.ConnectionError as e: 75 | print('\tGet %s failed\n\terror:%s' % (imgurl, e)) 76 | if i == 1: 77 | imgurl = re.sub(r'^http://img.*?\.', 'http://img.', imgurl) 78 | print('\tRetry ' + imgurl) 79 | else: 80 | print('\tRetry fail') 81 | except Exception as e: 82 | print(e) 83 | finally: 84 | pass 85 | 86 | 87 | def _create_query_data(blogid, timestamp, query_number): 88 | data = {'callCount': '1', 89 | 'scriptSessionId': '${scriptSessionId}187', 90 | 'httpSessionId': '', 91 | 'c0-scriptName': 'ArchiveBean', 92 | 'c0-methodName': 'getArchivePostByTime', 93 | 'c0-id': '0', 94 | 'c0-param0': 'boolean:false', 95 | 'c0-param1': 'number:' + blogid, 96 | 'c0-param2': 'number:' + timestamp, 97 | 'c0-param3': 'number:' + query_number, 98 | 'c0-param4': 'boolean:false', 99 | 
100 |     return data
101 | 
102 | 
103 | def main(argv):
104 |     # prepare parameters
105 |     if len(argv) < 2:
106 |         print('Usage: %s username' % os.path.basename(argv[0]))
107 |         sys.exit(1)
108 |     username = argv[1]
109 |     blogid = _get_blogid(username)
110 |     query_number = 40
111 |     time_pattern = re.compile(r's%d\.time=(.*);s.*type' % (query_number-1))
112 |     blog_url_pattern = re.compile(r's[\d]*\.permalink="([\w_]*)"')
113 | 
114 |     # create path to save images
115 |     path = _get_path(username)
116 | 
117 |     # parameters of post packet
118 |     url = 'http://%s.lofter.com/dwr/call/plaincall/ArchiveBean.getArchivePostByTime.dwr' % username
119 |     data = _create_query_data(blogid, _get_timestamp(
120 |         None, time_pattern), str(query_number))
121 |     headers = {
122 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
123 |         'Host': username + '.lofter.com',
124 |         'Referer': 'http://%s.lofter.com/view' % username,
125 |         'Accept-Encoding': 'gzip, deflate'
126 |     }
127 | 
128 |     num_blogs = 0
129 |     num_imgs = 0
130 |     index_img = 0
131 |     print('------------------------------- start line ------------------------------')
132 |     while True:
133 |         html = _get_html(url, data, headers).text
134 |         # get urls of blogs: s3.permalink="44fbca_19a6b1b"
135 |         new_blogs = blog_url_pattern.findall(html)
136 |         num_new_blogs = len(new_blogs)
137 |         num_blogs += num_new_blogs
138 | 
139 |         if num_new_blogs != 0:
140 |             print('NewBlogs:%d\tTotalBlogs:%d' % (num_new_blogs, num_blogs))
141 |             # get imgurls from new_blogs
142 |             imgurls = []
143 |             for blog in new_blogs:
144 |                 imgurls.extend(_get_imgurls(username, blog, headers))
145 |             num_imgs += len(imgurls)
146 | 
147 |             # download imgs
148 |             for imgurl in imgurls:
149 |                 index_img += 1
150 |                 paths = '%s/%d.%s' % (path, index_img,
151 |                                       re.search(r'(jpg|png|gif)', imgurl).group(0))
152 |                 print('{}\t{}'.format(index_img, paths))
153 |                 _capture_images(imgurl, paths)
154 | 
155 |         if num_new_blogs != query_number:
156 |             print(
157 |                 '------------------------------- stop line -------------------------------')
158 |             print('Capture complete!')
159 |             print('Captured blogs:%d images:%d' % (num_blogs, num_imgs))
160 |             print('Download path: ' + path)
161 |             print(
162 |                 '-------------------------------------------------------------------------')
163 |             break
164 | 
165 |         data['c0-param2'] = 'number:' + _get_timestamp(html, time_pattern)
166 |         print('The next timestamp is: %s\n' % data['c0-param2'].split(':')[1])
167 |         # wait a few seconds
168 |         time.sleep(random.randint(5, 10))
169 | 
170 | 
171 | if __name__ == '__main__':
172 |     main(sys.argv)
173 | 
--------------------------------------------------------------------------------
/qqzone/qqzone_spider.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | """A crawler that downloads all photos from a user's Qzone (QQ空间) albums."""
4 | 
5 | from selenium import webdriver
6 | from selenium.webdriver.common.keys import Keys
7 | from selenium.common.exceptions import WebDriverException
8 | 
9 | import os
10 | import re
11 | import sys
12 | import time
13 | import logging
14 | import requests
15 | from json import loads
16 | 
17 | 
18 | class qqzone(object):
19 |     """Qzone album crawler."""
20 | 
21 |     def __init__(self, user):
22 |         self.username = user['username']
23 |         self.password = user['password']
24 | 
25 |     @staticmethod
26 |     def get_path(album_name):
27 |         home_path = os.path.expanduser('~')
28 |         path = os.path.join(home_path, 'Pictures/python/qqzone', album_name)
29 |         if not os.path.isdir(path):
30 |             os.makedirs(path)
31 |         return path
32 | 
33 |     def _login_and_get_args(self):
34 |         """Log in to QQ and get the cookies and the g_tk token."""
35 |         opt = webdriver.ChromeOptions()
36 |         opt.add_argument('--headless')
37 | 
38 |         driver = webdriver.Chrome(options=opt)
39 |         driver.get('https://i.qq.com/')
40 | 
41 |         logging.info('User {} logging in...'.format(self.username))
42 |         driver.switch_to.frame('login_frame')
43 |         driver.find_element_by_id('switcher_plogin').click()
44 |         driver.find_element_by_id('u').clear()
45 |         driver.find_element_by_id('u').send_keys(self.username)
46 |         driver.find_element_by_id('p').clear()
47 |         driver.find_element_by_id('p').send_keys(self.password)
48 |         driver.find_element_by_id('login_button').click()
49 | 
50 |         time.sleep(1)
51 |         driver.get('https://user.qzone.qq.com/{}'.format(self.username))
52 | 
53 |         try:
54 |             logging.info('Getting g_tk...')
55 |             self.g_tk = driver.execute_script(
56 |                 'return QZONE.FP.getACSRFToken()')
57 |             logging.debug('g_tk: {}'.format(self.g_tk))
58 |         except WebDriverException:
59 |             logging.error(
60 |                 'Getting g_tk failed, please check your QQ number and password')
61 |             driver.close()
62 |             driver.quit()
63 |             sys.exit(1)
64 | 
65 |         logging.info('Getting Cookies...')
66 |         self.cookies = driver.get_cookies()
67 | 
68 |         driver.close()
69 |         driver.quit()
70 | 
71 |     def _init_session(self):
72 |         self.session = requests.Session()
73 |         for cookie in self.cookies:
74 |             self.session.cookies.set(cookie['name'], cookie['value'])
75 |         self.session.headers = {
76 |             'Referer': 'https://qzs.qq.com/qzone/photo/v7/page/photo.html?init=photo.v7/module/albumList/index&navBar=1',
77 |             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
78 |         }
79 | 
80 |     def _get_query_for_request(self, topicId=None, pageStart=0, pageNum=100):
81 |         """Build the query parameters for album-info or photo-info requests.
82 | 
83 |         Args:
84 |             topicId: unique identifier of an album
85 |             pageStart: start index used when requesting an album's photo list
86 |             pageNum: number of photos requested per call
87 | 
88 |         Returns:
89 |             A string joining all request parameters.
90 |         """
91 |         query = {
92 |             'g_tk': self.g_tk,
93 |             'hostUin': self.username,
94 |             'uin': self.username,
95 |             'appid': 4,
96 |             'inCharset': 'utf-8',
97 |             'outCharset': 'utf-8',
98 |             'source': 'qzone',
99 |             'plat': 'qzone',
100 |             'format': 'jsonp'
101 |         }
102 |         if topicId:
103 |             query['topicId'] = topicId
104 |             query['pageStart'] = pageStart
105 |             query['pageNum'] = pageNum
106 |         return '&'.join('{}={}'.format(key, val) for key, val in query.items())
107 | 
108 |     def _load_callback_data(self, resp):
109 |         """Parse the returned jsonp data as JSON."""
110 |         try:
111 |             resp.encoding = 'utf-8'
112 |             data = loads(re.search(r'.*?\(({.*}).*?\).*', resp.text, re.S)[1])
113 |             return data
114 |         except ValueError:
115 |             logging.error('Failed to parse the jsonp response')
116 | 
117 |     def _get_ablum_list(self):
118 |         """Get the list of albums."""
119 |         album_url = 'https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/fcg_list_album_v3?' + \
120 |             self._get_query_for_request()
121 | 
122 |         logging.info('Getting album list id...')
123 |         resp = self.session.get(album_url)
124 |         data = self._load_callback_data(resp)
125 | 
126 |         album_list = {}
127 |         for item in data['data']['albumListModeSort']:
128 |             album_list[item['name']] = item['id']
129 | 
130 |         return album_list
131 | 
132 |     def _get_photo(self, album_name, album_id):
133 |         """Get the photo list of one album and download all of its photos."""
134 |         photo_list_url = 'https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/cgi_list_photo?' + \
135 |             self._get_query_for_request(topicId=album_id)
136 | 
137 |         logging.info('Getting photo list for album {}...'.format(album_name))
138 |         resp = self.session.get(photo_list_url)
139 |         data = self._load_callback_data(resp)
140 |         if data['data']['totalInPage'] == 0:
141 |             return None
142 | 
143 |         file_dir = self.get_path(album_name)
144 |         for item in data['data']['photoList']:
145 |             path = '{}/{}.jpg'.format(file_dir, item['name'])
146 |             logging.info('Downloading {}-{}'.format(album_name, item['name']))
147 |             self._download_image(item['url'], path)
148 | 
149 |     def _download_image(self, url, path):
150 |         """Download a single photo."""
151 |         try:
152 |             resp = self.session.get(url, timeout=15)
153 |             if resp.status_code == 200:
154 |                 open(path, 'wb').write(resp.content)
155 |         except requests.exceptions.Timeout:
156 |             logging.warning('get {} timed out'.format(url))
157 |         except requests.exceptions.ConnectionError as e:
158 |             logging.error(str(e))
159 |         finally:
160 |             pass
161 | 
162 |     def start(self):
163 |         """Entry point of the crawler."""
164 |         self._login_and_get_args()
165 |         self._init_session()
166 |         album_list = self._get_ablum_list()
167 |         for name, album_id in album_list.items():
168 |             self._get_photo(name, album_id)
169 | 
170 | 
171 | def get_user():
172 |     """Get the QQ number and password from the terminal."""
173 |     username = input('please input QQ number: ').strip()
174 |     if not re.match(r'^[1-9][0-9]{4,9}$', username):
175 |         logging.error('\033[31mInvalid QQ number!\033[0m')
176 |         sys.exit(1)
177 | 
178 |     import getpass
179 |     password = getpass.getpass('password: ')
180 | 
181 |     return {
182 |         'username': username,
183 |         'password': password
184 |     }
185 | 
186 | 
187 | def main():
188 |     FORMAT = '%(asctime)s [%(levelname)s] %(message)s'
189 |     logging.basicConfig(format=FORMAT, level=logging.INFO)
190 | 
191 |     # default QQ account info
192 |     user = {
193 |         'username': '123456789',
194 |         'password': '*********'
195 |     }
196 | 
197 |     # pass -d to use the default account above; edit it to match your own
198 |     if not (len(sys.argv) > 1 and sys.argv[1] == '-d'):
199 |         user = get_user()
200 | 
201 |     qz = qqzone(user)
202 |     qz.start()
203 | 
204 | 
205 | if __name__ == '__main__':
206 |     main()
207 | 
--------------------------------------------------------------------------------
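
Note: the Qzone endpoints used above return jsonp rather than plain JSON, so qqzone_spider.py's _load_callback_data captures everything between the outermost parentheses with a regex before handing it to json.loads. The lines below are a minimal standalone sketch of that one step; the callback name and sample payload are invented for illustration, and only the regex is taken from qqzone_spider.py.

#!/usr/bin/env python3
# Minimal sketch of the jsonp-unwrapping step in qqzone_spider.py.
# 'demo_Callback' and the payload below are made-up sample data.
import re
from json import loads

sample = 'demo_Callback({"code": 0, "data": {"albumListModeSort": []}});'
match = re.search(r'.*?\(({.*}).*?\).*', sample, re.S)
data = loads(match.group(1))  # capture group 1 holds the JSON object
print(data['data'])           # {'albumListModeSort': []}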