├── .gitignore ├── LICENSE.md ├── README.md ├── pixivhack ├── __init__.py ├── __main__.py ├── cls_crawl.py └── pixivhack.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | /*.json 3 | pixivimages/ 4 | dist/ 5 | pixivhack.egg-info/ 6 | build/ 7 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2015 GitHub Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pixiv Hack 2 | 3 | [![PyPI version](https://badge.fury.io/py/pixivhack.svg)](https://badge.fury.io/py/pixivhack) 4 | [![Code Health](https://landscape.io/github/Chion82/PixivHack/master/landscape.svg?style=flat)](https://landscape.io/github/Chion82/PixivHack/master) 5 | ![python](https://img.shields.io/badge/python-2.7-green.svg) 6 | ![license](https://img.shields.io/badge/license-MIT-brightgreen.svg) 7 | 8 | ## Introduction 9 | Pixiv Hack is a tool to automatically crawl illustrations filtered by ratings on www.pixiv.net. 10 | 11 | ## Installation 12 | 13 | ``` 14 | $ sudo pip install pixivhack 15 | ``` 16 | 17 | ## Usage 18 | 1. Browse www.pixiv.net and sign in with your account. Copy the value of cookies:PHPSESSID using the browser debugger (F12). 19 | 2. You can now close the browser and start Pixiv Hack by running: 20 | ``` 21 | $ pixivhack 22 | ``` 23 | 3. Follow the prompt and enter the PHPSESSID you just copied, the keyword to search with, the minimum ratings of illustrations to filter with, the maximum number of illustrations to download and whether to download manga. 24 | 4. Sit back and relax! The script will do the rest. 25 | 5. After all work is done, you can check out ```author_info.json``` to view the ratings and the illustration IDs of each Pixiv author that is crawled. 26 | 6. All downloadable illustrations are saved in the ```pixivimages``` directory. 27 | 28 | ## Crawl Illustrations by author IDs 29 | 1. Create a ```.json``` file containing a list of Pixiv member IDs of authors. Sample: 30 | authors.json 31 | ``` 32 | ["2463004", "19351", "2157729"] 33 | ``` 34 | You can also use ```author_info.json``` which is automatically generated by this script using keyword-search mode described above. 35 | 2. Simply run 36 | ``` 37 | $ pixivhack -a authors.json 38 | ``` 39 | 3. 
Follow the prompt and enter PHPSESSID and other required parameters. 40 | 4. Illustraions are saved in the ```image``` directory sorted by author IDs. 41 | 42 | ##Dependencies 43 | * requests 44 | 45 | Install using: 46 | ``` 47 | $ sudo pip install requests 48 | ``` 49 | 50 | ##License 51 | See the ```LICENSE.md``` file for license rights and limitations (MIT). 52 | -------------------------------------------------------------------------------- /pixivhack/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chion82/PixivHack/f86c8046bc2bee9d7343fcdb1c340f9f9a9e1f57/pixivhack/__init__.py -------------------------------------------------------------------------------- /pixivhack/__main__.py: -------------------------------------------------------------------------------- 1 | from .pixivhack import main 2 | 3 | if __name__=='__main__': 4 | main() -------------------------------------------------------------------------------- /pixivhack/cls_crawl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | #Author: Chion82 4 | 5 | import requests 6 | import urllib 7 | import re 8 | import sys, os 9 | import HTMLParser 10 | import json 11 | from urlparse import urlparse, parse_qs 12 | 13 | reload(sys) 14 | sys.setdefaultencoding('utf8') 15 | 16 | class PixivHackLib(object): 17 | 18 | def __init__(self): 19 | self.__session_id = '' 20 | self.__session = requests.Session() 21 | self.__session.headers.update({'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.125 Safari/537.36'}) 22 | self.__keyword = 'kancolle' 23 | self.__min_ratings = 0 24 | self.__max_pics = 10 25 | self.__pic_downloaded_count = 0 26 | self.__download_manga = True 27 | self.__download_big_images = True 28 | self.__author_ratings = [] 29 | if not os.path.exists('pixivimages'): 30 | os.makedirs('pixivimages') 31 | 32 | 
@property 33 | def session_id(self): 34 | return self.__session_id 35 | 36 | @session_id.setter 37 | def session_id(self, id_str): 38 | self.__session_id = id_str 39 | 40 | def config(self, keyword, min_ratings, max_pics, download_manga, download_big_images): 41 | self.__keyword = keyword 42 | self.__min_ratings = min_ratings 43 | self.__max_pics = max_pics 44 | self.__download_manga = download_manga 45 | self.__download_big_images = download_big_images 46 | 47 | def crawl(self): 48 | self.__pic_downloaded_count = 0 49 | self.__author_ratings = [] 50 | page = 1 51 | while self.__pic_downloaded_count < self.__max_pics : 52 | try: 53 | search_result = self.__get_search_result(page, None) 54 | if (len(search_result)==0 or page>1000): 55 | print('No more result found. ') 56 | break 57 | for link in search_result: 58 | if (self.__pic_downloaded_count >= self.__max_pics): 59 | break 60 | self.__enter_illustration_page(link, 'pixivimages') 61 | page = page + 1 62 | print('************************Moving to next page************************') 63 | except Exception: 64 | print('Crawl error. Skipping page...') 65 | page = page + 1 66 | continue 67 | print('All Done! 
Saving author info...') 68 | self.__save_author_ratings() 69 | 70 | def crawl_by_author(self, author_list, max_pics_per_author): 71 | for author_id in author_list: 72 | print('***********************Crawling by author*************************') 73 | print('author Pixiv ID : ' + author_id) 74 | self.__pic_downloaded_count = 0 75 | page = 1 76 | if not os.path.exists('pixivimages/' + author_id): 77 | os.makedirs('pixivimages/' + author_id) 78 | while self.__pic_downloaded_count < max_pics_per_author: 79 | try: 80 | search_result = self.__get_search_result(page, author_id) 81 | if (len(search_result) == 0): 82 | print('No more result found.') 83 | break 84 | for link in search_result: 85 | if (self.__pic_downloaded_count >= max_pics_per_author): 86 | break 87 | self.__enter_illustration_page(link, 'pixivimages/' + author_id) 88 | page = page + 1 89 | print('************************Moving to next page***************************') 90 | except Exception: 91 | print('Crawl error. Skipping page...') 92 | page = page + 1 93 | continue 94 | print('***********************Moving to next author**************************') 95 | print('All Done!') 96 | 97 | def __get_search_result(self, page, author_id): 98 | try: 99 | if (author_id == None): 100 | search_result = self.__session.get('http://www.pixiv.net/search.php?word=' + urllib.quote(self.__keyword) + '&p=' + str(page), cookies={'PHPSESSID': self.__session_id}) 101 | else: 102 | search_result = self.__session.get('http://www.pixiv.net/member_illust.php?id=' + author_id + '&type=all&p=' + str(page), cookies={'PHPSESSID': self.__session_id}) 103 | except Exception: 104 | print('Connection failure. 
Retrying...') 105 | return self.__get_search_result(page, author_id) 106 | 107 | result_list = re.findall(r'', search_result.text) 108 | return ['http://www.pixiv.net'+self.__html_decode(link) for link in result_list if (not '"' in link)] 109 | 110 | def __enter_illustration_page(self, url, directory): 111 | print('********************Entering illustration page*********************') 112 | print('Entering ' + url) 113 | 114 | try: 115 | page_result = self.__session.get(url, cookies={'PHPSESSID': self.__session_id}) 116 | except Exception: 117 | print('Connection failure. Retrying...') 118 | self.__enter_illustration_page(url, directory) 119 | return 120 | 121 | re_result_ratings = re.findall(r'
(.*?)
', page_result.text) 122 | ratings = re_result_ratings[0] 123 | pixiv_id = parse_qs(urlparse(url).query)['illust_id'][0] 124 | re_result_author_id = re.findall(r'
', page_result.text) 125 | pixiv_author_id = re_result_author_id[0] 126 | print('pixiv_id=' + pixiv_id) 127 | print('ratings='+ratings) 128 | print('author_id='+pixiv_author_id) 129 | if (int(ratings) < self.__min_ratings): 130 | print('Ratings < ' + str(self.__min_ratings) + ' , Skipping...') 131 | return 132 | self.__increment_author_ratings(pixiv_author_id, int(ratings), pixiv_id) 133 | re_manga_result = re.findall(r' 0): 137 | if (self.__download_manga == False): 138 | print('Illustration is manga. Skipping...') 139 | return 140 | print('Illustration is manga. Entering manga page.') 141 | self.__enter_manga_page('http://www.pixiv.net/' + self.__html_decode(re_manga_result[0]), pixiv_id, url, directory) 142 | self.__pic_downloaded_count = self.__pic_downloaded_count + 1 143 | elif (len(re_image_result) > 0): 144 | print('Illustration is image. Downloading image...') 145 | self.__pic_downloaded_count = self.__pic_downloaded_count + 1 146 | self.__download_image(self.__html_decode(re_image_result[0]), url, directory) 147 | print('Download completed.') 148 | elif (len(re_big_image_result) > 0): 149 | if (self.__download_big_images == False): 150 | print('Illustration is big-image. Skipping...') 151 | return 152 | print('Illustration mode is big-image. Entering big-image page.') 153 | self.__enter_big_image_page('http://www.pixiv.net/' + self.__html_decode(re_big_image_result[0]), url, directory) 154 | self.__pic_downloaded_count = self.__pic_downloaded_count + 1 155 | else: 156 | print('Illustration mode not supported. Skipping...') 157 | 158 | def __enter_big_image_page(self, url, referer, directory): 159 | print('********************Entering big-image page************************') 160 | print('Entering ' + url) 161 | try: 162 | page_result = self.__session.get(url, cookies={'PHPSESSID': self.__session_id}, headers={'Referer':referer}) 163 | except Exception: 164 | print('Connection failure. 
# coding=utf-8

# Author: Chion82
# Interactive command-line front end for PixivHackLib (Python 2:
# uses raw_input, matching the rest of the package).

from .cls_crawl import PixivHackLib
import argparse
import json


def _prompt_yes_no(question):
    # Ask a y/N question on stdin. Only an answer of 'y' or 'Y'
    # (surrounding whitespace ignored) counts as yes, matching the
    # original "(y/N)" prompts where anything else means no.
    print(question + ' (y/N)')
    return raw_input().strip().lower() == 'y'


def main():
    """Prompt for crawl parameters on stdin and start the crawler.

    With ``-a/--authorlist FILE`` the crawler walks a JSON list of Pixiv
    member IDs; otherwise it searches by keyword. Returns nothing -- all
    results are written to disk by PixivHackLib.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-a', '--authorlist',
        help='Crawl illustrations by author IDs. A JSON file containing '
             'the list of Pixiv member IDs is required.')
    args = parser.parse_args()

    print('Enter PHPSESSID:')
    session_id = raw_input()
    print('Enter minimum ratings:')
    min_ratings = int(raw_input())  # fail fast on non-numeric input

    download_manga = _prompt_yes_no('Download manga?')
    print('Will download manga.' if download_manga
          else 'Will not download manga.')
    download_big_images = _prompt_yes_no('Download big-images?')
    print('Will download big-images.' if download_big_images
          else 'Will not download big-images.')

    lib = PixivHackLib()
    lib.session_id = session_id

    if args.authorlist:
        print('Will crawl using author ID list.')
        print('JSON file : ' + args.authorlist)
        # 'with' guarantees the handle is closed even if the JSON is invalid
        # (the original leaked the file object on a parse error).
        with open(args.authorlist, 'r') as author_file:
            author_list = json.load(author_file)
        # Accept either plain member IDs or the dict entries written out
        # by this tool's own author_info.json.
        author_list = [str(entry['author_id']) if isinstance(entry, dict)
                       else str(entry)
                       for entry in author_list]
        print('Enter maximum number of illustrations per author:')
        max_pics_per_author = int(raw_input())
        # max_pics=0 here: the per-author cap below is what limits downloads.
        lib.config('', min_ratings, 0, download_manga, download_big_images)
        lib.crawl_by_author(author_list, max_pics_per_author)
    else:
        print('Will crawl using keyword.')
        print('Enter keyword:')
        keyword = raw_input()
        print('Enter maximum number of illustrations to download:')
        max_pics = int(raw_input())
        lib.config(keyword, min_ratings, max_pics,
                   download_manga, download_big_images)
        lib.crawl()