├── .gitignore
├── LICENSE.md
├── README.md
├── pixivhack
    ├── __init__.py
    ├── __main__.py
    ├── cls_crawl.py
    └── pixivhack.py
├── setup.cfg
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | /*.json
3 | pixivimages/
4 | dist/
5 | pixivhack.egg-info/
6 | build/
7 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2011-2015 GitHub Inc.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | #Pixiv Hack
 2 | 
 3 | [![PyPI version](https://badge.fury.io/py/pixivhack.svg)](https://badge.fury.io/py/pixivhack)
 4 | [![Code Health](https://landscape.io/github/Chion82/PixivHack/master/landscape.svg?style=flat)](https://landscape.io/github/Chion82/PixivHack/master)
 5 | ![python](https://img.shields.io/badge/python-2.7-green.svg)
 6 | ![license](https://img.shields.io/badge/license-MIT-brightgreen.svg)
 7 | 
 8 | ##Introduction  
 9 | Pixiv Hack is a tool to automatically crawl illustrations filtered by ratings on www.pixiv.net .
10 | 
11 | ##Installation
12 | 
13 | ```
14 | $ sudo pip install pixivhack
15 | ```
16 | 
17 | ##Usage  
18 | 1. Browse www.pixiv.net and sign in with your account. Copy the value of cookies:PHPSESSID using the browser debugger (F12)  
19 | 2. You can now close the browser and start Pixiv Hack by running:  
20 | 	```
21 | 	$ pixivhack
22 | 	```
23 | 3. Follow the prompt and enter the PHPSESSID you just copied, the keyword to search with, the minimum ratings of illustrations to filter with, the maximum number of illustrations to download and whether to download manga.  
24 | 4. Sit back and relax! The script will do the rest.  
25 | 5. After all work is done, you can check out ```author_info.json``` to view the ratings and the illustration IDs of each Pixiv author that is crawled.  
26 | 6. All downloadable illustrations are saved in the ```pixivimages``` directory.
27 | 
28 | ##Crawl Illustrations by author IDs  
29 | 1. Create a ```.json``` file containing a list of Pixiv member IDs of authors. Sample:  
30 | 	authors.json  
31 | 	```
32 | 	["2463004", "19351", "2157729"]
33 | 	```  
34 | 	You can also use ```author_info.json``` which is automatically generated by this script using keyword-search mode described above.  
35 | 2. Simply run  
36 | 	```  
37 | 	$ pixivhack -a <JSON_file>
38 | 	```
39 | 3. Follow the prompt and enter PHPSESSID and other required parameters.  
40 | 4. Illustraions are saved in the ```image``` directory sorted by author IDs.
41 | 
42 | ##Dependencies  
43 | * requests
44 | 
45 | Install using:  
46 | ```
47 | $ sudo pip install requests
48 | ```
49 | 
50 | ##License
51 | See the ```LICENSE.md``` file for license rights and limitations (MIT).
52 | 


--------------------------------------------------------------------------------
/pixivhack/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chion82/PixivHack/f86c8046bc2bee9d7343fcdb1c340f9f9a9e1f57/pixivhack/__init__.py


--------------------------------------------------------------------------------
/pixivhack/__main__.py:
--------------------------------------------------------------------------------
1 | from .pixivhack import main
2 | 
3 | if __name__=='__main__':
4 | 	main()


--------------------------------------------------------------------------------
/pixivhack/cls_crawl.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | 
  3 | #Author: Chion82<sdspeedonion@gmail.com>
  4 | 
  5 | import requests
  6 | import urllib
  7 | import re
  8 | import sys, os
  9 | import HTMLParser
 10 | import json
 11 | from urlparse import urlparse, parse_qs
 12 | 
 13 | reload(sys)
 14 | sys.setdefaultencoding('utf8')
 15 | 
 16 | class PixivHackLib(object):
 17 | 	
 18 | 	def __init__(self):
 19 | 		self.__session_id = ''
 20 | 		self.__session = requests.Session()
 21 | 		self.__session.headers.update({'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.125 Safari/537.36'})
 22 | 		self.__keyword = 'kancolle'
 23 | 		self.__min_ratings = 0
 24 | 		self.__max_pics = 10
 25 | 		self.__pic_downloaded_count = 0
 26 | 		self.__download_manga = True
 27 | 		self.__download_big_images = True
 28 | 		self.__author_ratings = []
 29 | 		if not os.path.exists('pixivimages'):
 30 | 			os.makedirs('pixivimages')
 31 | 
 32 | 	@property
 33 | 	def session_id(self):
 34 | 		return self.__session_id
 35 | 
 36 | 	@session_id.setter
 37 | 	def session_id(self, id_str):
 38 | 		self.__session_id = id_str
 39 | 
 40 | 	def config(self, keyword, min_ratings, max_pics, download_manga, download_big_images):
 41 | 		self.__keyword = keyword
 42 | 		self.__min_ratings = min_ratings
 43 | 		self.__max_pics = max_pics
 44 | 		self.__download_manga = download_manga
 45 | 		self.__download_big_images = download_big_images
 46 | 
 47 | 	def crawl(self):
 48 | 		self.__pic_downloaded_count = 0
 49 | 		self.__author_ratings = []
 50 | 		page = 1
 51 | 		while self.__pic_downloaded_count < self.__max_pics :
 52 | 			try:
 53 | 				search_result = self.__get_search_result(page, None)
 54 | 				if (len(search_result)==0 or page>1000):
 55 | 					print('No more result found. ')
 56 | 					break
 57 | 				for link in search_result:
 58 | 					if (self.__pic_downloaded_count >= self.__max_pics):
 59 | 						break
 60 | 					self.__enter_illustration_page(link, 'pixivimages')
 61 | 				page = page + 1
 62 | 				print('************************Moving to next page************************')
 63 | 			except Exception:
 64 | 				print('Crawl error. Skipping page...')
 65 | 				page = page + 1
 66 | 				continue
 67 | 		print('All Done! Saving author info...')
 68 | 		self.__save_author_ratings()
 69 | 
 70 | 	def crawl_by_author(self, author_list, max_pics_per_author):
 71 | 		for author_id in author_list:
 72 | 			print('***********************Crawling by author*************************')
 73 | 			print('author Pixiv ID : ' + author_id)
 74 | 			self.__pic_downloaded_count = 0
 75 | 			page = 1
 76 | 			if not os.path.exists('pixivimages/' + author_id):
 77 | 				os.makedirs('pixivimages/' + author_id)
 78 | 			while self.__pic_downloaded_count < max_pics_per_author:
 79 | 				try:
 80 | 					search_result = self.__get_search_result(page, author_id)
 81 | 					if (len(search_result) == 0):
 82 | 						print('No more result found.')
 83 | 						break
 84 | 					for link in search_result:
 85 | 						if (self.__pic_downloaded_count >= max_pics_per_author):
 86 | 							break
 87 | 						self.__enter_illustration_page(link, 'pixivimages/' + author_id)
 88 | 					page = page + 1
 89 | 					print('************************Moving to next page***************************')
 90 | 				except Exception:
 91 | 					print('Crawl error. Skipping page...')
 92 | 					page = page + 1
 93 | 					continue
 94 | 			print('***********************Moving to next author**************************')
 95 | 		print('All Done!')
 96 | 
 97 | 	def __get_search_result(self, page, author_id):
 98 | 		try:
 99 | 			if (author_id == None):
100 | 				search_result = self.__session.get('http://www.pixiv.net/search.php?word=' + urllib.quote(self.__keyword) + '&p=' + str(page), cookies={'PHPSESSID': self.__session_id})
101 | 			else:
102 | 				search_result = self.__session.get('http://www.pixiv.net/member_illust.php?id=' + author_id + '&type=all&p=' + str(page), cookies={'PHPSESSID': self.__session_id})
103 | 		except Exception:
104 | 			print('Connection failure. Retrying...')
105 | 			return self.__get_search_result(page, author_id)
106 | 			
107 | 		result_list = re.findall(r'<a href="(/member_illust\.php\?mode=.*?&amp;illust_id=.*?)">', search_result.text)
108 | 		return ['http://www.pixiv.net'+self.__html_decode(link) for link in result_list if (not '"' in link)]
109 | 
110 | 	def __enter_illustration_page(self, url, directory):
111 | 		print('********************Entering illustration page*********************')
112 | 		print('Entering ' + url)
113 | 
114 | 		try:
115 | 			page_result = self.__session.get(url, cookies={'PHPSESSID': self.__session_id})
116 | 		except Exception:
117 | 			print('Connection failure. Retrying...')
118 | 			self.__enter_illustration_page(url, directory)
119 | 			return
120 | 		
121 | 		re_result_ratings = re.findall(r'<dd class="rated-count">(.*?)</dd>', page_result.text)
122 | 		ratings = re_result_ratings[0]
123 | 		pixiv_id = parse_qs(urlparse(url).query)['illust_id'][0]
124 | 		re_result_author_id = re.findall(r'<a href="/member\.php\?id=(.*?)" class="user-link">', page_result.text)
125 | 		pixiv_author_id = re_result_author_id[0]
126 | 		print('pixiv_id=' + pixiv_id)
127 | 		print('ratings='+ratings)
128 | 		print('author_id='+pixiv_author_id)
129 | 		if (int(ratings) < self.__min_ratings):
130 | 			print('Ratings < ' + str(self.__min_ratings) + ' , Skipping...')
131 | 			return
132 | 		self.__increment_author_ratings(pixiv_author_id, int(ratings), pixiv_id)
133 | 		re_manga_result = re.findall(r'<a href="(member_illust\.php\?mode=manga&amp;illust_id=.*?)"', page_result.text)
134 | 		re_image_result = re.findall(r'data-src="(.*?)" class="original-image"', page_result.text)
135 | 		re_big_image_result = re.findall(r'<a href="(member_illust\.php\?mode=big&amp;illust_id=.*?)"', page_result.text)
136 | 		if (len(re_manga_result) > 0):
137 | 			if (self.__download_manga == False):
138 | 				print('Illustration is manga. Skipping...')
139 | 				return
140 | 			print('Illustration is manga. Entering manga page.')
141 | 			self.__enter_manga_page('http://www.pixiv.net/' + self.__html_decode(re_manga_result[0]), pixiv_id, url, directory)
142 | 			self.__pic_downloaded_count = self.__pic_downloaded_count + 1
143 | 		elif (len(re_image_result) > 0):
144 | 			print('Illustration is image. Downloading image...')
145 | 			self.__pic_downloaded_count = self.__pic_downloaded_count + 1
146 | 			self.__download_image(self.__html_decode(re_image_result[0]), url, directory)
147 | 			print('Download completed.')
148 | 		elif (len(re_big_image_result) > 0):
149 | 			if (self.__download_big_images == False):
150 | 				print('Illustration is big-image. Skipping...')
151 | 				return
152 | 			print('Illustration mode is big-image. Entering big-image page.')
153 | 			self.__enter_big_image_page('http://www.pixiv.net/' + self.__html_decode(re_big_image_result[0]), url, directory)
154 | 			self.__pic_downloaded_count = self.__pic_downloaded_count + 1
155 | 		else:
156 | 			print('Illustration mode not supported. Skipping...')
157 | 
158 | 	def __enter_big_image_page(self, url, referer, directory):
159 | 		print('********************Entering big-image page************************')
160 | 		print('Entering ' + url)
161 | 		try:
162 | 			page_result = self.__session.get(url, cookies={'PHPSESSID': self.__session_id}, headers={'Referer':referer})
163 | 		except Exception:
164 | 			print('Connection failure. Retrying...')
165 | 			self.__enter_big_image_page(url, referer, directory)
166 | 			return
167 | 
168 | 		re_big_image_url = re.findall(r'<img src="(.*?)"', page_result.text)
169 | 		print('Downloading big-image.')
170 | 		self.__download_image(self.__html_decode(re_big_image_url[0]), url, directory)
171 | 		print('Download completed.')
172 | 
173 | 	def __enter_manga_page(self, url, pixiv_id, referer,directory):
174 | 		print('********************Entering manga page**************************')
175 | 		print('Entering ' + url)
176 | 		if not os.path.exists(directory + '/' + pixiv_id):
177 | 			os.makedirs(directory + '/' + pixiv_id)
178 | 
179 | 		try:
180 | 			page_result = self.__session.get(url, cookies={'PHPSESSID': self.__session_id}, headers={'Referer':referer})
181 | 		except Exception:
182 | 			print('Connection failure. Retrying...')
183 | 			self.__enter_manga_page(url, pixiv_id, referer,directory)
184 | 			return
185 | 
186 | 		re_manga_page_result = re.findall(r'<a href="(/member_illust\.php\?mode=manga_big.*?)"', page_result.text)
187 | 		for link in re_manga_page_result:
188 | 			self.__enter_manga_big_page('http://www.pixiv.net' + self.__html_decode(link), url, directory + '/' + pixiv_id)
189 | 
190 | 	def __enter_manga_big_page(self, url, referer, directory):
191 | 		print('********************Entering manga-big page***************************')
192 | 		print('Entering ' + url)
193 | 
194 | 		try:
195 | 			page_result = self.__session.get(url, cookies={'PHPSESSID': self.__session_id}, headers={'Referer':referer})
196 | 		except Exception:
197 | 			print('Connection failure. Retrying...')
198 | 			self.__enter_manga_big_page(url, referer, directory)
199 | 			return
200 | 		
201 | 		re_image_result = re.findall(r'<img src="(.*?)"', page_result.text)
202 | 		print('Downloading manga-big image...')
203 | 		self.__download_image(self.__html_decode(re_image_result[0]), url, directory)
204 | 		print('Download completed.')
205 | 
206 | 	def __increment_author_ratings(self, author_id, increment, pixiv_id):
207 | 		for author in self.__author_ratings:
208 | 			if (author['author_id'] == author_id):
209 | 				if (pixiv_id in author['illust_id']):
210 | 					return
211 | 				author['total_ratings'] = author['total_ratings'] + increment
212 | 				author['illust_id'].append(pixiv_id)
213 | 				return
214 | 		self.__author_ratings.append({'author_id':author_id, 'total_ratings':increment, 'illust_id':[pixiv_id]})
215 | 
216 | 	def __save_author_ratings(self):
217 | 		self.__author_ratings = sorted(self.__author_ratings, key=lambda author:author['total_ratings'], reverse=True)
218 | 		f = open('author_info.json','w+')
219 | 		f.write(json.dumps(self.__author_ratings))
220 | 		f.close()
221 | 
222 | 	def __html_decode(self, string):
223 | 		h = HTMLParser.HTMLParser()
224 | 		return h.unescape(string)
225 | 
226 | 	def __download_image(self, url, referer, directory):
227 | 		try:
228 | 			download_result = self.__session.get(url, cookies={'PHPSESSID': self.__session_id}, headers={'Referer':referer})
229 | 		except Exception:
230 | 			print('Connection failure. Retrying...')
231 | 			self.__download_image(url, referer, directory)
232 | 			return
233 | 
234 | 		if (download_result.status_code != 200):
235 | 			print('Download Error')
236 | 			print(download_result.text)
237 | 			return
238 | 		url_parsed_array = url.split('/')
239 | 		file_name = url_parsed_array[len(url_parsed_array)-1]
240 | 		with open(directory + '/' + file_name, 'wb+') as f:
241 | 			for chunk in download_result.iter_content():
242 | 				f.write(chunk)
243 | 			f.close()
244 | 


--------------------------------------------------------------------------------
/pixivhack/pixivhack.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | #Author: Chion82<sdspeedonion@gmail.com>
 4 | 
 5 | from .cls_crawl import PixivHackLib
 6 | import argparse
 7 | import json
 8 | 
 9 | def main():
10 | 	parser = argparse.ArgumentParser()
11 | 	parser.add_argument('-a', '--authorlist', help='Crawl illustrations by author IDs. A JSON file containg the list of Pixiv member IDs is required.')
12 | 	args = parser.parse_args()
13 | 
14 | 	print('Enter PHPSESSID:')
15 | 	PHPSESSID = raw_input()
16 | 	print('Enter minimum ratings:')
17 | 	min_ratings = raw_input()
18 | 	print('Download manga? (y/N)')
19 | 	download_manga_str = raw_input()
20 | 	if (download_manga_str == 'Y' or download_manga_str == 'y'):
21 | 		print('Will download manga.')
22 | 		download_manga = True
23 | 	else:
24 | 		print('Will not download manga.')
25 | 		download_manga = False
26 | 	print('Download big-images? (y/N)')
27 | 	download_big_images_str = raw_input()
28 | 	if (download_big_images_str == 'Y' or download_big_images_str == 'y'):
29 | 		print('Will download big-images.')
30 | 		download_big_images = True
31 | 	else:
32 | 		print('Will not download big-images.')
33 | 		download_big_images = False
34 | 	lib = PixivHackLib()
35 | 	lib.session_id = PHPSESSID
36 | 	
37 | 	if (args.authorlist):
38 | 		print('Will crawl using author ID list.')
39 | 		print('JSON file : ' + args.authorlist)
40 | 		f = open(args.authorlist, 'r')
41 | 		author_list = json.loads(f.read())
42 | 		f.close()
43 | 		author_list = [str(x['author_id']) if type(x)==dict else str(x) for x in author_list]
44 | 		print('Enter maximum number of illustrations per author:')
45 | 		max_pics_per_author = raw_input()
46 | 		lib.config('', int(min_ratings), 0, download_manga, download_big_images)
47 | 		lib.crawl_by_author(author_list, int(max_pics_per_author))
48 | 	else:
49 | 		print('Will crawl using keyword.')
50 | 		print('Enter keyword:')
51 | 		key_word = raw_input()
52 | 		print('Enter maximum number of illustrations to download:')
53 | 		max_pics = raw_input()
54 | 		lib.config(key_word, int(min_ratings), int(max_pics), download_manga, download_big_images)
55 | 		lib.crawl()
56 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | setup(
 3 |   name = 'pixivhack',
 4 |   packages = ['pixivhack'],
 5 |   entry_points={
 6 |       'console_scripts': [
 7 |           'pixivhack = pixivhack.pixivhack:main',
 8 |       ],
 9 |   },
10 |   install_requires=['requests'],
11 |   version = '0.1.5',
12 |   description = 'Pixiv Hack is a tool to automatically crawl illustrations filtered by ratings on www.pixiv.net',
13 |   author = 'Chion82',
14 |   license='MIT',
15 |   author_email = 'sdspeedonion@gmail.com',
16 |   url = 'https://github.com/Chion82/PixivHack',
17 |   keywords = ['pixiv', 'pixivhack', 'crawler', 'crawl'],
18 |   classifiers = [
19 |     'Development Status :: 3 - Alpha',
20 |     'Programming Language :: Python :: 2',
21 |     'Programming Language :: Python :: 2.6',
22 |     'Programming Language :: Python :: 2.7',
23 |   ]
24 | )
25 | 


--------------------------------------------------------------------------------