├── .gitignore
├── LICENSE
├── README.md
└── catch.py


/.gitignore:
--------------------------------------------------------------------------------
1 | Images
2 | log.txt
3 | tags
4 | config
5 | \#config\#
6 | all_tags.txt


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Kanagi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | MoeDownloader
 2 | ======
 3 | 基于python的福利图嗅探器，目前可以嗅探草榴、煎蛋和二次萌エロ画像ブログ这三个网站的图片，如果需要加入其他网站也比较容易。
 4 | 
 5 | 基本用法:
 6 | ======
```
python catch.py [topic]
```
10 | 
11 | 其中，［topic］可以是caoliu、moeimg、jandan三个选项之一
12 | 
13 | 更多的用法请输入
```
python catch.py -h
```
17 | 来查看
18 | 


--------------------------------------------------------------------------------
/catch.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # LICENSE:  see LICENSE file
  5 | #
# bbs mode:
# A subclass must override the Download, GetCurrentDir, CheckThreadsValid,
# GetThreadUrl and GetTitle methods.
# single-page mode:
# A subclass must override the Download method.
 11 | 
 12 | import sys
 13 | import logging
 14 | import os
 15 | import os.path
 16 | import requests
 17 | import requesocks
 18 | import re
 19 | import ConfigParser
 20 | import argparse
 21 | import imghdr
 22 | from HTMLParser import HTMLParser
 23 | 
 24 | def success(val): return val,None
 25 | def error(why): return None,why
 26 | def get_val(m_val): return m_val[0]
 27 | def get_error(m_val): return m_val[1]
 28 | 
#global variables
# When True, Downloader.__init__ creates the 'config' file if it is missing
# and then loads its settings from it.
init_with_config_file = True
# When True, Downloader.__init__ configures file logging and the downloader
# methods emit log records on errors.
has_log_file = True

# On non-Windows platforms the name WindowsError is bound to OSError here so
# that the `except WindowsError` clause used in this file resolves everywhere.
if os.name != 'nt':
    WindowsError = OSError
 35 | 
 36 | class Downloader(object):
 37 |     """docstring for ClassName"""
 38 |     def __init__(self):
 39 |         super(Downloader, self).__init__()
 40 | 
 41 |         self.type = 'none'
 42 |         self._isUrlFormat = re.compile(r'https?://([\w-]+\.)+[\w-]+(/[\w\- ./?%&=]*)?');
 43 |         self._path = get_val(self.DealDir("Images"))
 44 |         self.currentDir = ""
 45 |         self.cf = ConfigParser.ConfigParser()
 46 |         self.pageNum = 1
 47 |         self.pageTo = 1
 48 |         self.isMono = False
 49 |         self.keepOriginTitle = True
 50 |         self.numToDownload = -1
 51 |         self.loggingFile = 'log.txt'
 52 |         self.retryTimes = 5
 53 |         self.encode = None
 54 |         self.useProxy = False
 55 |         self.httpProxy = '127.0.0.1:1080'
 56 |         self.httpsProxy = '127.0.0.1:1080'
 57 |         self.imageCount = 0
 58 |         self.verbose = False
 59 |         self.silent = False
 60 |         self.targetThread = "" # single thread
 61 |         self.targetThreadRegex = ""
 62 | 
 63 |         #moeimg specific
 64 |         self.moeimgdomain = 'moeimg.net'
 65 |         self.moeimgTags = False
 66 |         self.moeimgSortWithTags = False
 67 |         self.currentTag = 'default'
 68 | 
 69 |         #caoliu specific
 70 |         self.caoliudomain = 't66y.com'
 71 | 
 72 |         #jandan specific
 73 |         self.jandandomain = 'jandan.net'
 74 |         self.jandanPageToDownload = 1
 75 | 
 76 |         global init_with_config_file
 77 |         global has_log_file
 78 |         if init_with_config_file:
 79 |             if not os.path.exists('config'):
 80 |                 self.InternalPrint('No config file. Creating a default one.', False)
 81 |                 self.SetDefaultConfig();
 82 |             self.LoadConfig()
 83 |         #init logging file
 84 |         if has_log_file:
 85 |             logging.basicConfig(filename = os.path.join(os.getcwd(), self.loggingFile), level = logging.WARN, filemode = 'a+', format = '%(asctime)s - %(levelname)s: %(message)s')
 86 | 
 87 |     def InternalPrint(self, msg, is_verbose):
 88 |         if not self.silent:
 89 |             if is_verbose:
 90 |                 if(self.verbose):
 91 |                     print(msg)
 92 |             else:
 93 |                 print(msg)
 94 | 
 95 |     def LoadConfig(self):
 96 |         self.cf.read("config")
 97 |         self.pageNum = self.cf.getint('web','page_from')
 98 |         self.pageTo = self.cf.getint('web','page_to')
 99 |         self.isMono = self.cf.getboolean('file','mono')
100 |         self.numToDownload = self.cf.getint('web','num_to_download')
101 |         self.loggingFile = self.cf.get('basic','log_file')
102 |         self.retryTimes = self.cf.getint('web','retry_times')
103 |         self.caoliudomain = self.cf.get('caoliu','domain')
104 |         self.moeimgdomain = self.cf.get('moeimg','domain')
105 |         self.keepOriginTitle = self.cf.getboolean('file','keep_origin_title')
106 |         self.jandandomain = self.cf.get('jandan','domain')
107 |         self.jandanPageToDownload = self.cf.getint('jandan','pages_to_download')
108 |         self.moeimgTags = self.cf.getboolean('moeimg','tags')
109 |         self.moeimgSortWithTags = self.cf.getboolean('moeimg','sort_with_tags')
110 |         self.useProxy = self.cf.getboolean('basic','use_proxy')
111 |         self.httpProxy = self.cf.get('basic','http_proxy')
112 |         self.httpsProxy = self.cf.get('basic','https_proxy')
113 | 
114 | 
115 |     def SetDefaultConfig(self):
116 |         self.cf.add_section('basic')
117 |         self.cf.set('basic','log_file','log.txt')
118 |         self.cf.set('basic','use_proxy','false')
119 |         self.cf.set('basic','http_proxy','127.0.0.1:1080')
120 |         self.cf.set('basic','https_proxy','127.0.0.1:1080')
121 |         self.cf.add_section('web')
122 |         self.cf.set('web','page_from','1')
123 |         self.cf.set('web','page_to','1')
124 |         self.cf.set('web','num_to_download','-1')
125 |         self.cf.set('web','retry_times','5')
126 |         self.cf.add_section('caoliu')
127 |         self.cf.set('caoliu','domain','t66y.com')
128 |         self.cf.add_section('moeimg')
129 |         self.cf.set('moeimg','domain','moeimg.net')
130 |         self.cf.set('moeimg','tags','false')
131 |         self.cf.set('moeimg','sort_with_tags','false')
132 |         self.cf.add_section('jandan')
133 |         self.cf.set('jandan','domain','jandan.net')
134 |         self.cf.set('jandan','pages_to_download','1')
135 |         self.cf.add_section('file')
136 |         self.cf.set('file','mono','false')
137 |         self.cf.set('file','keep_origin_title','true')
138 |         with open('config', 'wb') as configfile:
139 |             self.cf.write(configfile)
140 | 
141 |     def StripIllegalChar(self, path):
142 |         return path.strip('>').strip('<').strip('*').strip('|').strip('?').strip(':').strip('"').strip('/')
143 | 
144 |     def DealDir(self, path):
145 |         solved = False
146 |         while True:
147 |             try:
148 |                 if not os.path.exists(path):
149 |                     os.mkdir(path)
150 |                 return success(path)
151 |             except WindowsError:
152 |                 #windows specific
153 |                 global has_log_file
154 |                 if has_log_file:
155 |                     logging.error('Windows error with path %s' % path)
156 |                 if not solved:
157 |                     path = self.StripIllegalChar(path)
158 |                     solved = True
159 |                 else:
160 |                     return error('Invalid path name %s' % path)
161 | 
162 |     def FetchHtml(self, url):
163 |         retry = 0
164 |         proxies = {
165 |             'http':self.httpProxy,
166 |             'https':self.httpsProxy,
167 |         }
168 |         headers = {
169 |             'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9',
170 |         }
171 |         while True:
172 |             try:
173 |                 self.InternalPrint("Fetching HTML: %s" % url, True)
174 |                 session = requesocks.session()
175 |                 session.headers = headers;
176 |                 if self.useProxy:
177 |                     self.InternalPrint("Using proxy: http %s, https %s" % (self.httpProxy, self.httpsProxy), True)
178 |                     session.proxies = proxies
179 |                 else:
180 |                     self.InternalPrint("No proxy.", True)
181 |                 response = session.get(url)
182 |                 if response.status_code != 200:
183 |                     self.InternalPrint(response.text, True)
184 |                     return error("Failed to fetch html. CODE:%i" % response.status_code)
185 |                 elif (response.text) == 0:
186 |                     return error("Empty html.")
187 |                 else:
188 |                     if self.encode != None:
189 |                         response.encoding = self.encode
190 |                     return success(response.text)
191 |             #except requests.ConnectionError:
192 |             except requesocks.exceptions.ConnectionError:
193 |                 if retry<self.retryTimes:
194 |                     retry+=1
195 |                     self.InternalPrint('Can\'t retrive html. retry %i' % retry, False)
196 |                     continue
197 |                 global has_log_file
198 |                 if has_log_file:
199 |                     logging.error('Can not connect to %s' % url)
200 |                 return error("The server is not responding.")
201 | 
202 |     def DoFetch(self, domain):
203 |         self.InternalPrint("Fetching main html...", True)
204 |         res = self.FetchHtml(domain)
205 |         self.InternalPrint("Main html fetched.", True)
206 |         if get_error(res):
207 |             return res
208 |         html = get_val(res)
209 |         self.FetchPageHtml(html);
210 |         return success(0)
211 | 
212 |     def FetchPageHtml(self, htmlSource):
213 |         prog = re.compile(self.ThreadsRegex, re.IGNORECASE)
214 |         matchesThreads = prog.findall(htmlSource)
215 |         num = 0
216 |         for href in matchesThreads:
217 |             if self.CheckThreadsValid(href) is True:
218 |                 #print href
219 |                 threadurl = self.GetThreadUrl(href)
220 |                 self.InternalPrint('Thread '+str(num + 1)+':'+threadurl, False)
221 |                 if self.keepOriginTitle:
222 |                     self.currentDir = self.GetTitle(href)
223 |                 else:
224 |                     self.currentDir = self.GetCurrentDir(href)
225 | 
226 |                 #TODO: gb2312 bug
227 |                 try:
228 |                     self.InternalPrint(self.currentDir.encode(sys.getfilesystemencoding())+'/', False)
229 |                 except UnicodeEncodeError:
230 |                     global has_log_file
231 |                     if has_log_file:
232 |                         logging.warning('Unicode encode error at %s' % threadurl)
233 |                     self.currentDir = self.GetCurrentDir(href)
234 |                     self.InternalPrint(self.currentDir+'/', False)
235 | 
236 |                 res = self.FetchThreadHtml(threadurl)
237 |                 if(get_error(res)):
238 |                     self.InternalPrint(get_error(res), False)
239 |                 num+=1
240 |                 if self.numToDownload>0 and num>=self.numToDownload:
241 |                     break
242 | 
243 |     def DoFetchSingleThread(self, url):
244 |         self.InternalPrint('Thread:'+url, False)
245 | 
246 |         self.InternalPrint("Fetching thread html...", True)
247 |         res = self.FetchHtml(url)
248 |         if get_error(res):
249 |             return res
250 |         self.InternalPrint("Thread html fetched.", True)
251 | 
252 |         html = get_val(res)
253 |         #get current directory
254 |         if self.keepOriginTitle:
255 |             # get thread title
256 |             #self.currentDir = self.GetTitle(href)
257 |             prog = re.compile(self.targetThreadRegex, re.IGNORECASE)
258 |             matches = prog.findall(html)
259 |             self.currentDir = matches[0]
260 |         else:
261 |             self.currentDir = url.split('/')[-1].split('.')[-2]
262 |         #TODO: gb2312 bug
263 |         try:
264 |             self.InternalPrint(self.currentDir.encode(sys.getfilesystemencoding())+'/', False)
265 |         except UnicodeEncodeError:
266 |             global has_log_file
267 |             if has_log_file:
268 |                 logging.warning('Unicode encode error at %s' % url)
269 |             self.currentDir = 'tmp'
270 |             self.InternalPrint(self.currentDir+'/', False)
271 | 
272 |         html = get_val(res)
273 |         self.currentTag = self.GetThreadTagName(html)
274 |         self.FetchImgLinksFromThread(html);
275 |         return success(0)
276 | 
277 |     # need to rewrite
278 |     def GetThreadUrl(self, href):pass
279 |     def GetTitle(self, href):pass
280 |     def CheckThreadsValid(self, href):pass
281 |     def GetCurrentDir(self, href):pass
282 |     def GetThreadTagName(self, html):return 'default'
283 |     def Download(self):
284 |         self.init()
285 | 
286 |     def PreHandleImgLink(self, href):
287 |         return href
288 | 
289 |     def PreHandleTagName(self, local_file):
290 |         return local_file
291 | 
292 |     def FetchThreadHtml(self, threadurl):
293 |         self.InternalPrint("Fetching thread html...", True)
294 |         res = self.FetchHtml(threadurl)
295 |         self.InternalPrint("Thread html fetched.", True)
296 |         if get_error(res):
297 |             return res
298 |         html = get_val(res)
299 |         self.currentTag = self.GetThreadTagName(html)
300 |         self.FetchImgLinksFromThread(html);
301 |         return success(html)
302 | 
303 |     def FetchImgLinksFromThread(self, htmlSource):
304 |         prog = re.compile(self.ImgRegex, re.IGNORECASE)
305 |         matchesImgSrc = prog.findall(htmlSource)
306 |         global has_log_file
307 |         if not self.isMono:
308 |             self.imageCount = 0
309 |         for href in matchesImgSrc:
310 |             self.InternalPrint(href, True)
311 |             href = self.PreHandleImgLink(href)
312 |             if not self.CheckIsUrlFormat(href):
313 |             #warning: requests library does not support non-http(s) url
314 |                 self.InternalPrint('Invalid url format %s' % href, False)
315 |                 if has_log_file:
316 |                     logging.error('Invalid url format %s' % href)
317 |                 continue;
318 |             res = self.download_file(href)
319 |             if get_error(res):
320 |                 self.InternalPrint(get_error(res).encode(sys.getfilesystemencoding()), False)
321 |             self.imageCount += 1
322 | 
323 |     def CheckIsUrlFormat(self, value):
324 |         return self._isUrlFormat.match(value) is not None
325 | 
326 |     def GetImageType(self, img_path):
327 |         type = imghdr.what(img_path)
328 |         if type != None:
329 |             return type
330 |         else:
331 |             return "jpg"
332 | 
333 |     def ImageExists(self, path, img_name):
334 |         files = os.listdir(path)
335 |         for f in files:
336 |             if img_name == os.path.splitext(f)[0]:
337 |                 return True
338 |         return False
339 | 
340 |     def download_file(self, url):
341 |         dir = self.type
342 |         local_directory = ""
343 |         if self.isMono:
344 |             local_directory = "Images/"+ dir + '/'
345 |             self.DealDir(local_directory)
346 |             local_directory = self.PreHandleTagName(local_directory)
347 |         else:
348 |             local_directory = "Images/" + dir + '/'
349 |             self.DealDir(local_directory)
350 |             local_directory = self.PreHandleTagName(local_directory)
351 |             # deal windows directory error
352 |             res = self.DealDir(local_directory + self.currentDir + '/')
353 |             if get_error(res):
354 |                 #self.InternalPrint(get_error(res), False)
355 |                 self.DealDir(local_directory + 'tmp/')
356 |                 local_directory += 'tmp/'
357 |             else:
358 |                 local_directory += self.currentDir + '/'
359 | 
360 |         #local_filename = local_filename + self.StripIllegalChar(url.split('/')[-1])#has bug in windows
361 |         image_path = local_directory + str(self.imageCount)# so use image count instead
362 |         if self.ImageExists(local_directory, str(self.imageCount)):
363 |             if not self.isMono:
364 |                 return error('\t skip '+image_path)
365 |             else:
366 |                 while(self.ImageExists(local_directory, str(self.imageCount))):
367 |                     self.imageCount+=1
368 |                 image_path = local_directory + str(self.imageCount)
369 | 
370 |         self.InternalPrint('\t=>'+image_path.encode(sys.getfilesystemencoding()), False)
371 |         # NOTE the stream=True parameter
372 |         retry = 0
373 |         proxies = {
374 |             'http':self.httpProxy,
375 |             'https':self.httpsProxy,
376 |         }
377 |         global has_log_file
378 |         while True:
379 |             try:
380 |                 session = requesocks.session()
381 |                 if self.useProxy:
382 |                     self.InternalPrint("Using proxy: http %s, https %s" % (self.httpProxy, self.httpsProxy), True)
383 |                     session.proxies = proxies
384 |                     #r = requests.get(url, stream=True, proxies=proxies)
385 |                 #else:
386 |                     #r = requests.get(url, stream=True)
387 |                 r = session.get(url)
388 |                 break
389 |             #except requests.ConnectionError:
390 |             except requesocks.exceptions.ConnectionError:
391 |                 if retry<self.retryTimes:
392 |                     retry+=1
393 |                     self.InternalPrint('\tCan\'t retrive image. retry %i' % retry, False)
394 |                     continue
395 |                 if has_log_file:
396 |                     logging.error('Can not connect to %s' % url)
397 |                 return error('The server is not responding.')
398 |         try:
399 |             with open(image_path, 'wb') as f:
400 |                 for chunk in r.iter_content(chunk_size=1024):
401 |                     if chunk: # filter out keep-alive new chunks
402 |                         f.write(chunk)
403 |                         f.flush()
404 |             #rename image file by its type
405 |             os.rename(image_path, local_directory+str(self.imageCount)+"."+self.GetImageType(image_path))
406 |         except IOError:
407 |             if has_log_file:
408 |                 logging.error('Can not save file %s' % url)
409 |             self.InternalPrint('Can\'t save image %s' % url, False)
410 | 
411 |         return success(image_path)
412 | 
413 | class MoeimgDownloader(Downloader):
414 |     def __init__(self):
415 |         super(MoeimgDownloader, self).__init__()
416 | 
417 |         self.type = 'moeimg'
418 |         self.encode = 'utf-8'
419 |         self.tag_file = 'tags'
420 |         self.ImgRegex = r'<img\s*src=["\']?([^\'" >]+?)[ \'"]\s*(?:alt="\d*")?\s*class="thumbnail_image"'
421 |         #self.ThreadsRegex = r'<h[23]\s*class="entry-header"\s*>\s*<a\s*href=["\']?([^\'">]+?)[\'"]\s*title=["\']?([^\'"]+?)[\'"]'
422 |         self.ThreadsRegex = r'<h2 class="title">\s*<a href="(http://moeimg.net/\d*.html)"\s*title="[^"]+?">\s*([^<]+?)\s*</a>\s*</h2>'
423 |         self.targetThreadRegex = r'<div\s*class="post">\s*<h1\s*class="title">\s*([^<]+?)\s*</h1>'
424 | 
425 |     def Download(self):
426 |         if self.moeimgTags:
427 |             res = self.LoadTags()
428 |             if get_error(res):
429 |                 self.InternalPrint(get_error(res), False)
430 |                 return
431 |             tags = get_val(res)
432 |         else:
433 |             tags = ['default']
434 |         self.InternalPrint("===============   start   ===============", False)
435 |         i = self.pageNum
436 |         domain = ''
437 |         for tag in tags:
438 |             self.currentTag = tag
439 |             if self.targetThread == "":
440 |                 for i in range(self.pageNum, self.pageTo+1):
441 |                     if not self.moeimgTags:
442 |                         self.InternalPrint("===============   loading page {0}   ===============".format(i), False)
443 |                         if i == 1:
444 |                             domain = "http://"+self.moeimgdomain
445 |                         else:
446 |                             domain = "http://"+self.moeimgdomain+"/page/{0}".format(i)
447 |                     else:
448 |                         self.InternalPrint("===============   loading tag: %s page %i  ===============" % (tag.decode('utf-8').encode(sys.getfilesystemencoding()),i), False)
449 |                         if i == 1:
450 |                             domain = "http://"+self.moeimgdomain+"/tag/%s" % (tag)
451 |                         else:
452 |                             domain = "http://"+self.moeimgdomain+"/tag/%s/page/%i" % (tag,i)
453 |                     res = self.DoFetch(domain)
454 |                     if get_error(res):
455 |                         self.InternalPrint(get_error(res), False)
456 |             else:
457 |                 self.InternalPrint("===============   loading target thread {0}   ===============".format(self.targetThread), False)
458 |                 res = self.DoFetchSingleThread(self.targetThread)
459 |                 if get_error(res):
460 |                     self.InternalPrint(get_error(res), False)
461 |         self.InternalPrint("===============   end   ===============", False)
462 | 
463 |     def FetchAllTags(self):
464 |         res = self.FetchHtml('http://'+self.moeimgdomain+'/taglist')
465 |         if get_error(res):
466 |             return res
467 |         html = get_val(res)
468 |         tagRegex = r'<a\s*href=[\'"]([^\'"]+?)[\'"]\s*class=[\'"][^\'"]*[\'"]\s*title=[\'"][^\'"]*[\'"]\s*style=[\'"][^\'"]*[\'"]>([^<]+?)</a>'
469 |         prog = re.compile(tagRegex, re.IGNORECASE)
470 |         matches = prog.findall(html)
471 |         tags = []
472 |         for m in matches:
473 |             if re.search('tag', m[0]):
474 |                 if not m[1] in tags:
475 |                     tags.append(m[1])
476 |         self.InternalPrint('Fetched %s tags.' % len(tags), True)
477 |         return success(tags)
478 | 
479 |     def LoadTags(self):
480 |         if os.path.exists(self.tag_file):
481 |             tagsfile = open(self.tag_file, 'r')
482 |         else:
483 |             return error('No tags file.')
484 | 
485 |         tags = []
486 |         for tag in tagsfile:
487 |             tags.append(tag.strip('\n').strip(';').decode('utf-8').replace(' ', '-').lower())
488 |         self.InternalPrint('Loaded %s tags.' % len(tags), True)
489 |         return success(tags)
490 | 
491 |     def GetCurrentDir(self, href):
492 |         dir = href[0].split('/')[-1]
493 |         dir = dir.split('.')[-2]
494 |         return dir
495 | 
496 |     def GetThreadTagName(self, html):
497 |         #tagRegex = r'<li\s*class="path">\s*<a\s*href=["\']?([^\'" >]+?)[ \'"]\s*>([^<]*)</a></li>'
498 |         tagRegex = r'<li\s*class="tag"><i\s*class="fa fa-tags"></i><a\s*href=["\']?([^\'" >]+?)[ \'"]\s*rel="tag">([^<]*)</a>'
499 |         prog = re.compile(tagRegex, re.IGNORECASE)
500 |         matches = prog.findall(html)
501 |         for m in matches:
502 |             if re.search('http://moeimg.net/tag/',m[0]):
503 |                 return m[1]
504 |         return 'default'
505 | 
506 |     def PreHandleTagName(self, local_file):
507 |         if self.moeimgSortWithTags:
508 |             if self.moeimgTags:
509 |                 local_file += self.currentTag.encode(sys.getfilesystemencoding()) + '/'
510 |             else:
511 |                 local_file += self.currentTag + '/'
512 |             self.DealDir(local_file)
513 |         return local_file
514 | 
515 |     def CheckThreadsValid(self, href):
516 |         return True
517 | 
518 |     def GetThreadUrl(self, href):
519 |         return href[0]
520 | 
521 |     def GetTitle(self, href):
522 |         return href[1]
523 | 
524 | class CaoliuDownloader(Downloader):
525 |     def __init__(self):
526 |         super(CaoliuDownloader, self).__init__()
527 | 
528 |         self.type = 'caoliu'
529 |         self.encode = 'gbk'
530 |         self.ImgRegex = r'<input\s*src\s*=\s*["\']?([^\'" >]+?)[ \'"]\s*type=\'image\''
531 |         self.ThreadsRegex = r'<h3><a\s*href\s*=\s*["\']?([^\'">]+?)[ \'"][^>]*?>(?:<font color=green>)?([^<]*)(?:</font>)?</a></h3>'
532 |         self.targetThreadRegex = r'<tr><td\s*class="h"> --> <b>[^<]+?</b>\s*([^<]+?)\s*</td>'
533 | 
534 |     def Download(self):
535 |         self.InternalPrint("===============   start   ===============", False)
536 |         if self.targetThread == "":
537 |             for i in range(self.pageNum, self.pageTo+1):
538 |                 self.InternalPrint("===============   loading page {0}   ===============".format(i), False)
539 |                 domain = "http://"+self.caoliudomain+"/thread0806.php?fid=16&search=&page={0}".format(i)
540 |                 res = self.DoFetch(domain)
541 |                 if get_error(res):
542 |                     self.InternalPrint(get_error(res), False)
543 |         else:
544 |             self.InternalPrint("===============   loading target thread {0}   ===============".format(self.targetThread), False)
545 |             res = self.DoFetchSingleThread(self.targetThread)
546 |             if get_error(res):
547 |                 self.InternalPrint(get_error(res), False)
548 |         self.InternalPrint("===============   end   ===============", False)
549 | 
550 |     def GetCurrentDir(self, href):
551 |         dir = href[0].split('/')[-3] + href[0].split('/')[-2] + href[0].split('/')[-1]
552 |         dir = dir.split('.')[-2]
553 |         return dir
554 | 
555 |     def CheckThreadsValid(self, href):
556 |         return href[0][0:8] == "htm_data"
557 | 
558 |     def GetThreadUrl(self, href):
559 |         return 'http://'+self.caoliudomain+'/' + href[0]
560 | 
561 |     def GetTitle(self, href):
562 |         return href[1]
563 | 
class MLStripper(HTMLParser):
    """HTML parser that collects only the text between tags."""

    def __init__(self):
        # Run the base initializer instead of calling reset() directly: on
        # Python 3 it also sets parser state that feed() relies on, and on
        # Python 2 it performs the same reset() as before.
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Record one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined into one string."""
        return ''.join(self.fed)
572 | 
573 | class JanDanDownloader(Downloader):
574 |     def __init__(self):
575 |         super(JanDanDownloader, self).__init__()
576 | 
577 |         self.isMono = True
578 | 
579 |         self.type = 'jandan'
580 |         self.encode = 'utf-8'
581 |         self.ImgRegex = r'<p>\s*<a\s*href=["\']?([^\'" >]+?)[ \'"]\s*target="_blank"\s*class="view_img_link"\s*>'
582 | 
583 |     def Download(self):
584 |         #get max
585 |         res = self.FetchHtml("http://"+self.jandandomain+"/ooxx")
586 |         if get_error(res):
587 |             self.InternalPrint(get_error(res), False)
588 |             return res
589 |         html = get_val(res)
590 |         newest = self.get_max(html)
591 | 
592 |         self.InternalPrint("===============   start   ===============", False)
593 |         for i in range(newest-self.jandanPageToDownload+1, newest+1):
594 |             self.InternalPrint("===============   loading page {0}   ===============".format(i), False)
595 |             domain = "http://"+self.jandandomain+"/ooxx/page-{0}#comments".format(i)
596 |             res = self.FetchThreadHtml(domain)
597 |             if get_error(res):
598 |                 self.InternalPrint(get_error(res), False)
599 |         self.InternalPrint("===============   end   ===============", False)
600 | 
601 |     def strip_tags(self, html):
602 |         s = MLStripper()
603 |         s.feed(html)
604 |         return s.get_data()
605 | 
606 |     def get_max(self, html_code):
607 |         m = re.search('.+cp-pagenavi.+', html_code)
608 |         m = re.search('\d+', self.strip_tags(m.group(0)).strip())
609 |         return int(m.group(0))
610 | 
611 |     def download_file(self, url):
612 |         dir = self.type
613 |         local_directory = "Images/"+ dir + '/'
614 |         self.DealDir(local_directory)
615 |         image_path = local_directory + url.split('/')[-1]
616 |         if os.path.exists(image_path):
617 |             return error('\t skip '+image_path)
618 |         self.InternalPrint('\t=>'+image_path.encode(sys.getfilesystemencoding()), False)
619 |         # NOTE the stream=True parameter
620 |         retry = 0
621 |         proxies = {
622 |             'http':self.httpProxy,
623 |             'https':self.httpsProxy,
624 |         }
625 |         global has_log_file
626 |         while True:
627 |             try:
628 |                 if self.useProxy:
629 |                     r = requests.get(url, stream=True, proxies=proxies)
630 |                 else:
631 |                     r = requests.get(url, stream=True)
632 |                 break
633 |             except requests.ConnectionError:
634 |                 if retry<self.retryTimes:
635 |                     retry+=1
636 |                     self.InternalPrint('\tCan\'t retrive image. retry %i' % retry, False)
637 |                     continue
638 |                 if has_log_file:
639 |                     logging.error('Can not connect to %s' % url)
640 |                 return error('The server is not responding.')
641 |         try:
642 |             with open(image_path, 'wb') as f:
643 |                 for chunk in r.iter_content(chunk_size=1024):
644 |                     if chunk: # filter out keep-alive new chunks
645 |                         f.write(chunk)
646 |                         f.flush()
647 |         except IOError:
648 |             if has_log_file:
649 |                 logging.error('Can not save file %s' % url)
650 |             self.InternalPrint('Can\'t save image %s' % url, False)
651 |         return success(image_path)
652 | 
def process_pages(d, num):
    """Set ``d.pageTo`` so that ``num`` pages are covered starting at ``d.pageNum``.

    Leaves ``d`` untouched when ``num`` is not positive.
    """
    if not num > 0:
        return
    last_page = d.pageNum + num - 1
    d.pageTo = last_page
656 | 
def parse_general_args(obj, args):
    """Copy the command-line options shared by all sites from ``args`` onto ``obj``.

    Only options with a truthy value are applied; ``direct`` is handled after
    ``proxy`` and therefore wins when both are given.
    """
    if args.no_log:
        obj.hasLog = False
    if args.threads:
        obj.numToDownload = args.threads
    if args.single:
        obj.targetThread = args.single[0]
    if args.proxy:
        proxy_address = args.proxy[0]
        obj.useProxy = True
        obj.httpProxy = proxy_address
        obj.httpsProxy = proxy_address
    if args.direct:
        obj.useProxy = False
    if args.retry:
        obj.retryTimes = args.retry
    # plain on/off switches: (argument name, attribute to set)
    for option, attribute in (('mono', 'isMono'),
                              ('verbose', 'verbose'),
                              ('quiet', 'silent')):
        if getattr(args, option):
            setattr(obj, attribute, True)
678 | 
def caoliu(args):
    """Configure a CaoliuDownloader from the parsed arguments and run it.

    Args:
        args: parsed argparse namespace; ``pages`` and ``domain`` are read
            here, the remaining general options by ``parse_general_args``.
    """
    cl = CaoliuDownloader()
    if args.pages:
        process_pages(cl, args.pages)
    if args.domain:
        # --domain is declared with nargs=1, so argparse delivers a
        # one-element list. Unwrap it the same way parse_general_args does
        # for --single and --proxy instead of storing the list itself.
        domain = args.domain
        if isinstance(domain, (list, tuple)):
            domain = domain[0]
        cl.caoliudomain = domain
    parse_general_args(cl, args)
    cl.InternalPrint("Processing caoliu...", False)
    cl.Download()
688 | 
def moeimg(args):
    """Configure a MoeimgDownloader from the parsed arguments and run it.

    With ``fetch_all_tags`` the result of ``FetchAllTags()`` is written to
    ``all_tags.txt`` (one tag per line) and no download is started; an error
    result is printed instead. Otherwise ``Download()`` is called, after
    setting ``moeimgTags`` (and optionally ``tag_file``) when ``with_tags``
    is given.

    Args:
        args: parsed argparse namespace for the ``moeimg`` subcommand.
    """
    moe = MoeimgDownloader()
    if args.pages:
        process_pages(moe, args.pages)
    if args.domain:
        # --domain is declared with nargs=1, so argparse delivers a
        # one-element list. Unwrap it the same way parse_general_args does
        # for --single and --proxy instead of storing the list itself.
        domain = args.domain
        if isinstance(domain, (list, tuple)):
            domain = domain[0]
        moe.moeimgdomain = domain
    if args.sort_with_tags:
        moe.moeimgSortWithTags = True
    parse_general_args(moe, args)
    moe.InternalPrint("Processing moeimg...", False)
    if args.fetch_all_tags:
        res = moe.FetchAllTags()
        if get_error(res):
            print(get_error(res))
            return
        tags = get_val(res)
        with open('all_tags.txt', 'w') as all_tags_file:
            for t in tags:
                all_tags_file.write(t + '\n')
            print('Fetched all tags.')
    elif args.with_tags:
        # A custom tag file is only taken into account in tag mode.
        if args.tag_file:
            moe.tag_file = args.tag_file
        moe.moeimgTags = True
        moe.Download()
    else:
        moe.Download()
716 | 
def jandan(args):
    """Configure a JanDanDownloader from the parsed arguments and run it.

    Args:
        args: parsed argparse namespace; ``pages`` and ``domain`` are read
            here, the remaining general options by ``parse_general_args``.
    """
    j = JanDanDownloader()
    if args.pages:
        j.jandanPageToDownload = args.pages
    if args.domain:
        # --domain is declared with nargs=1, so argparse delivers a
        # one-element list. Unwrap it the same way parse_general_args does
        # for --single and --proxy instead of storing the list itself.
        domain = args.domain
        if isinstance(domain, (list, tuple)):
            domain = domain[0]
        j.jandandomain = domain
    parse_general_args(j, args)
    j.InternalPrint("Processing jandan...", False)
    j.Download()
726 | 
727 | #def all():pass
728 | 
def main():
    """Build the command line parser, parse ``sys.argv`` and dispatch.

    The module-level flags ``init_with_config_file`` and ``has_log_file``
    are switched off when the matching options are given, then the handler
    registered for the chosen subcommand is called with the parsed options.
    """
    global init_with_config_file
    global has_log_file

    parser = argparse.ArgumentParser(
        description='This tool can download ooxx image from some websites. :P',
        epilog=" Please report bugs to https://github.com/KanagiMiss/MoeDownloader/issues")
    subcommands = parser.add_subparsers(
        title='subcommands',
        description='available subcommands',
        help='')

    # One sub-parser per supported site, each bound to its handler function.
    # (An "all" subcommand is not implemented.)
    sites = (
        ("caoliu", caoliu, "download caoliu images"),
        ("moeimg", moeimg, "download moeimg images"),
        ("jandan", jandan, "download jandan images"),
    )
    site_parsers = {}
    for site_name, handler, site_help in sites:
        site_parser = subcommands.add_parser(site_name, help=site_help)
        site_parser.set_defaults(func=handler)
        site_parsers[site_name] = site_parser

    verbosity_group = parser.add_mutually_exclusive_group()
    connection_group = parser.add_mutually_exclusive_group()
    parser.add_argument(
        "-p", "--pages",
        type=int,
        help="number of pages to download")

    # General options, registered on the top-level parser.
    parser.add_argument(
        "-i", "--ignore_config",
        action="store_true",
        help="ignore config file and load with default options")
    parser.add_argument(
        "-n", "--no_log",
        action="store_true",
        help="run without log")
    parser.add_argument(
        "-r", "--retry",
        type=int,
        help="retry times if failed")
    parser.add_argument(
        "-m", "--mono",
        action="store_true",
        help="set if mono file")
    parser.add_argument(
        "-t", "--threads",
        type=int,
        help="number of threads to download")
    parser.add_argument(
        "-S", "--single",
        nargs=1,
        help="download single thread")
    verbosity_group.add_argument(
        "-q", "--quiet",
        action="store_true",
        help="run quietly and briefly")
    verbosity_group.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="run verbosely")
    connection_group.add_argument(
        "-d", "--direct",
        action="store_true",
        help="connect directly(without proxy)")
    connection_group.add_argument(
        "--proxy",
        nargs=1,
        help='set http and https proxy')
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')

    # Options specific to the moeimg subcommand.
    moeimg_parser = site_parsers["moeimg"]
    moeimg_parser.add_argument(
        "-T", "--fetch_all_tags",
        action="store_true",
        help="fetch all tags from site")
    moeimg_parser.add_argument(
        "-t", "--with_tags",
        action="store_true",
        help="download with tags")
    moeimg_parser.add_argument(
        "-s", "--sort_with_tags",
        action="store_true",
        help="sort files with tags")
    moeimg_parser.add_argument("--domain", nargs=1, help="set domain")
    moeimg_parser.add_argument(
        "-f", "--tag_file",
        type=argparse.FileType('r'),
        help="set specific tag file")

    # Options specific to the caoliu subcommand.
    site_parsers["caoliu"].add_argument("--domain", nargs=1, help="set domain")

    # Options specific to the jandan subcommand.
    site_parsers["jandan"].add_argument("--domain", nargs=1, help="set domain")

    options = parser.parse_args()

    # Run with the default configuration (ignore the config file).
    if options.ignore_config:
        init_with_config_file = False

    # Run without a log file.
    if options.no_log:
        has_log_file = False

    options.func(options)
788 | 
if __name__ == '__main__':
    # Python 2 idiom: reload(sys) is the usual way to regain access to
    # sys.setdefaultencoding, which is then pointed at the file system
    # encoding before the program starts.
    reload(sys)
    fs_encoding = sys.getfilesystemencoding()
    sys.setdefaultencoding(fs_encoding)
    main()
793 | 


--------------------------------------------------------------------------------