├── LICENSE.txt
├── README.md
├── creeper.py
├── creeper.pyc
└── weibo-photo-downloader.py


/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | Copyright (c) 2016 pingze
3 | 
4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 | 
6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 | 
8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # weibo-photo-downloader
 2 |   新浪微博相片下载器。自动排序，多线程，断点续传。
 3 | 
 4 | ## 功能
 5 |   下载指定新浪微博用户的相册的所有图片到本地。
 6 | 
 7 | ## 使用方法
 8 | 1. 打开需要下载的用户主页，在网址或页面文档中找到用户唯一id。支持3种ID，比如黄晓明的ID可以为huangxiaoming（个性域名）、1730077315（用户标识oid）、1006051730077315（相册标识page_id）。
 9 | 2. 利用chrome开发者工具或其他抓包工具，或者运行语句```document.cookie```来获取cookie值。
10 | 3. 编辑weibo-photo-downloader.py文件，在初始参数中，修改uid和cookie的值为第一步获取的用户id和第二步获取的cookie。
11 | 4. 运行weibo-photo-downloader.py文件。
12 |  
13 | ## 特点
14 | 1. 因为新浪微博用户登录需要验证码，现使用cookie代替登录环节。
15 | 2. 用户相册的所有图片都会被下载，按发布时间排序。
16 | 3. 动态图（GIF）也会被下载，视频则会被忽略。
17 | 4. 采用多线程提高下载效率，具体线程数可控制。
18 | 5. 可以在控制台观察到下载进度。
19 | 


--------------------------------------------------------------------------------
/creeper.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/env python
  2 | # coding=utf-8
  3 | 
  4 | '''
  5 | Creeper:
  6 |     爬虫包。
  7 |     功能: 获取网页文档和图片等资源，多线程，支持断点续传，支持数据库。
  8 | '''
  9 | '''
 10 |     类:
 11 |         Creeper:
 12 |             用于get、post并获取响应内容等。
 13 |         Creeper_imgs:
 14 |             对多个连接的内容进行下载。多线程/断点续传。
 15 |         Creeper_thread:
 16 |             多线程类。
 17 | 
 18 | '''
 19 | '''
 20 |     1.增加记录功能。
 21 |     2.修改requests的cookies参数
 22 |     3.尝试找到requests下载gif图片的方法
 23 |     4.增加小视频下载功能
 24 |     5.智能线程数控制
 25 | '''
 26 | 
 27 | 
 28 | import os
 29 | import requests
 30 | import urllib
 31 | import re
 32 | from StringIO import StringIO
 33 | import pickle
 34 | import traceback
 35 | import time
 36 | from PIL import Image
 37 | import threading
 38 | from Queue import Queue
 39 | 
 40 | 
 41 | 
 42 | # **************** Creeper *********************
 43 | 
 44 | class Creeper(object):
 45 |     def __init__(self,url,headers='',params='',data='',proxies='',encoding=''):
 46 |         self.url = url
 47 |         self.headers = headers
 48 |         self.params = params
 49 |         self.data = data
 50 |         self.proxies = proxies
 51 |         self.response = None
 52 |         self.html = None
 53 |         self.max_try_times = 10
 54 |         self.wait_time = 5
 55 |         self.sleep_time = 0
 56 |         self.encoding = encoding
 57 |         self.lock = None
 58 | 
 59 |     def get(self,url='',headers='',params='',proxies='',stream=False):
 60 |         'get方法'
 61 |         if url != '':
 62 |             self.set_url(url)
 63 |         if headers != '':
 64 |             self.set_headers(headers)
 65 |         if params != '':
 66 |             self.set_params(params)
 67 |         if proxies != '':
 68 |             self.set_proxies(proxies)
 69 |         response = self.__response('get',self.url,self.headers,self.params,self.proxies,stream)
 70 |         if response != False:
 71 |             self.response = response
 72 |             return response
 73 |         else:
 74 |             if self.lock != None:
 75 |                 self.lock.acquire()
 76 |                 print('[Creeper] Failed to get response. ')
 77 |                 self.lock.release()
 78 |             return False
 79 | 
 80 |     def post(self,url='',headers='',params='',data='',proxies=''):
 81 |         if url != '':
 82 |             self.set_url(url)
 83 |         if headers != '':
 84 |             self.set_headers(headers)
 85 |         if data != '':
 86 |             self.set_data(data)
 87 |         if params != '':
 88 |             self.set_params(params)
 89 |         if proxies != '':
 90 |             self.set_proxies(proxies)
 91 |         response = self.__response('post',self.url,self.headers,self.params,self.proxies)
 92 |         if response != False:
 93 |             self.response = response
 94 |             return response
 95 |         else:
 96 |             if self.lock != None:
 97 |                 self.lock.acquire()
 98 |                 print('[Creeper] Failed to post. ')
 99 |                 self.lock.release()
100 |             return False
101 | 
102 |     def __response(self,work,url,headers='',params='',data='',proxies='',stream=False):
103 |         '获取响应'
104 |         max_try_times = self.max_try_times # 最大尝试次数
105 |         wait_time = self.wait_time # 最大单次尝试时间
106 |         sleep_time = self.sleep_time # 尝试失败延时
107 |         for times in range(1,max_try_times+1):
108 |             try:
109 |                 if work == 'get':
110 |                     # response = requests.get(url, timeout = wait_time, headers=headers, params=params, proxies=proxies, stream=stream)
111 |                     response = requests.get(url, timeout = wait_time, headers=headers, params=params, stream=stream)
112 |                 elif work == 'post':
113 |                     # response = requests.post(url, timeout = wait_time, headers=headers, params=params,data=data, proxies=proxies)
114 |                     response = requests.post(url, timeout = wait_time, headers=headers, params=params,data=data)
115 |                 break
116 |             except:
117 |                 #traceback.print_exc()
118 |                 if times < max_try_times:
119 |                     time.sleep(sleep_time)
120 |                     continue
121 |                 else:
122 |                     traceback.print_exc()
123 |                     return False
124 |         if self.encoding == '':
125 |             try:
126 |                 encoding = re_find('charset=(.+?)>',response.text,report=False)
127 |                 encoding = string_removeall(encoding,['"',"'",' ','=','/'])
128 |                 self.encoding = encoding
129 |             except:
130 |                 traceback.print_exc()
131 |                 self.encoding = 'utf-8'
132 |             if self.encoding.lower() not in ['utf-8','utf8','gbk','gb2312']:
133 |                 self.encoding = 'utf-8'
134 |                 if self.lock != None:
135 |                     self.lock.acquire()
136 |                     print('[Creeper] Can not find charset, set encoding:utf-8')
137 |                     self.lock.release()
138 |         self.response = response
139 |         self.response.encoding = self.encoding
140 |         return response
141 | 
142 |     def set_url(self,url):
143 |         if isinstance(url,str) or isinstance(url,unicode):
144 |             self.url = url
145 |             self.__flush()
146 |         else:
147 |             print('[Creeper] Url should be a string')
148 | 
149 |     def set_headers(self,headers):
150 |         if isinstance(headers,dict):
151 |             self.headers = headers
152 |             self.__flush()
153 |         else:
154 |             print('[Creeper] Headers should be a dict')
155 | 
156 |     def set_params(self,params):
157 |         if isinstance(params,dict):
158 |             self.params = params
159 |             self.__flush()
160 |         else:
161 |             print('[Creeper] Params should be a dict')
162 | 
163 |     def set_data(self,data):
164 |         if isinstance(data,dict):
165 |             self.data = data
166 |             self.__flush()
167 |         else:
168 |             print('[Creeper] Data should be a dict')
169 | 
170 |     def set_proxies(self,proxies):
171 |         if isinstance(proxies,dict):
172 |             self.proxies = proxies
173 |             self.__flush()
174 |         else:
175 |             print('[Creeper] Proxies should be a dict')
176 | 
177 |     def __flush(self):
178 |         self.response = None
179 |         self.html = None   
180 | 
181 |     def find(self,pattern,text=''):
182 |         if text == '':
183 |             text = self.html
184 |         if isinstance(text,str)==False and isinstance(text,unicode)==False:
185 |             print('[Creeper] Invalid Text')
186 |             return ''
187 |         regex = re.search(pattern, text, re.S)
188 |         num = pattern.count('(')
189 |         try:
190 |             if num == 1:
191 |                 get = regex.group(1)
192 |             else:
193 |                 get = regex.groups()
194 |             return get.strip()
195 |         except:
196 |             print('[Creeper] No Found')
197 | 
198 |             return ''
199 | 
200 |     def findall(self,pattern,text=''):
201 |         if text == '':
202 |             text = self.html
203 |         if len(text) < 1:
204 |             print('[Creeper] Invalid Text')
205 |             return ''
206 |         get_list = re.findall(pattern, text, re.S)
207 |         find_list = []
208 |         for get in get_list:
209 |             find_list.append(get.strip())
210 |         if len(find_list) > 0:
211 |             return find_list
212 |         else:
213 |             print('[Creeper] No Found')
214 | 
215 |             return ''          
216 | 
217 |     def get_html(self):
218 |         if self.response == None:
219 |             self.get()
220 |             self.html = self.response.text
221 |         elif self.html == None:
222 |             self.html = self.response.text
223 |         return self.html
224 | 
225 | # **************** Creepers *********************
226 | 
227 | # class Creepers(Creeper_imgs):
228 | #     '专门用于多个链接内容的获取'
229 | #     def __init__(self,url_list,headers='',params='',data=''):
230 | #         super(Creepers,self).__init__(self,url_list[0],headers,params,data)
231 | #         self.set_url_list(url_list)
232 | #         self.lock = None
233 | #         self.thread_list = []
234 | #         self.thread_num = 5
235 | 
236 | #     def set_url_list(self,url_list):
237 | #         if isinstance(url_list,list):
238 | #             flag = True
239 | #             for url in url_list:
240 | #                 if not isinstance(url,str):
241 | #                     flag = False
242 | #             if flag == True:
243 | #                 self.url_list = url_list
244 | 
245 | #     def thread_num(num):
246 | #         if isinstance(num,int):
247 | #             if 1 <= num <= 20:
248 | #                 self.thread_num = num
249 | #             else:
250 | #                 print('[Creeper] Please make threads number between 1 and 20 ')
251 | #         else:
252 | #             print('[Creeper] threads number shoud be a integer ')
253 | 
254 | #     def __get(self,tname,lock,url='',headers='',params=''):
255 | #         '单个线程的get方法'
256 | #         if url != '':
257 | #             self.set_url(url)
258 | #         if headers != '':
259 | #             self.set_headers(headers)
260 | #         if params != '':
261 | #             self.set_params(params)
262 | #         response = self.__response('get',self.url,self.headers,self.params)
263 | #         if response != False:
264 | #             self.response = response
265 | #             return response
266 | #         else:
267 | #             print('[Creeper] Failed to get response. ')
268 | #             exit()
269 | 
270 | #     def getall(self,url_list='',headers='',params=''):
271 | #         if url_list != '':
272 | #             self.set_url_list(url_list)
273 | #         if headers != '':
274 | #             self.set_headers(headers)
275 | #         if params != '':
276 | #             self.set_params(params)
277 | #         self.url_queue = list2queue(self.url_list)
278 | #         self.lock = threading.Lock()
279 | #         self.thread_list = []
280 | #         for i in range(self.thread_num):
281 | #             thread = MultiThread(self.__get,'t'+str(i),lock,(self.url_queue,savepath,headers))
282 | #             thread_list.append(thread)
283 | #             thread.start()
284 | #         for thread in thread_list:
285 | #             thread.join()
286 | 
287 | 
288 | 
289 | 
290 | 
291 | # **************** Creepers *********************
292 | 
293 | class Creeper_imgs(Creeper):
294 |     '专门用于多个链接图片的下载'
295 |     def __init__(self,url_list,dirpath='',headers='',params=''):
296 |         super(Creeper_imgs,self).__init__(self,'',headers,params)
297 |         self.url = None
298 |         self.url_list = None
299 |         self.set_url_list(url_list)
300 |         self.dirpath = None
301 |         if dirpath != '':
302 |             self.set_dirpath(dirpath)
303 |         self.done_list = []
304 |         self.failed_dict = {}
305 |         self.task_total = len(self.url_list)
306 |         self.lock = None
307 |         self.thread_list = []
308 |         self.thread_num = 10
309 |         self.task_done = 0
310 |         self.task_last = 0
311 |         self.filename_list = None
312 | 
313 |     def set_url_list(self,url_list):
314 |         if isinstance(url_list,list):
315 |             flag = True
316 |             for url in url_list:
317 |                 if not isinstance(url,str) and not isinstance(url, unicode):
318 |                     flag = False
319 |             if flag == True:
320 |                 self.url_list = url_list
321 |                 return True
322 |             else:
323 |                 print('[Creeper] Wrong url_list Input')
324 |                 self.url_list = None
325 |                 return False
326 | 
327 |     def set_dirpath(self,dirpath):
328 |         if not isinstance(dirpath,str) and not isinstance(dirpath,unicode):
329 |             print('[Creeper] Wrong dirpath Input: Not a String')
330 |             return False
331 |         else:
332 |             flag = cmkdir(dirpath)
333 |             if flag == True:
334 |                 self.dirpath = dirpath
335 |                 return True
336 |             else:
337 |                 print('[Creeper] Wrong dirpath Input: Can Not Create Path')
338 |                 return False
339 | 
340 |     def set_filename(self,filename_list):
341 |         if len(filename_list) == len(self.url_list):
342 |             self.filename_list = filename_list
343 | 
344 |     def save(self,dirpath='',url_list='',headers='',params=''):
345 |         flag = True
346 |         if url_list != '':
347 |             flag = self.set_url_list(url_list)
348 |         if headers != '':
349 |             self.set_headers(headers)
350 |         if params != '':
351 |             self.set_params(params)
352 |         if dirpath != '':
353 |             flag = self.set_dirpath(dirpath)
354 |         elif self.dirpath == None:
355 |             print('[Creeper] No dirpath exsits')
356 |             flag = False
357 |         if flag == True:
358 |             self.__load_log()
359 |             # deal with url_list and task_last
360 |             url_list_checked = []
361 |             for imgurl in self.url_list:
362 |                 if not imgurl in self.done_list:
363 |                     url_list_checked.append(imgurl)
364 |             self.url_list = url_list_checked
365 |             # download th failed ones first
366 |             self.failed_list = self.failed_dict.values()
367 |             for url_list in (self.failed_list, self.url_list):
368 |                 list_length = len(url_list)
369 |                 self.task_total = list_length
370 |                 url_queue = list2queue(url_list)
371 |                 self.lock = threading.Lock()
372 |                 self.thread_list = []
373 |                 for i in range(self.thread_num):
374 |                     if url_list is self.url_list:
375 |                         thread = Creeper_thread(self.__save_thread,'thread'+'{:0>2}'.format(i),self.lock,args=(url_queue,self.dirpath,headers,params))
376 |                     else:
377 |                         # thread = Creeper_thread(self.__save_thread,'thread'+'{:0>2}'.format(i),self.lock,args=(url_queue,self.dirpath,headers,params),flag_failed=True)
378 |                         thread = Creeper_thread(self.__save_thread,'thread'+'{:0>2}'.format(i),self.lock,args=(url_queue,self.dirpath,headers,params))
379 |                     self.thread_list.append(thread)
380 |                     thread.start()
381 |                 for thread in self.thread_list:
382 |                     thread.join()
383 |                 print('[Creeper][%s] All Thread Exit !' % (time.asctime()[11:19]))
384 |             print('[Creeper][%s] All Task Done !' % (time.asctime()[11:19]))
385 | 
386 |             
387 | 
388 |     def __save_thread(self,tname,lock,args,flag_failed=False):
389 |         (url_queue,dirpath,headers,params) = args
390 |         self.lock.acquire()
391 |         print('[Creeper][%s][%s] Thread Start !' % (time.asctime()[11:19], tname))
392 |         self.lock.release()
393 |         while True:
394 |             if url_queue.empty() == True:
395 |                 break
396 |             imgurl = url_queue.get()
397 |             self.lock.acquire()
398 |             print('[Creeper][%s][%s] Getting %s ... !' % (time.asctime()[11:19], tname, imgurl))
399 |             self.lock.release()
400 |             self.task_last = self.task_last + 1
401 |             self.url = imgurl
402 |             imgext = imgurl[imgurl.rfind('.'):]
403 |             if flag_failed == True:
404 |                 imgname = self.failed_list[imgurl]
405 |             else:
406 |                 if self.filename_list == None:
407 |                     imgname = format_num(self.task_last-1,len(str(self.task_total))) + imgext
408 |                 else:
409 |                     imgname = self.filename_list[self.url_list.index(imgurl)]
410 |             flag = self.__save_img(imgurl, dirpath, imgname, headers, params)
411 |             self.task_done = self.task_done + 1
412 |             if not imgurl in self.failed_list:
413 |                 self.failed_dict[imgurl] = imgname
414 |             if flag == True:
415 |                 self.lock.acquire()
416 |                 print('[Creeper][%s][%s] Successfully saved image %s as %s (%d/%d) !' % (time.asctime()[11:19], tname, imgurl, imgname, self.task_done, self.task_total))
417 |                 self.lock.release()
418 |                 self.done_list.append(imgurl)
419 |                 if imgurl in self.failed_dict:
420 |                     self.failed_dict.pop(imgurl)
421 |                 self.__write_log()
422 |             if flag == False:
423 |                 self.lock.acquire()
424 |                 print('[Creeper][%s][%s] Failed saved image %s as %s (%d/%d) !' % (time.asctime()[11:19], tname, imgurl, imgname, self.task_done, self.task_total)) 
425 |                 self.lock.release()
426 | 
427 |                 self.__write_log()
428 |         self.lock.acquire()
429 |         print('[Creeper][%s][%s] Thread Exit !' % (time.asctime()[11:19], tname))
430 |         self.lock.release()
431 | 
432 |     def __save_img(self,imgurl,dirpath,imgname,headers='',params=''):
433 |         '稳定高效的下载图片方法（多次尝试失败后跳过）'
434 |         max_try_times = 10 # 最大尝试次数
435 |         sleep_time = 0 # 尝试失败延时
436 |         self.max_try_times = 10
437 |         self.wait_time = 5
438 |         self.sleep_time = 0
439 |         imgpath = dirpath + os.sep + imgname
440 |         # flag_gif = False
441 |         # if imgname[-4:] == '.gif':
442 |         #     flag_gif = True
443 |         for times in range(1,max_try_times+1):
444 |             # print('[%s][INFO] The %s time try begin ...' % (time.asctime()[11:19], times))
445 |             try:
446 |                 # if flag_gif == False:
447 |                 #     print('notgif')
448 |                 response = self.get(imgurl,headers=headers,params=params,stream=True)
449 |                 if response.status_code != requests.codes.ok:
450 |                     print('[Creeper] 404 Client Error')
451 |                     imgurl = imgurl[:-4] + '.png'
452 |                     response = self.get(imgurl,headers=headers,params=params,stream=True)
453 |                 img = Image.open(StringIO(response.content))
454 |                 img.save(imgpath)
455 |                 img.close()
456 | 
457 |                 with open(imgpath, 'wb') as f:
458 |                     for chunk in response.iter_content(chunk_size=1024):
459 |                         if chunk: # filter out keep-alive new chunks  
460 |                             f.write(chunk)
461 |                             f.flush()
462 |                     f.close() 
463 | 
464 |                 # else:
465 |                 #     print('isgif')
466 |                 #     urllib.urlretrieve(imgurl,imgpath) #暂时不能反反盗链
467 |                 # print('[%s][INFO] The %s time try success!' % (time.asctime()[11:19], times))
468 |                 return True
469 |             except:
470 |                 if times < max_try_times:
471 |                     # print('[%s][WARN][IMG] The %s time try failed!' % (time.asctime()[11:19], times))
472 |                     time.sleep(sleep_time)
473 |                     continue
474 |                 else:
475 |                     traceback.print_exc()
476 |                     break
477 |         return False
478 | 
479 |     def __write_log(self):
480 |         log_dict = {
481 |             'done_list':self.done_list,
482 |             'failed_dict':self.failed_dict,
483 |             'task_last':self.task_last-1
484 |         }
485 |         logpath = self.dirpath + os.sep + 'log.pkl'
486 |         try:
487 |             out_file = open(logpath, 'wb')
488 |             pickle.dump(log_dict,out_file)
489 |             return True
490 |         except:
491 |             traceback.print_exc()
492 |             return None
493 |         finally:
494 |             out_file.close() 
495 | 
496 |     def __load_log(self):
497 |         logpath = self.dirpath + os.sep + 'log.pkl'
498 |         if os.path.exists(logpath) and os.path.isfile(logpath):
499 |             print('[Creeper] Found Log File, Reading ...')
500 |             try:
501 |                 in_file = open(logpath, 'rb')
502 |             except:
503 |                 traceback.print_exc()
504 |                 return None
505 |             dic = pickle.load(in_file)
506 |             self.done_list = dic['done_list']
507 |             self.failed_dict = dic['failed_dict']
508 |             self.task_last = dic['task_last']
509 |         else:
510 |             print('[Creeper] Not Found Log File, Will Create a New One')
511 |             self.done_list = []
512 |             self.failed_dict = {}
513 |             self.task_last = 0
514 |             cmkdir(self.dirpath)
515 | 
516 | 
517 | # **************** Creeper_thread *********************
518 | 
519 | class Creeper_thread(threading.Thread):
520 |     '可传递参数、指定运行函数的线程类'
521 |     def __init__(self,target,tname,lock,args):
522 |         super(Creeper_thread,self).__init__()
523 |         self.setDaemon(True)
524 |         self.target = target
525 |         self.tname = tname
526 |         self.lock = lock
527 |         self.args = args
528 | 
529 |     def run(self):
530 |         self.target(self.tname,self.lock,self.args)
531 | 
532 | # **************** Single Functions *********************
533 | 
534 | # def string_removeall(string,ele):
535 | #     if isinstance(ele,str) == True or isinstance(ele,unicode) == True:
536 | #         while True:
537 | #             if ele in string:
538 | #                 string = string[:string.index(ele)] + string[string.index(ele)+len(ele):]
539 | #             else:
540 | #                 break
541 | #         return string
542 | #     elif isinstance(ele,list) or isinstance(ele,tuple):
543 | #         for eele in ele:
544 | #             while True:
545 | #                 if eele in string:
546 | #                     string = string[:string.index(eele)] + string[string.index(eele)+len(eele):]
547 | #                 else:
548 | #                     break
549 | #         return string
550 | 
551 | def string_removeall(string,ele):
552 |     if isinstance(ele,str) == True or isinstance(ele,unicode) == True:
553 |         return string.replace(ele,'')
554 |     elif isinstance(ele,list) or isinstance(ele,tuple):
555 |         for eele in ele:
556 |             string = string.replace(eele,'')
557 |         return string
558 | 
559 | 
560 | def list_remove(alist,ele):
561 |     try:
562 |         alist.remove(ele)
563 |         return True
564 |     except:
565 |         return False
566 | 
567 | def format_num(num,length):
568 |     string = str(num)
569 |     for i in range(length-len(string)):
570 |         string = '0' + string
571 |     return string
572 | 
573 | def cmkdir(path):
574 |         try:
575 |             if(os.path.exists(path)==False or os.path.isdir(path)==False):
576 |                 os.mkdir(path)
577 |         except:
578 |             traceback.print_exc()
579 |             return False
580 |         return True
581 | 
582 | def re_search(pattern, text):
583 |     if len(text) < 1:
584 |         print('[Creeper] Invalid Text')
585 |         return ''
586 |     regex = re.search(pattern, text, re.S)
587 |     try:
588 |         get = regex.group(1).strip()
589 |         return get
590 |     except:
591 |         print('[Creeper] No Search')
592 |         return ''
593 | 
594 | def list2queue(alist):
595 |     queue = Queue()
596 |     for ele in alist:
597 |         queue.put(ele)
598 |     return queue
599 | 
600 | re_digits = re.compile(r'(\d+)')  
601 |   
602 | def emb_numbers(s):  
603 |     pieces=re_digits.split(s)  
604 |     pieces[1::2]=map(int,pieces[1::2])      
605 |     return pieces  
606 | 
607 | def lsort(alist):
608 |     return sorted(alist, key=emb_numbers)  
609 | 
610 | def save(target,dirpath='',url_list='',headers='',params=''):
611 |     '多线程下载方法'
612 |     flag = True
613 |     if url_list != '':
614 |         flag = self.set_url_list(url_list)
615 |     if headers != '':
616 |         self.set_headers(headers)
617 |     if params != '':
618 |         self.set_params(params)
619 |     if dirpath != '':
620 |         flag = self.set_dirpath(dirpath)
621 |     elif self.dirpath == None:
622 |         print('[Creeper] No dirpath exsits')
623 |         flag = False
624 |     if flag == True:
625 |         self.__load_log()
626 |         # deal with url_list and task_last
627 |         url_list_checked = []
628 |         for imgurl in self.url_list:
629 |             if not imgurl in self.done_list:
630 |                 url_list_checked.append(imgurl)
631 |         self.url_list = url_list_checked
632 |         # download th failed ones first
633 |         self.failed_list = self.failed_dict.values()
634 |         for url_list in (self.failed_list, self.url_list):
635 |             list_length = len(url_list)
636 |             self.task_total = list_length
637 |             if list_length == 0:
638 |                 continue
639 |             elif 0 < list_length <= 20:
640 |                 self.thread_num = 5
641 |             else:
642 |                 self.thread_num = 10
643 |             url_queue = list2queue(url_list)
644 |             self.lock = threading.Lock()
645 |             self.thread_list = []
646 |             for i in range(self.thread_num):
647 |                 if url_list is self.url_list:
648 |                     thread = Creeper_thread(target,'thread'+'{:0>2}'.format(i),self.lock,args=(url_queue,self.dirpath,headers,params))
649 |                 else:
650 |                     thread = Creeper_thread(target,'thread'+'{:0>2}'.format(i),self.lock,args=(url_queue,self.dirpath,headers,params),flag_failed=True)
651 |                 self.thread_list.append(thread)
652 |                 thread.start()
653 |             for thread in self.thread_list:
654 |                 thread.join()
655 |             print('[Creeper][%s] All Thread Exit !' % (time.asctime()[11:19]))
656 |         print('[Creeper][%s] All Task Done !' % (time.asctime()[11:19]))
657 | 
658 | def re_find(pattern,text,report=True):
659 |     if len(text) < 1:
660 |         print('[Creeper] Invalid Text')
661 |         return ''
662 |     regex = re.search(pattern, text, re.S)
663 |     num = pattern.count('(')
664 |     try:
665 |         if num == 1:
666 |             get = regex.group(1)
667 |         else:
668 |             get = regex.groups()
669 |         return get.strip()
670 |     except:
671 |         if report == True:
672 |             print('[Creeper] No Found')
673 |         return ''
674 | 
675 | def re_findall(pattern,text,report=True):
676 |     if len(text) < 1:
677 |         print('[Creeper] Invalid Text')
678 |         return ''
679 |     get_list = re.findall(pattern, text, re.S)
680 |     find_list = []
681 |     for get in get_list:
682 |         find_list.append(get.strip())
683 |     if len(find_list) > 0:
684 |         return find_list
685 |     else:
686 |         if report == True:
687 |             print('[Creeper] No Found')
688 |         return ''          
689 | 
690 | def list_remove_same(alist):
691 |     newlist = []
692 |     for ele in alist:
693 |         if ele not in newlist:
694 |             newlist.append(ele)
695 |     return newlist
696 | 
697 | def num_format(num,total):
698 |     bitnum = len(str(total))
699 |     if bitnum == 1:
700 |         return '{:0>1}'.format(num)
701 |     elif bitnum == 2:
702 |         return '{:0>2}'.format(num)
703 |     elif bitnum == 3:
704 |         return '{:0>3}'.format(num)
705 |     elif bitnum == 4:
706 |         return '{:0>4}'.format(num)
707 |     elif bitnum == 5:
708 |         return '{:0>5}'.format(num)
709 |     elif bitnum == 6:
710 |         return '{:0>6}'.format(num)
711 |     elif bitnum == 7:
712 |         return '{:0>7}'.format(num)
713 |     elif bitnum == 8:
714 |         return '{:0>8}'.format(num)
715 |     elif bitnum == 9:
716 |         return '{:0>9}'.format(num)
717 | 
718 | # ****************** testing ********************
719 | 
720 | def main():
721 |     print num_format(12,300)
722 | 
723 | if __name__ == '__main__':
724 |     main()
725 | 
726 | 


--------------------------------------------------------------------------------
/creeper.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Pingze-github/weibo-photo-downloader/566d78b65e3adfe745f498405b16f4245a46c098/creeper.pyc


--------------------------------------------------------------------------------
/weibo-photo-downloader.py:
--------------------------------------------------------------------------------
  1 | # coding=u8
  2 | 
  3 | "功能"
  4 | '''
  5 | 获取新浪微博用户相册照片到本地
  6 | '''
  7 | 
  8 | "使用方法"
  9 | '''
 10 | 1.填写储存目录
 11 | 2.指定微博用户id
 12 | 3.填写cookie
 13 | 4.运行
 14 | '''
 15 | 
 16 | # ---------------------------------------------------------------|| 初始参数 ||----------------------------------------------------------------------
 17 | dirpath = r'images' #储存目录
 18 | uid = '1005052495158140' #用户id
 19 | cookie = 'SINAGLOBAL=7096614666686.993.1478933549913; SCF=As9EXPnkiArRA4WnZDDqKwxMqDuhByIuF7xOWaFyllSVBrRIucqvH7G019xOOW-DvOwyS980Z-qhDhvjQ5KADjM.; SUHB=0C1CIC7sshc16C; ALF=1511684170; SUB=_2AkMvZk4nf8NhqwJRmP4VxGjkaoR1yw_EieLBAH7sJRMxHRl-yT83qm4ytRBGWF_rq7FhGYThrYAZmIqeJ6jpng..; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFToROOH4BacWGBOsPOl71T; _s_tentry=auto.ifeng.com; Apache=1350895953090.9514.1481955434953; ULV=1481955434972:11:2:1:1350895953090.9514.1481955434953:1481022799070; YF-Page-G0=734c07cbfd1a4edf254d8b9173a162eb; UOR=,,www.baidu.com' #cookie
 20 | 
 21 | # 说明：
 22 | #   1.uid可在页面html文档中搜索"oid"，获取其值
 23 | #   2.cookies可在登录新浪微博后观测浏览器记录的cookies信息获得
 24 | 
 25 | import os
 26 | import requests
 27 | import urllib
 28 | import re
 29 | import pickle
 30 | import traceback
 31 | import time
 32 | from PIL import Image
 33 | from StringIO import StringIO
 34 | import creeper
 35 | 
 36 | # global
 37 | total_num = 0
 38 | 
 39 | def list_find(alist,ele):
 40 |     try:
 41 |         return alist.index(ele)
 42 |     except:
 43 |         return -1
 44 | 
 45 | def get_response(url,cookies='',headers='',params='', stream=False, mtt=20, wt=2, st=0.25):
 46 |     '稳定高效的获取响应方法'
 47 |     max_try_times = mtt # 最大尝试次数
 48 |     wait_time = wt # 最大单次尝试时间
 49 |     sleep_time = st # 尝试失败延时
 50 |     #print('[%s][INFO] Start trying to connect ...' % time.asctime()[11:19])
 51 |     for times in range(1,max_try_times+1):
 52 |         # print('[%s][INFO] The %s time try begin ...' % (time.asctime()[11:19], times))
 53 |         try:
 54 |             response = requests.get(url, timeout=wait_time, cookies=cookies, headers=headers, params=params, stream=stream)
 55 |             # print('[%s][INFO] The %s time try success!' % (time.asctime()[11:19], times))
 56 |             break
 57 |         except:
 58 |             #traceback.print_exc()
 59 |             if times < max_try_times:
 60 |                 # print('[%s][WARN] The %s time try failed!' % (time.asctime()[11:19], times))
 61 |                 time.sleep(sleep_time)
 62 |                 continue
 63 |             else:
 64 |                 print('[%s][ERROR] The last try failed at last , exit pro ...' % time.asctime()[11:19])
 65 |                 traceback.print_exc()
 66 |                 exit()
 67 |     # print('[%s][INFO] Successfully get the response!' % time.asctime()[11:19])
 68 |     response.encoding = 'u8'
 69 |     return response
 70 | 
 71 | 
 72 | 
 73 | def retrieve(imgurl,imgpath):
 74 |     '稳定高效的下载图片方法（多次尝试失败后跳过）'
 75 |     max_try_times = 5 # 最大尝试次数
 76 |     wait_time = 15 # 最大单次尝试时间
 77 |     sleep_time = 3 # 尝试失败延时
 78 |     import socket
 79 |     socket.setdefaulttimeout(wait_time)
 80 |     #print('[%s][INFO] Start trying to connect ...' % time.asctime()[11:19])
 81 |     for times in range(1,max_try_times+1):
 82 |         # print('[%s][INFO] The %s time try begin ...' % (time.asctime()[11:19], times))
 83 |         try:
 84 |             urllib.urlretrieve(imgurl,imgpath)
 85 |             # print('[%s][INFO] The %s time try success!' % (time.asctime()[11:19], times))
 86 |             return True
 87 |         except:
 88 |             if times < max_try_times:
 89 |                 # print('[%s][WARN] The %s time try failed!' % (time.asctime()[11:19], times))
 90 |                 time.sleep(sleep_time)
 91 |                 continue
 92 |             else:
 93 |                 print('[%s][ERROR] The last try failed at last , pass ...' % time.asctime()[11:19])
 94 |                 break
 95 |     return False
 96 |     # print('[%s][INFO] Successfully get the response!' % time.asctime()[11:19])
 97 | 
 98 | def save_img2(imgurl, imgpath, headers='', params=''):
 99 |     '稳定高效的下载图片方法（多次尝试失败后跳过）'
100 |     max_try_times = 10 # 最大尝试次数
101 |     sleep_time = 0 # 尝试失败延时
102 |     print('[%s][INFO] Start trying to download ...' % time.asctime()[11:19])
103 |     for times in range(1,max_try_times+1):
104 |         # print('[%s][INFO] The %s time try begin ...' % (time.asctime()[11:19], times))
105 |         try:
106 |             # __save_img2(imgurl, dirpath, imgname, headers, params)
107 |             response = get_response(imgurl, headers=headers,params=params, stream=False, mtt=10, wt=15, st=2)
108 |             img = Image.open(StringIO(response.content))
109 |             img.save(imgpath)
110 |             img.close()
111 |             # print('[%s][INFO] The %s time try success!' % (time.asctime()[11:19], times))
112 |             return True
113 |         except:
114 |             traceback.print_exc()
115 |             if times < max_try_times:
116 |                 print('[%s][WARN][IMG] The %s time try failed!' % (time.asctime()[11:19], times))
117 |                 time.sleep(sleep_time)
118 |                 continue
119 |             else:
120 |                 print('[%s][ERROR] The last try failed at last , pass ...' % time.asctime()[11:19])
121 |                 break
122 |     return False
123 | 
124 | def save_img(imgurl,savepath,imgname):
125 |     '向本地目录储存图像'
126 |     imgext = imgurl[-4:]
127 |     imgname = imgname + imgext 
128 |     flag = retrieve(imgurl,savepath+os.sep+imgname)
129 |     if flag == True:
130 |         return True
131 |     else:
132 |         return False
133 | 
134 | 
135 | def secp(string,pattern1,pattern2=''):
136 |     '替换字符串中所有指定字符串为新字符串(效率低)'
137 |     while True:
138 |         index = string.find(pattern1)
139 |         if index > -1:
140 |             string = string[:index]+pattern2+string[index+len(pattern1):]
141 |         else:
142 |             break
143 |     return string 
144 | 
145 | def url_deal(url):
146 |     'URL处理'
147 |     urld = secp(url,'\\')
148 |     urld = secp(urld,'thumb300','large')
149 |     return urld
150 | 
151 | def get_imgurl(html):
152 |     '解析html，获取图像url列表'
153 |     imgurl_list = []
154 |     extlist = ['jpg','gif','png']
155 |     for ext in extlist:
156 |         pattern = r'class=\\\"photo_pict\\\" src=\\\"(http:\S+thumb300\S+.'+ext+')'
157 |         result = re.findall(pattern,html,re.S)
158 |         if len(result) > 0:
159 |             for url in result:
160 |                 imgurl_list.append(url_deal(url))    
161 |     return imgurl_list   
162 | 
163 | 
164 | 
165 | def save_log(dic, path):
166 |     '以pickle文件格式储存到目标路径'
167 |     try:
168 |         out_file = open(path, 'wb')
169 |         pickle.dump(dic,out_file)
170 |         return path
171 |     except:
172 |         traceback.print_exc()
173 |         return None
174 |     finally:
175 |         out_file.close()      
176 | 
177 | def load_log(path):
178 |     '从指定文件读取pickle文件转成字典'
179 |     try:
180 |         in_file = open(path, 'rb')
181 |         dic = pickle.load(in_file)
182 |         return dic
183 |     except:
184 |         traceback.print_exc()
185 |         return None
186 | 
187 | def re_search(pattern, text):
188 |     regex = re.search(pattern, text, re.S)
189 |     try:
190 |         get = regex.group(1).strip()
191 |         return get
192 |     except:
193 |         return ''
194 | 
195 | def main():
196 |     creeper.cmkdir("./images")
197 |     url1 = 'http://www.weibo.com/u/' + uid
198 |     url2 = 'http://www.weibo.com/p/' + uid
199 |     url3 = 'http://www.weibo.com/' + uid
200 | 
201 |     cookies = dict(cookies_are=cookie) # use cookies alone
202 |     print('[%s][INFO] Pro starting ...' % (time.asctime()[11:19]))
203 |     for url in (url1,url2,url3):  
204 |         print('[%s][INFO] Start analysis at %s ...' % (time.asctime()[11:19], url))
205 |         response = requests.get(url, cookies=cookies)
206 |         html = response.text
207 |         page_id = re_search("page_id']='(\d+)';",html)
208 |         if len(page_id) > 0:
209 |             print('[%s][INFO] Successfully get page_id %s ...' % (time.asctime()[11:19], page_id))
210 |             break 
211 |     url = 'http://www.weibo.com/p/'+str(page_id)+'/photos'
212 |     '访问网址，获取html文档'
213 |     response = get_response(url, cookies=cookies)
214 |     response.encoding = 'u8'
215 |     html = response.text
216 |     '检查html是否有效；若无效，报错并中止'
217 |     if len(re.findall('thumb300',html,re.S)) < 1 and len(re.findall('oid',html,re.S)) < 1 and len(re.findall('的微博',re.S)) < 1:
218 |         print('[%s][ERROR] Invalid cookies or page_id, please check !' % (time.asctime()[11:19]))
219 |         exit()
220 |     '解析文档，获取用户信息和图片路径'
221 |     uname = re.findall(u'content="(.+?)，',html,re.S)[0]
222 |     imgurl_list = get_imgurl(html)
223 |     '动态获取循环'
224 |     while True:
225 |         '获取since_id，进一步获取动态加载的页面'
226 |         result = re.findall('since_id=(\S+)">',html,re.S)
227 |         if len(result)>0:
228 |             since_id = result[0][:-1]
229 |         else:
230 |             break
231 |         #print(since_id)
232 |         payload={
233 |             'since_id': since_id,
234 |             'page_id': page_id,
235 |             'ajax_call': 1
236 |         }
237 |         url = 'http://weibo.com/p/aj/album/loading'
238 |         while True:
239 |             response = get_response(url,params=payload,cookies=cookies)
240 |             if response.url != 'http://weibo.com/sorry?sysbusy':
241 |                 break
242 |         html = response.text
243 |         print('[%s][INFO] Got new page of %s !' % (time.asctime()[11:19], response.url))
244 |         '解析文档，获取html路径'
245 |         imgurl_list = imgurl_list + get_imgurl(html)
246 |     #pprint(imgurl_list)
247 |     savepath = dirpath + os.sep + uname
248 |     print('[%s][INFO] Got savepath %s !' % (time.asctime()[11:19], savepath))
249 |     if(os.path.exists(savepath)==False or os.path.isdir(savepath)==False):
250 |         os.mkdir(savepath)
251 |     imgurl_list.reverse()
252 |     global total_num
253 |     total_num = len(imgurl_list)
254 |     print('[%s][INFO] Got all images, total %d !' % (time.asctime()[11:19], len(imgurl_list)))
255 | 
256 |     cp = creeper.Creeper_imgs(imgurl_list)
257 |     cp.save(savepath)
258 | 
259 | if __name__ == '__main__':
260 |     main()
261 | 
262 | 


--------------------------------------------------------------------------------