├── LICENSE.txt
├── README.md
├── creeper.py
├── creeper.pyc
└── weibo-photo-downloader.py


/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | Copyright (c) 2016 pingze
3 | 
4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 | 
6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 | 
8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # weibo-photo-downloader
2 | A photo downloader for Sina Weibo: automatic ordering, multi-threading, resumable downloads.
3 | 
4 | ## What it does
5 | Downloads all photos from a specified Sina Weibo user's albums to the local machine.
6 | 
7 | ## Usage
8 | 1. Open the profile page of the target user and find the user's unique ID in the URL or in the page source. Three kinds of ID are accepted; for example, Huang Xiaoming can be identified as huangxiaoming (custom domain), 1730077315 (user oid), or 1006051730077315 (album page_id).
9 | 2. Get your cookie with the Chrome developer tools or another packet-capture tool, or by running ```document.cookie``` in the browser console.
10 | 3. Edit weibo-photo-downloader.py and, in its initial parameters, set uid and cookie to the user ID from step 1 and the cookie from step 2 (see the example after the feature list below).
11 | 4. Run weibo-photo-downloader.py.
12 | 
13 | ## Features
14 | 1. Because logging in to Sina Weibo requires a captcha, a cookie is used instead of a login step.
15 | 2. Every photo in the user's albums is downloaded, ordered by publication time.
16 | 3. Animated images (GIF) are downloaded as well; videos are skipped.
17 | 4. Downloads run on multiple threads, and the thread count is configurable.
18 | 5. Download progress is printed to the console.
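For step 3 of the usage instructions, the initial-parameters block at the top of weibo-photo-downloader.py looks like the sketch below; the values shown are placeholders, not working credentials.

```python
# Initial parameters in weibo-photo-downloader.py (placeholder values).
dirpath = r'images'       # local directory the downloaded photos are saved under
uid = '1730077315'        # user ID from step 1 (custom domain, oid, or album page_id)
cookie = 'SINAGLOBAL=...; SUB=...; SUBP=...'   # cookie string captured in step 2
```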
19 | 
--------------------------------------------------------------------------------
/creeper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | 
4 | '''
5 | Creeper:
6 |     A small crawler package.
7 |     Features: fetch HTML documents, images and other resources; multi-threaded; resumable downloads; database support.
8 | '''
9 | '''
10 | Classes:
11 |     Creeper:
12 |         Issues GET/POST requests and exposes the response content.
13 |     Creeper_imgs:
14 |         Downloads the content of multiple links. Multi-threaded / resumable.
15 |     Creeper_thread:
16 |         Worker thread class.
17 | 
18 | '''
19 | '''
20 | 1. Add a download-record feature.
21 | 2. Rework the cookies parameter passed to requests.
22 | 3. Find a way to download GIF images through requests.
23 | 4. Add small-video downloading.
24 | 5. Adaptive thread-count control.
25 | '''
26 | 
27 | 
28 | import os
29 | import requests
30 | import urllib
31 | import re
32 | from StringIO import StringIO
33 | import pickle
34 | import traceback
35 | import time
36 | from PIL import Image
37 | import threading
38 | from Queue import Queue
39 | 
40 | 
41 | 
42 | # **************** Creeper *********************
43 | 
44 | class Creeper(object):
45 |     def __init__(self,url,headers='',params='',data='',proxies='',encoding=''):
46 |         self.url = url
47 |         self.headers = headers
48 |         self.params = params
49 |         self.data = data
50 |         self.proxies = proxies
51 |         self.response = None
52 |         self.html = None
53 |         self.max_try_times = 10
54 |         self.wait_time = 5
55 |         self.sleep_time = 0
56 |         self.encoding = encoding
57 |         self.lock = None
58 | 
59 |     def get(self,url='',headers='',params='',proxies='',stream=False):
60 |         'GET method'
61 |         if url != '':
62 |             self.set_url(url)
63 |         if headers != '':
64 |             self.set_headers(headers)
65 |         if params != '':
66 |             self.set_params(params)
67 |         if proxies != '':
68 |             self.set_proxies(proxies)
69 |         response = self.__response('get',self.url,headers=self.headers,params=self.params,proxies=self.proxies,stream=stream)  # keyword arguments so each value lands in the matching __response parameter
70 |         if response != False:
71 |             self.response = response
72 |             return response
73 |         else:
74 |             if self.lock != None:
75 |                 self.lock.acquire()
76 |                 print('[Creeper] Failed to get response. ')
77 |                 self.lock.release()
78 |             return False
79 | 
80 |     def post(self,url='',headers='',params='',data='',proxies=''):
81 |         if url != '':
82 |             self.set_url(url)
83 |         if headers != '':
84 |             self.set_headers(headers)
85 |         if data != '':
86 |             self.set_data(data)
87 |         if params != '':
88 |             self.set_params(params)
89 |         if proxies != '':
90 |             self.set_proxies(proxies)
91 |         response = self.__response('post',self.url,headers=self.headers,params=self.params,data=self.data,proxies=self.proxies)  # keyword arguments so data and proxies reach the right slots
92 |         if response != False:
93 |             self.response = response
94 |             return response
95 |         else:
96 |             if self.lock != None:
97 |                 self.lock.acquire()
98 |                 print('[Creeper] Failed to post.
') 99 | self.lock.release() 100 | return False 101 | 102 | def __response(self,work,url,headers='',params='',data='',proxies='',stream=False): 103 | '获取响应' 104 | max_try_times = self.max_try_times # 最大尝试次数 105 | wait_time = self.wait_time # 最大单次尝试时间 106 | sleep_time = self.sleep_time # 尝试失败延时 107 | for times in range(1,max_try_times+1): 108 | try: 109 | if work == 'get': 110 | # response = requests.get(url, timeout = wait_time, headers=headers, params=params, proxies=proxies, stream=stream) 111 | response = requests.get(url, timeout = wait_time, headers=headers, params=params, stream=stream) 112 | elif work == 'post': 113 | # response = requests.post(url, timeout = wait_time, headers=headers, params=params,data=data, proxies=proxies) 114 | response = requests.post(url, timeout = wait_time, headers=headers, params=params,data=data) 115 | break 116 | except: 117 | #traceback.print_exc() 118 | if times < max_try_times: 119 | time.sleep(sleep_time) 120 | continue 121 | else: 122 | traceback.print_exc() 123 | return False 124 | if self.encoding == '': 125 | try: 126 | encoding = re_find('charset=(.+?)>',response.text,report=False) 127 | encoding = string_removeall(encoding,['"',"'",' ','=','/']) 128 | self.encoding = encoding 129 | except: 130 | traceback.print_exc() 131 | self.encoding = 'utf-8' 132 | if self.encoding.lower() not in ['utf-8','utf8','gbk','gb2312']: 133 | self.encoding = 'utf-8' 134 | if self.lock != None: 135 | self.lock.acquire() 136 | print('[Creeper] Can not find charset, set encoding:utf-8') 137 | self.lock.release() 138 | self.response = response 139 | self.response.encoding = self.encoding 140 | return response 141 | 142 | def set_url(self,url): 143 | if isinstance(url,str) or isinstance(url,unicode): 144 | self.url = url 145 | self.__flush() 146 | else: 147 | print('[Creeper] Url should be a string') 148 | 149 | def set_headers(self,headers): 150 | if isinstance(headers,dict): 151 | self.headers = headers 152 | self.__flush() 153 | else: 154 | print('[Creeper] Headers should be a dict') 155 | 156 | def set_params(self,params): 157 | if isinstance(params,dict): 158 | self.params = params 159 | self.__flush() 160 | else: 161 | print('[Creeper] Params should be a dict') 162 | 163 | def set_data(self,data): 164 | if isinstance(data,dict): 165 | self.data = data 166 | self.__flush() 167 | else: 168 | print('[Creeper] Data should be a dict') 169 | 170 | def set_proxies(self,proxies): 171 | if isinstance(proxies,dict): 172 | self.proxies = proxies 173 | self.__flush() 174 | else: 175 | print('[Creeper] Proxies should be a dict') 176 | 177 | def __flush(self): 178 | self.response = None 179 | self.html = None 180 | 181 | def find(self,pattern,text=''): 182 | if text == '': 183 | text = self.html 184 | if isinstance(text,str)==False and isinstance(text,unicode)==False: 185 | print('[Creeper] Invalid Text') 186 | return '' 187 | regex = re.search(pattern, text, re.S) 188 | num = pattern.count('(') 189 | try: 190 | if num == 1: 191 | get = regex.group(1) 192 | else: 193 | get = regex.groups() 194 | return get.strip() 195 | except: 196 | print('[Creeper] No Found') 197 | 198 | return '' 199 | 200 | def findall(self,pattern,text=''): 201 | if text == '': 202 | text = self.html 203 | if len(text) < 1: 204 | print('[Creeper] Invalid Text') 205 | return '' 206 | get_list = re.findall(pattern, text, re.S) 207 | find_list = [] 208 | for get in get_list: 209 | find_list.append(get.strip()) 210 | if len(find_list) > 0: 211 | return find_list 212 | else: 213 | print('[Creeper] No Found') 
214 | 215 | return '' 216 | 217 | def get_html(self): 218 | if self.response == None: 219 | self.get() 220 | self.html = self.response.text 221 | elif self.html == None: 222 | self.html = self.response.text 223 | return self.html 224 | 225 | # **************** Creepers ********************* 226 | 227 | # class Creepers(Creeper_imgs): 228 | # '专门用于多个链接内容的获取' 229 | # def __init__(self,url_list,headers='',params='',data=''): 230 | # super(Creepers,self).__init__(self,url_list[0],headers,params,data) 231 | # self.set_url_list(url_list) 232 | # self.lock = None 233 | # self.thread_list = [] 234 | # self.thread_num = 5 235 | 236 | # def set_url_list(self,url_list): 237 | # if isinstance(url_list,list): 238 | # flag = True 239 | # for url in url_list: 240 | # if not isinstance(url,str): 241 | # flag = False 242 | # if flag == True: 243 | # self.url_list = url_list 244 | 245 | # def thread_num(num): 246 | # if isinstance(num,int): 247 | # if 1 <= num <= 20: 248 | # self.thread_num = num 249 | # else: 250 | # print('[Creeper] Please make threads number between 1 and 20 ') 251 | # else: 252 | # print('[Creeper] threads number shoud be a integer ') 253 | 254 | # def __get(self,tname,lock,url='',headers='',params=''): 255 | # '单个线程的get方法' 256 | # if url != '': 257 | # self.set_url(url) 258 | # if headers != '': 259 | # self.set_headers(headers) 260 | # if params != '': 261 | # self.set_params(params) 262 | # response = self.__response('get',self.url,self.headers,self.params) 263 | # if response != False: 264 | # self.response = response 265 | # return response 266 | # else: 267 | # print('[Creeper] Failed to get response. ') 268 | # exit() 269 | 270 | # def getall(self,url_list='',headers='',params=''): 271 | # if url_list != '': 272 | # self.set_url_list(url_list) 273 | # if headers != '': 274 | # self.set_headers(headers) 275 | # if params != '': 276 | # self.set_params(params) 277 | # self.url_queue = list2queue(self.url_list) 278 | # self.lock = threading.Lock() 279 | # self.thread_list = [] 280 | # for i in range(self.thread_num): 281 | # thread = MultiThread(self.__get,'t'+str(i),lock,(self.url_queue,savepath,headers)) 282 | # thread_list.append(thread) 283 | # thread.start() 284 | # for thread in thread_list: 285 | # thread.join() 286 | 287 | 288 | 289 | 290 | 291 | # **************** Creepers ********************* 292 | 293 | class Creeper_imgs(Creeper): 294 | '专门用于多个链接图片的下载' 295 | def __init__(self,url_list,dirpath='',headers='',params=''): 296 | super(Creeper_imgs,self).__init__(self,'',headers,params) 297 | self.url = None 298 | self.url_list = None 299 | self.set_url_list(url_list) 300 | self.dirpath = None 301 | if dirpath != '': 302 | self.set_dirpath(dirpath) 303 | self.done_list = [] 304 | self.failed_dict = {} 305 | self.task_total = len(self.url_list) 306 | self.lock = None 307 | self.thread_list = [] 308 | self.thread_num = 10 309 | self.task_done = 0 310 | self.task_last = 0 311 | self.filename_list = None 312 | 313 | def set_url_list(self,url_list): 314 | if isinstance(url_list,list): 315 | flag = True 316 | for url in url_list: 317 | if not isinstance(url,str) and not isinstance(url, unicode): 318 | flag = False 319 | if flag == True: 320 | self.url_list = url_list 321 | return True 322 | else: 323 | print('[Creeper] Wrong url_list Input') 324 | self.url_list = None 325 | return False 326 | 327 | def set_dirpath(self,dirpath): 328 | if not isinstance(dirpath,str) and not isinstance(dirpath,unicode): 329 | print('[Creeper] Wrong dirpath Input: Not a String') 330 | return False 331 
| else: 332 | flag = cmkdir(dirpath) 333 | if flag == True: 334 | self.dirpath = dirpath 335 | return True 336 | else: 337 | print('[Creeper] Wrong dirpath Input: Can Not Create Path') 338 | return False 339 | 340 | def set_filename(self,filename_list): 341 | if len(filename_list) == len(self.url_list): 342 | self.filename_list = filename_list 343 | 344 | def save(self,dirpath='',url_list='',headers='',params=''): 345 | flag = True 346 | if url_list != '': 347 | flag = self.set_url_list(url_list) 348 | if headers != '': 349 | self.set_headers(headers) 350 | if params != '': 351 | self.set_params(params) 352 | if dirpath != '': 353 | flag = self.set_dirpath(dirpath) 354 | elif self.dirpath == None: 355 | print('[Creeper] No dirpath exsits') 356 | flag = False 357 | if flag == True: 358 | self.__load_log() 359 | # deal with url_list and task_last 360 | url_list_checked = [] 361 | for imgurl in self.url_list: 362 | if not imgurl in self.done_list: 363 | url_list_checked.append(imgurl) 364 | self.url_list = url_list_checked 365 | # download th failed ones first 366 | self.failed_list = self.failed_dict.values() 367 | for url_list in (self.failed_list, self.url_list): 368 | list_length = len(url_list) 369 | self.task_total = list_length 370 | url_queue = list2queue(url_list) 371 | self.lock = threading.Lock() 372 | self.thread_list = [] 373 | for i in range(self.thread_num): 374 | if url_list is self.url_list: 375 | thread = Creeper_thread(self.__save_thread,'thread'+'{:0>2}'.format(i),self.lock,args=(url_queue,self.dirpath,headers,params)) 376 | else: 377 | # thread = Creeper_thread(self.__save_thread,'thread'+'{:0>2}'.format(i),self.lock,args=(url_queue,self.dirpath,headers,params),flag_failed=True) 378 | thread = Creeper_thread(self.__save_thread,'thread'+'{:0>2}'.format(i),self.lock,args=(url_queue,self.dirpath,headers,params)) 379 | self.thread_list.append(thread) 380 | thread.start() 381 | for thread in self.thread_list: 382 | thread.join() 383 | print('[Creeper][%s] All Thread Exit !' % (time.asctime()[11:19])) 384 | print('[Creeper][%s] All Task Done !' % (time.asctime()[11:19])) 385 | 386 | 387 | 388 | def __save_thread(self,tname,lock,args,flag_failed=False): 389 | (url_queue,dirpath,headers,params) = args 390 | self.lock.acquire() 391 | print('[Creeper][%s][%s] Thread Start !' % (time.asctime()[11:19], tname)) 392 | self.lock.release() 393 | while True: 394 | if url_queue.empty() == True: 395 | break 396 | imgurl = url_queue.get() 397 | self.lock.acquire() 398 | print('[Creeper][%s][%s] Getting %s ... !' % (time.asctime()[11:19], tname, imgurl)) 399 | self.lock.release() 400 | self.task_last = self.task_last + 1 401 | self.url = imgurl 402 | imgext = imgurl[imgurl.rfind('.'):] 403 | if flag_failed == True: 404 | imgname = self.failed_list[imgurl] 405 | else: 406 | if self.filename_list == None: 407 | imgname = format_num(self.task_last-1,len(str(self.task_total))) + imgext 408 | else: 409 | imgname = self.filename_list[self.url_list.index(imgurl)] 410 | flag = self.__save_img(imgurl, dirpath, imgname, headers, params) 411 | self.task_done = self.task_done + 1 412 | if not imgurl in self.failed_list: 413 | self.failed_dict[imgurl] = imgname 414 | if flag == True: 415 | self.lock.acquire() 416 | print('[Creeper][%s][%s] Successfully saved image %s as %s (%d/%d) !' 
% (time.asctime()[11:19], tname, imgurl, imgname, self.task_done, self.task_total)) 417 | self.lock.release() 418 | self.done_list.append(imgurl) 419 | if imgurl in self.failed_dict: 420 | self.failed_dict.pop(imgurl) 421 | self.__write_log() 422 | if flag == False: 423 | self.lock.acquire() 424 | print('[Creeper][%s][%s] Failed saved image %s as %s (%d/%d) !' % (time.asctime()[11:19], tname, imgurl, imgname, self.task_done, self.task_total)) 425 | self.lock.release() 426 | 427 | self.__write_log() 428 | self.lock.acquire() 429 | print('[Creeper][%s][%s] Thread Exit !' % (time.asctime()[11:19], tname)) 430 | self.lock.release() 431 | 432 | def __save_img(self,imgurl,dirpath,imgname,headers='',params=''): 433 | '稳定高效的下载图片方法(多次尝试失败后跳过)' 434 | max_try_times = 10 # 最大尝试次数 435 | sleep_time = 0 # 尝试失败延时 436 | self.max_try_times = 10 437 | self.wait_time = 5 438 | self.sleep_time = 0 439 | imgpath = dirpath + os.sep + imgname 440 | # flag_gif = False 441 | # if imgname[-4:] == '.gif': 442 | # flag_gif = True 443 | for times in range(1,max_try_times+1): 444 | # print('[%s][INFO] The %s time try begin ...' % (time.asctime()[11:19], times)) 445 | try: 446 | # if flag_gif == False: 447 | # print('notgif') 448 | response = self.get(imgurl,headers=headers,params=params,stream=True) 449 | if response.status_code != requests.codes.ok: 450 | print('[Creeper] 404 Client Error') 451 | imgurl = imgurl[:-4] + '.png' 452 | response = self.get(imgurl,headers=headers,params=params,stream=True) 453 | img = Image.open(StringIO(response.content)) 454 | img.save(imgpath) 455 | img.close() 456 | 457 | with open(imgpath, 'wb') as f: 458 | for chunk in response.iter_content(chunk_size=1024): 459 | if chunk: # filter out keep-alive new chunks 460 | f.write(chunk) 461 | f.flush() 462 | f.close() 463 | 464 | # else: 465 | # print('isgif') 466 | # urllib.urlretrieve(imgurl,imgpath) #暂时不能反反盗链 467 | # print('[%s][INFO] The %s time try success!' % (time.asctime()[11:19], times)) 468 | return True 469 | except: 470 | if times < max_try_times: 471 | # print('[%s][WARN][IMG] The %s time try failed!' 
% (time.asctime()[11:19], times)) 472 | time.sleep(sleep_time) 473 | continue 474 | else: 475 | traceback.print_exc() 476 | break 477 | return False 478 | 479 | def __write_log(self): 480 | log_dict = { 481 | 'done_list':self.done_list, 482 | 'failed_dict':self.failed_dict, 483 | 'task_last':self.task_last-1 484 | } 485 | logpath = self.dirpath + os.sep + 'log.pkl' 486 | try: 487 | out_file = open(logpath, 'wb') 488 | pickle.dump(log_dict,out_file) 489 | return True 490 | except: 491 | traceback.print_exc() 492 | return None 493 | finally: 494 | out_file.close() 495 | 496 | def __load_log(self): 497 | logpath = self.dirpath + os.sep + 'log.pkl' 498 | if os.path.exists(logpath) and os.path.isfile(logpath): 499 | print('[Creeper] Found Log File, Reading ...') 500 | try: 501 | in_file = open(logpath, 'rb') 502 | except: 503 | traceback.print_exc() 504 | return None 505 | dic = pickle.load(in_file) 506 | self.done_list = dic['done_list'] 507 | self.failed_dict = dic['failed_dict'] 508 | self.task_last = dic['task_last'] 509 | else: 510 | print('[Creeper] Not Found Log File, Will Create a New One') 511 | self.done_list = [] 512 | self.failed_dict = {} 513 | self.task_last = 0 514 | cmkdir(self.dirpath) 515 | 516 | 517 | # **************** Creeper_thread ********************* 518 | 519 | class Creeper_thread(threading.Thread): 520 | '可传递参数、指定运行函数的线程类' 521 | def __init__(self,target,tname,lock,args): 522 | super(Creeper_thread,self).__init__() 523 | self.setDaemon(True) 524 | self.target = target 525 | self.tname = tname 526 | self.lock = lock 527 | self.args = args 528 | 529 | def run(self): 530 | self.target(self.tname,self.lock,self.args) 531 | 532 | # **************** Single Functions ********************* 533 | 534 | # def string_removeall(string,ele): 535 | # if isinstance(ele,str) == True or isinstance(ele,unicode) == True: 536 | # while True: 537 | # if ele in string: 538 | # string = string[:string.index(ele)] + string[string.index(ele)+len(ele):] 539 | # else: 540 | # break 541 | # return string 542 | # elif isinstance(ele,list) or isinstance(ele,tuple): 543 | # for eele in ele: 544 | # while True: 545 | # if eele in string: 546 | # string = string[:string.index(eele)] + string[string.index(eele)+len(eele):] 547 | # else: 548 | # break 549 | # return string 550 | 551 | def string_removeall(string,ele): 552 | if isinstance(ele,str) == True or isinstance(ele,unicode) == True: 553 | return string.replace(ele,'') 554 | elif isinstance(ele,list) or isinstance(ele,tuple): 555 | for eele in ele: 556 | string = string.replace(eele,'') 557 | return string 558 | 559 | 560 | def list_remove(alist,ele): 561 | try: 562 | alist.remove(ele) 563 | return True 564 | except: 565 | return False 566 | 567 | def format_num(num,length): 568 | string = str(num) 569 | for i in range(length-len(string)): 570 | string = '0' + string 571 | return string 572 | 573 | def cmkdir(path): 574 | try: 575 | if(os.path.exists(path)==False or os.path.isdir(path)==False): 576 | os.mkdir(path) 577 | except: 578 | traceback.print_exc() 579 | return False 580 | return True 581 | 582 | def re_search(pattern, text): 583 | if len(text) < 1: 584 | print('[Creeper] Invalid Text') 585 | return '' 586 | regex = re.search(pattern, text, re.S) 587 | try: 588 | get = regex.group(1).strip() 589 | return get 590 | except: 591 | print('[Creeper] No Search') 592 | return '' 593 | 594 | def list2queue(alist): 595 | queue = Queue() 596 | for ele in alist: 597 | queue.put(ele) 598 | return queue 599 | 600 | re_digits = re.compile(r'(\d+)') 601 
| 602 | def emb_numbers(s): 603 | pieces=re_digits.split(s) 604 | pieces[1::2]=map(int,pieces[1::2]) 605 | return pieces 606 | 607 | def lsort(alist): 608 | return sorted(alist, key=emb_numbers) 609 | 610 | def save(target,dirpath='',url_list='',headers='',params=''): 611 | '多线程下载方法' 612 | flag = True 613 | if url_list != '': 614 | flag = self.set_url_list(url_list) 615 | if headers != '': 616 | self.set_headers(headers) 617 | if params != '': 618 | self.set_params(params) 619 | if dirpath != '': 620 | flag = self.set_dirpath(dirpath) 621 | elif self.dirpath == None: 622 | print('[Creeper] No dirpath exsits') 623 | flag = False 624 | if flag == True: 625 | self.__load_log() 626 | # deal with url_list and task_last 627 | url_list_checked = [] 628 | for imgurl in self.url_list: 629 | if not imgurl in self.done_list: 630 | url_list_checked.append(imgurl) 631 | self.url_list = url_list_checked 632 | # download th failed ones first 633 | self.failed_list = self.failed_dict.values() 634 | for url_list in (self.failed_list, self.url_list): 635 | list_length = len(url_list) 636 | self.task_total = list_length 637 | if list_length == 0: 638 | continue 639 | elif 0 < list_length <= 20: 640 | self.thread_num = 5 641 | else: 642 | self.thread_num = 10 643 | url_queue = list2queue(url_list) 644 | self.lock = threading.Lock() 645 | self.thread_list = [] 646 | for i in range(self.thread_num): 647 | if url_list is self.url_list: 648 | thread = Creeper_thread(target,'thread'+'{:0>2}'.format(i),self.lock,args=(url_queue,self.dirpath,headers,params)) 649 | else: 650 | thread = Creeper_thread(target,'thread'+'{:0>2}'.format(i),self.lock,args=(url_queue,self.dirpath,headers,params),flag_failed=True) 651 | self.thread_list.append(thread) 652 | thread.start() 653 | for thread in self.thread_list: 654 | thread.join() 655 | print('[Creeper][%s] All Thread Exit !' % (time.asctime()[11:19])) 656 | print('[Creeper][%s] All Task Done !' 
% (time.asctime()[11:19])) 657 | 658 | def re_find(pattern,text,report=True): 659 | if len(text) < 1: 660 | print('[Creeper] Invalid Text') 661 | return '' 662 | regex = re.search(pattern, text, re.S) 663 | num = pattern.count('(') 664 | try: 665 | if num == 1: 666 | get = regex.group(1) 667 | else: 668 | get = regex.groups() 669 | return get.strip() 670 | except: 671 | if report == True: 672 | print('[Creeper] No Found') 673 | return '' 674 | 675 | def re_findall(pattern,text,report=True): 676 | if len(text) < 1: 677 | print('[Creeper] Invalid Text') 678 | return '' 679 | get_list = re.findall(pattern, text, re.S) 680 | find_list = [] 681 | for get in get_list: 682 | find_list.append(get.strip()) 683 | if len(find_list) > 0: 684 | return find_list 685 | else: 686 | if report == True: 687 | print('[Creeper] No Found') 688 | return '' 689 | 690 | def list_remove_same(alist): 691 | newlist = [] 692 | for ele in alist: 693 | if ele not in newlist: 694 | newlist.append(ele) 695 | return newlist 696 | 697 | def num_format(num,total): 698 | bitnum = len(str(total)) 699 | if bitnum == 1: 700 | return '{:0>1}'.format(num) 701 | elif bitnum == 2: 702 | return '{:0>2}'.format(num) 703 | elif bitnum == 3: 704 | return '{:0>3}'.format(num) 705 | elif bitnum == 4: 706 | return '{:0>4}'.format(num) 707 | elif bitnum == 5: 708 | return '{:0>5}'.format(num) 709 | elif bitnum == 6: 710 | return '{:0>6}'.format(num) 711 | elif bitnum == 7: 712 | return '{:0>7}'.format(num) 713 | elif bitnum == 8: 714 | return '{:0>8}'.format(num) 715 | elif bitnum == 9: 716 | return '{:0>9}'.format(num) 717 | 718 | # ****************** testing ******************** 719 | 720 | def main(): 721 | print num_format(12,300) 722 | 723 | if __name__ == '__main__': 724 | main() 725 | 726 | -------------------------------------------------------------------------------- /creeper.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pingze-github/weibo-photo-downloader/566d78b65e3adfe745f498405b16f4245a46c098/creeper.pyc -------------------------------------------------------------------------------- /weibo-photo-downloader.py: -------------------------------------------------------------------------------- 1 | # coding=u8 2 | 3 | "功能" 4 | ''' 5 | 获取新浪微博用户相册照片到本地 6 | ''' 7 | 8 | "使用方法" 9 | ''' 10 | 1.填写储存目录 11 | 2.指定微博用户id 12 | 3.填写cookie 13 | 4.运行 14 | ''' 15 | 16 | # ---------------------------------------------------------------|| 初始参数 ||---------------------------------------------------------------------- 17 | dirpath = r'images' #储存目录 18 | uid = '1005052495158140' #用户id 19 | cookie = 'SINAGLOBAL=7096614666686.993.1478933549913; SCF=As9EXPnkiArRA4WnZDDqKwxMqDuhByIuF7xOWaFyllSVBrRIucqvH7G019xOOW-DvOwyS980Z-qhDhvjQ5KADjM.; SUHB=0C1CIC7sshc16C; ALF=1511684170; SUB=_2AkMvZk4nf8NhqwJRmP4VxGjkaoR1yw_EieLBAH7sJRMxHRl-yT83qm4ytRBGWF_rq7FhGYThrYAZmIqeJ6jpng..; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFToROOH4BacWGBOsPOl71T; _s_tentry=auto.ifeng.com; Apache=1350895953090.9514.1481955434953; ULV=1481955434972:11:2:1:1350895953090.9514.1481955434953:1481022799070; YF-Page-G0=734c07cbfd1a4edf254d8b9173a162eb; UOR=,,www.baidu.com' #cookie 20 | 21 | # 说明: 22 | # 1.uid可在页面html文档中搜索"oid",获取其值 23 | # 2.cookies可在登录新浪微博后观测浏览器记录的cookies信息获得 24 | 25 | import os 26 | import requests 27 | import urllib 28 | import re 29 | import pickle 30 | import traceback 31 | import time 32 | from PIL import Image 33 | from StringIO import StringIO 34 | import creeper 35 | 36 | # global 37 | total_num = 
0 38 | 39 | def list_find(alist,ele): 40 | try: 41 | return alist.index(ele) 42 | except: 43 | return -1 44 | 45 | def get_response(url,cookies='',headers='',params='', stream=False, mtt=20, wt=2, st=0.25): 46 | '稳定高效的获取响应方法' 47 | max_try_times = mtt # 最大尝试次数 48 | wait_time = wt # 最大单次尝试时间 49 | sleep_time = st # 尝试失败延时 50 | #print('[%s][INFO] Start trying to connect ...' % time.asctime()[11:19]) 51 | for times in range(1,max_try_times+1): 52 | # print('[%s][INFO] The %s time try begin ...' % (time.asctime()[11:19], times)) 53 | try: 54 | response = requests.get(url, timeout=wait_time, cookies=cookies, headers=headers, params=params, stream=stream) 55 | # print('[%s][INFO] The %s time try success!' % (time.asctime()[11:19], times)) 56 | break 57 | except: 58 | #traceback.print_exc() 59 | if times < max_try_times: 60 | # print('[%s][WARN] The %s time try failed!' % (time.asctime()[11:19], times)) 61 | time.sleep(sleep_time) 62 | continue 63 | else: 64 | print('[%s][ERROR] The last try failed at last , exit pro ...' % time.asctime()[11:19]) 65 | traceback.print_exc() 66 | exit() 67 | # print('[%s][INFO] Successfully get the response!' % time.asctime()[11:19]) 68 | response.encoding = 'u8' 69 | return response 70 | 71 | 72 | 73 | def retrieve(imgurl,imgpath): 74 | '稳定高效的下载图片方法(多次尝试失败后跳过)' 75 | max_try_times = 5 # 最大尝试次数 76 | wait_time = 15 # 最大单次尝试时间 77 | sleep_time = 3 # 尝试失败延时 78 | import socket 79 | socket.setdefaulttimeout(wait_time) 80 | #print('[%s][INFO] Start trying to connect ...' % time.asctime()[11:19]) 81 | for times in range(1,max_try_times+1): 82 | # print('[%s][INFO] The %s time try begin ...' % (time.asctime()[11:19], times)) 83 | try: 84 | urllib.urlretrieve(imgurl,imgpath) 85 | # print('[%s][INFO] The %s time try success!' % (time.asctime()[11:19], times)) 86 | return True 87 | except: 88 | if times < max_try_times: 89 | # print('[%s][WARN] The %s time try failed!' % (time.asctime()[11:19], times)) 90 | time.sleep(sleep_time) 91 | continue 92 | else: 93 | print('[%s][ERROR] The last try failed at last , pass ...' % time.asctime()[11:19]) 94 | break 95 | return False 96 | # print('[%s][INFO] Successfully get the response!' % time.asctime()[11:19]) 97 | 98 | def save_img2(imgurl, imgpath, headers='', params=''): 99 | '稳定高效的下载图片方法(多次尝试失败后跳过)' 100 | max_try_times = 10 # 最大尝试次数 101 | sleep_time = 0 # 尝试失败延时 102 | print('[%s][INFO] Start trying to download ...' % time.asctime()[11:19]) 103 | for times in range(1,max_try_times+1): 104 | # print('[%s][INFO] The %s time try begin ...' % (time.asctime()[11:19], times)) 105 | try: 106 | # __save_img2(imgurl, dirpath, imgname, headers, params) 107 | response = get_response(imgurl, headers=headers,params=params, stream=False, mtt=10, wt=15, st=2) 108 | img = Image.open(StringIO(response.content)) 109 | img.save(imgpath) 110 | img.close() 111 | # print('[%s][INFO] The %s time try success!' % (time.asctime()[11:19], times)) 112 | return True 113 | except: 114 | traceback.print_exc() 115 | if times < max_try_times: 116 | print('[%s][WARN][IMG] The %s time try failed!' % (time.asctime()[11:19], times)) 117 | time.sleep(sleep_time) 118 | continue 119 | else: 120 | print('[%s][ERROR] The last try failed at last , pass ...' 
% time.asctime()[11:19])
121 |                 break
122 |     return False
123 | 
124 | def save_img(imgurl,savepath,imgname):
125 |     'Save an image into the local directory'
126 |     imgext = imgurl[-4:]    # assumes a three-character extension such as .jpg or .gif
127 |     imgname = imgname + imgext
128 |     flag = retrieve(imgurl,savepath+os.sep+imgname)
129 |     if flag == True:
130 |         return True
131 |     else:
132 |         return False
133 | 
134 | 
135 | def secp(string,pattern1,pattern2=''):
136 |     'Replace every occurrence of pattern1 in string with pattern2 (inefficient)'
137 |     while True:
138 |         index = string.find(pattern1)
139 |         if index > -1:
140 |             string = string[:index]+pattern2+string[index+len(pattern1):]
141 |         else:
142 |             break
143 |     return string
144 | 
145 | def url_deal(url):
146 |     'Clean up an image URL: drop escaping backslashes and switch thumb300 to large'
147 |     urld = secp(url,'\\')
148 |     urld = secp(urld,'thumb300','large')
149 |     return urld
150 | 
151 | def get_imgurl(html):
152 |     'Parse the HTML and collect the list of image URLs'
153 |     imgurl_list = []
154 |     extlist = ['jpg','gif','png']
155 |     for ext in extlist:
156 |         pattern = r'class=\\\"photo_pict\\\" src=\\\"(http:\S+thumb300\S+.'+ext+')'
157 |         result = re.findall(pattern,html,re.S)
158 |         if len(result) > 0:
159 |             for url in result:
160 |                 imgurl_list.append(url_deal(url))
161 |     return imgurl_list
162 | 
163 | 
164 | 
165 | def save_log(dic, path):
166 |     'Store a dict at the target path as a pickle file'
167 |     try:
168 |         out_file = open(path, 'wb')
169 |         pickle.dump(dic,out_file)
170 |         return path
171 |     except:
172 |         traceback.print_exc()
173 |         return None
174 |     finally:
175 |         out_file.close()
176 | 
177 | def load_log(path):
178 |     'Read a pickle file from the given path back into a dict'
179 |     try:
180 |         in_file = open(path, 'rb')
181 |         dic = pickle.load(in_file)
182 |         return dic
183 |     except:
184 |         traceback.print_exc()
185 |         return None
186 | 
187 | def re_search(pattern, text):
188 |     regex = re.search(pattern, text, re.S)
189 |     try:
190 |         get = regex.group(1).strip()
191 |         return get
192 |     except:
193 |         return ''
194 | 
195 | def main():
196 |     creeper.cmkdir("./images")
197 |     url1 = 'http://www.weibo.com/u/' + uid
198 |     url2 = 'http://www.weibo.com/p/' + uid
199 |     url3 = 'http://www.weibo.com/' + uid
200 | 
201 |     cookies = dict(cookies_are=cookie) # use cookies alone
202 |     print('[%s][INFO] Pro starting ...' % (time.asctime()[11:19]))
203 |     for url in (url1,url2,url3):
204 |         print('[%s][INFO] Start analysis at %s ...' % (time.asctime()[11:19], url))
205 |         response = requests.get(url, cookies=cookies)
206 |         html = response.text
207 |         page_id = re_search("page_id']='(\d+)';",html)
208 |         if len(page_id) > 0:
209 |             print('[%s][INFO] Successfully get page_id %s ...' % (time.asctime()[11:19], page_id))
210 |             break
211 |     url = 'http://www.weibo.com/p/'+str(page_id)+'/photos'
212 |     'Fetch the album page and get its HTML document'
213 |     response = get_response(url, cookies=cookies)
214 |     response.encoding = 'u8'
215 |     html = response.text
216 |     'Check that the HTML is valid; if it is not, report an error and stop'
217 |     if len(re.findall('thumb300',html,re.S)) < 1 and len(re.findall('oid',html,re.S)) < 1 and len(re.findall(u'的微博',html,re.S)) < 1:
218 |         print('[%s][ERROR] Invalid cookies or page_id, please check !' % (time.asctime()[11:19]))
219 |         exit()
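# The rest of main() works as follows: the first /photos response already carries one
# batch of thumb300 image URLs plus a since_id token. Each pass of the loop below sends
# that token as a query parameter to http://weibo.com/p/aj/album/loading (retrying while
# Weibo answers with its sorry?sysbusy page), appends the newly found URLs, and repeats
# until no since_id is returned. The collected URLs are then reversed into publication
# order and handed to creeper.Creeper_imgs for the multi-threaded, resumable download.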
220 |     'Parse the document to get the user name and the image URL list'
221 |     uname = re.findall(u'content="(.+?),',html,re.S)[0]
222 |     imgurl_list = get_imgurl(html)
223 |     'Dynamic-loading loop'
224 |     while True:
225 |         'Get the since_id, then fetch the next dynamically loaded page'
226 |         result = re.findall('since_id=(\S+)">',html,re.S)
227 |         if len(result)>0:
228 |             since_id = result[0][:-1]
229 |         else:
230 |             break
231 |         #print(since_id)
232 |         payload={
233 |             'since_id': since_id,
234 |             'page_id': page_id,
235 |             'ajax_call': 1
236 |         }
237 |         url = 'http://weibo.com/p/aj/album/loading'
238 |         while True:
239 |             response = get_response(url,params=payload,cookies=cookies)
240 |             if response.url != 'http://weibo.com/sorry?sysbusy':
241 |                 break
242 |         html = response.text
243 |         print('[%s][INFO] Got new page of %s !' % (time.asctime()[11:19], response.url))
244 |         'Parse the document and collect its image URLs'
245 |         imgurl_list = imgurl_list + get_imgurl(html)
246 |     #pprint(imgurl_list)
247 |     savepath = dirpath + os.sep + uname
248 |     print('[%s][INFO] Got savepath %s !' % (time.asctime()[11:19], savepath))
249 |     if(os.path.exists(savepath)==False or os.path.isdir(savepath)==False):
250 |         os.mkdir(savepath)
251 |     imgurl_list.reverse()
252 |     global total_num
253 |     total_num = len(imgurl_list)
254 |     print('[%s][INFO] Got all images, total %d !' % (time.asctime()[11:19], len(imgurl_list)))
255 | 
256 |     cp = creeper.Creeper_imgs(imgurl_list)
257 |     cp.save(savepath)
258 | 
259 | if __name__ == '__main__':
260 |     main()
261 | 
262 | 
--------------------------------------------------------------------------------
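For reference, creeper.py can also be driven on its own. The snippet below is a minimal sketch of the same calls weibo-photo-downloader.py makes; the image URLs are placeholders, and the module targets Python 2 (StringIO, Queue), so it will not run unmodified on Python 3.

```python
# Minimal standalone use of creeper.Creeper_imgs (Python 2, placeholder URLs).
import creeper

imgurl_list = [
    'http://ww1.sinaimg.cn/large/example_photo_1.jpg',  # placeholder, not a real photo
    'http://ww1.sinaimg.cn/large/example_photo_2.jpg',
]

savepath = 'images'
creeper.cmkdir(savepath)                # create the target directory if it does not exist

cp = creeper.Creeper_imgs(imgurl_list)  # one downloader instance for the whole URL list
cp.save(savepath)                       # multi-threaded download; progress is printed and a
                                        # log.pkl resume file is written into savepath
```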