├── instruction.png
├── .gitignore
├── README.md
└── spider.py

/instruction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vansl/JiandanSpider/HEAD/instruction.png
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
.vscode/settings.json
.vscode/tasks.json
.vscode/launch.json
.vscode/extensions.json
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# JiandanSpider
An image crawler for the jandan.net "ooxx" (girl-picture) board. It solves the problem of fetching pictures under the site's new anti-scraping mechanism.

## Workflow:
![workflow diagram](https://github.com/van1997/JiandanSpider/blob/master/instruction.png)

## Detailed write-up:
CSDN: http://blog.csdn.net/van_brilliant/article/details/78723878
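
## Decoding sketch:
Since the site change of 2018/5/25, each `.img-hash` element simply carries the Base64-encoded image URL with its `=` padding stripped. Below is a minimal stand-alone sketch of that decoding step; the function name and the example hash are illustrative only, not part of spider.py:

```python
import base64

def decode_img_hash(img_hash: str) -> str:
    """Restore the stripped Base64 padding, then decode the img-hash into a URL."""
    img_hash += '=' * (-len(img_hash) % 4)   # re-append the 0-3 missing '=' characters
    return base64.b64decode(img_hash).decode('utf8')

# Hypothetical hash for illustration; real ones come from the page's .img-hash elements.
print(decode_img_hash('Ly93eDMuc2luYWltZy5jbi9tdzYwMC9hYmMuanBn'))
# -> //wx3.sinaimg.cn/mw600/abc.jpg
```

spider.py prepends `http:` to the decoded value before queueing it for download.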
--------------------------------------------------------------------------------

/spider.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
import hashlib
import base64
from bs4 import BeautifulSoup
import requests
import re
import os
import queue
import threading
import math

'''
URL decoding
'''
def parse(imgHash, constant):
    # Since 2018/5/25 the img-hash is plain Base64; `constant` is kept only so the
    # old call site still works, it is no longer used by the decoding itself.
    return decode_base64(imgHash).decode('utf8')

'''
The block below is the original decoding routine. The site changed its obfuscation
on 2018/5/25, so it no longer works and is kept only for reference.

q = 4
constant = md5(constant)
o = md5(constant[0:16])
n = md5(constant[16:32])
l = imgHash[0:q]
c = o + md5(o + l)
imgHash = imgHash[q:]
k = decode_base64(imgHash)
h = list(range(256))
b = list(range(256))

for g in range(0, 256):
    b[g] = ord(c[g % len(c)])

f = 0
for g in range(0, 256):
    f = (f + h[g] + b[g]) % 256
    tmp = h[g]
    h[g] = h[f]
    h[f] = tmp

result = ""
p = 0
f = 0
for g in range(0, len(k)):
    p = (p + 1) % 256
    f = (f + h[p]) % 256
    tmp = h[p]
    h[p] = h[f]
    h[f] = tmp
    result += chr(k[g] ^ (h[(h[p] + h[f]) % 256]))

result = result[26:]
return result
'''

def md5(src):
    m = hashlib.md5()
    m.update(src.encode("utf8"))
    return m.hexdigest()

def decode_base64(data):
    # Restore the '=' padding that the site strips from the Base64 string.
    missing_padding = -len(data) % 4
    if missing_padding:
        data += '=' * missing_padding
    return base64.b64decode(data)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}

'''
Page-scraping class
'''
class Spider(threading.Thread):

    def __init__(self, pages, proxies, url_manager):
        threading.Thread.__init__(self)
        self.pages = pages
        self.proxies = proxies
        self.url_manager = url_manager

    def get_Page(self, page, proxies, url_manager):
        bs_page = BeautifulSoup(page, "lxml")

        '''
        Find the address of the JS file referenced by the page, which contains the constant.
        '''
        try:
            # The original pattern was damaged in transit; this rebuilt one captures the
            # scheme-relative src of a <script src="//...js"></script> tag.
            model = re.findall(r'<script\ssrc="//(\S+?\.js)"></script>', page)
            jsfile_url = "http://" + model[-1]   # the page may contain two addresses; take the last match
        except Exception as e:
            print(e)
            return
        jsfile = requests.get(jsfile_url, headers=headers, proxies=proxies, timeout=3).text

        constant = re.search(r'.*remove\(\);var\sc=\w+\(e,\"(\w+)\".*', jsfile).group(1)
        '''
        Pass the constant and each img-hash to parse() to obtain the image URL.
        '''
        for item in bs_page.select('.img-hash'):
            img_url = 'http:' + parse(item.text, constant)
            url_manager.addNewUrl(img_url)

    def run(self):
        for page in self.pages:
            self.get_Page(page, self.proxies, self.url_manager)

'''
Program entry point
'''
def main(amount):
    url_manager = UrlManager()
    proxies = {'http': ''}   # IP-proxy support not added yet; the program runs fine without it

    current_url = 'http://jandan.net/ooxx'   # URL of the current page
    '''
    Collect the page sources, then scrape them with multiple threads.
    '''
    pages = []   # source of every page to be scraped
    try:
        for i in range(amount):
            current_page = requests.get(current_url, headers=headers).text   # source of the current page
            pages.append(current_page)
            # Extract the URL of the next (older) page.
            current_url = 'http:' + re.search(r'.*Older\sComments\"\shref=\"(.*?)\"\sclass.*', current_page).group(1)
    except Exception:
        pass   # stop collecting if a page or its "Older Comments" link cannot be fetched

    page_threads = []
    t_amount = 10 if len(pages) > 10 else len(pages)   # number of page-scraping threads
    chunk = math.ceil(len(pages) / t_amount) if t_amount else 0   # pages handled per thread
    for i in range(t_amount):
        t = Spider(pages[chunk * i:chunk * (i + 1)], proxies, url_manager)
        page_threads.append(t)
    for t in page_threads:
        t.start()
    for t in page_threads:
        t.join()

    img_threads = []
    for i in range(10):   # a fixed pool of 10 threads downloads the images
        t = Download(url_manager)
        img_threads.append(t)
    for t in img_threads:
        t.start()
    for t in img_threads:
        t.join()

L = threading.Lock()

'''
Image download class
'''
class Download(threading.Thread):
    def __init__(self, url_manager):
        threading.Thread.__init__(self)
        self.url_manager = url_manager
        self.pic_headers = dict(headers)   # copy so the shared headers dict is not mutated
        self.pic_headers['Host'] = 'wx3.sinaimg.cn'   # images are hosted on sinaimg.cn

    def download_Img(self, url):
        # For GIFs, swap the size directory for 'large' to get the full-size animation.
        isGif = re.match(r'(.*\.sinaimg\.cn\/)(\w+)(\/.+\.gif)', url)
        if isGif:
            url = isGif.group(1) + 'large' + isGif.group(3)

        extensionName = re.match(r'.*(\.\w+)', url).group(1)   # image file extension

        with L:
            if not os.path.exists('img'):
                os.mkdir('img')
            with open('img/' + str(len(os.listdir('./img'))) + extensionName, 'wb') as f:
                f.write(requests.get(url, headers=self.pic_headers).content)

    def run(self):
        while not self.url_manager.isEmpty():
            try:
                imgUrl = self.url_manager.getNewUrl()
            except queue.Empty:
                break   # another thread drained the queue between the check and the get
            self.download_Img(imgUrl)
            self.url_manager.addOldUrl(imgUrl)

'''
URL repository: hands out new URLs and records the ones already downloaded.
'''
class UrlManager:
    def __init__(self):
        self.url_used = []
        self.url_target = queue.Queue()
        if os.path.exists('url.txt'):
            with open('url.txt', 'r') as f:
                for eachline in f.readlines():
                    self.url_used.append(eachline.strip())
        else:
            open('url.txt', 'w').close()

    def getNewUrl(self):
        # Non-blocking: raises queue.Empty once the queue is drained, so download
        # threads can exit instead of waiting forever.
        return self.url_target.get_nowait()

    def isEmpty(self):
        return self.url_target.empty()

    def addNewUrl(self, newUrl):
        if newUrl not in self.url_used:
            self.url_target.put(newUrl)

    def addOldUrl(self, oldUrl):
        self.url_used.append(oldUrl)
        with open('url.txt', 'a') as f:
            f.write(oldUrl + '\n')

if __name__ == '__main__':

    amount = input('Enter the number of pages to crawl (less than 100, counted from the front page), then press Enter: ')
    main(int(amount))   # crawl the images on the first `amount` pages, starting from the front page

--------------------------------------------------------------------------------