├── README.md
├── aio_mm131.py
├── images
│   ├── biehuang1.jpeg
│   ├── mm131_number.png
│   ├── nengyong.jpg
│   └── www.jpg
├── lib
│   ├── req.py
│   └── utils.py
├── main.py
├── requirements.txt
└── thread_mm131.py

/README.md:
--------------------------------------------------------------------------------
## MM131 bulk photo downloader (Python crawler)

Target site: [MM131](http://mm131.com)

It has crawled roughly **2,000** photo sets, close to **100,000** images, about **8.5 GB** in total (the screenshot below shows my Tencent Cloud COS bucket).

![](images/mm131_number.png)

![](images/biehuang1.jpeg)

The very first version parsed each gallery page, extracted the image URLs and requested them one by one.
Then I noticed the pattern in the image URLs: the only variable part is the trailing `id/num`, i.e. the gallery id and the picture number.

It also turned out that spoofing just two request headers, a browser User-Agent and a Referer pointing at the page the image lives on, is enough to request the images directly. Very comfortable~
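For illustration, a single direct request looks roughly like this (a minimal sketch; the gallery id and picture number are made-up example values, and the real header logic lives in `lib/utils.py` and `lib/req.py` further down):

```python
# Minimal sketch of the direct-request trick; 2345 and 7 are made-up example values.
import requests

gallery_id, pic_num = 2345, 7
url = 'http://img1.mm131.me/pic/%d/%d.jpg' % (gallery_id, pic_num)

headers = {
    # Pretend to be a regular browser ...
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    # ... and claim the request comes from the gallery page the image belongs to.
    'Referer': 'http://www.mm131.com/xinggan/%d_%d.html' % (gallery_id, pic_num),
}

resp = requests.get(url, headers=headers)
if resp.status_code == 200:
    with open('%d_%d.jpg' % (gallery_id, pic_num), 'wb') as f:
        f.write(resp.content)
```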
On top of that, [aiomultiprocess](https://github.com/jreese/aiomultiprocess), a library that combines multiprocessing with coroutines, is used for asynchronous requests, and a thread pool from the `concurrent.futures` package handles concurrent crawling, which speeds the crawl up considerably.
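The concurrency pattern boils down to the sketch below (a simplified, hypothetical example: `fetch_status` and the placeholder URLs are made up for illustration, while `aio_mm131.py` further down is the real implementation and uses the same `Pool().map(...)` call shape):

```python
# Hypothetical sketch of the multiprocessing + coroutine pattern used in aio_mm131.py.
import asyncio
import aiohttp
from aiomultiprocess import Pool


async def fetch_status(url):
    # Each call runs inside a worker process, on that process's own event loop.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return resp.status


async def crawl(urls):
    # Same call shape as aio_mm131.py: map the coroutine over all URLs via a process pool.
    return await Pool().map(fetch_status, urls)


if __name__ == '__main__':
    urls = ['http://www.mm131.com/xinggan/'] * 3  # placeholder URLs
    loop = asyncio.get_event_loop()
    print(loop.run_until_complete(crawl(urls)))
```

On Windows the thread-pool route in `thread_mm131.py` is used instead; the multiprocessing variant is only recommended on Linux / macOS (see the usage notes below).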
## Usage:

1. Install the dependencies (Python 3):
> pip install -r requirements.txt

2. Run the crawler. There are two versions:<br>
~~On Windows, run the multi-threaded version: **thread_mm131.py**~~<br>
~~On Linux / macOS, run the multiprocessing + coroutine version: **aio_mm131.py** (the threaded one works there as well)~~

- <=2019.3.23=>
  - Updated the dependencies to support Python 3.7
- <=2018.12.1=>
  - Automatically picks up the latest galleries on the site
  - Resumes from the previous progress when an interrupted download is restarted
  - Automatically chooses the download method that suits the operating system
Now all you need is:
> python main.py

No time to explain, get in the car!!

![](images/www.jpg)

### Open an issue if you run into problems; stars and forks are welcome ~

--------------------------------------------------------------------------------
/aio_mm131.py:
--------------------------------------------------------------------------------
from lib.utils import set_header
from aiomultiprocess import Pool
import aiohttp, asyncio, time, os


class Aio_mm(object):
    """Multiprocessing + coroutine downloader (used on Linux / macOS)."""

    def __init__(self):
        self.mm_folder = 'mm131/'
        # Picture numbers 1..59 are tried for every gallery.
        self.each_limit = 60

    async def async_get(self, url):
        # URL shape: http://img1.mm131.me/pic/<gallery id>/<pic number>.jpg
        _, i, name = url.rsplit('/', 2)
        j = name[:-4]

        # exist_ok avoids a race when several worker processes hit the same gallery.
        os.makedirs(self.mm_folder + i, exist_ok=True)

        async with aiohttp.ClientSession() as session:
            print('Waiting for', url)
            response = await session.get(url, headers=set_header(url))
            if response.status == 404:
                return '404 not found!'
            pic = await response.read()
            print('Get res from', url, 'Result:', response.status, 'ok!')

            with open(self.mm_folder + '%s/%s.jpg' % (i, j), 'wb') as pp:
                pp.write(pic)

    async def makeurl(self, sta, end, limit):
        urls = ['http://img1.mm131.me/pic/' + str(i) + '/' + str(j) + '.jpg'
                for i in range(sta, end + 1) for j in range(1, limit)]
        # aiomultiprocess spreads the coroutine calls over several worker processes.
        return await Pool().map(self.async_get, urls)

    def go_start(self, begin, end):
        task = asyncio.ensure_future(self.makeurl(begin, end, self.each_limit))
        loop = asyncio.get_event_loop()
        loop.run_until_complete(task)


if __name__ == '__main__':
    sta, end = map(int, input('Enter the start and end gallery ids, separated by a space: ').split(' '))
    app = Aio_mm()
    start_time = time.time()
    app.go_start(sta, end)
    end_time = time.time()
    print('Crawl finished, time spent:', end_time - start_time)

--------------------------------------------------------------------------------
/images/biehuang1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwertyuiop6/mm131/da81e039ae4cc5d9fdcee6d46b90e77a1cfa417e/images/biehuang1.jpeg

--------------------------------------------------------------------------------
/images/mm131_number.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwertyuiop6/mm131/da81e039ae4cc5d9fdcee6d46b90e77a1cfa417e/images/mm131_number.png

--------------------------------------------------------------------------------
/images/nengyong.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwertyuiop6/mm131/da81e039ae4cc5d9fdcee6d46b90e77a1cfa417e/images/nengyong.jpg

--------------------------------------------------------------------------------
/images/www.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwertyuiop6/mm131/da81e039ae4cc5d9fdcee6d46b90e77a1cfa417e/images/www.jpg

--------------------------------------------------------------------------------
/lib/req.py:
--------------------------------------------------------------------------------
import requests, os
from .utils import set_header


class Request(object):
    """Blocking per-URL downloader used by the thread-pool version."""

    def __init__(self):
        self.folder = 'mm131/'

    def save(self, pic, i, j):
        # exist_ok avoids a race when several threads hit the same gallery.
        os.makedirs(self.folder + i, exist_ok=True)
        with open(self.folder + '%s/%s.jpg' % (i, j), 'wb') as pp:
            pp.write(pic)
        return 'saved!'

    def get(self, url, i, j):
        response = requests.get(url, headers=set_header(url))
        if response.status_code == 404:
            return '404 not found!'
        elif response.status_code == 200:
            return self.save(response.content, i, j)

    def requrl(self, ij):
        # ij is the 4-digit gallery id followed by the picture number.
        i = ij[:4]
        j = ij[4:]
        url = 'http://img1.mm131.me/pic/' + i + '/' + j + '.jpg'
        print('Requesting -->', url)
        result = self.get(url, i, j)
        print('Got result -->', url, '-->', result)
--------------------------------------------------------------------------------
/lib/utils.py:
--------------------------------------------------------------------------------
import os, requests
from lxml import etree


def set_header(referer):
    """Build request headers with a browser User-Agent and a matching Referer."""
    headers = {
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,ja;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Referer': set_referer(referer),
    }
    return headers


def set_referer(src):
    # src looks like http://img1.mm131.me/pic/<gallery id>/<pic number>.jpg
    ref = src[25:-4].split('/')
    if ref[1] == '1':
        # The first picture belongs to the gallery's front page.
        return 'http://www.mm131.com/xinggan/' + ref[0] + '.html'
    return 'http://www.mm131.com/xinggan/' + ref[0] + '_' + ref[1] + '.html'


def getmmdir(folder, default=2400):
    """Return the largest gallery id already on disk, so the crawl can resume there."""
    if os.path.exists(folder) and len(os.listdir(folder)) > 0:
        # String comparison is fine here because all gallery ids have 4 digits.
        return int(max(os.listdir(folder)))
    else:
        return default


def getnew(default=4600):
    """Fetch the listing page and return the newest gallery id on the site."""
    try:
        url = 'http://www.mm131.com/xinggan/'
        content = etree.HTML(requests.get(url).text)
        href = content.xpath('//dl[@class="list-left public-box"]//dd[1]/a/@href')[0]
        newid = href[-9:-5]
        return int(newid)
    except Exception:
        return default

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import time, os
from aio_mm131 import Aio_mm
from thread_mm131 import Thread_mm
from lib.utils import getmmdir, getnew


def start(begin, end):
    start_time = time.time()
    if os.name == 'nt':
        # Windows: thread-pool version.
        app = Thread_mm()
        app.go_start(begin, end)
    else:
        # Linux / macOS: multiprocessing + coroutine version.
        app = Aio_mm()
        app.go_start(begin, end)
    end_time = time.time()
    print('Crawl finished, time spent:', end_time - start_time)


if __name__ == '__main__':
    # config = {
    #     'mm_folder': 'mm131/',
    #     'each_limit': 60
    # }
    folder = 'mm131/'
    # Resume from the largest gallery id on disk; stop at the site's newest one.
    finished, newid = getmmdir(folder), getnew()
    start(finished, newid)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.4.4
aiomultiprocess==0.5.0
asyncio==3.4.3
requests==2.20.0
lxml==4.2.5
--------------------------------------------------------------------------------
/thread_mm131.py:
--------------------------------------------------------------------------------
from concurrent import futures
import time
from lib.req import Request


class Thread_mm(object):
    """Thread-pool downloader (used on Windows); the per-URL logic lives in lib/req.py."""

    def __init__(self):
        self.folder = 'mm131/'
        # Picture numbers 1..59 are tried for every gallery.
        self.each_limit = 60

    def go_start(self, begin, end, workers=100, **kw):
        self.req_obj = Request()
        with futures.ThreadPoolExecutor(workers) as e:
            e.map(self.req_obj.requrl,
                  [str(i) + str(j) for i in range(begin, end + 1) for j in range(1, self.each_limit)])


if __name__ == '__main__':
    app = Thread_mm()
    sta, end = map(int, input('Enter the start and end gallery ids, separated by a space: ').split(' '))
    start_time = time.time()
    app.go_start(sta, end)
    end_time = time.time()
    print('Crawl finished, time spent:', end_time - start_time)

--------------------------------------------------------------------------------