├── README.md
├── aio_mm131.py
├── images
│   ├── biehuang1.jpeg
│   ├── mm131_number.png
│   ├── nengyong.jpg
│   └── www.jpg
├── lib
│   ├── req.py
│   └── utils.py
├── main.py
├── requirements.txt
└── thread_mm131.py

/README.md:
--------------------------------------------------------------------------------
## MM131 bulk photo downloader (Python crawler)

Target site: [MM131](http://mm131.com)

It has crawled roughly **2,000** photo sets, close to **100,000** images, about **8.5 GB** in total (the screenshot below shows my Tencent Cloud COS bucket).

![](images/mm131_number.png)

![](images/biehuang1.jpeg)

The very first version parsed each gallery page, extracted the image URLs and requested them one by one.
Then I noticed the pattern in the image URLs: the only variable part is the trailing `id/num`, i.e. the gallery id and the picture number.

It also turned out that spoofing just two request headers, a browser User-Agent and a Referer pointing at the page the image lives on, is enough to request the images directly. Very comfortable~
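For illustration, a single direct request looks roughly like this (a minimal sketch; the gallery id and picture number are made-up example values, and the real header logic lives in `lib/utils.py` and `lib/req.py` further down):

```python
# Minimal sketch of the direct-request trick; 2345 and 7 are made-up example values.
import requests

gallery_id, pic_num = 2345, 7
url = 'http://img1.mm131.me/pic/%d/%d.jpg' % (gallery_id, pic_num)

headers = {
    # Pretend to be a regular browser ...
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    # ... and claim the request comes from the gallery page the image belongs to.
    'Referer': 'http://www.mm131.com/xinggan/%d_%d.html' % (gallery_id, pic_num),
}

resp = requests.get(url, headers=headers)
if resp.status_code == 200:
    with open('%d_%d.jpg' % (gallery_id, pic_num), 'wb') as f:
        f.write(resp.content)
```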
On top of that, [aiomultiprocess](https://github.com/jreese/aiomultiprocess), a library that combines multiprocessing with coroutines, is used for asynchronous requests, and a thread pool from the `concurrent.futures` package handles concurrent crawling, which speeds the crawl up considerably.
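The concurrency pattern boils down to the sketch below (a simplified, hypothetical example: `fetch_status` and the placeholder URLs are made up for illustration, while `aio_mm131.py` further down is the real implementation and uses the same `Pool().map(...)` call shape):

```python
# Hypothetical sketch of the multiprocessing + coroutine pattern used in aio_mm131.py.
import asyncio
import aiohttp
from aiomultiprocess import Pool


async def fetch_status(url):
    # Each call runs inside a worker process, on that process's own event loop.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return resp.status


async def crawl(urls):
    # Same call shape as aio_mm131.py: map the coroutine over all URLs via a process pool.
    return await Pool().map(fetch_status, urls)


if __name__ == '__main__':
    urls = ['http://www.mm131.com/xinggan/'] * 3  # placeholder URLs
    loop = asyncio.get_event_loop()
    print(loop.run_until_complete(crawl(urls)))
```

On Windows the thread-pool route in `thread_mm131.py` is used instead; the multiprocessing variant is only recommended on Linux / macOS (see the usage notes below).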
## Usage:

1. Install the dependencies (Python 3):
> pip install -r requirements.txt

2. Run the crawler. There are two versions:<br>
~~On Windows, run the multi-threaded version: **thread_mm131.py**~~<br>
~~On Linux / macOS, run the multiprocessing + coroutine version: **aio_mm131.py** (the threaded one works there as well)~~

- <=2019.3.23=>
  - Updated the dependencies to support Python 3.7
- <=2018.12.1=>
  - Automatically picks up the latest galleries on the site
  - Resumes from the previous progress when an interrupted download is restarted
  - Automatically chooses the download method that suits the operating system
Now all you need is:
> python main.py

No time to explain, get in the car!!

![](images/www.jpg)

### Open an issue if you run into problems; stars and forks are welcome ~

--------------------------------------------------------------------------------
/aio_mm131.py:
--------------------------------------------------------------------------------
from lib.utils import set_header
from aiomultiprocess import Pool
import aiohttp, asyncio, time, os


class Aio_mm(object):
    """Multiprocessing + coroutine downloader (used on Linux / macOS)."""

    def __init__(self):
        self.mm_folder = 'mm131/'
        # Picture numbers 1..59 are tried for every gallery.
        self.each_limit = 60

    async def async_get(self, url):
        # URL shape: http://img1.mm131.me/pic/<gallery id>/<pic number>.jpg
        _, i, name = url.rsplit('/', 2)
        j = name[:-4]

        # exist_ok avoids a race when several worker processes hit the same gallery.
        os.makedirs(self.mm_folder + i, exist_ok=True)

        async with aiohttp.ClientSession() as session:
            print('Waiting for', url)
            response = await session.get(url, headers=set_header(url))
            if response.status == 404:
                return '404 not found!'
            pic = await response.read()
            print('Get res from', url, 'Result:', response.status, 'ok!')

            with open(self.mm_folder + '%s/%s.jpg' % (i, j), 'wb') as pp:
                pp.write(pic)

    async def makeurl(self, sta, end, limit):
        urls = ['http://img1.mm131.me/pic/' + str(i) + '/' + str(j) + '.jpg'
                for i in range(sta, end + 1) for j in range(1, limit)]
        # aiomultiprocess spreads the coroutine calls over several worker processes.
        return await Pool().map(self.async_get, urls)

    def go_start(self, begin, end):
        task = asyncio.ensure_future(self.makeurl(begin, end, self.each_limit))
        loop = asyncio.get_event_loop()
        loop.run_until_complete(task)


if __name__ == '__main__':
    sta, end = map(int, input('Enter the start and end gallery ids, separated by a space: ').split(' '))
    app = Aio_mm()
    start_time = time.time()
    app.go_start(sta, end)
    end_time = time.time()
    print('Crawl finished, time spent:', end_time - start_time)

--------------------------------------------------------------------------------
/images/biehuang1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwertyuiop6/mm131/da81e039ae4cc5d9fdcee6d46b90e77a1cfa417e/images/biehuang1.jpeg

--------------------------------------------------------------------------------
/images/mm131_number.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwertyuiop6/mm131/da81e039ae4cc5d9fdcee6d46b90e77a1cfa417e/images/mm131_number.png

--------------------------------------------------------------------------------
/images/nengyong.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwertyuiop6/mm131/da81e039ae4cc5d9fdcee6d46b90e77a1cfa417e/images/nengyong.jpg

--------------------------------------------------------------------------------
/images/www.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwertyuiop6/mm131/da81e039ae4cc5d9fdcee6d46b90e77a1cfa417e/images/www.jpg

--------------------------------------------------------------------------------
/lib/req.py:
--------------------------------------------------------------------------------
import requests, os
from .utils import set_header


class Request(object):
    """Blocking per-URL downloader used by the thread-pool version."""

    def __init__(self):
        self.folder = 'mm131/'

    def save(self, pic, i, j):
        # exist_ok avoids a race when several threads hit the same gallery.
        os.makedirs(self.folder + i, exist_ok=True)
        with open(self.folder + '%s/%s.jpg' % (i, j), 'wb') as pp:
            pp.write(pic)
        return 'saved!'

    def get(self, url, i, j):
        response = requests.get(url, headers=set_header(url))
        if response.status_code == 404:
            return '404 not found!'
        elif response.status_code == 200:
            return self.save(response.content, i, j)

    def requrl(self, ij):
        # ij is the 4-digit gallery id followed by the picture number.
        i = ij[:4]
        j = ij[4:]
        url = 'http://img1.mm131.me/pic/' + i + '/' + j + '.jpg'
        print('Requesting -->', url)
        result = self.get(url, i, j)
        print('Got result -->', url, '-->', result)
--------------------------------------------------------------------------------
/lib/utils.py:
--------------------------------------------------------------------------------
import os, requests
from lxml import etree


def set_header(referer):
    """Build request headers with a browser User-Agent and a matching Referer."""
    headers = {
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,ja;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Referer': set_referer(referer),
    }
    return headers


def set_referer(src):
    # src looks like http://img1.mm131.me/pic/<gallery id>/<pic number>.jpg
    ref = src[25:-4].split('/')
    if ref[1] == '1':
        # The first picture belongs to the gallery's front page.
        return 'http://www.mm131.com/xinggan/' + ref[0] + '.html'
    return 'http://www.mm131.com/xinggan/' + ref[0] + '_' + ref[1] + '.html'


def getmmdir(folder, default=2400):
    """Return the largest gallery id already on disk, so the crawl can resume there."""
    if os.path.exists(folder) and len(os.listdir(folder)) > 0:
        # String comparison is fine here because all gallery ids have 4 digits.
        return int(max(os.listdir(folder)))
    else:
        return default


def getnew(default=4600):
    """Fetch the listing page and return the newest gallery id on the site."""
    try:
        url = 'http://www.mm131.com/xinggan/'
        content = etree.HTML(requests.get(url).text)
        href = content.xpath('//dl[@class="list-left public-box"]//dd[1]/a/@href')[0]
        newid = href[-9:-5]
        return int(newid)
    except Exception:
        return default

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import time, os
from aio_mm131 import Aio_mm
from thread_mm131 import Thread_mm
from lib.utils import getmmdir, getnew


def start(begin, end):
    start_time = time.time()
    if os.name == 'nt':
        # Windows: thread-pool version.
        app = Thread_mm()
        app.go_start(begin, end)
    else:
        # Linux / macOS: multiprocessing + coroutine version.
        app = Aio_mm()
        app.go_start(begin, end)
    end_time = time.time()
    print('Crawl finished, time spent:', end_time - start_time)


if __name__ == '__main__':
    # config = {
    #     'mm_folder': 'mm131/',
    #     'each_limit': 60
    # }
    folder = 'mm131/'
    # Resume from the largest gallery id on disk; stop at the site's newest one.
    finished, newid = getmmdir(folder), getnew()
    start(finished, newid)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.4.4
aiomultiprocess==0.5.0
asyncio==3.4.3
requests==2.20.0
lxml==4.2.5
--------------------------------------------------------------------------------
/thread_mm131.py:
--------------------------------------------------------------------------------
from concurrent import futures
import time
from lib.req import Request


class Thread_mm(object):
    """Thread-pool downloader (used on Windows); the per-URL logic lives in lib/req.py."""

    def __init__(self):
        self.folder = 'mm131/'
        # Picture numbers 1..59 are tried for every gallery.
        self.each_limit = 60

    def go_start(self, begin, end, workers=100, **kw):
        self.req_obj = Request()
        with futures.ThreadPoolExecutor(workers) as e:
            e.map(self.req_obj.requrl,
                  [str(i) + str(j) for i in range(begin, end + 1) for j in range(1, self.each_limit)])


if __name__ == '__main__':
    app = Thread_mm()
    sta, end = map(int, input('Enter the start and end gallery ids, separated by a space: ').split(' '))
    start_time = time.time()
    app.go_start(sta, end)
    end_time = time.time()
    print('Crawl finished, time spent:', end_time - start_time)

--------------------------------------------------------------------------------