├── README.md
├── aio_mm131.py
├── images
├── biehuang1.jpeg
├── mm131_number.png
├── nengyong.jpg
└── www.jpg
├── lib
├── req.py
└── utils.py
├── main.py
├── requirements.txt
└── thread_mm131.py
/README.md:
--------------------------------------------------------------------------------
1 | ## MM131妹子图片批量下载爬虫py脚本
2 |
3 | 爬取网站:[MM131](http://mm131.com)
4 |
5 | 爬了**2000**套妹子图集 将近**10万**张,共**8.5个G** (图为我的腾讯云cos存储
6 |
7 | 
8 |
9 | 
10 |
11 | 最开始的版本其实是先解析页面再提取url链接逐个请求,
12 | 后来发现了图片的url规律:
13 | url变量只有末尾的: id/num
14 |
15 | 然后发现对req header请求头伪装一下UA用户代理和链接所在文档位置Referer 就可以直接对图片进行请求,这就很舒服~
16 |
17 | 再配合上多进程+协程的一个库[aiomultiprocess](https://github.com/jreese/aiomultiprocess)进行异步请求,concurrent包的futures线程池进行并发爬取,爬取速度效率大幅提升。
18 |
19 | ## Usage:
20 | 1.安装依赖(Python3):
21 | > pip install -r requirements.txt
22 |
23 | 2.
24 | 运行脚本,爬虫有两个版本
25 | ~~windows建议 运行多线程版本: **thread_mm131.py**~~
26 | ~~linux/os x 运行 多进程+协程版本: **aio_mm131.py** 或前者皆可~~
27 |
28 | - <=2019.3.23=>
29 | - 更新依赖支持python3.7
30 | - <=2018 12.1=>
31 | - 自动获取网站最新更新
32 | - 中断下载后再次下载会继续上次的进度
33 | - 自动选择不同系统合适的下载方法
34 |
35 | 只需
36 | > python main.py
37 |
38 |
39 | 来不及解释了,快上车!!
40 |
41 |
42 | 
43 |
44 |
45 | ### 有问题可以提issue,欢迎老司机们 star,fork ~
--------------------------------------------------------------------------------
/aio_mm131.py:
--------------------------------------------------------------------------------
1 | from lib.utils import set_header
2 | from aiomultiprocess import Pool
3 | import aiohttp,asyncio,time,os
4 |
class Aio_mm(object):
    """Multiprocess + coroutine downloader for mm131 image galleries.

    Each image URL has the form
    ``http://img1.mm131.me/pic/<gallery_id>/<page>.jpg``; images are saved
    under ``<mm_folder>/<gallery_id>/<page>.jpg``.
    """

    def __init__(self):
        # Root folder for downloads; number of pages tried per gallery.
        self.mm_folder = 'mm131/'
        self.each_limit = 60

    async def async_get(self, url):
        """Fetch one image URL and save it to disk.

        Returns '404 not found!' for missing pages, otherwise None after
        the image file has been written.
        """
        # Parse gallery id and page number from the URL structure instead
        # of fixed character offsets: the original url[24:29]/url[30:-4]
        # slices only worked for 4-digit gallery ids and captured a
        # leading '/', producing a doubled slash in the save path.
        _, gallery, page = url.rsplit('/', 2)
        page = page[:-4]  # strip the trailing '.jpg'

        folder = os.path.join(self.mm_folder, gallery)
        # exist_ok avoids the check-then-create race when several pool
        # workers hit the same gallery concurrently.
        os.makedirs(folder, exist_ok=True)

        async with aiohttp.ClientSession() as session:
            print('Waiting for', url)
            response = await session.get(url, headers=set_header(url))
            # Check the status before reading the body so a 404 page is
            # never downloaded or written to disk.
            if response.status == 404:
                return '404 not found!'
            pic = await response.read()
            print('Get res from', url, 'Result:', response.status, 'ok!')

        with open(os.path.join(folder, page + '.jpg'), 'wb') as fp:
            fp.write(pic)

    async def makeurl(self, sta, end, limit):
        """Build every candidate image URL for galleries sta..end and fan
        the downloads out over an aiomultiprocess Pool."""
        urls = ['http://img1.mm131.me/pic/%d/%d.jpg' % (i, j)
                for i in range(sta, end + 1)
                for j in range(1, limit)]
        return await Pool().map(self.async_get, urls)

    def go_start(self, begin, end):
        """Synchronous entry point: download galleries begin..end."""
        task = asyncio.ensure_future(self.makeurl(begin, end, self.each_limit))
        loop = asyncio.get_event_loop()
        loop.run_until_complete(task)
38 |
if __name__ == '__main__':
    # Interactive entry point: read the start/end gallery ids, then time
    # the whole crawl.
    first, last = (int(part) for part in
                   input('输入mm起始编号和结束编号 以空格隔开:').split(' '))
    crawler = Aio_mm()
    began = time.time()
    crawler.go_start(first, last)
    print('爬取任务已完成,消耗时间:', time.time() - began)
--------------------------------------------------------------------------------
/images/biehuang1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwertyuiop6/mm131/da81e039ae4cc5d9fdcee6d46b90e77a1cfa417e/images/biehuang1.jpeg
--------------------------------------------------------------------------------
/images/mm131_number.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwertyuiop6/mm131/da81e039ae4cc5d9fdcee6d46b90e77a1cfa417e/images/mm131_number.png
--------------------------------------------------------------------------------
/images/nengyong.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwertyuiop6/mm131/da81e039ae4cc5d9fdcee6d46b90e77a1cfa417e/images/nengyong.jpg
--------------------------------------------------------------------------------
/images/www.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qwertyuiop6/mm131/da81e039ae4cc5d9fdcee6d46b90e77a1cfa417e/images/www.jpg
--------------------------------------------------------------------------------
/lib/req.py:
--------------------------------------------------------------------------------
1 | import requests,os
2 | from .utils import set_header
3 |
class Request(object):
    """Blocking (requests-based) downloader for single mm131 images.

    Driven by thread_mm131.Thread_mm through a ThreadPoolExecutor, so the
    methods must tolerate concurrent calls for the same gallery.
    """

    def __init__(self):
        # Root folder that per-gallery sub-folders are created under.
        self.folder = 'mm131/'

    def save(self, pic, i, j):
        """Write image bytes *pic* to <folder>/<i>/<j>.jpg; return 'saved!'.

        i is the gallery id and j the page number (both strings).
        """
        # exist_ok=True fixes the check-then-create race of the original
        # `if not exists: makedirs` — with 100 worker threads two of them
        # could both pass the check and one would crash on FileExistsError.
        os.makedirs(self.folder + i, exist_ok=True)
        with open(self.folder + '%s/%s.jpg' % (i, j), 'wb') as pp:
            pp.write(pic)
        return 'saved!'

    def get(self, url, i, j):
        """GET *url* with forged headers and persist the image.

        Returns '404 not found!' for missing images, 'saved!' on success,
        and None for any other status (behaviour preserved from the
        original implementation).
        """
        response = requests.get(url, headers=set_header(url))
        # Inspect the status before touching the body so nothing is ever
        # written for a 404 page.
        if response.status_code == 404:
            return '404 not found!'
        elif response.status_code == 200:
            return self.save(response.content, i, j)

    def requrl(self, ij):
        """Download one image given the concatenated id string *ij*.

        NOTE(review): the first four characters are taken as the gallery
        id, so this assumes 4-digit ids — verify once ids reach 10000.
        """
        i = ij[:4]
        j = ij[4:]
        url = 'http://img1.mm131.me/pic/' + i + '/' + j + '.jpg'
        print('正在请求-->', url)
        result = self.get(url, i, j)
        print('获取到结果:-->', url, '-->', result)
32 |
--------------------------------------------------------------------------------
/lib/utils.py:
--------------------------------------------------------------------------------
1 | import os,requests
2 | from lxml import etree
3 |
def set_header(referer):
    """Build request headers that impersonate a desktop Chrome browser.

    mm131 rejects image requests without a plausible Referer, so it is
    derived from the image URL itself via set_referer().
    """
    return {
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,ja;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Referer': set_referer(referer),
    }
16 |
def set_referer(src):
    """Derive the gallery page URL an image URL *src* belongs to.

    src looks like http://img1.mm131.me/pic/<id>/<page>.jpg; the matching
    document is .../<id>.html for page 1 and .../<id>_<page>.html for
    every later page.
    """
    # Parse from the right so gallery ids of any length work (the original
    # src[25:-4] slice hardcoded the URL prefix length).
    gallery, page = src[:-4].rsplit('/', 2)[-2:]
    # BUG FIX: the original compared the string src[-5:-4] to the int 1,
    # which is always False, so page 1 never got its suffix-less referer —
    # and matching only the last digit would also misfire on 11, 21, ...
    if page == '1':
        return 'http://www.mm131.com/xinggan/' + gallery + '.html'
    return 'http://www.mm131.com/xinggan/' + gallery + '_' + page + '.html'
22 |
def getmmdir(folder, default=2400):
    """Return the highest gallery id already downloaded under *folder*.

    Used to resume an interrupted crawl. Falls back to *default* when the
    folder is missing or empty.
    """
    entries = os.listdir(folder) if os.path.exists(folder) else []
    if entries:
        # BUG FIX: max() over directory-name strings is lexicographic, so
        # '999' would beat '1000' and the crawl would resume at the wrong
        # (too low) id once ids crossed a digit-length boundary. Compare
        # numerically instead.
        return max(int(name) for name in entries)
    return default
28 |
def getnew(default=4600):
    """Return the newest gallery id published on the mm131 index page.

    Best-effort: any network or parse failure falls back to *default*.
    """
    try:
        url = 'http://www.mm131.com/xinggan/'
        content = etree.HTML(requests.get(url).text)
        href = content.xpath('//dl[@class="list-left public-box"]//dd[1]/a/@href')[0]
        # Extract the id from e.g. '/xinggan/2400.html' structurally — the
        # original href[-9:-5] slice silently broke for ids that are not
        # exactly four digits long.
        newid = href.rsplit('/', 1)[-1].split('.')[0]
        return int(newid)
    except Exception:
        return default
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import time,os
2 | from aio_mm131 import Aio_mm
3 | from thread_mm131 import Thread_mm
4 | from lib.utils import getmmdir,getnew
5 |
def start(begin, end):
    """Crawl galleries begin..end with the platform-appropriate backend.

    Windows ('nt') gets the thread-pool version; every other OS uses the
    multiprocess + coroutine version.
    """
    began = time.time()
    crawler = Thread_mm() if os.name == 'nt' else Aio_mm()
    crawler.go_start(begin, end)
    print('爬取任务已完成,消耗时间:', time.time() - began)
16 |
17 |
if __name__ == '__main__':
    # Resume from the highest gallery already on disk and crawl up to the
    # newest id published on the site.
    folder = 'mm131/'
    finished = getmmdir(folder)
    newid = getnew()
    start(finished, newid)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.4.4
2 | aiomultiprocess==0.5.0
3 | # asyncio is part of the Python 3 standard library; the PyPI 'asyncio' backport must not be installed (it breaks on Python 3.7)
4 | requests==2.20.0
5 | lxml==4.2.5
--------------------------------------------------------------------------------
/thread_mm131.py:
--------------------------------------------------------------------------------
1 | from concurrent import futures
2 | import requests,os,time
3 | from lib.req import Request
4 |
class Thread_mm(object):
    """Thread-pool downloader for mm131 galleries (Windows-friendly path).

    Delegates the per-image HTTP work to lib.req.Request.
    """

    def __init__(self):
        # Root download folder and number of pages tried per gallery.
        self.folder = 'mm131/'
        self.each_limit = 60

    def go_start(self, begin, end, wokers=100, **kw):
        """Download galleries begin..end using a pool of worker threads.

        NOTE(review): 'wokers' is a typo for 'workers'; it is kept as-is
        to stay keyword-compatible with any existing callers.
        """
        self.req_obj = Request()
        # Each work item is the gallery id concatenated with the page
        # number; Request.requrl() splits it apart again.
        jobs = [str(i) + str(j)
                for i in range(begin, end + 1)
                for j in range(1, self.each_limit)]
        with futures.ThreadPoolExecutor(wokers) as pool:
            pool.map(self.req_obj.requrl, jobs)
34 |
if __name__ == '__main__':
    # Interactive entry point: read the id range, then time the crawl.
    crawler = Thread_mm()
    first, last = (int(x) for x in
                   input('输入mm起始编号和结束编号 以空格隔开:').split(' '))
    began = time.time()
    crawler.go_start(first, last)
    print('爬取任务已完成,消耗时间:', time.time() - began)
--------------------------------------------------------------------------------