├── .gitignore ├── LICENSE ├── README.md ├── mz.py ├── requirements.txt └── screenshots ├── 1.png └── 2.png /.gitignore: -------------------------------------------------------------------------------- 1 | /Mzitu/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = 'ZYSzys'

import requests
from bs4 import BeautifulSoup
import os
import re


class Mz:
    """Scrape the front page of www.mzitu.com and download every photo album.

    Usage: call Domainhtml(), then Getmaxpage(), then Downloadimg().
    Images are saved under ./Mzitu/<album title>/<page>.jpg relative to
    the directory the script was started from.
    """

    def __init__(self):
        self.url = 'http://www.mzitu.com'
        # The site rejects hot-linked image requests without a matching
        # Referer header, so it is sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'Referer': 'http://www.mzitu.com/'
        }
        self.req = requests.session()  # one session reuses cookies/keep-alive
        self.all_a = []        # album page URLs collected from the front page
        self.all_a_title = []  # album titles, parallel to all_a
        self.all_a_max = []    # number of pages per album, parallel to all_a
        # Fix: the original unconditional os.makedirs raised OSError whenever
        # ./Mzitu already existed (i.e. on every run after the first).
        target = os.path.join(os.getcwd(), 'Mzitu')
        if not os.path.isdir(target):
            os.makedirs(target)
        os.chdir(target)
        self.initpwd = os.getcwd()

    def Domainhtml(self):
        """Collect every album URL from the front-page post list into self.all_a."""
        html = self.req.get(self.url, headers=self.headers)
        lis = BeautifulSoup(html.text, 'lxml').find('div', class_='postlist').find_all('li')
        for a in lis:
            self.all_a.append(a.find('a')['href'])

    def Getmaxpage(self):
        """For each collected album, record its title and its last page number."""
        for a in self.all_a:
            imghtml = self.req.get(a, headers=self.headers)
            # Parse each page once (the original built two soups per album).
            soup = BeautifulSoup(imghtml.text, 'lxml')
            title = soup.find('h2', class_='main-title').string
            spans = soup.find('div', class_='pagenavi').find_all('span')
            # The second-to-last <span> of the pager holds the last page number.
            last = int(spans[-2].string)
            # Fix: replace characters that are illegal in directory names so
            # the makedirs call in Downloadimg cannot fail on titles
            # containing e.g. '/' or '?'; also trim stray surrounding spaces.
            self.all_a_title.append(re.sub(r'[\\/:*?"<>|]', '_', title).strip())
            self.all_a_max.append(last)

    def Downloadimg(self):
        """Download every page of every album into ./<title>/<page>.jpg."""
        print('total: %s' % len(self.all_a))
        for cnt, a in enumerate(self.all_a):
            print('Downloading %s now...' % (cnt + 1))
            folder = os.path.join(os.getcwd(), self.all_a_title[cnt])
            # Skip creation when the folder survives from a previous run.
            if not os.path.isdir(folder):
                os.makedirs(folder)
            os.chdir(folder)
            for i in range(1, self.all_a_max[cnt] + 1):
                nurl = a + '/' + str(i)
                imghtml = self.req.get(nurl, headers=self.headers)
                src = BeautifulSoup(imghtml.text, 'lxml').find('div', class_='main-image').find('img')['src']
                img = self.req.get(src, headers=self.headers)
                # Fix: write in 'wb' mode — the original's 'ab' appended to
                # any pre-existing file, corrupting images on re-runs — and
                # use a context manager so the handle is always closed.
                with open(str(i) + '.jpg', 'wb') as f:
                    f.write(img.content)
            os.chdir(self.initpwd)
        # Fix: corrected 'Dowmload' typo in the completion message.
        print('Download completed!')


if __name__ == '__main__':
    test = Mz()
    test.Domainhtml()
    test.Getmaxpage()
    test.Downloadimg()
https://raw.githubusercontent.com/ZYSzys/Mzitu_Spider/2ed0f6142f0cfcfb0b727dbd934662508b071110/screenshots/2.png --------------------------------------------------------------------------------