├── 2.png
├── 3.png
├── 4.png
├── 5.png
├── 6.png
├── 7.png
├── 1240.png
├── .gitattributes
├── .gitignore
├── mm.py
└── README.md


/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/panacena/mmPictures/HEAD/2.png


--------------------------------------------------------------------------------
/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/panacena/mmPictures/HEAD/3.png


--------------------------------------------------------------------------------
/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/panacena/mmPictures/HEAD/4.png


--------------------------------------------------------------------------------
/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/panacena/mmPictures/HEAD/5.png


--------------------------------------------------------------------------------
/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/panacena/mmPictures/HEAD/6.png


--------------------------------------------------------------------------------
/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/panacena/mmPictures/HEAD/7.png


--------------------------------------------------------------------------------
/1240.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/panacena/mmPictures/HEAD/1240.png


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | 
 7 | # Standard to msysgit
 8 | *.doc	 diff=astextplain
 9 | *.DOC	 diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot  diff=astextplain
13 | *.DOT  diff=astextplain
14 | *.pdf  diff=astextplain
15 | *.PDF	 diff=astextplain
16 | *.rtf	 diff=astextplain
17 | *.RTF	 diff=astextplain
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | 
 27 | # PyInstaller
 28 | #  Usually these files are written by a python script from a template
 29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 30 | *.manifest
 31 | *.spec
 32 | 
 33 | # Installer logs
 34 | pip-log.txt
 35 | pip-delete-this-directory.txt
 36 | 
 37 | # Unit test / coverage reports
 38 | htmlcov/
 39 | .tox/
 40 | .coverage
 41 | .coverage.*
 42 | .cache
 43 | nosetests.xml
 44 | coverage.xml
 45 | *,cover
 46 | .hypothesis/
 47 | 
 48 | # Translations
 49 | *.mo
 50 | *.pot
 51 | 
 52 | # Django stuff:
 53 | *.log
 54 | local_settings.py
 55 | 
 56 | # Flask instance folder
 57 | instance/
 58 | 
 59 | # Scrapy stuff:
 60 | .scrapy
 61 | 
 62 | # Sphinx documentation
 63 | docs/_build/
 64 | 
 65 | # PyBuilder
 66 | target/
 67 | 
 68 | # IPython Notebook
 69 | .ipynb_checkpoints
 70 | 
 71 | # pyenv
 72 | .python-version
 73 | 
 74 | # celery beat schedule file
 75 | celerybeat-schedule
 76 | 
 77 | # dotenv
 78 | .env
 79 | 
 80 | # virtualenv
 81 | venv/
 82 | ENV/
 83 | 
 84 | # Spyder project settings
 85 | .spyderproject
 86 | 
 87 | # Rope project settings
 88 | .ropeproject
 89 | 
 90 | # =========================
 91 | # Operating System Files
 92 | # =========================
 93 | 
 94 | # OSX
 95 | # =========================
 96 | 
 97 | .DS_Store
 98 | .AppleDouble
 99 | .LSOverride
100 | 
101 | # Thumbnails
102 | ._*
103 | 
104 | # Files that might appear in the root of a volume
105 | .DocumentRevisions-V100
106 | .fseventsd
107 | .Spotlight-V100
108 | .TemporaryItems
109 | .Trashes
110 | .VolumeIcon.icns
111 | 
112 | # Directories potentially created on remote AFP share
113 | .AppleDB
114 | .AppleDesktop
115 | Network Trash Folder
116 | Temporary Items
117 | .apdisk
118 | 
119 | # Windows
120 | # =========================
121 | 
122 | # Windows image file caches
123 | Thumbs.db
124 | ehthumbs.db
125 | 
126 | # Folder config file
127 | Desktop.ini
128 | 
129 | # Recycle Bin used on file shares
130 | $RECYCLE.BIN/
131 | 
132 | # Windows Installer files
133 | *.cab
134 | *.msi
135 | *.msm
136 | *.msp
137 | 
138 | # Windows shortcuts
139 | *.lnk
140 | 


--------------------------------------------------------------------------------
/mm.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*-coding:utf-8-*-
  3 | import  urllib2
  4 | from lxml import etree
  5 | from os import system
  6 | """
  7 | 第一步: 从 http://www.zngirls.com/rank/sum/ 开始抓取MM点击头像的链接(注意是分页的)
  8 | #第二部  http://www.zngirls.com/girl/21751/ 抓取每一个写真集合的链接(注意是分页的)
  9 | #第三部 http://www.zngirls.com/g/19671/1.html 在写真图片的具体页面抓取图片(注意是分页的)
 10 | """
 11 | pciturelist=[]
 12 | 
 13 | 
 14 | header = {
 15 |             "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
 16 |             , "Connection": "keep-alive"
 17 |          }
 18 | 
 19 | 
 20 | """
 21 | 从起始页面 http://www.zngirls.com/rank/sum/ 开始获取排名的页数和每一页的url
 22 | 
 23 | """
 24 | def  mmRankSum():
 25 |     req = urllib2.Request("http://www.zngirls.com/rank/sum/", headers=header)
 26 |     html = urllib2.urlopen(req)
 27 |     htmldata = html.read()
 28 |     htmlpath = etree.HTML(htmldata)
 29 | 
 30 |     #首先获取页码数,然后用循环的方式挨个解析每一个页面
 31 |     pages = htmlpath.xpath('//div[@class="pagesYY"]/div/a/@href')
 32 | 
 33 | 
 34 |     for i in range( len(pages) -2 ):
 35 | 
 36 |         pagesitem="http://www.zngirls.com/rank/sum/"+ pages[i]
 37 |         mmRankitem(pagesitem)
 38 | 
 39 | """
 40 | 参数 url : 分页中每一页的具体url地址
 41 | 通过穿过来的参数，使用  lxml和xpath 解析 html，获取每一个MM写真专辑页面的url
 42 | 
 43 | """
 44 | def mmRankitem(url):
 45 |     req = urllib2.Request(url, headers=header)
 46 |     html = urllib2.urlopen(req)
 47 |     htmldata = html.read()
 48 |     htmlpath = etree.HTML(htmldata)
 49 | 
 50 |     pages = htmlpath.xpath('//div[@class="rankli_imgdiv"]/a/@href')
 51 |     for i in range(len(pages)):
 52 |        print  "http://www.zngirls.com/" + pages[i]+"album/"
 53 |        getAlbums("http://www.zngirls.com/" + pages[i]+"/album/")
 54 |        #print "http://www.zngirls.com/" + pages[i]
 55 | 
 56 | 
 57 | """
 58 | 参数 url : 每一个MM专辑的页面地址
 59 | 通过穿过来的参数，获取每一个MM写真专辑图片集合的地址
 60 | 
 61 | """
 62 | def getAlbums(girlUrl):
 63 |     req = urllib2.Request(girlUrl, headers=header)
 64 |     html = urllib2.urlopen(req)
 65 |     htmldata = html.read()
 66 |     htmlpath = etree.HTML(htmldata)
 67 | 
 68 |     pages = htmlpath.xpath('//div[@class="igalleryli_div"]/a/@href')
 69 |     for i in range(len(pages)):
 70 | 
 71 |         getPagePicturess("http://www.zngirls.com/" + pages[i])
 72 | 
 73 | 
 74 | """
 75 | 参数 url : 每一个MM写真专辑图片集合的地址
 76 | 通过穿过来的参数，首先先获取图片集合的页数，然后每一页解析写真图片的真实地址
 77 | 
 78 | """
 79 | def getPagePicturess(albumsurl):
 80 |     req = urllib2.Request(albumsurl, headers=header)
 81 |     html = urllib2.urlopen(req)
 82 |     htmldata = html.read()
 83 |     htmlpath = etree.HTML(htmldata)
 84 |     pages = htmlpath.xpath('//div[@id="pages"]/a/@href')
 85 |     for i in range(len(pages)-2):
 86 |         savePictures("http://www.zngirls.com" + pages[i])
 87 | 
 88 | """
 89 | 参数 url : 每一个MM写真专辑图片集合的地址(进过分页检测)
 90 | 通过穿过来的参数，直接解析页面，获取写真图片的地址，然后下载保存到本地。
 91 | 
 92 | """
 93 | def savePictures(itemPagesurl):
 94 |     header = {
 95 |         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
 96 |         , "Connection": "keep-alive"
 97 |         , "Referer": "image / webp, image / *, * / *;q = 0.8"
 98 |         ,"Accept":"image/webp,image/*,*/*;q=0.8"
 99 |     }
100 |     try:
101 |         req = urllib2.Request(itemPagesurl, headers=header)
102 |         html = urllib2.urlopen(req)
103 |         htmldata = html.read()
104 |         htmlpath = etree.HTML(htmldata)
105 |         print itemPagesurl
106 |         pages = htmlpath.xpath('//div[@class="gallery_wrapper"]/ul/img/@src')
107 |         names = htmlpath.xpath('//div[@class="gallery_wrapper"]/ul/img/@alt')
108 |     except Exception:
109 |         pass
110 |     for i in range(len(pages) ):
111 |         print pages[i]
112 |         pciturelist.append(pages[i])
113 | 
114 |         try:
115 | 
116 |             headers = {
117 |                 "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
118 |                 , "Connection": "keep-alive"
119 |                 , "Referer": pages[i]
120 |             }
121 |             req = urllib2.Request(pages[i], headers=headers)
122 | 
123 |             urlhtml = urllib2.urlopen(req)
124 | 
125 |             respHtml = urlhtml.read()
126 | 
127 |             binfile = open('%s.jpg' % ( names[i] ) , "wb")
128 |             binfile.write(respHtml);
129 |             binfile.close();
130 |         except Exception :
131 |             pass
132 | 
133 | 
134 | mmRankSum()
135 | """
136 | fl=open('list.txt', 'w')
137 | for i in pciturelist:
138 |     fl.write(i)
139 |     fl.write("\n")
140 | fl.close()
141 | print '关机ing'
142 | """
143 | print 'finish'
144 | system('shutdown -s')


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ###这篇文章干嘛的？
  2 | 本屌在上网时偶然看到一个图片网站，网站的尺度是这样的： 
  3 | 
  4 | 
  5 | 
  6 | ![图片站首页](https://raw.githubusercontent.com/panacena/mmPictures/master/1240.png)
  7 | 
  8 | 里面的美女**露骨而不露点**，简直是宅男福利。一时兴起，决定将网站上的图片down下来研究研究。正好最近在研究python，所以决定用这个抓取图片存到本地，将图片url存到 mongodb以防止以后用。
  9 | 
 10 | 源码项目github地址 ： https://github.com/panacena/mmPictures/
 11 | 本人博客地址 ：  http://www.jianshu.com/u/61f41588151d
 12 | ***
 13 | ####爬虫初识
 14 | 网络爬虫（又被称为网页蜘蛛，网络机器人，在FOAF社区中间，更经常的称为网页追逐者），是一种按照一定的规则，自动的抓取万维网信息的程序或者脚本。
 15 | 要学习Python爬虫，我们要学习的共有以下几点：
 16 | * Python基础知识
 17 | * Python中urllib和urllib2库的用法
 18 | * Python正则表达式
 19 | * Python爬虫框架Scrapy
 20 | * Python爬虫更高级的功能
 21 | 当然，我们今天暂时不需要使用框架进行爬取，只用urllib2等库进行操作。其它的一些关于爬虫的一些基本信息，可以在这位大神的博客中学习到。[学习爬虫点我](http://cuiqingcai.com/1052.html)。
 22 | ***
 23 | ####开始爬取
 24 | #####确定URL开始抓取
 25 | 1. 我们以 http://www.zngirls.com/rank/sum/ 为起始页面进行爬取，打开网页后右键查看源代码。
 26 | 打开这个url后，如图1。我们需要关心的是红色链接的内容。
 27 | ![图1](https://raw.githubusercontent.com/panacena/mmPictures/master/2.png)
 28 | 
 29 | 2. 这个如何查找呢?如果你用的是360浏览器，在MM图片那右击，选择“审查元素”。之后就可以查看到点击MM头像后跳转的url地址和MM头像的url地址  。图2：
 30 | ![图2](https://raw.githubusercontent.com/panacena/mmPictures/master/3.png)
 31 | 
 32 | 3 . 现在还是没有看到写真的图片，我们点击MM的头像，进入到了下图的页面  http://www.zngirls.com/girl/21751/ 可以看到如图3这个页面也没有写真的具体图片，只是写真的封面集合。不急，我们继续点击封面。
 33 | ![图3](https://raw.githubusercontent.com/panacena/mmPictures/master/4.png)
 34 | 4 .  点击封面后，http://www.zngirls.com/g/19671/1.html  进入的页面就可以看到写真的具体图片了。这时我们就可以爬取图片地址了。当然，这个也是分页的，所以也需要获取一共多少页以及每一页的url。
 35 | 
 36 | ![Paste_Image.png](https://raw.githubusercontent.com/panacena/mmPictures/master/5.png)
 37 | 
 38 | #####开始码代码吧
 39 | 从上面的步骤我们整理一下思路，大概分为以下三部：
 40 | * 第一步 从 http://www.zngirls.com/rank/sum/ 开始抓取MM点击头像的链接(注意是分页的)
 41 | * 第二部 从 http://www.zngirls.com/girl/21751/ 抓取每一个写真集合的链接(注意是分页的)
 42 | * 第三部 从 http://www.zngirls.com/g/19671/1.html 在写真图片的具体页面抓取图片(注意是分页的)
 43 | 
 44 | 
 45 | **1. 从起始页面 http://www.zngirls.com/rank/sum/ 开始首选先获取分页的页数以及每一页的url，方便下一步获取点击MM获取专辑url地址。接着解析每一页的html，获取每一页中点击MM头像后跳转的专辑集合页面。**
 46 | 
 47 | ```
 48 | """
 49 | 从起始页面 http://www.zngirls.com/rank/sum/ 开始获取排名的页数和每一页的url
 50 | 
 51 | """
 52 | def  mmRankSum():
 53 |     req = urllib2.Request("http://www.zngirls.com/rank/sum/", headers=header)
 54 |     html = urllib2.urlopen(req)
 55 |     htmldata = html.read()
 56 |     htmlpath = etree.HTML(htmldata)
 57 | 
 58 |     #首先获取页码数,然后用循环的方式挨个解析每一个页面
 59 |     pages = htmlpath.xpath('//div[@class="pagesYY"]/div/a/@href')
 60 | 
 61 | 
 62 |     for i in range( len(pages) -2 ):
 63 | 
 64 |         pagesitem="http://www.zngirls.com/rank/sum/"+ pages[i]
 65 |         mmRankitem(pagesitem)
 66 | 
 67 | """
 68 | 参数 url : 分页中每一页的具体url地址
 69 | 通过穿过来的参数，使用  lxml和xpath 解析 html，获取每一个MM写真专辑页面的url
 70 | 
 71 | """
 72 | def mmRankitem(url):
 73 |     req = urllib2.Request(url, headers=header)
 74 |     html = urllib2.urlopen(req)
 75 |     htmldata = html.read()
 76 |     htmlpath = etree.HTML(htmldata)
 77 | 
 78 |     pages = htmlpath.xpath('//div[@class="rankli_imgdiv"]/a/@href')
 79 |     for i in range(len(pages)):
 80 | 
 81 |         print "http://www.zngirls.com/" + pages[i]
 82 | ```
 83 | ***
 84 | **2. 从mmRankitem方法中获取到的url中解析每一个MM写真专辑图片的具体地址，也就是写真图片列表的页面。**
 85 | 
 86 | ```
 87 | """
 88 | 参数 albumsurl: 每一个MM专辑的页面地址
 89 | 通过穿过来的参数，获取每一个MM写真专辑图片集合的地址
 90 | 
 91 | """
 92 | def getAlbums(girlUrl):
 93 |     req = urllib2.Request(girlUrl, headers=header)
 94 |     html = urllib2.urlopen(req)
 95 |     htmldata = html.read()
 96 |     htmlpath = etree.HTML(htmldata)
 97 | 
 98 |     pages = htmlpath.xpath('//div[@class="igalleryli_div"]/a/@href')
 99 |     for i in range(len(pages)):
100 |          print  "http://www.zngirls.com/" + pages[i]+"album/"
101 |          getAlbums("http://www.zngirls.com/" + pages[i]+"/album/")
102 | ```
103 | 
104 | **3.  从每一页中获取图片的url，已经每一张图片的名称，方便下一步进行下载。**
105 | 
106 | 
107 | ```
108 | """
109 | 参数 url : 每一个MM写真专辑图片集合的地址
110 | 通过穿过来的参数，首先先获取图片集合的页数，然后每一页解析写真图片的真实地址
111 | 
112 | """
113 | def getPagePicturess(albumsurl):
114 |     req = urllib2.Request(albumsurl, headers=header)
115 |     html = urllib2.urlopen(req)
116 |     htmldata = html.read()
117 |     htmlpath = etree.HTML(htmldata)
118 |     pages = htmlpath.xpath('//div[@id="pages"]/a/@href')
119 |     for i in range(len(pages)-2):
120 |         savePictures("http://www.zngirls.com" + pages[i])
121 | 
122 | """
123 | 参数 url : 每一个MM写真专辑图片集合的地址(进过分页检测)
124 | 通过穿过来的参数，直接解析页面，获取写真图片的地址，然后下载保存到本地。
125 | 
126 | """
127 | def savePictures(itemPagesurl):
128 |     header = {
129 |         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
130 |         , "Connection": "keep-alive"
131 |         , "Referer": "image / webp, image / *, * / *;q = 0.8"
132 |         ,"Accept":"image/webp,image/*,*/*;q=0.8"
133 |     }
134 |     req = urllib2.Request(itemPagesurl, headers=header)
135 |     html = urllib2.urlopen(req)
136 |     htmldata = html.read()
137 |     htmlpath = etree.HTML(htmldata)
138 |     print itemPagesurl
139 |     pages = htmlpath.xpath('//div[@class="gallery_wrapper"]/ul/img/@src')
140 |     for i in range(len(pages) ):
141 |         print pages[i]
142 |         pciturelist.append(pages[i])
143 | ```
144 | 
145 | 
146 | **4  .获取每张图片的url，可每张照片的名称，然后下载到本地。**
147 | 
148 | ```
149 | """
150 | 参数 url : 每一个MM写真专辑图片集合的地址(进过分页检测)
151 | 通过穿过来的参数，直接解析页面，获取写真图片的地址，然后下载保存到本地。
152 | 
153 | """
154 | def savePictures(itemPagesurl):
155 |     header = {
156 |         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
157 |         , "Connection": "keep-alive"
158 |         , "Referer": "image / webp, image / *, * / *;q = 0.8"
159 |         ,"Accept":"image/webp,image/*,*/*;q=0.8"
160 |     }
161 |     req = urllib2.Request(itemPagesurl, headers=header)
162 |     html = urllib2.urlopen(req)
163 |     htmldata = html.read()
164 |     htmlpath = etree.HTML(htmldata)
165 |     print itemPagesurl
166 |     pages = htmlpath.xpath('//div[@class="gallery_wrapper"]/ul/img/@src')
167 | 
168 |     names = htmlpath.xpath('//div[@class="gallery_wrapper"]/ul/img/@alt')
169 |     for i in range(len(pages) ):
170 |         print pages[i]
171 |         pciturelist.append(pages[i])
172 | 
173 |         headers = {
174 |             "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
175 |             , "Connection": "keep-alive"
176 |             , "Referer": pages[i]
177 |         }
178 |         req = urllib2.Request(pages[i], headers=headers)
179 | 
180 |         urlhtml = urllib2.urlopen(req)
181 | 
182 |         respHtml = urlhtml.read()
183 | 
184 |         binfile = open('%s.jpg' % ( names[i] ) , "wb")
185 |         binfile.write(respHtml);
186 |         binfile.close();
187 | 
188 | ```
189 | 
190 | ***
191 | **5. 执行完毕后(时间可能会比较长)，就可以在文件夹里面看到一张一张的图片都下载完毕。再这里注意的是防止出现反爬虫，可以用设置header或者代理等方式。**
192 | ![图片下载到了文件夹中](https://raw.githubusercontent.com/panacena/mmPictures/master/6.png)
193 | 
194 | 没有设置一些header等参数时，有时会出现如下这种情况，这种情况应该是网站有反爬虫的机制。
195 | 
196 | ![反爬虫](https://raw.githubusercontent.com/panacena/mmPictures/master/7.png)
197 | 
198 | 
199 | ***
200 | 源码项目github地址 ： https://github.com/panacena/mmPictures/
201 | 这是本人学习python后写的一个小例子。代码写的很烂。以后会学习Scrapy等框架然后在重新重构一下代码。希望可以给个star~~~
202 | 


--------------------------------------------------------------------------------