├── 2.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png ├── 7.png ├── 1240.png ├── .gitattributes ├── .gitignore ├── mm.py └── README.md /2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/panacena/mmPictures/HEAD/2.png -------------------------------------------------------------------------------- /3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/panacena/mmPictures/HEAD/3.png -------------------------------------------------------------------------------- /4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/panacena/mmPictures/HEAD/4.png -------------------------------------------------------------------------------- /5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/panacena/mmPictures/HEAD/5.png -------------------------------------------------------------------------------- /6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/panacena/mmPictures/HEAD/6.png -------------------------------------------------------------------------------- /7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/panacena/mmPictures/HEAD/7.png -------------------------------------------------------------------------------- /1240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/panacena/mmPictures/HEAD/1240.png -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask instance folder 57 | instance/ 58 | 59 | # Scrapy stuff: 60 | .scrapy 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # IPython Notebook 69 | .ipynb_checkpoints 70 | 71 | # pyenv 72 | .python-version 73 | 74 | # celery beat schedule file 75 | celerybeat-schedule 76 | 77 | # dotenv 78 | .env 79 | 80 | # virtualenv 81 | venv/ 82 | ENV/ 83 | 84 | # Spyder project settings 85 | .spyderproject 86 | 87 | # Rope project settings 88 | .ropeproject 89 | 90 | # ========================= 91 | # Operating System Files 92 | # ========================= 93 | 94 | # OSX 95 | # ========================= 96 | 97 | .DS_Store 98 | .AppleDouble 99 | .LSOverride 100 | 101 | # Thumbnails 102 | ._* 103 | 104 | # Files that might appear in the root of a volume 105 | .DocumentRevisions-V100 106 | .fseventsd 107 | .Spotlight-V100 108 | .TemporaryItems 109 | .Trashes 110 | .VolumeIcon.icns 111 | 112 | # Directories potentially created on remote AFP share 113 | .AppleDB 114 | .AppleDesktop 115 | Network Trash Folder 116 | Temporary Items 117 | .apdisk 118 | 119 | # Windows 120 | # ========================= 121 | 122 | # Windows image file caches 123 | Thumbs.db 124 | ehthumbs.db 125 | 126 | # Folder config file 127 | Desktop.ini 128 | 129 | # Recycle Bin used on file shares 130 | $RECYCLE.BIN/ 131 | 132 | # Windows Installer files 133 | *.cab 134 | *.msi 135 | *.msm 136 | *.msp 137 | 138 | # Windows shortcuts 139 | *.lnk 140 | -------------------------------------------------------------------------------- /mm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*-coding:utf-8-*- 3 | import urllib2 4 | from lxml import etree 5 | from os import system 6 | """ 7 | 第一步: 从 http://www.zngirls.com/rank/sum/ 开始抓取MM点击头像的链接(注意是分页的) 8 | #第二部 http://www.zngirls.com/girl/21751/ 抓取每一个写真集合的链接(注意是分页的) 9 | #第三部 http://www.zngirls.com/g/19671/1.html 在写真图片的具体页面抓取图片(注意是分页的) 10 | """ 11 | pciturelist=[] 12 | 13 | 14 | header = { 15 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36" 16 | , "Connection": "keep-alive" 17 | } 18 | 19 | 20 | """ 21 | 从起始页面 http://www.zngirls.com/rank/sum/ 开始获取排名的页数和每一页的url 22 | 23 | """ 24 | def mmRankSum(): 25 | req = urllib2.Request("http://www.zngirls.com/rank/sum/", headers=header) 26 | html = urllib2.urlopen(req) 27 | htmldata = html.read() 28 | htmlpath = etree.HTML(htmldata) 29 | 30 | #首先获取页码数,然后用循环的方式挨个解析每一个页面 31 | pages = htmlpath.xpath('//div[@class="pagesYY"]/div/a/@href') 32 | 33 | 34 | for i in range( len(pages) -2 ): 35 | 36 | pagesitem="http://www.zngirls.com/rank/sum/"+ pages[i] 37 | mmRankitem(pagesitem) 38 | 39 | """ 40 | 参数 url : 分页中每一页的具体url地址 41 | 通过穿过来的参数,使用 lxml和xpath 解析 html,获取每一个MM写真专辑页面的url 42 | 43 | """ 44 | def mmRankitem(url): 45 | req = urllib2.Request(url, headers=header) 46 | html = urllib2.urlopen(req) 47 | htmldata = html.read() 48 | htmlpath = etree.HTML(htmldata) 49 | 50 | pages = htmlpath.xpath('//div[@class="rankli_imgdiv"]/a/@href') 51 | for i in range(len(pages)): 52 | print "http://www.zngirls.com/" + pages[i]+"album/" 53 | getAlbums("http://www.zngirls.com/" + pages[i]+"/album/") 54 | #print "http://www.zngirls.com/" + pages[i] 55 | 56 | 57 | """ 58 | 参数 url : 每一个MM专辑的页面地址 59 | 通过穿过来的参数,获取每一个MM写真专辑图片集合的地址 60 | 61 | """ 62 | def getAlbums(girlUrl): 63 | req = urllib2.Request(girlUrl, headers=header) 64 | html = urllib2.urlopen(req) 65 | htmldata = html.read() 66 | htmlpath = etree.HTML(htmldata) 67 | 68 | pages = htmlpath.xpath('//div[@class="igalleryli_div"]/a/@href') 69 | for i in range(len(pages)): 70 | 71 | getPagePicturess("http://www.zngirls.com/" + pages[i]) 72 | 73 | 74 | """ 75 | 参数 url : 每一个MM写真专辑图片集合的地址 76 | 通过穿过来的参数,首先先获取图片集合的页数,然后每一页解析写真图片的真实地址 77 | 78 | """ 79 | def getPagePicturess(albumsurl): 80 | req = urllib2.Request(albumsurl, headers=header) 81 | html = urllib2.urlopen(req) 82 | htmldata = html.read() 83 | htmlpath = etree.HTML(htmldata) 84 | pages = htmlpath.xpath('//div[@id="pages"]/a/@href') 85 | for i in range(len(pages)-2): 86 | savePictures("http://www.zngirls.com" + pages[i]) 87 | 88 | """ 89 | 参数 url : 每一个MM写真专辑图片集合的地址(进过分页检测) 90 | 通过穿过来的参数,直接解析页面,获取写真图片的地址,然后下载保存到本地。 91 | 92 | """ 93 | def savePictures(itemPagesurl): 94 | header = { 95 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36" 96 | , "Connection": "keep-alive" 97 | , "Referer": "image / webp, image / *, * / *;q = 0.8" 98 | ,"Accept":"image/webp,image/*,*/*;q=0.8" 99 | } 100 | try: 101 | req = urllib2.Request(itemPagesurl, headers=header) 102 | html = urllib2.urlopen(req) 103 | htmldata = html.read() 104 | htmlpath = etree.HTML(htmldata) 105 | print itemPagesurl 106 | pages = htmlpath.xpath('//div[@class="gallery_wrapper"]/ul/img/@src') 107 | names = htmlpath.xpath('//div[@class="gallery_wrapper"]/ul/img/@alt') 108 | except Exception: 109 | pass 110 | for i in range(len(pages) ): 111 | print pages[i] 112 | pciturelist.append(pages[i]) 113 | 114 | try: 115 | 116 | headers = { 117 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36" 118 | , "Connection": "keep-alive" 119 | , "Referer": pages[i] 120 | } 121 | req = urllib2.Request(pages[i], headers=headers) 122 | 123 | urlhtml = urllib2.urlopen(req) 124 | 125 | respHtml = urlhtml.read() 126 | 127 | binfile = open('%s.jpg' % ( names[i] ) , "wb") 128 | binfile.write(respHtml); 129 | binfile.close(); 130 | except Exception : 131 | pass 132 | 133 | 134 | mmRankSum() 135 | """ 136 | fl=open('list.txt', 'w') 137 | for i in pciturelist: 138 | fl.write(i) 139 | fl.write("\n") 140 | fl.close() 141 | print '关机ing' 142 | """ 143 | print 'finish' 144 | system('shutdown -s') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ###这篇文章干嘛的? 2 | 本屌在上网时偶然看到一个图片网站,网站的尺度是这样的: 3 | 4 | 5 | 6 | ![图片站首页](https://raw.githubusercontent.com/panacena/mmPictures/master/1240.png) 7 | 8 | 里面的美女**露骨而不露点**,简直是宅男福利。一时兴起,决定将网站上的图片down下来研究研究。正好最近在研究python,所以决定用这个抓取图片存到本地,将图片url存到 mongodb以防止以后用。 9 | 10 | 源码项目github地址 : https://github.com/panacena/mmPictures/ 11 | 本人博客地址 : http://www.jianshu.com/u/61f41588151d 12 | *** 13 | ####爬虫初识 14 | 网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动的抓取万维网信息的程序或者脚本。 15 | 要学习Python爬虫,我们要学习的共有以下几点: 16 | * Python基础知识 17 | * Python中urllib和urllib2库的用法 18 | * Python正则表达式 19 | * Python爬虫框架Scrapy 20 | * Python爬虫更高级的功能 21 | 当然,我们今天暂时不需要使用框架进行爬取,只用urllib2等库进行操作。其它的一些关于爬虫的一些基本信息,可以在这位大神的博客中学习到。[学习爬虫点我](http://cuiqingcai.com/1052.html)。 22 | *** 23 | ####开始爬取 24 | #####确定URL开始抓取 25 | 1. 我们以 http://www.zngirls.com/rank/sum/ 为起始页面进行爬取,打开网页后右键查看源代码。 26 | 打开这个url后,如图1。我们需要关心的是红色链接的内容。 27 | ![图1](https://raw.githubusercontent.com/panacena/mmPictures/master/2.png) 28 | 29 | 2. 这个如何查找呢?如果你用的是360浏览器,在MM图片那右击,选择“审查元素”。之后就可以查看到点击MM头像后跳转的url地址和MM头像的url地址 。图2: 30 | ![图2](https://raw.githubusercontent.com/panacena/mmPictures/master/3.png) 31 | 32 | 3 . 现在还是没有看到写真的图片,我们点击MM的头像,进入到了下图的页面 http://www.zngirls.com/girl/21751/ 可以看到如图3这个页面也没有写真的具体图片,只是写真的封面集合。不急,我们继续点击封面。 33 | ![图3](https://raw.githubusercontent.com/panacena/mmPictures/master/4.png) 34 | 4 . 点击封面后,http://www.zngirls.com/g/19671/1.html 进入的页面就可以看到写真的具体图片了。这时我们就可以爬取图片地址了。当然,这个也是分页的,所以也需要获取一共多少页以及每一页的url。 35 | 36 | ![Paste_Image.png](https://raw.githubusercontent.com/panacena/mmPictures/master/5.png) 37 | 38 | #####开始码代码吧 39 | 从上面的步骤我们整理一下思路,大概分为以下三部: 40 | * 第一步 从 http://www.zngirls.com/rank/sum/ 开始抓取MM点击头像的链接(注意是分页的) 41 | * 第二部 从 http://www.zngirls.com/girl/21751/ 抓取每一个写真集合的链接(注意是分页的) 42 | * 第三部 从 http://www.zngirls.com/g/19671/1.html 在写真图片的具体页面抓取图片(注意是分页的) 43 | 44 | 45 | **1. 从起始页面 http://www.zngirls.com/rank/sum/ 开始首选先获取分页的页数以及每一页的url,方便下一步获取点击MM获取专辑url地址。接着解析每一页的html,获取每一页中点击MM头像后跳转的专辑集合页面。** 46 | 47 | ``` 48 | """ 49 | 从起始页面 http://www.zngirls.com/rank/sum/ 开始获取排名的页数和每一页的url 50 | 51 | """ 52 | def mmRankSum(): 53 | req = urllib2.Request("http://www.zngirls.com/rank/sum/", headers=header) 54 | html = urllib2.urlopen(req) 55 | htmldata = html.read() 56 | htmlpath = etree.HTML(htmldata) 57 | 58 | #首先获取页码数,然后用循环的方式挨个解析每一个页面 59 | pages = htmlpath.xpath('//div[@class="pagesYY"]/div/a/@href') 60 | 61 | 62 | for i in range( len(pages) -2 ): 63 | 64 | pagesitem="http://www.zngirls.com/rank/sum/"+ pages[i] 65 | mmRankitem(pagesitem) 66 | 67 | """ 68 | 参数 url : 分页中每一页的具体url地址 69 | 通过穿过来的参数,使用 lxml和xpath 解析 html,获取每一个MM写真专辑页面的url 70 | 71 | """ 72 | def mmRankitem(url): 73 | req = urllib2.Request(url, headers=header) 74 | html = urllib2.urlopen(req) 75 | htmldata = html.read() 76 | htmlpath = etree.HTML(htmldata) 77 | 78 | pages = htmlpath.xpath('//div[@class="rankli_imgdiv"]/a/@href') 79 | for i in range(len(pages)): 80 | 81 | print "http://www.zngirls.com/" + pages[i] 82 | ``` 83 | *** 84 | **2. 从mmRankitem方法中获取到的url中解析每一个MM写真专辑图片的具体地址,也就是写真图片列表的页面。** 85 | 86 | ``` 87 | """ 88 | 参数 albumsurl: 每一个MM专辑的页面地址 89 | 通过穿过来的参数,获取每一个MM写真专辑图片集合的地址 90 | 91 | """ 92 | def getAlbums(girlUrl): 93 | req = urllib2.Request(girlUrl, headers=header) 94 | html = urllib2.urlopen(req) 95 | htmldata = html.read() 96 | htmlpath = etree.HTML(htmldata) 97 | 98 | pages = htmlpath.xpath('//div[@class="igalleryli_div"]/a/@href') 99 | for i in range(len(pages)): 100 | print "http://www.zngirls.com/" + pages[i]+"album/" 101 | getAlbums("http://www.zngirls.com/" + pages[i]+"/album/") 102 | ``` 103 | 104 | **3. 从每一页中获取图片的url,已经每一张图片的名称,方便下一步进行下载。** 105 | 106 | 107 | ``` 108 | """ 109 | 参数 url : 每一个MM写真专辑图片集合的地址 110 | 通过穿过来的参数,首先先获取图片集合的页数,然后每一页解析写真图片的真实地址 111 | 112 | """ 113 | def getPagePicturess(albumsurl): 114 | req = urllib2.Request(albumsurl, headers=header) 115 | html = urllib2.urlopen(req) 116 | htmldata = html.read() 117 | htmlpath = etree.HTML(htmldata) 118 | pages = htmlpath.xpath('//div[@id="pages"]/a/@href') 119 | for i in range(len(pages)-2): 120 | savePictures("http://www.zngirls.com" + pages[i]) 121 | 122 | """ 123 | 参数 url : 每一个MM写真专辑图片集合的地址(进过分页检测) 124 | 通过穿过来的参数,直接解析页面,获取写真图片的地址,然后下载保存到本地。 125 | 126 | """ 127 | def savePictures(itemPagesurl): 128 | header = { 129 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36" 130 | , "Connection": "keep-alive" 131 | , "Referer": "image / webp, image / *, * / *;q = 0.8" 132 | ,"Accept":"image/webp,image/*,*/*;q=0.8" 133 | } 134 | req = urllib2.Request(itemPagesurl, headers=header) 135 | html = urllib2.urlopen(req) 136 | htmldata = html.read() 137 | htmlpath = etree.HTML(htmldata) 138 | print itemPagesurl 139 | pages = htmlpath.xpath('//div[@class="gallery_wrapper"]/ul/img/@src') 140 | for i in range(len(pages) ): 141 | print pages[i] 142 | pciturelist.append(pages[i]) 143 | ``` 144 | 145 | 146 | **4 .获取每张图片的url,可每张照片的名称,然后下载到本地。** 147 | 148 | ``` 149 | """ 150 | 参数 url : 每一个MM写真专辑图片集合的地址(进过分页检测) 151 | 通过穿过来的参数,直接解析页面,获取写真图片的地址,然后下载保存到本地。 152 | 153 | """ 154 | def savePictures(itemPagesurl): 155 | header = { 156 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36" 157 | , "Connection": "keep-alive" 158 | , "Referer": "image / webp, image / *, * / *;q = 0.8" 159 | ,"Accept":"image/webp,image/*,*/*;q=0.8" 160 | } 161 | req = urllib2.Request(itemPagesurl, headers=header) 162 | html = urllib2.urlopen(req) 163 | htmldata = html.read() 164 | htmlpath = etree.HTML(htmldata) 165 | print itemPagesurl 166 | pages = htmlpath.xpath('//div[@class="gallery_wrapper"]/ul/img/@src') 167 | 168 | names = htmlpath.xpath('//div[@class="gallery_wrapper"]/ul/img/@alt') 169 | for i in range(len(pages) ): 170 | print pages[i] 171 | pciturelist.append(pages[i]) 172 | 173 | headers = { 174 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36" 175 | , "Connection": "keep-alive" 176 | , "Referer": pages[i] 177 | } 178 | req = urllib2.Request(pages[i], headers=headers) 179 | 180 | urlhtml = urllib2.urlopen(req) 181 | 182 | respHtml = urlhtml.read() 183 | 184 | binfile = open('%s.jpg' % ( names[i] ) , "wb") 185 | binfile.write(respHtml); 186 | binfile.close(); 187 | 188 | ``` 189 | 190 | *** 191 | **5. 执行完毕后(时间可能会比较长),就可以在文件夹里面看到一张一张的图片都下载完毕。再这里注意的是防止出现反爬虫,可以用设置header或者代理等方式。** 192 | ![图片下载到了文件夹中](https://raw.githubusercontent.com/panacena/mmPictures/master/6.png) 193 | 194 | 没有设置一些header等参数时,有时会出现如下这种情况,这种情况应该是网站有反爬虫的机制。 195 | 196 | ![反爬虫](https://raw.githubusercontent.com/panacena/mmPictures/master/7.png) 197 | 198 | 199 | *** 200 | 源码项目github地址 : https://github.com/panacena/mmPictures/ 201 | 这是本人学习python后写的一个小例子。代码写的很烂。以后会学习Scrapy等框架然后在重新重构一下代码。希望可以给个star~~~ 202 | --------------------------------------------------------------------------------