├── requirements.txt ├── images ├── wallpaper_0.png ├── wallpaper_1.png └── wallpaper_2.jpg ├── LICENSE ├── README.md ├── .gitignore ├── wallpaper.py └── wallpaper_redis.py /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | redis 3 | -------------------------------------------------------------------------------- /images/wallpaper_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenjiandongx/wallpaper/HEAD/images/wallpaper_0.png -------------------------------------------------------------------------------- /images/wallpaper_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenjiandongx/wallpaper/HEAD/images/wallpaper_1.png -------------------------------------------------------------------------------- /images/wallpaper_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenjiandongx/wallpaper/HEAD/images/wallpaper_2.jpg -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) chenjiandongx 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zol 手机壁纸爬虫 2 | 3 | 这两天在找手机壁纸,不过发现一张一张来效率太低。话不多说,爬虫在手,天下我有! 4 | 5 | 爬取网站:[http://sj.zol.com.cn/bizhi/](http://sj.zol.com.cn/bizhi/) 6 | 7 | 至于为什么选这个 zol 的手机壁纸网站,没办法,某度搜索出来第一个,树大招风! 8 | 9 | 代码用 Python 写的,第三方类库只用了 requests,老少皆宜。匹配只用了正则表达式。利用多进程提高爬取速度。 10 | 11 | 壁纸大小为 750 * 1334(iphone6 的分辨率),大小可在代码里自行更改,最高的分辨率为 1080 * 1920(iphone6s plus 的分辨率)。 12 | 13 | 总共爬取了 **420** 套,共 **3941** 张壁纸。 14 | 15 | ![wallpaper_0](https://github.com/chenjiandongx/wallpaper/blob/master/images/wallpaper_0.png) 16 | 17 | 具体情况如下图,以套图为单位存放在不同的文件夹 18 | 19 | ![wallpaper_1](https://github.com/chenjiandongx/wallpaper/blob/master/images/wallpaper_1.png) 20 | 21 | 囊括了各种风格的壁纸!! 22 | 23 | 然而,事情到这里还没有结束。这两天看了 redis 的文档,就想利用 redis 来搞个简单的分布式爬虫。前期工作的下载和配置 redis 我就不说了。核心逻辑是先将所有连接存入到本机的 redis 数据库里,本机作为 master。然后其他 slave 连接到 master 分配任务干活。但是,你以为我既没有另外一台电脑也没有钱购买虚拟主机这件事我会告诉你?最后就只用了本机爬... 
,anyway,最后也同样把所有的套图都爬下来了。这部分代码在 wallpaper_redis.py 里。 24 | 25 | 壁纸已打包上传到百度云里:链接: [https://pan.baidu.com/s/1boZ6XTP](https://pan.baidu.com/s/1boZ6XTP) 密码: q8k6 26 | 27 | 为了深入贯彻党提倡的二十四字社会主义核心价值观,请允许我先给大家安利下面这张壁纸 28 | 29 | ![wallpaper_2](https://github.com/chenjiandongx/wallpaper/blob/master/images/wallpaper_2.jpg) 30 | 31 | **永远热血,永远热泪盈眶!** 32 | 33 | 34 | ### License 35 | 36 | MIT [© chenjiandongx](https://github.com/chenjiandongx) 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | /data_/ 9 | .idea 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
def mkdir(folder_name):
    """
    Create a folder under ``SAVE_PATH`` and switch into it.

    The folder doubles as a "this set was already crawled" marker: when the
    path already exists nothing is created and the working directory is left
    untouched.

    :param folder_name: name of the folder, relative to ``SAVE_PATH``
    :return: ``True`` if the folder was created (the process working
        directory is now that folder), ``False`` if the path already existed
    :raises OSError: if the folder cannot be created for any reason other
        than the path already existing
    """
    path = os.path.join(SAVE_PATH, folder_name)
    try:
        # Create first and react to failure instead of exists()+makedirs():
        # the two-step check lets a concurrent worker create the folder in
        # between, which would crash makedirs().
        os.makedirs(path)
    except OSError:
        # Anything other than "path is already there" is a real error.
        if not os.path.exists(path):
            raise
        print("Folder has existed!")
        return False
    print(path)
    os.chdir(path)
    return True
def clear(dir_path):
    """
    Recursively delete empty folders.

    Sub-folders are visited depth-first, so a folder that only contains
    empty folders is removed as well.  ``dir_path`` itself is removed last
    if nothing is left inside it.  A path that does not exist or is not a
    directory is ignored.

    :param dir_path: path of the folder to clean up
    """
    # isdir() is already False for a missing path, so one guard covers both
    # "does not exist" and "is a regular file".
    if not os.path.isdir(dir_path):
        return
    for entry in os.listdir(dir_path):
        path = os.path.join(dir_path, entry)
        if os.path.isdir(path):
            clear(path)
    # List again after the recursion: children may just have been removed.
    if not os.listdir(dir_path):
        os.rmdir(dir_path)
        print("remove the empty dir: {}".format(dir_path))
html 113 | ) 114 | img_url = re.findall( 115 | 'id="750x1334" href="(.*?)">750x1334', html 116 | ) 117 | # 像素未达到 750x1334 的不进行下载 118 | img = img_url[0] if img_url else [] 119 | if img: 120 | req = requests.get( 121 | self.crawl_url.format(img), headers=HEADERS 122 | ).text 123 | img_src = re.findall('