├── README.md ├── pic_text.py ├── banfo_pics.py └── pic_text_download.py /README.md: -------------------------------------------------------------------------------- 1 | ## 使用Python爬虫批量保存半佛仙人公众号的骚气表情包,识别图中文字当成名称,方便找图时通过名称快速搜索,提高找图效率。 2 | 3 | #### 0418无重命名版本: 4 | 0418版本,banfo_pics.py,代码文件里有详细的操作步骤说明,更详细的代码解析见我的公众号「Python知识圈」 5 | 按照公众号文章操作即可:[我用Python一键保存了半佛老师所有的骚气表情包 6 | ](https://mp.weixin.qq.com/s/fVDwNdVDZo_0q6jAMWCGAA) 7 | 8 | 本项目[B站原创视频版](https://www.bilibili.com/video/BV1Vz41187Rt),目前播放量13万,需要你的三连支持!需要你的三连支持!需要你的三连支持! 9 | 10 | #### 0425优化版本 11 | 1、优化版:保存表情包的同时识别表情包的文字并以文字重命名,方便通过名字查找需要的表情包,更快找到素材,见上方代码:pic_text_download.py,代码文件里有详细的操作步骤说明,更详细的代码解析见我的**公众号「Python知识圈」** 代码解析文章:[批量识别图中文字自动命名,让你1秒找到小姐姐](https://mp.weixin.qq.com/s/ZmcOOX7rXtdSvD8bPe9_Rw) 12 | 13 | 2、加强版:批量识别之前已经保存的图片中文字并重命名,给出图片的文件夹路径即可,见上方代码:pic_text.py,代码文件里有详细的操作步骤说明,更详细的代码解析见我的**公众号「Python知识圈」** 代码解析文章:[批量识别图中文字自动命名,让你1秒找到小姐姐](https://mp.weixin.qq.com/s/ZmcOOX7rXtdSvD8bPe9_Rw) 14 | 15 | 加强版项目[B站原创视频版](https://www.bilibili.com/video/BV1u541147gQ),目前播放量5000,需要你的三连支持!需要你的三连支持!需要你的三连支持! 16 | 17 | 如果有疑问,可以在公众号后台留言,看到的话,我会回答,如果时间比较久,后台有我微信,直接加我微信也行(微信号:dyw520520)。 18 | 19 | ## 微信公众号 20 | 欢迎关注个人微信公众号 “Python知识圈” (ID:PythonCircle) 21 | 22 | ![Python知识圈公众号二维码](http://blog.pyzhishiquan.com/img/20200427091312.jpg) 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /pic_text.py: -------------------------------------------------------------------------------- 1 | import glob 2 | from os import path 3 | import os 4 | from aip import AipOcr 5 | from PIL import Image 6 | import time 7 | 8 | 9 | """ 10 | 1、本项目是根据图片或者表情包中文字重命名你的图片或者表情包,不支持gif图片。 11 | 12 | 2、替换百度aip接口中的三个参数,换成你自己的 13 | 14 | 3、把pic_path的路径替换成你需要重命名的图片的文件夹路径,路径后不要少了/,否则就不是文件夹了 15 | 16 | 4、运行代码 17 | 18 | 5、 本项目视频首发 B 站(菜鸟程序员的日常) 19 | 文案首发公众号:Python知识圈(id:PythonCircle),欢迎关注,三连! 
# Characters removed from recognised text before it is used as a file name:
# path separators, characters Windows forbids, and the extension dot.
_ILLEGAL_FILENAME_CHARS = str.maketrans('', '', '/\\:*?<>|.')


def _clean_title(text):
    """Return *text* with every character unsafe in a file name removed."""
    return text.translate(_ILLEGAL_FILENAME_CHARS)


def _unique_path(directory, stem, ext, current=None):
    """Return a path in *directory* for ``stem + ext`` that is not taken.

    ``_1``, ``_2``, ... is appended to the stem until the path is free.
    *current* is the path of the file being renamed; it counts as free so a
    file that already carries the right name is left alone.
    """
    candidate = os.path.join(directory, stem + ext)
    counter = 1
    while candidate != current and os.path.exists(candidate):
        candidate = os.path.join(directory, '%s_%d%s' % (stem, counter, ext))
        counter += 1
    return candidate


def baiduOCR(picfile):
    """Recognise the text in an image with Baidu OCR and return it as a title.

    Args:
        picfile: Path of the image file to send to the OCR service.

    Returns:
        The recognised text joined into one string, with characters that are
        illegal in file names removed; the current Unix timestamp (``int``)
        when no text was recognised; ``None`` when the OCR request or the
        parsing of its response failed.

    Raises:
        OSError: If *picfile* cannot be opened or read.
    """
    # Replace the three placeholders with the credentials of your own
    # registered Baidu application.
    APP_ID = '填你自己注册应用的APP_ID'
    API_KEY = '填你自己注册应用的API_KEY'
    SECRECT_KEY = '填你自己注册应用的SECRECT_KEY'
    client = AipOcr(APP_ID, API_KEY, SECRECT_KEY)
    # The context manager closes the handle on every path; the original
    # only reached close() after a return statement, i.e. never.
    with open(picfile, 'rb') as image_file:
        img = image_file.read()
    try:
        # General text recognition (50000 free calls per day according to
        # the original author); client.basicAccurate is the high-precision
        # variant (800 free calls per day).
        message = client.basicGeneral(img)['words_result']
        title = _clean_title(''.join(entry['words'] for entry in message))
    except Exception:
        # Best effort: an unsupported image type or an error response
        # (no 'words_result' key) yields "no title" instead of a crash.
        return None
    if title == "":
        return int(time.time())
    return title


def rename_pictures(pic_path, ocr=baiduOCR):
    """Rename every picture in *pic_path* after the text recognised in it.

    GIF files are not sent to OCR and are named after the current Unix
    timestamp instead. A file whose OCR result is ``None`` keeps its name.
    An existing file is never overwritten: a numeric suffix is added when
    the wanted name is already taken.

    Args:
        pic_path: Folder that contains the pictures.
        ocr: Callable mapping a file path to a title (or ``None``);
            defaults to :func:`baiduOCR`.

    Returns:
        int: Number of files that were renamed.
    """
    renamed = 0
    for filename in os.listdir(pic_path):
        used_name = os.path.join(pic_path, filename)
        if not os.path.isfile(used_name):
            continue  # sub-folders cannot be sent to OCR
        ext = os.path.splitext(filename)[1]
        try:
            if ext.lower() == '.gif':
                stem = str(int(time.time()))
            else:
                title = ocr(used_name)
                if title is None:
                    # OCR failed: keep the file instead of naming it "None".
                    continue
                stem = _clean_title(str(title)) or str(int(time.time()))
            new_name = _unique_path(pic_path, stem, ext, current=used_name)
            if new_name != used_name:
                os.rename(used_name, new_name)
                renamed += 1
                print('文件重命名成功,新的文件名为 % s' % (new_name))
        except Exception as exc:
            # Keep going with the remaining files, but say what went wrong.
            print('重命名失败!忽略', used_name, exc)
    return renamed


# Target folder: change it to the folder whose pictures should be renamed.
pic_path = r'/Users/brucepk/Pictures/banfo/'

if __name__ == '__main__':
    rename_pictures(pic_path)
# Matches the image address stored in a ``data-src`` attribute.
_DATA_SRC_RE = re.compile(r'data-src="(.*?)"')


def get_urls(url):
    """Return every ``data-src`` address found in the page at *url*.

    Args:
        url: Address of the article page to fetch.

    Returns:
        list: The ``data-src`` values in document order; an empty list when
        the page could not be fetched.
    """
    try:
        try:
            html = requests.get(url, timeout=30).text
        except requests.exceptions.SSLError:
            # Retry once without certificate verification.
            html = requests.get(url, verify=False, timeout=30).text
    except requests.exceptions.Timeout:
        # requests raises its own Timeout type, not the builtin TimeoutError.
        print('请求超时')
        return []
    except Exception:
        print('获取失败')
        return []
    return _DATA_SRC_RE.findall(html)


def mkdir():
    """Create the ``./banfo`` download folder if it does not exist yet.

    The working directory is left untouched, so relative ``./banfo/...``
    paths built by callers stay valid after the folder has been created.

    Returns:
        bool: ``True`` when the folder was created, ``False`` when it
        already existed.
    """
    try:
        os.makedirs(r'./banfo')
    except FileExistsError:
        print('目录已存在,即将保存!')
        return False
    print('创建目录')
    return True


def download(filename, url):
    """Download *url* and store the response body in *filename*.

    The content is fetched before the file is opened, so a failed download
    does not leave an empty file behind.

    Args:
        filename: Path of the file to write.
        url: Address of the picture.

    Returns:
        bool: ``True`` when the file was written, ``False`` otherwise.
    """
    # A target folder that does not exist means the address was not a
    # picture link; bail out before spending a network request on it.
    if not os.path.isdir(os.path.dirname(filename) or '.'):
        print('下载失败,非表情包,直接忽略:', filename)
        return False
    try:
        try:
            response = requests.get(url, timeout=30)
        except requests.exceptions.SSLError:
            # Retry once without certificate verification.
            response = requests.get(url, verify=False, timeout=30)
        # Do not save an HTTP error page as if it were a picture.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)
    except FileNotFoundError:
        print('下载失败,非表情包,直接忽略:', filename)
        return False
    except requests.exceptions.Timeout:
        print('下载超时:', filename)
        return False
    except Exception:
        print('下载失败:', filename)
        return False
    print('成功下载图片:', filename)
    return True
def picture_filename(base_dir, pic_url):
    """Build the local path under which the picture at *pic_url* is saved.

    The file name is the second-to-last ``/``-separated segment of the
    address and the extension is whatever follows its last ``=``.

    Args:
        base_dir: Folder the picture is stored in.
        pic_url: Address taken from a ``data-src`` attribute.

    Returns:
        The target path, or ``None`` when the address has no
        second-to-last segment (for example an empty ``data-src``).
    """
    segments = pic_url.split('/')
    if len(segments) < 2:
        return None
    return os.path.join(base_dir, segments[-2] + '.' + pic_url.split('=')[-1])


if __name__ == '__main__':
    # Resolve the folder to an absolute path up front so that saving does
    # not depend on the current working directory.
    base_dir = os.path.abspath(r'./banfo')
    article_urls = request_data()
    # Create the folder once instead of once per article.
    mkdir()
    for url in article_urls:
        try:
            url_list = get_urls(url) or []
        except Exception as exc:
            # One unreachable article must not abort the whole run.
            print('获取失败', exc)
            continue
        for pic_url in url_list:
            filename = picture_filename(base_dir, pic_url)
            if filename is None:
                continue
            download(filename, pic_url)
def request_data():
    """Collect the article links from the account's message history.

    Requests the ``profile_ext`` endpoint with ``count=10`` for the offsets
    0, 10, ... 320, pausing 2-5 seconds between requests. The cookies,
    headers and query parameters below are placeholders captured from a
    WeChat client session and must be replaced with your own values (leave
    ``offset`` and ``count`` out of ``params``; they are part of the URL).

    Returns:
        list: The ``content_url`` of every message that carries an
        ``app_msg_ext_info`` entry, in the order they were received.
    """
    article_url_list = []
    print('正在获取所有文章链接,请稍后')

    # The request template is the same for every page, so build it once.
    base_url = 'http://mp.weixin.qq.com/mp/profile_ext?offset={}&count=10'
    cookies = {
        'wxuin': 'xxx',
        'devicetype': 'xxx-28',
        'version': 'xx',
        'lang': 'zh_CN',
        'rewardsn': '',
        'wxtokenkey': '777',
        'pass_ticket': 'x/x//x+x',
        'wap_sid2': 'xxxxx',
    }
    headers = {
        'Host': 'mp.weixin.qq.com',
        'user-agent': 'x/5.0 (Linux; x 9; STF-AL10 Build/HUAWEISTF-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 XWEB/1178 MMWEBSDK/180801 Mobile Safari/537.36 MicroMessenger/6.7.3.1360(0x26070333) NetType/WIFI Language/zh_CN Process/toolsmp',
        'x-requested-with': 'x',
        'accept': '*/*',
        'referer': 'https://mp.weixin.qq.com/mp/profile_ext?action=home&x=x==&scen607xxx',
        'accept-language': 'zh-CN,en-US;q=0.9',
    }
    params = (
        ('action', 'getmsg'),
        ('__biz', 'xxxxxx=='),
        ('f', ['json', 'json']),
        ('is_ok', '1'),
        ('scene', '126'),
        ('uin', 'xxx'),
        ('key', 'xxx'),
        ('pass_ticket', 'zILyKILRyRlW0V/xxxx//xxx+5uNjV5AX'),
        ('wxtoken', ''),
        ('appmsg_token', 'xxxxxx~~'),
        ('x5', '0'),
    )
    # Proxy address; replace it when it stops working, or drop it entirely.
    proxy = {'https': '114.239.144.61:808'}

    for offset in range(0, 323, 10):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(
                base_url.format(offset),
                headers=headers,
                params=params,
                cookies=cookies,
                proxies=proxy,
                timeout=30)
            if response.status_code != 200:
                print('获取文章链接失败', response.status_code)
            else:
                all_datas = json.loads(response.text)
                if all_datas['ret'] != 0:
                    # Report a rejected request instead of skipping it
                    # silently.
                    print('获取文章链接失败',
                          all_datas.get('errmsg', all_datas['ret']))
                elif all_datas['msg_count'] > 0:
                    # The message list is itself a JSON document stored as
                    # a string inside the outer response.
                    datas = json.loads(all_datas['general_msg_list'])['list']
                    for data in datas:
                        try:
                            article_url_list.append(
                                data['app_msg_ext_info']['content_url'])
                        except (KeyError, TypeError):
                            # Entry without an article link: skip it.
                            continue
        except Exception as e:
            time.sleep(2)
            print('获取文章链接失败', e)
        # Random pause between pages.
        time.sleep(random.randint(2, 5))
    return article_url_list
def mkdir(base_path):
    """Create the folder *base_path* if it does not exist yet.

    The working directory is left untouched, so relative paths that start
    with *base_path* stay valid after the folder has been created.

    Args:
        base_path: Folder the pictures are saved in.

    Returns:
        bool: ``True`` when the folder was created, ``False`` when it
        already existed.
    """
    try:
        os.makedirs(base_path)
    except FileExistsError:
        print('目录已存在,即将保存!')
        return False
    print('创建目录')
    return True


def download(filename, url):
    """Download *url* and store the response body in *filename*.

    The content is fetched before the file is opened, so a failed download
    does not leave an empty file behind.

    Args:
        filename: Path of the file to write.
        url: Address of the picture.

    Returns:
        bool: ``True`` when the file was written, ``False`` otherwise.
    """
    # A missing target folder means the path is wrong; report it before
    # spending a network request.
    if not os.path.isdir(os.path.dirname(filename) or '.'):
        print('下载失败!!没有找到对应图片目前,请检查路径:', filename)
        return False
    try:
        try:
            response = requests.get(url, timeout=30)
        except requests.exceptions.SSLError:
            # Retry once without certificate verification.
            response = requests.get(url, verify=False, timeout=30)
        # Do not save an HTTP error page as if it were a picture.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)
    except FileNotFoundError:
        print('下载失败!!没有找到对应图片目前,请检查路径:', filename)
        return False
    except requests.exceptions.Timeout:
        # requests raises its own Timeout type, not the builtin TimeoutError.
        print('下载超时:', filename)
        return False
    except Exception:
        print('非正常图片,直接忽略:', filename)
        return False
    print('成功下载图片:', filename)
    return True


def _unique_path(directory, stem, ext):
    """Return a path in *directory* for ``stem + ext`` that is not taken.

    ``_1``, ``_2``, ... is appended to the stem until the path is free, so
    a rename never overwrites a picture saved earlier.
    """
    candidate = os.path.join(directory, stem + ext)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(directory, '%s_%d%s' % (stem, counter, ext))
        counter += 1
    return candidate


if __name__ == '__main__':
    base_path = r'./banfo/'
    for url in request_data():
        url_list = get_urls(url) or []
        mkdir(base_path)
        for pic_url in url_list:
            segments = pic_url.split('/')
            if len(segments) < 2:
                continue  # not a picture address (e.g. empty data-src)
            pic_type = pic_url.split('=')[-1]
            filename = base_path + segments[-2] + '.' + pic_type  # picture path
            if not download(filename, pic_url):
                continue  # nothing was saved, so nothing to recognise
            try:
                if pic_type == 'gif':
                    # GIFs are not sent to OCR; name them by timestamp.
                    stem = str(int(time.time()))
                else:
                    title = baiduOCR(filename)
                    if title is None:
                        # OCR failed: keep the downloaded name instead of
                        # renaming the file to "None".
                        print('重命名失败!忽略')
                        continue
                    stem = str(title)
                new_name = _unique_path(base_path, stem, '.' + pic_type)
                os.rename(filename, new_name)
                print('文件重命名成功% s' % (new_name))
            except Exception as e:
                print('重命名失败!忽略', e)