├── .gitignore ├── LICENSE ├── README.md ├── dzdp_css_map.py ├── dzdp_font_encryption.py ├── images ├── css_map_result.png ├── font.png └── font_encryption_result.png ├── requirements.txt └── woff2tff.py /.gitignore: -------------------------------------------------------------------------------- 1 | woff_file/* 2 | 3 | *.jpg 4 | 5 | __pycache__ 6 | 7 | .idea -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 naiveliberty 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 大众点评信息爬取 2 | 3 | **声明:项目内容不得用于商业用途,仅做学习交流,如果侵犯了您的利益和权益,请邮箱联系我,我将删除该项目。** 4 | 5 | **声明:项目内容不得用于商业用途,仅做学习交流,如果侵犯了您的利益和权益,请邮箱联系我,我将删除该项目。** 6 | 7 | **声明:项目内容不得用于商业用途,仅做学习交流,如果侵犯了您的利益和权益,请邮箱联系我,我将删除该项目。** 8 | 9 | | 作者 | 邮箱 | 10 | | ------- | -------------------- | 11 | | liberty | fthemuse@foxmail.com | 12 | 13 | ------ 14 | 15 | ## 1. 项目介绍 16 | 17 | **该项目只对大众点评字体加密做了处理** 18 | 19 | 字体加密分以下两种: 20 | 21 | 1. CSS 字体映射(评论详情页:http://www.dianping.com/shop/G9TSD2JvdLtA7fdm/review_all); 22 | 2. WOFF 字体加密(店铺搜索页面:http://www.dianping.com/shenzhen/ch10/g117); 23 | 24 | 25 | 26 | 项目中使用了 `woff2tff.py` 将 woff 字体文件转为 ttf 字体文件,该文件来自:https://github.com/hanikesn/woff2otf 27 | 28 | ttf 字体文件通过 fontTools 库生成如下图片,之后使用 pytesseract 识别汉字,动态生成码表 29 | 30 | ![](images/font.png) 31 | 32 | 33 | 34 | ## 2. 效果展示 35 | 36 | ### 2.1 CSS 字体映射 37 | 38 | `dzdp_css_map.py` 39 | 40 | ![](images/css_map_result.png) 41 | 42 | 43 | 44 | ### 2.2 WOFF 字体加密 45 | 46 | `dzdp_font_encryption.py` 47 | 48 | ![](images/font_encryption_result.png) 49 | 50 | 51 | 52 | ## 3. 环境依赖 53 | 54 | ### 3.1 python 55 | 56 | ``` 57 | pip3 install -r requirements.txt -i https://pypi.douban.com/simple 58 | ``` 59 | 60 | 61 | 62 | ### 3.2 pytesseract 63 | 64 | - pytesseract 下载地址:https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-v5.0.0-alpha.20200328.exe 65 | - 安装完成后,新增系统变量: `TESSDATA_PREFIX: C:\Program Files\Tesseract-OCR` (安装目录) 66 | - 修改 `当前 python 解释器所在目录\Lib\site-packages\pytesseract\pytesseract.py` 中的 `tesseract_cmd` 变量值为 `tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'` 67 | - 下载简体中文训练集下载地址:https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.00/chi_sim.traineddata,并拷贝到 `C:\Program Files\Tesseract-OCR\tessdata` 目录下 68 | 69 | 70 | 71 | ## 4. 
# 分析过程
#
# 详见:`https://blog.csdn.net/saberqqq/article/details/105977645`
# --------------------------------------------------------------------------------
# /dzdp_css_map.py:
# --------------------------------------------------------------------------------
import requests
import re
import time
from lxml import etree
import random


class DaZhongDianPing:
    """Decode the CSS/SVG glyph obfuscation on a Dianping shop review page.

    Dianping hides addresses, phone numbers and review text behind CSS
    classes whose background offsets point into SVG glyph sheets.  This
    class downloads the page, its CSS and the three SVG sheets, rebuilds a
    class-name -> character table for each field, substitutes the decoded
    characters back into the HTML, and prints the shop info and reviews.
    """

    def __init__(self, shop_review_url, user_cookie):
        # Shop review page URL ("…/review_all").
        self.url = shop_review_url
        # Raw HTML of the review page (filled in by get_svg_html()).
        self.html = str()
        # Page font size in px; glyph x-offsets are multiples of this.
        self.font_size = 14
        # Text of the CSS file referenced by the page.
        self.css = str()
        # SVG glyph sheet used for the shop address.
        self.address_svg = str()
        # SVG glyph sheet used for the shop phone number.
        self.tell_svg = str()
        # SVG glyph sheet used for review text.
        self.review_svg = str()

        # Decode tables: key = CSS class name, value = decoded character.
        self.address_font_map = dict()
        self.tell_font_map = dict()
        self.review_font_map = dict()

        # Highest review page number advertised on the first page.
        self.max_pages = 0
        self.timeout = 10
        self.headers = {
            'Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': self.url.replace('/review_all', ''),
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': user_cookie
        }

    def get_svg_html(self):
        """Fetch the review page, its CSS file, and the three SVG glyph sheets.

        Fills self.html, self.max_pages, self.css, self.address_svg,
        self.tell_svg and self.review_svg.  Aborts with SystemExit when the
        site answers with its captcha/verification page.
        """
        index_res = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        self.html = index_res.text
        if '验证中心' in index_res.text:
            print('遇到验证码,程序退出!')
            # exit() is meant for interactive sessions and depends on the
            # site module; raise SystemExit directly instead (same effect).
            raise SystemExit

        # Extract the highest review page number (second-to-last pager link).
        tree = etree.HTML(self.html)
        self.max_pages = int(tree.xpath('//div[@class="reviews-pages"]/a/text()')[-2])

        # Locate the CSS file URL in the page HTML.
        # NOTE(review): this pattern (and the empty patterns in the methods
        # below) look truncated by the archive — the markup inside them was
        # stripped; restore them from the upstream repo before running.
        result = re.search('', self.html, re.S)
        if result:
            css_url = 'http:' + result.group(1)
            headers = {
                'Proxy-Connection': 'keep-alive',
                'Pragma': 'no-cache',
                'Cache-Control': 'no-cache',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7'
            }
            css_res = requests.get(css_url, headers=headers)
            print(f'css_url:{css_url}')
            self.css = css_res.text

            # SVG sheet for the shop address.
            result = re.search('bb\[class.*?background-image: url\((.*?)\);', self.css, re.S)
            address_svg_url = 'http:' + result.group(1)
            self.address_svg = requests.get(address_svg_url, headers=headers).text
            print(f'address_svg_url:{address_svg_url}')

            # SVG sheet for the phone number.
            result = re.search('cc\[class.*?background-image: url\((.*?)\);', self.css, re.S)
            tell_svg_url = 'http:' + result.group(1)
            self.tell_svg = requests.get(tell_svg_url, headers=headers).text
            print(f'tell_svg_url:{tell_svg_url}')

            # SVG sheet for review text.
            result = re.search('svgmtsi\[class.*?background-image: url\((.*?)\);', self.css, re.S)
            review_svg_url = 'http:' + result.group(1)
            self.review_svg = requests.get(review_svg_url, headers=headers).text
            print(f'review_svg_url:{review_svg_url}')

    def get_font_map(self):
        """Build the three class-name -> character decode tables.

        The two-letter CSS class prefixes change daily, so they are
        re-derived from the page HTML before the CSS rules are matched.

        Review SVG layout (per the original author's notes): one element per
        text row carries an id and a baseline like d="M0 Y H600"; a second
        element references that id via xlink:href="#id" and holds the row's
        characters.  The decoded character for a CSS class with offsets
        (-x, -y) is row[|x| // font_size] of the first row whose baseline
        Y >= |y|.
        """
        # NOTE(review): the three prefix patterns below look truncated by the
        # archive (markup stripped); restore them from the upstream repo.
        # Address class prefix.
        bb_result = re.search('', self.html, re.S)
        address_prefix = bb_result.group(1)[:2]

        # Phone class prefix.
        cc_result = re.search('', self.html, re.S)
        tell_prefix = cc_result.group(1)[:2]

        # Review class prefix.
        svgmtsi_result = re.search('', self.html, re.S)
        review_prefix = svgmtsi_result.group(1)[:2]

        # Every obfuscated character has a CSS rule shaped like
        # .<prefix>xxx{background:-Xpx -Ypx;}.
        address_class_list = re.findall('\.(%s.*?){background:(.*?)px (.*?)px;}' % address_prefix, self.css, re.S)
        tell_class_list = re.findall('\.(%s.*?){background:(.*?)px (.*?)px;}' % tell_prefix, self.css, re.S)
        review_class_list = re.findall('\.(%s.*?){background:(.*?)px (.*?)px;}' % review_prefix, self.css, re.S)

        # (id, baseline y) pairs and id -> row-of-characters mapping from the
        # review SVG sheet.
        review_svg_id_y_list = re.findall('', self.review_svg, re.S)
        review_svg_id_fonts_dc = dict(
            re.findall('(.*?)', self.review_svg, re.S))
        self.review_font_map = self.review_class_to_font(review_class_list, review_svg_id_y_list,
                                                         review_svg_id_fonts_dc)

        address_svg_y_words_list = re.findall('(.*?)', self.address_svg, re.S)
        self.address_font_map = self.address_class_to_font(address_class_list, address_svg_y_words_list)

        # The phone SVG stores per-character x offsets and one digit string.
        tell_svg_result = re.search('(.*?)', self.tell_svg, re.S)
        tell_x_list = tell_svg_result.group(1).split(' ')
        tell_words_str = tell_svg_result.group(2)
        tell_svg_x_words_list = list(zip(tell_x_list, list(tell_words_str)))
        self.tell_font_map = self.tell_class_to_num(tell_class_list, tell_svg_x_words_list)

        print(self.address_font_map)
        print(self.review_font_map)
        print(self.tell_font_map)

    def address_class_to_font(self, class_list, y_words_list):
        """Map address CSS classes to characters.

        For each class (name, -x, -y), pick the first SVG text row whose
        baseline y is >= |y|, then index that row's characters by
        |x| // font_size.
        """
        tmp_dc = dict()
        for class_name, class_x, class_y in class_list:
            for text_y, words in y_words_list:
                if int(text_y) >= abs(int(float(class_y))):
                    index = abs(int(float(class_x))) // self.font_size
                    tmp_dc[class_name] = words[index]
                    break
        return tmp_dc

    def review_class_to_font(self, class_list, id_y_list, words_dc):
        """Map review CSS classes to characters via the id-keyed SVG rows."""
        tmp_dc = dict()
        for class_name, class_x, class_y in class_list:
            for class_id, y in id_y_list:
                if int(y) >= abs(int(float(class_y))):
                    word_index = abs(int(float(class_x))) // self.font_size
                    tmp_dc[class_name] = words_dc[class_id][int(word_index)]
                    break
        return tmp_dc

    def tell_class_to_num(self, class_list, x_word_list):
        """Map phone-number CSS classes to digits by x-offset lookup."""
        tmp_dc = dict()
        for class_name, class_x, class_y in class_list:
            for x, word in x_word_list:
                if int(x) >= abs(int(float(class_x))):
                    tmp_dc[class_name] = word
                    break
        return tmp_dc

    def get_shop_info(self):
        """Substitute address/phone glyph classes in the HTML and print both."""
        # NOTE(review): the findall/sub patterns below look truncated by the
        # archive (markup stripped); restore them from the upstream repo.
        address_class_set = re.findall('', self.html, re.S)
        for class_name in address_class_set:
            self.html = re.sub(''.format(class_name), self.address_font_map[class_name], self.html)

        tell_class_set = re.findall('', self.html, re.S)
        for class_name in tell_class_set:
            self.html = re.sub(''.format(class_name), self.tell_font_map[class_name], self.html)

        tree = etree.HTML(self.html)
        shop_address = tree.xpath('//div[@class="address-info"]/text()')[0].replace(' ', '').replace('\n', '').replace(' ', '')
        shop_tell = tree.xpath('//div[@class="phone-info"]/text()')[0].replace(' ', '').replace('\n', '').replace(' ', '')
        print(f'地址:{shop_address}\n电话:{shop_tell}')

    def get_info(self):
        """Substitute review glyph classes, then print each review's fields."""
        # NOTE(review): the findall/sub patterns below look truncated by the
        # archive (markup stripped); restore them from the upstream repo.
        review_class_set = re.findall('', self.html, re.S)
        for class_name in review_class_set:
            self.html = re.sub(''.format(class_name), self.review_font_map[class_name],
                               self.html)

        tree = etree.HTML(self.html)
        for review_node in tree.xpath('//div[@class="main-review"]'):
            user_name = review_node.xpath('./div[@class="dper-info"]/a/text()')[0].strip()
            # The star rating is encoded in a class like "sml-str45" -> 4.5.
            star = int(
                re.search('sml-rank-stars sml-str(\d+) star',
                          review_node.xpath('./div[@class="review-rank"]/span[1]/@class')[0],
                          re.S).group(1)) / 10
            evaluation_list = [s.strip() for s in
                               review_node.xpath('./div[@class="review-rank"]/span[@class="score"]/span/text()')]
            # A fourth score entry, when present, is the per-person spend.
            if len(evaluation_list) > 3:
                consumption_per_person = evaluation_list[-1].replace('人均:', '')
                evaluation_list = evaluation_list[:3]
            else:
                consumption_per_person = '无'
            review = review_node.xpath('string(./div[@class="review-words Hide"])').replace('收起评价', '').strip().replace(' ', '').replace('⃣', '.').replace('\n', '')
            images_list = review_node.xpath('./div[@class="review-pictures"]/ul/li[@class="item"]/a/img/@data-big')
            review_time = review_node.xpath('./div[@class="misc-info clearfix"]/span[@class="time"]/text()')[0].strip()
            print('-------------------------------------')
            print(f'用户:{user_name}')
            print(f'星评:{star}')
            print(f'多维分数:{evaluation_list}')
            print(f'人均:{consumption_per_person}')
            print(f'评论:{review}')
            print(f'图片:{images_list}')
            print(f'评论时间:{review_time}')

    def run(self):
        """Full pipeline: fetch page + assets, build tables, print results."""
        self.get_svg_html()
        self.get_font_map()
        self.get_shop_info()
        self.get_info()


if __name__ == '__main__':
    url = 'http://www.dianping.com/shop/G9TSD2JvdLtA7fdm/review_all'
    user_cookie = ''
    dz = DaZhongDianPing(url, user_cookie)
    dz.run()
# --------------------------------------------------------------------------------
# /dzdp_font_encryption.py:
# --------------------------------------------------------------------------------
import requests
from lxml import etree
import re
from fontTools.ttLib import TTFont
from PIL import Image, ImageDraw, ImageFont
import pytesseract
import numpy
import os
from woff2tff import woff_to_ttf


class DaZhongDianPing:
    """Decode the WOFF font obfuscation on a Dianping shop-search page.

    Downloads the page and its font CSS, fetches every referenced .woff
    file, converts each to .ttf, OCRs the glyphs into a code-point ->
    character table with pytesseract, substitutes the decoded characters
    back into the HTML and prints the shop listings.
    """

    def __init__(self):
        self.url = "http://www.dianping.com/shenzhen/ch10/g117"
        # Page HTML (filled in by get_woffs()).
        self.html = None
        # Text of the CSS file referenced by the page.
        self.css = None
        # font-family name -> local font file path (.woff, later .ttf).
        self.woff_dc = dict()
        # Decode tables: key = "&#x…;" entity, value = decoded character.
        self.address_font_map = dict()
        self.shop_num_font_map = dict()
        self.tag_name_font_map = dict()
        self.referer = self.url.replace('/review_all', '')
        self.timeout = 10
        self.headers = {
            'Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': ''
        }

    def get_woffs(self):
        """Fetch the page and its font CSS, then download every WOFF file.

        Fills self.html, self.css and self.woff_dc (font-family name ->
        local .woff path under ./woff_file/).
        """
        html_res = requests.get(self.url, headers=self.headers)
        self.html = html_res.text
        # NOTE(review): this pattern looks truncated by the archive (the
        # markup inside it was stripped); restore it from the upstream repo
        # before running.
        result = re.search('', self.html, re.S)

        if result:
            css_url = 'http://s3plus' + result.group(1)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
            }
            css_res = requests.get(css_url, headers=headers)
            print(css_url)
            self.css = css_res.text

            # Each @font-face rule maps a family name to a .woff URL.
            result = re.findall('@font-face\{font-family: "(.*?)";.*?,url\("(.*?)"\);\}', self.css)
            print(result)

            self.woff_dc = dict(result)
            if not os.path.exists('woff_file'):
                os.mkdir('woff_file')

            for woff_url in result:
                url = 'http:' + woff_url[1]
                res = requests.get(url, headers=headers)
                filename = woff_url[1].split('/')[-1]
                # BUG FIX: the archived literal was corrupted; the intended
                # path uses the basename extracted just above.
                filepath = f'./woff_file/{filename}'
                with open(filepath, 'wb') as f:
                    f.write(res.content)
                self.woff_dc[woff_url[0]] = filepath
            print(self.woff_dc)

    def get_woff_2_ttf(self):
        """Convert every downloaded .woff to .ttf and update woff_dc paths."""
        tmp_dc = self.woff_dc
        for key in tmp_dc:
            woff_path = tmp_dc[key]
            ttf_filepath = woff_path.replace('.woff', '.ttf')
            woff_to_ttf([woff_path, ttf_filepath])
            self.woff_dc[key] = ttf_filepath
        print(self.woff_dc)

    def fontConvert(self, fontPath):
        """OCR a TTF font into a {"&#x…;": character} decode table.

        Renders all glyphs onto one image with Pillow, recognises them with
        pytesseract (chi_sim), and zips the recognised characters back onto
        the glyph code points.
        """
        fonts = TTFont(fontPath)
        # Skip the first two glyph names (presumably placeholder glyphs such
        # as .notdef — TODO confirm against the actual font files).
        codeList = fonts.getGlyphOrder()[2:]
        im = Image.new("RGB", (1800, 1000), (255, 255, 255))
        dr = ImageDraw.Draw(im)
        font = ImageFont.truetype(font=os.path.abspath(fontPath), size=40)
        # Draw the glyphs in 18 rows, 50px apart.
        count = 18
        arrayList = numpy.array_split(codeList, count)
        for t in range(count):
            # "uniE123" -> "\\uE123" -> the actual private-use character.
            newList = [i.replace("uni", "\\u") for i in arrayList[t]]
            text = "".join(newList)
            text = text.encode('utf-8').decode('unicode_escape')
            dr.text((0, 50 * t), text, font=font, fill="#000000")
        images_name = 'font.jpg'
        im.save(images_name)
        im = Image.open(images_name)
        result = pytesseract.image_to_string(im, lang="chi_sim")
        result = result.replace(" ", "").replace("\n", "")
        # "uniE123" -> "&#xE123;" so keys match the HTML entities.
        codeList = [i.replace("uni", "&#x") + ";" for i in codeList]
        return dict(zip(codeList, list(result)))

    def get_font_map(self):
        """Build one decode table per font family referenced by the page."""
        for key in self.woff_dc:
            if 'shopNum' in key:
                self.shop_num_font_map = self.fontConvert(self.woff_dc[key])
            elif 'address' in key:
                self.address_font_map = self.fontConvert(self.woff_dc[key])
            elif 'tagName' in key:
                self.tag_name_font_map = self.fontConvert(self.woff_dc[key])

    def get_shop_info(self):
        """Substitute obfuscated entities in the HTML and print shop fields."""
        # NOTE(review): the findall patterns below look truncated by the
        # archive (markup stripped); restore them from the upstream repo.
        shopNum_res = re.findall('(.*?)', self.html, re.S)
        for i in shopNum_res:
            self.html = re.sub('{}'.format(i), self.shop_num_font_map[i], self.html)

        address_res = re.findall('(.*?)', self.html, re.S)
        for i in address_res:
            self.html = re.sub('{}'.format(i), self.address_font_map[i], self.html)

        tagName = re.findall('(.*?)', self.html, re.S)
        for i in tagName:
            self.html = re.sub('{}'.format(i), self.tag_name_font_map[i], self.html)

        tree = etree.HTML(self.html)
        shop_title_list = tree.xpath('//div[@class="tit"]/a/h4/text()')
        shop_star_score = tree.xpath('//div[@class="comment"]/div/div[2]/text()')
        shop_review_nums = tree.xpath('//div[@class="comment"]/a[1]/b/text()')
        shop_mean_price = tree.xpath('//div[@class="comment"]/a[2]/b/text()')
        shop_tag = tree.xpath('//div[@class="tag-addr"]/a[1]/span/text()')
        shop_address_tag = tree.xpath('//div[@class="tag-addr"]/a[2]/span/text()')
        shop_adress_des = tree.xpath('//div[@class="tag-addr"]/span/text()')
        shop_taste_score = tree.xpath('//span[@class="comment-list"]/span[1]/b/text()')
        shop_environment_score = tree.xpath('//span[@class="comment-list"]/span[2]/b/text()')
        shop_server_score = tree.xpath('//span[@class="comment-list"]/span[3]/b/text()')
        shop_recommend_dishes = tree.xpath('//div[@class="recommend"]/a/text()')

        print(shop_title_list)
        print(shop_star_score)
        print(shop_review_nums)
        print(shop_mean_price)
        print(shop_tag)
        print(shop_address_tag)
        print(shop_adress_des)
        print(shop_taste_score)
        print(shop_environment_score)
        print(shop_server_score)
        print(shop_recommend_dishes)

    def run(self):
        """Full pipeline: fetch fonts, convert, OCR, decode, print."""
        self.get_woffs()
        self.get_woff_2_ttf()
        self.get_font_map()
        self.get_shop_info()


if __name__ == '__main__':
    dz = DaZhongDianPing()
    dz.run()
# --------------------------------------------------------------------------------
# /images/css_map_result.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/naiveliberty/DaZhongDianPing/63062c672271000faffdac82c3366ca87f681318/images/css_map_result.png
# --------------------------------------------------------------------------------
# /images/font.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/naiveliberty/DaZhongDianPing/63062c672271000faffdac82c3366ca87f681318/images/font.png
# --------------------------------------------------------------------------------
# /images/font_encryption_result.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/naiveliberty/DaZhongDianPing/63062c672271000faffdac82c3366ca87f681318/images/font_encryption_result.png
# --------------------------------------------------------------------------------
# /requirements.txt:
# --------------------------------------------------------------------------------
# lxml==4.5.0
# numpy==1.18.2
# Pillow==7.1.1
# pytesseract==0.3.4
# requests==2.23.0
# fonttools==4.9.0
# --------------------------------------------------------------------------------
# /woff2tff.py:
# --------------------------------------------------------------------------------
import struct
import sys
import zlib


def convert_streams(infile, outfile):
    """Convert WOFF data read from *infile* into SFNT (TTF/OTF) data on *outfile*.

    Parses the 44-byte WOFF header and table directory, writes the SFNT
    offset table and directory, then copies each table (inflating it with
    zlib when compLength != origLength), 4-byte aligning table starts.

    :param infile: binary file-like object positioned at the WOFF start
    :param outfile: seekable binary file-like object to receive SFNT data
    :raises zlib.error: if a compressed table is corrupt
    """
    # WOFF header: 13 big-endian fields, 44 bytes total (W3C WOFF 1.0).
    header_fields = ('signature', 'flavor', 'length', 'numTables', 'reserved',
                     'totalSfntSize', 'majorVersion', 'minorVersion',
                     'metaOffset', 'metaLength', 'metaOrigLength',
                     'privOffset', 'privLength')
    WOFFHeader = dict(zip(header_fields,
                          struct.unpack(">IIIHHIHHIIIII", infile.read(44))))

    # SFNT offset table: sfntVersion, numTables, then the binary-search
    # helper fields required by the spec.
    outfile.write(struct.pack(">I", WOFFHeader['flavor']))
    outfile.write(struct.pack(">H", WOFFHeader['numTables']))
    # entrySelector = floor(log2(numTables)); searchRange = 2^entrySelector * 16.
    entrySelector = WOFFHeader['numTables'].bit_length() - 1
    searchRange = (1 << entrySelector) * 16
    outfile.write(struct.pack(">H", searchRange))
    outfile.write(struct.pack(">H", entrySelector))
    rangeShift = WOFFHeader['numTables'] * 16 - searchRange
    outfile.write(struct.pack(">H", rangeShift))

    offset = outfile.tell()

    # Read the WOFF table directory (20 bytes per entry) and account for the
    # SFNT directory size (16 bytes per entry) to find where table data starts.
    TableDirectoryEntries = []
    for _ in range(WOFFHeader['numTables']):
        TableDirectoryEntries.append(dict(zip(
            ('tag', 'offset', 'compLength', 'origLength', 'origChecksum'),
            struct.unpack(">IIIII", infile.read(20)))))
        offset += 4 * 4

    # Write the SFNT table directory, assigning each table a 4-byte-aligned
    # output offset.
    for TableDirectoryEntry in TableDirectoryEntries:
        outfile.write(struct.pack(">I", TableDirectoryEntry['tag']))
        outfile.write(struct.pack(">I", TableDirectoryEntry['origChecksum']))
        outfile.write(struct.pack(">I", offset))
        outfile.write(struct.pack(">I", TableDirectoryEntry['origLength']))
        TableDirectoryEntry['outOffset'] = offset
        offset += TableDirectoryEntry['origLength']
        if offset % 4:
            offset += 4 - (offset % 4)

    # Copy (and inflate where needed) each table into place.
    for TableDirectoryEntry in TableDirectoryEntries:
        infile.seek(TableDirectoryEntry['offset'])
        compressedData = infile.read(TableDirectoryEntry['compLength'])
        # Per the WOFF spec, equal lengths mean the table is stored raw.
        if TableDirectoryEntry['compLength'] != TableDirectoryEntry['origLength']:
            uncompressedData = zlib.decompress(compressedData)
        else:
            uncompressedData = compressedData
        outfile.seek(TableDirectoryEntry['outOffset'])
        outfile.write(uncompressedData)
        # Zero-pad to the next 4-byte boundary.
        offset = TableDirectoryEntry['outOffset'] + TableDirectoryEntry['origLength']
        padding = 0
        if offset % 4:
            padding = 4 - (offset % 4)
        outfile.write(bytearray(padding))


def convert(infilename, outfilename):
    """Convert the WOFF file *infilename* into the SFNT file *outfilename*."""
    with open(infilename, mode='rb') as infile:
        with open(outfilename, mode='wb') as outfile:
            convert_streams(infile, outfile)


def main(argv):
    """CLI entry point: woff2otf.py source.woff [target.otf]."""
    if len(argv) == 1 or len(argv) > 3:
        print('I convert *.woff files to *.otf files. (one at a time :)\n'
              'Usage: woff2otf.py web_font.woff [converted_filename.otf]\n'
              'If the target file name is omitted, it will be guessed. Have fun!\n')
        return

    source_file_name = argv[1]
    if len(argv) == 3:
        target_file_name = argv[2]
    else:
        # rsplit keeps the full stem for dotted names ("my.font.woff").
        target_file_name = source_file_name.rsplit('.', 1)[0] + '.otf'

    convert(source_file_name, target_file_name)
    return 0


def woff_to_ttf(argv):
    """Programmatic entry point: argv = [source_woff_path, target_ttf_path]."""
    source_file_name = argv[0]
    target_file_name = argv[1]
    convert(source_file_name, target_file_name)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))