├── .gitignore
├── LICENSE
├── README.md
├── dzdp_css_map.py
├── dzdp_font_encryption.py
├── images
├── css_map_result.png
├── font.png
└── font_encryption_result.png
├── requirements.txt
└── woff2tff.py
/.gitignore:
--------------------------------------------------------------------------------
1 | woff_file/*
2 |
3 | *.jpg
4 |
5 | __pycache__
6 |
7 | .idea
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 naiveliberty
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 大众点评信息爬取
2 |
3 | **声明:项目内容不得用于商业用途,仅做学习交流,如果侵犯了您的利益和权益,请邮箱联系我,我将删除该项目。**
4 |
5 | **声明:项目内容不得用于商业用途,仅做学习交流,如果侵犯了您的利益和权益,请邮箱联系我,我将删除该项目。**
6 |
7 | **声明:项目内容不得用于商业用途,仅做学习交流,如果侵犯了您的利益和权益,请邮箱联系我,我将删除该项目。**
8 |
9 | | 作者 | 邮箱 |
10 | | ------- | -------------------- |
11 | | liberty | fthemuse@foxmail.com |
12 |
13 | ------
14 |
15 | ## 1. 项目介绍
16 |
17 | **该项目只对大众点评字体加密做了处理**
18 |
19 | 字体加密分以下两种:
20 |
21 | 1. CSS 字体映射(评论详情页:http://www.dianping.com/shop/G9TSD2JvdLtA7fdm/review_all);
22 | 2. WOFF 字体加密(店铺搜索页面:http://www.dianping.com/shenzhen/ch10/g117);
23 |
24 |
25 |
26 | 项目中使用了 `woff2tff.py` 将 woff 字体文件转为 ttf 字体文件,该文件来自:https://github.com/hanikesn/woff2otf
27 |
28 | 通过 fontTools 库读取 ttf 字体文件中的字形编码,再用 Pillow 将这些字形绘制成如下图片,之后使用 pytesseract 识别图片中的汉字,动态生成码表
29 |
30 | ![font](images/font.png)
31 |
32 |
33 |
34 | ## 2. 效果展示
35 |
36 | ### 2.1 CSS 字体映射
37 |
38 | `dzdp_css_map.py`
39 |
40 | ![css_map_result](images/css_map_result.png)
41 |
42 |
43 |
44 | ### 2.2 WOFF 字体加密
45 |
46 | `dzdp_font_encryption.py`
47 |
48 | ![font_encryption_result](images/font_encryption_result.png)
49 |
50 |
51 |
52 | ## 3. 环境依赖
53 |
54 | ### 3.1 python
55 |
56 | ```
57 | pip3 install -r requirements.txt -i https://pypi.douban.com/simple
58 | ```
59 |
60 |
61 |
62 | ### 3.2 pytesseract
63 |
64 | - Tesseract-OCR 安装包下载地址:https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-v5.0.0-alpha.20200328.exe
65 | - 安装完成后,新增系统变量: `TESSDATA_PREFIX: C:\Program Files\Tesseract-OCR` (安装目录)
66 | - 修改 `当前 python 解释器所在目录\Lib\site-packages\pytesseract\pytesseract.py` 中的 `tesseract_cmd` 变量值为 `tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'`
67 | - 简体中文训练集下载地址:https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.00/chi_sim.traineddata ,下载后拷贝到 `C:\Program Files\Tesseract-OCR\tessdata` 目录下
68 |
69 |
70 |
71 | ## 4. 分析过程
72 |
73 | 详见:`https://blog.csdn.net/saberqqq/article/details/105977645`
--------------------------------------------------------------------------------
/dzdp_css_map.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import re
3 | import time
4 | from lxml import etree
5 | import random
6 |
7 |
8 | class DaZhongDianPing:
def __init__(self, shop_review_url, user_cookie):
    """
    Remember the review page to scrape and prepare empty scrape state.

    :param shop_review_url: URL of the shop's review-detail page.
    :param user_cookie: string sent unchanged as the ``Cookie`` request header.
    """
    # Request settings: target URL, timeout and the headers sent with the page request.
    self.url = shop_review_url
    self.timeout = 10
    self.headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # The referer is the review URL with the '/review_all' part removed.
        'Referer': self.url.replace('/review_all', ''),
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': user_cookie,
    }

    # Font size of the page; an x offset divided by it gives a character index.
    self.font_size = 14
    # Highest page number of the shop's reviews.
    self.max_pages = 0

    # Source of the review page and of the CSS file it references.
    self.html = ''
    self.css = ''
    # Contents of the SVG files used for the address, the phone number and the reviews.
    self.address_svg = ''
    self.tell_svg = ''
    self.review_svg = ''

    # Code tables: key is a CSS class name, value is the character it stands for.
    self.address_font_map = {}
    self.tell_font_map = {}
    self.review_font_map = {}
44 |
def get_svg_html(self):
    """
    Download the review page, the CSS file it references and the three SVG
    files (address, phone number, reviews) that the CSS points to.

    Results are stored on ``self.html``, ``self.max_pages``, ``self.css``,
    ``self.address_svg``, ``self.tell_svg`` and ``self.review_svg``.
    :return:
    """
    # Fetch the shop's review page
    index_res = requests.get(self.url, headers=self.headers, timeout=self.timeout)
    self.html = index_res.text
    # This marker text is treated as a captcha page (see the message printed below); stop the program.
    if '验证中心' in index_res.text:
        print('遇到验证码,程序退出!')
        exit()

    # Extract the maximum page number: the second-to-last link text in the pager
    # (presumably the last link is a "next page" control — confirm against the page)
    tree = etree.HTML(self.html)
    self.max_pages = int(tree.xpath('//div[@class="reviews-pages"]/a/text()')[-2])

    # Regex-match the css file url
    # NOTE(review): the pattern literal is empty as shown. An empty pattern always
    # matches and defines no group, so result.group(1) below raises IndexError.
    # Confirm the intended pattern (it should capture the stylesheet URL).
    result = re.search('', self.html, re.S)
    if result:
        css_url = 'http:' + result.group(1)
        # Separate header set (no Cookie/Referer) used for the css and svg requests
        headers = {
            'Proxy-Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7'
        }
        css_res = requests.get(css_url, headers=headers)
        print(f'css_url:{css_url}')
        self.css = css_res.text

        # Regex-match the url of the svg file used for the shop address
        # (the url is protocol-relative in the css, hence the 'http:' prefix)
        result = re.search('bb\[class.*?background-image: url\((.*?)\);', self.css, re.S)
        address_svg_url = 'http:' + result.group(1)
        self.address_svg = requests.get(address_svg_url, headers=headers).text
        print(f'address_svg_url:{address_svg_url}')

        # Regex-match the url of the svg file used for the shop phone number
        result = re.search('cc\[class.*?background-image: url\((.*?)\);', self.css, re.S)
        tell_svg_url = 'http:' + result.group(1)
        self.tell_svg = requests.get(tell_svg_url, headers=headers).text
        print(f'tell_svg_url:{tell_svg_url}')

        # Regex-match the url of the svg file used for the reviews
        result = re.search('svgmtsi\[class.*?background-image: url\((.*?)\);', self.css, re.S)
        review_svg_url = 'http:' + result.group(1)
        self.review_svg = requests.get(review_svg_url, headers=headers).text
        print(f'review_svg_url:{review_svg_url}')
95 |
def get_font_map(self):
    """
    Build the three code tables (address, phone number, reviews) from the
    page HTML, the CSS file and the SVG files fetched earlier, store them on
    ``self.address_font_map``, ``self.tell_font_map`` and
    ``self.review_font_map``, and print them.
    """
    # Address css style
    # Phone css style
    # Review css style
    # xxx changes every day, so the matching prefix is detected dynamically

    # NOTE(review): the three patterns below are empty as shown. An empty pattern
    # defines no group, so each .group(1) raises IndexError. Confirm the intended
    # patterns (each should capture a class name from the page markup).

    # Address css prefix (first two characters of the captured class name)
    bb_result = re.search('', self.html, re.S)
    address_prefix = bb_result.group(1)[:2]

    # Phone css prefix
    cc_result = re.search('', self.html, re.S)
    tell_prefix = cc_result.group(1)[:2]

    # Review css prefix
    svgmtsi_result = re.search('', self.html, re.S)
    review_prefix = svgmtsi_result.group(1)[:2]

    """

    :return:
    """

    # Match css rules of the form .(css prefix.*?){background:(.*?)px (.*?)px;}
    # to collect (class name, x offset, y offset) for every obfuscated character
    address_class_list = re.findall('\.(%s.*?){background:(.*?)px (.*?)px;}' % address_prefix, self.css, re.S)
    tell_class_list = re.findall('\.(%s.*?){background:(.*?)px (.*?)px;}' % tell_prefix, self.css, re.S)
    review_class_list = re.findall('\.(%s.*?){background:(.*?)px (.*?)px;}' % review_prefix, self.css, re.S)

    # English gloss of the string statement below (kept verbatim; its markup examples
    # appear to be missing): the review svg contains path entries whose id matches
    # xlink:href="#(\d+)" and whose d="M0 (\d+) H600" value is the y offset used in
    # `background`; it also contains text entries keyed by that id holding a string of
    # characters. Decoded character = string[abs(x in the css rule) / font size].
    """
    匹配评论 svg 文件中格式为 的字段
    其中 id 的值对应 xlink:href="#(\d+)" 的值
    d="M0 (\d+) H600" 的值对应 background中 y轴的偏移量

    匹配评论 svg 文件中格式为 (.*?) 的字段
    (\d+) 对应为 css id 选择器,对应上面 中的 id
    (.*?) 对应一串中文字符串,
    还原后的字符 = 中文字符串[css 样式上中 x 的绝对值 / 字体大小]
    """
    # NOTE(review): the svg patterns below are empty or reduced to '(.*?)' as shown;
    # they cannot yield the (id, y) / (id, text) / (y, text) pairs the helpers unpack.
    # Confirm the intended patterns.
    review_svg_id_y_list = re.findall('', self.review_svg, re.S)
    review_svg_id_fonts_dc = dict(
        re.findall('(.*?)', self.review_svg, re.S))
    self.review_font_map = self.review_class_to_font(review_class_list, review_svg_id_y_list,
                                                     review_svg_id_fonts_dc)

    address_svg_y_words_list = re.findall('(.*?)', self.address_svg, re.S)
    self.address_font_map = self.address_class_to_font(address_class_list, address_svg_y_words_list)

    # Phone svg: group(1) is a space-separated list of x positions, group(2) the digits;
    # zip pairs each x position with the digit drawn there
    tell_svg_result = re.search('(.*?)', self.tell_svg, re.S)
    tell_x_list = tell_svg_result.group(1).split(' ')
    tell_words_str = tell_svg_result.group(2)
    tell_svg_x_words_list = list(zip(tell_x_list, list(tell_words_str)))
    self.tell_font_map = self.tell_class_to_num(tell_class_list, tell_svg_x_words_list)

    print(self.address_font_map)
    print(self.review_font_map)
    print(self.tell_font_map)
152 |
def address_class_to_font(self, class_list, y_words_list):
    """
    Decode address CSS classes into characters.

    :param class_list: iterable of ``(class name, x offset, y offset)`` taken from the css rules.
    :param y_words_list: iterable of ``(y, text)`` rows taken from the address svg, in file order.
    :return: dict mapping each class name that found a row to its character.
    """
    not_found = object()
    decoded = {}
    for name, x_px, y_px in class_list:
        # The first row whose y reaches the class's absolute y offset holds the character.
        row_text = next(
            (text for row_y, text in y_words_list
             if int(row_y) >= abs(int(float(y_px)))),
            not_found,
        )
        if row_text is not_found:
            continue
        # The column inside the row is the absolute x offset in units of the font size.
        column = abs(int(float(x_px))) // self.font_size
        decoded[name] = row_text[column]
    return decoded
162 |
def review_class_to_font(self, class_list, id_y_list, words_dc):
    """
    Decode review CSS classes into characters.

    :param class_list: iterable of ``(class name, x offset, y offset)`` taken from the css rules.
    :param id_y_list: iterable of ``(id, y)`` pairs taken from the review svg, in file order.
    :param words_dc: dict mapping an svg id to the string of characters on that row.
    :return: dict mapping each class name that found a row to its character.
    """
    resolved = {}
    for css_class, offset_x, offset_y in class_list:
        for row_id, row_y in id_y_list:
            # Skip rows that lie above the class's absolute y offset.
            if int(row_y) < abs(int(float(offset_y))):
                continue
            position = abs(int(float(offset_x))) // self.font_size
            resolved[css_class] = words_dc[row_id][int(position)]
            break
    return resolved
172 |
def tell_class_to_num(self, class_list, x_word_list):
    """
    Decode phone-number CSS classes into digits.

    :param class_list: iterable of ``(class name, x offset, y offset)`` taken from the css rules.
    :param x_word_list: iterable of ``(x, character)`` pairs taken from the phone svg, in file order.
    :return: dict mapping each class name that found a position to its character.
    """
    not_found = object()
    digits = {}
    for name, x_px, _y_px in class_list:
        # The first svg position that reaches the class's absolute x offset wins.
        hit = next(
            (char for pos_x, char in x_word_list
             if int(pos_x) >= abs(int(float(x_px)))),
            not_found,
        )
        if hit is not not_found:
            digits[name] = hit
    return digits
181 |
def get_shop_info(self):
    """
    Replace the obfuscated address and phone markup in ``self.html`` with the
    decoded characters, then extract and print the shop address and phone.
    """
    # NOTE(review): the patterns passed to re.findall / re.sub in this method are
    # empty as shown. re.findall('') yields empty strings, so the code-table lookup
    # below is done with '' as the key. Confirm the intended patterns (they should
    # capture, and then replace, one obfuscated class element at a time).

    # Replace the obfuscated address class markup in self.html with the matching Chinese characters
    address_class_set = re.findall('', self.html, re.S)
    for class_name in address_class_set:
        self.html = re.sub(''.format(class_name), self.address_font_map[class_name], self.html)

    # Replace the obfuscated phone-number class markup in self.html with the matching digits
    tell_class_set = re.findall('', self.html, re.S)
    for class_name in tell_class_set:
        self.html = re.sub(''.format(class_name), self.tell_font_map[class_name], self.html)

    # Re-parse the rewritten HTML and read the first text node of each info block,
    # removing spaces and newlines
    tree = etree.HTML(self.html)
    shop_address = tree.xpath('//div[@class="address-info"]/text()')[0].replace(' ', '').replace('\n',
                                                                                                 '').replace(
        ' ', '')
    shop_tell = tree.xpath('//div[@class="phone-info"]/text()')[0].replace(' ', '').replace('\n', '').replace(
        ' ', '')
    print(f'地址:{shop_address}\n电话:{shop_tell}')
200 |
def get_info(self):
    """
    Replace the obfuscated review markup in ``self.html`` with the decoded
    characters, then print every review found on the page.
    """
    # Replace the obfuscated review class markup in self.html with the matching Chinese characters
    # NOTE(review): both patterns below are empty as shown, so the lookup in
    # self.review_font_map is done with '' as the key. Confirm the intended patterns.
    review_class_set = re.findall('', self.html, re.S)
    for class_name in review_class_set:
        self.html = re.sub(''.format(class_name), self.review_font_map[class_name],
                           self.html)

    tree = etree.HTML(self.html)
    for i in tree.xpath('//div[@class="main-review"]'):
        user_name = i.xpath('./div[@class="dper-info"]/a/text()')[0].strip()
        # The star rating is the number in the "sml-str<N>" class divided by 10
        star = int(
            re.search('sml-rank-stars sml-str(\d+) star', i.xpath('./div[@class="review-rank"]/span[1]/@class')[0],
                      re.S).group(1)) / 10
        # The comprehension reuses the name `i`; the xpath call is evaluated on the outer
        # review node before the comprehension's own `i` is bound, and the outer `i`
        # is unaffected afterwards
        evaluation_list = [i.strip() for i in
                           i.xpath('./div[@class="review-rank"]/span[@class="score"]/span/text()')]
        # With more than three score entries, the last one is the per-person spend
        # and the first three are kept as the scores
        if len(evaluation_list) > 3:
            consumption_per_person = evaluation_list[-1].replace('人均:', '')
            evaluation_list = evaluation_list[:3]
        else:
            consumption_per_person = '无'
        # Full review text with the collapse-button label, spaces and newlines removed
        review = i.xpath('string(./div[@class="review-words Hide"])').replace('收起评价', '').strip().replace(' ',
                                                                                                        '').replace(
            '⃣', '.').replace('\n', '')
        images_list = i.xpath('./div[@class="review-pictures"]/ul/li[@class="item"]/a/img/@data-big')
        review_time = i.xpath('./div[@class="misc-info clearfix"]/span[@class="time"]/text()')[0].strip()
        print('-------------------------------------')
        print(f'用户:{user_name}')
        print(f'星评:{star}')
        print(f'多维分数:{evaluation_list}')
        print(f'人均:{consumption_per_person}')
        print(f'评论:{review}')
        print(f'图片:{images_list}')
        print(f'评论时间:{review_time}')
234 |
def run(self):
    """Run the whole scrape: fetch the assets, build the code tables, then print shop info and reviews."""
    # Each step depends on state stored by the previous one, so the order is fixed.
    for step_name in ('get_svg_html', 'get_font_map', 'get_shop_info', 'get_info'):
        getattr(self, step_name)()
240 |
241 |
if __name__ == '__main__':
    # Review page to scrape and the cookie string sent with the page request.
    review_page = 'http://www.dianping.com/shop/G9TSD2JvdLtA7fdm/review_all'
    cookie = ''
    DaZhongDianPing(review_page, cookie).run()
247 |
--------------------------------------------------------------------------------
/dzdp_font_encryption.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import etree
3 | import re
4 | from fontTools.ttLib import TTFont
5 | from PIL import Image, ImageDraw, ImageFont
6 | import pytesseract
7 | import numpy
8 | import os
9 | from woff2tff import woff_to_ttf
10 |
11 |
12 | class DaZhongDianPing():
def __init__(self):
    """Set up the search-page URL, the request headers and empty scrape state."""
    # Request settings.
    self.url = "http://www.dianping.com/shenzhen/ch10/g117"
    self.referer = self.url.replace('/review_all', '')
    self.timeout = 10
    self.headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '',
    }

    # Page html and the css file it references; both are unset until fetched.
    self.html = None
    self.css = None

    # Font name -> font file path, filled in by the download and conversion steps.
    self.woff_dc = {}
    # One code table per obfuscated font.
    self.address_font_map = {}
    self.shop_num_font_map = {}
    self.tag_name_font_map = {}
35 |
def get_woffs(self):
    """
    Download the search page, the css file it references and every font file
    declared in that css.

    Font files are written to ``./woff_file/`` and ``self.woff_dc`` ends up
    mapping each font-family name to the local file path.
    """
    html_res = requests.get(self.url, headers=self.headers)
    self.html = html_res.text
    # NOTE(review): the pattern literal is empty as shown. An empty pattern always
    # matches and defines no group, so result.group(1) below raises IndexError.
    # Confirm the intended pattern (it should capture the stylesheet URL after
    # its 's3plus' prefix).
    result = re.search('', self.html, re.S)

    if result:
        css_url = 'http://s3plus' + result.group(1)
        # Only a User-Agent is sent for the css and font requests
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
        }
        css_res = requests.get(css_url, headers=headers)
        print(css_url)
        self.css = css_res.text

        # Each match is (font-family name, font url) taken from an @font-face rule
        result = re.findall('@font-face\{font-family: "(.*?)";.*?,url\("(.*?)"\);\}', self.css)
        print(result)

        self.woff_dc = dict(result)
        if not os.path.exists('woff_file'):
            os.mkdir('woff_file')

        for woff_url in result:
            # The url in the css is protocol-relative, hence the 'http:' prefix
            url = 'http:' + woff_url[1]
            res = requests.get(url, headers=headers)
            filename = woff_url[1].split('/')[-1]
            filepath = f'./woff_file/{filename}'
            with open(filepath, 'wb') as f:
                f.write(res.content)
            # Replace the remote url with the local path of the downloaded file
            self.woff_dc[woff_url[0]] = filepath
        print(self.woff_dc)
66 |
def get_woff_2_ttf(self):
    """Convert every font file listed in ``self.woff_dc`` to ttf and store the new paths there."""
    # Only values of existing keys are reassigned, so iterating the dict meanwhile is safe.
    for font_name, woff_path in self.woff_dc.items():
        ttf_path = woff_path.replace('.woff', '.ttf')
        woff_to_ttf([woff_path, ttf_path])
        self.woff_dc[font_name] = ttf_path
    print(self.woff_dc)
75 |
def fontConvert(self, fontPath):
    """
    Build a code table for one ttf font by drawing its glyphs into an image
    and running OCR over that image.

    :param fontPath: path of the ttf font file.
    :return: dict mapping each glyph code to the character OCR recognised for it.
    """
    fonts = TTFont(fontPath)
    # Glyph names in font order, without the first two entries
    # (presumably placeholder glyphs such as .notdef — confirm against the font)
    codeList = fonts.getGlyphOrder()[2:]
    im = Image.new("RGB", (1800, 1000), (255, 255, 255))
    dr = ImageDraw.Draw(im)
    font = ImageFont.truetype(font=os.path.abspath(fontPath), size=40)
    # Draw the glyphs in 18 rows, 50 px apart
    count = 18
    arrayList = numpy.array_split(codeList, count)
    for t in range(count):
        # Turn glyph names like "uniXXXX" into "\uXXXX" escapes, then decode them
        # into the actual characters so they can be drawn with this font
        newList = [i.replace("uni", "\\u") for i in arrayList[t]]
        text = "".join(newList)
        text = text.encode('utf-8').decode('unicode_escape')
        dr.text((0, 50 * t), text, font=font, fill="#000000")
    # The image is saved to disk and reopened before OCR
    images_name = 'font.jpg'
    im.save(images_name)
    im = Image.open(images_name)
    result = pytesseract.image_to_string(im, lang="chi_sim")
    result = result.replace(" ", "").replace("\n", "")
    # NOTE(review): as shown, "uni" is replaced with an empty string, giving keys
    # like "XXXX;". The replacement literal may be missing a prefix — confirm
    # against the keys looked up in get_shop_info.
    codeList = [i.replace("uni", "") + ";" for i in codeList]
    # Pair codes and recognised characters by position; zip silently truncates
    # if OCR returned a different number of characters than there are glyphs
    return dict(zip(codeList, list(result)))
96 |
def get_font_map(self):
    """Build the code table of every known font in ``self.woff_dc`` via ``fontConvert``."""
    # Marker found in the font name -> attribute that receives the code table.
    # Checked in this order; the first marker that matches wins.
    targets = (
        ('shopNum', 'shop_num_font_map'),
        ('address', 'address_font_map'),
        ('tagName', 'tag_name_font_map'),
    )
    for font_name in self.woff_dc:
        for marker, attribute in targets:
            if marker in font_name:
                setattr(self, attribute, self.fontConvert(self.woff_dc[font_name]))
                break
105 |
def get_shop_info(self):
    """
    Replace the obfuscated shop-number, address and tag-name codes in
    ``self.html`` with the decoded characters, then extract and print the
    fields of every shop on the search page.
    """
    # NOTE(review): the findall patterns below are reduced to '(.*?)' and the
    # re.sub patterns to '{}' as shown, so they match empty strings rather than
    # the obfuscated elements. Confirm the intended patterns (each should capture
    # the code inside one kind of obfuscated element).
    shopNum_res = re.findall('(.*?)', self.html, re.S)
    for i in shopNum_res:
        self.html = re.sub('{}'.format(i), self.shop_num_font_map[i], self.html)

    address_res = re.findall('(.*?)', self.html, re.S)
    for i in address_res:
        self.html = re.sub('{}'.format(i), self.address_font_map[i], self.html)

    tagName = re.findall('(.*?)', self.html, re.S)
    for i in tagName:
        self.html = re.sub('{}'.format(i), self.tag_name_font_map[i], self.html)

    # Re-parse the rewritten HTML; every xpath below returns one flat list
    # covering all shops on the page
    tree = etree.HTML(self.html)
    shop_title_list = tree.xpath('//div[@class="tit"]/a/h4/text()')
    shop_star_score = tree.xpath('//div[@class="comment"]/div/div[2]/text()')
    shop_review_nums = tree.xpath('//div[@class="comment"]/a[1]/b/text()')
    shop_mean_price = tree.xpath('//div[@class="comment"]/a[2]/b/text()')
    shop_tag = tree.xpath('//div[@class="tag-addr"]/a[1]/span/text()')
    shop_address_tag = tree.xpath('//div[@class="tag-addr"]/a[2]/span/text()')
    shop_adress_des = tree.xpath('//div[@class="tag-addr"]/span/text()')
    shop_taste_score = tree.xpath('//span[@class="comment-list"]/span[1]/b/text()')
    shop_environment_score = tree.xpath('//span[@class="comment-list"]/span[2]/b/text()')
    shop_server_score = tree.xpath('//span[@class="comment-list"]/span[3]/b/text()')
    shop_recommend_dishes = tree.xpath('//div[@class="recommend"]/a/text()')

    print(shop_title_list)
    print(shop_star_score)
    print(shop_review_nums)
    print(shop_mean_price)
    print(shop_tag)
    print(shop_address_tag)
    print(shop_adress_des)
    print(shop_taste_score)
    print(shop_environment_score)
    print(shop_server_score)
    print(shop_recommend_dishes)
143 |
def run(self):
    """Run the whole scrape: download the fonts, convert them, build the code tables, print shop info."""
    # Each step depends on state stored by the previous one, so the order is fixed.
    for step_name in ('get_woffs', 'get_woff_2_ttf', 'get_font_map', 'get_shop_info'):
        getattr(self, step_name)()
149 |
150 |
if __name__ == '__main__':
    # Scrape the search page configured in the class and print the result.
    DaZhongDianPing().run()
154 |
--------------------------------------------------------------------------------
/images/css_map_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naiveliberty/DaZhongDianPing/63062c672271000faffdac82c3366ca87f681318/images/css_map_result.png
--------------------------------------------------------------------------------
/images/font.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naiveliberty/DaZhongDianPing/63062c672271000faffdac82c3366ca87f681318/images/font.png
--------------------------------------------------------------------------------
/images/font_encryption_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naiveliberty/DaZhongDianPing/63062c672271000faffdac82c3366ca87f681318/images/font_encryption_result.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | lxml==4.5.0
2 | numpy==1.18.2
3 | Pillow==7.1.1
4 | pytesseract==0.3.4
5 | requests==2.23.0
6 | fonttools==4.9.0
7 |
--------------------------------------------------------------------------------
/woff2tff.py:
--------------------------------------------------------------------------------
1 | import struct
2 | import sys
3 | import zlib
4 |
5 |
def convert_streams(infile, outfile):
    """
    Convert a WOFF font read from ``infile`` into an sfnt (ttf/otf) font
    written to ``outfile``.

    :param infile: binary, seekable file object positioned at the start of the WOFF data.
    :param outfile: binary, seekable file object that receives the sfnt data.
    :raises ValueError: if the data does not start with the ``wOFF`` signature.
    :raises struct.error: if the header or the table directory is truncated.
    :raises zlib.error: if a compressed table cannot be inflated.
    """
    woff_signature = 0x774F4646  # the bytes b'wOFF'
    # The header fields in file order: signature, flavor, length, numTables,
    # reserved, totalSfntSize, majorVersion, minorVersion, metaOffset,
    # metaLength, metaOrigLength, privOffset, privLength.
    header_format = ">IIIHHIHHIIIII"
    header = struct.unpack(header_format, infile.read(struct.calcsize(header_format)))
    signature, flavor, num_tables = header[0], header[1], header[3]
    if signature != woff_signature:
        # Fail early instead of producing a garbage font or an obscure error later.
        raise ValueError('not a WOFF file: bad signature 0x%08X' % signature)

    # sfnt binary-search fields, based on the largest power of two <= numTables.
    # A font with no tables gets zeros rather than an IndexError.
    if num_tables:
        entry_selector = num_tables.bit_length() - 1
        search_range = (1 << entry_selector) * 16
    else:
        entry_selector = search_range = 0
    range_shift = num_tables * 16 - search_range
    outfile.write(struct.pack(">I", flavor))
    outfile.write(struct.pack(">HHHH", num_tables, search_range, entry_selector, range_shift))

    # Table data starts right after the sfnt directory (16 bytes per table).
    out_offset = outfile.tell() + 16 * num_tables

    # WOFF directory entry: tag, offset, compLength, origLength, origChecksum.
    entries = [struct.unpack(">IIIII", infile.read(20)) for _ in range(num_tables)]

    # Write the sfnt directory and remember where each table's data will go.
    out_offsets = []
    for tag, _offset, _comp_length, orig_length, orig_checksum in entries:
        outfile.write(struct.pack(">IIII", tag, orig_checksum, out_offset, orig_length))
        out_offsets.append(out_offset)
        out_offset += orig_length
        out_offset += -out_offset % 4  # tables are aligned to 4 bytes

    # Copy every table, inflating those that are stored compressed.
    for (_tag, offset, comp_length, orig_length, _checksum), table_start in zip(entries, out_offsets):
        infile.seek(offset)
        data = infile.read(comp_length)
        if comp_length != orig_length:
            # Differing lengths mark a zlib-compressed table.
            data = zlib.decompress(data)
        outfile.seek(table_start)
        outfile.write(data)
        # Zero-pad the table to the next 4-byte boundary.
        outfile.write(bytearray(-(table_start + orig_length) % 4))
66 |
67 |
def convert(infilename, outfilename):
    """
    Convert the WOFF file at ``infilename`` into an sfnt file at ``outfilename``.

    :param infilename: path of the WOFF file to read.
    :param outfilename: path of the file to create or overwrite.
    """
    # The source is opened first, so a missing source never creates the target.
    with open(infilename, mode='rb') as source, open(outfilename, mode='wb') as target:
        convert_streams(source, target)
72 |
73 |
def main(argv):
    """
    Command-line entry point: convert one WOFF file to an sfnt file.

    :param argv: argument vector; ``argv[1]`` is the source file and the
        optional ``argv[2]`` is the target file.
    :return: ``0`` after a conversion, ``None`` when only the usage text was printed.
    """
    import os

    # Anything other than one or two arguments (including an empty argv) shows the usage text.
    if not 2 <= len(argv) <= 3:
        print('I convert *.woff files to *.otf files. (one at a time :)\n'
              'Usage: woff2otf.py web_font.woff [converted_filename.otf]\n'
              'If the target file name is ommited, it will be guessed. Have fun!\n')
        return

    source_file_name = argv[1]
    if len(argv) == 3:
        target_file_name = argv[2]
    else:
        # Strip only the final extension, so directories and dotted names survive:
        # './fonts/a.b.woff' becomes './fonts/a.b.otf'.
        target_file_name = os.path.splitext(source_file_name)[0] + '.otf'

    convert(source_file_name, target_file_name)
    return 0
89 |
90 |
def woff_to_ttf(argv):
    """
    Convert one WOFF file to an sfnt file.

    :param argv: sequence whose first item is the source path and whose second item is the target path.
    :return: always ``0``.
    """
    # Both paths are read before the conversion starts.
    source_file_name, target_file_name = argv[0], argv[1]
    convert(source_file_name, target_file_name)
    return 0
96 |
97 |
if __name__ == '__main__':
    # The return value of main() becomes the exit status (None counts as success).
    raise SystemExit(main(sys.argv))
100 |
--------------------------------------------------------------------------------