# coding: utf-8
# ---------------------------------------------------------------------------
# Repository: knn-font
#
# Layout:
#   README.md
#   maoyan/
#     __pycache__/
#       font.cpython-37.pyc
#       knn_font.cpython-37.pyc
#     font.py
#     fonts/
#       1.woff  2.woff  3.woff  4.woff  5.woff  test.woff
#     knn_font.py
#     knn_test.py
#     test.py
#
# /README.md:
#   # knn-font
#   Recognise the Maoyan web font by calling the KNN algorithm from sklearn.
#
# Binary files (listed by link only):
#   /maoyan/__pycache__/font.cpython-37.pyc:
#     https://raw.githubusercontent.com/xfs-coder/knn-font/403b8dffcb8ef084efbfc5bc851575a9632db489/maoyan/__pycache__/font.cpython-37.pyc
#   /maoyan/__pycache__/knn_font.cpython-37.pyc:
#     https://raw.githubusercontent.com/xfs-coder/knn-font/403b8dffcb8ef084efbfc5bc851575a9632db489/maoyan/__pycache__/knn_font.cpython-37.pyc
# ---------------------------------------------------------------------------
# /maoyan/font.py
# ---------------------------------------------------------------------------
"""Download Maoyan web fonts and turn them into labelled training samples.

Each sample is a flat list ``[label, x0, y0, x1, y1, ...]`` holding the digit
a glyph represents followed by the glyph's outline coordinates.
"""
import os
import re
from itertools import chain

# ``requests`` and ``fontTools`` are imported inside the functions that need
# them so that the pure helper ``get_coor_info`` stays importable (and
# testable) on machines without the network / font dependencies.

BOARD_URL = 'https://maoyan.com/board/1'
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
}
# Seconds to wait for the server; without a timeout ``requests`` can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10

_FONT_URL_RE = re.compile(r"url\('(.*?\.woff)'\)")

# Hand-assigned digit labels for the bundled sample fonts: entry ``i`` of a
# label tuple is the digit drawn by glyph ``i`` of that font (after the two
# leading glyph-order entries are skipped, see ``get_coor_info``).
_TRAINING_LABELS = (
    ('1.woff', (6, 7, 4, 9, 1, 2, 5, 0, 3, 8)),
    ('2.woff', (1, 3, 2, 7, 6, 8, 9, 0, 4, 5)),
    ('3.woff', (5, 8, 3, 0, 6, 7, 9, 1, 2, 4)),
    ('4.woff', (9, 3, 4, 8, 7, 5, 2, 1, 6, 0)),
    ('5.woff', (1, 5, 8, 0, 7, 9, 6, 3, 2, 4)),
)


def _extract_font_url(html):
    """Return the absolute URL of the first ``.woff`` font referenced in *html*.

    Raises:
        ValueError: if the page contains no ``url('....woff')`` reference
            (for example when an unexpected page was served).
    """
    match = _FONT_URL_RE.search(html)
    if match is None:
        raise ValueError('no .woff font URL found in the page')
    # The page references the font with a scheme-less URL, so the scheme is
    # prepended here.
    return 'http:' + match.group(1)


def get_font_content():
    """Fetch the board page and return the bytes of the web font it uses.

    Raises:
        ValueError: if the page does not reference a ``.woff`` font.
        requests.RequestException: on network failures, timeouts or HTTP
            error statuses.
    """
    import requests

    page = requests.get(BOARD_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    font_response = requests.get(_extract_font_url(page.text),
                                 timeout=REQUEST_TIMEOUT)
    font_response.raise_for_status()
    return font_response.content


def save_font(count=5, font_dir='./fonts'):
    """Download *count* fonts and store them as ``1.woff`` .. ``<count>.woff``.

    NOTE: the labels in ``_TRAINING_LABELS`` were written for the font files
    shipped with the repository; re-downloading overwrites those files, so
    the labels must be reviewed afterwards.

    Args:
        count: number of fonts to download (one page request per font).
        font_dir: directory the files are written to; created if missing.
    """
    os.makedirs(font_dir, exist_ok=True)
    for index in range(1, count + 1):
        font_content = get_font_content()
        with open(os.path.join(font_dir, f'{index}.woff'), 'wb') as f:
            f.write(font_content)


def get_coor_info(font, cli):
    """Build one labelled sample per glyph of *font*.

    Args:
        font: a font object exposing ``getGlyphOrder()`` and
            ``font['glyf'][name].coordinates`` (e.g. a ``TTFont``).
        cli: labels, one per glyph, in glyph order. Extra trailing labels
            are ignored.

    Returns:
        A list of rows ``[label, x0, y0, x1, y1, ...]``.

    Raises:
        IndexError: if *cli* has fewer entries than there are glyphs.
    """
    # The first two glyph-order entries are skipped - presumably non-digit
    # placeholder glyphs (e.g. ``.notdef``); confirm against the font files.
    glyph_names = font.getGlyphOrder()[2:]
    if len(cli) < len(glyph_names):
        raise IndexError(
            f'{len(glyph_names)} glyphs but only {len(cli)} labels supplied')
    return [
        [label, *chain.from_iterable(font['glyf'][name].coordinates)]
        for name, label in zip(glyph_names, cli)
    ]


def get_font_data(font_dir='./fonts'):
    """Load the labelled sample fonts and return all their training rows.

    Args:
        font_dir: directory containing ``1.woff`` .. ``5.woff``. The default
            is relative to the current working directory.

    Returns:
        The concatenated rows of ``get_coor_info`` for every sample font, in
        file order. Rows are as long as their glyph has coordinates, so they
        are not guaranteed to share one length.
    """
    from fontTools.ttLib import TTFont

    infos = []
    for filename, labels in _TRAINING_LABELS:
        font = TTFont(os.path.join(font_dir, filename))
        infos.extend(get_coor_info(font, labels))
    return infos


if __name__ == '__main__':
    print(get_font_data())

# ---------------------------------------------------------------------------
# Binary files (listed by link only):
#   /maoyan/fonts/1.woff:
#     https://raw.githubusercontent.com/xfs-coder/knn-font/403b8dffcb8ef084efbfc5bc851575a9632db489/maoyan/fonts/1.woff
#   /maoyan/fonts/2.woff:
#     https://raw.githubusercontent.com/xfs-coder/knn-font/403b8dffcb8ef084efbfc5bc851575a9632db489/maoyan/fonts/2.woff
#   /maoyan/fonts/3.woff:
#     https://raw.githubusercontent.com/xfs-coder/knn-font/403b8dffcb8ef084efbfc5bc851575a9632db489/maoyan/fonts/3.woff
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Binary files (listed by link only):
#   /maoyan/fonts/4.woff:
#     https://raw.githubusercontent.com/xfs-coder/knn-font/403b8dffcb8ef084efbfc5bc851575a9632db489/maoyan/fonts/4.woff
#   /maoyan/fonts/5.woff:
#     https://raw.githubusercontent.com/xfs-coder/knn-font/403b8dffcb8ef084efbfc5bc851575a9632db489/maoyan/fonts/5.woff
#   /maoyan/fonts/test.woff:
#     https://raw.githubusercontent.com/xfs-coder/knn-font/403b8dffcb8ef084efbfc5bc851575a9632db489/maoyan/fonts/test.woff
# ---------------------------------------------------------------------------
# /maoyan/knn_font.py
# ---------------------------------------------------------------------------
# coding: utf-8
import numpy as np
import pandas as pd
from font import get_font_data
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier


class Classify:
    """1-nearest-neighbour classifier mapping glyph outlines to digits.

    The model is trained on construction from ``get_font_data()``.

    Attributes (set in ``__init__`` / ``get_knn``):
        len: number of feature columns the model was trained on.
        knn: the fitted ``KNeighborsClassifier``.
    """

    def __init__(self):
        self.len = None
        self.knn = self.get_knn()

    def process_data(self, data):
        """Return *data* as a DataFrame with NaNs replaced by column means.

        The imputer is fitted on *data* itself, so the fill values come from
        whichever batch is passed in (training rows or prediction rows).
        """
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        return pd.DataFrame(imputer.fit_transform(pd.DataFrame(data)))

    def get_knn(self):
        """Train and return the classifier; also records ``self.len``.

        Column 0 of the training frame is the digit label, the remaining
        columns are the glyph coordinates. Rows shorter than the longest one
        are NaN-padded by pandas and then mean-imputed by ``process_data``.
        """
        data = self.process_data(get_font_data())

        x_train = data.drop(columns=[0])
        y_train = data[0]

        knn = KNeighborsClassifier(n_neighbors=1)
        knn.fit(x_train, y_train)

        self.len = x_train.shape[1]

        return knn

    def knn_predict(self, data):
        """Predict the digit for every glyph in *data*.

        Args:
            data: sequence of flat coordinate lists, one per glyph (no label
                column).

        Returns:
            An array with exactly one prediction per input row.

        Raises:
            ValueError: if a row has more coordinates than the model was
                trained on.
        """
        df = pd.DataFrame(data)
        if df.shape[0] == 0:
            return np.array([])

        width = df.shape[1]
        if width > self.len:
            raise ValueError(
                f'glyph has {width} coordinates but the model was trained '
                f'on {self.len}')
        if width < self.len:
            # Pad with zero COLUMNS up to the training width. ``axis=1`` is
            # essential: the default ``axis=0`` stacks the padding as extra
            # rows, which yields twice as many predictions as glyphs.
            padding = pd.DataFrame(
                np.zeros((df.shape[0], self.len - width)),
                index=df.index,
                columns=range(width, self.len),
            )
            df = pd.concat([df, padding], axis=1)

        return self.knn.predict(self.process_data(df))


if __name__ == '__main__':
    obj = Classify()
    # Smoke test: classify the training glyphs themselves (labels stripped).
    print(obj.knn_predict([row[1:] for row in get_font_data()]))

# ---------------------------------------------------------------------------
# /maoyan/knn_test.py
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
from font import get_font_data
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


def main():
    """Train on the first 30 samples, evaluate on the last 10, print results."""
    # Handle missing values
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    data = pd.DataFrame(imputer.fit_transform(pd.DataFrame(get_font_data())))

    # Separate features and target (column 0 holds the label)
    x = data.drop(columns=[0])
    y = data[0]

    # Split the dataset.
    # Alternative (random split):
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
    x_train, y_train = x.head(30), y.head(30)
    x_test, y_test = x.tail(10), y.tail(10)

    # Standardisation (disabled)
    # std = StandardScaler()
    # x_train = std.fit_transform(x_train)
    # x_test = std.transform(x_test)

    # Build the estimator
    knn = KNeighborsClassifier(n_neighbors=1)
    # Train
    knn.fit(x_train, y_train)
    # Predict
    y_predict = knn.predict(x_test)
    print(y_predict)
    # Accuracy
    print(knn.score(x_test, y_test))


if __name__ == '__main__':
    main()

# ---------------------------------------------------------------------------
# /maoyan/test.py
# ---------------------------------------------------------------------------
# coding: utf-8

import requests
from lxml import etree
import re
from io import BytesIO
from fontTools.ttLib import TTFont
from knn_font import Classify


# Trained once at import time and shared by every call to get_map().
classify = Classify()

# Seconds to wait for the server; without a timeout ``requests`` can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10


def get_map(text):
    """Map the page's obfuscated glyph entities to the digits they draw.

    Downloads the web font referenced in *text*, saves a copy to
    ``./fonts/test.woff`` and classifies each glyph with ``classify``.

    Args:
        text: raw HTML of the board page.

    Returns:
        dict mapping entity strings (glyph name lower-cased with ``uni``
        replaced by ``&#x`` and a trailing ``;``) to digit strings.

    Raises:
        ValueError: if *text* references no ``.woff`` font.
        requests.RequestException: on network failure or HTTP error status.
    """
    matches = re.findall(r"url\('(.*?\.woff)'\)", text)
    if not matches:
        raise ValueError('no .woff font URL found in the page')
    font_url = 'http:' + matches[0]

    response = requests.get(font_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    content = response.content
    with open('./fonts/test.woff', 'wb') as f:
        f.write(content)

    font = TTFont(BytesIO(content))
    # Same convention as the training data: skip the first two glyph-order
    # entries and flatten each outline to [x0, y0, x1, y1, ...].
    glyf_order = font.getGlyphOrder()[2:]
    info = [
        [value for point in font['glyf'][name].coordinates for value in point]
        for name in glyf_order
    ]

    digits = (str(int(label)) for label in classify.knn_predict(info))
    entities = (name.lower().replace('uni', '&#x') + ';' for name in glyf_order)
    # zip() stops at the shorter input, i.e. one digit per glyph name.
    return dict(zip(entities, digits))


def get_board():
    """Fetch the board page, de-obfuscate its numbers and print each movie."""
    url = 'https://maoyan.com/board/1'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    text = response.text

    # Substitute in the raw markup, before it is parsed, while the glyphs are
    # still spelled out as ``&#x....;`` entities.
    for entity, digit in get_map(text).items():
        text = text.replace(entity, digit)

    html = etree.HTML(text)
    for dd in html.xpath('//dl[@class="board-wrapper"]/dd'):
        p_li = dd.xpath(
            './div[@class="board-item-main"]//div[@class="movie-item-info"]/p')
        title = p_li[0].xpath('./a/@title')[0]
        star = p_li[1].xpath('./text()')[0]
        releasetime = p_li[2].xpath('./text()')[0]

        p_li = dd.xpath(
            './div[@class="board-item-main"]//div[@class="movie-item-number boxoffice"]/p')
        realtime_stont = ''.join(
            part.strip() for part in p_li[0].xpath('.//text()'))
        total_stont = ''.join(
            part.strip() for part in p_li[1].xpath('.//text()'))

        print(title)
        print(star)
        print(releasetime)
        print(realtime_stont)
        print(total_stont)
        print('-' * 50)


if __name__ == '__main__':
    get_board()