├── .gitignore
├── README.md
├── construct_poets_network.py
├── data
│   ├── early_tang_poets.txt
│   ├── high_tang_poets.txt
│   ├── late_tang_poets.txt
│   ├── middle_tang_poets.txt
│   ├── qts_zhs.txt
│   └── qts_zht.txt
├── html
│   ├── early_tang_poets_net.html
│   ├── echarts-all-3.js
│   ├── full_tang_poets_net.html
│   ├── high_tang_poets_net.html
│   ├── html_head.txt
│   ├── html_tail.txt
│   ├── late_tang_poets_net.html
│   └── middle_tang_poets_net.html
├── utils.py
├── visualize_poets_network.py
└── word_level_analyzer.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# CBDB is too large to be added to the repo
data/cbdb_sqlite.db
# ignore intermediate computation results
save/*
# ignore the generated html pages
html/*

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Quan Tangshi Analyzer
This program was originally written for two articles on my WeChat official account, and those two articles also explain its principles and workflow in some detail.
Before using the program, you are therefore strongly encouraged to read them first:
- [当我们在读唐诗时,我们在读什么?](https://mp.weixin.qq.com/s?__biz=MzI0NTUxMjgyOA==&mid=2247483724&idx=1&sn=9fe912aaaa2757eec2634a95931e1c6a&chksm=e94c2e5fde3ba749e4e364644d6b68d004b295a6864606c79f710b4b0e7e5d07ac3e89481012&mpshare=1&scene=1&srcid=0314cTnPXrmiKE1tR18sIV5m&pass_ticket=LmF1XSUkX6AZUuMnsPEO3vBZgEqfwt9frF%2F%2FATtYfAWYcIhzbawA0%2FclwgYNC1u%2F#rd) ("When we read Tang poetry, what are we actually reading?")
- [计算机告诉你,唐朝诗人之间的关系到底是什么样的?](https://mp.weixin.qq.com/s?__biz=MzI0NTUxMjgyOA==&mid=2247483750&idx=1&sn=dd883b547a3fc4343a3dcce1abea3719&chksm=e94c2e75de3ba7631ffd7abff8a89ea56fda63b2f3d3bb81fd845ef5fd3e9207b41230900288&mpshare=1&scene=1&srcid=0314HdoeYueFNse6H7j18qfx&pass_ticket=P5NYT1vI3xq6gboRVFuq64N9z2Yp0ADF4pMH3nRnXAhGuoM7eROG8O2lhVg%2BIvoR#rd) ("A computer tells you what the relationships between Tang poets really looked like")

Correspondingly, the program offers two main functions:
- word-frequency and word-vector analysis, corresponding to the first article
- construction of the citation network between poets, corresponding to the second article

The master branch supports Python 3 only. The python2 branch (thanks to [carryme9527](https://github.com/carryme9527/poetry_analyzer), whose work that branch mostly is) supports Python 2.
The program uses two main directories:
- the data directory, which stores the Quan Tangshi corpus and the CBDB database
- the html directory, which stores the final social-network pages

During computation the program dumps some intermediate results into the save directory (created automatically if it does not exist).

The CBDB database is large (400+ MB) and GitHub does not accept files of that size, so please download the standalone database from the [CBDB website](http://projects.iq.harvard.edu/chinesecbdb/%E4%B8%8B%E8%BC%89cbdb%E5%96%AE%E6%A9%9F%E7%89%88) yourself and save it in the data directory as cbdb_sqlite.db.
# Dependencies
The program depends on two Python libraries:
``` shell
pip3 install thulac
pip3 install gensim
```
thulac is used for word segmentation, gensim for word2vec.
These two libraries are only needed for the first article's analysis. If you only care about building the poets' network, you do not need to install them.

# Basic usage
For **ordinary users**:
Simply open the pages in the html directory with a browser to explore the network structure; you can drag and zoom the graph freely, which is great fun.

For **programmers** (an end-to-end session is sketched right after this list):
- run `python3 word_level_analyzer.py` to reproduce the results of the first article
- run `python3 construct_poets_network.py` to build the social network and store the results in the save directory
- run `python3 visualize_poets_network.py` to generate the pages that display the social network and store them in the html directory
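A typical end-to-end session therefore looks like this (a sketch; it assumes the CBDB database has already been placed at data/cbdb_sqlite.db as described above):
``` shell
# reproduce the word-level statistics of the first article (needs thulac + gensim)
python3 word_level_analyzer.py
# build the citation network; results are pickled into save/
python3 construct_poets_network.py
# render the network pages into html/
python3 visualize_poets_network.py
```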
# Roadmap
I will keep analyzing classical Chinese texts and push the updated code to this repository as I go. Feel free to follow my WeChat official account: mrqianjinsi

--------------------------------------------------------------------------------
/construct_poets_network.py:
--------------------------------------------------------------------------------
import pickle
import argparse
import os
from collections import Counter, defaultdict

from utils import read_qts, get_alter_names_from_CBDB

# TODO: extend the list of famous poets
# These poets share their names with too many people in CBDB to be disambiguated
# automatically, so their IDs in the BIOG_MAIN table were looked up by hand.
# Note that CBDB uses traditional Chinese characters.
manual_defuzzy_authors_id = {
    '李林甫': 32534, '王建': 92047,
    '李賀': 93012, '張繼': 93495,
    '張旭': 93409, '李紳': 92982}
# Authors removed by hand
mannual_deleted_authors = set(['無作', '清江'])
# Alternative names removed by hand; these aliases are common words in Tang poems
mannual_deleted_alter_names = {'李林甫': set(['李十']),
                               '李益': set(['李十']),
                               '李世民': set(['李二']),
                               '李嘉祐': set(['李二']),
                               '馬湘': set(['自然']),
                               '高駢': set(['千里']),
                               '孟浩然': set(['浩然']),
                               '李白': set(['太白']),
                               '黃巢': set(['皇帝']),
                               '眉娘': set(['逍遙'])}
# Alternative names missing from CBDB, added by hand
mannual_added_alter_names = {
    '李建': set(['李十一']),
    '劉禹錫': set(['劉二十八'])
}
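
# For reference, a hand lookup like the ones above can be done with a few lines
# of sqlite3 (an illustrative sketch, not part of the pipeline; the table and
# column names are the ones queried in utils.py):
#
#   import sqlite3
#   conn = sqlite3.connect('data/cbdb_sqlite.db')
#   query = ('SELECT c_personid, c_name_chn, c_birthyear, c_deathyear '
#            'FROM BIOG_MAIN WHERE c_name_chn LIKE ?')
#   for row in conn.execute(query, ('%王建',)):
#       print(row)  # pick the c_personid whose life span fits the Tang dynasty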

def get_alter_names(qts_file, cbdb_file, save_dir):
    alter_names_file = os.path.join(save_dir, "alternames.pkl")

    if os.path.exists(alter_names_file):
        print("found a dumped alter-names file, loading directly.")
        with open(alter_names_file, 'rb') as f:
            qts_list, authors_filtered_by_CBDB, alter_names_dict = pickle.load(f)
    else:
        print("processing the Quan Tangshi...")
        # Read the Quan Tangshi, keeping poem contents and authors
        qts_list, authors_set = read_qts(qts_file)
        # Drop the hand-picked authors
        authors_set -= mannual_deleted_authors

        alter_names_dict, authors_filtered_by_CBDB = get_alter_names_from_CBDB(cbdb_file, authors_set,
                                                                               manual_defuzzy_authors_id)
        # Drop unwanted alternative names
        for k, v in mannual_deleted_alter_names.items():
            alter_names_dict[k] -= v
        # Add alternative names missing from CBDB
        for k, v in mannual_added_alter_names.items():
            alter_names_dict[k] |= v

        # Dump the results
        with open(alter_names_file, 'wb') as f:
            pickle.dump([qts_list, authors_filtered_by_CBDB, alter_names_dict], f)

    return qts_list, authors_filtered_by_CBDB, alter_names_dict


def get_refer_relations(qts_list, authors_filtered_by_CBDB, alter_names_dict, save_dir):
    reference_relations_file = os.path.join(save_dir, 'reference_relations.pkl')

    if os.path.exists(reference_relations_file):
        print("found a dumped reference-relations file, skipping the calculation.")
        return
    else:
        print("calculating reference relations...")
        reference_relations_counter = Counter()
        reference_relations_text = defaultdict(list)
        # Search poet by poet ...
        for name in authors_filtered_by_CBDB:
            # ... and, for each poet, poem by poem
            for author, title, text in qts_list:
                # Skip authors that did not survive the CBDB filtering
                if author not in authors_filtered_by_CBDB:
                    continue

                poem = title + ' ' + text
                # Look for the formal name; one occurrence in title plus text is enough
                if poem.find(name) != -1:
                    reference_relations_counter[(author, name)] += 1
                    reference_relations_text[(author, name)].append(title)
                    continue
                # Look for alternative names
                alt_names = alter_names_dict[name]
                for alt_name in alt_names:
                    if poem.find(alt_name) != -1:
                        reference_relations_counter[(author, name)] += 1
                        reference_relations_text[(author, name)].append(title)
                        break
        # Dump the results
        with open(reference_relations_file, 'wb') as f:
            pickle.dump([reference_relations_counter, reference_relations_text], f)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--qts_path', type=str, default='data/qts_zht.txt',
                        help='file path of Quan Tangshi')
    parser.add_argument('--cbdb_path', type=str, default='data/cbdb_sqlite.db',
                        help='file path of CBDB')
    parser.add_argument('--save_dir', type=str, default='save',
                        help='directory to pickle intermediate data')
    args = parser.parse_args()

    # Make sure the save directory exists
    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)

    qts_list, authors_filtered_by_CBDB, alter_names_dict = get_alter_names(args.qts_path, args.cbdb_path, args.save_dir)
    get_refer_relations(qts_list, authors_filtered_by_CBDB, alter_names_dict, args.save_dir)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/data/early_tang_poets.txt:
--------------------------------------------------------------------------------
王績
王勃
王梵志
宋之問
杜審言
李百藥
李嶠
陳子昂
駱賓王
李賢
魏徵
上官儀
李世民
盧照鄰
蘇味道
楊炯
劉希夷
寒山
崔液
韋承慶
張若虛
沈佺期
喬知之

--------------------------------------------------------------------------------
/data/high_tang_poets.txt:
--------------------------------------------------------------------------------
李治
綦毋潛
高適
崔顥
戎昱
張說
崔國輔
錢起
蘇頲
王昌齡
王之渙
皇甫冉
張巡
崔峒
岑參
丘爲
杜甫
李嘉祐
西鄙人
劉眘虛
孟浩然
祖詠
儲光羲
劉長卿
萬楚
張九齡
劉灣
劉方平
元結
張謂
張旭
薛稷
李白
韓翃
司空曙
王灣
常非月
張繼
王維
李隆基
常建
李頎
柳中庸
賀知章
邱爲
嚴武
嚴識玄
王翰

--------------------------------------------------------------------------------
/data/late_tang_poets.txt:
--------------------------------------------------------------------------------
王駕
章碣
司空圖
張蟲賓
曹鬆
李羣玉
錢珝
羅隱
黃滔
崔珏
秦韜玉
陳玉蘭
許渾
聶夷中
於濆
馬戴
鄭遨
盧汝弼
來鵠
韋莊
李商隱
溫庭筠
令狐楚
司馬劄
貫休
黃巢
張喬
吳融
鄭穀
薛逢
趙嘏
崔塗
劉駕
金昌緒
雍陶
齊己
崔道融
李洞
杜荀鶴
陳陶
杜牧
陸龜蒙
韓偓
方幹
皮日休
曹鄴
孟賓於
唐彥謙

--------------------------------------------------------------------------------
/data/middle_tang_poets.txt:
--------------------------------------------------------------------------------
李賀
白居易
姚合
於鵠
武元衡
韋應物
賈島
趙微明
劉皁
薛濤
何希堯
權德輿
李德裕
韓氏
元稹
郎士元
張籍
顧況
楊凝
韓愈
張繼
張祜
劉禹錫
嚴維
李約
韓琮
李益
施肩吾
柳宗元
呂溫
杜秋娘
耿湋
李端
賈至
李涉
朱慶餘
張潮
胡令能
李紳
鮑溶
孟郊
王建
劉採春
楊巨源
李坤
盧綸
張仲素
王涯
崔護
劉商
鄭錫
戴叔倫

--------------------------------------------------------------------------------
/html/early_tang_poets_net.html:
--------------------------------------------------------------------------------
[ECharts network page; the HTML markup was stripped during text extraction, only the page title "ECharts" survives]

--------------------------------------------------------------------------------
/html/full_tang_poets_net.html:
--------------------------------------------------------------------------------
[ECharts network page; HTML markup stripped during text extraction]

--------------------------------------------------------------------------------
/html/high_tang_poets_net.html:
--------------------------------------------------------------------------------
[ECharts network page; HTML markup stripped during text extraction]

--------------------------------------------------------------------------------
/html/html_head.txt:
--------------------------------------------------------------------------------
[shared ECharts page header fragment; HTML markup stripped during text extraction]

--------------------------------------------------------------------------------
/html/late_tang_poets_net.html:
--------------------------------------------------------------------------------
[ECharts network page; HTML markup stripped during text extraction]

--------------------------------------------------------------------------------
/html/middle_tang_poets_net.html:
--------------------------------------------------------------------------------
[ECharts network page; HTML markup stripped during text extraction]

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import sqlite3
from collections import defaultdict

# Read the Quan Tangshi corpus
def read_qts(file_name):
    qts_list = []
    authors_set = set()
    # Read the poems line by line
    with open(file_name, 'r', encoding = 'utf-8') as f:
        for line in f:
            text_segs = line.split()
            title = text_segs[1]
            author = text_segs[2]
            poem = text_segs[-1]

            authors_set.add(author)

            # Strip non-Chinese characters
            valid_char_list = [c for c in poem if '\u4e00' <= c <= '\u9fff' or c == ',' or c == '。']
            validated_poem = ''.join(valid_char_list)
            # Store each poem as (author, title, content)
            qts_list.append((author, title, validated_poem))

    return qts_list, authors_set


# Fetch the poets' alternative names from CBDB
def get_alter_names_from_CBDB(db_file, authors_set, manual_defuzzy_authors_id):
    tang_begin_year = 618  # year the Tang dynasty was founded
    tang_end_year = 907    # year the Tang dynasty fell

    # Poets whose IDs were checked by hand
    mannual_defuzzy_authors = set(manual_defuzzy_authors_id.keys())

    authors_not_in_CBDB = set()
    fuzzy_authors = set()
    fuzzy_authors_details = {}
    alter_names_dict = defaultdict(set)

    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    for author in authors_set:
        # If the poet was checked by hand, use that ID directly
        if author in mannual_defuzzy_authors:
            author_id = manual_defuzzy_authors_id[author]
        else:  # otherwise query CBDB
            # Some poets are named differently in the Quan Tangshi and in CBDB,
            # so a fuzzy search works better; e.g. "贯休" appears in CBDB as "释贯休"
            author_pattern = '%' + author
            cursor.execute('SELECT c_personid, c_birthyear, c_deathyear FROM BIOG_MAIN WHERE c_name_chn LIKE ?',
                           (author_pattern,))
            person_info_list = cursor.fetchall()

            # Resolve name clashes.
            # For the details of the strategy, see the second article on my WeChat
            # official account (mrqianjinsi), linked in the README.
            candidate_author_ids = []
            for person_id, birth_year, death_year in person_info_list:
                if birth_year and death_year:  # both birth and death year are known
                    if birth_year < tang_end_year and death_year > tang_begin_year:
                        # The first person with full dates whose lifetime overlaps
                        # the Tang dynasty wins; ignore all other candidates
                        candidate_author_ids = [person_id]
                        break
                elif birth_year or death_year:  # only the birth or the death year is known
                    year = birth_year if birth_year else death_year
                    if year > tang_begin_year and year < tang_end_year:
                        candidate_author_ids.append(person_id)

            # Reject empty candidate lists as well as ambiguous ones
            if not candidate_author_ids:
                authors_not_in_CBDB.add(author)
                # print('can\'t find valid items for %s' % author)
                continue
            elif len(candidate_author_ids) > 1:
                fuzzy_authors.add(author)
                fuzzy_authors_details[author] = candidate_author_ids
                # print('fuzzy authors: %s' % author)
                continue

            author_id = candidate_author_ids[0]

        # Look up the poet's alternative names by author_id
        cursor.execute('SELECT c_alt_name_chn FROM ALTNAME_DATA WHERE c_personid=?',
                       (author_id,))
        alt_name_list = cursor.fetchall()
        for alt_name in alt_name_list:
            # Skip single-character aliases
            if len(alt_name[0]) > 1:
                alter_names_dict[author].add(alt_name[0])

    conn.close()

    # Only the poets that survive the CBDB filtering take part in the
    # relationship analysis from here on
    authors_filtered_by_CBDB = authors_set - authors_not_in_CBDB - fuzzy_authors

    return alter_names_dict, authors_filtered_by_CBDB

--------------------------------------------------------------------------------
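The two helpers above can also be driven directly from an interactive session; a minimal sketch, assuming the default corpus and database paths used by construct_poets_network.py:

```python
from utils import read_qts, get_alter_names_from_CBDB

# Parse the traditional-character Quan Tangshi and collect the author set
qts_list, authors = read_qts('data/qts_zht.txt')

# Resolve the authors against CBDB (no hand-checked IDs in this sketch,
# so an empty dict is passed for manual_defuzzy_authors_id)
alter_names, filtered = get_alter_names_from_CBDB('data/cbdb_sqlite.db', authors, {})
print('%d poets matched unambiguously in CBDB' % len(filtered))
print(alter_names.get('李白'))  # the aliases CBDB records for Li Bai
```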
/visualize_poets_network.py:
--------------------------------------------------------------------------------
import pickle
import argparse
import os
import math

# If needed, opencc (which must be installed separately) can convert the pages
# between traditional and simplified Chinese, e.g.:
# opencc = 'opencc -i echart_visualize/poets_network_early.html -o echart_visualize/poets_network_early_zhs.html -c zht2zhs.ini'


# Fetch the top visulize_range reference relations directly
def get_concerned_relations_by_range(reference_relations_counter, visulize_range):
    # Fetch the reference relations
    relations = reference_relations_counter.most_common(visulize_range)
    max_refer_count = relations[0][1]
    min_refer_count = relations[-1][1]

    return relations, max_refer_count, min_refer_count

# Fetch the reference relations inside a given group of poets; useful for
# drawing the network internal to that group
def get_concerned_relations_by_authors(reference_relations_counter, authors):
    relations = []
    max_refer_count = 0
    min_refer_count = 10000
    for (refered_by, refered), count in reference_relations_counter.items():
        # Do not count self-references
        if refered_by == refered:
            continue
        if refered_by in authors and refered in authors:
            if count > max_refer_count:
                max_refer_count = count
            if count < min_refer_count:
                min_refer_count = count

            relations.append(((refered_by, refered), count))

    return relations, max_refer_count, min_refer_count

# Drawing every relation can make the graph very crowded; count_to_plot_threshold
# sets the weakest relation that is still drawn: only relations whose reference
# count is at least count_to_plot_threshold show up in the page
def generate_html_page(relations, max_refer_count, min_refer_count, saved_html_file, count_to_plot_threshold = 1):
    html_dir = os.path.dirname(saved_html_file)
    html_head_path = os.path.join(html_dir, 'html_head.txt')
    html_tail_path = os.path.join(html_dir, 'html_tail.txt')

    min_link_width = 0.5
    max_link_width = 3.0

    # Reference strengths span a wide range; taking the square root compresses
    # that range and makes the plot easier to read
    max_refer_count = math.sqrt(max_refer_count)
    min_refer_count = math.sqrt(min_refer_count)
    # Guard against a zero span, which would otherwise divide by zero when all
    # drawn relations share the same count
    count_span = max_refer_count - min_refer_count
    width_slope = (max_link_width - min_link_width) / count_span if count_span else 0.0
    # Format the links data
    links_text = 'links: [\n'
    links_item_format = """{source: '%s', target: '%s',
    lineStyle:{normal:{width: %f}}},
    """
    filtered_authors = set()
    for (refered_by, refered), count in relations:
        # Skip self-references, which could otherwise leave isolated nodes
        if refered_by == refered:
            continue
        # Skip relations below the threshold
        if count < count_to_plot_threshold:
            continue

        filtered_authors.add(refered_by)
        filtered_authors.add(refered)
        count = math.sqrt(count)
        line_width = min_link_width + width_slope * (count - min_refer_count)
        links_text += links_item_format % (refered_by, refered, line_width)

    links_text += '],\n'

    # Format the node data
    data_text = 'data:[\n'
    data_item_format = "{name: '%s'},\n"
    for author in filtered_authors:
        data_text += data_item_format % author

    data_text += '],\n'

    # Read the head and tail parts of the html page
    with open(html_head_path, 'r', encoding = 'utf-8') as f:
        head_text = f.read()

    with open(html_tail_path, 'r', encoding = 'utf-8') as f:
        tail_text = f.read()

    # Concatenate everything and save as html
    with open(saved_html_file, 'w', encoding = 'utf-8') as f:
        f.write(head_text + data_text + links_text + tail_text)

def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--relations_path', type=str, default='save/reference_relations.pkl',
                        help='file to load relations data')
    parser.add_argument('--data_dir', type=str, default='data',
                        help='directory to load authors file')
    parser.add_argument('--html_dir', type=str, default='html',
                        help='directory to save html page')

    args = parser.parse_args()

    with open(args.relations_path, 'rb') as f:
        reference_relations_counter, reference_relations_text = pickle.load(f)

    # Network of the top 100 relations across the whole Tang dynasty
    relations, max_refer_count, min_refer_count = get_concerned_relations_by_range(reference_relations_counter, 100)
    saved_html = os.path.join(args.html_dir, 'full_tang_poets_net.html')
    generate_html_page(relations, max_refer_count, min_refer_count, saved_html)

    # Networks for the Early, High, Middle and Late Tang periods:
    # (poet-name file,          network page,                 reference-count threshold)
    files_name_array = [('early_tang_poets.txt', 'early_tang_poets_net.html', 1),
                        ('high_tang_poets.txt', 'high_tang_poets_net.html', 2),
                        ('middle_tang_poets.txt','middle_tang_poets_net.html', 2),
                        ('late_tang_poets.txt', 'late_tang_poets_net.html', 1)]

    for authors_file_name, html_file_name, threshold in files_name_array:
        authors_file_path = os.path.join(args.data_dir, authors_file_name)
        with open(authors_file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        authors = set(text.split())

        relations, max_refer_count, min_refer_count = get_concerned_relations_by_authors(reference_relations_counter, authors)

        saved_html = os.path.join(args.html_dir, html_file_name)
        generate_html_page(relations, max_refer_count, min_refer_count, saved_html, threshold)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
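Besides the four built-in period networks, generate_html_page can render any hand-picked group; a minimal sketch, assuming construct_poets_network.py has already produced save/reference_relations.pkl (the output name my_poets_net.html is arbitrary):

```python
import pickle
from visualize_poets_network import get_concerned_relations_by_authors, generate_html_page

with open('save/reference_relations.pkl', 'rb') as f:
    counter, _texts = pickle.load(f)

# A hand-picked High Tang circle (traditional characters, matching the corpus)
authors = {'李白', '杜甫', '王維', '孟浩然', '高適'}
relations, hi, lo = get_concerned_relations_by_authors(counter, authors)
# html/ already holds the html_head.txt and html_tail.txt the page needs
generate_html_page(relations, hi, lo, 'html/my_poets_net.html', count_to_plot_threshold=1)
```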
/word_level_analyzer.py:
--------------------------------------------------------------------------------
from collections import Counter, defaultdict
import thulac
import pickle
import os
import argparse

import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Segment the Quan Tangshi into words
def cut_qts_to_words(qts_file, saved_words_file):
    save_dir = os.path.dirname(saved_words_file)
    dumped_file = os.path.join(save_dir, 'qts_words_stat_result.pkl')

    if os.path.exists(dumped_file) and os.path.exists(saved_words_file):
        print('found preprocessed data, loading directly...')
        with open(dumped_file, 'rb') as f:
            char_counter, author_counter, vocab, word_counter, genre_counter = pickle.load(f)
    else:
        char_counter = Counter()              # character frequencies
        author_counter = Counter()            # number of poems per author
        vocab = set()                         # vocabulary
        word_counter = Counter()              # word frequencies
        genre_counter = defaultdict(Counter)  # one Counter per part of speech

        fid_save = open(saved_words_file, 'w', encoding = 'utf-8')
        lex_analyzer = thulac.thulac()  # the word segmenter
        line_cnt = 0
        with open(qts_file, 'r', encoding = 'utf-8') as f:
            for line in f:
                text_segs = line.split()
                author = text_segs[2]
                author_counter[author] += 1

                poem = text_segs[-1]
                # Strip non-Chinese characters
                valid_char_list = [c for c in poem if '\u4e00' <= c <= '\u9fff' or c == ',' or c == '。']
                for char in valid_char_list:
                    char_counter[char] += 1

                regularized_poem = ''.join(valid_char_list)
                word_genre_pairs = lex_analyzer.cut(regularized_poem)

                word_list = []
                for word, genre in word_genre_pairs:
                    word_list.append(word)
                    vocab.add(word)
                    word_counter[word] += 1
                    genre_counter[genre][word] += 1

                save_line = ' '.join(word_list)
                fid_save.write(save_line + '\n')

                if line_cnt % 10 == 0:
                    print('%d poems processed.' % line_cnt)
                line_cnt += 1

        fid_save.close()
        # Dump everything so the segmentation does not have to be redone
        dumped_data = [char_counter, author_counter, vocab, word_counter, genre_counter]
        with open(dumped_file, 'wb') as f:
            pickle.dump(dumped_data, f)

    return char_counter, author_counter, genre_counter

# Turn the segmentation results into word vectors
def word2vec(words_file):
    save_dir = os.path.dirname(words_file)
    vector_file = os.path.join(save_dir, 'word_vectors.model')

    if os.path.exists(vector_file):
        print('found a word vector file, loading directly...')
        model = Word2Vec.load(vector_file)
    else:
        print('calculating word vectors...')
        model = Word2Vec(LineSentence(words_file), size=400, window=3, min_count=10,
                         workers=multiprocessing.cpu_count())
        # Save the model so it does not have to be recomputed next time
        model.save(vector_file)

    return model

def print_stat_results(char_counter, author_counter, genre_counter, vector_model):
    def print_counter(counter):
        for k, v in counter:
            print(k, v)
    # Poets ranked by number of poems written
    print('\nPoets ranked by number of poems')
    print_counter(author_counter.most_common(10))

    # Character-level analysis
    print('\n\nCharacter-level analysis')
    # Most frequent characters
    print('\nMost frequent characters')
    print_counter(char_counter.most_common(12))
    # Seasons
    print('\nSeasons')
    for c in ['春', '夏', '秋', '冬']:
        print(c, char_counter[c])
    # Colors
    print('\nColors')
    colors = ['红', '白', '青', '蓝', '绿', '紫', '黑', '黄']
    for c in colors:
        print(c, char_counter[c])
    # Plants
    print('\nPlants')
    plants = ['梅', '兰', '竹', '菊', '松', '柳', '枫', '桃', '梨', '杏']
    for p in plants:
        print(p, char_counter[p])
    # The twelve zodiac animals
    print('\nZodiac animals')
    age_animals = ['鼠', '牛', '虎', '兔', '龙', '蛇', '马', '羊', '猴', '鸡', '狗', '猪']
    for a in age_animals:
        print(a, char_counter[a])

    # Word-level analysis
    print('\n\nWord-level analysis')
    # Place names
    print('\nMost frequent place names')
    print_counter(genre_counter['ns'].most_common(10))
    # Time words
    print('\nMost frequent time words')
    print_counter(genre_counter['t'].most_common(10))
    # Scene words
    print('\nMost frequent scene words')
    print_counter(genre_counter['s'].most_common(10))


    # Word-vector based analysis
    print('\n\nWord-vector based analysis')
    # print(vector_model['今日'])
    def print_similar_words(word):
        print('\nWords closest in meaning to "%s"' % word)
        print_counter(vector_model.most_similar(word))

    print_similar_words('天子')
    print_similar_words('寂寞')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--qts_path', type=str, default='data/qts_zhs.txt',
                        help='file path of Quan Tangshi')
    parser.add_argument('--words_path', type=str, default='save/qts_words_list.txt',
                        help='file path to save Quan Tangshi words data')
    args = parser.parse_args()

    # Make sure the save directory exists
    save_dir = os.path.dirname(args.words_path)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    char_counter, author_counter, genre_counter = cut_qts_to_words(args.qts_path, args.words_path)
    vector_model = word2vec(args.words_path)

    print_stat_results(char_counter, author_counter, genre_counter, vector_model)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
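Once word_level_analyzer.py has run, the saved word2vec model can be reloaded for ad-hoc queries; a minimal sketch, assuming the default save path and the same pre-4.0 gensim API used above (the query word 明月 is just an example, any word that survives the min_count=10 cutoff works):

```python
from gensim.models import Word2Vec

model = Word2Vec.load('save/word_vectors.model')
# nearest neighbours in the embedding space
for word, score in model.most_similar('明月'):
    print(word, score)
```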