├── .gitignore
├── README.md
├── construct_poets_network.py
├── data
│   ├── early_tang_poets.txt
│   ├── high_tang_poets.txt
│   ├── late_tang_poets.txt
│   ├── middle_tang_poets.txt
│   ├── qts_zhs.txt
│   └── qts_zht.txt
├── html
│   ├── early_tang_poets_net.html
│   ├── echarts-all-3.js
│   ├── full_tang_poets_net.html
│   ├── high_tang_poets_net.html
│   ├── html_head.txt
│   ├── html_tail.txt
│   ├── late_tang_poets_net.html
│   └── middle_tang_poets_net.html
├── utils.py
├── visualize_poets_network.py
└── word_level_analyzer.py
/.gitignore:
--------------------------------------------------------------------------------
# CBDB is too large to include in the repo
data/cbdb_sqlite.db
# Ignore intermediate computation results
save/*
# Ignore generated html pages
html/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Quan Tangshi (Complete Tang Poems) Analyzer
This program was originally written for two articles on my WeChat official account, and those articles also walk through its principles and workflow. Before using the program, you are strongly encouraged to read them first:
- [当我们在读唐诗时,我们在读什么?(When we read Tang poetry, what are we actually reading?)](https://mp.weixin.qq.com/s?__biz=MzI0NTUxMjgyOA==&mid=2247483724&idx=1&sn=9fe912aaaa2757eec2634a95931e1c6a&chksm=e94c2e5fde3ba749e4e364644d6b68d004b295a6864606c79f710b4b0e7e5d07ac3e89481012&mpshare=1&scene=1&srcid=0314cTnPXrmiKE1tR18sIV5m&pass_ticket=LmF1XSUkX6AZUuMnsPEO3vBZgEqfwt9frF%2F%2FATtYfAWYcIhzbawA0%2FclwgYNC1u%2F#rd)
- [计算机告诉你,唐朝诗人之间的关系到底是什么样的?(What does a computer say the relationships between Tang poets really looked like?)](https://mp.weixin.qq.com/s?__biz=MzI0NTUxMjgyOA==&mid=2247483750&idx=1&sn=dd883b547a3fc4343a3dcce1abea3719&chksm=e94c2e75de3ba7631ffd7abff8a89ea56fda63b2f3d3bb81fd845ef5fd3e9207b41230900288&mpshare=1&scene=1&srcid=0314HdoeYueFNse6H7j18qfx&pass_ticket=P5NYT1vI3xq6gboRVFuq64N9z2Yp0ADF4pMH3nRnXAhGuoM7eROG8O2lhVg%2BIvoR#rd)

Accordingly, the program offers two main functions:
- word-frequency and word-vector analysis, corresponding to the first article
- construction of the reference network between poets, corresponding to the second article

The master branch supports Python 3 only. The python2 branch (thanks to [carryme9527](https://github.com/carryme9527/poetry_analyzer), whose work this branch largely is) supports Python 2.
The program uses two main directories:
- the data directory stores the Quan Tangshi corpus and the CBDB database
- the html directory stores the resulting social-network web pages

During computation the program dumps some intermediate results into the save directory (created automatically if it does not exist).

The CBDB database is large (400+ MB) and GitHub does not allow uploading files of that size, so please download the standalone database from the [CBDB website](http://projects.iq.harvard.edu/chinesecbdb/%E4%B8%8B%E8%BC%89cbdb%E5%96%AE%E6%A9%9F%E7%89%88) yourself and store it in the data directory under the file name cbdb_sqlite.db. A sketch of this setup follows below.
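
A minimal sketch of the expected layout (the downloaded file name below is hypothetical; only the target path data/cbdb_sqlite.db is fixed by the scripts):
``` shell
mkdir -p data
# rename the downloaded standalone SQLite database to the name the scripts expect
mv ~/Downloads/cbdb_standalone.db data/cbdb_sqlite.db
```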
# Dependencies
The program depends on two Python libraries:
``` shell
pip3 install thulac
pip3 install gensim
```
thulac is used for word segmentation, gensim for word2vec.
Both libraries are only needed for the analysis in the first article. If you only care about building the poets' network, you do not need to install them.

# Basic usage
For **ordinary users**:
Simply open the web pages in the html directory with a browser to explore the network structure; you can drag and zoom freely, which is quite fun.

For **programmers**:
- run `python3 word_level_analyzer.py` to reproduce the results of the first article
- run `python3 construct_poets_network.py` to build the social network and store the results in the save directory
- run `python3 visualize_poets_network.py` to generate the web pages that display the social network and store them in the html directory (see the end-to-end sketch below)
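
The scripts accept a few optional flags (see each script's argparse defaults). A minimal end-to-end run, assuming data/qts_zhs.txt, data/qts_zht.txt and data/cbdb_sqlite.db are in place, might look like:
``` shell
# optional word-level analysis (needs thulac and gensim; uses the simplified-character corpus by default)
python3 word_level_analyzer.py --qts_path data/qts_zhs.txt
# build the reference network, then render the web pages
python3 construct_poets_network.py --qts_path data/qts_zht.txt --cbdb_path data/cbdb_sqlite.db --save_dir save
python3 visualize_poets_network.py --relations_path save/reference_relations.pkl --data_dir data --html_dir html
```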
# Roadmap
I plan to analyze more classical Chinese texts and will push the updated code to this repository as it is ready. Feel free to follow my WeChat official account: mrqianjinsi
--------------------------------------------------------------------------------
/construct_poets_network.py:
--------------------------------------------------------------------------------
import pickle
import argparse
import os
from collections import Counter, defaultdict

from utils import read_qts, get_alter_names_from_CBDB

# TODO extend the list of famous poets
# Duplicate names for these poets are hard to disambiguate automatically in
# CBDB, so their IDs in the BIOG_MAIN table were looked up by hand.
# Note that CBDB uses traditional Chinese characters.
manual_defuzzy_authors_id = {
    '李林甫': 32534, '王建': 92047,
    '李賀': 93012, '張繼': 93495,
    '張旭': 93409, '李紳': 92982}
# Authors removed by hand
manual_deleted_authors = set(['無作', '清江'])
# Alternate names removed by hand; these aliases are common words in Tang poems
manual_deleted_alter_names = {'李林甫': set(['李十']),
                              '李益': set(['李十']),
                              '李世民': set(['李二']),
                              '李嘉祐': set(['李二']),
                              '馬湘': set(['自然']),
                              '高駢': set(['千里']),
                              '孟浩然': set(['浩然']),
                              '李白': set(['太白']),
                              '黃巢': set(['皇帝']),
                              '眉娘': set(['逍遙'])}
# Alternate names missing from CBDB, added by hand
manual_added_alter_names = {
    '李建': set(['李十一']),
    '劉禹錫': set(['劉二十八'])
}

def get_alter_names(qts_file, cbdb_file, save_dir):
    alter_names_file = os.path.join(save_dir, "alternames.pkl")

    if os.path.exists(alter_names_file):
        print("found dumped alternate-names file, loading directly.")
        with open(alter_names_file, 'rb') as f:
            qts_list, authors_filtered_by_CBDB, alter_names_dict = pickle.load(f)
    else:
        print("processing QuanTangShi...")
        # Read the Quan Tangshi and keep the poem texts and authors
        qts_list, authors_set = read_qts(qts_file)
        # Drop the manually excluded authors
        authors_set -= manual_deleted_authors

        alter_names_dict, authors_filtered_by_CBDB = get_alter_names_from_CBDB(cbdb_file, authors_set,
                                                                               manual_defuzzy_authors_id)
        # Remove the unwanted alternate names
        for k, v in manual_deleted_alter_names.items():
            alter_names_dict[k] -= v
        # Add the alternate names missing from CBDB
        for k, v in manual_added_alter_names.items():
            alter_names_dict[k] |= v

        # Dump the results
        with open(alter_names_file, 'wb') as f:
            pickle.dump([qts_list, authors_filtered_by_CBDB, alter_names_dict], f)

    return qts_list, authors_filtered_by_CBDB, alter_names_dict

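# Illustrative shapes of the values returned by get_alter_names (a sketch,
# not real output; the alternate names shown are hypothetical):
#   qts_list                 -> [(author, title, poem), ...]
#   authors_filtered_by_CBDB -> {'李白', '杜甫', ...}
#   alter_names_dict         -> {'李白': {'青蓮居士', '李十二', ...}, ...}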

def get_refer_relations(qts_list, authors_filtered_by_CBDB, alter_names_dict, save_dir):
    reference_relations_file = os.path.join(save_dir, 'reference_relations.pkl')

    if os.path.exists(reference_relations_file):
        print("found dumped reference-relations file, skip calculating.")
        return
    else:
        print("calculating reference relations...")
        reference_relations_counter = Counter()
        reference_relations_text = defaultdict(list)
        # Search author by author...
        for name in authors_filtered_by_CBDB:
            # ...and poem by poem
            for author, title, text in qts_list:
                # Skip poems whose author did not pass the CBDB filter
                if author not in authors_filtered_by_CBDB:
                    continue

                poem = title + ' ' + text
                # Search for the poet's primary name; a single occurrence in
                # the title plus body is enough
                if poem.find(name) != -1:
                    reference_relations_counter[(author, name)] += 1
                    reference_relations_text[(author, name)].append(title)
                    continue
                # Search for alternate names
                alt_names = alter_names_dict[name]
                for alt_name in alt_names:
                    if poem.find(alt_name) != -1:
                        reference_relations_counter[(author, name)] += 1
                        reference_relations_text[(author, name)].append(title)
                        break
        # Dump the results
        with open(reference_relations_file, 'wb') as f:
            pickle.dump([reference_relations_counter, reference_relations_text], f)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--qts_path', type=str, default='data/qts_zht.txt',
                        help='file path of Quan Tangshi')
    parser.add_argument('--cbdb_path', type=str, default='data/cbdb_sqlite.db',
                        help='file path of CBDB')
    parser.add_argument('--save_dir', type=str, default='save',
                        help='directory to pickle intermediate data')
    args = parser.parse_args()

    # Create the save directory if it does not exist
    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)

    qts_list, authors_filtered_by_CBDB, alter_names_dict = get_alter_names(args.qts_path, args.cbdb_path, args.save_dir)
    get_refer_relations(qts_list, authors_filtered_by_CBDB, alter_names_dict, args.save_dir)


if __name__ == '__main__':
    main()
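
# A quick way to inspect the dumped results afterwards (a sketch; assumes the
# default --save_dir of 'save' was used):
#
#   import pickle
#   with open('save/reference_relations.pkl', 'rb') as f:
#       counter, texts = pickle.load(f)
#   print(counter.most_common(10))   # strongest (author, referenced poet) pairs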
--------------------------------------------------------------------------------
/data/early_tang_poets.txt:
--------------------------------------------------------------------------------
王績
王勃
王梵志
宋之問
杜審言
李百藥
李嶠
陳子昂
駱賓王
李賢
魏徵
上官儀
李世民
盧照鄰
蘇味道
楊炯
劉希夷
寒山
崔液
韋承慶
張若虛
沈佺期
喬知之
--------------------------------------------------------------------------------
/data/high_tang_poets.txt:
--------------------------------------------------------------------------------
李治
綦毋潛
高適
崔顥
戎昱
張說
崔國輔
錢起
蘇頲
王昌齡
王之渙
皇甫冉
張巡
崔峒
岑參
丘爲
杜甫
李嘉祐
西鄙人
劉眘虛
孟浩然
祖詠
儲光羲
劉長卿
萬楚
張九齡
劉灣
劉方平
元結
張謂
張旭
薛稷
李白
韓翃
司空曙
王灣
常非月
張繼
王維
李隆基
常建
李頎
柳中庸
賀知章
邱爲
嚴武
嚴識玄
王翰
--------------------------------------------------------------------------------
/data/late_tang_poets.txt:
--------------------------------------------------------------------------------
王駕
章碣
司空圖
張蟲賓
曹鬆
李羣玉
錢珝
羅隱
黃滔
崔珏
秦韜玉
陳玉蘭
許渾
聶夷中
於濆
馬戴
鄭遨
盧汝弼
來鵠
韋莊
李商隱
溫庭筠
令狐楚
司馬劄
貫休
黃巢
張喬
吳融
鄭穀
薛逢
趙嘏
崔塗
劉駕
金昌緒
雍陶
齊己
崔道融
李洞
杜荀鶴
陳陶
杜牧
陸龜蒙
韓偓
方幹
皮日休
曹鄴
孟賓於
唐彥謙
--------------------------------------------------------------------------------
/data/middle_tang_poets.txt:
--------------------------------------------------------------------------------
李賀
白居易
姚合
於鵠
武元衡
韋應物
賈島
趙微明
劉皁
薛濤
何希堯
權德輿
李德裕
韓氏
元稹
郎士元
張籍
顧況
楊凝
韓愈
張繼
張祜
劉禹錫
嚴維
李約
韓琮
李益
施肩吾
柳宗元
呂溫
杜秋娘
耿湋
李端
賈至
李涉
朱慶餘
張潮
胡令能
李紳
鮑溶
孟郊
王建
劉採春
楊巨源
李坤
盧綸
張仲素
王涯
崔護
劉商
鄭錫
戴叔倫
--------------------------------------------------------------------------------
/html/early_tang_poets_net.html:
--------------------------------------------------------------------------------
(ECharts network page; long markup lines elided in this dump)
--------------------------------------------------------------------------------
/html/full_tang_poets_net.html:
--------------------------------------------------------------------------------
(ECharts network page; long markup lines elided in this dump)
--------------------------------------------------------------------------------
/html/high_tang_poets_net.html:
--------------------------------------------------------------------------------
(ECharts network page; long markup lines elided in this dump)
--------------------------------------------------------------------------------
/html/html_head.txt:
--------------------------------------------------------------------------------
(shared opening markup for the ECharts pages; long lines elided in this dump)
--------------------------------------------------------------------------------
/html/late_tang_poets_net.html:
--------------------------------------------------------------------------------
(ECharts network page; long markup lines elided in this dump)
--------------------------------------------------------------------------------
/html/middle_tang_poets_net.html:
--------------------------------------------------------------------------------
(ECharts network page; long markup lines elided in this dump)
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import sqlite3
from collections import defaultdict

# Read the Quan Tangshi corpus
def read_qts(file_name):
    qts_list = []
    authors_set = set()
    # Read the poems line by line
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            text_segs = line.split()
            title = text_segs[1]
            author = text_segs[2]
            poem = text_segs[-1]

            authors_set.add(author)

            # Strip characters that are not Chinese (keep the two punctuation marks)
            valid_char_list = [c for c in poem if '\u4e00' <= c <= '\u9fff' or c == ',' or c == '。']
            validated_poem = ''.join(valid_char_list)
            # Store as (author, title, poem) tuples
            qts_list.append((author, title, validated_poem))

    return qts_list, authors_set

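# A sketch of the input format read_qts assumes: one poem per line, with
# whitespace-separated fields whose 2nd field is the title, 3rd the author,
# and last the poem text. An illustrative (hypothetical) line:
#   0001 靜夜思 李白 床前明月光,疑是地上霜。舉頭望明月,低頭思故鄉。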

# Fetch the poets' alternate names from CBDB
def get_alter_names_from_CBDB(db_file, authors_set, manual_defuzzy_authors_id):
    tang_begin_year = 618  # founding of the Tang dynasty
    tang_end_year = 907    # fall of the Tang dynasty

    # Poets whose IDs were disambiguated by hand
    manual_defuzzy_authors = set(manual_defuzzy_authors_id.keys())

    authors_not_in_CBDB = set()
    fuzzy_authors = set()
    fuzzy_authors_details = {}
    alter_names_dict = defaultdict(set)

    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    for author in authors_set:
        # Use the hand-picked ID if we have one
        if author in manual_defuzzy_authors:
            author_id = manual_defuzzy_authors_id[author]
        else:  # otherwise query CBDB
            # Some poets are named differently in the Quan Tangshi and in CBDB,
            # so a fuzzy search works better; for example, "貫休" appears in
            # CBDB as "釋貫休"
            author_pattern = '%' + author
            cursor.execute('SELECT c_personid, c_birthyear, c_deathyear FROM BIOG_MAIN WHERE c_name_chn LIKE ?',
                           (author_pattern,))
            person_info_list = cursor.fetchall()

            # Resolve duplicate names. For the exact strategy, see my WeChat
            # (mrqianjinsi) article 《计算机告诉你,唐朝诗人之间的关系到底是什么样的?》
            candidate_author_ids = []
            for person_id, birth_year, death_year in person_info_list:
                if birth_year and death_year:  # both birth and death year known
                    if birth_year < tang_end_year and death_year > tang_begin_year:
                        # As soon as we find one fully dated person whose life
                        # overlaps the Tang dynasty, stop looking at the others
                        candidate_author_ids = [person_id]
                        break
                elif birth_year or death_year:  # only a birth year or a death year
                    year = birth_year if birth_year else death_year
                    if year > tang_begin_year and year < tang_end_year:
                        candidate_author_ids.append(person_id)

            # Reject empty candidate lists as well as lists with more than one candidate
            if not candidate_author_ids:
                authors_not_in_CBDB.add(author)
                # print("can't find valid items for %s" % author)
                continue
            elif len(candidate_author_ids) > 1:
                fuzzy_authors.add(author)
                fuzzy_authors_details[author] = candidate_author_ids
                # print('fuzzy authors: %s' % author)
                continue

            author_id = candidate_author_ids[0]

        # Look up the poet's alternate names by author_id
        cursor.execute('SELECT c_alt_name_chn FROM ALTNAME_DATA WHERE c_personid=?',
                       (author_id,))
        alt_name_list = cursor.fetchall()
        for alt_name in alt_name_list:
            # Skip single-character alternate names
            if len(alt_name[0]) > 1:
                alter_names_dict[author].add(alt_name[0])

    conn.close()

    # Poets that passed the CBDB filter; only the relations among them are analyzed
    authors_filtered_by_CBDB = authors_set - authors_not_in_CBDB - fuzzy_authors

    return alter_names_dict, authors_filtered_by_CBDB

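# A standalone sketch of the same two-step CBDB lookup, handy for interactive
# experiments (the poet name below is just an example; BIOG_MAIN and
# ALTNAME_DATA are the tables this module already relies on):
#
#   import sqlite3
#   conn = sqlite3.connect('data/cbdb_sqlite.db')
#   cur = conn.cursor()
#   cur.execute('SELECT c_personid FROM BIOG_MAIN WHERE c_name_chn LIKE ?', ('%李白',))
#   person_id = cur.fetchone()[0]
#   cur.execute('SELECT c_alt_name_chn FROM ALTNAME_DATA WHERE c_personid=?', (person_id,))
#   print(cur.fetchall())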
--------------------------------------------------------------------------------
/visualize_poets_network.py:
--------------------------------------------------------------------------------
import pickle
import argparse
import os
import math

# If needed, opencc can convert between traditional and simplified Chinese.
# opencc must be installed on your machine, e.g.:
# opencc = 'opencc -i echart_visualize/poets_network_early.html -o echart_visualize/poets_network_early_zhs.html -c zht2zhs.ini'


# Take the top visualize_range reference relations directly
def get_concerned_relations_by_range(reference_relations_counter, visualize_range):
    # Fetch the strongest reference relations
    relations = reference_relations_counter.most_common(visualize_range)
    max_refer_count = relations[0][1]
    min_refer_count = relations[-1][1]

    return relations, max_refer_count, min_refer_count

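# Illustrative shape of the data handled here (hypothetical counts):
#   reference_relations_counter = Counter({('白居易', '元稹'): 30, ...})
#   get_concerned_relations_by_range(reference_relations_counter, 2)
#     -> ([(('白居易', '元稹'), 30), (('元稹', '白居易'), 25)], 30, 25)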
# Get the reference relations within a given group of poets; suitable for
# drawing the network inside one group
def get_concerned_relations_by_authors(reference_relations_counter, authors):
    # Collect the reference relations within the given group of authors
    relations = []
    max_refer_count = 0
    min_refer_count = float('inf')
    for (referred_by, referred), count in reference_relations_counter.items():
        # Do not count self-references
        if referred_by == referred:
            continue
        if referred_by in authors and referred in authors:
            if count > max_refer_count:
                max_refer_count = count
            if count < min_refer_count:
                min_refer_count = count

            relations.append(((referred_by, referred), count))

    return relations, max_refer_count, min_refer_count

# Plotting every relation can look very crowded; count_to_plot_threshold
# controls the weakest relation that is still drawn: only relations with a
# reference count >= count_to_plot_threshold are shown
def generate_html_page(relations, max_refer_count, min_refer_count, saved_html_file, count_to_plot_threshold=1):
    html_dir = os.path.dirname(saved_html_file)
    html_head_path = os.path.join(html_dir, 'html_head.txt')
    html_tail_path = os.path.join(html_dir, 'html_tail.txt')

    min_link_width = 0.5
    max_link_width = 3.0

    # Reference counts span a wide range; taking the square root compresses
    # the range and makes the plot easier to read
    max_refer_count = math.sqrt(max_refer_count)
    min_refer_count = math.sqrt(min_refer_count)
    # Guard against a zero division when all counts are equal
    if max_refer_count == min_refer_count:
        width_slope = 0.0
    else:
        width_slope = (max_link_width - min_link_width) / (max_refer_count - min_refer_count)
    # Format the links data
    links_text = 'links: [\n'
    links_item_format = """{source: '%s', target: '%s',
            lineStyle:{normal:{width: %f}}},
"""
    filtered_authors = set()
    for (referred_by, referred), count in relations:
        # Skip self-references, which could otherwise produce isolated nodes
        if referred_by == referred:
            continue
        # Skip relations below the threshold
        if count < count_to_plot_threshold:
            continue

        filtered_authors.add(referred_by)
        filtered_authors.add(referred)
        count = math.sqrt(count)
        line_width = min_link_width + width_slope * (count - min_refer_count)
        links_text += links_item_format % (referred_by, referred, line_width)

    links_text += '],\n'

    # Format the node data
    data_text = 'data:[\n'
    data_item_format = "{name: '%s'},\n"
    for author in filtered_authors:
        data_text += data_item_format % author

    data_text += '],\n'

    # Read the head and tail parts of the html page
    with open(html_head_path, 'r', encoding='utf-8') as f:
        head_text = f.read()

    with open(html_tail_path, 'r', encoding='utf-8') as f:
        tail_text = f.read()

    # Concatenate and save as html
    with open(saved_html_file, 'w', encoding='utf-8') as f:
        f.write(head_text + data_text + links_text + tail_text)

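# Illustrative shape of the fragment written between html_head.txt and
# html_tail.txt (the names and width are made up):
#
#   data:[
#   {name: '白居易'},
#   {name: '元稹'},
#   ],
#   links: [
#   {source: '白居易', target: '元稹',
#               lineStyle:{normal:{width: 2.100000}}},
#   ],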
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--relations_path', type=str, default='save/reference_relations.pkl',
                        help='file to load relations data')
    parser.add_argument('--data_dir', type=str, default='data',
                        help='directory to load authors file')
    parser.add_argument('--html_dir', type=str, default='html',
                        help='directory to save html page')

    args = parser.parse_args()

    with open(args.relations_path, 'rb') as f:
        reference_relations_counter, reference_relations_text = pickle.load(f)

    # Draw the top-100 relations across the whole Tang dynasty
    relations, max_refer_count, min_refer_count = get_concerned_relations_by_range(reference_relations_counter, 100)
    saved_html = os.path.join(args.html_dir, 'full_tang_poets_net.html')
    generate_html_page(relations, max_refer_count, min_refer_count, saved_html)

    # Draw the poet networks of the early, high, middle and late Tang periods
    # (poet-name file, network html page, reference-count threshold)
    files_name_array = [('early_tang_poets.txt', 'early_tang_poets_net.html', 1),
                        ('high_tang_poets.txt', 'high_tang_poets_net.html', 2),
                        ('middle_tang_poets.txt', 'middle_tang_poets_net.html', 2),
                        ('late_tang_poets.txt', 'late_tang_poets_net.html', 1)]

    for authors_file_name, html_file_name, threshold in files_name_array:
        authors_file_path = os.path.join(args.data_dir, authors_file_name)
        with open(authors_file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        authors = set(text.split())

        relations, max_refer_count, min_refer_count = get_concerned_relations_by_authors(reference_relations_counter, authors)

        saved_html = os.path.join(args.html_dir, html_file_name)
        generate_html_page(relations, max_refer_count, min_refer_count, saved_html, threshold)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/word_level_analyzer.py:
--------------------------------------------------------------------------------
from collections import Counter, defaultdict
import thulac
import pickle
import os
import argparse

import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Segment the Quan Tangshi into words
def cut_qts_to_words(qts_file, saved_words_file):
    save_dir = os.path.dirname(saved_words_file)
    dumped_file = os.path.join(save_dir, 'qts_words_stat_result.pkl')

    if os.path.exists(dumped_file) and os.path.exists(saved_words_file):
        print('found preprocessed data, loading directly...')
        with open(dumped_file, 'rb') as f:
            char_counter, author_counter, vocab, word_counter, genre_counter = pickle.load(f)
    else:
        char_counter = Counter()              # character frequencies
        author_counter = Counter()            # number of poems per author
        vocab = set()                         # vocabulary
        word_counter = Counter()              # word frequencies
        genre_counter = defaultdict(Counter)  # one Counter per part of speech

        fid_save = open(saved_words_file, 'w', encoding='utf-8')
        lex_analyzer = thulac.thulac()        # word segmenter
        line_cnt = 0
        with open(qts_file, 'r', encoding='utf-8') as f:
            for line in f:
                text_segs = line.split()
                author = text_segs[2]
                author_counter[author] += 1

                poem = text_segs[-1]
                # Strip characters that are not Chinese (keep the two punctuation marks)
                valid_char_list = [c for c in poem if '\u4e00' <= c <= '\u9fff' or c == ',' or c == '。']
                for char in valid_char_list:
                    char_counter[char] += 1

                regularized_poem = ''.join(valid_char_list)
                word_genre_pairs = lex_analyzer.cut(regularized_poem)

                word_list = []
                for word, genre in word_genre_pairs:
                    word_list.append(word)
                    vocab.add(word)
                    word_counter[word] += 1
                    genre_counter[genre][word] += 1

                save_line = ' '.join(word_list)
                fid_save.write(save_line + '\n')

                if line_cnt % 10 == 0:
                    print('%d poems processed.' % line_cnt)
                line_cnt += 1

        fid_save.close()
        # Dump the statistics
        dumped_data = [char_counter, author_counter, vocab, word_counter, genre_counter]
        with open(dumped_file, 'wb') as f:
            pickle.dump(dumped_data, f)

    return char_counter, author_counter, genre_counter

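# Note: thulac's cut() returns a list of [word, pos_tag] pairs, which is what
# the unpacking above relies on; tags such as 'ns' (place name), 't' (time
# word) and 's' (scene/locative word) are used in print_stat_results below.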
# Train word vectors from the segmented text
def word2vec(words_file):
    save_dir = os.path.dirname(words_file)
    vector_file = os.path.join(save_dir, 'word_vectors.model')

    if os.path.exists(vector_file):
        print('found word vector file, loading directly...')
        model = Word2Vec.load(vector_file)
    else:
        print('calculating word vectors...')
        model = Word2Vec(LineSentence(words_file), size=400, window=3, min_count=10,
                         workers=multiprocessing.cpu_count())
        # Save the model so it does not have to be retrained next time
        model.save(vector_file)

    return model

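# Note: the call above targets gensim 3.x. In gensim >= 4.0 the `size`
# parameter was renamed to `vector_size`, and `model.most_similar(...)`
# (used below) must be written as `model.wv.most_similar(...)`.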
def print_stat_results(char_counter, author_counter, genre_counter, vector_model):
    def print_counter(counter):
        for k, v in counter:
            print(k, v)

    # Poets ranked by number of poems written
    print('\nPoets ranked by number of poems')
    print_counter(author_counter.most_common(10))

    # Character-level analysis
    print('\n\nCharacter-level analysis')
    # Most common characters
    print('\nMost common characters')
    print_counter(char_counter.most_common(12))
    # Seasons
    print('\nSeasons')
    for c in ['春', '夏', '秋', '冬']:
        print(c, char_counter[c])
    # Colors
    print('\nColors')
    colors = ['红', '白', '青', '蓝', '绿', '紫', '黑', '黄']
    for c in colors:
        print(c, char_counter[c])
    # Plants
    print('\nPlants')
    plants = ['梅', '兰', '竹', '菊', '松', '柳', '枫', '桃', '梨', '杏']
    for p in plants:
        print(p, char_counter[p])
    # Animals (Chinese zodiac)
    print('\nAnimals')
    age_animals = ['鼠', '牛', '虎', '兔', '龙', '蛇', '马', '羊', '猴', '鸡', '狗', '猪']
    for a in age_animals:
        print(a, char_counter[a])

    # Word-level analysis
    print('\n\nWord-level analysis')
    # Place names
    print('\nMost common place-name words')
    print_counter(genre_counter['ns'].most_common(10))
    # Time words
    print('\nMost common time words')
    print_counter(genre_counter['t'].most_common(10))
    # Scene/locative words
    print('\nMost common scene words')
    print_counter(genre_counter['s'].most_common(10))


    # Word-vector analysis
    print('\n\nWord-vector analysis')
    # print(vector_model['今日'])
    def print_similar_words(word):
        print('\nWords closest in meaning to "%s"' % word)
        print_counter(vector_model.most_similar(word))

    print_similar_words('天子')
    print_similar_words('寂寞')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--qts_path', type=str, default='data/qts_zhs.txt',
                        help='file path of Quan Tangshi')
    parser.add_argument('--words_path', type=str, default='save/qts_words_list.txt',
                        help='file path to save Quan Tangshi words data')
    args = parser.parse_args()

    # Create the save directory if it does not exist
    save_dir = os.path.dirname(args.words_path)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    char_counter, author_counter, genre_counter = cut_qts_to_words(args.qts_path, args.words_path)
    vector_model = word2vec(args.words_path)

    print_stat_results(char_counter, author_counter, genre_counter, vector_model)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------